Skip to content

PG-1813 Make WAL keys TLI aware #509

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions contrib/pg_tde/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ tap_tests = [
't/unlogged_tables.pl',
't/wal_archiving.pl',
't/wal_encrypt.pl',
't/wal_key_tli.pl',
]

tests += {
Expand Down
35 changes: 19 additions & 16 deletions contrib/pg_tde/src/access/pg_tde_xlog_keys.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#define PG_TDE_WAL_KEY_FILE_NAME "wal_keys"

#define MaxXLogRecPtr (~(XLogRecPtr)0)
#define MaxTimeLineID (~(TimeLineID)0)

typedef struct WalKeyFileHeader
{
Expand All @@ -44,7 +45,7 @@ typedef struct WalKeyFileEntry
static WALKeyCacheRec *tde_wal_key_cache = NULL;
static WALKeyCacheRec *tde_wal_key_last_rec = NULL;

static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key, XLogRecPtr start_lsn);
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key);
static WalEncryptionKey *pg_tde_decrypt_wal_key(TDEPrincipalKey *principal_key, WalKeyFileEntry *entry);
static void pg_tde_initialize_wal_key_file_entry(WalKeyFileEntry *entry, const TDEPrincipalKey *principal_key, const WalEncryptionKey *rel_key_data);
static int pg_tde_open_wal_key_file_basic(const char *filename, int flags, bool ignore_missing);
Expand All @@ -69,7 +70,7 @@ get_wal_key_file_path(void)
}

void
pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
pg_tde_wal_last_key_set_location(WalLocation loc)
{
LWLock *lock_pk = tde_lwlock_enc_keys();
int fd;
Expand All @@ -85,9 +86,9 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
write_pos = sizeof(WalKeyFileHeader) +
(last_key_idx * sizeof(WalKeyFileEntry)) +
offsetof(WalKeyFileEntry, enc_key) +
offsetof(WalEncryptionKey, start_lsn);
offsetof(WalEncryptionKey, wal_start);

if (pg_pwrite(fd, &lsn, sizeof(XLogRecPtr), write_pos) != sizeof(XLogRecPtr))
if (pg_pwrite(fd, &loc, sizeof(WalLocation), write_pos) != sizeof(WalLocation))
{
ereport(ERROR,
errcode_for_file_access(),
Expand All @@ -111,7 +112,7 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
errmsg("could not read previous WAL key: %m"));
}

if (prev_entry.enc_key.start_lsn >= lsn)
if (wal_location_cmp(prev_entry.enc_key.wal_start, loc) >= 0)
{
prev_entry.enc_key.type = TDE_KEY_TYPE_WAL_INVALID;

Expand Down Expand Up @@ -160,7 +161,8 @@ pg_tde_create_wal_key(WalEncryptionKey *rel_key_data, TDEMapEntryType entry_type

/* TODO: no need in generating key if TDE_KEY_TYPE_WAL_UNENCRYPTED */
rel_key_data->type = entry_type;
rel_key_data->start_lsn = InvalidXLogRecPtr;
rel_key_data->wal_start.lsn = InvalidXLogRecPtr;
rel_key_data->wal_start.tli = 0;

if (!RAND_bytes(rel_key_data->key, INTERNAL_KEY_LEN))
ereport(ERROR,
Expand Down Expand Up @@ -245,7 +247,7 @@ pg_tde_read_last_wal_key(void)

/* Fetches WAL keys from disk and adds them to the WAL cache */
WALKeyCacheRec *
pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
pg_tde_fetch_wal_keys(WalLocation start)
{
off_t read_pos = 0;
LWLock *lock_pk = tde_lwlock_enc_keys();
Expand Down Expand Up @@ -276,10 +278,10 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
{
WALKeyCacheRec *wal_rec;
WalEncryptionKey stub_key = {
.start_lsn = InvalidXLogRecPtr,
.wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr},
};

wal_rec = pg_tde_add_wal_key_to_cache(&stub_key, InvalidXLogRecPtr);
wal_rec = pg_tde_add_wal_key_to_cache(&stub_key);

#ifdef FRONTEND
/* The backend frees it after copying to the cache. */
Expand All @@ -299,15 +301,15 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
/*
* Skip new (just created but not updated by write) and invalid keys
*/
if (entry.enc_key.start_lsn != InvalidXLogRecPtr &&
if (wal_location_valid(entry.enc_key.wal_start) &&
(entry.enc_key.type == TDE_KEY_TYPE_WAL_UNENCRYPTED ||
entry.enc_key.type == TDE_KEY_TYPE_WAL_ENCRYPTED) &&
entry.enc_key.start_lsn >= start_lsn)
wal_location_cmp(entry.enc_key.wal_start, start) >= 0)
{
WalEncryptionKey *rel_key_data = pg_tde_decrypt_wal_key(principal_key, &entry);
WALKeyCacheRec *wal_rec;

wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data, entry.enc_key.start_lsn);
wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data);

pfree(rel_key_data);

Expand All @@ -325,7 +327,7 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
}

static WALKeyCacheRec *
pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
pg_tde_add_wal_key_to_cache(WalEncryptionKey *key)
{
WALKeyCacheRec *wal_rec;
#ifndef FRONTEND
Expand All @@ -338,8 +340,9 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
MemoryContextSwitchTo(oldCtx);
#endif

wal_rec->start_lsn = start_lsn;
wal_rec->end_lsn = MaxXLogRecPtr;
wal_rec->start = key->wal_start;
wal_rec->end.tli = MaxTimeLineID;
wal_rec->end.lsn = MaxXLogRecPtr;
wal_rec->key = *key;
wal_rec->crypt_ctx = NULL;
if (!tde_wal_key_last_rec)
Expand All @@ -350,7 +353,7 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
else
{
tde_wal_key_last_rec->next = wal_rec;
tde_wal_key_last_rec->end_lsn = wal_rec->start_lsn;
tde_wal_key_last_rec->end = wal_rec->start;
tde_wal_key_last_rec = wal_rec;
}

Expand Down
103 changes: 69 additions & 34 deletions contrib/pg_tde/src/access/pg_tde_xlog_smgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ static void *EncryptionCryptCtx = NULL;
static WalEncryptionKey EncryptionKey =
{
.type = MAP_ENTRY_EMPTY,
.start_lsn = InvalidXLogRecPtr,
.wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr},
};

/*
Expand All @@ -65,7 +65,12 @@ static WalEncryptionKey EncryptionKey =

typedef struct EncryptionStateData
{
pg_atomic_uint64 enc_key_lsn; /* to sync with readers */
/*
* To sync with readers. We sync on LSN only and TLI here just to
* communicate its value to readers.
*/
pg_atomic_uint32 enc_key_tli;
pg_atomic_uint64 enc_key_lsn;
} EncryptionStateData;

static EncryptionStateData *EncryptionState = NULL;
Expand All @@ -78,10 +83,24 @@ TDEXLogGetEncKeyLsn()
return (XLogRecPtr) pg_atomic_read_u64(&EncryptionState->enc_key_lsn);
}

static TimeLineID
TDEXLogGetEncKeyTli()
{
return (TimeLineID) pg_atomic_read_u32(&EncryptionState->enc_key_tli);
}

static void
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
TDEXLogSetEncKeyLocation(WalLocation loc)
{
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, start_lsn);
/*
* Write TLI first and then LSN. The barrier ensures writes won't be
* reordered. When reading, the opposite must be done (with a matching
* barrier in between), so we always see a valid TLI after observing a
* valid LSN.
*/
pg_atomic_write_u32(&EncryptionState->enc_key_tli, loc.tli);
pg_write_barrier();
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, loc.lsn);
}

static Size TDEXLogEncryptBuffSize(void);
Expand Down Expand Up @@ -166,7 +185,8 @@ TDEXLogShmemInit(void)

typedef struct EncryptionStateData
{
XLogRecPtr enc_key_lsn; /* to sync with reader */
TimeLineID enc_key_tli;
XLogRecPtr enc_key_lsn;
} EncryptionStateData;

static EncryptionStateData EncryptionStateD = {0};
Expand All @@ -181,10 +201,17 @@ TDEXLogGetEncKeyLsn()
return (XLogRecPtr) EncryptionState->enc_key_lsn;
}

static TimeLineID
TDEXLogGetEncKeyTli()
{
return (TimeLineID) EncryptionState->enc_key_tli;
}

static void
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
TDEXLogSetEncKeyLocation(WalLocation loc)
{
EncryptionState->enc_key_lsn = EncryptionKey.start_lsn;
EncryptionState->enc_key_tli = loc.tli;
EncryptionState->enc_key_lsn = loc.lsn;
}

#endif /* FRONTEND */
Expand Down Expand Up @@ -216,7 +243,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
else if (key)
{
EncryptionKey = *key;
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
}

if (key)
Expand All @@ -231,7 +258,7 @@ TDEXLogSmgrInitWriteReuseKey()
if (key)
{
EncryptionKey = *key;
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
pfree(key);
}
}
Expand All @@ -252,8 +279,8 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
#endif

#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn));
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %u_%X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno), key->wal_start.tli, LSN_FORMAT_ARGS(key->wal_start.lsn));
#endif

CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
Expand All @@ -279,13 +306,13 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
*/
if (EncryptionKey.type != MAP_ENTRY_EMPTY && TDEXLogGetEncKeyLsn() == 0)
{
XLogRecPtr lsn;
WalLocation loc = {.tli = tli};

XLogSegNoOffsetToRecPtr(segno, offset, segSize, lsn);
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);

pg_tde_wal_last_key_set_lsn(lsn);
EncryptionKey.start_lsn = lsn;
TDEXLogSetEncKeyLsn(lsn);
pg_tde_wal_last_key_set_location(loc);
EncryptionKey.wal_start = loc;
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
}

if (EncryptionKey.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
Expand All @@ -304,12 +331,12 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
ssize_t readsz;
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
XLogRecPtr write_key_lsn;
XLogRecPtr data_start;
XLogRecPtr data_end;
WalLocation data_end = {.tli = tli};
WalLocation data_start = {.tli = tli};

#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno));
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %u_%X/%X",
count, offset, offset, tli, LSN_FORMAT_ARGS(segno));
#endif

readsz = pg_pread(fd, buf, count, offset);
Expand All @@ -319,30 +346,38 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,

if (!keys)
{
WalLocation start = {.tli = 1,.lsn = 0};

/* cache is empty, try to read keys from disk */
keys = pg_tde_fetch_wal_keys(InvalidXLogRecPtr);
keys = pg_tde_fetch_wal_keys(start);
}

/*
* The barrier ensures that we always read a vaild TLI after the valid
* LSN. See the comment in TDEXLogSetEncKeyLocation()
*/
write_key_lsn = TDEXLogGetEncKeyLsn();
pg_read_barrier();

if (!XLogRecPtrIsInvalid(write_key_lsn))
{
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
WalLocation write_loc = {.tli = TDEXLogGetEncKeyTli(),.lsn = write_key_lsn};

Assert(last_key);

/* write has generated a new key, need to fetch it */
if (last_key->start_lsn < write_key_lsn)
if (wal_location_cmp(last_key->start, write_loc) < 0)
{
pg_tde_fetch_wal_keys(write_key_lsn);
pg_tde_fetch_wal_keys(write_loc);

/* in case cache was empty before */
keys = pg_tde_get_wal_cache_keys();
}
}

XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start);
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end);
XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start.lsn);
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end.lsn);

/*
* TODO: this is higly ineffective. We should get rid of linked list and
Expand All @@ -351,24 +386,24 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
for (WALKeyCacheRec *curr_key = keys; curr_key != NULL; curr_key = curr_key->next)
{
#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "WAL key %X/%X-%X/%X, encrypted: %s",
LSN_FORMAT_ARGS(curr_key->start_lsn),
LSN_FORMAT_ARGS(curr_key->end_lsn),
elog(DEBUG1, "WAL key %u_%X/%X - %u_%X/%X, encrypted: %s",
curr_key->start.tli, LSN_FORMAT_ARGS(curr_key->start.lsn),
curr_key->end.tli, LSN_FORMAT_ARGS(curr_key->end.lsn),
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED ? "yes" : "no");
#endif

if (curr_key->key.start_lsn != InvalidXLogRecPtr &&
if (wal_location_valid(curr_key->key.wal_start) &&
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
{
/*
* Check if the key's range overlaps with the buffer's and decypt
* the part that does.
*/
if (data_start < curr_key->end_lsn && data_end > curr_key->start_lsn)
if (wal_location_cmp(data_start, curr_key->end) < 0 && wal_location_cmp(data_end, curr_key->start) > 0)
{
char iv_prefix[16];
off_t dec_off = XLogSegmentOffset(Max(data_start, curr_key->start_lsn), segSize);
off_t dec_end = XLogSegmentOffset(Min(data_end, curr_key->end_lsn), segSize);
off_t dec_off = XLogSegmentOffset(Max(data_start.lsn, curr_key->start.lsn), segSize);
off_t dec_end = XLogSegmentOffset(Min(data_end.lsn, curr_key->end.lsn), segSize);
size_t dec_sz;
char *dec_buf = (char *) buf + (dec_off - offset);

Expand All @@ -385,8 +420,8 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
dec_sz = dec_end - dec_off;

#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %X/%X",
dec_off, dec_off - offset, dec_sz, LSN_FORMAT_ARGS(curr_key->key->start_lsn));
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
#endif
pg_tde_stream_crypt(iv_prefix,
dec_off,
Expand Down
Loading