Skip to content

Make WAL keys TLI aware #491

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 18 additions & 15 deletions contrib/pg_tde/src/access/pg_tde_tdemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#define TDE_FILE_HEADER_SIZE sizeof(TDEFileHeader)

#define MaxXLogRecPtr (~(XLogRecPtr)0)
#define MaxTimeLineID (~(TimeLineID)0)

typedef struct TDEFileHeader
{
Expand All @@ -65,7 +66,7 @@ static void pg_tde_file_header_read(const char *tde_filename, int fd, TDEFileHea
static int pg_tde_file_header_write(const char *tde_filename, int fd, const TDESignedPrincipalKeyInfo *signed_key_info, off_t *bytes_written);
static bool pg_tde_read_one_map_entry(int fd, TDEMapEntry *map_entry, off_t *offset);
static void pg_tde_read_one_map_entry2(int keydata_fd, int32 key_index, TDEMapEntry *map_entry, Oid databaseId);
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(InternalKey *cached_key, XLogRecPtr start_lsn);
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(InternalKey *cached_key);

#ifndef FRONTEND
static void pg_tde_sign_principal_key_info(TDESignedPrincipalKeyInfo *signed_key_info, const TDEPrincipalKey *principal_key);
Expand Down Expand Up @@ -369,22 +370,23 @@ pg_tde_delete_principal_key(Oid dbOid)
* needs keyfile_path
*/
void
pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, const char *keyfile_path)
pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, TimeLineID tli, const char *keyfile_path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why doesn't this take a WalLocation?

{
LWLock *lock_pk = tde_lwlock_enc_keys();
int fd;
off_t read_pos,
write_pos,
last_key_idx;
WALLocation loc = {.tli = tli,.lsn = lsn};

LWLockAcquire(lock_pk, LW_EXCLUSIVE);

fd = pg_tde_open_file_write(keyfile_path, NULL, false, &read_pos);

last_key_idx = ((lseek(fd, 0, SEEK_END) - TDE_FILE_HEADER_SIZE) / MAP_ENTRY_SIZE) - 1;
write_pos = TDE_FILE_HEADER_SIZE + (last_key_idx * MAP_ENTRY_SIZE) + offsetof(TDEMapEntry, enc_key) + offsetof(InternalKey, start_lsn);
write_pos = TDE_FILE_HEADER_SIZE + (last_key_idx * MAP_ENTRY_SIZE) + offsetof(TDEMapEntry, enc_key) + offsetof(InternalKey, wal_start);

if (pg_pwrite(fd, &lsn, sizeof(XLogRecPtr), write_pos) != sizeof(XLogRecPtr))
if (pg_pwrite(fd, &loc, sizeof(WALLocation), write_pos) != sizeof(WALLocation))
{
ereport(ERROR,
errcode_for_file_access(),
Expand All @@ -408,7 +410,7 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, const char *keyfile_path)
errmsg("could not read previous WAL key: %m"));
}

if (prev_map_entry.enc_key.start_lsn >= lsn)
if (wal_location_cmp(prev_map_entry.enc_key.wal_start, loc) >= 0)
{
prev_map_entry.enc_key.type = TDE_KEY_TYPE_WAL_INVALID;

Expand Down Expand Up @@ -1035,7 +1037,7 @@ pg_tde_read_last_wal_key(void)

/* Fetches WAL keys from disk and adds them to the WAL cache */
WALKeyCacheRec *
pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
pg_tde_fetch_wal_keys(WALLocation start)
{
RelFileLocator rlocator = GLOBAL_SPACE_RLOCATOR(XLOG_TDE_OID);
char db_map_path[MAXPGPATH];
Expand Down Expand Up @@ -1070,10 +1072,10 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
{
WALKeyCacheRec *wal_rec;
InternalKey stub_key = {
.start_lsn = InvalidXLogRecPtr,
.wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr},
};

wal_rec = pg_tde_add_wal_key_to_cache(&stub_key, InvalidXLogRecPtr);
wal_rec = pg_tde_add_wal_key_to_cache(&stub_key);

#ifdef FRONTEND
/* The backend frees it after copying to the cache. */
Expand All @@ -1093,15 +1095,15 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
/*
* Skip new (just created but not updated by write) and invalid keys
*/
if (map_entry.enc_key.start_lsn != InvalidXLogRecPtr &&
if (wal_location_valid(map_entry.enc_key.wal_start) &&
(map_entry.enc_key.type == TDE_KEY_TYPE_WAL_UNENCRYPTED ||
map_entry.enc_key.type == TDE_KEY_TYPE_WAL_ENCRYPTED) &&
map_entry.enc_key.start_lsn >= start_lsn)
wal_location_cmp(map_entry.enc_key.wal_start, start) >= 0)
{
InternalKey *rel_key_data = tde_decrypt_rel_key(principal_key, &map_entry);
WALKeyCacheRec *wal_rec;

wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data, map_entry.enc_key.start_lsn);
wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data);

pfree(rel_key_data);

Expand All @@ -1119,7 +1121,7 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
}

static WALKeyCacheRec *
pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn)
pg_tde_add_wal_key_to_cache(InternalKey *key)
{
WALKeyCacheRec *wal_rec;
#ifndef FRONTEND
Expand All @@ -1132,8 +1134,9 @@ pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn)
MemoryContextSwitchTo(oldCtx);
#endif

wal_rec->start_lsn = start_lsn;
wal_rec->end_lsn = MaxXLogRecPtr;
wal_rec->start = key->wal_start;
wal_rec->end.tli = MaxTimeLineID;
wal_rec->end.lsn = MaxXLogRecPtr;
wal_rec->key = *key;
wal_rec->crypt_ctx = NULL;
if (!tde_wal_key_last_rec)
Expand All @@ -1144,7 +1147,7 @@ pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn)
else
{
tde_wal_key_last_rec->next = wal_rec;
tde_wal_key_last_rec->end_lsn = wal_rec->start_lsn;
tde_wal_key_last_rec->end = wal_rec->start;
tde_wal_key_last_rec = wal_rec;
}

Expand Down
102 changes: 71 additions & 31 deletions contrib/pg_tde/src/access/pg_tde_xlog_smgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ static void *EncryptionCryptCtx = NULL;
static InternalKey EncryptionKey =
{
.type = MAP_ENTRY_EMPTY,
.start_lsn = InvalidXLogRecPtr,
.wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr},
};

/*
Expand All @@ -66,7 +66,13 @@ static InternalKey EncryptionKey =
typedef struct EncryptionStateData
{
char db_map_path[MAXPGPATH];
pg_atomic_uint64 enc_key_lsn; /* to sync with readers */

/*
* To sync with readers. We sync on LSN only and TLI here just to
* communicate its value to readers.
*/
pg_atomic_uint32 enc_key_tli;
pg_atomic_uint64 enc_key_lsn;
} EncryptionStateData;

static EncryptionStateData *EncryptionState = NULL;
Expand All @@ -80,9 +86,23 @@ TDEXLogGetEncKeyLsn()
}

static void
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
TDEXLogSetEncKeyLocation(WALLocation loc)
{
/*
* Write TLI first and then LSN. The barrier ensures writes won't be
* reordered. When reading, the opposite must be done (with a matching
* barrier in between), so we always see a valid TLI after observing a
* valid LSN.
*/
pg_atomic_write_u32(&EncryptionState->enc_key_tli, loc.tli);
pg_write_barrier();
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, loc.lsn);
}

static TimeLineID
TDEXLogGetEncKeyTli()
{
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, start_lsn);
return (TimeLineID) pg_atomic_read_u32(&EncryptionState->enc_key_tli);
}

static Size TDEXLogEncryptBuffSize(void);
Expand Down Expand Up @@ -159,6 +179,7 @@ TDEXLogShmemInit(void)
}

pg_atomic_init_u64(&EncryptionState->enc_key_lsn, 0);
pg_atomic_init_u32(&EncryptionState->enc_key_tli, 0);

elog(DEBUG1, "pg_tde: initialized encryption buffer %lu bytes", TDEXLogEncryptStateSize());
}
Expand All @@ -168,6 +189,7 @@ TDEXLogShmemInit(void)
typedef struct EncryptionStateData
{
char db_map_path[MAXPGPATH];
XLogRecPtr enc_key_tli; /* to sync with reader */
XLogRecPtr enc_key_lsn; /* to sync with reader */
} EncryptionStateData;

Expand All @@ -184,9 +206,16 @@ TDEXLogGetEncKeyLsn()
}

static void
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
TDEXLogSetEncKeyLocation(WALLocation loc)
{
EncryptionState->enc_key_lsn = EncryptionKey.start_lsn;
EncryptionState->enc_key_tli = loc.tli;
EncryptionState->enc_key_lsn = loc.lsn;
}

static TimeLineID
TDEXLogGetEncKeyTli()
{
return (TimeLineID) EncryptionState->enc_key_tli;
}

#endif /* FRONTEND */
Expand All @@ -197,6 +226,7 @@ TDEXLogSmgrInit()
SetXLogSmgr(&tde_xlog_smgr);
}

/* On backend it should be called only during the startup */
void
TDEXLogSmgrInitWrite(bool encrypt_xlog)
{
Expand All @@ -220,7 +250,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
else if (key)
{
EncryptionKey = *key;
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
}

if (key)
Expand All @@ -237,7 +267,7 @@ TDEXLogSmgrInitWriteReuseKey()
if (key)
{
EncryptionKey = *key;
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
pfree(key);
}

Expand All @@ -260,8 +290,8 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
#endif

#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn));
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %u_%X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno), key->wal_start.tli, LSN_FORMAT_ARGS(key->wal_start.lsn));
#endif

CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
Expand All @@ -287,9 +317,10 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,

XLogSegNoOffsetToRecPtr(segno, offset, segSize, lsn);

pg_tde_wal_last_key_set_lsn(lsn, EncryptionState->db_map_path);
EncryptionKey.start_lsn = lsn;
TDEXLogSetEncKeyLsn(lsn);
pg_tde_wal_last_key_set_lsn(lsn, tli, EncryptionState->db_map_path);
EncryptionKey.wal_start.tli = tli;
EncryptionKey.wal_start.lsn = lsn;
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
}

if (EncryptionKey.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
Expand All @@ -308,12 +339,12 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
ssize_t readsz;
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
XLogRecPtr write_key_lsn;
XLogRecPtr data_start;
XLogRecPtr data_end;
WALLocation data_start = {.tli = tli};
WALLocation data_end = {.tli = tli};

#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno));
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %u_%X/%X",
count, offset, offset, tli, LSN_FORMAT_ARGS(segno));
#endif

readsz = pg_pread(fd, buf, count, offset);
Expand All @@ -323,30 +354,38 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,

if (!keys)
{
WALLocation start = {.tli = 1,.lsn = 0};

/* cache is empty, try to read keys from disk */
keys = pg_tde_fetch_wal_keys(InvalidXLogRecPtr);
keys = pg_tde_fetch_wal_keys(start);
}

/*
* The barrier ensures that we always read a vaild TLI after the valid
* LSN. See the comment in TDEXLogSetEncKeyLocation()
*/
write_key_lsn = TDEXLogGetEncKeyLsn();
pg_read_barrier();

if (!XLogRecPtrIsInvalid(write_key_lsn))
{
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
WALLocation write_loc = {.tli = TDEXLogGetEncKeyTli(),.lsn = write_key_lsn};

Assert(last_key);

/* write has generated a new key, need to fetch it */
if (last_key->start_lsn < write_key_lsn)
if (wal_location_cmp(last_key->start, write_loc) < 0)
{
pg_tde_fetch_wal_keys(write_key_lsn);
pg_tde_fetch_wal_keys(write_loc);

/* in case cache was empty before */
keys = pg_tde_get_wal_cache_keys();
}
}

XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start);
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end);
XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start.lsn);
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end.lsn);

/*
* TODO: this is higly ineffective. We should get rid of linked list and
Expand All @@ -355,24 +394,25 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
for (WALKeyCacheRec *curr_key = keys; curr_key != NULL; curr_key = curr_key->next)
{
#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "WAL key %X/%X-%X/%X, encrypted: %s",
LSN_FORMAT_ARGS(curr_key->start_lsn),
LSN_FORMAT_ARGS(curr_key->end_lsn),
elog(DEBUG1, "WAL key %u_%X/%X - %u_%X/%X, encrypted: %s",
curr_key->start.tli, LSN_FORMAT_ARGS(curr_key->start.lsn),
curr_key->end.tli, LSN_FORMAT_ARGS(curr_key->end.lsn),
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED ? "yes" : "no");
#endif

if (curr_key->key.start_lsn != InvalidXLogRecPtr &&
if (wal_location_valid(curr_key->key.wal_start) &&
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
{
/*
* Check if the key's range overlaps with the buffer's and decypt
* the part that does.
*/
if (data_start < curr_key->end_lsn && data_end > curr_key->start_lsn)

if (wal_location_cmp(data_start, curr_key->end) < 0 && wal_location_cmp(data_end, curr_key->start) > 0)
{
char iv_prefix[16];
off_t dec_off = XLogSegmentOffset(Max(data_start, curr_key->start_lsn), segSize);
off_t dec_end = XLogSegmentOffset(Min(data_end, curr_key->end_lsn), segSize);
off_t dec_off = XLogSegmentOffset(Max(data_start.lsn, curr_key->start.lsn), segSize);
off_t dec_end = XLogSegmentOffset(Min(data_end.lsn, curr_key->end.lsn), segSize);
size_t dec_sz;
char *dec_buf = (char *) buf + (dec_off - offset);

Expand All @@ -389,8 +429,8 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
dec_sz = dec_end - dec_off;

#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %X/%X",
dec_off, dec_off - offset, dec_sz, LSN_FORMAT_ARGS(curr_key->key->start_lsn));
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
#endif
pg_tde_stream_crypt(iv_prefix, dec_off, dec_buf, dec_sz, dec_buf,
&curr_key->key, &curr_key->crypt_ctx);
Expand Down
3 changes: 2 additions & 1 deletion contrib/pg_tde/src/encryption/enc_tde.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ void
pg_tde_generate_internal_key(InternalKey *int_key, TDEMapEntryType entry_type)
{
int_key->type = entry_type;
int_key->start_lsn = InvalidXLogRecPtr;
int_key->wal_start.tli = 0;
int_key->wal_start.lsn = InvalidXLogRecPtr;

if (!RAND_bytes(int_key->key, INTERNAL_KEY_LEN))
ereport(ERROR,
Expand Down
Loading
Loading