diff --git a/contrib/pg_tde/meson.build b/contrib/pg_tde/meson.build index f2c21edc8dd2f..6410a3b406295 100644 --- a/contrib/pg_tde/meson.build +++ b/contrib/pg_tde/meson.build @@ -126,6 +126,7 @@ tap_tests = [ 't/unlogged_tables.pl', 't/wal_archiving.pl', 't/wal_encrypt.pl', + 't/wal_key_tli.pl', ] tests += { diff --git a/contrib/pg_tde/src/access/pg_tde_xlog_keys.c b/contrib/pg_tde/src/access/pg_tde_xlog_keys.c index 25c8473a20198..fc3e8820e194c 100644 --- a/contrib/pg_tde/src/access/pg_tde_xlog_keys.c +++ b/contrib/pg_tde/src/access/pg_tde_xlog_keys.c @@ -25,6 +25,7 @@ #define PG_TDE_WAL_KEY_FILE_NAME "wal_keys" #define MaxXLogRecPtr (~(XLogRecPtr)0) +#define MaxTimeLineID (~(TimeLineID)0) typedef struct WalKeyFileHeader { @@ -44,7 +45,7 @@ typedef struct WalKeyFileEntry static WALKeyCacheRec *tde_wal_key_cache = NULL; static WALKeyCacheRec *tde_wal_key_last_rec = NULL; -static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key, XLogRecPtr start_lsn); +static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key); static WalEncryptionKey *pg_tde_decrypt_wal_key(TDEPrincipalKey *principal_key, WalKeyFileEntry *entry); static void pg_tde_initialize_wal_key_file_entry(WalKeyFileEntry *entry, const TDEPrincipalKey *principal_key, const WalEncryptionKey *rel_key_data); static int pg_tde_open_wal_key_file_basic(const char *filename, int flags, bool ignore_missing); @@ -69,7 +70,7 @@ get_wal_key_file_path(void) } void -pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn) +pg_tde_wal_last_key_set_location(WalLocation loc) { LWLock *lock_pk = tde_lwlock_enc_keys(); int fd; @@ -85,9 +86,9 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn) write_pos = sizeof(WalKeyFileHeader) + (last_key_idx * sizeof(WalKeyFileEntry)) + offsetof(WalKeyFileEntry, enc_key) + - offsetof(WalEncryptionKey, start_lsn); + offsetof(WalEncryptionKey, wal_start); - if (pg_pwrite(fd, &lsn, sizeof(XLogRecPtr), write_pos) != sizeof(XLogRecPtr)) + if (pg_pwrite(fd, &loc, sizeof(WalLocation), write_pos) != sizeof(WalLocation)) { ereport(ERROR, errcode_for_file_access(), @@ -111,7 +112,7 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn) errmsg("could not read previous WAL key: %m")); } - if (prev_entry.enc_key.start_lsn >= lsn) + if (wal_location_cmp(prev_entry.enc_key.wal_start, loc) >= 0) { prev_entry.enc_key.type = TDE_KEY_TYPE_WAL_INVALID; @@ -160,7 +161,8 @@ pg_tde_create_wal_key(WalEncryptionKey *rel_key_data, TDEMapEntryType entry_type /* TODO: no need in generating key if TDE_KEY_TYPE_WAL_UNENCRYPTED */ rel_key_data->type = entry_type; - rel_key_data->start_lsn = InvalidXLogRecPtr; + rel_key_data->wal_start.lsn = InvalidXLogRecPtr; + rel_key_data->wal_start.tli = 0; if (!RAND_bytes(rel_key_data->key, INTERNAL_KEY_LEN)) ereport(ERROR, @@ -245,7 +247,7 @@ pg_tde_read_last_wal_key(void) /* Fetches WAL keys from disk and adds them to the WAL cache */ WALKeyCacheRec * -pg_tde_fetch_wal_keys(XLogRecPtr start_lsn) +pg_tde_fetch_wal_keys(WalLocation start) { off_t read_pos = 0; LWLock *lock_pk = tde_lwlock_enc_keys(); @@ -276,10 +278,10 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn) { WALKeyCacheRec *wal_rec; WalEncryptionKey stub_key = { - .start_lsn = InvalidXLogRecPtr, + .wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr}, }; - wal_rec = pg_tde_add_wal_key_to_cache(&stub_key, InvalidXLogRecPtr); + wal_rec = pg_tde_add_wal_key_to_cache(&stub_key); #ifdef FRONTEND /* The backend frees it after copying to the cache. */ @@ -299,15 +301,15 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn) /* * Skip new (just created but not updated by write) and invalid keys */ - if (entry.enc_key.start_lsn != InvalidXLogRecPtr && + if (wal_location_valid(entry.enc_key.wal_start) && (entry.enc_key.type == TDE_KEY_TYPE_WAL_UNENCRYPTED || entry.enc_key.type == TDE_KEY_TYPE_WAL_ENCRYPTED) && - entry.enc_key.start_lsn >= start_lsn) + wal_location_cmp(entry.enc_key.wal_start, start) >= 0) { WalEncryptionKey *rel_key_data = pg_tde_decrypt_wal_key(principal_key, &entry); WALKeyCacheRec *wal_rec; - wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data, entry.enc_key.start_lsn); + wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data); pfree(rel_key_data); @@ -325,7 +327,7 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn) } static WALKeyCacheRec * -pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn) +pg_tde_add_wal_key_to_cache(WalEncryptionKey *key) { WALKeyCacheRec *wal_rec; #ifndef FRONTEND @@ -338,8 +340,9 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn) MemoryContextSwitchTo(oldCtx); #endif - wal_rec->start_lsn = start_lsn; - wal_rec->end_lsn = MaxXLogRecPtr; + wal_rec->start = key->wal_start; + wal_rec->end.tli = MaxTimeLineID; + wal_rec->end.lsn = MaxXLogRecPtr; wal_rec->key = *key; wal_rec->crypt_ctx = NULL; if (!tde_wal_key_last_rec) @@ -350,7 +353,7 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn) else { tde_wal_key_last_rec->next = wal_rec; - tde_wal_key_last_rec->end_lsn = wal_rec->start_lsn; + tde_wal_key_last_rec->end = wal_rec->start; tde_wal_key_last_rec = wal_rec; } diff --git a/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c b/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c index 7dd9e7b821872..cbffeeff4584e 100644 --- a/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c +++ b/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c @@ -45,7 +45,7 @@ static void *EncryptionCryptCtx = NULL; static WalEncryptionKey EncryptionKey = { .type = MAP_ENTRY_EMPTY, - .start_lsn = InvalidXLogRecPtr, + .wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr}, }; /* @@ -65,7 +65,12 @@ static WalEncryptionKey EncryptionKey = typedef struct EncryptionStateData { - pg_atomic_uint64 enc_key_lsn; /* to sync with readers */ + /* + * To sync with readers. We sync on LSN only and TLI here just to + * communicate its value to readers. + */ + pg_atomic_uint32 enc_key_tli; + pg_atomic_uint64 enc_key_lsn; } EncryptionStateData; static EncryptionStateData *EncryptionState = NULL; @@ -78,10 +83,24 @@ TDEXLogGetEncKeyLsn() return (XLogRecPtr) pg_atomic_read_u64(&EncryptionState->enc_key_lsn); } +static TimeLineID +TDEXLogGetEncKeyTli() +{ + return (TimeLineID) pg_atomic_read_u32(&EncryptionState->enc_key_tli); +} + static void -TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn) +TDEXLogSetEncKeyLocation(WalLocation loc) { - pg_atomic_write_u64(&EncryptionState->enc_key_lsn, start_lsn); + /* + * Write TLI first and then LSN. The barrier ensures writes won't be + * reordered. When reading, the opposite must be done (with a matching + * barrier in between), so we always see a valid TLI after observing a + * valid LSN. + */ + pg_atomic_write_u32(&EncryptionState->enc_key_tli, loc.tli); + pg_write_barrier(); + pg_atomic_write_u64(&EncryptionState->enc_key_lsn, loc.lsn); } static Size TDEXLogEncryptBuffSize(void); @@ -166,7 +185,8 @@ TDEXLogShmemInit(void) typedef struct EncryptionStateData { - XLogRecPtr enc_key_lsn; /* to sync with reader */ + TimeLineID enc_key_tli; + XLogRecPtr enc_key_lsn; } EncryptionStateData; static EncryptionStateData EncryptionStateD = {0}; @@ -181,10 +201,17 @@ TDEXLogGetEncKeyLsn() return (XLogRecPtr) EncryptionState->enc_key_lsn; } +static TimeLineID +TDEXLogGetEncKeyTli() +{ + return (TimeLineID) EncryptionState->enc_key_tli; +} + static void -TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn) +TDEXLogSetEncKeyLocation(WalLocation loc) { - EncryptionState->enc_key_lsn = EncryptionKey.start_lsn; + EncryptionState->enc_key_tli = loc.tli; + EncryptionState->enc_key_lsn = loc.lsn; } #endif /* FRONTEND */ @@ -216,7 +243,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog) else if (key) { EncryptionKey = *key; - TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn); + TDEXLogSetEncKeyLocation(EncryptionKey.wal_start); } if (key) @@ -231,7 +258,7 @@ TDEXLogSmgrInitWriteReuseKey() if (key) { EncryptionKey = *key; - TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn); + TDEXLogSetEncKeyLocation(EncryptionKey.wal_start); pfree(key); } } @@ -252,8 +279,8 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset, #endif #ifdef TDE_XLOG_DEBUG - elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X", - count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn)); + elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %u_%X/%X", + count, offset, offset, LSN_FORMAT_ARGS(segno), key->wal_start.tli, LSN_FORMAT_ARGS(key->wal_start.lsn)); #endif CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix); @@ -279,13 +306,13 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset, */ if (EncryptionKey.type != MAP_ENTRY_EMPTY && TDEXLogGetEncKeyLsn() == 0) { - XLogRecPtr lsn; + WalLocation loc = {.tli = tli}; - XLogSegNoOffsetToRecPtr(segno, offset, segSize, lsn); + XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn); - pg_tde_wal_last_key_set_lsn(lsn); - EncryptionKey.start_lsn = lsn; - TDEXLogSetEncKeyLsn(lsn); + pg_tde_wal_last_key_set_location(loc); + EncryptionKey.wal_start = loc; + TDEXLogSetEncKeyLocation(EncryptionKey.wal_start); } if (EncryptionKey.type == TDE_KEY_TYPE_WAL_ENCRYPTED) @@ -304,12 +331,12 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, ssize_t readsz; WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys(); XLogRecPtr write_key_lsn; - XLogRecPtr data_start; - XLogRecPtr data_end; + WalLocation data_end = {.tli = tli}; + WalLocation data_start = {.tli = tli}; #ifdef TDE_XLOG_DEBUG - elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %X/%X", - count, offset, offset, LSN_FORMAT_ARGS(segno)); + elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %u_%X/%X", + count, offset, offset, tli, LSN_FORMAT_ARGS(segno)); #endif readsz = pg_pread(fd, buf, count, offset); @@ -319,30 +346,38 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, if (!keys) { + WalLocation start = {.tli = 1,.lsn = 0}; + /* cache is empty, try to read keys from disk */ - keys = pg_tde_fetch_wal_keys(InvalidXLogRecPtr); + keys = pg_tde_fetch_wal_keys(start); } + /* + * The barrier ensures that we always read a vaild TLI after the valid + * LSN. See the comment in TDEXLogSetEncKeyLocation() + */ write_key_lsn = TDEXLogGetEncKeyLsn(); + pg_read_barrier(); if (!XLogRecPtrIsInvalid(write_key_lsn)) { WALKeyCacheRec *last_key = pg_tde_get_last_wal_key(); + WalLocation write_loc = {.tli = TDEXLogGetEncKeyTli(),.lsn = write_key_lsn}; Assert(last_key); /* write has generated a new key, need to fetch it */ - if (last_key->start_lsn < write_key_lsn) + if (wal_location_cmp(last_key->start, write_loc) < 0) { - pg_tde_fetch_wal_keys(write_key_lsn); + pg_tde_fetch_wal_keys(write_loc); /* in case cache was empty before */ keys = pg_tde_get_wal_cache_keys(); } } - XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start); - XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end); + XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start.lsn); + XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end.lsn); /* * TODO: this is higly ineffective. We should get rid of linked list and @@ -351,24 +386,24 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, for (WALKeyCacheRec *curr_key = keys; curr_key != NULL; curr_key = curr_key->next) { #ifdef TDE_XLOG_DEBUG - elog(DEBUG1, "WAL key %X/%X-%X/%X, encrypted: %s", - LSN_FORMAT_ARGS(curr_key->start_lsn), - LSN_FORMAT_ARGS(curr_key->end_lsn), + elog(DEBUG1, "WAL key %u_%X/%X - %u_%X/%X, encrypted: %s", + curr_key->start.tli, LSN_FORMAT_ARGS(curr_key->start.lsn), + curr_key->end.tli, LSN_FORMAT_ARGS(curr_key->end.lsn), curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED ? "yes" : "no"); #endif - if (curr_key->key.start_lsn != InvalidXLogRecPtr && + if (wal_location_valid(curr_key->key.wal_start) && curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED) { /* * Check if the key's range overlaps with the buffer's and decypt * the part that does. */ - if (data_start < curr_key->end_lsn && data_end > curr_key->start_lsn) + if (wal_location_cmp(data_start, curr_key->end) < 0 && wal_location_cmp(data_end, curr_key->start) > 0) { char iv_prefix[16]; - off_t dec_off = XLogSegmentOffset(Max(data_start, curr_key->start_lsn), segSize); - off_t dec_end = XLogSegmentOffset(Min(data_end, curr_key->end_lsn), segSize); + off_t dec_off = XLogSegmentOffset(Max(data_start.lsn, curr_key->start.lsn), segSize); + off_t dec_end = XLogSegmentOffset(Min(data_end.lsn, curr_key->end.lsn), segSize); size_t dec_sz; char *dec_buf = (char *) buf + (dec_off - offset); @@ -385,8 +420,8 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset, dec_sz = dec_end - dec_off; #ifdef TDE_XLOG_DEBUG - elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %X/%X", - dec_off, dec_off - offset, dec_sz, LSN_FORMAT_ARGS(curr_key->key->start_lsn)); + elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X", + dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn)); #endif pg_tde_stream_crypt(iv_prefix, dec_off, diff --git a/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h b/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h index e1c6efc665b97..68835ac11e835 100644 --- a/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h +++ b/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h @@ -6,13 +6,47 @@ #include "access/pg_tde_tdemap.h" #include "catalog/tde_principal_key.h" +typedef struct WalLocation +{ + XLogRecPtr lsn; + TimeLineID tli; +} WalLocation; + +/* + * Compares given WAL locations and returns -1 if l1 < l2, 0 if l1 == l2, + * and 1 if l1 > l2 + */ +static inline int +wal_location_cmp(WalLocation l1, WalLocation l2) +{ + if (unlikely(l1.tli < l2.tli)) + return -1; + + if (unlikely(l1.tli > l2.tli)) + return 1; + + if (l1.lsn < l2.lsn) + return -1; + + if (l1.lsn > l2.lsn) + return 1; + + return 0; +} + +static inline bool +wal_location_valid(WalLocation loc) +{ + return loc.tli != 0 && loc.lsn != InvalidXLogRecPtr; +} + typedef struct WalEncryptionKey { uint8 key[INTERNAL_KEY_LEN]; uint8 base_iv[INTERNAL_KEY_IV_LEN]; uint32 type; - XLogRecPtr start_lsn; + WalLocation wal_start; } WalEncryptionKey; /* @@ -21,8 +55,8 @@ typedef struct WalEncryptionKey */ typedef struct WALKeyCacheRec { - XLogRecPtr start_lsn; - XLogRecPtr end_lsn; + WalLocation start; + WalLocation end; WalEncryptionKey key; void *crypt_ctx; @@ -33,7 +67,7 @@ typedef struct WALKeyCacheRec extern int pg_tde_count_wal_keys_in_file(void); extern void pg_tde_create_wal_key(WalEncryptionKey *rel_key_data, TDEMapEntryType entry_type); extern void pg_tde_delete_server_key(void); -extern WALKeyCacheRec *pg_tde_fetch_wal_keys(XLogRecPtr start_lsn); +extern WALKeyCacheRec *pg_tde_fetch_wal_keys(WalLocation start); extern WALKeyCacheRec *pg_tde_get_last_wal_key(void); extern TDESignedPrincipalKeyInfo *pg_tde_get_server_key_info(void); extern WALKeyCacheRec *pg_tde_get_wal_cache_keys(void); @@ -41,6 +75,6 @@ extern void pg_tde_perform_rotate_server_key(TDEPrincipalKey *principal_key, TDE extern WalEncryptionKey *pg_tde_read_last_wal_key(void); extern void pg_tde_save_server_key(const TDEPrincipalKey *principal_key, bool write_xlog); extern void pg_tde_save_server_key_redo(const TDESignedPrincipalKeyInfo *signed_key_info); -extern void pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn); +extern void pg_tde_wal_last_key_set_location(WalLocation loc); #endif /* PG_TDE_XLOG_KEYS_H */ diff --git a/contrib/pg_tde/src/include/pg_tde_fe.h b/contrib/pg_tde/src/include/pg_tde_fe.h index 86d4f2377998d..7d11d9c6a85ad 100644 --- a/contrib/pg_tde/src/include/pg_tde_fe.h +++ b/contrib/pg_tde/src/include/pg_tde_fe.h @@ -88,6 +88,8 @@ static int tde_fe_error_level = 0; #define FreeFile(file) fclose(file) #define pg_fsync(fd) fsync(fd) + +#define pg_read_barrier() NULL #endif /* FRONTEND */ #endif /* PG_TDE_EREPORT_H */ diff --git a/contrib/pg_tde/t/wal_key_tli.pl b/contrib/pg_tde/t/wal_key_tli.pl new file mode 100644 index 0000000000000..c9ba707983176 --- /dev/null +++ b/contrib/pg_tde/t/wal_key_tli.pl @@ -0,0 +1,83 @@ + +# Copyright (c) 2021-2024, PostgreSQL Global Development Group + +# A copy pg_rewind_databases with added restart of the standby, which forces two +# WAL keys with the same LSN but different TLI on the primary after pg_rewind. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use RewindTest; + +sub run_test +{ + my $test_mode = shift; + + RewindTest::setup_cluster($test_mode, ['-g']); + RewindTest::start_primary(); + + # Create a database in primary with a table. + primary_psql('CREATE DATABASE inprimary'); + primary_psql('CREATE TABLE inprimary_tab (a int)', 'inprimary'); + + RewindTest::create_standby($test_mode); + + # Generates a new WAL key with the start LSN 0/300000. After running + # pg_rewind, the primary will end up with that key and another one with the + # same LSN 0/300000, but different TLI. + $node_standby->restart; + + # Create another database with another table, the creation is + # replicated to the standby. + primary_psql('CREATE DATABASE beforepromotion'); + primary_psql('CREATE TABLE beforepromotion_tab (a int)', + 'beforepromotion'); + + RewindTest::promote_standby(); + + # Create databases in the old primary and the new promoted standby. + primary_psql('CREATE DATABASE primary_afterpromotion'); + primary_psql('CREATE TABLE primary_promotion_tab (a int)', + 'primary_afterpromotion'); + standby_psql('CREATE DATABASE standby_afterpromotion'); + standby_psql('CREATE TABLE standby_promotion_tab (a int)', + 'standby_afterpromotion'); + + # The clusters are now diverged. + + RewindTest::run_pg_rewind($test_mode); + + # Check that the correct databases are present after pg_rewind. + check_query( + 'SELECT datname FROM pg_database ORDER BY 1', + qq(beforepromotion +inprimary +postgres +standby_afterpromotion +template0 +template1 +), + 'database names'); + + # Permissions on PGDATA should have group permissions + SKIP: + { + skip "unix-style permissions not supported on Windows", 1 + if ($windows_os || $Config::Config{osname} eq 'cygwin'); + + ok(check_mode_recursive($node_primary->data_dir(), 0750, 0640), + 'check PGDATA permissions'); + } + + RewindTest::clean_rewind_test(); + return; +} + +run_test('remote'); + +done_testing();