Skip to content

Commit 6409ebd

Browse files
committed
PG-1813 Make WAL keys TLI aware
Before this commit, WAL keys did not take the timeline ID (TLI) into account at all. But after pg_rewind, for example, pg_wal/ may contain segments from two timelines, and the WAL reader choosing the key may pick the wrong one because LSNs of different TLIs may overlap. There was also another bug: suppose there is a key with the start LSN 0/30000 in TLI 1, and after the start in TLI 2 the WAL writer creates a new key with the LSN 0/30000, but in TLI 2. The reader wouldn't fetch the latest key because, without the TLI, the two keys look the same. This commit adds the TLI to the internal keys and uses it along with the LSN for key comparisons.
1 parent 8d7192c commit 6409ebd

File tree

6 files changed

+213
-55
lines changed

6 files changed

+213
-55
lines changed

contrib/pg_tde/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ tap_tests = [
126126
't/unlogged_tables.pl',
127127
't/wal_archiving.pl',
128128
't/wal_encrypt.pl',
129+
't/wal_key_tli.pl',
129130
]
130131

131132
tests += {

contrib/pg_tde/src/access/pg_tde_xlog_keys.c

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#define PG_TDE_WAL_KEY_FILE_NAME "wal_keys"
2626

2727
#define MaxXLogRecPtr (~(XLogRecPtr)0)
28+
#define MaxTimeLineID (~(TimeLineID)0)
2829

2930
typedef struct WalKeyFileHeader
3031
{
@@ -44,7 +45,7 @@ typedef struct WalKeyFileEntry
4445
static WALKeyCacheRec *tde_wal_key_cache = NULL;
4546
static WALKeyCacheRec *tde_wal_key_last_rec = NULL;
4647

47-
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key, XLogRecPtr start_lsn);
48+
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key);
4849
static WalEncryptionKey *pg_tde_decrypt_wal_key(TDEPrincipalKey *principal_key, WalKeyFileEntry *entry);
4950
static void pg_tde_initialize_wal_key_file_entry(WalKeyFileEntry *entry, const TDEPrincipalKey *principal_key, const WalEncryptionKey *rel_key_data);
5051
static int pg_tde_open_wal_key_file_basic(const char *filename, int flags, bool ignore_missing);
@@ -69,7 +70,7 @@ get_wal_key_file_path(void)
6970
}
7071

7172
void
72-
pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
73+
pg_tde_wal_last_key_set_location(WalLocation loc)
7374
{
7475
LWLock *lock_pk = tde_lwlock_enc_keys();
7576
int fd;
@@ -85,9 +86,9 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
8586
write_pos = sizeof(WalKeyFileHeader) +
8687
(last_key_idx * sizeof(WalKeyFileEntry)) +
8788
offsetof(WalKeyFileEntry, enc_key) +
88-
offsetof(WalEncryptionKey, start_lsn);
89+
offsetof(WalEncryptionKey, wal_start);
8990

90-
if (pg_pwrite(fd, &lsn, sizeof(XLogRecPtr), write_pos) != sizeof(XLogRecPtr))
91+
if (pg_pwrite(fd, &loc, sizeof(WalLocation), write_pos) != sizeof(WalLocation))
9192
{
9293
ereport(ERROR,
9394
errcode_for_file_access(),
@@ -111,7 +112,7 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
111112
errmsg("could not read previous WAL key: %m"));
112113
}
113114

114-
if (prev_entry.enc_key.start_lsn >= lsn)
115+
if (wal_location_cmp(prev_entry.enc_key.wal_start, loc) >= 0)
115116
{
116117
prev_entry.enc_key.type = TDE_KEY_TYPE_WAL_INVALID;
117118

@@ -160,7 +161,8 @@ pg_tde_create_wal_key(WalEncryptionKey *rel_key_data, TDEMapEntryType entry_type
160161

161162
/* TODO: no need in generating key if TDE_KEY_TYPE_WAL_UNENCRYPTED */
162163
rel_key_data->type = entry_type;
163-
rel_key_data->start_lsn = InvalidXLogRecPtr;
164+
rel_key_data->wal_start.lsn = InvalidXLogRecPtr;
165+
rel_key_data->wal_start.tli = 0;
164166

165167
if (!RAND_bytes(rel_key_data->key, INTERNAL_KEY_LEN))
166168
ereport(ERROR,
@@ -245,7 +247,7 @@ pg_tde_read_last_wal_key(void)
245247

246248
/* Fetches WAL keys from disk and adds them to the WAL cache */
247249
WALKeyCacheRec *
248-
pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
250+
pg_tde_fetch_wal_keys(WalLocation start)
249251
{
250252
off_t read_pos = 0;
251253
LWLock *lock_pk = tde_lwlock_enc_keys();
@@ -276,10 +278,10 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
276278
{
277279
WALKeyCacheRec *wal_rec;
278280
WalEncryptionKey stub_key = {
279-
.start_lsn = InvalidXLogRecPtr,
281+
.wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr},
280282
};
281283

282-
wal_rec = pg_tde_add_wal_key_to_cache(&stub_key, InvalidXLogRecPtr);
284+
wal_rec = pg_tde_add_wal_key_to_cache(&stub_key);
283285

284286
#ifdef FRONTEND
285287
/* The backend frees it after copying to the cache. */
@@ -299,15 +301,15 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
299301
/*
300302
* Skip new (just created but not updated by write) and invalid keys
301303
*/
302-
if (entry.enc_key.start_lsn != InvalidXLogRecPtr &&
304+
if (wal_location_valid(entry.enc_key.wal_start) &&
303305
(entry.enc_key.type == TDE_KEY_TYPE_WAL_UNENCRYPTED ||
304306
entry.enc_key.type == TDE_KEY_TYPE_WAL_ENCRYPTED) &&
305-
entry.enc_key.start_lsn >= start_lsn)
307+
wal_location_cmp(entry.enc_key.wal_start, start) >= 0)
306308
{
307309
WalEncryptionKey *rel_key_data = pg_tde_decrypt_wal_key(principal_key, &entry);
308310
WALKeyCacheRec *wal_rec;
309311

310-
wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data, entry.enc_key.start_lsn);
312+
wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data);
311313

312314
pfree(rel_key_data);
313315

@@ -325,7 +327,7 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
325327
}
326328

327329
static WALKeyCacheRec *
328-
pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
330+
pg_tde_add_wal_key_to_cache(WalEncryptionKey *key)
329331
{
330332
WALKeyCacheRec *wal_rec;
331333
#ifndef FRONTEND
@@ -338,8 +340,9 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
338340
MemoryContextSwitchTo(oldCtx);
339341
#endif
340342

341-
wal_rec->start_lsn = start_lsn;
342-
wal_rec->end_lsn = MaxXLogRecPtr;
343+
wal_rec->start = key->wal_start;
344+
wal_rec->end.tli = MaxTimeLineID;
345+
wal_rec->end.lsn = MaxXLogRecPtr;
343346
wal_rec->key = *key;
344347
wal_rec->crypt_ctx = NULL;
345348
if (!tde_wal_key_last_rec)
@@ -350,7 +353,7 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
350353
else
351354
{
352355
tde_wal_key_last_rec->next = wal_rec;
353-
tde_wal_key_last_rec->end_lsn = wal_rec->start_lsn;
356+
tde_wal_key_last_rec->end = wal_rec->start;
354357
tde_wal_key_last_rec = wal_rec;
355358
}
356359

contrib/pg_tde/src/access/pg_tde_xlog_smgr.c

Lines changed: 69 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ static void *EncryptionCryptCtx = NULL;
4545
static WalEncryptionKey EncryptionKey =
4646
{
4747
.type = MAP_ENTRY_EMPTY,
48-
.start_lsn = InvalidXLogRecPtr,
48+
.wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr},
4949
};
5050

5151
/*
@@ -65,7 +65,12 @@ static WalEncryptionKey EncryptionKey =
6565

6666
typedef struct EncryptionStateData
6767
{
68-
pg_atomic_uint64 enc_key_lsn; /* to sync with readers */
68+
/*
69+
* To sync with readers. We sync on LSN only; the TLI is here just to
70+
* communicate its value to readers.
71+
*/
72+
pg_atomic_uint32 enc_key_tli;
73+
pg_atomic_uint64 enc_key_lsn;
6974
} EncryptionStateData;
7075

7176
static EncryptionStateData *EncryptionState = NULL;
@@ -78,10 +83,24 @@ TDEXLogGetEncKeyLsn()
7883
return (XLogRecPtr) pg_atomic_read_u64(&EncryptionState->enc_key_lsn);
7984
}
8085

86+
static TimeLineID
87+
TDEXLogGetEncKeyTli()
88+
{
89+
return (TimeLineID) pg_atomic_read_u32(&EncryptionState->enc_key_tli);
90+
}
91+
8192
static void
82-
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
93+
TDEXLogSetEncKeyLocation(WalLocation loc)
8394
{
84-
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, start_lsn);
95+
/*
96+
* Write TLI first and then LSN. The barrier ensures writes won't be
97+
* reordered. When reading, the opposite must be done (with a matching
98+
* barrier in between), so we always see a valid TLI after observing a
99+
* valid LSN.
100+
*/
101+
pg_atomic_write_u32(&EncryptionState->enc_key_tli, loc.tli);
102+
pg_write_barrier();
103+
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, loc.lsn);
85104
}
86105

87106
static Size TDEXLogEncryptBuffSize(void);
@@ -166,7 +185,8 @@ TDEXLogShmemInit(void)
166185

167186
typedef struct EncryptionStateData
168187
{
169-
XLogRecPtr enc_key_lsn; /* to sync with reader */
188+
XLogRecPtr enc_key_tli;
189+
XLogRecPtr enc_key_lsn;
170190
} EncryptionStateData;
171191

172192
static EncryptionStateData EncryptionStateD = {0};
@@ -181,10 +201,17 @@ TDEXLogGetEncKeyLsn()
181201
return (XLogRecPtr) EncryptionState->enc_key_lsn;
182202
}
183203

204+
static TimeLineID
205+
TDEXLogGetEncKeyTli()
206+
{
207+
return (TimeLineID) EncryptionState->enc_key_tli;
208+
}
209+
184210
static void
185-
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
211+
TDEXLogSetEncKeyLocation(WalLocation loc)
186212
{
187-
EncryptionState->enc_key_lsn = EncryptionKey.start_lsn;
213+
EncryptionState->enc_key_tli = loc.tli;
214+
EncryptionState->enc_key_lsn = loc.lsn;
188215
}
189216

190217
#endif /* FRONTEND */
@@ -216,7 +243,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
216243
else if (key)
217244
{
218245
EncryptionKey = *key;
219-
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
246+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
220247
}
221248

222249
if (key)
@@ -231,7 +258,7 @@ TDEXLogSmgrInitWriteReuseKey()
231258
if (key)
232259
{
233260
EncryptionKey = *key;
234-
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
261+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
235262
pfree(key);
236263
}
237264
}
@@ -252,8 +279,8 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
252279
#endif
253280

254281
#ifdef TDE_XLOG_DEBUG
255-
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X",
256-
count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn));
282+
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %u_%X/%X",
283+
count, offset, offset, LSN_FORMAT_ARGS(segno), key->wal_start.tli, LSN_FORMAT_ARGS(key->wal_start.lsn));
257284
#endif
258285

259286
CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
@@ -279,13 +306,13 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
279306
*/
280307
if (EncryptionKey.type != MAP_ENTRY_EMPTY && TDEXLogGetEncKeyLsn() == 0)
281308
{
282-
XLogRecPtr lsn;
309+
WalLocation loc = {.tli = tli};
283310

284-
XLogSegNoOffsetToRecPtr(segno, offset, segSize, lsn);
311+
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
285312

286-
pg_tde_wal_last_key_set_lsn(lsn);
287-
EncryptionKey.start_lsn = lsn;
288-
TDEXLogSetEncKeyLsn(lsn);
313+
pg_tde_wal_last_key_set_location(loc);
314+
EncryptionKey.wal_start = loc;
315+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
289316
}
290317

291318
if (EncryptionKey.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
@@ -304,12 +331,12 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
304331
ssize_t readsz;
305332
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
306333
XLogRecPtr write_key_lsn;
307-
XLogRecPtr data_start;
308-
XLogRecPtr data_end;
334+
WalLocation data_end = {.tli = tli};
335+
WalLocation data_start = {.tli = tli};
309336

310337
#ifdef TDE_XLOG_DEBUG
311-
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %X/%X",
312-
count, offset, offset, LSN_FORMAT_ARGS(segno));
338+
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %u_%X/%X",
339+
count, offset, offset, tli, LSN_FORMAT_ARGS(segno));
313340
#endif
314341

315342
readsz = pg_pread(fd, buf, count, offset);
@@ -319,30 +346,38 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
319346

320347
if (!keys)
321348
{
349+
WalLocation start = {.tli = 1,.lsn = 0};
350+
322351
/* cache is empty, try to read keys from disk */
323-
keys = pg_tde_fetch_wal_keys(InvalidXLogRecPtr);
352+
keys = pg_tde_fetch_wal_keys(start);
324353
}
325354

355+
/*
356+
* The barrier ensures that we always read a valid TLI after the valid
357+
* LSN. See the comment in TDEXLogSetEncKeyLocation()
358+
*/
326359
write_key_lsn = TDEXLogGetEncKeyLsn();
360+
pg_read_barrier();
327361

328362
if (!XLogRecPtrIsInvalid(write_key_lsn))
329363
{
330364
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
365+
WalLocation write_loc = {.tli = TDEXLogGetEncKeyTli(),.lsn = write_key_lsn};
331366

332367
Assert(last_key);
333368

334369
/* write has generated a new key, need to fetch it */
335-
if (last_key->start_lsn < write_key_lsn)
370+
if (wal_location_cmp(last_key->start, write_loc) < 0)
336371
{
337-
pg_tde_fetch_wal_keys(write_key_lsn);
372+
pg_tde_fetch_wal_keys(write_loc);
338373

339374
/* in case cache was empty before */
340375
keys = pg_tde_get_wal_cache_keys();
341376
}
342377
}
343378

344-
XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start);
345-
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end);
379+
XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start.lsn);
380+
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end.lsn);
346381

347382
/*
348383
* TODO: this is highly inefficient. We should get rid of the linked list and
@@ -351,24 +386,24 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
351386
for (WALKeyCacheRec *curr_key = keys; curr_key != NULL; curr_key = curr_key->next)
352387
{
353388
#ifdef TDE_XLOG_DEBUG
354-
elog(DEBUG1, "WAL key %X/%X-%X/%X, encrypted: %s",
355-
LSN_FORMAT_ARGS(curr_key->start_lsn),
356-
LSN_FORMAT_ARGS(curr_key->end_lsn),
389+
elog(DEBUG1, "WAL key %u_%X/%X - %u_%X/%X, encrypted: %s",
390+
curr_key->start.tli, LSN_FORMAT_ARGS(curr_key->start.lsn),
391+
curr_key->end.tli, LSN_FORMAT_ARGS(curr_key->end.lsn),
357392
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED ? "yes" : "no");
358393
#endif
359394

360-
if (curr_key->key.start_lsn != InvalidXLogRecPtr &&
395+
if (wal_location_valid(curr_key->key.wal_start) &&
361396
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
362397
{
363398
/*
364399
* Check if the key's range overlaps with the buffer's and decrypt
365400
* the part that does.
366401
*/
367-
if (data_start < curr_key->end_lsn && data_end > curr_key->start_lsn)
402+
if (wal_location_cmp(data_start, curr_key->end) < 0 && wal_location_cmp(data_end, curr_key->start) > 0)
368403
{
369404
char iv_prefix[16];
370-
off_t dec_off = XLogSegmentOffset(Max(data_start, curr_key->start_lsn), segSize);
371-
off_t dec_end = XLogSegmentOffset(Min(data_end, curr_key->end_lsn), segSize);
405+
off_t dec_off = XLogSegmentOffset(Max(data_start.lsn, curr_key->start.lsn), segSize);
406+
off_t dec_end = XLogSegmentOffset(Min(data_end.lsn, curr_key->end.lsn), segSize);
372407
size_t dec_sz;
373408
char *dec_buf = (char *) buf + (dec_off - offset);
374409

@@ -385,8 +420,8 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
385420
dec_sz = dec_end - dec_off;
386421

387422
#ifdef TDE_XLOG_DEBUG
388-
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %X/%X",
389-
dec_off, dec_off - offset, dec_sz, LSN_FORMAT_ARGS(curr_key->key->start_lsn));
423+
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
424+
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
390425
#endif
391426
pg_tde_stream_crypt(iv_prefix,
392427
dec_off,

0 commit comments

Comments
 (0)