Skip to content

Commit 4a2d9fc

Browse files
committed
Make WAL keys TLI aware
Before this commit, WAL keys were not aware of the TLI at all. But after pg_rewind, for example, pg_wal/ may contain segments from two timelines, and the WAL reader choosing the key may pick the wrong one because LSNs of different TLIs may overlap. There was also another bug: suppose there is a key with the start LSN 0/30000 in TLI 1, and after the start in TLI 2 the WAL writer creates a new key with the same LSN 0/30000, but in TLI 2. The reader wouldn't fetch the latest key because, without the TLI, these two keys look the same. This commit adds the TLI to the internal keys and uses it along with the LSN for key comparisons.
1 parent f939a24 commit 4a2d9fc

File tree

4 files changed

+116
-51
lines changed

4 files changed

+116
-51
lines changed

contrib/pg_tde/src/access/pg_tde_tdemap.c

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#define TDE_FILE_HEADER_SIZE sizeof(TDEFileHeader)
4545

4646
#define MaxXLogRecPtr (~(XLogRecPtr)0)
47+
#define MaxTimeLineID (~(TimeLineID)0)
4748

4849
typedef struct TDEFileHeader
4950
{
@@ -65,7 +66,7 @@ static void pg_tde_file_header_read(const char *tde_filename, int fd, TDEFileHea
6566
static int pg_tde_file_header_write(const char *tde_filename, int fd, const TDESignedPrincipalKeyInfo *signed_key_info, off_t *bytes_written);
6667
static bool pg_tde_read_one_map_entry(int fd, TDEMapEntry *map_entry, off_t *offset);
6768
static void pg_tde_read_one_map_entry2(int keydata_fd, int32 key_index, TDEMapEntry *map_entry, Oid databaseId);
68-
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(InternalKey *cached_key, XLogRecPtr start_lsn);
69+
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(InternalKey *cached_key);
6970

7071
#ifndef FRONTEND
7172
static void pg_tde_sign_principal_key_info(TDESignedPrincipalKeyInfo *signed_key_info, const TDEPrincipalKey *principal_key);
@@ -369,22 +370,23 @@ pg_tde_delete_principal_key(Oid dbOid)
369370
* needs keyfile_path
370371
*/
371372
void
372-
pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, const char *keyfile_path)
373+
pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, TimeLineID tli, const char *keyfile_path)
373374
{
374375
LWLock *lock_pk = tde_lwlock_enc_keys();
375376
int fd;
376377
off_t read_pos,
377378
write_pos,
378379
last_key_idx;
380+
WALLocation loc = {.tli = tli, .lsn = lsn};
379381

380382
LWLockAcquire(lock_pk, LW_EXCLUSIVE);
381383

382384
fd = pg_tde_open_file_write(keyfile_path, NULL, false, &read_pos);
383385

384386
last_key_idx = ((lseek(fd, 0, SEEK_END) - TDE_FILE_HEADER_SIZE) / MAP_ENTRY_SIZE) - 1;
385-
write_pos = TDE_FILE_HEADER_SIZE + (last_key_idx * MAP_ENTRY_SIZE) + offsetof(TDEMapEntry, enc_key) + offsetof(InternalKey, start_lsn);
387+
write_pos = TDE_FILE_HEADER_SIZE + (last_key_idx * MAP_ENTRY_SIZE) + offsetof(TDEMapEntry, enc_key) + offsetof(InternalKey, wal_start);
386388

387-
if (pg_pwrite(fd, &lsn, sizeof(XLogRecPtr), write_pos) != sizeof(XLogRecPtr))
389+
if (pg_pwrite(fd, &loc, sizeof(WALLocation), write_pos) != sizeof(WALLocation))
388390
{
389391
ereport(ERROR,
390392
errcode_for_file_access(),
@@ -408,7 +410,7 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn, const char *keyfile_path)
408410
errmsg("could not read previous WAL key: %m"));
409411
}
410412

411-
if (prev_map_entry.enc_key.start_lsn >= lsn)
413+
if (wal_location_cmp(prev_map_entry.enc_key.wal_start, loc) >= 0)
412414
{
413415
prev_map_entry.enc_key.type = TDE_KEY_TYPE_WAL_INVALID;
414416

@@ -1035,7 +1037,7 @@ pg_tde_read_last_wal_key(void)
10351037

10361038
/* Fetches WAL keys from disk and adds them to the WAL cache */
10371039
WALKeyCacheRec *
1038-
pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
1040+
pg_tde_fetch_wal_keys(WALLocation start)
10391041
{
10401042
RelFileLocator rlocator = GLOBAL_SPACE_RLOCATOR(XLOG_TDE_OID);
10411043
char db_map_path[MAXPGPATH];
@@ -1070,10 +1072,10 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
10701072
{
10711073
WALKeyCacheRec *wal_rec;
10721074
InternalKey stub_key = {
1073-
.start_lsn = InvalidXLogRecPtr,
1075+
.wal_start = {.tli = 0, .lsn = InvalidXLogRecPtr},
10741076
};
10751077

1076-
wal_rec = pg_tde_add_wal_key_to_cache(&stub_key, InvalidXLogRecPtr);
1078+
wal_rec = pg_tde_add_wal_key_to_cache(&stub_key);
10771079

10781080
#ifdef FRONTEND
10791081
/* The backend frees it after copying to the cache. */
@@ -1093,15 +1095,15 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
10931095
/*
10941096
* Skip new (just created but not updated by write) and invalid keys
10951097
*/
1096-
if (map_entry.enc_key.start_lsn != InvalidXLogRecPtr &&
1098+
if (wal_location_valid(map_entry.enc_key.wal_start) &&
10971099
(map_entry.enc_key.type == TDE_KEY_TYPE_WAL_UNENCRYPTED ||
10981100
map_entry.enc_key.type == TDE_KEY_TYPE_WAL_ENCRYPTED) &&
1099-
map_entry.enc_key.start_lsn >= start_lsn)
1101+
wal_location_cmp(map_entry.enc_key.wal_start, start) >= 0)
11001102
{
11011103
InternalKey *rel_key_data = tde_decrypt_rel_key(principal_key, &map_entry);
11021104
WALKeyCacheRec *wal_rec;
11031105

1104-
wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data, map_entry.enc_key.start_lsn);
1106+
wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data);
11051107

11061108
pfree(rel_key_data);
11071109

@@ -1119,7 +1121,7 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
11191121
}
11201122

11211123
static WALKeyCacheRec *
1122-
pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn)
1124+
pg_tde_add_wal_key_to_cache(InternalKey *key)
11231125
{
11241126
WALKeyCacheRec *wal_rec;
11251127
#ifndef FRONTEND
@@ -1132,8 +1134,9 @@ pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn)
11321134
MemoryContextSwitchTo(oldCtx);
11331135
#endif
11341136

1135-
wal_rec->start_lsn = start_lsn;
1136-
wal_rec->end_lsn = MaxXLogRecPtr;
1137+
wal_rec->start = key->wal_start;
1138+
wal_rec->end.tli = MaxTimeLineID;
1139+
wal_rec->end.lsn = MaxXLogRecPtr;
11371140
wal_rec->key = *key;
11381141
wal_rec->crypt_ctx = NULL;
11391142
if (!tde_wal_key_last_rec)
@@ -1144,7 +1147,7 @@ pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn)
11441147
else
11451148
{
11461149
tde_wal_key_last_rec->next = wal_rec;
1147-
tde_wal_key_last_rec->end_lsn = wal_rec->start_lsn;
1150+
tde_wal_key_last_rec->end = wal_rec->start;
11481151
tde_wal_key_last_rec = wal_rec;
11491152
}
11501153

contrib/pg_tde/src/access/pg_tde_xlog_smgr.c

Lines changed: 57 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ static void *EncryptionCryptCtx = NULL;
4545
static InternalKey EncryptionKey =
4646
{
4747
.type = MAP_ENTRY_EMPTY,
48-
.start_lsn = InvalidXLogRecPtr,
48+
.wal_start = {.tli = 0, .lsn = InvalidXLogRecPtr},
4949
};
5050

5151
/*
@@ -66,7 +66,13 @@ static InternalKey EncryptionKey =
6666
typedef struct EncryptionStateData
6767
{
6868
char db_map_path[MAXPGPATH];
69-
pg_atomic_uint64 enc_key_lsn; /* to sync with readers */
69+
70+
/*
71+
* To sync with readers. We sync on LSN only and TLI here just to
72+
* communicate its value to readers.
73+
*/
74+
pg_atomic_uint32 enc_key_tli;
75+
pg_atomic_uint64 enc_key_lsn;
7076
} EncryptionStateData;
7177

7278
static EncryptionStateData *EncryptionState = NULL;
@@ -80,11 +86,18 @@ TDEXLogGetEncKeyLsn()
8086
}
8187

8288
static void
83-
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
89+
TDEXLogSetEncKeyLocation(WALLocation loc)
8490
{
85-
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, start_lsn);
91+
pg_atomic_init_u32(&EncryptionState->enc_key_tli, loc.tli);
92+
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, loc.lsn);
8693
}
8794

95+
static TimeLineID
96+
TDEXLogGetEncKeyTli()
97+
{
98+
return (TimeLineID) pg_atomic_read_u32(&EncryptionState->enc_key_tli);
99+
}
100+
88101
static Size TDEXLogEncryptBuffSize(void);
89102

90103
static int XLOGChooseNumBuffers(void);
@@ -159,6 +172,7 @@ TDEXLogShmemInit(void)
159172
}
160173

161174
pg_atomic_init_u64(&EncryptionState->enc_key_lsn, 0);
175+
pg_atomic_init_u32(&EncryptionState->enc_key_tli, 0);
162176

163177
elog(DEBUG1, "pg_tde: initialized encryption buffer %lu bytes", TDEXLogEncryptStateSize());
164178
}
@@ -168,6 +182,7 @@ TDEXLogShmemInit(void)
168182
typedef struct EncryptionStateData
169183
{
170184
char db_map_path[MAXPGPATH];
185+
XLogRecPtr enc_key_tli; /* to sync with reader */
171186
XLogRecPtr enc_key_lsn; /* to sync with reader */
172187
} EncryptionStateData;
173188

@@ -184,9 +199,16 @@ TDEXLogGetEncKeyLsn()
184199
}
185200

186201
static void
187-
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
202+
TDEXLogSetEncKeyLocation(WALLocation loc)
203+
{
204+
EncryptionState->enc_key_tli = loc.tli;
205+
EncryptionState->enc_key_lsn = loc.lsn;
206+
}
207+
208+
static TimeLineID
209+
TDEXLogGetEncKeyTli()
188210
{
189-
EncryptionState->enc_key_lsn = EncryptionKey.start_lsn;
211+
return (TimeLineID) EncryptionState->enc_key_tli;
190212
}
191213

192214
#endif /* FRONTEND */
@@ -197,6 +219,7 @@ TDEXLogSmgrInit()
197219
SetXLogSmgr(&tde_xlog_smgr);
198220
}
199221

222+
/* On backend it should be called only during the startup */
200223
void
201224
TDEXLogSmgrInitWrite(bool encrypt_xlog)
202225
{
@@ -220,7 +243,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
220243
else if (key)
221244
{
222245
EncryptionKey = *key;
223-
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
246+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
224247
}
225248

226249
if (key)
@@ -245,8 +268,8 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
245268
#endif
246269

247270
#ifdef TDE_XLOG_DEBUG
248-
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X",
249-
count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn));
271+
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %u_%X/%X",
272+
count, offset, offset, LSN_FORMAT_ARGS(segno), key->wal_start.tli, LSN_FORMAT_ARGS(key->wal_start.lsn));
250273
#endif
251274

252275
CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
@@ -272,9 +295,10 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
272295

273296
XLogSegNoOffsetToRecPtr(segno, offset, segSize, lsn);
274297

275-
pg_tde_wal_last_key_set_lsn(lsn, EncryptionState->db_map_path);
276-
EncryptionKey.start_lsn = lsn;
277-
TDEXLogSetEncKeyLsn(lsn);
298+
pg_tde_wal_last_key_set_lsn(lsn, tli, EncryptionState->db_map_path);
299+
EncryptionKey.wal_start.tli = tli;
300+
EncryptionKey.wal_start.lsn = lsn;
301+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
278302
}
279303

280304
if (EncryptionKey.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
@@ -293,12 +317,12 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
293317
ssize_t readsz;
294318
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
295319
XLogRecPtr write_key_lsn;
296-
XLogRecPtr data_start;
297-
XLogRecPtr data_end;
320+
WALLocation data_start = {.tli = tli};
321+
WALLocation data_end = {.tli = tli};
298322

299323
#ifdef TDE_XLOG_DEBUG
300-
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %X/%X",
301-
count, offset, offset, LSN_FORMAT_ARGS(segno));
324+
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %u_%X/%X",
325+
count, offset, offset, tli, LSN_FORMAT_ARGS(segno));
302326
#endif
303327

304328
readsz = pg_pread(fd, buf, count, offset);
@@ -308,30 +332,32 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
308332

309333
if (!keys)
310334
{
335+
WALLocation start = {.tli = 1, .lsn = 0};
311336
/* cache is empty, try to read keys from disk */
312-
keys = pg_tde_fetch_wal_keys(InvalidXLogRecPtr);
337+
keys = pg_tde_fetch_wal_keys(start);
313338
}
314339

315340
write_key_lsn = TDEXLogGetEncKeyLsn();
316341

317342
if (!XLogRecPtrIsInvalid(write_key_lsn))
318343
{
319344
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
345+
WALLocation write_loc = {.tli = TDEXLogGetEncKeyTli(), .lsn = write_key_lsn};
320346

321347
Assert(last_key);
322348

323349
/* write has generated a new key, need to fetch it */
324-
if (last_key->start_lsn < write_key_lsn)
350+
if (wal_location_cmp(last_key->start, write_loc) < 0)
325351
{
326-
pg_tde_fetch_wal_keys(write_key_lsn);
352+
pg_tde_fetch_wal_keys(write_loc);
327353

328354
/* in case cache was empty before */
329355
keys = pg_tde_get_wal_cache_keys();
330356
}
331357
}
332358

333-
XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start);
334-
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end);
359+
XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start.lsn);
360+
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end.lsn);
335361

336362
/*
337363
* TODO: this is highly inefficient. We should get rid of linked list and
@@ -340,24 +366,25 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
340366
for (WALKeyCacheRec *curr_key = keys; curr_key != NULL; curr_key = curr_key->next)
341367
{
342368
#ifdef TDE_XLOG_DEBUG
343-
elog(DEBUG1, "WAL key %X/%X-%X/%X, encrypted: %s",
344-
LSN_FORMAT_ARGS(curr_key->start_lsn),
345-
LSN_FORMAT_ARGS(curr_key->end_lsn),
369+
elog(DEBUG1, "WAL key %u_%X/%X - %u_%X/%X, encrypted: %s",
370+
curr_key->start.tli, LSN_FORMAT_ARGS(curr_key->start.lsn),
371+
curr_key->end.tli, LSN_FORMAT_ARGS(curr_key->end.lsn),
346372
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED ? "yes" : "no");
347373
#endif
348374

349-
if (curr_key->key.start_lsn != InvalidXLogRecPtr &&
375+
if (wal_location_valid(curr_key->key.wal_start) &&
350376
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
351377
{
352378
/*
353379
* Check if the key's range overlaps with the buffer's and decrypt
354380
* the part that does.
355381
*/
356-
if (data_start < curr_key->end_lsn && data_end > curr_key->start_lsn)
382+
383+
if (wal_location_cmp(data_start, curr_key->end) < 0 && wal_location_cmp(data_end, curr_key->start) > 0)
357384
{
358385
char iv_prefix[16];
359-
off_t dec_off = XLogSegmentOffset(Max(data_start, curr_key->start_lsn), segSize);
360-
off_t dec_end = XLogSegmentOffset(Min(data_end, curr_key->end_lsn), segSize);
386+
off_t dec_off = XLogSegmentOffset(Max(data_start.lsn, curr_key->start.lsn), segSize);
387+
off_t dec_end = XLogSegmentOffset(Min(data_end.lsn, curr_key->end.lsn), segSize);
361388
size_t dec_sz;
362389
char *dec_buf = (char *) buf + (dec_off - offset);
363390

@@ -374,8 +401,8 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
374401
dec_sz = dec_end - dec_off;
375402

376403
#ifdef TDE_XLOG_DEBUG
377-
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %X/%X",
378-
dec_off, dec_off - offset, dec_sz, LSN_FORMAT_ARGS(curr_key->key->start_lsn));
404+
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
405+
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
379406
#endif
380407
pg_tde_stream_crypt(iv_prefix, dec_off, dec_buf, dec_sz, dec_buf,
381408
&curr_key->key, &curr_key->crypt_ctx);

contrib/pg_tde/src/encryption/enc_tde.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ void
3030
pg_tde_generate_internal_key(InternalKey *int_key, TDEMapEntryType entry_type)
3131
{
3232
int_key->type = entry_type;
33-
int_key->start_lsn = InvalidXLogRecPtr;
33+
int_key->wal_start.tli = 0;
34+
int_key->wal_start.lsn = InvalidXLogRecPtr;
3435

3536
if (!RAND_bytes(int_key->key, INTERNAL_KEY_LEN))
3637
ereport(ERROR,

0 commit comments

Comments
 (0)