@@ -45,7 +45,7 @@ static void *EncryptionCryptCtx = NULL;
45
45
static WalEncryptionKey EncryptionKey =
46
46
{
47
47
.type = MAP_ENTRY_EMPTY ,
48
- .start_lsn = InvalidXLogRecPtr ,
48
+ .wal_start = {. tli = 0 ,. lsn = InvalidXLogRecPtr } ,
49
49
};
50
50
51
51
/*
@@ -65,7 +65,12 @@ static WalEncryptionKey EncryptionKey =
65
65
66
66
typedef struct EncryptionStateData
67
67
{
68
- pg_atomic_uint64 enc_key_lsn ; /* to sync with readers */
68
+ /*
69
+ * To sync with readers. We sync on the LSN only; the TLI is here just to
70
+ * communicate its value to readers.
71
+ */
72
+ pg_atomic_uint32 enc_key_tli ;
73
+ pg_atomic_uint64 enc_key_lsn ;
69
74
} EncryptionStateData ;
70
75
71
76
static EncryptionStateData * EncryptionState = NULL ;
@@ -78,10 +83,24 @@ TDEXLogGetEncKeyLsn()
78
83
return (XLogRecPtr ) pg_atomic_read_u64 (& EncryptionState -> enc_key_lsn );
79
84
}
80
85
86
+ static TimeLineID
87
+ TDEXLogGetEncKeyTli ()
88
+ {
89
+ return (TimeLineID ) pg_atomic_read_u32 (& EncryptionState -> enc_key_tli );
90
+ }
91
+
81
92
static void
82
- TDEXLogSetEncKeyLsn ( XLogRecPtr start_lsn )
93
+ TDEXLogSetEncKeyLocation ( WalLocation loc )
83
94
{
84
- pg_atomic_write_u64 (& EncryptionState -> enc_key_lsn , start_lsn );
95
+ /*
96
+ * Write TLI first and then LSN. The barrier ensures writes won't be
97
+ * reordered. When reading, the opposite must be done (with a matching
98
+ * barrier in between), so we always see a valid TLI after observing a
99
+ * valid LSN.
100
+ */
101
+ pg_atomic_write_u32 (& EncryptionState -> enc_key_tli , loc .tli );
102
+ pg_write_barrier ();
103
+ pg_atomic_write_u64 (& EncryptionState -> enc_key_lsn , loc .lsn );
85
104
}
86
105
87
106
static Size TDEXLogEncryptBuffSize (void );
@@ -166,7 +185,8 @@ TDEXLogShmemInit(void)
166
185
167
186
typedef struct EncryptionStateData
168
187
{
169
- XLogRecPtr enc_key_lsn ; /* to sync with reader */
188
+ XLogRecPtr enc_key_tli ;
189
+ XLogRecPtr enc_key_lsn ;
170
190
} EncryptionStateData ;
171
191
172
192
static EncryptionStateData EncryptionStateD = {0 };
@@ -181,10 +201,17 @@ TDEXLogGetEncKeyLsn()
181
201
return (XLogRecPtr ) EncryptionState -> enc_key_lsn ;
182
202
}
183
203
204
+ static TimeLineID
205
+ TDEXLogGetEncKeyTli ()
206
+ {
207
+ return (TimeLineID ) EncryptionState -> enc_key_tli ;
208
+ }
209
+
184
210
static void
185
- TDEXLogSetEncKeyLsn ( XLogRecPtr start_lsn )
211
+ TDEXLogSetEncKeyLocation ( WalLocation loc )
186
212
{
187
- EncryptionState -> enc_key_lsn = EncryptionKey .start_lsn ;
213
+ EncryptionState -> enc_key_tli = loc .tli ;
214
+ EncryptionState -> enc_key_lsn = loc .lsn ;
188
215
}
189
216
190
217
#endif /* FRONTEND */
@@ -216,7 +243,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
216
243
else if (key )
217
244
{
218
245
EncryptionKey = * key ;
219
- TDEXLogSetEncKeyLsn (EncryptionKey .start_lsn );
246
+ TDEXLogSetEncKeyLocation (EncryptionKey .wal_start );
220
247
}
221
248
222
249
if (key )
@@ -231,7 +258,7 @@ TDEXLogSmgrInitWriteReuseKey()
231
258
if (key )
232
259
{
233
260
EncryptionKey = * key ;
234
- TDEXLogSetEncKeyLsn (EncryptionKey .start_lsn );
261
+ TDEXLogSetEncKeyLocation (EncryptionKey .wal_start );
235
262
pfree (key );
236
263
}
237
264
}
@@ -252,8 +279,8 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
252
279
#endif
253
280
254
281
#ifdef TDE_XLOG_DEBUG
255
- elog (DEBUG1 , "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X" ,
256
- count , offset , offset , LSN_FORMAT_ARGS (segno ), LSN_FORMAT_ARGS (key -> start_lsn ));
282
+ elog (DEBUG1 , "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %u_% X/%X" ,
283
+ count , offset , offset , LSN_FORMAT_ARGS (segno ), key -> wal_start . tli , LSN_FORMAT_ARGS (key -> wal_start . lsn ));
257
284
#endif
258
285
259
286
CalcXLogPageIVPrefix (tli , segno , key -> base_iv , iv_prefix );
@@ -279,13 +306,13 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
279
306
*/
280
307
if (EncryptionKey .type != MAP_ENTRY_EMPTY && TDEXLogGetEncKeyLsn () == 0 )
281
308
{
282
- XLogRecPtr lsn ;
309
+ WalLocation loc = {. tli = tli } ;
283
310
284
- XLogSegNoOffsetToRecPtr (segno , offset , segSize , lsn );
311
+ XLogSegNoOffsetToRecPtr (segno , offset , segSize , loc . lsn );
285
312
286
- pg_tde_wal_last_key_set_lsn ( lsn );
287
- EncryptionKey .start_lsn = lsn ;
288
- TDEXLogSetEncKeyLsn ( lsn );
313
+ pg_tde_wal_last_key_set_location ( loc );
314
+ EncryptionKey .wal_start = loc ;
315
+ TDEXLogSetEncKeyLocation ( EncryptionKey . wal_start );
289
316
}
290
317
291
318
if (EncryptionKey .type == TDE_KEY_TYPE_WAL_ENCRYPTED )
@@ -304,12 +331,12 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
304
331
ssize_t readsz ;
305
332
WALKeyCacheRec * keys = pg_tde_get_wal_cache_keys ();
306
333
XLogRecPtr write_key_lsn ;
307
- XLogRecPtr data_start ;
308
- XLogRecPtr data_end ;
334
+ WalLocation data_end = {. tli = tli } ;
335
+ WalLocation data_start = {. tli = tli } ;
309
336
310
337
#ifdef TDE_XLOG_DEBUG
311
- elog (DEBUG1 , "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %X/%X" ,
312
- count , offset , offset , LSN_FORMAT_ARGS (segno ));
338
+ elog (DEBUG1 , "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %u_% X/%X" ,
339
+ count , offset , offset , tli , LSN_FORMAT_ARGS (segno ));
313
340
#endif
314
341
315
342
readsz = pg_pread (fd , buf , count , offset );
@@ -319,30 +346,38 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
319
346
320
347
if (!keys )
321
348
{
349
+ WalLocation start = {.tli = 1 ,.lsn = 0 };
350
+
322
351
/* cache is empty, try to read keys from disk */
323
- keys = pg_tde_fetch_wal_keys (InvalidXLogRecPtr );
352
+ keys = pg_tde_fetch_wal_keys (start );
324
353
}
325
354
355
+ /*
356
+ * The barrier ensures that we always read a valid TLI after the valid
357
+ * LSN. See the comment in TDEXLogSetEncKeyLocation()
358
+ */
326
359
write_key_lsn = TDEXLogGetEncKeyLsn ();
360
+ pg_read_barrier ();
327
361
328
362
if (!XLogRecPtrIsInvalid (write_key_lsn ))
329
363
{
330
364
WALKeyCacheRec * last_key = pg_tde_get_last_wal_key ();
365
+ WalLocation write_loc = {.tli = TDEXLogGetEncKeyTli (),.lsn = write_key_lsn };
331
366
332
367
Assert (last_key );
333
368
334
369
/* write has generated a new key, need to fetch it */
335
- if (last_key -> start_lsn < write_key_lsn )
370
+ if (wal_location_cmp ( last_key -> start , write_loc ) < 0 )
336
371
{
337
- pg_tde_fetch_wal_keys (write_key_lsn );
372
+ pg_tde_fetch_wal_keys (write_loc );
338
373
339
374
/* in case cache was empty before */
340
375
keys = pg_tde_get_wal_cache_keys ();
341
376
}
342
377
}
343
378
344
- XLogSegNoOffsetToRecPtr (segno , offset , segSize , data_start );
345
- XLogSegNoOffsetToRecPtr (segno , offset + readsz , segSize , data_end );
379
+ XLogSegNoOffsetToRecPtr (segno , offset , segSize , data_start . lsn );
380
+ XLogSegNoOffsetToRecPtr (segno , offset + readsz , segSize , data_end . lsn );
346
381
347
382
/*
348
383
* TODO: this is highly inefficient. We should get rid of linked list and
@@ -351,24 +386,24 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
351
386
for (WALKeyCacheRec * curr_key = keys ; curr_key != NULL ; curr_key = curr_key -> next )
352
387
{
353
388
#ifdef TDE_XLOG_DEBUG
354
- elog (DEBUG1 , "WAL key %X/%X- %X/%X, encrypted: %s" ,
355
- LSN_FORMAT_ARGS (curr_key -> start_lsn ),
356
- LSN_FORMAT_ARGS (curr_key -> end_lsn ),
389
+ elog (DEBUG1 , "WAL key %u_% X/%X - %u_ %X/%X, encrypted: %s" ,
390
+ curr_key -> start . tli , LSN_FORMAT_ARGS (curr_key -> start . lsn ),
391
+ curr_key -> end . tli , LSN_FORMAT_ARGS (curr_key -> end . lsn ),
357
392
curr_key -> key .type == TDE_KEY_TYPE_WAL_ENCRYPTED ? "yes" : "no" );
358
393
#endif
359
394
360
- if (curr_key -> key .start_lsn != InvalidXLogRecPtr &&
395
+ if (wal_location_valid ( curr_key -> key .wal_start ) &&
361
396
curr_key -> key .type == TDE_KEY_TYPE_WAL_ENCRYPTED )
362
397
{
363
398
/*
364
399
* Check if the key's range overlaps with the buffer's and decrypt
365
400
* the part that does.
366
401
*/
367
- if (data_start < curr_key -> end_lsn && data_end > curr_key -> start_lsn )
402
+ if (wal_location_cmp ( data_start , curr_key -> end ) < 0 && wal_location_cmp ( data_end , curr_key -> start ) > 0 )
368
403
{
369
404
char iv_prefix [16 ];
370
- off_t dec_off = XLogSegmentOffset (Max (data_start , curr_key -> start_lsn ), segSize );
371
- off_t dec_end = XLogSegmentOffset (Min (data_end , curr_key -> end_lsn ), segSize );
405
+ off_t dec_off = XLogSegmentOffset (Max (data_start . lsn , curr_key -> start . lsn ), segSize );
406
+ off_t dec_end = XLogSegmentOffset (Min (data_end . lsn , curr_key -> end . lsn ), segSize );
372
407
size_t dec_sz ;
373
408
char * dec_buf = (char * ) buf + (dec_off - offset );
374
409
@@ -385,8 +420,8 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
385
420
dec_sz = dec_end - dec_off ;
386
421
387
422
#ifdef TDE_XLOG_DEBUG
388
- elog (DEBUG1 , "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %X/%X" ,
389
- dec_off , dec_off - offset , dec_sz , LSN_FORMAT_ARGS ( curr_key -> key -> start_lsn ));
423
+ elog (DEBUG1 , "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_% X/%X" ,
424
+ dec_off , dec_off - offset , dec_sz , curr_key -> key . wal_start . tli , LSN_FORMAT_ARGS ( curr_key -> key . wal_start . lsn ));
390
425
#endif
391
426
pg_tde_stream_crypt (iv_prefix ,
392
427
dec_off ,
0 commit comments