1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtxlog.c
4 : * WAL replay logic for btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.49 2007/11/16 19:53:50 tgl Exp $
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/nbtree.h"
18 : #include "access/transam.h"
19 :
20 : /*
21 : * We must keep track of expected insertions due to page splits, and apply
22 : * them manually if they are not seen in the WAL log during replay. This
23 : * makes it safe for page insertion to be a multiple-WAL-action process.
24 : *
25 : * Similarly, deletion of an only child page and deletion of its parent page
26 : * form multiple WAL log entries, and we have to be prepared to follow through
27 : * with the deletion if the log ends between.
28 : *
29 : * The data structure is a simple linked list --- this should be good enough,
30 : * since we don't expect a page split or multi deletion to remain incomplete
31 : * for long. In any case we need to respect the order of operations.
32 : */
/*
 * One multi-record btree operation that replay has started but not yet
 * seen completed.  is_split says which group of fields below is filled in:
 * log_incomplete_split() sets only the split fields, and
 * log_incomplete_deletion() sets only delblk.
 */
typedef struct bt_incomplete_action
{
	RelFileNode node;			/* the index */
	bool		is_split;		/* T = pending split, F = pending delete */
	/* these fields are for a split: */
	bool		is_root;		/* we split the root */
	BlockNumber leftblk;		/* left half of split */
	BlockNumber rightblk;		/* right half of split; this is the block
								 * that forget_matching_split() compares
								 * against the replayed downlink */
	/* these fields are for a delete: */
	BlockNumber delblk;			/* parent block to be deleted */
} bt_incomplete_action;

/*
 * List of bt_incomplete_action entries.  New entries are appended at the
 * tail, so the list is kept in the order the actions were replayed.
 */
static List *incomplete_actions;
46 :
47 :
48 : static void
49 : log_incomplete_split(RelFileNode node, BlockNumber leftblk,
50 : BlockNumber rightblk, bool is_root)
51 0 : {
52 0 : bt_incomplete_action *action = palloc(sizeof(bt_incomplete_action));
53 :
54 0 : action->node = node;
55 0 : action->is_split = true;
56 0 : action->is_root = is_root;
57 0 : action->leftblk = leftblk;
58 0 : action->rightblk = rightblk;
59 0 : incomplete_actions = lappend(incomplete_actions, action);
60 0 : }
61 :
62 : static void
63 : forget_matching_split(RelFileNode node, BlockNumber downlink, bool is_root)
64 0 : {
65 : ListCell *l;
66 :
67 0 : foreach(l, incomplete_actions)
68 : {
69 0 : bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
70 :
71 0 : if (RelFileNodeEquals(node, action->node) &&
72 : action->is_split &&
73 : downlink == action->rightblk)
74 : {
75 0 : if (is_root != action->is_root)
76 0 : elog(LOG, "forget_matching_split: fishy is_root data (expected %d, got %d)",
77 : action->is_root, is_root);
78 0 : incomplete_actions = list_delete_ptr(incomplete_actions, action);
79 0 : pfree(action);
80 0 : break; /* need not look further */
81 : }
82 : }
83 0 : }
84 :
85 : static void
86 : log_incomplete_deletion(RelFileNode node, BlockNumber delblk)
87 0 : {
88 0 : bt_incomplete_action *action = palloc(sizeof(bt_incomplete_action));
89 :
90 0 : action->node = node;
91 0 : action->is_split = false;
92 0 : action->delblk = delblk;
93 0 : incomplete_actions = lappend(incomplete_actions, action);
94 0 : }
95 :
96 : static void
97 : forget_matching_deletion(RelFileNode node, BlockNumber delblk)
98 0 : {
99 : ListCell *l;
100 :
101 0 : foreach(l, incomplete_actions)
102 : {
103 0 : bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
104 :
105 0 : if (RelFileNodeEquals(node, action->node) &&
106 : !action->is_split &&
107 : delblk == action->delblk)
108 : {
109 0 : incomplete_actions = list_delete_ptr(incomplete_actions, action);
110 0 : pfree(action);
111 0 : break; /* need not look further */
112 : }
113 : }
114 0 : }
115 :
116 : /*
117 : * _bt_restore_page -- re-enter all the index tuples on a page
118 : *
119 : * The page is freshly init'd, and *from (length len) is a copy of what
120 : * had been its upper part (pd_upper to pd_special). We assume that the
121 : * tuples had been added to the page in item-number order, and therefore
122 : * the one with highest item number appears first (lowest on the page).
123 : *
124 : * NOTE: the way this routine is coded, the rebuilt page will have the items
125 : * in correct itemno sequence, but physically the opposite order from the
126 : * original, because we insert them in the opposite of itemno order. This
127 : * does not matter in any current btree code, but it's something to keep an
128 : * eye on. Is it worth changing just on general principles? See also the
129 : * notes in btree_xlog_split().
130 : */
131 : static void
132 : _bt_restore_page(Page page, char *from, int len)
133 0 : {
134 : IndexTupleData itupdata;
135 : Size itemsz;
136 0 : char *end = from + len;
137 :
138 0 : for (; from < end;)
139 : {
140 : /* Need to copy tuple header due to alignment considerations */
141 0 : memcpy(&itupdata, from, sizeof(IndexTupleData));
142 0 : itemsz = IndexTupleDSize(itupdata);
143 0 : itemsz = MAXALIGN(itemsz);
144 0 : if (PageAddItem(page, (Item) from, itemsz, FirstOffsetNumber,
145 : false, false) == InvalidOffsetNumber)
146 0 : elog(PANIC, "_bt_restore_page: cannot add item to page");
147 0 : from += itemsz;
148 : }
149 0 : }
150 :
151 : static void
152 : _bt_restore_meta(Relation reln, XLogRecPtr lsn,
153 : BlockNumber root, uint32 level,
154 : BlockNumber fastroot, uint32 fastlevel)
155 0 : {
156 : Buffer metabuf;
157 : Page metapg;
158 : BTMetaPageData *md;
159 : BTPageOpaque pageop;
160 :
161 0 : metabuf = XLogReadBuffer(reln, BTREE_METAPAGE, true);
162 : Assert(BufferIsValid(metabuf));
163 0 : metapg = BufferGetPage(metabuf);
164 :
165 0 : _bt_pageinit(metapg, BufferGetPageSize(metabuf));
166 :
167 0 : md = BTPageGetMeta(metapg);
168 0 : md->btm_magic = BTREE_MAGIC;
169 0 : md->btm_version = BTREE_VERSION;
170 0 : md->btm_root = root;
171 0 : md->btm_level = level;
172 0 : md->btm_fastroot = fastroot;
173 0 : md->btm_fastlevel = fastlevel;
174 :
175 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
176 0 : pageop->btpo_flags = BTP_META;
177 :
178 : /*
179 : * Set pd_lower just past the end of the metadata. This is not essential
180 : * but it makes the page look compressible to xlog.c.
181 : */
182 0 : ((PageHeader) metapg)->pd_lower =
183 : ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;
184 :
185 0 : PageSetLSN(metapg, lsn);
186 0 : PageSetTLI(metapg, ThisTimeLineID);
187 0 : MarkBufferDirty(metabuf);
188 0 : UnlockReleaseBuffer(metabuf);
189 0 : }
190 :
191 : static void
192 : btree_xlog_insert(bool isleaf, bool ismeta,
193 : XLogRecPtr lsn, XLogRecord *record)
194 0 : {
195 0 : xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
196 : Relation reln;
197 : Buffer buffer;
198 : Page page;
199 : char *datapos;
200 : int datalen;
201 : xl_btree_metadata md;
202 0 : BlockNumber downlink = 0;
203 :
204 0 : datapos = (char *) xlrec + SizeOfBtreeInsert;
205 0 : datalen = record->xl_len - SizeOfBtreeInsert;
206 0 : if (!isleaf)
207 : {
208 0 : memcpy(&downlink, datapos, sizeof(BlockNumber));
209 0 : datapos += sizeof(BlockNumber);
210 0 : datalen -= sizeof(BlockNumber);
211 : }
212 0 : if (ismeta)
213 : {
214 0 : memcpy(&md, datapos, sizeof(xl_btree_metadata));
215 0 : datapos += sizeof(xl_btree_metadata);
216 0 : datalen -= sizeof(xl_btree_metadata);
217 : }
218 :
219 0 : if ((record->xl_info & XLR_BKP_BLOCK_1) && !ismeta && isleaf)
220 0 : return; /* nothing to do */
221 :
222 0 : reln = XLogOpenRelation(xlrec->target.node);
223 :
224 0 : if (!(record->xl_info & XLR_BKP_BLOCK_1))
225 : {
226 0 : buffer = XLogReadBuffer(reln,
227 : ItemPointerGetBlockNumber(&(xlrec->target.tid)),
228 : false);
229 0 : if (BufferIsValid(buffer))
230 : {
231 0 : page = (Page) BufferGetPage(buffer);
232 :
233 0 : if (XLByteLE(lsn, PageGetLSN(page)))
234 : {
235 0 : UnlockReleaseBuffer(buffer);
236 : }
237 : else
238 : {
239 0 : if (PageAddItem(page, (Item) datapos, datalen,
240 : ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
241 : false, false) == InvalidOffsetNumber)
242 0 : elog(PANIC, "btree_insert_redo: failed to add item");
243 :
244 0 : PageSetLSN(page, lsn);
245 0 : PageSetTLI(page, ThisTimeLineID);
246 0 : MarkBufferDirty(buffer);
247 0 : UnlockReleaseBuffer(buffer);
248 : }
249 : }
250 : }
251 :
252 0 : if (ismeta)
253 0 : _bt_restore_meta(reln, lsn,
254 : md.root, md.level,
255 : md.fastroot, md.fastlevel);
256 :
257 : /* Forget any split this insertion completes */
258 0 : if (!isleaf)
259 0 : forget_matching_split(xlrec->target.node, downlink, false);
260 : }
261 :
/*
 * btree_xlog_split -- replay a page split
 *
 * onleft: true if the record carries the offset of the newly inserted item
 *		(and, when the left page has no backup block, the item itself); the
 *		item is then re-added to the left page.
 * isroot: not used for the replay itself; it is only recorded in the
 *		incomplete-split entry created at the end.
 *
 * Data following the fixed-size xl_btree_split header is consumed in this
 * order:
 *	- if level > 0: a downlink block number, then (only when the left page
 *	  has no backup block) the left page's new high key;
 *	- if onleft: the new item's offset, then (only when the left page has no
 *	  backup block) the new item;
 *	- everything remaining: the contents of the right page.
 *
 * The right page is always rebuilt from scratch.  The left page and the
 * page to the right of the new sibling are modified only if they have no
 * backup block, can be read, and carry an LSN older than this record.
 */
static void
btree_xlog_split(bool onleft, bool isroot,
				 XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
	Relation	reln;
	Buffer		rbuf;
	Page		rpage;
	BTPageOpaque ropaque;
	char	   *datapos;		/* next unread byte of the record payload */
	int			datalen;		/* payload bytes remaining at datapos */
	OffsetNumber newitemoff = 0;
	Item		newitem = NULL;
	Size		newitemsz = 0;
	Item		left_hikey = NULL;
	Size		left_hikeysz = 0;

	reln = XLogOpenRelation(xlrec->node);

	datapos = (char *) xlrec + SizeOfBtreeSplit;
	datalen = record->xl_len - SizeOfBtreeSplit;

	/* Forget any split this insertion completes */
	if (xlrec->level > 0)
	{
		/* we assume SizeOfBtreeSplit is at least 16-bit aligned */
		BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos);

		datapos += sizeof(BlockIdData);
		datalen -= sizeof(BlockIdData);

		forget_matching_split(xlrec->node, downlink, false);

		/* Extract left hikey and its size (still assuming 16-bit alignment) */
		if (!(record->xl_info & XLR_BKP_BLOCK_1))
		{
			/* We assume 16-bit alignment is enough for IndexTupleSize */
			left_hikey = (Item) datapos;
			left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));

			datapos += left_hikeysz;
			datalen -= left_hikeysz;
		}
	}

	/* Extract newitem and newitemoff, if present */
	if (onleft)
	{
		/* Extract the offset (still assuming 16-bit alignment) */
		memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
		datapos += sizeof(OffsetNumber);
		datalen -= sizeof(OffsetNumber);
	}

	if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
	{
		/*
		 * We assume that 16-bit alignment is enough to apply IndexTupleSize
		 * (since it's fetching from a uint16 field) and also enough for
		 * PageAddItem to insert the tuple.
		 */
		newitem = (Item) datapos;
		newitemsz = MAXALIGN(IndexTupleSize(newitem));
		datapos += newitemsz;
		datalen -= newitemsz;
	}

	/* Reconstruct right (new) sibling from scratch */
	rbuf = XLogReadBuffer(reln, xlrec->rightsib, true);
	Assert(BufferIsValid(rbuf));
	rpage = (Page) BufferGetPage(rbuf);

	_bt_pageinit(rpage, BufferGetPageSize(rbuf));
	ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);

	ropaque->btpo_prev = xlrec->leftsib;
	ropaque->btpo_next = xlrec->rnext;
	ropaque->btpo.level = xlrec->level;
	ropaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
	ropaque->btpo_cycleid = 0;

	/* whatever is left of the payload is the right page's tuple image */
	_bt_restore_page(rpage, datapos, datalen);

	/*
	 * On leaf level, the high key of the left page is equal to the
	 * first key on the right page.  Note that left_hikey then points into
	 * the right page's buffer rather than into the WAL record.
	 */
	if (xlrec->level == 0)
	{
		ItemId		hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));

		left_hikey = PageGetItem(rpage, hiItemId);
		left_hikeysz = ItemIdGetLength(hiItemId);
	}

	PageSetLSN(rpage, lsn);
	PageSetTLI(rpage, ThisTimeLineID);
	MarkBufferDirty(rbuf);

	/* don't release the buffer yet; we touch right page's first item below */

	/*
	 * Reconstruct left (original) sibling if needed.  Note that this code
	 * ensures that the items remaining on the left page are in the correct
	 * item number order, but it does not reproduce the physical order they
	 * would have had.	Is this worth changing?  See also _bt_restore_page().
	 */
	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		Buffer		lbuf = XLogReadBuffer(reln, xlrec->leftsib, false);

		if (BufferIsValid(lbuf))
		{
			Page		lpage = (Page) BufferGetPage(lbuf);
			BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

			/* skip the page if it already reflects this record */
			if (!XLByteLE(lsn, PageGetLSN(lpage)))
			{
				OffsetNumber off;
				OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
				OffsetNumber deletable[MaxOffsetNumber];
				int			ndeletable = 0;

				/*
				 * Remove the items from the left page that were copied to the
				 * right page.	Also remove the old high key, if any. (We must
				 * remove everything before trying to insert any items, else
				 * we risk not having enough space.)
				 */
				if (!P_RIGHTMOST(lopaque))
				{
					deletable[ndeletable++] = P_HIKEY;

					/*
					 * newitemoff is given to us relative to the original
					 * page's item numbering, so adjust it for this deletion.
					 */
					newitemoff--;
				}
				for (off = xlrec->firstright; off <= maxoff; off++)
					deletable[ndeletable++] = off;
				if (ndeletable > 0)
					PageIndexMultiDelete(lpage, deletable, ndeletable);

				/*
				 * Add the new item if it was inserted on left page.
				 */
				if (onleft)
				{
					if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
									false, false) == InvalidOffsetNumber)
						elog(PANIC, "failed to add new item to left page after split");
				}

				/* Set high key */
				if (PageAddItem(lpage, left_hikey, left_hikeysz,
								P_HIKEY, false, false) == InvalidOffsetNumber)
					elog(PANIC, "failed to add high key to left page after split");

				/* Fix opaque fields */
				lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
				lopaque->btpo_next = xlrec->rightsib;
				lopaque->btpo_cycleid = 0;

				PageSetLSN(lpage, lsn);
				PageSetTLI(lpage, ThisTimeLineID);
				MarkBufferDirty(lbuf);
			}

			UnlockReleaseBuffer(lbuf);
		}
	}

	/* We no longer need the right buffer */
	UnlockReleaseBuffer(rbuf);

	/* Fix left-link of the page to the right of the new right sibling */
	if (xlrec->rnext != P_NONE && !(record->xl_info & XLR_BKP_BLOCK_2))
	{
		Buffer		buffer = XLogReadBuffer(reln, xlrec->rnext, false);

		if (BufferIsValid(buffer))
		{
			Page		page = (Page) BufferGetPage(buffer);

			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

				pageop->btpo_prev = xlrec->rightsib;

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/* The job ain't done till the parent link is inserted... */
	log_incomplete_split(xlrec->node,
						 xlrec->leftsib, xlrec->rightsib, isroot);
}
465 :
466 : static void
467 : btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
468 0 : {
469 : xl_btree_delete *xlrec;
470 : Relation reln;
471 : Buffer buffer;
472 : Page page;
473 : BTPageOpaque opaque;
474 :
475 0 : if (record->xl_info & XLR_BKP_BLOCK_1)
476 0 : return;
477 :
478 0 : xlrec = (xl_btree_delete *) XLogRecGetData(record);
479 0 : reln = XLogOpenRelation(xlrec->node);
480 0 : buffer = XLogReadBuffer(reln, xlrec->block, false);
481 0 : if (!BufferIsValid(buffer))
482 : return;
483 0 : page = (Page) BufferGetPage(buffer);
484 :
485 0 : if (XLByteLE(lsn, PageGetLSN(page)))
486 : {
487 0 : UnlockReleaseBuffer(buffer);
488 0 : return;
489 : }
490 :
491 0 : if (record->xl_len > SizeOfBtreeDelete)
492 : {
493 : OffsetNumber *unused;
494 : OffsetNumber *unend;
495 :
496 0 : unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
497 0 : unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
498 :
499 0 : PageIndexMultiDelete(page, unused, unend - unused);
500 : }
501 :
502 : /*
503 : * Mark the page as not containing any LP_DEAD items --- see comments in
504 : * _bt_delitems().
505 : */
506 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
507 0 : opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
508 :
509 0 : PageSetLSN(page, lsn);
510 0 : PageSetTLI(page, ThisTimeLineID);
511 0 : MarkBufferDirty(buffer);
512 0 : UnlockReleaseBuffer(buffer);
513 : }
514 :
515 : static void
516 : btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
517 0 : {
518 0 : xl_btree_delete_page *xlrec = (xl_btree_delete_page *) XLogRecGetData(record);
519 : Relation reln;
520 : BlockNumber parent;
521 : BlockNumber target;
522 : BlockNumber leftsib;
523 : BlockNumber rightsib;
524 : Buffer buffer;
525 : Page page;
526 : BTPageOpaque pageop;
527 :
528 0 : reln = XLogOpenRelation(xlrec->target.node);
529 0 : parent = ItemPointerGetBlockNumber(&(xlrec->target.tid));
530 0 : target = xlrec->deadblk;
531 0 : leftsib = xlrec->leftblk;
532 0 : rightsib = xlrec->rightblk;
533 :
534 : /* parent page */
535 0 : if (!(record->xl_info & XLR_BKP_BLOCK_1))
536 : {
537 0 : buffer = XLogReadBuffer(reln, parent, false);
538 0 : if (BufferIsValid(buffer))
539 : {
540 0 : page = (Page) BufferGetPage(buffer);
541 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
542 0 : if (XLByteLE(lsn, PageGetLSN(page)))
543 : {
544 0 : UnlockReleaseBuffer(buffer);
545 : }
546 : else
547 : {
548 : OffsetNumber poffset;
549 :
550 0 : poffset = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
551 0 : if (poffset >= PageGetMaxOffsetNumber(page))
552 : {
553 : Assert(info == XLOG_BTREE_DELETE_PAGE_HALF);
554 : Assert(poffset == P_FIRSTDATAKEY(pageop));
555 0 : PageIndexTupleDelete(page, poffset);
556 0 : pageop->btpo_flags |= BTP_HALF_DEAD;
557 : }
558 : else
559 : {
560 : ItemId itemid;
561 : IndexTuple itup;
562 : OffsetNumber nextoffset;
563 :
564 : Assert(info != XLOG_BTREE_DELETE_PAGE_HALF);
565 0 : itemid = PageGetItemId(page, poffset);
566 0 : itup = (IndexTuple) PageGetItem(page, itemid);
567 0 : ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
568 0 : nextoffset = OffsetNumberNext(poffset);
569 0 : PageIndexTupleDelete(page, nextoffset);
570 : }
571 :
572 0 : PageSetLSN(page, lsn);
573 0 : PageSetTLI(page, ThisTimeLineID);
574 0 : MarkBufferDirty(buffer);
575 0 : UnlockReleaseBuffer(buffer);
576 : }
577 : }
578 : }
579 :
580 : /* Fix left-link of right sibling */
581 0 : if (!(record->xl_info & XLR_BKP_BLOCK_2))
582 : {
583 0 : buffer = XLogReadBuffer(reln, rightsib, false);
584 0 : if (BufferIsValid(buffer))
585 : {
586 0 : page = (Page) BufferGetPage(buffer);
587 0 : if (XLByteLE(lsn, PageGetLSN(page)))
588 : {
589 0 : UnlockReleaseBuffer(buffer);
590 : }
591 : else
592 : {
593 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
594 0 : pageop->btpo_prev = leftsib;
595 :
596 0 : PageSetLSN(page, lsn);
597 0 : PageSetTLI(page, ThisTimeLineID);
598 0 : MarkBufferDirty(buffer);
599 0 : UnlockReleaseBuffer(buffer);
600 : }
601 : }
602 : }
603 :
604 : /* Fix right-link of left sibling, if any */
605 0 : if (!(record->xl_info & XLR_BKP_BLOCK_3))
606 : {
607 0 : if (leftsib != P_NONE)
608 : {
609 0 : buffer = XLogReadBuffer(reln, leftsib, false);
610 0 : if (BufferIsValid(buffer))
611 : {
612 0 : page = (Page) BufferGetPage(buffer);
613 0 : if (XLByteLE(lsn, PageGetLSN(page)))
614 : {
615 0 : UnlockReleaseBuffer(buffer);
616 : }
617 : else
618 : {
619 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
620 0 : pageop->btpo_next = rightsib;
621 :
622 0 : PageSetLSN(page, lsn);
623 0 : PageSetTLI(page, ThisTimeLineID);
624 0 : MarkBufferDirty(buffer);
625 0 : UnlockReleaseBuffer(buffer);
626 : }
627 : }
628 : }
629 : }
630 :
631 : /* Rewrite target page as empty deleted page */
632 0 : buffer = XLogReadBuffer(reln, target, true);
633 : Assert(BufferIsValid(buffer));
634 0 : page = (Page) BufferGetPage(buffer);
635 :
636 0 : _bt_pageinit(page, BufferGetPageSize(buffer));
637 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
638 :
639 0 : pageop->btpo_prev = leftsib;
640 0 : pageop->btpo_next = rightsib;
641 0 : pageop->btpo.xact = FrozenTransactionId;
642 0 : pageop->btpo_flags = BTP_DELETED;
643 0 : pageop->btpo_cycleid = 0;
644 :
645 0 : PageSetLSN(page, lsn);
646 0 : PageSetTLI(page, ThisTimeLineID);
647 0 : MarkBufferDirty(buffer);
648 0 : UnlockReleaseBuffer(buffer);
649 :
650 : /* Update metapage if needed */
651 0 : if (info == XLOG_BTREE_DELETE_PAGE_META)
652 : {
653 : xl_btree_metadata md;
654 :
655 0 : memcpy(&md, (char *) xlrec + SizeOfBtreeDeletePage,
656 : sizeof(xl_btree_metadata));
657 0 : _bt_restore_meta(reln, lsn,
658 : md.root, md.level,
659 : md.fastroot, md.fastlevel);
660 : }
661 :
662 : /* Forget any completed deletion */
663 0 : forget_matching_deletion(xlrec->target.node, target);
664 :
665 : /* If parent became half-dead, remember it for deletion */
666 0 : if (info == XLOG_BTREE_DELETE_PAGE_HALF)
667 0 : log_incomplete_deletion(xlrec->target.node, parent);
668 0 : }
669 :
670 : static void
671 : btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
672 0 : {
673 0 : xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
674 : Relation reln;
675 : Buffer buffer;
676 : Page page;
677 : BTPageOpaque pageop;
678 0 : BlockNumber downlink = 0;
679 :
680 0 : reln = XLogOpenRelation(xlrec->node);
681 0 : buffer = XLogReadBuffer(reln, xlrec->rootblk, true);
682 : Assert(BufferIsValid(buffer));
683 0 : page = (Page) BufferGetPage(buffer);
684 :
685 0 : _bt_pageinit(page, BufferGetPageSize(buffer));
686 0 : pageop = (BTPageOpaque) PageGetSpecialPointer(page);
687 :
688 0 : pageop->btpo_flags = BTP_ROOT;
689 0 : pageop->btpo_prev = pageop->btpo_next = P_NONE;
690 0 : pageop->btpo.level = xlrec->level;
691 0 : if (xlrec->level == 0)
692 0 : pageop->btpo_flags |= BTP_LEAF;
693 0 : pageop->btpo_cycleid = 0;
694 :
695 0 : if (record->xl_len > SizeOfBtreeNewroot)
696 : {
697 : IndexTuple itup;
698 :
699 0 : _bt_restore_page(page,
700 : (char *) xlrec + SizeOfBtreeNewroot,
701 : record->xl_len - SizeOfBtreeNewroot);
702 : /* extract downlink to the right-hand split page */
703 0 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY));
704 0 : downlink = ItemPointerGetBlockNumber(&(itup->t_tid));
705 : Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
706 : }
707 :
708 0 : PageSetLSN(page, lsn);
709 0 : PageSetTLI(page, ThisTimeLineID);
710 0 : MarkBufferDirty(buffer);
711 0 : UnlockReleaseBuffer(buffer);
712 :
713 0 : _bt_restore_meta(reln, lsn,
714 : xlrec->rootblk, xlrec->level,
715 : xlrec->rootblk, xlrec->level);
716 :
717 : /* Check to see if this satisfies any incomplete insertions */
718 0 : if (record->xl_len > SizeOfBtreeNewroot)
719 0 : forget_matching_split(xlrec->node, downlink, true);
720 0 : }
721 :
722 :
723 : void
724 : btree_redo(XLogRecPtr lsn, XLogRecord *record)
725 0 : {
726 0 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
727 :
728 0 : switch (info)
729 : {
730 : case XLOG_BTREE_INSERT_LEAF:
731 0 : btree_xlog_insert(true, false, lsn, record);
732 0 : break;
733 : case XLOG_BTREE_INSERT_UPPER:
734 0 : btree_xlog_insert(false, false, lsn, record);
735 0 : break;
736 : case XLOG_BTREE_INSERT_META:
737 0 : btree_xlog_insert(false, true, lsn, record);
738 0 : break;
739 : case XLOG_BTREE_SPLIT_L:
740 0 : btree_xlog_split(true, false, lsn, record);
741 0 : break;
742 : case XLOG_BTREE_SPLIT_R:
743 0 : btree_xlog_split(false, false, lsn, record);
744 0 : break;
745 : case XLOG_BTREE_SPLIT_L_ROOT:
746 0 : btree_xlog_split(true, true, lsn, record);
747 0 : break;
748 : case XLOG_BTREE_SPLIT_R_ROOT:
749 0 : btree_xlog_split(false, true, lsn, record);
750 0 : break;
751 : case XLOG_BTREE_DELETE:
752 0 : btree_xlog_delete(lsn, record);
753 0 : break;
754 : case XLOG_BTREE_DELETE_PAGE:
755 : case XLOG_BTREE_DELETE_PAGE_META:
756 : case XLOG_BTREE_DELETE_PAGE_HALF:
757 0 : btree_xlog_delete_page(info, lsn, record);
758 0 : break;
759 : case XLOG_BTREE_NEWROOT:
760 0 : btree_xlog_newroot(lsn, record);
761 0 : break;
762 : default:
763 0 : elog(PANIC, "btree_redo: unknown op code %u", info);
764 : }
765 0 : }
766 :
767 : static void
768 : out_target(StringInfo buf, xl_btreetid *target)
769 0 : {
770 0 : appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u",
771 : target->node.spcNode, target->node.dbNode, target->node.relNode,
772 : ItemPointerGetBlockNumber(&(target->tid)),
773 : ItemPointerGetOffsetNumber(&(target->tid)));
774 0 : }
775 :
776 : void
777 : btree_desc(StringInfo buf, uint8 xl_info, char *rec)
778 0 : {
779 0 : uint8 info = xl_info & ~XLR_INFO_MASK;
780 :
781 0 : switch (info)
782 : {
783 : case XLOG_BTREE_INSERT_LEAF:
784 : {
785 0 : xl_btree_insert *xlrec = (xl_btree_insert *) rec;
786 :
787 0 : appendStringInfo(buf, "insert: ");
788 0 : out_target(buf, &(xlrec->target));
789 0 : break;
790 : }
791 : case XLOG_BTREE_INSERT_UPPER:
792 : {
793 0 : xl_btree_insert *xlrec = (xl_btree_insert *) rec;
794 :
795 0 : appendStringInfo(buf, "insert_upper: ");
796 0 : out_target(buf, &(xlrec->target));
797 0 : break;
798 : }
799 : case XLOG_BTREE_INSERT_META:
800 : {
801 0 : xl_btree_insert *xlrec = (xl_btree_insert *) rec;
802 :
803 0 : appendStringInfo(buf, "insert_meta: ");
804 0 : out_target(buf, &(xlrec->target));
805 0 : break;
806 : }
807 : case XLOG_BTREE_SPLIT_L:
808 : {
809 0 : xl_btree_split *xlrec = (xl_btree_split *) rec;
810 :
811 0 : appendStringInfo(buf, "split_l: rel %u/%u/%u ",
812 : xlrec->node.spcNode, xlrec->node.dbNode,
813 : xlrec->node.relNode);
814 0 : appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
815 : xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
816 : xlrec->level, xlrec->firstright);
817 0 : break;
818 : }
819 : case XLOG_BTREE_SPLIT_R:
820 : {
821 0 : xl_btree_split *xlrec = (xl_btree_split *) rec;
822 :
823 0 : appendStringInfo(buf, "split_r: rel %u/%u/%u ",
824 : xlrec->node.spcNode, xlrec->node.dbNode,
825 : xlrec->node.relNode);
826 0 : appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
827 : xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
828 : xlrec->level, xlrec->firstright);
829 0 : break;
830 : }
831 : case XLOG_BTREE_SPLIT_L_ROOT:
832 : {
833 0 : xl_btree_split *xlrec = (xl_btree_split *) rec;
834 :
835 0 : appendStringInfo(buf, "split_l_root: rel %u/%u/%u ",
836 : xlrec->node.spcNode, xlrec->node.dbNode,
837 : xlrec->node.relNode);
838 0 : appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
839 : xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
840 : xlrec->level, xlrec->firstright);
841 0 : break;
842 : }
843 : case XLOG_BTREE_SPLIT_R_ROOT:
844 : {
845 0 : xl_btree_split *xlrec = (xl_btree_split *) rec;
846 :
847 0 : appendStringInfo(buf, "split_r_root: rel %u/%u/%u ",
848 : xlrec->node.spcNode, xlrec->node.dbNode,
849 : xlrec->node.relNode);
850 0 : appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
851 : xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
852 : xlrec->level, xlrec->firstright);
853 0 : break;
854 : }
855 : case XLOG_BTREE_DELETE:
856 : {
857 0 : xl_btree_delete *xlrec = (xl_btree_delete *) rec;
858 :
859 0 : appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u",
860 : xlrec->node.spcNode, xlrec->node.dbNode,
861 : xlrec->node.relNode, xlrec->block);
862 0 : break;
863 : }
864 : case XLOG_BTREE_DELETE_PAGE:
865 : case XLOG_BTREE_DELETE_PAGE_META:
866 : case XLOG_BTREE_DELETE_PAGE_HALF:
867 : {
868 0 : xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec;
869 :
870 0 : appendStringInfo(buf, "delete_page: ");
871 0 : out_target(buf, &(xlrec->target));
872 0 : appendStringInfo(buf, "; dead %u; left %u; right %u",
873 : xlrec->deadblk, xlrec->leftblk, xlrec->rightblk);
874 0 : break;
875 : }
876 : case XLOG_BTREE_NEWROOT:
877 : {
878 0 : xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;
879 :
880 0 : appendStringInfo(buf, "newroot: rel %u/%u/%u; root %u lev %u",
881 : xlrec->node.spcNode, xlrec->node.dbNode,
882 : xlrec->node.relNode,
883 : xlrec->rootblk, xlrec->level);
884 0 : break;
885 : }
886 : default:
887 0 : appendStringInfo(buf, "UNKNOWN");
888 : break;
889 : }
890 0 : }
891 :
892 : void
893 : btree_xlog_startup(void)
894 0 : {
895 0 : incomplete_actions = NIL;
896 0 : }
897 :
898 : void
899 : btree_xlog_cleanup(void)
900 0 : {
901 : ListCell *l;
902 :
903 0 : foreach(l, incomplete_actions)
904 : {
905 0 : bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
906 : Relation reln;
907 :
908 0 : reln = XLogOpenRelation(action->node);
909 0 : if (action->is_split)
910 : {
911 : /* finish an incomplete split */
912 : Buffer lbuf,
913 : rbuf;
914 : Page lpage,
915 : rpage;
916 : BTPageOpaque lpageop,
917 : rpageop;
918 : bool is_only;
919 :
920 0 : lbuf = XLogReadBuffer(reln, action->leftblk, false);
921 : /* failure is impossible because we wrote this page earlier */
922 0 : if (!BufferIsValid(lbuf))
923 0 : elog(PANIC, "btree_xlog_cleanup: left block unfound");
924 0 : lpage = (Page) BufferGetPage(lbuf);
925 0 : lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
926 0 : rbuf = XLogReadBuffer(reln, action->rightblk, false);
927 : /* failure is impossible because we wrote this page earlier */
928 0 : if (!BufferIsValid(rbuf))
929 0 : elog(PANIC, "btree_xlog_cleanup: right block unfound");
930 0 : rpage = (Page) BufferGetPage(rbuf);
931 0 : rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
932 :
933 : /* if the pages are all of their level, it's a only-page split */
934 0 : is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
935 :
936 0 : _bt_insert_parent(reln, lbuf, rbuf, NULL,
937 : action->is_root, is_only);
938 : }
939 : else
940 : {
941 : /* finish an incomplete deletion (of a half-dead page) */
942 : Buffer buf;
943 :
944 0 : buf = XLogReadBuffer(reln, action->delblk, false);
945 0 : if (BufferIsValid(buf))
946 0 : if (_bt_pagedel(reln, buf, NULL, true) == 0)
947 0 : elog(PANIC, "btree_xlog_cleanup: _bt_pagdel failed");
948 : }
949 : }
950 0 : incomplete_actions = NIL;
951 0 : }
952 :
953 : bool
954 : btree_safe_restartpoint(void)
955 0 : {
956 0 : if (incomplete_actions)
957 0 : return false;
958 0 : return true;
959 : }
|