/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 * BTree-specific page management code for the Postgres btree access
 * method.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.105 2007/12/31 04:52:05 tgl Exp $
 *
 * NOTES
 * Postgres btree pages look like ordinary relation pages. The opaque
 * data at high addresses includes pointers to left and right siblings
 * and flag data describing page state. The first page in a btree, page
 * zero, is special -- it stores meta-information describing the tree.
 * Pages one and higher store the actual tree data.
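 *
 * A minimal sketch of reading the metapage with the helpers defined
 * below (a sketch only; real callers add sanity checks such as those
 * in _bt_getroot):
 *
 *		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 *		metad = BTPageGetMeta(BufferGetPage(metabuf));
 *		... consult metad->btm_root, metad->btm_fastroot ...
 *		_bt_relbuf(rel, metabuf);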
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/transam.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "utils/inval.h"


/*
 * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
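 *
 * Hypothetical usage sketch (the actual callers live in the btree build
 * code): fill an image in private memory, then write it out as block zero:
 *
 *		char	metabuf[BLCKSZ];
 *
 *		_bt_initmetapage((Page) metabuf, P_NONE, 0);
 *		... write metabuf out as block BTREE_METAPAGE ...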
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata. This is not essential
     * but it makes the page look compressible to xlog.c.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

/*
 * _bt_getroot() -- Get the root page of the btree.
 *
 * Since the root page can move around the btree file, we have to read
 * its location from the metadata page, and then read the root page
 * itself. If no root page exists yet, we have to create one. The
 * standard class of race conditions exists here; I think I covered
 * them all in the intricate dance of lock requests below.
 *
 * The access type parameter (BT_READ or BT_WRITE) controls whether
 * a new root page will be created or not. If access = BT_READ,
 * and no root page exists, we just return InvalidBuffer. For
 * BT_WRITE, we try to create the root page if it doesn't exist.
 * NOTE that the returned root page will have only a read lock set
 * on it even if access = BT_WRITE!
 *
 * The returned page is not necessarily the true root --- it could be
 * a "fast root" (a page that is alone in its level due to deletions).
 * Also, if the root page is split while we are "in flight" to it,
 * what we will return is the old root, which is now just the leftmost
 * page on a probably-not-very-wide level. For most purposes this is
 * as good as or better than the true root, so we do not bother to
 * insist on finding the true root. We do, however, guarantee to
 * return a live (not deleted or half-dead) page.
 *
 * On successful return, the root page is pinned and read-locked.
 * The metadata page is not locked or pinned on exit.
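 *
 * A sketch of a typical BT_READ caller (hedged; compare the uses in
 * nbtsearch.c):
 *		rootbuf = _bt_getroot(rel, BT_READ);
 *		if (!BufferIsValid(rootbuf))
 *			return ...;			-- index is completely empty
 *		... descend the tree from rootbuf, then _bt_relbuf() it ...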
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer metabuf;
    Page metapg;
    BTPageOpaque metaopaque;
    Buffer rootbuf;
    Page rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32 rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root. This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version == BTREE_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal. We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry. Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition: if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else. In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again. (Is that really true? But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page. Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (!rel->rd_istemp)
        {
            xl_btree_newroot xlrec;
            XLogRecPtr recptr;
            XLogRecData rdata;

            xlrec.node = rel->rd_node;
            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            rdata.data = (char *) &xlrec;
            rdata.len = SizeOfBtreeNewroot;
            rdata.buffer = InvalidBuffer;
            rdata.next = NULL;

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

            PageSetLSN(rootpage, recptr);
            PageSetTLI(rootpage, ThisTimeLineID);
            PageSetLSN(metapg, recptr);
            PageSetTLI(metapg, ThisTimeLineID);
        }

        END_CRIT_SECTION();

        /*
         * Send out relcache inval for metapage change (probably unnecessary
         * here, but let's be safe).
         */
        CacheInvalidateRelcache(rel);

        /*
         * swap root write lock for read lock. There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim. step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page. Return the root page's buffer.
     */
    return rootbuf;
}

/*
 * _bt_gettrueroot() -- Get the true root page of the btree.
 *
 * This is the same as the BT_READ case of _bt_getroot(), except
 * we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore. This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at. If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure. (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer metabuf;
    Page metapg;
    BTPageOpaque metaopaque;
    Buffer rootbuf;
    Page rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32 rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway. In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim. step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

/*
 * _bt_checkpage() -- Verify that a freshly-read page looks sane.
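 *
 * _bt_getbuf and _bt_relandgetbuf apply this automatically; code that
 * obtains a btree buffer some other way should call it for itself
 * (a convention, not something enforced here).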
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero. We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (((PageHeader) (page))->pd_special !=
        (BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData))))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * _bt_getbuf() -- Get a buffer by block number for read or write.
 *
 * blkno == P_NEW means to get an unallocated index page. The page
 * will be initialized before returning it.
 *
 * When this routine returns, the appropriate lock is set on the
 * requested buffer and its reference count has been incremented
 * (ie, the buffer is "locked and pinned"). Also, we apply
 * _bt_checkpage to sanity-check the page (except in P_NEW case).
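 *
 * A sketch of the two call modes (both appear throughout this file):
 *		buf = _bt_getbuf(rel, blkno, BT_READ);		-- existing page
 *		buf = _bt_getbuf(rel, P_NEW, BT_WRITE);		-- allocate a new page
 * In either case, release the page with _bt_relbuf(rel, buf) when done.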
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        LockBuffer(buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool needLock;
        Page page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free. (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page. If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock. (The
         * own-caller scenario is actually not improbable. Consider an index
         * on a serial or timestamp column. Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM. A
         * VACUUM running concurrently with the previous split could well have
         * placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page. If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free. (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(&rel->rd_node);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (ConditionalLockBuffer(buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /* Okay to use page. Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page. We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        LockBuffer(buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more. Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew((PageHeader) page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}

/*
 * _bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers. The motivation for using this is to avoid two entries to the
 * bufmgr when one will do.
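 *
 * A sketch of the "move right" idiom this enables (see the root-finding
 * loops above):
 *		buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, BT_READ);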
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    LockBuffer(buf, access);
    _bt_checkpage(rel, buf);
    return buf;
}

/*
 * _bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    UnlockReleaseBuffer(buf);
}

/*
 * _bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 * _bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.
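 *
 * Roughly, the RecentXmin test below says: the page was deleted long
 * enough ago that no transaction still running could hold a link
 * leading to it.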
 */
bool
_bt_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example, a
     * backend might successfully extend the relation one page and then crash
     * before it is able to make a WAL entry for adding the page. If we find a
     * zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in it.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        TransactionIdPrecedesOrEquals(opaque->btpo.xact, RecentXmin))
        return true;
    return false;
}

/*
 * Delete item(s) from a btree page.
 *
 * This must only be used for deleting leaf items. Deleting an item on a
 * non-leaf page has to be done as part of an atomic action that includes
 * deleting the page it points to.
 *
 * This routine assumes that the caller has pinned and locked the buffer.
 * Also, the given itemnos *must* appear in increasing order in the array.
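 *
 * Sketch of a vacuum-style call (hedged; compare btvacuumpage in
 * nbtree.c), with deletable[] filled in ascending offset order:
 *		if (ndeletable > 0)
 *			_bt_delitems(rel, buf, deletable, ndeletable);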
 */
void
_bt_delitems(Relation rel, Buffer buf,
             OffsetNumber *itemnos, int nitems)
{
    Page page = BufferGetPage(buf);
    BTPageOpaque opaque;

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items. This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (!rel->rd_istemp)
    {
        xl_btree_delete xlrec;
        XLogRecPtr recptr;
        XLogRecData rdata[2];

        xlrec.node = rel->rd_node;
        xlrec.block = BufferGetBlockNumber(buf);

        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfBtreeDelete;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        /*
         * The target-offsets array is not in the buffer, but pretend that it
         * is. When XLogInsert stores the whole buffer, the offsets array
         * need not be stored too.
         */
        if (nitems > 0)
        {
            rdata[1].data = (char *) itemnos;
            rdata[1].len = nitems * sizeof(OffsetNumber);
        }
        else
        {
            rdata[1].data = NULL;
            rdata[1].len = 0;
        }
        rdata[1].buffer = buf;
        rdata[1].buffer_std = true;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
    }

    END_CRIT_SECTION();
}

/*
 * Subroutine to pre-check whether a page deletion is safe, that is, its
 * parent page would be left in a valid or deletable state.
 *
 * "target" is the page we wish to delete, and "stack" is a search stack
 * leading to it (approximately). Note that we will update the stack
 * entry(s) to reflect current downlink positions --- this is harmless and
 * indeed saves later search effort in _bt_pagedel.
 *
 * Note: it's OK to release page locks after checking, because a safe
 * deletion can't become unsafe due to concurrent activity. A non-rightmost
 * page cannot become rightmost unless there's a concurrent page deletion,
 * but only VACUUM does page deletion and we only allow one VACUUM on an index
 * at a time. An only child could acquire a sibling (of the same parent) only
 * by being split ... but that would make it a non-rightmost child so the
 * deletion is still safe.
 */
static bool
_bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
{
    BlockNumber parent;
    OffsetNumber poffset,
        maxoff;
    Buffer pbuf;
    Page page;
    BTPageOpaque opaque;

    /*
     * In recovery mode, assume the deletion being replayed is valid. We
     * can't always check it because we won't have a full search stack, and we
     * shouldn't complain if there's a problem, anyway.
     */
    if (InRecovery)
        return true;

    /* Locate the parent's downlink (updating the stack entry if needed) */
    ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
    pbuf = _bt_getstackbuf(rel, stack, BT_READ);
    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
             RelationGetRelationName(rel), target);
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child.
     */
    if (poffset >= maxoff)
    {
        /* It's rightmost child... */
        if (poffset == P_FIRSTDATAKEY(opaque))
        {
            /*
             * It's only child, so safe if parent would itself be removable.
             * We have to check the parent itself, and then recurse to test
             * the conditions at the parent's parent.
             */
            if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
            {
                _bt_relbuf(rel, pbuf);
                return false;
            }

            _bt_relbuf(rel, pbuf);
            return _bt_parent_deletion_safe(rel, parent, stack->bts_parent);
        }
        else
        {
            /* Unsafe to delete */
            _bt_relbuf(rel, pbuf);
            return false;
        }
    }
    else
    {
        /* Not rightmost child, so safe to delete */
        _bt_relbuf(rel, pbuf);
        return true;
    }
}

/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover. See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK). This lock and pin will be dropped before exiting.
 *
 * The "stack" argument can be a search stack leading (approximately) to the
 * target page, or NULL --- outside callers typically pass NULL since they
 * have not done such a search, but internal recursion cases pass the stack
 * to avoid duplicated search effort.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent pages were deleted too).
 *
 * NOTE: this leaks memory. Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
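 *
 * Sketch of the expected call pattern (hedged; compare btvacuumpage in
 * nbtree.c). Note the buffer is consumed whether or not anything is
 * deleted:
 *		buf = _bt_getbuf(rel, blkno, BT_READ);
 *		ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);
 *		stats->pages_deleted += ndel;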
 */
int
_bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full)
{
    int result;
    BlockNumber target,
        leftsib,
        rightsib,
        parent;
    OffsetNumber poffset,
        maxoff;
    uint32 targetlevel,
        ilevel;
    ItemId itemid;
    IndexTuple targetkey,
        itup;
    ScanKey itup_scankey;
    Buffer lbuf,
        rbuf,
        pbuf;
    bool parent_half_dead;
    bool parent_one_child;
    bool rightsib_empty;
    Buffer metabuf = InvalidBuffer;
    Page metapg = NULL;
    BTMetaPageData *metad = NULL;
    Page page;
    BTPageOpaque opaque;

    /*
     * We can never delete rightmost pages nor root pages. While at it, check
     * that page is not already deleted and is empty.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        /* Should never fail to delete a half-dead page */
        Assert(!P_ISHALFDEAD(opaque));

        _bt_relbuf(rel, buf);
        return 0;
    }

    /*
     * Save info about page, including a copy of its high key (it must have
     * one, being non-rightmost).
     */
    target = BufferGetBlockNumber(buf);
    targetlevel = opaque->btpo.level;
    leftsib = opaque->btpo_prev;
    itemid = PageGetItemId(page, P_HIKEY);
    targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

    /*
     * To avoid deadlocks, we'd better drop the target page lock before going
     * further.
     */
    _bt_relbuf(rel, buf);

    /*
     * We need an approximate pointer to the page's parent page. We use the
     * standard search mechanism to search for the page's high key; this will
     * give us a link to either the current parent or someplace to its left
     * (if there are multiple equal high keys). In recursion cases, the
     * caller already generated a search stack and we can just re-use that
     * work.
     */
    if (stack == NULL)
    {
        if (!InRecovery)
        {
            /* we need an insertion scan key to do our search, so build one */
            itup_scankey = _bt_mkscankey(rel, targetkey);
            /* find the leftmost leaf page containing this key */
            stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
                               &lbuf, BT_READ);
            /* don't need a pin on that either */
            _bt_relbuf(rel, lbuf);

            /*
             * If we are trying to delete an interior page, _bt_search did
             * more than we needed. Locate the stack item pointing to our
             * parent level.
             */
            ilevel = 0;
            for (;;)
            {
                if (stack == NULL)
                    elog(ERROR, "not enough stack items");
                if (ilevel == targetlevel)
                    break;
                stack = stack->bts_parent;
                ilevel++;
            }
        }
        else
        {
            /*
             * During WAL recovery, we can't use _bt_search (for one reason,
             * it might invoke user-defined comparison functions that expect
             * facilities not available in recovery mode). Instead, just set
             * up a dummy stack pointing to the left end of the parent tree
             * level, from which _bt_getstackbuf will walk right to the parent
             * page. Painful, but we don't care too much about performance in
             * this scenario.
             */
            pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
            stack = (BTStack) palloc(sizeof(BTStackData));
            stack->bts_blkno = BufferGetBlockNumber(pbuf);
            stack->bts_offset = InvalidOffsetNumber;
            /* bts_btentry will be initialized below */
            stack->bts_parent = NULL;
            _bt_relbuf(rel, pbuf);
        }
    }

    /*
     * We cannot delete a page that is the rightmost child of its immediate
     * parent, unless it is the only child --- in which case the parent has to
     * be deleted too, and the same condition applies recursively to it. We
     * have to check this condition all the way up before trying to delete. We
     * don't need to re-test when deleting a non-leaf page, though.
     */
    if (targetlevel == 0 &&
        !_bt_parent_deletion_safe(rel, target, stack))
        return 0;

    /*
     * We have to lock the pages we need to modify in the standard order:
     * moving right, then up. Else we will deadlock against other writers.
     *
     * So, we need to find and write-lock the current left sibling of the
     * target page. The sibling that was current a moment ago could have
     * split, so we may have to move right. This search could fail if either
     * the sibling or the target page was deleted by someone else meanwhile;
     * if so, give up. (Right now, that should never happen, since page
     * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
     * concurrently on the same table.)
     */
    if (leftsib != P_NONE)
    {
        lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        while (P_ISDELETED(opaque) || opaque->btpo_next != target)
        {
            /* step right one page */
            leftsib = opaque->btpo_next;
            _bt_relbuf(rel, lbuf);
            if (leftsib == P_NONE)
            {
                elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
                     RelationGetRelationName(rel));
                return 0;
            }
            lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
            page = BufferGetPage(lbuf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }
    }
    else
        lbuf = InvalidBuffer;

    /*
     * Next write-lock the target page itself. It should be okay to take just
     * a write lock not a superexclusive lock, since no scans would stop on an
     * empty page.
     */
    buf = _bt_getbuf(rel, target, BT_WRITE);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * Check page is still empty etc, else abandon deletion. The empty check
     * is necessary since someone else might have inserted into it while we
     * didn't have it locked; the others are just for paranoia's sake.
     */
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        _bt_relbuf(rel, buf);
        if (BufferIsValid(lbuf))
            _bt_relbuf(rel, lbuf);
        return 0;
    }
    if (opaque->btpo_prev != leftsib)
        elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
             target, RelationGetRelationName(rel));

    /*
     * And next write-lock the (current) right sibling.
     */
    rightsib = opaque->btpo_next;
    rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);

    /*
     * Next find and write-lock the current parent of the target page. This is
     * essentially the same as the corresponding step of splitting.
     */
    ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
    pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
             RelationGetRelationName(rel), target);
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child --- in which case the parent
     * changes to half-dead status. The "can't delete" case should have been
     * detected by _bt_parent_deletion_safe, so complain if we see it now.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);
    parent_half_dead = false;
    parent_one_child = false;
    if (poffset >= maxoff)
    {
        if (poffset == P_FIRSTDATAKEY(opaque))
            parent_half_dead = true;
        else
            elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
                 target, parent, RelationGetRelationName(rel));
    }
    else
    {
        /* Will there be exactly one child left in this parent? */
        if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
            parent_one_child = true;
    }

    /*
     * If we are deleting the next-to-last page on the target's level, then
     * the rightsib is a candidate to become the new fast root. (In theory, it
     * might be possible to push the fast root even further down, but the odds
     * of doing so are slim, and the locking considerations daunting.)
     *
     * We don't support handling this in the case where the parent is becoming
     * half-dead, even though it theoretically could occur.
     *
     * We can safely acquire a lock on the metapage here --- see comments for
     * _bt_newroot().
     */
    if (leftsib == P_NONE && !parent_half_dead)
    {
        page = BufferGetPage(rbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo.level == targetlevel);
        if (P_RIGHTMOST(opaque))
        {
            /* rightsib will be the only one left on the level */
            metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
            metapg = BufferGetPage(metabuf);
            metad = BTPageGetMeta(metapg);

            /*
             * The expected case here is btm_fastlevel == targetlevel+1; if
             * the fastlevel is <= targetlevel, something is wrong, and we
             * choose to overwrite it to fix it.
             */
            if (metad->btm_fastlevel > targetlevel + 1)
            {
                /* no update wanted */
                _bt_relbuf(rel, metabuf);
                metabuf = InvalidBuffer;
            }
        }
    }

    /*
     * Here we begin doing the deletion.
     */

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Update parent. The normal case is a tad tricky because we want to
     * delete the target's downlink and the *following* key. Easiest way is
     * to copy the right sibling's downlink over the target downlink, and then
     * delete the following item.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (parent_half_dead)
    {
        PageIndexTupleDelete(page, poffset);
        opaque->btpo_flags |= BTP_HALF_DEAD;
    }
    else
    {
        OffsetNumber nextoffset;

        itemid = PageGetItemId(page, poffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
        ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);

        nextoffset = OffsetNumberNext(poffset);
        /* This part is just for double-checking */
        itemid = PageGetItemId(page, nextoffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
            elog(PANIC, "right sibling %u of block %u is not next child of %u in index \"%s\"",
                 rightsib, target, BufferGetBlockNumber(pbuf),
                 RelationGetRelationName(rel));
        PageIndexTupleDelete(page, nextoffset);
    }

    /*
     * Update siblings' side-links. Note the target page's side-links will
     * continue to point to the siblings.
     */
    if (BufferIsValid(lbuf))
    {
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo_next == target);
        opaque->btpo_next = rightsib;
    }
    page = BufferGetPage(rbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(opaque->btpo_prev == target);
    opaque->btpo_prev = leftsib;
    rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

    /*
     * Mark the page itself deleted. It can be recycled when all current
     * transactions are gone; or immediately if we're doing VACUUM FULL.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HALF_DEAD;
    opaque->btpo_flags |= BTP_DELETED;
    opaque->btpo.xact =
        vacuum_full ? FrozenTransactionId : ReadNewTransactionId();

    /* And update the metapage, if needed */
    if (BufferIsValid(metabuf))
    {
        metad->btm_fastroot = rightsib;
        metad->btm_fastlevel = targetlevel;
        MarkBufferDirty(metabuf);
    }

    /* Must mark buffers dirty before XLogInsert */
    MarkBufferDirty(pbuf);
    MarkBufferDirty(rbuf);
    MarkBufferDirty(buf);
    if (BufferIsValid(lbuf))
        MarkBufferDirty(lbuf);

    /* XLOG stuff */
    if (!rel->rd_istemp)
    {
        xl_btree_delete_page xlrec;
        xl_btree_metadata xlmeta;
        uint8 xlinfo;
        XLogRecPtr recptr;
        XLogRecData rdata[5];
        XLogRecData *nextrdata;

        xlrec.target.node = rel->rd_node;
        ItemPointerSet(&(xlrec.target.tid), parent, poffset);
        xlrec.deadblk = target;
        xlrec.leftblk = leftsib;
        xlrec.rightblk = rightsib;

        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfBtreeDeletePage;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = nextrdata = &(rdata[1]);

        if (BufferIsValid(metabuf))
        {
            xlmeta.root = metad->btm_root;
            xlmeta.level = metad->btm_level;
            xlmeta.fastroot = metad->btm_fastroot;
            xlmeta.fastlevel = metad->btm_fastlevel;

            nextrdata->data = (char *) &xlmeta;
            nextrdata->len = sizeof(xl_btree_metadata);
            nextrdata->buffer = InvalidBuffer;
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            xlinfo = XLOG_BTREE_DELETE_PAGE_META;
        }
        else if (parent_half_dead)
            xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
        else
            xlinfo = XLOG_BTREE_DELETE_PAGE;

        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->next = nextrdata + 1;
        nextrdata->buffer = pbuf;
        nextrdata->buffer_std = true;
        nextrdata++;

        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->buffer = rbuf;
        nextrdata->buffer_std = true;
        nextrdata->next = NULL;

        if (BufferIsValid(lbuf))
        {
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            nextrdata->data = NULL;
            nextrdata->len = 0;
            nextrdata->buffer = lbuf;
            nextrdata->buffer_std = true;
            nextrdata->next = NULL;
        }

        recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

        if (BufferIsValid(metabuf))
        {
            PageSetLSN(metapg, recptr);
            PageSetTLI(metapg, ThisTimeLineID);
        }
        page = BufferGetPage(pbuf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        page = BufferGetPage(rbuf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        page = BufferGetPage(buf);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        if (BufferIsValid(lbuf))
        {
            page = BufferGetPage(lbuf);
            PageSetLSN(page, recptr);
            PageSetTLI(page, ThisTimeLineID);
        }
    }

    END_CRIT_SECTION();

    /* release metapage; send out relcache inval if metapage changed */
    if (BufferIsValid(metabuf))
    {
        CacheInvalidateRelcache(rel);
        _bt_relbuf(rel, metabuf);
    }
    /* can always release leftsib immediately */
    if (BufferIsValid(lbuf))
        _bt_relbuf(rel, lbuf);

    /*
     * If parent became half dead, recurse to delete it. Otherwise, if right
     * sibling is empty and is now the last child of the parent, recurse to
     * try to delete it. (These cases cannot apply at the same time, though
     * the second case might itself recurse to the first.)
     *
     * When recursing to parent, we hold the lock on the target page until
     * done. This delays any insertions into the keyspace that was just
     * effectively reassigned to the parent's right sibling. If we allowed
     * that, and there were enough such insertions before we finish deleting
     * the parent, page splits within that keyspace could lead to inserting
     * out-of-order keys into the grandparent level. It is thought that that
     * wouldn't have any serious consequences, but it still seems like a
     * pretty bad idea.
     */
    if (parent_half_dead)
    {
        /* recursive call will release pbuf */
        _bt_relbuf(rel, rbuf);
        result = _bt_pagedel(rel, pbuf, stack->bts_parent, vacuum_full) + 1;
        _bt_relbuf(rel, buf);
    }
    else if (parent_one_child && rightsib_empty)
    {
        _bt_relbuf(rel, pbuf);
        _bt_relbuf(rel, buf);
        /* recursive call will release rbuf */
        result = _bt_pagedel(rel, rbuf, stack, vacuum_full) + 1;
    }
    else
    {
        _bt_relbuf(rel, pbuf);
        _bt_relbuf(rel, buf);
        _bt_relbuf(rel, rbuf);
        result = 1;
    }

    return result;
}