1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtinsert.c
4 : * Item insertion in Lehman and Yao btrees for Postgres.
5 : *
6 : * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.163 2007/12/31 04:52:05 tgl Exp $
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/heapam.h"
19 : #include "access/nbtree.h"
20 : #include "access/transam.h"
21 : #include "miscadmin.h"
22 : #include "utils/inval.h"
23 :
24 :
25 : typedef struct
26 : {
27 : /* context data for _bt_checksplitloc */
28 : Size newitemsz; /* size of new item to be inserted */
29 : int fillfactor; /* needed when splitting rightmost page */
30 : bool is_leaf; /* T if splitting a leaf page */
31 : bool is_rightmost; /* T if splitting a rightmost page */
32 : OffsetNumber newitemoff; /* where the new item is to be inserted */
33 : int leftspace; /* space available for items on left page */
34 : int rightspace; /* space available for items on right page */
35 : int olddataitemstotal; /* space taken by old items */
36 :
37 : bool have_split; /* found a valid split? */
38 :
39 : /* these fields valid only if have_split is true */
40 : bool newitemonleft; /* new item on left or right of best split */
41 : OffsetNumber firstright; /* best split point */
42 : int best_delta; /* best size delta so far */
43 : } FindSplitData;
44 :
45 :
46 : static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
47 :
48 : static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
49 : Relation heapRel, Buffer buf, OffsetNumber ioffset,
50 : ScanKey itup_scankey);
51 : static void _bt_findinsertloc(Relation rel,
52 : Buffer *bufptr,
53 : OffsetNumber *offsetptr,
54 : int keysz,
55 : ScanKey scankey,
56 : IndexTuple newtup);
57 : static void _bt_insertonpg(Relation rel, Buffer buf,
58 : BTStack stack,
59 : IndexTuple itup,
60 : OffsetNumber newitemoff,
61 : bool split_only_page);
62 : static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
63 : OffsetNumber newitemoff, Size newitemsz,
64 : IndexTuple newitem, bool newitemonleft);
65 : static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
66 : OffsetNumber newitemoff,
67 : Size newitemsz,
68 : bool *newitemonleft);
69 : static void _bt_checksplitloc(FindSplitData *state,
70 : OffsetNumber firstoldonright, bool newitemonleft,
71 : int dataitemstoleft, Size firstoldonrightsz);
72 : static void _bt_pgaddtup(Relation rel, Page page,
73 : Size itemsize, IndexTuple itup,
74 : OffsetNumber itup_off, const char *where);
75 : static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
76 : int keysz, ScanKey scankey);
77 : static void _bt_vacuum_one_page(Relation rel, Buffer buffer);
78 :
79 :
80 : /*
81 : * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
82 : *
83 : * This routine is called by the public interface routines, btbuild
84 : * and btinsert. By here, itup is filled in, including the TID.
85 : */
86 : void
87 : _bt_doinsert(Relation rel, IndexTuple itup,
88 : bool index_is_unique, Relation heapRel)
89 70109 : {
90 70109 : int natts = rel->rd_rel->relnatts;
91 : ScanKey itup_scankey;
92 : BTStack stack;
93 : Buffer buf;
94 : OffsetNumber offset;
95 :
96 : /* we need an insertion scan key to do our search, so build one */
97 70109 : itup_scankey = _bt_mkscankey(rel, itup);
98 :
99 70109 : top:
100 : /* find the first page containing this key */
101 70109 : stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
102 :
103 70109 : offset = InvalidOffsetNumber;
104 :
105 : /* trade in our read lock for a write lock */
106 70109 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
107 70109 : LockBuffer(buf, BT_WRITE);
108 :
109 : /*
110 : * If the page was split between the time that we surrendered our read
111 : * lock and acquired our write lock, then this page may no longer be the
112 : * right place for the key we want to insert. In this case, we need to
113 : * move right in the tree. See Lehman and Yao for an excruciatingly
114 : * precise description.
115 : */
116 70109 : buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
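	/*
	 * A concrete (hypothetical) example of the move-right case: suppose
	 * the leaf we located holds keys 1..10, and a concurrent backend
	 * splits it into [1..5] and [6..10] while our locks were down.  If we
	 * want to insert 8, our key now exceeds the left page's new high key
	 * (5), so _bt_moveright follows the btpo_next right-links until it
	 * reaches a page whose high key admits 8, write-locking that page
	 * instead.
	 */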
117 :
118 : /*
119 : * If we're not allowing duplicates, make sure the key isn't already in
120 : * the index.
121 : *
122 : * NOTE: obviously, _bt_check_unique can only detect keys that are already
123 : * in the index; so it cannot defend against concurrent insertions of the
124 : * same key. We protect against that by means of holding a write lock on
125 : * the target page. Any other would-be inserter of the same key must
126 : * acquire a write lock on the same target page, so only one would-be
127 : * inserter can be making the check at one time. Furthermore, once we are
128 : * past the check we hold write locks continuously until we have performed
129 : * our insertion, so no later inserter can fail to see our insertion.
130 : * (This requires some care in _bt_insertonpg.)
131 : *
132 : * If we must wait for another xact, we release the lock while waiting,
133 : * and then must start over completely.
134 : */
135 70109 : if (index_is_unique)
136 : {
137 : TransactionId xwait;
138 :
139 34237 : offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
140 34237 : xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
141 :
142 34215 : if (TransactionIdIsValid(xwait))
143 : {
144 : /* Have to wait for the other guy ... */
145 0 : _bt_relbuf(rel, buf);
146 0 : XactLockTableWait(xwait);
147 : /* start over... */
148 0 : _bt_freestack(stack);
149 0 : goto top;
150 : }
151 : }
152 :
153 : /* do the insertion */
154 70087 : _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
155 70087 : _bt_insertonpg(rel, buf, stack, itup, offset, false);
156 :
157 : /* be tidy */
158 70087 : _bt_freestack(stack);
159 70087 : _bt_freeskey(itup_scankey);
160 70087 : }
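
/*
 * For orientation, a minimal sketch of how the public btinsert routine is
 * expected to drive _bt_doinsert: form the index tuple from the column
 * values, stamp the heap TID into it, and hand it off.  This is an
 * illustration only (compiled out below), not a verbatim copy of the real
 * btinsert, whose fmgr argument unpacking differs.
 */
#ifdef NBTINSERT_EXAMPLE
static bool
example_btinsert(Relation rel, Datum *values, bool *isnull,
				 ItemPointer ht_ctid, Relation heapRel, bool checkUnique)
{
	IndexTuple	itup;

	/* generate an index tuple from the column values */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	/* point the index entry at the heap tuple it describes */
	itup->t_tid = *ht_ctid;

	_bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);
	return true;
}
#endif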
161 :
162 : /*
163 : * _bt_check_unique() -- Check for violation of unique index constraint
164 : *
165 : * offset points to the first possible item that could conflict. It can
166 : * also point to end-of-page, which means that the first tuple to check
167 : * is the first tuple on the next page.
168 : *
169 : * Returns InvalidTransactionId if there is no conflict, else an xact ID
170 : * we must wait for to see if it commits a conflicting tuple. If an actual
171 : * conflict is detected, no return --- just ereport().
172 : */
173 : static TransactionId
174 : _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
175 : Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
176 34237 : {
177 34237 : TupleDesc itupdesc = RelationGetDescr(rel);
178 34237 : int natts = rel->rd_rel->relnatts;
179 : SnapshotData SnapshotDirty;
180 : OffsetNumber maxoff;
181 : Page page;
182 : BTPageOpaque opaque;
183 34237 : Buffer nbuf = InvalidBuffer;
184 :
185 34237 : InitDirtySnapshot(SnapshotDirty);
186 :
187 34237 : page = BufferGetPage(buf);
188 34237 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
189 34237 : maxoff = PageGetMaxOffsetNumber(page);
190 :
191 : /*
192 : * Scan over all equal tuples, looking for live conflicts.
193 : */
194 : for (;;)
195 : {
196 : ItemId curitemid;
197 : IndexTuple curitup;
198 : BlockNumber nblkno;
199 :
200 : /*
201 : * make sure the offset points to an actual item before trying to
202 : * examine it...
203 : */
204 38550 : if (offset <= maxoff)
205 : {
206 23020 : curitemid = PageGetItemId(page, offset);
207 :
208 : /*
209 : * We can skip items that are marked killed.
210 : *
211 : * Formerly, we applied _bt_isequal() before checking the kill
212 : * flag, so as to fall out of the item loop as soon as possible.
213 : * However, in the presence of heavy update activity an index may
214 : * contain many killed items with the same key; running
215 : * _bt_isequal() on each killed item gets expensive. Furthermore
216 : * it is likely that the non-killed version of each key appears
217 : * first, so that we didn't actually get to exit any sooner
218 : * anyway. So now we just advance over killed items as quickly as
219 : * we can. We only apply _bt_isequal() when we get to a non-killed
220 : * item or the end of the page.
221 : */
222 23020 : if (!ItemIdIsDead(curitemid))
223 : {
224 : ItemPointerData htid;
225 : bool all_dead;
226 :
227 : /*
228 :              * _bt_compare returns 0 for (1,NULL) and (1,NULL); this is
229 :              * how we handle NULLs, and so we must not use _bt_compare
230 :              * for real comparisons, but only for ordering/finding items
231 :              * on pages. - vadim 03/24/97
232 : */
233 19945 : if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
234 18644 : break; /* we're past all the equal tuples */
235 :
236 : /* okay, we gotta fetch the heap tuple ... */
237 1301 : curitup = (IndexTuple) PageGetItem(page, curitemid);
238 1301 : htid = curitup->t_tid;
239 :
240 : /*
241 : * We check the whole HOT-chain to see if there is any tuple
242 : * that satisfies SnapshotDirty. This is necessary because we
243 : * have just a single index entry for the entire chain.
244 : */
245 1301 : if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead))
246 : {
247 : /* it is a duplicate */
248 : TransactionId xwait =
249 : (TransactionIdIsValid(SnapshotDirty.xmin)) ?
250 22 : SnapshotDirty.xmin : SnapshotDirty.xmax;
251 :
252 : /*
253 :                  * If this tuple is being updated by another transaction,
254 :                  * then we have to wait for it to commit or abort.
255 : */
256 22 : if (TransactionIdIsValid(xwait))
257 : {
258 0 : if (nbuf != InvalidBuffer)
259 0 : _bt_relbuf(rel, nbuf);
260 : /* Tell _bt_doinsert to wait... */
261 0 : return xwait;
262 : }
263 :
264 : /*
265 : * Otherwise we have a definite conflict. But before
266 : * complaining, look to see if the tuple we want to insert
267 : * is itself now committed dead --- if so, don't complain.
268 : * This is a waste of time in normal scenarios but we must
269 : * do it to support CREATE INDEX CONCURRENTLY.
270 : *
271 : * We must follow HOT-chains here because during
272 : * concurrent index build, we insert the root TID though
273 : * the actual tuple may be somewhere in the HOT-chain.
274 : * While following the chain we might not stop at the
275 : * exact tuple which triggered the insert, but that's OK
276 : * because if we find a live tuple anywhere in this chain,
277 : * we have a unique key conflict. The other live tuple is
278 : * not part of this chain because it had a different index
279 : * entry.
280 : */
281 22 : htid = itup->t_tid;
282 22 : if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
283 : {
284 : /* Normal case --- it's still live */
285 : }
286 : else
287 : {
288 : /*
289 : * It's been deleted, so no error, and no need to
290 : * continue searching
291 : */
292 0 : break;
293 : }
294 :
295 22 : ereport(ERROR,
296 : (errcode(ERRCODE_UNIQUE_VIOLATION),
297 : errmsg("duplicate key value violates unique constraint \"%s\"",
298 : RelationGetRelationName(rel))));
299 : }
300 1279 : else if (all_dead)
301 : {
302 : /*
303 : * The conflicting tuple (or whole HOT chain) is dead to
304 : * everyone, so we may as well mark the index entry
305 : * killed.
306 : */
307 32 : ItemIdMarkDead(curitemid);
308 32 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
309 : /* be sure to mark the proper buffer dirty... */
310 32 : if (nbuf != InvalidBuffer)
311 0 : SetBufferCommitInfoNeedsSave(nbuf);
312 : else
313 32 : SetBufferCommitInfoNeedsSave(buf);
314 : }
315 : }
316 : }
317 :
318 : /*
319 : * Advance to next tuple to continue checking.
320 : */
321 19884 : if (offset < maxoff)
322 4311 : offset = OffsetNumberNext(offset);
323 : else
324 : {
325 : /* If scankey == hikey we gotta check the next page too */
326 15573 : if (P_RIGHTMOST(opaque))
327 15181 : break;
328 392 : if (!_bt_isequal(itupdesc, page, P_HIKEY,
329 : natts, itup_scankey))
330 390 : break;
331 : /* Advance to next non-dead page --- there must be one */
332 : for (;;)
333 : {
334 2 : nblkno = opaque->btpo_next;
335 2 : nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
336 2 : page = BufferGetPage(nbuf);
337 2 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
338 2 : if (!P_IGNORE(opaque))
339 2 : break;
340 0 : if (P_RIGHTMOST(opaque))
341 0 : elog(ERROR, "fell off the end of index \"%s\"",
342 : RelationGetRelationName(rel));
343 : }
344 2 : maxoff = PageGetMaxOffsetNumber(page);
345 2 : offset = P_FIRSTDATAKEY(opaque);
346 : }
347 : }
348 :
349 34215 : if (nbuf != InvalidBuffer)
350 2 : _bt_relbuf(rel, nbuf);
351 :
352 34215 : return InvalidTransactionId;
353 : }
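
/*
 * The decision structure above boils down to three outcomes per equal,
 * non-dead index entry.  The toy function below (compiled out; the names
 * are invented for illustration) restates that logic: a live HOT chain
 * with an in-progress xmin or xmax means wait, a live chain with neither
 * means a committed duplicate, and a dead chain is no conflict at all.
 */
#ifdef NBTINSERT_EXAMPLE
typedef enum
{
	UNIQ_OK,					/* no live duplicate: keep scanning */
	UNIQ_WAIT,					/* duplicate from an open xact: wait, retry */
	UNIQ_ERROR					/* committed live duplicate: report it */
} UniqAction;

static UniqAction
example_unique_action(bool chain_is_live,
					  TransactionId snap_xmin, TransactionId snap_xmax)
{
	if (!chain_is_live)
		return UNIQ_OK;
	if (TransactionIdIsValid(snap_xmin) || TransactionIdIsValid(snap_xmax))
		return UNIQ_WAIT;
	return UNIQ_ERROR;
}
#endif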
354 :
355 :
356 : /*
357 : * _bt_findinsertloc() -- Finds an insert location for a tuple
358 : *
359 : * If the new key is equal to one or more existing keys, we can
360 : * legitimately place it anywhere in the series of equal keys --- in fact,
361 : * if the new key is equal to the page's "high key" we can place it on
362 : * the next page. If it is equal to the high key, and there's not room
363 : * to insert the new tuple on the current page without splitting, then
364 : * we can move right hoping to find more free space and avoid a split.
365 : * (We should not move right indefinitely, however, since that leads to
366 : * O(N^2) insertion behavior in the presence of many equal keys.)
367 : * Once we have chosen the page to put the key on, we'll insert it before
368 : * any existing equal keys because of the way _bt_binsrch() works.
369 : *
370 :  * If there's not enough room on the page, we try to make room by
371 : * removing any LP_DEAD tuples.
372 : *
373 :  * On entry, *bufptr and *offsetptr point to the first legal position
374 :  * where the new tuple could be inserted. The caller should hold an
375 :  * exclusive lock on *bufptr. *offsetptr can also be set to
376 : * InvalidOffsetNumber, in which case the function will search for the
377 : * right location within the page if needed. On exit, they point to the
378 : * chosen insert location. If _bt_findinsertloc decides to move right,
379 : * the lock and pin on the original page will be released and the new
380 : * page returned to the caller is exclusively locked instead.
381 : *
382 : * newtup is the new tuple we're inserting, and scankey is an insertion
383 : * type scan key for it.
384 : */
385 : static void
386 : _bt_findinsertloc(Relation rel,
387 : Buffer *bufptr,
388 : OffsetNumber *offsetptr,
389 : int keysz,
390 : ScanKey scankey,
391 : IndexTuple newtup)
392 70087 : {
393 70087 : Buffer buf = *bufptr;
394 70087 : Page page = BufferGetPage(buf);
395 : Size itemsz;
396 : BTPageOpaque lpageop;
397 : bool movedright,
398 : vacuumed;
399 : OffsetNumber newitemoff;
400 70087 : OffsetNumber firstlegaloff = *offsetptr;
401 :
402 70087 : lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
403 :
404 70087 : itemsz = IndexTupleDSize(*newtup);
405 70087 : itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
406 : * need to be consistent */
407 :
408 : /*
409 : * Check whether the item can fit on a btree page at all. (Eventually, we
410 : * ought to try to apply TOAST methods if not.) We actually need to be
411 : * able to fit three items on every page, so restrict any one item to 1/3
412 : * the per-page available space. Note that at this point, itemsz doesn't
413 : * include the ItemId.
414 : */
415 70087 : if (itemsz > BTMaxItemSize(page))
416 0 : ereport(ERROR,
417 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
418 : errmsg("index row size %lu exceeds btree maximum, %lu",
419 : (unsigned long) itemsz,
420 : (unsigned long) BTMaxItemSize(page)),
421 : errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
422 : "Consider a function index of an MD5 hash of the value, "
423 : "or use full text indexing.")));
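
	/*
	 * To put a rough number on that limit (assuming the default 8192-byte
	 * BLCKSZ, a 24-byte page header, and a 16-byte btree special area):
	 * each of the three items may occupy about (8192 - 24 - 16) / 3, i.e.
	 * roughly 2717 bytes, less a little for the three line pointers.
	 */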
424 :
425 : /*----------
426 : * If we will need to split the page to put the item on this page,
427 : * check whether we can put the tuple somewhere to the right,
428 : * instead. Keep scanning right until we
429 : * (a) find a page with enough free space,
430 : * (b) reach the last page where the tuple can legally go, or
431 : * (c) get tired of searching.
432 : * (c) is not flippant; it is important because if there are many
433 : * pages' worth of equal keys, it's better to split one of the early
434 : * pages than to scan all the way to the end of the run of equal keys
435 : * on every insert. We implement "get tired" as a random choice,
436 : * since stopping after scanning a fixed number of pages wouldn't work
437 : * well (we'd never reach the right-hand side of previously split
438 : * pages). Currently the probability of moving right is set at 0.99,
439 : * which may seem too high to change the behavior much, but it does an
440 : * excellent job of preventing O(N^2) behavior with many equal keys.
441 : *----------
442 : */
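	/*
	 * Worked out: with a per-page stop probability of p = 0.01, the number
	 * of pages visited is geometrically distributed, so on average we scan
	 * 1/p = 100 pages before "getting tired", no matter how long the run
	 * of equal keys actually is.
	 */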
443 70087 : movedright = false;
444 70087 : vacuumed = false;
445 141323 : while (PageGetFreeSpace(page) < itemsz)
446 : {
447 : Buffer rbuf;
448 :
449 : /*
450 : * before considering moving right, see if we can obtain enough space
451 : * by erasing LP_DEAD items
452 : */
453 1637 : if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
454 : {
455 106 : _bt_vacuum_one_page(rel, buf);
456 :
457 : /*
458 : * remember that we vacuumed this page, because that makes the
459 : * hint supplied by the caller invalid
460 : */
461 106 : vacuumed = true;
462 :
463 106 : if (PageGetFreeSpace(page) >= itemsz)
464 106 : break; /* OK, now we have enough space */
465 : }
466 :
467 : /*
468 : * nope, so check conditions (b) and (c) enumerated above
469 : */
470 1531 : if (P_RIGHTMOST(lpageop) ||
471 : _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
472 : random() <= (MAX_RANDOM_VALUE / 100))
473 : break;
474 :
475 : /*
476 : * step right to next non-dead page
477 : *
478 : * must write-lock that page before releasing write lock on current
479 : * page; else someone else's _bt_check_unique scan could fail to see
480 : * our insertion. write locks on intermediate dead pages won't do
481 : * because we don't know when they will get de-linked from the tree.
482 : */
483 1149 : rbuf = InvalidBuffer;
484 :
485 : for (;;)
486 : {
487 1149 : BlockNumber rblkno = lpageop->btpo_next;
488 :
489 1149 : rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
490 1149 : page = BufferGetPage(rbuf);
491 1149 : lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
492 1149 : if (!P_IGNORE(lpageop))
493 1149 : break;
494 0 : if (P_RIGHTMOST(lpageop))
495 0 : elog(ERROR, "fell off the end of index \"%s\"",
496 : RelationGetRelationName(rel));
497 : }
498 1149 : _bt_relbuf(rel, buf);
499 1149 : buf = rbuf;
500 1149 : movedright = true;
501 1149 : vacuumed = false;
502 : }
503 :
504 : /*
505 : * Now we are on the right page, so find the insert position. If we moved
506 : * right at all, we know we should insert at the start of the page. If we
507 : * didn't move right, we can use the firstlegaloff hint if the caller
508 : * supplied one, unless we vacuumed the page which might have moved tuples
509 : * around making the hint invalid. If we didn't move right or can't use
510 : * the hint, find the position by searching.
511 : */
512 70087 : if (movedright)
513 1149 : newitemoff = P_FIRSTDATAKEY(lpageop);
514 68938 : else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
515 34147 : newitemoff = firstlegaloff;
516 : else
517 34791 : newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
518 :
519 70087 : *bufptr = buf;
520 70087 : *offsetptr = newitemoff;
521 70087 : }
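
/*
 * A self-contained sketch (compiled out, standard library only) that
 * estimates the "get tired" behavior used above: each trial keeps moving
 * right until a 1-in-100 coin says stop, mirroring the
 * random() <= MAX_RANDOM_VALUE / 100 test; the printed average converges
 * on 100 pages per worst-case insertion.
 */
#ifdef NBTINSERT_EXAMPLE
#include <stdio.h>
#include <stdlib.h>

static void
example_get_tired_sim(void)
{
	long		total = 0;
	int			trials = 100000;
	int			t;

	for (t = 0; t < trials; t++)
	{
		int			pages = 1;

		/* keep moving right until the 1% "stop" branch fires */
		while (rand() % 100 != 0)
			pages++;
		total += pages;
	}
	printf("average pages scanned: %.2f\n", (double) total / trials);
}
#endif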
522 :
523 : /*----------
524 : * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
525 : *
526 : * This recursive procedure does the following things:
527 : *
528 : * + if necessary, splits the target page (making sure that the
529 : * split is equitable as far as post-insert free space goes).
530 : * + inserts the tuple.
531 : * + if the page was split, pops the parent stack, and finds the
532 : * right place to insert the new child pointer (by walking
533 : * right using information stored in the parent stack).
534 : * + invokes itself with the appropriate tuple for the right
535 : * child page on the parent.
536 : * + updates the metapage if a true root or fast root is split.
537 : *
538 : * On entry, we must have the right buffer in which to do the
539 : * insertion, and the buffer must be pinned and write-locked. On return,
540 : * we will have dropped both the pin and the lock on the buffer.
541 : *
542 : * The locking interactions in this code are critical. You should
543 : * grok Lehman and Yao's paper before making any changes. In addition,
544 : * you need to understand how we disambiguate duplicate keys in this
545 : * implementation, in order to be able to find our location using
546 :  * L&Y "move right" operations. Since we may insert duplicate user
547 :  * keys, and since these dups may propagate up the tree, we use the
548 :  * child block number saved in the parent stack (bts_btentry) to
549 :  * re-find our position for the insertion on internal pages.
550 : *----------
551 : */
552 : static void
553 : _bt_insertonpg(Relation rel,
554 : Buffer buf,
555 : BTStack stack,
556 : IndexTuple itup,
557 : OffsetNumber newitemoff,
558 : bool split_only_page)
559 70456 : {
560 : Page page;
561 : BTPageOpaque lpageop;
562 70456 : OffsetNumber firstright = InvalidOffsetNumber;
563 : Size itemsz;
564 :
565 70456 : page = BufferGetPage(buf);
566 70456 : lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
567 :
568 70456 : itemsz = IndexTupleDSize(*itup);
569 70456 : itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
570 : * need to be consistent */
571 :
572 : /*
573 : * Do we need to split the page to fit the item on it?
574 : *
575 : * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
576 : * so this comparison is correct even though we appear to be accounting
577 : * only for the item and not for its line pointer.
578 : */
579 70456 : if (PageGetFreeSpace(page) < itemsz)
580 : {
581 383 : bool is_root = P_ISROOT(lpageop);
582 383 : bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
583 : bool newitemonleft;
584 : Buffer rbuf;
585 :
586 : /* Choose the split point */
587 383 : firstright = _bt_findsplitloc(rel, page,
588 : newitemoff, itemsz,
589 : &newitemonleft);
590 :
591 : /* split the buffer into left and right halves */
592 383 : rbuf = _bt_split(rel, buf, firstright,
593 : newitemoff, itemsz, itup, newitemonleft);
594 :
595 : /*----------
596 : * By here,
597 : *
598 : * + our target page has been split;
599 : * + the original tuple has been inserted;
600 : * + we have write locks on both the old (left half)
601 : * and new (right half) buffers, after the split; and
602 : * + we know the key we want to insert into the parent
603 : * (it's the "high key" on the left child page).
604 : *
605 : * We're ready to do the parent insertion. We need to hold onto the
606 : * locks for the child pages until we locate the parent, but we can
607 : * release them before doing the actual insertion (see Lehman and Yao
608 : * for the reasoning).
609 : *----------
610 : */
611 383 : _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
612 : }
613 : else
614 : {
615 70073 : Buffer metabuf = InvalidBuffer;
616 70073 : Page metapg = NULL;
617 70073 : BTMetaPageData *metad = NULL;
618 : OffsetNumber itup_off;
619 : BlockNumber itup_blkno;
620 :
621 70073 : itup_off = newitemoff;
622 70073 : itup_blkno = BufferGetBlockNumber(buf);
623 :
624 : /*
625 : * If we are doing this insert because we split a page that was the
626 : * only one on its tree level, but was not the root, it may have been
627 : * the "fast root". We need to ensure that the fast root link points
628 : * at or above the current page. We can safely acquire a lock on the
629 : * metapage here --- see comments for _bt_newroot().
630 : */
631 70073 : if (split_only_page)
632 : {
633 : Assert(!P_ISLEAF(lpageop));
634 :
635 0 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
636 0 : metapg = BufferGetPage(metabuf);
637 0 : metad = BTPageGetMeta(metapg);
638 :
639 0 : if (metad->btm_fastlevel >= lpageop->btpo.level)
640 : {
641 : /* no update wanted */
642 0 : _bt_relbuf(rel, metabuf);
643 0 : metabuf = InvalidBuffer;
644 : }
645 : }
646 :
647 : /* Do the update. No ereport(ERROR) until changes are logged */
648 70073 : START_CRIT_SECTION();
649 :
650 70073 : _bt_pgaddtup(rel, page, itemsz, itup, newitemoff, "page");
651 :
652 70073 : MarkBufferDirty(buf);
653 :
654 70073 : if (BufferIsValid(metabuf))
655 : {
656 0 : metad->btm_fastroot = itup_blkno;
657 0 : metad->btm_fastlevel = lpageop->btpo.level;
658 0 : MarkBufferDirty(metabuf);
659 : }
660 :
661 : /* XLOG stuff */
662 70073 : if (!rel->rd_istemp)
663 : {
664 : xl_btree_insert xlrec;
665 : BlockNumber xldownlink;
666 : xl_btree_metadata xlmeta;
667 : uint8 xlinfo;
668 : XLogRecPtr recptr;
669 : XLogRecData rdata[4];
670 : XLogRecData *nextrdata;
671 : IndexTupleData trunctuple;
672 :
673 70032 : xlrec.target.node = rel->rd_node;
674 70032 : ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
675 :
676 70032 : rdata[0].data = (char *) &xlrec;
677 70032 : rdata[0].len = SizeOfBtreeInsert;
678 70032 : rdata[0].buffer = InvalidBuffer;
679 70032 : rdata[0].next = nextrdata = &(rdata[1]);
680 :
681 70032 : if (P_ISLEAF(lpageop))
682 69664 : xlinfo = XLOG_BTREE_INSERT_LEAF;
683 : else
684 : {
685 368 : xldownlink = ItemPointerGetBlockNumber(&(itup->t_tid));
686 : Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
687 :
688 368 : nextrdata->data = (char *) &xldownlink;
689 368 : nextrdata->len = sizeof(BlockNumber);
690 368 : nextrdata->buffer = InvalidBuffer;
691 368 : nextrdata->next = nextrdata + 1;
692 368 : nextrdata++;
693 :
694 368 : xlinfo = XLOG_BTREE_INSERT_UPPER;
695 : }
696 :
697 70032 : if (BufferIsValid(metabuf))
698 : {
699 0 : xlmeta.root = metad->btm_root;
700 0 : xlmeta.level = metad->btm_level;
701 0 : xlmeta.fastroot = metad->btm_fastroot;
702 0 : xlmeta.fastlevel = metad->btm_fastlevel;
703 :
704 0 : nextrdata->data = (char *) &xlmeta;
705 0 : nextrdata->len = sizeof(xl_btree_metadata);
706 0 : nextrdata->buffer = InvalidBuffer;
707 0 : nextrdata->next = nextrdata + 1;
708 0 : nextrdata++;
709 :
710 0 : xlinfo = XLOG_BTREE_INSERT_META;
711 : }
712 :
713 : /* Read comments in _bt_pgaddtup */
714 70032 : if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
715 : {
716 0 : trunctuple = *itup;
717 0 : trunctuple.t_info = sizeof(IndexTupleData);
718 0 : nextrdata->data = (char *) &trunctuple;
719 0 : nextrdata->len = sizeof(IndexTupleData);
720 : }
721 : else
722 : {
723 70032 : nextrdata->data = (char *) itup;
724 70032 : nextrdata->len = IndexTupleDSize(*itup);
725 : }
726 70032 : nextrdata->buffer = buf;
727 70032 : nextrdata->buffer_std = true;
728 70032 : nextrdata->next = NULL;
729 :
730 70032 : recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
731 :
732 70032 : if (BufferIsValid(metabuf))
733 : {
734 0 : PageSetLSN(metapg, recptr);
735 0 : PageSetTLI(metapg, ThisTimeLineID);
736 : }
737 :
738 70032 : PageSetLSN(page, recptr);
739 70032 : PageSetTLI(page, ThisTimeLineID);
740 : }
741 :
742 70073 : END_CRIT_SECTION();
743 :
744 : /* release buffers; send out relcache inval if metapage changed */
745 70073 : if (BufferIsValid(metabuf))
746 : {
747 0 : CacheInvalidateRelcache(rel);
748 0 : _bt_relbuf(rel, metabuf);
749 : }
750 :
751 70073 : _bt_relbuf(rel, buf);
752 : }
753 70456 : }
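
/*
 * The WAL logging above builds its record as a chain of XLogRecData
 * entries: a fixed-size header chunk, optional chunks for the downlink
 * and metapage data, and finally the tuple itself, attached to the buffer
 * so that a full-page image can replace it.  A stripped-down model of the
 * same pattern (local types, illustrative only):
 */
#ifdef NBTINSERT_EXAMPLE
typedef struct ExampleRecData
{
	char	   *data;			/* start of this chunk of record data */
	int			len;			/* length of this chunk */
	struct ExampleRecData *next;	/* next chunk in the chain, or NULL */
} ExampleRecData;

static int
example_record_total_len(ExampleRecData *rdata)
{
	int			total = 0;

	/* walk the chain just as XLogInsert consumes the rdata list */
	for (; rdata != NULL; rdata = rdata->next)
		total += rdata->len;
	return total;
}
#endif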
754 :
755 : /*
756 : * _bt_split() -- split a page in the btree.
757 : *
758 : * On entry, buf is the page to split, and is pinned and write-locked.
759 : * firstright is the item index of the first item to be moved to the
760 : * new right page. newitemoff etc. tell us about the new item that
761 : * must be inserted along with the data from the old page.
762 : *
763 : * Returns the new right sibling of buf, pinned and write-locked.
764 : * The pin and lock on buf are maintained.
765 : */
766 : static Buffer
767 : _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
768 : OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
769 : bool newitemonleft)
770 383 : {
771 : Buffer rbuf;
772 : Page origpage;
773 : Page leftpage,
774 : rightpage;
775 : BTPageOpaque ropaque,
776 : lopaque,
777 : oopaque;
778 383 : Buffer sbuf = InvalidBuffer;
779 383 : Page spage = NULL;
780 383 : BTPageOpaque sopaque = NULL;
781 : Size itemsz;
782 : ItemId itemid;
783 : IndexTuple item;
784 : OffsetNumber leftoff,
785 : rightoff;
786 : OffsetNumber maxoff;
787 : OffsetNumber i;
788 : bool isroot;
789 :
790 383 : rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
791 383 : origpage = BufferGetPage(buf);
792 383 : leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
793 383 : rightpage = BufferGetPage(rbuf);
794 :
795 383 : _bt_pageinit(leftpage, BufferGetPageSize(buf));
796 : /* rightpage was already initialized by _bt_getbuf */
797 :
798 : /*
799 : * Copy the original page's LSN and TLI into leftpage, which will become
800 : * the updated version of the page. We need this because XLogInsert will
801 : * examine these fields and possibly dump them in a page image.
802 : */
803 383 : PageSetLSN(leftpage, PageGetLSN(origpage));
804 383 : PageSetTLI(leftpage, PageGetTLI(origpage));
805 :
806 : /* init btree private data */
807 383 : oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
808 383 : lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
809 383 : ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
810 :
811 383 : isroot = P_ISROOT(oopaque);
812 :
813 : /* if we're splitting this page, it won't be the root when we're done */
814 : /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
815 383 : lopaque->btpo_flags = oopaque->btpo_flags;
816 383 : lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
817 383 : ropaque->btpo_flags = lopaque->btpo_flags;
818 383 : lopaque->btpo_prev = oopaque->btpo_prev;
819 383 : lopaque->btpo_next = BufferGetBlockNumber(rbuf);
820 383 : ropaque->btpo_prev = BufferGetBlockNumber(buf);
821 383 : ropaque->btpo_next = oopaque->btpo_next;
822 383 : lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
823 : /* Since we already have write-lock on both pages, ok to read cycleid */
824 383 : lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
825 383 : ropaque->btpo_cycleid = lopaque->btpo_cycleid;
826 :
827 : /*
828 : * If the page we're splitting is not the rightmost page at its level in
829 : * the tree, then the first entry on the page is the high key for the
830 : * page. We need to copy that to the right half. Otherwise (meaning the
831 : * rightmost page case), all the items on the right half will be user
832 : * data.
833 : */
834 383 : rightoff = P_HIKEY;
835 :
836 383 : if (!P_RIGHTMOST(oopaque))
837 : {
838 221 : itemid = PageGetItemId(origpage, P_HIKEY);
839 221 : itemsz = ItemIdGetLength(itemid);
840 221 : item = (IndexTuple) PageGetItem(origpage, itemid);
841 221 : if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
842 : false, false) == InvalidOffsetNumber)
843 0 : elog(PANIC, "failed to add hikey to the right sibling"
844 : " while splitting block %u of index \"%s\"",
845 : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
846 221 : rightoff = OffsetNumberNext(rightoff);
847 : }
848 :
849 : /*
850 : * The "high key" for the new left page will be the first key that's going
851 : * to go into the new right page. This might be either the existing data
852 : * item at position firstright, or the incoming tuple.
853 : */
854 383 : leftoff = P_HIKEY;
855 383 : if (!newitemonleft && newitemoff == firstright)
856 : {
857 : /* incoming tuple will become first on right page */
858 2 : itemsz = newitemsz;
859 2 : item = newitem;
860 : }
861 : else
862 : {
863 : /* existing item at firstright will become first on right page */
864 381 : itemid = PageGetItemId(origpage, firstright);
865 381 : itemsz = ItemIdGetLength(itemid);
866 381 : item = (IndexTuple) PageGetItem(origpage, itemid);
867 : }
868 383 : if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
869 : false, false) == InvalidOffsetNumber)
870 0 : elog(PANIC, "failed to add hikey to the left sibling"
871 : " while splitting block %u of index \"%s\"",
872 : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
873 383 : leftoff = OffsetNumberNext(leftoff);
874 :
875 : /*
876 : * Now transfer all the data items to the appropriate page.
877 : *
878 : * Note: we *must* insert at least the right page's items in item-number
879 : * order, for the benefit of _bt_restore_page().
880 : */
881 383 : maxoff = PageGetMaxOffsetNumber(origpage);
882 :
883 86862 : for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
884 : {
885 86479 : itemid = PageGetItemId(origpage, i);
886 86479 : itemsz = ItemIdGetLength(itemid);
887 86479 : item = (IndexTuple) PageGetItem(origpage, itemid);
888 :
889 : /* does new item belong before this one? */
890 86479 : if (i == newitemoff)
891 : {
892 315 : if (newitemonleft)
893 : {
894 92 : _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
895 : "left sibling");
896 92 : leftoff = OffsetNumberNext(leftoff);
897 : }
898 : else
899 : {
900 223 : _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
901 : "right sibling");
902 223 : rightoff = OffsetNumberNext(rightoff);
903 : }
904 : }
905 :
906 : /* decide which page to put it on */
907 86479 : if (i < firstright)
908 : {
909 53732 : _bt_pgaddtup(rel, leftpage, itemsz, item, leftoff,
910 : "left sibling");
911 53732 : leftoff = OffsetNumberNext(leftoff);
912 : }
913 : else
914 : {
915 32747 : _bt_pgaddtup(rel, rightpage, itemsz, item, rightoff,
916 : "right sibling");
917 32747 : rightoff = OffsetNumberNext(rightoff);
918 : }
919 : }
920 :
921 :     /* cope with the possibility that newitem goes at the end */
922 383 : if (i <= newitemoff)
923 : {
924 : /*
925 : * Can't have newitemonleft here; that would imply we were told to put
926 : * *everything* on the left page, which cannot fit (if it could, we'd
927 : * not be splitting the page).
928 : */
929 : Assert(!newitemonleft);
930 68 : _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
931 : "right sibling");
932 68 : rightoff = OffsetNumberNext(rightoff);
933 : }
934 :
935 : /*
936 : * We have to grab the right sibling (if any) and fix the prev pointer
937 : * there. We are guaranteed that this is deadlock-free since no other
938 : * writer will be holding a lock on that page and trying to move left, and
939 : * all readers release locks on a page before trying to fetch its
940 : * neighbors.
941 : */
942 :
943 383 : if (!P_RIGHTMOST(ropaque))
944 : {
945 221 : sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
946 221 : spage = BufferGetPage(sbuf);
947 221 : sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
948 221 : if (sopaque->btpo_prev != ropaque->btpo_prev)
949 0 : elog(PANIC, "right sibling's left-link doesn't match: "
950 : "block %u links to %u instead of expected %u in index \"%s\"",
951 : ropaque->btpo_next, sopaque->btpo_prev, ropaque->btpo_prev,
952 : RelationGetRelationName(rel));
953 :
954 : /*
955 : * Check to see if we can set the SPLIT_END flag in the right-hand
956 : * split page; this can save some I/O for vacuum since it need not
957 : * proceed to the right sibling. We can set the flag if the right
958 : * sibling has a different cycleid: that means it could not be part of
959 : * a group of pages that were all split off from the same ancestor
960 : * page. If you're confused, imagine that page A splits to A B and
961 : * then again, yielding A C B, while vacuum is in progress. Tuples
962 : * originally in A could now be in either B or C, hence vacuum must
963 : * examine both pages. But if D, our right sibling, has a different
964 : * cycleid then it could not contain any tuples that were in A when
965 : * the vacuum started.
966 : */
967 221 : if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
968 0 : ropaque->btpo_flags |= BTP_SPLIT_END;
969 : }
970 :
971 : /*
972 : * Right sibling is locked, new siblings are prepared, but original page
973 : * is not updated yet.
974 : *
975 : * NO EREPORT(ERROR) till right sibling is updated. We can get away with
976 : * not starting the critical section till here because we haven't been
977 : * scribbling on the original page yet, and we don't care about the new
978 : * sibling until it's linked into the btree.
979 : */
980 383 : START_CRIT_SECTION();
981 :
982 : /*
983 : * By here, the original data page has been split into two new halves, and
984 : * these are correct. The algorithm requires that the left page never
985 : * move during a split, so we copy the new left page back on top of the
986 : * original. Note that this is not a waste of time, since we also require
987 : * (in the page management code) that the center of a page always be
988 : * clean, and the most efficient way to guarantee this is just to compact
989 : * the data by reinserting it into a new left page. (XXX the latter
990 : * comment is probably obsolete.)
991 : *
992 : * We need to do this before writing the WAL record, so that XLogInsert
993 : * can WAL log an image of the page if necessary.
994 : */
995 383 : PageRestoreTempPage(leftpage, origpage);
996 :
997 383 : MarkBufferDirty(buf);
998 383 : MarkBufferDirty(rbuf);
999 :
1000 383 : if (!P_RIGHTMOST(ropaque))
1001 : {
1002 221 : sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
1003 221 : MarkBufferDirty(sbuf);
1004 : }
1005 :
1006 : /* XLOG stuff */
1007 383 : if (!rel->rd_istemp)
1008 : {
1009 : xl_btree_split xlrec;
1010 : uint8 xlinfo;
1011 : XLogRecPtr recptr;
1012 : XLogRecData rdata[7];
1013 : XLogRecData *lastrdata;
1014 :
1015 383 : xlrec.node = rel->rd_node;
1016 383 : xlrec.leftsib = BufferGetBlockNumber(buf);
1017 383 : xlrec.rightsib = BufferGetBlockNumber(rbuf);
1018 383 : xlrec.rnext = ropaque->btpo_next;
1019 383 : xlrec.level = ropaque->btpo.level;
1020 383 : xlrec.firstright = firstright;
1021 :
1022 383 : rdata[0].data = (char *) &xlrec;
1023 383 : rdata[0].len = SizeOfBtreeSplit;
1024 383 : rdata[0].buffer = InvalidBuffer;
1025 :
1026 383 : lastrdata = &rdata[0];
1027 :
1028 383 : if (ropaque->btpo.level > 0)
1029 : {
1030 : /* Log downlink on non-leaf pages */
1031 1 : lastrdata->next = lastrdata + 1;
1032 1 : lastrdata++;
1033 :
1034 1 : lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
1035 1 : lastrdata->len = sizeof(BlockIdData);
1036 1 : lastrdata->buffer = InvalidBuffer;
1037 :
1038 : /*
1039 : * We must also log the left page's high key, because the right
1040 : * page's leftmost key is suppressed on non-leaf levels. Show it
1041 : * as belonging to the left page buffer, so that it is not stored
1042 : * if XLogInsert decides it needs a full-page image of the left
1043 : * page.
1044 : */
1045 1 : lastrdata->next = lastrdata + 1;
1046 1 : lastrdata++;
1047 :
1048 1 : itemid = PageGetItemId(origpage, P_HIKEY);
1049 1 : item = (IndexTuple) PageGetItem(origpage, itemid);
1050 1 : lastrdata->data = (char *) item;
1051 1 : lastrdata->len = MAXALIGN(IndexTupleSize(item));
1052 1 : lastrdata->buffer = buf; /* backup block 1 */
1053 1 : lastrdata->buffer_std = true;
1054 : }
1055 :
1056 : /*
1057 : * Log the new item and its offset, if it was inserted on the left
1058 : * page. (If it was put on the right page, we don't need to explicitly
1059 : * WAL log it because it's included with all the other items on the
1060 : * right page.) Show the new item as belonging to the left page
1061 : * buffer, so that it is not stored if XLogInsert decides it needs a
1062 : * full-page image of the left page. We store the offset anyway,
1063 : * though, to support archive compression of these records.
1064 : */
1065 383 : if (newitemonleft)
1066 : {
1067 92 : lastrdata->next = lastrdata + 1;
1068 92 : lastrdata++;
1069 :
1070 92 : lastrdata->data = (char *) &newitemoff;
1071 92 : lastrdata->len = sizeof(OffsetNumber);
1072 92 : lastrdata->buffer = InvalidBuffer;
1073 :
1074 92 : lastrdata->next = lastrdata + 1;
1075 92 : lastrdata++;
1076 :
1077 92 : lastrdata->data = (char *) newitem;
1078 92 : lastrdata->len = MAXALIGN(newitemsz);
1079 92 : lastrdata->buffer = buf; /* backup block 1 */
1080 92 : lastrdata->buffer_std = true;
1081 : }
1082 291 : else if (ropaque->btpo.level == 0)
1083 : {
1084 : /*
1085 : * Although we don't need to WAL-log the new item, we still need
1086 : * XLogInsert to consider storing a full-page image of the left
1087 : * page, so make an empty entry referencing that buffer. This also
1088 : * ensures that the left page is always backup block 1.
1089 : */
1090 290 : lastrdata->next = lastrdata + 1;
1091 290 : lastrdata++;
1092 :
1093 290 : lastrdata->data = NULL;
1094 290 : lastrdata->len = 0;
1095 290 : lastrdata->buffer = buf; /* backup block 1 */
1096 290 : lastrdata->buffer_std = true;
1097 : }
1098 :
1099 : /*
1100 : * Log the contents of the right page in the format understood by
1101 : * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1102 : * because we're going to recreate the whole page anyway, so it should
1103 : * never be stored by XLogInsert.
1104 : *
1105 :          * Direct access to the page is not pretty, but it is faster; we
1106 :          * ought to add a proper function to the page API for this. Note
1107 :          * we only store the tuples themselves, knowing that they were
1108 :          * inserted in item-number order and so the item pointers can be
1109 :          * reconstructed. See comments for _bt_restore_page().
1110 : */
1111 383 : lastrdata->next = lastrdata + 1;
1112 383 : lastrdata++;
1113 :
1114 383 : lastrdata->data = (char *) rightpage +
1115 : ((PageHeader) rightpage)->pd_upper;
1116 383 : lastrdata->len = ((PageHeader) rightpage)->pd_special -
1117 : ((PageHeader) rightpage)->pd_upper;
1118 383 : lastrdata->buffer = InvalidBuffer;
1119 :
1120 :     /* Log the right sibling, because we've changed its prev-pointer. */
1121 383 : if (!P_RIGHTMOST(ropaque))
1122 : {
1123 221 : lastrdata->next = lastrdata + 1;
1124 221 : lastrdata++;
1125 :
1126 221 : lastrdata->data = NULL;
1127 221 : lastrdata->len = 0;
1128 221 : lastrdata->buffer = sbuf; /* backup block 2 */
1129 221 : lastrdata->buffer_std = true;
1130 : }
1131 :
1132 383 : lastrdata->next = NULL;
1133 :
1134 383 : if (isroot)
1135 14 : xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
1136 : else
1137 369 : xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
1138 :
1139 383 : recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
1140 :
1141 383 : PageSetLSN(origpage, recptr);
1142 383 : PageSetTLI(origpage, ThisTimeLineID);
1143 383 : PageSetLSN(rightpage, recptr);
1144 383 : PageSetTLI(rightpage, ThisTimeLineID);
1145 383 : if (!P_RIGHTMOST(ropaque))
1146 : {
1147 221 : PageSetLSN(spage, recptr);
1148 221 : PageSetTLI(spage, ThisTimeLineID);
1149 : }
1150 : }
1151 :
1152 383 : END_CRIT_SECTION();
1153 :
1154 : /* release the old right sibling */
1155 383 : if (!P_RIGHTMOST(ropaque))
1156 221 : _bt_relbuf(rel, sbuf);
1157 :
1158 : /* split's done */
1159 383 : return rbuf;
1160 : }
1161 :
1162 : /*
1163 : * _bt_findsplitloc() -- find an appropriate place to split a page.
1164 : *
1165 : * The idea here is to equalize the free space that will be on each split
1166 : * page, *after accounting for the inserted tuple*. (If we fail to account
1167 : * for it, we might find ourselves with too little room on the page that
1168 : * it needs to go into!)
1169 : *
1170 : * If the page is the rightmost page on its level, we instead try to arrange
1171 : * to leave the left split page fillfactor% full. In this way, when we are
1172 : * inserting successively increasing keys (consider sequences, timestamps,
1173 : * etc) we will end up with a tree whose pages are about fillfactor% full,
1174 : * instead of the 50% full result that we'd get without this special case.
1175 :  * This is the same as what nbtsort.c produces for a newly-created tree. Note
1176 : * that leaf and nonleaf pages use different fillfactors.
1177 : *
1178 : * We are passed the intended insert position of the new tuple, expressed as
1179 : * the offsetnumber of the tuple it must go in front of. (This could be
1180 : * maxoff+1 if the tuple is to go at the end.)
1181 : *
1182 : * We return the index of the first existing tuple that should go on the
1183 : * righthand page, plus a boolean indicating whether the new tuple goes on
1184 : * the left or right page. The bool is necessary to disambiguate the case
1185 : * where firstright == newitemoff.
1186 : */
1187 : static OffsetNumber
1188 : _bt_findsplitloc(Relation rel,
1189 : Page page,
1190 : OffsetNumber newitemoff,
1191 : Size newitemsz,
1192 : bool *newitemonleft)
1193 383 : {
1194 : BTPageOpaque opaque;
1195 : OffsetNumber offnum;
1196 : OffsetNumber maxoff;
1197 : ItemId itemid;
1198 : FindSplitData state;
1199 : int leftspace,
1200 : rightspace,
1201 : goodenough,
1202 : olddataitemstotal,
1203 : olddataitemstoleft;
1204 : bool goodenoughfound;
1205 :
1206 383 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1207 :
1208 : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
1209 383 : newitemsz += sizeof(ItemIdData);
1210 :
1211 : /* Total free space available on a btree page, after fixed overhead */
1212 383 : leftspace = rightspace =
1213 : PageGetPageSize(page) - SizeOfPageHeaderData -
1214 : MAXALIGN(sizeof(BTPageOpaqueData));
1215 :
1216 : /* The right page will have the same high key as the old page */
1217 383 : if (!P_RIGHTMOST(opaque))
1218 : {
1219 221 : itemid = PageGetItemId(page, P_HIKEY);
1220 221 : rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
1221 : sizeof(ItemIdData));
1222 : }
1223 :
1224 : /* Count up total space in data items without actually scanning 'em */
1225 383 : olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
1226 :
1227 383 : state.newitemsz = newitemsz;
1228 383 : state.is_leaf = P_ISLEAF(opaque);
1229 383 : state.is_rightmost = P_RIGHTMOST(opaque);
1230 383 : state.have_split = false;
1231 383 : if (state.is_leaf)
1232 382 : state.fillfactor = RelationGetFillFactor(rel,
1233 : BTREE_DEFAULT_FILLFACTOR);
1234 : else
1235 1 : state.fillfactor = BTREE_NONLEAF_FILLFACTOR;
1236 383 : state.newitemonleft = false; /* these just to keep compiler quiet */
1237 383 : state.firstright = 0;
1238 383 : state.best_delta = 0;
1239 383 : state.leftspace = leftspace;
1240 383 : state.rightspace = rightspace;
1241 383 : state.olddataitemstotal = olddataitemstotal;
1242 383 : state.newitemoff = newitemoff;
1243 :
1244 : /*
1245 : * Finding the best possible split would require checking all the possible
1246 : * split points, because of the high-key and left-key special cases.
1247 : * That's probably more work than it's worth; instead, stop as soon as we
1248 : * find a "good-enough" split, where good-enough is defined as an
1249 : * imbalance in free space of no more than pagesize/16 (arbitrary...) This
1250 : * should let us stop near the middle on most pages, instead of plowing to
1251 : * the end.
1252 : */
1253 383 : goodenough = leftspace / 16;
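
	/*
	 * Concretely (again assuming an 8192-byte page, 24-byte header, and
	 * 16-byte special area): leftspace is about 8152 bytes, so the two
	 * halves' free space may differ by at most roughly 509 bytes before
	 * we stop looking for a better split.
	 */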
1254 :
1255 : /*
1256 : * Scan through the data items and calculate space usage for a split at
1257 : * each possible position.
1258 : */
1259 383 : olddataitemstoleft = 0;
1260 383 : goodenoughfound = false;
1261 383 : maxoff = PageGetMaxOffsetNumber(page);
1262 :
1263 383 : for (offnum = P_FIRSTDATAKEY(opaque);
1264 55691 : offnum <= maxoff;
1265 54925 : offnum = OffsetNumberNext(offnum))
1266 : {
1267 : Size itemsz;
1268 :
1269 55192 : itemid = PageGetItemId(page, offnum);
1270 55192 : itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1271 :
1272 : /*
1273 : * Will the new item go to left or right of split?
1274 : */
1275 55192 : if (offnum > newitemoff)
1276 7352 : _bt_checksplitloc(&state, offnum, true,
1277 : olddataitemstoleft, itemsz);
1278 :
1279 47840 : else if (offnum < newitemoff)
1280 47684 : _bt_checksplitloc(&state, offnum, false,
1281 : olddataitemstoleft, itemsz);
1282 : else
1283 : {
1284 : /* need to try it both ways! */
1285 156 : _bt_checksplitloc(&state, offnum, true,
1286 : olddataitemstoleft, itemsz);
1287 :
1288 156 : _bt_checksplitloc(&state, offnum, false,
1289 : olddataitemstoleft, itemsz);
1290 : }
1291 :
1292 : /* Abort scan once we find a good-enough choice */
1293 55192 : if (state.have_split && state.best_delta <= goodenough)
1294 : {
1295 267 : goodenoughfound = true;
1296 267 : break;
1297 : }
1298 :
1299 54925 : olddataitemstoleft += itemsz;
1300 : }
1301 :
1302 : /*
1303 : * If the new item goes as the last item, check for splitting so that all
1304 : * the old items go to the left page and the new item goes to the right
1305 : * page.
1306 : */
1307 383 : if (newitemoff > maxoff && !goodenoughfound)
1308 42 : _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
1309 :
1310 : /*
1311 : * I believe it is not possible to fail to find a feasible split, but just
1312 : * in case ...
1313 : */
1314 383 : if (!state.have_split)
1315 0 : elog(ERROR, "could not find a feasible split point for index \"%s\"",
1316 : RelationGetRelationName(rel));
1317 :
1318 383 : *newitemonleft = state.newitemonleft;
1319 383 : return state.firstright;
1320 : }
1321 :
1322 : /*
1323 : * Subroutine to analyze a particular possible split choice (ie, firstright
1324 : * and newitemonleft settings), and record the best split so far in *state.
1325 : *
1326 : * firstoldonright is the offset of the first item on the original page
1327 : * that goes to the right page, and firstoldonrightsz is the size of that
1328 : * tuple. firstoldonright can be > max offset, which means that all the old
1329 : * items go to the left page and only the new item goes to the right page.
1330 : * In that case, firstoldonrightsz is not used.
1331 : *
1332 : * olddataitemstoleft is the total size of all old items to the left of
1333 : * firstoldonright.
1334 : */
1335 : static void
1336 : _bt_checksplitloc(FindSplitData *state,
1337 : OffsetNumber firstoldonright,
1338 : bool newitemonleft,
1339 : int olddataitemstoleft,
1340 : Size firstoldonrightsz)
1341 55390 : {
1342 : int leftfree,
1343 : rightfree;
1344 : Size firstrightitemsz;
1345 : bool newitemisfirstonright;
1346 :
1347 : /* Is the new item going to be the first item on the right page? */
1348 55390 : newitemisfirstonright = (firstoldonright == state->newitemoff
1349 : && !newitemonleft);
1350 :
1351 55390 : if (newitemisfirstonright)
1352 198 : firstrightitemsz = state->newitemsz;
1353 : else
1354 55192 : firstrightitemsz = firstoldonrightsz;
1355 :
1356 : /* Account for all the old tuples */
1357 55390 : leftfree = state->leftspace - olddataitemstoleft;
1358 55390 : rightfree = state->rightspace -
1359 : (state->olddataitemstotal - olddataitemstoleft);
1360 :
1361 : /*
1362 : * The first item on the right page becomes the high key of the left page;
1363 : * therefore it counts against left space as well as right space.
1364 : */
1365 55390 : leftfree -= firstrightitemsz;
1366 :
1367 : /* account for the new item */
1368 55390 : if (newitemonleft)
1369 7508 : leftfree -= (int) state->newitemsz;
1370 : else
1371 47882 : rightfree -= (int) state->newitemsz;
1372 :
1373 : /*
1374 : * If we are not on the leaf level, we will be able to discard the key
1375 : * data from the first item that winds up on the right page.
1376 : */
1377 55390 : if (!state->is_leaf)
1378 103 : rightfree += (int) firstrightitemsz -
1379 : (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
1380 :
1381 : /*
1382 : * If feasible split point, remember best delta.
1383 : */
1384 55390 : if (leftfree >= 0 && rightfree >= 0)
1385 : {
1386 : int delta;
1387 :
1388 54891 : if (state->is_rightmost)
1389 : {
1390 : /*
1391 : * If splitting a rightmost page, try to put (100-fillfactor)% of
1392 : * free space on left page. See comments for _bt_findsplitloc.
1393 : */
1394 28422 : delta = (state->fillfactor * leftfree)
1395 : - ((100 - state->fillfactor) * rightfree);
1396 : }
1397 : else
1398 : {
1399 : /* Otherwise, aim for equal free space on both sides */
1400 26469 : delta = leftfree - rightfree;
1401 : }
1402 :
1403 54891 : if (delta < 0)
1404 1228 : delta = -delta;
1405 54891 : if (!state->have_split || delta < state->best_delta)
1406 : {
1407 53751 : state->have_split = true;
1408 53751 : state->newitemonleft = newitemonleft;
1409 53751 : state->firstright = firstoldonright;
1410 53751 : state->best_delta = delta;
1411 : }
1412 : }
1413 55390 : }
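
/*
 * A worked instance of the delta computation above (illustrative numbers,
 * not backend code): for a rightmost split with the default leaf
 * fillfactor of 90, the weighted formula treats "left page 90% full,
 * right page nearly empty" as the ideal, zero-delta outcome.
 */
#ifdef NBTINSERT_EXAMPLE
static int
example_split_delta(bool is_rightmost, int fillfactor,
					int leftfree, int rightfree)
{
	int			delta;

	if (is_rightmost)
		delta = (fillfactor * leftfree) -
			((100 - fillfactor) * rightfree);
	else
		delta = leftfree - rightfree;

	return (delta < 0) ? -delta : delta;
}

/*
 * For example, example_split_delta(true, 90, 800, 7200) == 0, because
 * 90 * 800 exactly balances (100 - 90) * 7200.
 */
#endif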
1414 :
1415 : /*
1416 : * _bt_insert_parent() -- Insert downlink into parent after a page split.
1417 : *
1418 : * On entry, buf and rbuf are the left and right split pages, which we
1419 : * still hold write locks on per the L&Y algorithm. We release the
1420 : * write locks once we have write lock on the parent page. (Any sooner,
1421 : * and it'd be possible for some other process to try to split or delete
1422 : * one of these pages, and get confused because it cannot find the downlink.)
1423 : *
1424 : * stack - stack showing how we got here. May be NULL in cases that don't
1425 : * have to be efficient (concurrent ROOT split, WAL recovery)
1426 : * is_root - we split the true root
1427 : * is_only - we split a page alone on its level (might have been fast root)
1428 : *
1429 : * This is exported so it can be called by nbtxlog.c.
1430 : */
1431 : void
1432 : _bt_insert_parent(Relation rel,
1433 : Buffer buf,
1434 : Buffer rbuf,
1435 : BTStack stack,
1436 : bool is_root,
1437 : bool is_only)
1438 383 : {
1439 : /*
1440 : * Here we have to do something Lehman and Yao don't talk about: deal with
1441 : * a root split and construction of a new root. If our stack is empty
1442 : * then we have just split a node on what had been the root level when we
1443 : * descended the tree. If it was still the root then we perform a
1444 : * new-root construction. If it *wasn't* the root anymore, search to find
1445 : * the next higher level that someone constructed meanwhile, and find the
1446 : * right place to insert as for the normal case.
1447 : *
1448 : * If we have to search for the parent level, we do so by re-descending
1449 : * from the root. This is not super-efficient, but it's rare enough not
1450 : * to matter. (This path is also taken when called from WAL recovery ---
1451 : * we have no stack in that case.)
1452 : */
1453 383 : if (is_root)
1454 : {
1455 : Buffer rootbuf;
1456 :
1457 : Assert(stack == NULL);
1458 : Assert(is_only);
1459 : /* create a new root node and update the metapage */
1460 14 : rootbuf = _bt_newroot(rel, buf, rbuf);
1461 : /* release the split buffers */
1462 14 : _bt_relbuf(rel, rootbuf);
1463 14 : _bt_relbuf(rel, rbuf);
1464 14 : _bt_relbuf(rel, buf);
1465 : }
1466 : else
1467 : {
1468 369 : BlockNumber bknum = BufferGetBlockNumber(buf);
1469 369 : BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1470 369 : Page page = BufferGetPage(buf);
1471 : IndexTuple new_item;
1472 : BTStackData fakestack;
1473 : IndexTuple ritem;
1474 : Buffer pbuf;
1475 :
1476 369 : if (stack == NULL)
1477 : {
1478 : BTPageOpaque lpageop;
1479 :
1480 0 : if (!InRecovery)
1481 0 : elog(DEBUG2, "concurrent ROOT page split");
1482 0 : lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1483 : /* Find the leftmost page at the next level up */
1484 0 : pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false);
1485 : /* Set up a phony stack entry pointing there */
1486 0 : stack = &fakestack;
1487 0 : stack->bts_blkno = BufferGetBlockNumber(pbuf);
1488 0 : stack->bts_offset = InvalidOffsetNumber;
1489 : /* bts_btentry will be initialized below */
1490 0 : stack->bts_parent = NULL;
1491 0 : _bt_relbuf(rel, pbuf);
1492 : }
1493 :
1494 : /* get high key from left page == lowest key on new right page */
1495 369 : ritem = (IndexTuple) PageGetItem(page,
1496 : PageGetItemId(page, P_HIKEY));
1497 :
1498 : /* form an index tuple that points at the new right page */
1499 369 : new_item = CopyIndexTuple(ritem);
1500 369 : ItemPointerSet(&(new_item->t_tid), rbknum, P_HIKEY);
1501 :
1502 : /*
1503 : * Find the parent buffer and get the parent page.
1504 : *
1505 :          * Oops - if we moved right then we need to change the stack
1506 :          * item! We want to find the parent pointing to where we are,
1507 :          * right? - vadim 05/27/97
1508 : */
1509 369 : ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);
1510 :
1511 369 : pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
1512 :
1513 : /* Now we can unlock the children */
1514 369 : _bt_relbuf(rel, rbuf);
1515 369 : _bt_relbuf(rel, buf);
1516 :
1517 : /* Check for error only after writing children */
1518 369 : if (pbuf == InvalidBuffer)
1519 0 : elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
1520 : RelationGetRelationName(rel), bknum, rbknum);
1521 :
1522 : /* Recursively update the parent */
1523 369 : _bt_insertonpg(rel, pbuf, stack->bts_parent,
1524 : new_item, stack->bts_offset + 1,
1525 : is_only);
1526 :
1527 : /* be tidy */
1528 369 : pfree(new_item);
1529 : }
1530 383 : }
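:
: /*
: * Illustrative call pattern (a sketch; the exact surrounding code in
: * _bt_insertonpg() is paraphrased here, and the variable names are
: * assumptions):
: *
: *		rbuf = _bt_split(rel, buf, firstright, newitemoff,
: *						 itemsz, itup, newitemonleft);
: *		_bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
: *
: * Each recursion level consumes one BTStackData entry saved during
: * the initial descent; when is_root is true, the recursion bottoms
: * out in _bt_newroot() rather than another _bt_insertonpg() call.
: */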
1531 :
1532 : /*
1533 : * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
1534 : * we last looked at in the parent.
1535 : *
1536 : * This is possible because we save the downlink from the parent item,
1537 : * which is enough to uniquely identify it. Insertions into the parent
1538 : * level could cause the item to move right; deletions could cause it
1539 : * to move left, but not left of the page we previously found it in.
1540 : *
1541 : * Adjusts bts_blkno & bts_offset if changed.
1542 : *
1543 : * Returns InvalidBuffer if item not found (should not happen).
1544 : */
1545 : Buffer
1546 : _bt_getstackbuf(Relation rel, BTStack stack, int access)
1547 373 : {
1548 : BlockNumber blkno;
1549 : OffsetNumber start;
1550 :
1551 373 : blkno = stack->bts_blkno;
1552 373 : start = stack->bts_offset;
1553 :
1554 : for (;;)
1555 : {
1556 : Buffer buf;
1557 : Page page;
1558 : BTPageOpaque opaque;
1559 :
1560 373 : buf = _bt_getbuf(rel, blkno, access);
1561 373 : page = BufferGetPage(buf);
1562 373 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1563 :
1564 373 : if (!P_IGNORE(opaque))
1565 : {
1566 : OffsetNumber offnum,
1567 : minoff,
1568 : maxoff;
1569 : ItemId itemid;
1570 : IndexTuple item;
1571 :
1572 373 : minoff = P_FIRSTDATAKEY(opaque);
1573 373 : maxoff = PageGetMaxOffsetNumber(page);
1574 :
1575 : /*
1576 : * start = InvalidOffsetNumber means "search the whole page". We
1577 : * need this test anyway, because the page may have acquired a
1578 : * high key since we last looked at it.
1579 : */
1580 373 : if (start < minoff)
1581 0 : start = minoff;
1582 :
1583 : /*
1584 : * We need this check too, to guard against the possibility that
1585 : * the page has split since we originally visited it.
1586 : */
1587 373 : if (start > maxoff)
1588 0 : start = OffsetNumberNext(maxoff);
1589 :
1590 : /*
1591 : * These loops will check every item on the page --- but in an
1592 : * order that's attuned to the probability of where it actually
1593 : * is. Scan to the right first, then to the left.
1594 : */
1595 373 : for (offnum = start;
1596 746 : offnum <= maxoff;
1597 0 : offnum = OffsetNumberNext(offnum))
1598 : {
1599 373 : itemid = PageGetItemId(page, offnum);
1600 373 : item = (IndexTuple) PageGetItem(page, itemid);
1601 373 : if (BTEntrySame(item, &stack->bts_btentry))
1602 : {
1603 : /* Return accurate pointer to where link is now */
1604 373 : stack->bts_blkno = blkno;
1605 373 : stack->bts_offset = offnum;
1606 373 : return buf;
1607 : }
1608 : }
1609 :
1610 0 : for (offnum = OffsetNumberPrev(start);
1611 0 : offnum >= minoff;
1612 0 : offnum = OffsetNumberPrev(offnum))
1613 : {
1614 0 : itemid = PageGetItemId(page, offnum);
1615 0 : item = (IndexTuple) PageGetItem(page, itemid);
1616 0 : if (BTEntrySame(item, &stack->bts_btentry))
1617 : {
1618 : /* Return accurate pointer to where link is now */
1619 0 : stack->bts_blkno = blkno;
1620 0 : stack->bts_offset = offnum;
1621 0 : return buf;
1622 : }
1623 : }
1624 : }
1625 :
1626 : /*
1627 : * The item we're looking for has moved right by at least one page.
1628 : */
1629 0 : if (P_RIGHTMOST(opaque))
1630 : {
1631 0 : _bt_relbuf(rel, buf);
1632 0 : return InvalidBuffer;
1633 : }
1634 0 : blkno = opaque->btpo_next;
1635 0 : start = InvalidOffsetNumber;
1636 0 : _bt_relbuf(rel, buf);
1637 0 : }
1638 : }
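:
: /*
: * For reference (a sketch, assuming the nbtree.h macros of this era):
: * BTEntrySame() matches items purely by their downlink TIDs, roughly
: *
: *		ItemPointerEquals(&item->t_tid, &stack->bts_btentry.t_tid)
: *
: * so the loops above re-find the downlink by the child block number
: * it stores, not by its index key, since keys need not be unique.
: */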
1639 :
1640 : /*
1641 : * _bt_newroot() -- Create a new root page for the index.
1642 : *
1643 : * We've just split the old root page and need to create a new one.
1644 : * In order to do this, we add a new root page to the file, then lock
1645 : * the metadata page and update it. This is guaranteed to be deadlock-
1646 : * free, because all readers release their locks on the metadata page
1647 : * before trying to lock the root, and all writers lock the root before
1648 : * trying to lock the metadata page. We have a write lock on the old
1649 : * root page, so we have not introduced any cycles into the waits-for
1650 : * graph.
1651 : *
1652 : * On entry, lbuf (the old root) and rbuf (its new right sibling) are
1653 : * write-locked. On exit, a new root page exists with entries for the
1654 : * two new children; the metapage is updated and unlocked/unpinned.
1655 : * The new root buffer is returned to the caller, which must unlock/unpin
1656 : * lbuf, rbuf, and rootbuf.
1657 : */
1658 : static Buffer
1659 : _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
1660 14 : {
1661 : Buffer rootbuf;
1662 : Page lpage,
1663 : rootpage;
1664 : BlockNumber lbkno,
1665 : rbkno;
1666 : BlockNumber rootblknum;
1667 : BTPageOpaque rootopaque;
1668 : ItemId itemid;
1669 : IndexTuple item;
1670 : Size itemsz;
1671 : IndexTuple new_item;
1672 : Buffer metabuf;
1673 : Page metapg;
1674 : BTMetaPageData *metad;
1675 :
1676 14 : lbkno = BufferGetBlockNumber(lbuf);
1677 14 : rbkno = BufferGetBlockNumber(rbuf);
1678 14 : lpage = BufferGetPage(lbuf);
1679 :
1680 : /* get a new root page */
1681 14 : rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1682 14 : rootpage = BufferGetPage(rootbuf);
1683 14 : rootblknum = BufferGetBlockNumber(rootbuf);
1684 :
1685 : /* acquire lock on the metapage */
1686 14 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1687 14 : metapg = BufferGetPage(metabuf);
1688 14 : metad = BTPageGetMeta(metapg);
1689 :
1690 : /* NO EREPORT(ERROR) from here till newroot op is logged */
1691 14 : START_CRIT_SECTION();
1692 :
1693 : /* set btree special data */
1694 14 : rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
1695 14 : rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
1696 14 : rootopaque->btpo_flags = BTP_ROOT;
1697 14 : rootopaque->btpo.level =
1698 : ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
1699 14 : rootopaque->btpo_cycleid = 0;
1700 :
1701 : /* update metapage data */
1702 14 : metad->btm_root = rootblknum;
1703 14 : metad->btm_level = rootopaque->btpo.level;
1704 14 : metad->btm_fastroot = rootblknum;
1705 14 : metad->btm_fastlevel = rootopaque->btpo.level;
1706 :
1707 : /*
1708 : * Create downlink item for left page (old root). Since this will be the
1709 : * first item in a non-leaf page, it implicitly has minus-infinity key
1710 : * value, so we need not store any actual key in it.
1711 : */
1712 14 : itemsz = sizeof(IndexTupleData);
1713 14 : new_item = (IndexTuple) palloc(itemsz);
1714 14 : new_item->t_info = itemsz;
1715 14 : ItemPointerSet(&(new_item->t_tid), lbkno, P_HIKEY);
1716 :
1717 : /*
1718 : * Insert the left page pointer into the new root page. The root page is
1719 : * the rightmost page on its level so there is no "high key" in it; the
1720 : * two items will go into positions P_HIKEY and P_FIRSTKEY.
1721 : *
1722 : * Note: we *must* insert the two items in item-number order, for the
1723 : * benefit of _bt_restore_page().
1724 : */
1725 14 : if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
1726 : false, false) == InvalidOffsetNumber)
1727 0 : elog(PANIC, "failed to add leftkey to new root page"
1728 : " while splitting block %u of index \"%s\"",
1729 : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
1730 14 : pfree(new_item);
1731 :
1732 : /*
1733 : * Create downlink item for right page. The key for it is obtained from
1734 : * the "high key" position in the left page.
1735 : */
1736 14 : itemid = PageGetItemId(lpage, P_HIKEY);
1737 14 : itemsz = ItemIdGetLength(itemid);
1738 14 : item = (IndexTuple) PageGetItem(lpage, itemid);
1739 14 : new_item = CopyIndexTuple(item);
1740 14 : ItemPointerSet(&(new_item->t_tid), rbkno, P_HIKEY);
1741 :
1742 : /*
1743 : * insert the right page pointer into the new root page.
1744 : */
1745 14 : if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
1746 : false, false) == InvalidOffsetNumber)
1747 0 : elog(PANIC, "failed to add rightkey to new root page"
1748 : " while splitting block %u of index \"%s\"",
1749 : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
1750 14 : pfree(new_item);
1751 :
1752 14 : MarkBufferDirty(rootbuf);
1753 14 : MarkBufferDirty(metabuf);
1754 :
1755 : /* XLOG stuff */
1756 14 : if (!rel->rd_istemp)
1757 : {
1758 : xl_btree_newroot xlrec;
1759 : XLogRecPtr recptr;
1760 : XLogRecData rdata[2];
1761 :
1762 14 : xlrec.node = rel->rd_node;
1763 14 : xlrec.rootblk = rootblknum;
1764 14 : xlrec.level = metad->btm_level;
1765 :
1766 14 : rdata[0].data = (char *) &xlrec;
1767 14 : rdata[0].len = SizeOfBtreeNewroot;
1768 14 : rdata[0].buffer = InvalidBuffer;
1769 14 : rdata[0].next = &(rdata[1]);
1770 :
1771 : /*
1772 : * Direct access to the page is not clean, but it's faster; ideally
1773 : * the page API would provide a function for this.
1774 : */
1775 14 : rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper;
1776 14 : rdata[1].len = ((PageHeader) rootpage)->pd_special -
1777 : ((PageHeader) rootpage)->pd_upper;
1778 14 : rdata[1].buffer = InvalidBuffer;
1779 14 : rdata[1].next = NULL;
1780 :
1781 14 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata);
1782 :
1783 14 : PageSetLSN(rootpage, recptr);
1784 14 : PageSetTLI(rootpage, ThisTimeLineID);
1785 14 : PageSetLSN(metapg, recptr);
1786 14 : PageSetTLI(metapg, ThisTimeLineID);
1787 : }
1788 :
1789 14 : END_CRIT_SECTION();
1790 :
1791 : /* send out relcache inval for metapage change */
1792 14 : CacheInvalidateRelcache(rel);
1793 :
1794 : /* done with metapage */
1795 14 : _bt_relbuf(rel, metabuf);
1796 :
1797 14 : return rootbuf;
1798 : }
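:
: /*
: * Resulting layout (illustrative summary of the code above):
: *
: *		new root page (rightmost on its level, hence no high key)
: *		  P_HIKEY	 -> downlink to lbuf, key-less ("minus infinity")
: *		  P_FIRSTKEY -> downlink to rbuf, keyed by lbuf's old high key
: *
: * A descent through the new root therefore compares the search key
: * against lbuf's old high key to choose between the two children.
: */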
1799 :
1800 : /*
1801 : * _bt_pgaddtup() -- add a tuple to a particular page in the index.
1802 : *
1803 : * This routine adds the tuple to the page as requested. It does
1804 : * not affect pin/lock status, but you'd better have a write lock
1805 : * and pin on the target buffer! Don't forget to write and release
1806 : * the buffer afterwards, either.
1807 : *
1808 : * The main difference between this routine and a bare PageAddItem call
1809 : * is that this code knows that the leftmost index tuple on a non-leaf
1810 : * btree page doesn't need to have a key. Therefore, it strips such
1811 : * tuples down to just the tuple header. CAUTION: this works ONLY if
1812 : * we insert the tuples in order, so that the given itup_off does
1813 : * represent the final position of the tuple!
1814 : */
1815 : static void
1816 : _bt_pgaddtup(Relation rel,
1817 : Page page,
1818 : Size itemsize,
1819 : IndexTuple itup,
1820 : OffsetNumber itup_off,
1821 : const char *where)
1822 156935 : {
1823 156935 : BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1824 : IndexTupleData trunctuple;
1825 :
1826 156935 : if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
1827 : {
1828 2 : trunctuple = *itup;
1829 2 : trunctuple.t_info = sizeof(IndexTupleData);
1830 2 : itup = &trunctuple;
1831 2 : itemsize = sizeof(IndexTupleData);
1832 : }
1833 :
1834 156935 : if (PageAddItem(page, (Item) itup, itemsize, itup_off,
1835 : false, false) == InvalidOffsetNumber)
1836 0 : elog(PANIC, "failed to add item to the %s in index \"%s\"",
1837 : where, RelationGetRelationName(rel));
1838 156935 : }
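:
: /*
: * Worked example (hypothetical block number): inserting the first
: * data item on a non-leaf page, say a downlink to block 42,
: *
: *		{ t_tid = (42, P_HIKEY), t_info = header size + key bytes }
: *
: * is stored truncated to
: *
: *		{ t_tid = (42, P_HIKEY), t_info = sizeof(IndexTupleData) }
: *
: * The key bytes are dropped because the leftmost downlink acts as
: * "minus infinity" and its key is never examined.
: */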
1839 :
1840 : /*
1841 : * _bt_isequal - used in _bt_doinsert in check for duplicates.
1842 : *
1843 : * This is very similar to _bt_compare, except for NULL handling.
1844 : * The rule is simple: NOT_NULL never equals NULL, and NULL never equals NULL either.
1845 : */
1846 : static bool
1847 : _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
1848 : int keysz, ScanKey scankey)
1849 20337 : {
1850 : IndexTuple itup;
1851 : int i;
1852 :
1853 : /* Better be comparing to a leaf item */
1854 : Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
1855 :
1856 20337 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1857 :
1858 34551 : for (i = 1; i <= keysz; i++)
1859 : {
1860 : AttrNumber attno;
1861 : Datum datum;
1862 : bool isNull;
1863 : int32 result;
1864 :
1865 33248 : attno = scankey->sk_attno;
1866 : Assert(attno == i);
1867 33248 : datum = index_getattr(itup, attno, itupdesc, &isNull);
1868 :
1869 : /* NULLs are never equal to anything */
1870 33248 : if (isNull || (scankey->sk_flags & SK_ISNULL))
1871 13 : return false;
1872 :
1873 33235 : result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
1874 : datum,
1875 : scankey->sk_argument));
1876 :
1877 33235 : if (result != 0)
1878 19021 : return false;
1879 :
1880 14214 : scankey++;
1881 : }
1882 :
1883 : /* if we get here, the keys are equal */
1884 1303 : return true;
1885 : }
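:
: /*
: * Example (hypothetical two-column unique index):
: *
: *		(1, 'foo') vs (1, 'foo')  -> true, a potential duplicate
: *		(1, 'foo') vs (1, 'bar')  -> false
: *		(1, NULL)  vs (1, NULL)   -> false, NULL equals nothing
: *
: * This matches SQL semantics: a unique constraint never treats two
: * NULLs as conflicting.
: */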
1886 :
1887 : /*
1888 : * _bt_vacuum_one_page - vacuum just one index page.
1889 : *
1890 : * Try to remove LP_DEAD items from the given page. The passed buffer
1891 : * must be exclusive-locked, but unlike a real VACUUM, we don't need a
1892 : * super-exclusive "cleanup" lock (see nbtree/README).
1893 : */
1894 : static void
1895 : _bt_vacuum_one_page(Relation rel, Buffer buffer)
1896 106 : {
1897 : OffsetNumber deletable[MaxOffsetNumber];
1898 106 : int ndeletable = 0;
1899 : OffsetNumber offnum,
1900 : minoff,
1901 : maxoff;
1902 106 : Page page = BufferGetPage(buffer);
1903 106 : BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1904 :
1905 : /*
1906 : * Scan over all items to see which ones need to be deleted according to
1907 : * LP_DEAD flags.
1908 : */
1909 106 : minoff = P_FIRSTDATAKEY(opaque);
1910 106 : maxoff = PageGetMaxOffsetNumber(page);
1911 106 : for (offnum = minoff;
1912 23832 : offnum <= maxoff;
1913 23620 : offnum = OffsetNumberNext(offnum))
1914 : {
1915 23620 : ItemId itemId = PageGetItemId(page, offnum);
1916 :
1917 23620 : if (ItemIdIsDead(itemId))
1918 796 : deletable[ndeletable++] = offnum;
1919 : }
1920 :
1921 106 : if (ndeletable > 0)
1922 106 : _bt_delitems(rel, buffer, deletable, ndeletable);
1923 :
1924 : /*
1925 : * Note: if we didn't find any LP_DEAD items, then the page's
1926 : * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
1927 : * separate write to clear it, however. We will clear it when we split
1928 : * the page.
1929 : */
1930 106 : }
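:
: /*
: * Usage sketch (paraphrased from the call site in _bt_findinsertloc
: * above; the variable names are assumptions): when the target leaf
: * page lacks room but carries the BTP_HAS_GARBAGE hint, we try this
: * before resorting to a page split:
: *
: *		if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
: *			_bt_vacuum_one_page(rel, buf);
: *
: * and only split if the page is still too full afterwards.
: */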