LTP GCOV extension - code coverage report
Current view: directory - access/nbtree - nbtinsert.c
Test: unnamed
Date: 2008-07-03 Instrumented lines: 576
Code covered: 86.5 % Executed lines: 498
Legend: not executed executed

       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtinsert.c
       4                 :  *        Item insertion in Lehman and Yao btrees for Postgres.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *        $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.163 2007/12/31 04:52:05 tgl Exp $
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : 
      16                 : #include "postgres.h"
      17                 : 
      18                 : #include "access/heapam.h"
      19                 : #include "access/nbtree.h"
      20                 : #include "access/transam.h"
      21                 : #include "miscadmin.h"
      22                 : #include "utils/inval.h"
      23                 : 
      24                 : 
      25                 : typedef struct
      26                 : {
      27                 :         /* working state handed to _bt_checksplitloc while judging split points */
      28                 :         Size            newitemsz;              /* size of new item to be inserted */
      29                 :         int                     fillfactor;             /* needed when splitting rightmost page */
      30                 :         bool            is_leaf;                /* true if splitting a leaf page */
      31                 :         bool            is_rightmost;   /* true if splitting a rightmost page */
      32                 :         OffsetNumber newitemoff;        /* where the new item is to be inserted */
      33                 :         int                     leftspace;              /* space available for items on left page */
      34                 :         int                     rightspace;             /* space available for items on right page */
      35                 :         int                     olddataitemstotal;              /* space taken by old items */
      36                 : 
      37                 :         bool            have_split;             /* found a valid split? */
      38                 : 
      39                 :         /* these fields valid only if have_split is true */
      40                 :         bool            newitemonleft;  /* new item on left or right of best split */
      41                 :         OffsetNumber firstright;        /* best split point */
      42                 :         int                     best_delta;             /* best size delta so far */
      43                 : } FindSplitData;
      44                 : 
      45                 : /* forward declarations of the file-local (static) routines */
      46                 : static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
      47                 : 
      48                 : static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
      49                 :                                  Relation heapRel, Buffer buf, OffsetNumber ioffset,
      50                 :                                  ScanKey itup_scankey);
      51                 : static void _bt_findinsertloc(Relation rel,
      52                 :                                   Buffer *bufptr,
      53                 :                                   OffsetNumber *offsetptr,
      54                 :                                   int keysz,
      55                 :                                   ScanKey scankey,
      56                 :                                   IndexTuple newtup);
      57                 : static void _bt_insertonpg(Relation rel, Buffer buf,
      58                 :                            BTStack stack,
      59                 :                            IndexTuple itup,
      60                 :                            OffsetNumber newitemoff,
      61                 :                            bool split_only_page);
      62                 : static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
      63                 :                   OffsetNumber newitemoff, Size newitemsz,
      64                 :                   IndexTuple newitem, bool newitemonleft);
      65                 : static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
      66                 :                                  OffsetNumber newitemoff,
      67                 :                                  Size newitemsz,
      68                 :                                  bool *newitemonleft);
      69                 : static void _bt_checksplitloc(FindSplitData *state,
      70                 :                                   OffsetNumber firstoldonright, bool newitemonleft,
      71                 :                                   int dataitemstoleft, Size firstoldonrightsz);
      72                 : static void _bt_pgaddtup(Relation rel, Page page,
      73                 :                          Size itemsize, IndexTuple itup,
      74                 :                          OffsetNumber itup_off, const char *where);
      75                 : static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
      76                 :                         int keysz, ScanKey scankey);
      77                 : static void _bt_vacuum_one_page(Relation rel, Buffer buffer);
      78                 : 
      79                 : 
      80                 : /*
      81                 :  *      _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
      82                 :  *              Called by the public interface routines btbuild and btinsert; by
      83                 :  *              then itup is filled in, including the TID.  rel is the index and
      84                 :  *              heapRel its heap; index_is_unique requests a uniqueness check first.
      85                 :  */
      86                 : void
      87                 : _bt_doinsert(Relation rel, IndexTuple itup,
      88                 :                          bool index_is_unique, Relation heapRel)
      89           70109 : {
      90           70109 :         int                     natts = rel->rd_rel->relnatts;
      91                 :         ScanKey         itup_scankey;
      92                 :         BTStack         stack;
      93                 :         Buffer          buf;
      94                 :         OffsetNumber offset;
      95                 : 
      96                 :         /* we need an insertion scan key to do our search, so build one */
      97           70109 :         itup_scankey = _bt_mkscankey(rel, itup);
      98                 :         /* restart point: we return here after waiting out a conflicting xact */
      99           70109 : top:
     100                 :         /* find the first page containing this key */
     101           70109 :         stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
     102                 :         /* InvalidOffsetNumber lets _bt_findinsertloc locate the position itself */
     103           70109 :         offset = InvalidOffsetNumber;
     104                 :         /* NOTE(review): _bt_search got BT_WRITE, yet buf is treated as read-locked below; confirm */
     105                 :         /* trade in our read lock for a write lock */
     106           70109 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     107           70109 :         LockBuffer(buf, BT_WRITE);
     108                 : 
     109                 :         /*
     110                 :          * If the page was split between the time that we surrendered our read
     111                 :          * lock and acquired our write lock, then this page may no longer be the
     112                 :          * right place for the key we want to insert.  In this case, we need to
     113                 :          * move right in the tree.      See Lehman and Yao for an excruciatingly
     114                 :          * precise description.
     115                 :          */
     116           70109 :         buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
     117                 : 
     118                 :         /*
     119                 :          * If we're not allowing duplicates, make sure the key isn't already in
     120                 :          * the index.
     121                 :          *
     122                 :          * NOTE: obviously, _bt_check_unique can only detect keys that are already
     123                 :          * in the index; so it cannot defend against concurrent insertions of the
     124                 :          * same key.  We protect against that by means of holding a write lock on
     125                 :          * the target page.  Any other would-be inserter of the same key must
     126                 :          * acquire a write lock on the same target page, so only one would-be
     127                 :          * inserter can be making the check at one time.  Furthermore, once we are
     128                 :          * past the check we hold write locks continuously until we have performed
     129                 :          * our insertion, so no later inserter can fail to see our insertion.
     130                 :          * (This requires some care in _bt_insertonpg.)
     131                 :          *
     132                 :          * If we must wait for another xact, we release the lock while waiting,
     133                 :          * and then must start over completely.
     134                 :          */
     135           70109 :         if (index_is_unique)
     136                 :         {
     137                 :                 TransactionId xwait;
     138                 : 
     139           34237 :                 offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
     140           34237 :                 xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
     141                 : 
     142           34215 :                 if (TransactionIdIsValid(xwait))
     143                 :                 {
     144                 :                         /* Have to wait for the other guy ... */
     145               0 :                         _bt_relbuf(rel, buf);
     146               0 :                         XactLockTableWait(xwait);
     147                 :                         /* start over... */
     148               0 :                         _bt_freestack(stack);
     149               0 :                         goto top;
     150                 :                 }
     151                 :         }
     152                 : 
     153                 :         /* do the insertion */
     154           70087 :         _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
     155           70087 :         _bt_insertonpg(rel, buf, stack, itup, offset, false);
     156                 : 
     157                 :         /* be tidy */
     158           70087 :         _bt_freestack(stack);
     159           70087 :         _bt_freeskey(itup_scankey);
     160           70087 : }
     161                 : 
     162                 : /*
     163                 :  *      _bt_check_unique() -- Check for violation of unique index constraint
     164                 :  *
     165                 :  * offset points to the first possible item that could conflict. It can
     166                 :  * also point to end-of-page, which means that the first tuple to check
     167                 :  * is the first tuple on the next page.
     168                 :  *
     169                 :  * Returns InvalidTransactionId if there is no conflict, else an xact ID
     170                 :  * we must wait for to see if it commits a conflicting tuple.   If an actual
     171                 :  * conflict is detected, no return --- just ereport().
     172                 :  */
     173                 : static TransactionId
     174                 : _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
     175                 :                                  Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
     176           34237 : {
     177           34237 :         TupleDesc       itupdesc = RelationGetDescr(rel);
     178           34237 :         int                     natts = rel->rd_rel->relnatts;
     179                 :         SnapshotData SnapshotDirty;
     180                 :         OffsetNumber maxoff;
     181                 :         Page            page;
     182                 :         BTPageOpaque opaque;
     183           34237 :         Buffer          nbuf = InvalidBuffer;
     184                 : 
     185           34237 :         InitDirtySnapshot(SnapshotDirty);
     186                 : 
     187           34237 :         page = BufferGetPage(buf);
     188           34237 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     189           34237 :         maxoff = PageGetMaxOffsetNumber(page);
     190                 : 
     191                 :         /*
     192                 :          * Scan over all equal tuples, looking for live conflicts.
     193                 :          */
     194                 :         for (;;)
     195                 :         {
     196                 :                 ItemId          curitemid;
     197                 :                 IndexTuple      curitup;
     198                 :                 BlockNumber nblkno;
     199                 : 
     200                 :                 /*
     201                 :                  * make sure the offset points to an actual item before trying to
     202                 :                  * examine it...
     203                 :                  */
     204           38550 :                 if (offset <= maxoff)
     205                 :                 {
     206           23020 :                         curitemid = PageGetItemId(page, offset);
     207                 : 
     208                 :                         /*
     209                 :                          * We can skip items that are marked killed.
     210                 :                          *
     211                 :                          * Formerly, we applied _bt_isequal() before checking the kill
     212                 :                          * flag, so as to fall out of the item loop as soon as possible.
     213                 :                          * However, in the presence of heavy update activity an index may
     214                 :                          * contain many killed items with the same key; running
     215                 :                          * _bt_isequal() on each killed item gets expensive. Furthermore
     216                 :                          * it is likely that the non-killed version of each key appears
     217                 :                          * first, so that we didn't actually get to exit any sooner
     218                 :                          * anyway. So now we just advance over killed items as quickly as
     219                 :                          * we can. We only apply _bt_isequal() when we get to a non-killed
     220                 :                          * item or the end of the page.
     221                 :                          */
     222           23020 :                         if (!ItemIdIsDead(curitemid))
     223                 :                         {
     224                 :                                 ItemPointerData htid;
     225                 :                                 bool            all_dead;
     226                 : 
     227                 :                                 /*
     228                 :                                  * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this is
     229                 :                                  * how we handle NULLs - and so we must not use _bt_compare
     230                 :                                  * for real equality tests, but only for ordering/finding items
     231                 :                                  * on pages. - vadim 03/24/97
     232                 :                                  */
     233           19945 :                                 if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
     234           18644 :                                         break;          /* we're past all the equal tuples */
     235                 : 
     236                 :                                 /* okay, we gotta fetch the heap tuple ... */
     237            1301 :                                 curitup = (IndexTuple) PageGetItem(page, curitemid);
     238            1301 :                                 htid = curitup->t_tid;
     239                 : 
     240                 :                                 /*
     241                 :                                  * We check the whole HOT-chain to see if there is any tuple
     242                 :                                  * that satisfies SnapshotDirty.  This is necessary because we
     243                 :                                  * have just a single index entry for the entire chain.
     244                 :                                  */
     245            1301 :                                 if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead))
     246                 :                                 {
     247                 :                                         /* it is a duplicate */
     248                 :                                         TransactionId xwait =
     249                 :                                         (TransactionIdIsValid(SnapshotDirty.xmin)) ?
     250              22 :                                         SnapshotDirty.xmin : SnapshotDirty.xmax;
     251                 : 
     252                 :                                         /*
     253                 :                                          * If this tuple is being updated by other transaction
     254                 :                                          * then we have to wait for its commit/abort.
     255                 :                                          */
     256              22 :                                         if (TransactionIdIsValid(xwait))
     257                 :                                         {
     258               0 :                                                 if (nbuf != InvalidBuffer)
     259               0 :                                                         _bt_relbuf(rel, nbuf);
     260                 :                                                 /* Tell _bt_doinsert to wait... */
     261               0 :                                                 return xwait;
     262                 :                                         }
     263                 : 
     264                 :                                         /*
     265                 :                                          * Otherwise we have a definite conflict.  But before
     266                 :                                          * complaining, look to see if the tuple we want to insert
     267                 :                                          * is itself now committed dead --- if so, don't complain.
     268                 :                                          * This is a waste of time in normal scenarios but we must
     269                 :                                          * do it to support CREATE INDEX CONCURRENTLY.
     270                 :                                          *
     271                 :                                          * We must follow HOT-chains here because during
     272                 :                                          * concurrent index build, we insert the root TID though
     273                 :                                          * the actual tuple may be somewhere in the HOT-chain.
     274                 :                                          * While following the chain we might not stop at the
     275                 :                                          * exact tuple which triggered the insert, but that's OK
     276                 :                                          * because if we find a live tuple anywhere in this chain,
     277                 :                                          * we have a unique key conflict.  The other live tuple is
     278                 :                                          * not part of this chain because it had a different index
     279                 :                                          * entry.
     280                 :                                          */
     281              22 :                                         htid = itup->t_tid;
     282              22 :                                         if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
     283                 :                                         {
     284                 :                                                 /* Normal case --- it's still live */
     285                 :                                         }
     286                 :                                         else
     287                 :                                         {
     288                 :                                                 /*
     289                 :                                                  * It's been deleted, so no error, and no need to
     290                 :                                                  * continue searching
     291                 :                                                  */
     292               0 :                                                 break;
     293                 :                                         }
     294                 : 
     295              22 :                                         ereport(ERROR,
     296                 :                                                         (errcode(ERRCODE_UNIQUE_VIOLATION),
     297                 :                                                          errmsg("duplicate key value violates unique constraint \"%s\"",
     298                 :                                                                         RelationGetRelationName(rel))));
     299                 :                                 }
     300            1279 :                                 else if (all_dead)
     301                 :                                 {
     302                 :                                         /*
     303                 :                                          * The conflicting tuple (or whole HOT chain) is dead to
     304                 :                                          * everyone, so we may as well mark the index entry
     305                 :                                          * killed.
     306                 :                                          */
     307              32 :                                         ItemIdMarkDead(curitemid);
     308              32 :                                         opaque->btpo_flags |= BTP_HAS_GARBAGE;
     309                 :                                         /* be sure to mark the proper buffer dirty... */
     310              32 :                                         if (nbuf != InvalidBuffer)
     311               0 :                                                 SetBufferCommitInfoNeedsSave(nbuf);
     312                 :                                         else
     313              32 :                                                 SetBufferCommitInfoNeedsSave(buf);
     314                 :                                 }
     315                 :                         }
     316                 :                 }
     317                 : 
     318                 :                 /*
     319                 :                  * Advance to next tuple to continue checking.
     320                 :                  */
     321           19884 :                 if (offset < maxoff)
     322            4311 :                         offset = OffsetNumberNext(offset);
     323                 :                 else
     324                 :                 {
     325                 :                         /* If scankey == hikey we gotta check the next page too */
     326           15573 :                         if (P_RIGHTMOST(opaque))
     327           15181 :                                 break;
     328             392 :                         if (!_bt_isequal(itupdesc, page, P_HIKEY,
     329                 :                                                          natts, itup_scankey))
     330             390 :                                 break;
     331                 :                         /* Advance to next non-dead page --- there must be one */
     332                 :                         for (;;)
     333                 :                         {
     334               2 :                                 nblkno = opaque->btpo_next;
     335               2 :                                 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
     336               2 :                                 page = BufferGetPage(nbuf);
     337               2 :                                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     338               2 :                                 if (!P_IGNORE(opaque))
     339               2 :                                         break;
     340               0 :                                 if (P_RIGHTMOST(opaque))
     341               0 :                                         elog(ERROR, "fell off the end of index \"%s\"",
     342                 :                                                  RelationGetRelationName(rel));
     343                 :                         }
     344               2 :                         maxoff = PageGetMaxOffsetNumber(page);
     345               2 :                         offset = P_FIRSTDATAKEY(opaque);
     346                 :                 }
     347                 :         }
     348                 :         /* release the right-sibling page, if the scan moved onto one */
     349           34215 :         if (nbuf != InvalidBuffer)
     350               2 :                 _bt_relbuf(rel, nbuf);
     351                 : 
     352           34215 :         return InvalidTransactionId;
     353                 : }
     354                 : 
     355                 : 
     356                 : /*
     357                 :  *      _bt_findinsertloc() -- Finds an insert location for a tuple
     358                 :  *
     359                 :  *              If the new key is equal to one or more existing keys, we can
     360                 :  *              legitimately place it anywhere in the series of equal keys --- in fact,
     361                 :  *              if the new key is equal to the page's "high key" we can place it on
     362                 :  *              the next page.  If it is equal to the high key, and there's not room
     363                 :  *              to insert the new tuple on the current page without splitting, then
     364                 :  *              we can move right hoping to find more free space and avoid a split.
     365                 :  *              (We should not move right indefinitely, however, since that leads to
     366                 :  *              O(N^2) insertion behavior in the presence of many equal keys.)
     367                 :  *              Once we have chosen the page to put the key on, we'll insert it before
     368                 :  *              any existing equal keys because of the way _bt_binsrch() works.
     369                 :  *
     370                 :  *              If there's not enough room on the page, we try to make room by
     371                 :  *              removing any LP_DEAD tuples.
     372                 :  *
     373                 :  *              On entry, *buf and *offsetptr point to the first legal position
     374                 :  *              where the new tuple could be inserted.  The caller should hold an
     375                 :  *              exclusive lock on *buf.  *offsetptr can also be set to
     376                 :  *              InvalidOffsetNumber, in which case the function will search for the
     377                 :  *              right location within the page if needed.  On exit, they point to the
     378                 :  *              chosen insert location.  If _bt_findinsertloc decides to move right,
     379                 :  *              the lock and pin on the original page will be released and the new
     380                 :  *              page returned to the caller is exclusively locked instead.
     381                 :  *
     382                 :  *              newtup is the new tuple we're inserting, and scankey is an insertion
     383                 :  *              type scan key for it.
     384                 :  */
     385                 : static void
     386                 : _bt_findinsertloc(Relation rel,
     387                 :                                   Buffer *bufptr,
     388                 :                                   OffsetNumber *offsetptr,
     389                 :                                   int keysz,
     390                 :                                   ScanKey scankey,
     391                 :                                   IndexTuple newtup)
     392           70087 : {
     393           70087 :         Buffer          buf = *bufptr;
     394           70087 :         Page            page = BufferGetPage(buf);
     395                 :         Size            itemsz;
     396                 :         BTPageOpaque lpageop;
     397                 :         bool            movedright,
     398                 :                                 vacuumed;
     399                 :         OffsetNumber newitemoff;
     400           70087 :         OffsetNumber firstlegaloff = *offsetptr;
     401                 : 
     402           70087 :         lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
     403                 : 
     404           70087 :         itemsz = IndexTupleDSize(*newtup);
     405           70087 :         itemsz = MAXALIGN(itemsz);      /* be safe, PageAddItem will do this but we
     406                 :                                                                  * need to be consistent */
     407                 : 
     408                 :         /*
     409                 :          * Check whether the item can fit on a btree page at all. (Eventually, we
     410                 :          * ought to try to apply TOAST methods if not.) We actually need to be
     411                 :          * able to fit three items on every page, so restrict any one item to 1/3
     412                 :          * the per-page available space. Note that at this point, itemsz doesn't
     413                 :          * include the ItemId.
     414                 :          */
     415           70087 :         if (itemsz > BTMaxItemSize(page))
     416               0 :                 ereport(ERROR,
     417                 :                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     418                 :                                  errmsg("index row size %lu exceeds btree maximum, %lu",
     419                 :                                                 (unsigned long) itemsz,
     420                 :                                                 (unsigned long) BTMaxItemSize(page)),
     421                 :                 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
     422                 :                                 "Consider a function index of an MD5 hash of the value, "
     423                 :                                 "or use full text indexing.")));
     424                 : 
     425                 :         /*----------
     426                 :          * If we will need to split the page to put the item on this page,
     427                 :          * check whether we can put the tuple somewhere to the right,
     428                 :          * instead.  Keep scanning right until we
     429                 :          *              (a) find a page with enough free space,
     430                 :          *              (b) reach the last page where the tuple can legally go, or
     431                 :          *              (c) get tired of searching.
     432                 :          * (c) is not flippant; it is important because if there are many
     433                 :          * pages' worth of equal keys, it's better to split one of the early
     434                 :          * pages than to scan all the way to the end of the run of equal keys
     435                 :          * on every insert.  We implement "get tired" as a random choice,
     436                 :          * since stopping after scanning a fixed number of pages wouldn't work
     437                 :          * well (we'd never reach the right-hand side of previously split
     438                 :          * pages).      Currently the probability of moving right is set at 0.99,
     439                 :          * which may seem too high to change the behavior much, but it does an
     440                 :          * excellent job of preventing O(N^2) behavior with many equal keys.
     441                 :          *----------
     442                 :          */
     443           70087 :         movedright = false;
     444           70087 :         vacuumed = false;
     445          141323 :         while (PageGetFreeSpace(page) < itemsz)
     446                 :         {
     447                 :                 Buffer          rbuf;
     448                 : 
     449                 :                 /*
     450                 :                  * before considering moving right, see if we can obtain enough space
     451                 :                  * by erasing LP_DEAD items
     452                 :                  */
     453            1637 :                 if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
     454                 :                 {
     455             106 :                         _bt_vacuum_one_page(rel, buf);
     456                 : 
     457                 :                         /*
     458                 :                          * remember that we vacuumed this page, because that makes the
     459                 :                          * hint supplied by the caller invalid
     460                 :                          */
     461             106 :                         vacuumed = true;
     462                 : 
     463             106 :                         if (PageGetFreeSpace(page) >= itemsz)
     464             106 :                                 break;                  /* OK, now we have enough space */
     465                 :                 }
     466                 : 
     467                 :                 /*
     468                 :                  * nope, so check conditions (b) and (c) enumerated above
     469                 :                  */
     470            1531 :                 if (P_RIGHTMOST(lpageop) ||
     471                 :                         _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
     472                 :                         random() <= (MAX_RANDOM_VALUE / 100))
     473                 :                         break;
     474                 : 
     475                 :                 /*
     476                 :                  * step right to next non-dead page
     477                 :                  *
     478                 :                  * must write-lock that page before releasing write lock on current
     479                 :                  * page; else someone else's _bt_check_unique scan could fail to see
     480                 :                  * our insertion.  write locks on intermediate dead pages won't do
     481                 :                  * because we don't know when they will get de-linked from the tree.
     482                 :                  */
     483            1149 :                 rbuf = InvalidBuffer;
     484                 : 
     485                 :                 for (;;)
     486                 :                 {
     487            1149 :                         BlockNumber rblkno = lpageop->btpo_next;
     488                 : 
     489            1149 :                         rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
     490            1149 :                         page = BufferGetPage(rbuf);
     491            1149 :                         lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
     492            1149 :                         if (!P_IGNORE(lpageop))
     493            1149 :                                 break;
     494               0 :                         if (P_RIGHTMOST(lpageop))
     495               0 :                                 elog(ERROR, "fell off the end of index \"%s\"",
     496                 :                                          RelationGetRelationName(rel));
     497                 :                 }
     498            1149 :                 _bt_relbuf(rel, buf);
     499            1149 :                 buf = rbuf;
     500            1149 :                 movedright = true;
     501            1149 :                 vacuumed = false;
     502                 :         }
     503                 : 
     504                 :         /*
     505                 :          * Now we are on the right page, so find the insert position. If we moved
     506                 :          * right at all, we know we should insert at the start of the page. If we
     507                 :          * didn't move right, we can use the firstlegaloff hint if the caller
     508                 :          * supplied one, unless we vacuumed the page which might have moved tuples
     509                 :          * around making the hint invalid. If we didn't move right or can't use
     510                 :          * the hint, find the position by searching.
     511                 :          */
     512           70087 :         if (movedright)
     513            1149 :                 newitemoff = P_FIRSTDATAKEY(lpageop);
     514           68938 :         else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
     515           34147 :                 newitemoff = firstlegaloff;
     516                 :         else
     517           34791 :                 newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
     518                 : 
     519           70087 :         *bufptr = buf;
     520           70087 :         *offsetptr = newitemoff;
     521           70087 : }
     522                 : 
     523                 : /*----------
     524                 :  *      _bt_insertonpg() -- Insert a tuple on a particular page in the index.
     525                 :  *
     526                 :  *              This recursive procedure does the following things:
     527                 :  *
     528                 :  *                      +  if necessary, splits the target page (making sure that the
     529                 :  *                         split is equitable as far as post-insert free space goes).
     530                 :  *                      +  inserts the tuple.
     531                 :  *                      +  if the page was split, pops the parent stack, and finds the
     532                 :  *                         right place to insert the new child pointer (by walking
     533                 :  *                         right using information stored in the parent stack).
     534                 :  *                      +  invokes itself with the appropriate tuple for the right
     535                 :  *                         child page on the parent.
     536                 :  *                      +  updates the metapage if a true root or fast root is split.
     537                 :  *
     538                 :  *              On entry, we must have the right buffer in which to do the
     539                 :  *              insertion, and the buffer must be pinned and write-locked.  On return,
     540                 :  *              we will have dropped both the pin and the lock on the buffer.
     541                 :  *
     542                 :  *              The locking interactions in this code are critical.  You should
     543                 :  *              grok Lehman and Yao's paper before making any changes.  In addition,
     544                 :  *              you need to understand how we disambiguate duplicate keys in this
     545                 :  *              implementation, in order to be able to find our location using
     546                 :  *              L&Y "move right" operations.  Since we may insert duplicate user
     547                 :  *              keys, and since these dups may propagate up the tree, the caller
     548                 :  *              passes the exact target offset in 'newitemoff' to position the
     549                 :  *              insertion correctly on internal pages.
     550                 :  *----------
     551                 :  */
     552                 : static void
     553                 : _bt_insertonpg(Relation rel,                             /* index being inserted into */
     554                 :                            Buffer buf,                   /* pinned, write-locked target page */
     555                 :                            BTStack stack,                /* parent stack, handed to _bt_insert_parent */
     556                 :                            IndexTuple itup,              /* tuple to insert */
     557                 :                            OffsetNumber newitemoff,      /* page offset at which itup is inserted */
     558                 :                            bool split_only_page)         /* true when called after splitting a non-root page that was alone on its level */
     559           70456 : {
     560                 :         Page            page;
     561                 :         BTPageOpaque lpageop;
     562           70456 :         OffsetNumber firstright = InvalidOffsetNumber;  /* assigned only on the split path */
     563                 :         Size            itemsz;
     564                 : 
     565           70456 :         page = BufferGetPage(buf);
     566           70456 :         lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
     567                 : 
     568           70456 :         itemsz = IndexTupleDSize(*itup);
     569           70456 :         itemsz = MAXALIGN(itemsz);      /* be safe, PageAddItem will do this but we
     570                 :                                                                  * need to be consistent */
     571                 : 
     572                 :         /*
     573                 :          * Do we need to split the page to fit the item on it?
     574                 :          *
     575                 :          * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
     576                 :          * so this comparison is correct even though we appear to be accounting
     577                 :          * only for the item and not for its line pointer.
     578                 :          */
     579           70456 :         if (PageGetFreeSpace(page) < itemsz)
     580                 :         {
     581             383 :                 bool            is_root = P_ISROOT(lpageop);    /* sampled before _bt_split() is called */
     582             383 :                 bool            is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);  /* no siblings on this level */
     583                 :                 bool            newitemonleft;
     584                 :                 Buffer          rbuf;
     585                 : 
     586                 :                 /* Choose the split point; also reports which half receives the new item */
     587             383 :                 firstright = _bt_findsplitloc(rel, page,
     588                 :                                                                           newitemoff, itemsz,
     589                 :                                                                           &newitemonleft);
     590                 : 
     591                 :                 /* split the buffer into left and right halves; rbuf is the new right sibling */
     592             383 :                 rbuf = _bt_split(rel, buf, firstright,
     593                 :                                                  newitemoff, itemsz, itup, newitemonleft);
     594                 : 
     595                 :                 /*----------
     596                 :                  * By here,
     597                 :                  *
     598                 :                  *              +  our target page has been split;
     599                 :                  *              +  the original tuple has been inserted;
     600                 :                  *              +  we have write locks on both the old (left half)
     601                 :                  *                 and new (right half) buffers, after the split; and
     602                 :                  *              +  we know the key we want to insert into the parent
     603                 :                  *                 (it's the "high key" on the left child page).
     604                 :                  *
     605                 :                  * We're ready to do the parent insertion.  We need to hold onto the
     606                 :                  * locks for the child pages until we locate the parent, but we can
     607                 :                  * release them before doing the actual insertion (see Lehman and Yao
     608                 :                  * for the reasoning).
     609                 :                  *----------
     610                 :                  */
     611             383 :                 _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
     612                 :         }
     613                 :         else
     614                 :         {
     615           70073 :                 Buffer          metabuf = InvalidBuffer;        /* stays invalid unless the fast root must be updated */
     616           70073 :                 Page            metapg = NULL;
     617           70073 :                 BTMetaPageData *metad = NULL;
     618                 :                 OffsetNumber itup_off;
     619                 :                 BlockNumber itup_blkno;
     620                 : 
     621           70073 :                 itup_off = newitemoff;  /* location of the new tuple, recorded in the WAL record below */
     622           70073 :                 itup_blkno = BufferGetBlockNumber(buf);
     623                 : 
     624                 :                 /*
     625                 :                  * If we are doing this insert because we split a page that was the
     626                 :                  * only one on its tree level, but was not the root, it may have been
     627                 :                  * the "fast root".  We need to ensure that the fast root link points
     628                 :                  * at or above the current page.  We can safely acquire a lock on the
     629                 :                  * metapage here --- see comments for _bt_newroot().
     630                 :                  */
     631           70073 :                 if (split_only_page)
     632                 :                 {
     633                 :                         Assert(!P_ISLEAF(lpageop));
     634                 : 
     635               0 :                         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
     636               0 :                         metapg = BufferGetPage(metabuf);
     637               0 :                         metad = BTPageGetMeta(metapg);
     638                 : 
     639               0 :                         if (metad->btm_fastlevel >= lpageop->btpo.level)
     640                 :                         {
     641                 :                                 /* no update wanted: fast root level is already at or above this page's level */
     642               0 :                                 _bt_relbuf(rel, metabuf);
     643               0 :                                 metabuf = InvalidBuffer;
     644                 :                         }
     645                 :                 }
     646                 : 
     647                 :                 /* Do the update.  No ereport(ERROR) until changes are logged */
     648           70073 :                 START_CRIT_SECTION();
     649                 : 
     650           70073 :                 _bt_pgaddtup(rel, page, itemsz, itup, newitemoff, "page");
     651                 : 
     652           70073 :                 MarkBufferDirty(buf);
     653                 : 
     654           70073 :                 if (BufferIsValid(metabuf))
     655                 :                 {
     656               0 :                         metad->btm_fastroot = itup_blkno;       /* make this page the fast root */
     657               0 :                         metad->btm_fastlevel = lpageop->btpo.level;
     658               0 :                         MarkBufferDirty(metabuf);
     659                 :                 }
     660                 : 
     661                 :                 /* XLOG stuff (skipped when rel->rd_istemp is set) */
     662           70073 :                 if (!rel->rd_istemp)
     663                 :                 {
     664                 :                         xl_btree_insert xlrec;
     665                 :                         BlockNumber xldownlink;
     666                 :                         xl_btree_metadata xlmeta;
     667                 :                         uint8           xlinfo;
     668                 :                         XLogRecPtr      recptr;
     669                 :                         XLogRecData rdata[4];   /* chain: xlrec, [downlink], [metadata], tuple */
     670                 :                         XLogRecData *nextrdata;
     671                 :                         IndexTupleData trunctuple;
     672                 : 
     673           70032 :                         xlrec.target.node = rel->rd_node;
     674           70032 :                         ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
     675                 : 
     676           70032 :                         rdata[0].data = (char *) &xlrec;
     677           70032 :                         rdata[0].len = SizeOfBtreeInsert;
     678           70032 :                         rdata[0].buffer = InvalidBuffer;
     679           70032 :                         rdata[0].next = nextrdata = &(rdata[1]);        /* nextrdata tracks the next unused chain entry */
     680                 : 
     681           70032 :                         if (P_ISLEAF(lpageop))
     682           69664 :                                 xlinfo = XLOG_BTREE_INSERT_LEAF;
     683                 :                         else
     684                 :                         {
     685             368 :                                 xldownlink = ItemPointerGetBlockNumber(&(itup->t_tid)); /* block number stored in the tuple's t_tid */
     686                 :                                 Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
     687                 : 
     688             368 :                                 nextrdata->data = (char *) &xldownlink;
     689             368 :                                 nextrdata->len = sizeof(BlockNumber);
     690             368 :                                 nextrdata->buffer = InvalidBuffer;
     691             368 :                                 nextrdata->next = nextrdata + 1;
     692             368 :                                 nextrdata++;
     693                 : 
     694             368 :                                 xlinfo = XLOG_BTREE_INSERT_UPPER;
     695                 :                         }
     696                 : 
     697           70032 :                         if (BufferIsValid(metabuf))
     698                 :                         {
     699               0 :                                 xlmeta.root = metad->btm_root;
     700               0 :                                 xlmeta.level = metad->btm_level;
     701               0 :                                 xlmeta.fastroot = metad->btm_fastroot;
     702               0 :                                 xlmeta.fastlevel = metad->btm_fastlevel;
     703                 : 
     704               0 :                                 nextrdata->data = (char *) &xlmeta;
     705               0 :                                 nextrdata->len = sizeof(xl_btree_metadata);
     706               0 :                                 nextrdata->buffer = InvalidBuffer;
     707               0 :                                 nextrdata->next = nextrdata + 1;
     708               0 :                                 nextrdata++;
     709                 : 
     710               0 :                                 xlinfo = XLOG_BTREE_INSERT_META;        /* overrides the LEAF/UPPER choice made above */
     711                 :                         }
     712                 : 
     713                 :                         /* Read comments in _bt_pgaddtup; in the first branch only a bare tuple header is logged */
     714           70032 :                         if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
     715                 :                         {
     716               0 :                                 trunctuple = *itup;
     717               0 :                                 trunctuple.t_info = sizeof(IndexTupleData);
     718               0 :                                 nextrdata->data = (char *) &trunctuple;
     719               0 :                                 nextrdata->len = sizeof(IndexTupleData);
     720                 :                         }
     721                 :                         else
     722                 :                         {
     723           70032 :                                 nextrdata->data = (char *) itup;
     724           70032 :                                 nextrdata->len = IndexTupleDSize(*itup);
     725                 :                         }
     726           70032 :                         nextrdata->buffer = buf;        /* the tuple entry is the only one tied to a buffer */
     727           70032 :                         nextrdata->buffer_std = true;
     728           70032 :                         nextrdata->next = NULL; /* end of the rdata chain */
     729                 : 
     730           70032 :                         recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
     731                 : 
     732           70032 :                         if (BufferIsValid(metabuf))
     733                 :                         {
     734               0 :                                 PageSetLSN(metapg, recptr);
     735               0 :                                 PageSetTLI(metapg, ThisTimeLineID);
     736                 :                         }
     737                 : 
     738           70032 :                         PageSetLSN(page, recptr);
     739           70032 :                         PageSetTLI(page, ThisTimeLineID);
     740                 :                 }
     741                 : 
     742           70073 :                 END_CRIT_SECTION();
     743                 : 
     744                 :                 /* release buffers; send out relcache inval if metapage changed */
     745           70073 :                 if (BufferIsValid(metabuf))
     746                 :                 {
     747               0 :                         CacheInvalidateRelcache(rel);
     748               0 :                         _bt_relbuf(rel, metabuf);
     749                 :                 }
     750                 : 
     751           70073 :                 _bt_relbuf(rel, buf);
     752                 :         }
     753           70456 : }
     754                 : 
     755                 : /*
     756                 :  *      _bt_split() -- split a page in the btree.
     757                 :  *
     758                 :  *              On entry, buf is the page to split, and is pinned and write-locked.
     759                 :  *              firstright is the item index of the first item to be moved to the
     760                 :  *              new right page.  newitemoff etc. tell us about the new item that
     761                 :  *              must be inserted along with the data from the old page.
     762                 :  *
     763                 :  *              Returns the new right sibling of buf, pinned and write-locked.
     764                 :  *              The pin and lock on buf are maintained.
     765                 :  */
     766                 : static Buffer
     767                 : _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
     768                 :                   OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
     769                 :                   bool newitemonleft)
     770             383 : {
     771                 :         Buffer          rbuf;
     772                 :         Page            origpage;
     773                 :         Page            leftpage,
     774                 :                                 rightpage;
     775                 :         BTPageOpaque ropaque,
     776                 :                                 lopaque,
     777                 :                                 oopaque;
     778             383 :         Buffer          sbuf = InvalidBuffer;
     779             383 :         Page            spage = NULL;
     780             383 :         BTPageOpaque sopaque = NULL;
     781                 :         Size            itemsz;
     782                 :         ItemId          itemid;
     783                 :         IndexTuple      item;
     784                 :         OffsetNumber leftoff,
     785                 :                                 rightoff;
     786                 :         OffsetNumber maxoff;
     787                 :         OffsetNumber i;
     788                 :         bool            isroot;
     789                 : 
     790             383 :         rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
     791             383 :         origpage = BufferGetPage(buf);
     792             383 :         leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
     793             383 :         rightpage = BufferGetPage(rbuf);
     794                 : 
     795             383 :         _bt_pageinit(leftpage, BufferGetPageSize(buf));
     796                 :         /* rightpage was already initialized by _bt_getbuf */
     797                 : 
     798                 :         /*
     799                 :          * Copy the original page's LSN and TLI into leftpage, which will become
     800                 :          * the updated version of the page.  We need this because XLogInsert will
     801                 :          * examine these fields and possibly dump them in a page image.
     802                 :          */
     803             383 :         PageSetLSN(leftpage, PageGetLSN(origpage));
     804             383 :         PageSetTLI(leftpage, PageGetTLI(origpage));
     805                 : 
     806                 :         /* init btree private data */
     807             383 :         oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
     808             383 :         lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
     809             383 :         ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
     810                 : 
     811             383 :         isroot = P_ISROOT(oopaque);
     812                 : 
     813                 :         /* if we're splitting this page, it won't be the root when we're done */
     814                 :         /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
     815             383 :         lopaque->btpo_flags = oopaque->btpo_flags;
     816             383 :         lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
     817             383 :         ropaque->btpo_flags = lopaque->btpo_flags;
     818             383 :         lopaque->btpo_prev = oopaque->btpo_prev;
     819             383 :         lopaque->btpo_next = BufferGetBlockNumber(rbuf);
     820             383 :         ropaque->btpo_prev = BufferGetBlockNumber(buf);
     821             383 :         ropaque->btpo_next = oopaque->btpo_next;
     822             383 :         lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
     823                 :         /* Since we already have write-lock on both pages, ok to read cycleid */
     824             383 :         lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
     825             383 :         ropaque->btpo_cycleid = lopaque->btpo_cycleid;
     826                 : 
     827                 :         /*
     828                 :          * If the page we're splitting is not the rightmost page at its level in
     829                 :          * the tree, then the first entry on the page is the high key for the
     830                 :          * page.  We need to copy that to the right half.  Otherwise (meaning the
     831                 :          * rightmost page case), all the items on the right half will be user
     832                 :          * data.
     833                 :          */
     834             383 :         rightoff = P_HIKEY;
     835                 : 
     836             383 :         if (!P_RIGHTMOST(oopaque))
     837                 :         {
     838             221 :                 itemid = PageGetItemId(origpage, P_HIKEY);
     839             221 :                 itemsz = ItemIdGetLength(itemid);
     840             221 :                 item = (IndexTuple) PageGetItem(origpage, itemid);
     841             221 :                 if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
     842                 :                                                 false, false) == InvalidOffsetNumber)
     843               0 :                         elog(PANIC, "failed to add hikey to the right sibling"
     844                 :                                  " while splitting block %u of index \"%s\"",
     845                 :                                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
     846             221 :                 rightoff = OffsetNumberNext(rightoff);
     847                 :         }
     848                 : 
     849                 :         /*
     850                 :          * The "high key" for the new left page will be the first key that's going
     851                 :          * to go into the new right page.  This might be either the existing data
     852                 :          * item at position firstright, or the incoming tuple.
     853                 :          */
     854             383 :         leftoff = P_HIKEY;
     855             383 :         if (!newitemonleft && newitemoff == firstright)
     856                 :         {
     857                 :                 /* incoming tuple will become first on right page */
     858               2 :                 itemsz = newitemsz;
     859               2 :                 item = newitem;
     860                 :         }
     861                 :         else
     862                 :         {
     863                 :                 /* existing item at firstright will become first on right page */
     864             381 :                 itemid = PageGetItemId(origpage, firstright);
     865             381 :                 itemsz = ItemIdGetLength(itemid);
     866             381 :                 item = (IndexTuple) PageGetItem(origpage, itemid);
     867                 :         }
     868             383 :         if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
     869                 :                                         false, false) == InvalidOffsetNumber)
     870               0 :                 elog(PANIC, "failed to add hikey to the left sibling"
     871                 :                          " while splitting block %u of index \"%s\"",
     872                 :                          BufferGetBlockNumber(buf), RelationGetRelationName(rel));
     873             383 :         leftoff = OffsetNumberNext(leftoff);
     874                 : 
     875                 :         /*
     876                 :          * Now transfer all the data items to the appropriate page.
     877                 :          *
     878                 :          * Note: we *must* insert at least the right page's items in item-number
     879                 :          * order, for the benefit of _bt_restore_page().
     880                 :          */
     881             383 :         maxoff = PageGetMaxOffsetNumber(origpage);
     882                 : 
     883           86862 :         for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
     884                 :         {
     885           86479 :                 itemid = PageGetItemId(origpage, i);
     886           86479 :                 itemsz = ItemIdGetLength(itemid);
     887           86479 :                 item = (IndexTuple) PageGetItem(origpage, itemid);
     888                 : 
     889                 :                 /* does new item belong before this one? */
     890           86479 :                 if (i == newitemoff)
     891                 :                 {
     892             315 :                         if (newitemonleft)
     893                 :                         {
     894              92 :                                 _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
     895                 :                                                          "left sibling");
     896              92 :                                 leftoff = OffsetNumberNext(leftoff);
     897                 :                         }
     898                 :                         else
     899                 :                         {
     900             223 :                                 _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
     901                 :                                                          "right sibling");
     902             223 :                                 rightoff = OffsetNumberNext(rightoff);
     903                 :                         }
     904                 :                 }
     905                 : 
     906                 :                 /* decide which page to put it on */
     907           86479 :                 if (i < firstright)
     908                 :                 {
     909           53732 :                         _bt_pgaddtup(rel, leftpage, itemsz, item, leftoff,
     910                 :                                                  "left sibling");
     911           53732 :                         leftoff = OffsetNumberNext(leftoff);
     912                 :                 }
     913                 :                 else
     914                 :                 {
     915           32747 :                         _bt_pgaddtup(rel, rightpage, itemsz, item, rightoff,
     916                 :                                                  "right sibling");
     917           32747 :                         rightoff = OffsetNumberNext(rightoff);
     918                 :                 }
     919                 :         }
     920                 : 
     921                 :         /* cope with possibility that newitem goes at the end */
     922             383 :         if (i <= newitemoff)
     923                 :         {
     924                 :                 /*
     925                 :                  * Can't have newitemonleft here; that would imply we were told to put
     926                 :                  * *everything* on the left page, which cannot fit (if it could, we'd
     927                 :                  * not be splitting the page).
     928                 :                  */
     929                 :                 Assert(!newitemonleft);
     930              68 :                 _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
     931                 :                                          "right sibling");
     932              68 :                 rightoff = OffsetNumberNext(rightoff);
     933                 :         }
     934                 : 
     935                 :         /*
     936                 :          * We have to grab the right sibling (if any) and fix the prev pointer
     937                 :          * there. We are guaranteed that this is deadlock-free since no other
     938                 :          * writer will be holding a lock on that page and trying to move left, and
     939                 :          * all readers release locks on a page before trying to fetch its
     940                 :          * neighbors.
     941                 :          */
     942                 : 
     943             383 :         if (!P_RIGHTMOST(ropaque))
     944                 :         {
     945             221 :                 sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
     946             221 :                 spage = BufferGetPage(sbuf);
     947             221 :                 sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
     948             221 :                 if (sopaque->btpo_prev != ropaque->btpo_prev)
     949               0 :                         elog(PANIC, "right sibling's left-link doesn't match: "
     950                 :                                  "block %u links to %u instead of expected %u in index \"%s\"",
     951                 :                                  ropaque->btpo_next, sopaque->btpo_prev, ropaque->btpo_prev,
     952                 :                                  RelationGetRelationName(rel));
     953                 : 
     954                 :                 /*
     955                 :                  * Check to see if we can set the SPLIT_END flag in the right-hand
     956                 :                  * split page; this can save some I/O for vacuum since it need not
     957                 :                  * proceed to the right sibling.  We can set the flag if the right
     958                 :                  * sibling has a different cycleid: that means it could not be part of
     959                 :                  * a group of pages that were all split off from the same ancestor
     960                 :                  * page.  If you're confused, imagine that page A splits to A B and
     961                 :                  * then again, yielding A C B, while vacuum is in progress.  Tuples
     962                 :                  * originally in A could now be in either B or C, hence vacuum must
     963                 :                  * examine both pages.  But if D, our right sibling, has a different
     964                 :                  * cycleid then it could not contain any tuples that were in A when
     965                 :                  * the vacuum started.
     966                 :                  */
     967             221 :                 if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
     968               0 :                         ropaque->btpo_flags |= BTP_SPLIT_END;
     969                 :         }
     970                 : 
     971                 :         /*
     972                 :          * Right sibling is locked, new siblings are prepared, but original page
     973                 :          * is not updated yet.
     974                 :          *
     975                 :          * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
     976                 :          * not starting the critical section till here because we haven't been
     977                 :          * scribbling on the original page yet, and we don't care about the new
     978                 :          * sibling until it's linked into the btree.
     979                 :          */
     980             383 :         START_CRIT_SECTION();
     981                 : 
     982                 :         /*
     983                 :          * By here, the original data page has been split into two new halves, and
     984                 :          * these are correct.  The algorithm requires that the left page never
     985                 :          * move during a split, so we copy the new left page back on top of the
     986                 :          * original.  Note that this is not a waste of time, since we also require
     987                 :          * (in the page management code) that the center of a page always be
     988                 :          * clean, and the most efficient way to guarantee this is just to compact
     989                 :          * the data by reinserting it into a new left page.  (XXX the latter
     990                 :          * comment is probably obsolete.)
     991                 :          *
     992                 :          * We need to do this before writing the WAL record, so that XLogInsert
     993                 :          * can WAL log an image of the page if necessary.
     994                 :          */
     995             383 :         PageRestoreTempPage(leftpage, origpage);
     996                 : 
     997             383 :         MarkBufferDirty(buf);
     998             383 :         MarkBufferDirty(rbuf);
     999                 : 
    1000             383 :         if (!P_RIGHTMOST(ropaque))
    1001                 :         {
    1002             221 :                 sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
    1003             221 :                 MarkBufferDirty(sbuf);
    1004                 :         }
    1005                 : 
    1006                 :         /* XLOG stuff */
    1007             383 :         if (!rel->rd_istemp)
    1008                 :         {
    1009                 :                 xl_btree_split xlrec;
    1010                 :                 uint8           xlinfo;
    1011                 :                 XLogRecPtr      recptr;
    1012                 :                 XLogRecData rdata[7];
    1013                 :                 XLogRecData *lastrdata;
    1014                 : 
    1015             383 :                 xlrec.node = rel->rd_node;
    1016             383 :                 xlrec.leftsib = BufferGetBlockNumber(buf);
    1017             383 :                 xlrec.rightsib = BufferGetBlockNumber(rbuf);
    1018             383 :                 xlrec.rnext = ropaque->btpo_next;
    1019             383 :                 xlrec.level = ropaque->btpo.level;
    1020             383 :                 xlrec.firstright = firstright;
    1021                 : 
    1022             383 :                 rdata[0].data = (char *) &xlrec;
    1023             383 :                 rdata[0].len = SizeOfBtreeSplit;
    1024             383 :                 rdata[0].buffer = InvalidBuffer;
    1025                 : 
    1026             383 :                 lastrdata = &rdata[0];
    1027                 : 
    1028             383 :                 if (ropaque->btpo.level > 0)
    1029                 :                 {
    1030                 :                         /* Log downlink on non-leaf pages */
    1031               1 :                         lastrdata->next = lastrdata + 1;
    1032               1 :                         lastrdata++;
    1033                 : 
    1034               1 :                         lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
    1035               1 :                         lastrdata->len = sizeof(BlockIdData);
    1036               1 :                         lastrdata->buffer = InvalidBuffer;
    1037                 : 
    1038                 :                         /*
    1039                 :                          * We must also log the left page's high key, because the right
    1040                 :                          * page's leftmost key is suppressed on non-leaf levels.  Show it
    1041                 :                          * as belonging to the left page buffer, so that it is not stored
    1042                 :                          * if XLogInsert decides it needs a full-page image of the left
    1043                 :                          * page.
    1044                 :                          */
    1045               1 :                         lastrdata->next = lastrdata + 1;
    1046               1 :                         lastrdata++;
    1047                 : 
    1048               1 :                         itemid = PageGetItemId(origpage, P_HIKEY);
    1049               1 :                         item = (IndexTuple) PageGetItem(origpage, itemid);
    1050               1 :                         lastrdata->data = (char *) item;
    1051               1 :                         lastrdata->len = MAXALIGN(IndexTupleSize(item));
    1052               1 :                         lastrdata->buffer = buf;     /* backup block 1 */
    1053               1 :                         lastrdata->buffer_std = true;
    1054                 :                 }
    1055                 : 
    1056                 :                 /*
    1057                 :                  * Log the new item and its offset, if it was inserted on the left
    1058                 :                  * page. (If it was put on the right page, we don't need to explicitly
    1059                 :                  * WAL log it because it's included with all the other items on the
    1060                 :                  * right page.) Show the new item as belonging to the left page
    1061                 :                  * buffer, so that it is not stored if XLogInsert decides it needs a
    1062                 :                  * full-page image of the left page.  We store the offset anyway,
    1063                 :                  * though, to support archive compression of these records.
    1064                 :                  */
    1065             383 :                 if (newitemonleft)
    1066                 :                 {
    1067              92 :                         lastrdata->next = lastrdata + 1;
    1068              92 :                         lastrdata++;
    1069                 : 
    1070              92 :                         lastrdata->data = (char *) &newitemoff;
    1071              92 :                         lastrdata->len = sizeof(OffsetNumber);
    1072              92 :                         lastrdata->buffer = InvalidBuffer;
    1073                 : 
    1074              92 :                         lastrdata->next = lastrdata + 1;
    1075              92 :                         lastrdata++;
    1076                 : 
    1077              92 :                         lastrdata->data = (char *) newitem;
    1078              92 :                         lastrdata->len = MAXALIGN(newitemsz);
    1079              92 :                         lastrdata->buffer = buf;     /* backup block 1 */
    1080              92 :                         lastrdata->buffer_std = true;
    1081                 :                 }
    1082             291 :                 else if (ropaque->btpo.level == 0)
    1083                 :                 {
    1084                 :                         /*
    1085                 :                          * Although we don't need to WAL-log the new item, we still need
    1086                 :                          * XLogInsert to consider storing a full-page image of the left
    1087                 :                          * page, so make an empty entry referencing that buffer. This also
    1088                 :                          * ensures that the left page is always backup block 1.
    1089                 :                          */
    1090             290 :                         lastrdata->next = lastrdata + 1;
    1091             290 :                         lastrdata++;
    1092                 : 
    1093             290 :                         lastrdata->data = NULL;
    1094             290 :                         lastrdata->len = 0;
    1095             290 :                         lastrdata->buffer = buf;     /* backup block 1 */
    1096             290 :                         lastrdata->buffer_std = true;
    1097                 :                 }
    1098                 : 
    1099                 :                 /*
    1100                 :                  * Log the contents of the right page in the format understood by
    1101                 :                  * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
    1102                 :                  * because we're going to recreate the whole page anyway, so it should
    1103                 :                  * never be stored by XLogInsert.
    1104                 :                  *
    1105                 :                  * Direct access to the page is not good, but it is faster; we should
    1106                 :                  * add a function to the page API for this.  Note we only store the tuples
    1107                 :                  * themselves, knowing that they were inserted in item-number order
    1108                 :                  * and so the item pointers can be reconstructed.  See comments for
    1109                 :                  * _bt_restore_page().
    1110                 :                  */
    1111             383 :                 lastrdata->next = lastrdata + 1;
    1112             383 :                 lastrdata++;
    1113                 : 
    1114             383 :                 lastrdata->data = (char *) rightpage +
    1115                 :                         ((PageHeader) rightpage)->pd_upper;
    1116             383 :                 lastrdata->len = ((PageHeader) rightpage)->pd_special -
    1117                 :                         ((PageHeader) rightpage)->pd_upper;
    1118             383 :                 lastrdata->buffer = InvalidBuffer;
    1119                 : 
    1120                 :                 /* Log the right sibling, because we've changed its prev-pointer. */
    1121             383 :                 if (!P_RIGHTMOST(ropaque))
    1122                 :                 {
    1123             221 :                         lastrdata->next = lastrdata + 1;
    1124             221 :                         lastrdata++;
    1125                 : 
    1126             221 :                         lastrdata->data = NULL;
    1127             221 :                         lastrdata->len = 0;
    1128             221 :                         lastrdata->buffer = sbuf;    /* backup block 2 */
    1129             221 :                         lastrdata->buffer_std = true;
    1130                 :                 }
    1131                 : 
    1132             383 :                 lastrdata->next = NULL;
    1133                 : 
    1134             383 :                 if (isroot)
    1135              14 :                         xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
    1136                 :                 else
    1137             369 :                         xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
    1138                 : 
    1139             383 :                 recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
    1140                 : 
    1141             383 :                 PageSetLSN(origpage, recptr);
    1142             383 :                 PageSetTLI(origpage, ThisTimeLineID);
    1143             383 :                 PageSetLSN(rightpage, recptr);
    1144             383 :                 PageSetTLI(rightpage, ThisTimeLineID);
    1145             383 :                 if (!P_RIGHTMOST(ropaque))
    1146                 :                 {
    1147             221 :                         PageSetLSN(spage, recptr);
    1148             221 :                         PageSetTLI(spage, ThisTimeLineID);
    1149                 :                 }
    1150                 :         }
    1151                 : 
    1152             383 :         END_CRIT_SECTION();
    1153                 : 
    1154                 :         /* release the old right sibling */
    1155             383 :         if (!P_RIGHTMOST(ropaque))
    1156             221 :                 _bt_relbuf(rel, sbuf);
    1157                 : 
    1158                 :         /* split's done */
    1159             383 :         return rbuf;
    1160                 : }
    1161                 : 
    1162                 : /*
    1163                 :  *      _bt_findsplitloc() -- find an appropriate place to split a page.
    1164                 :  *
    1165                 :  * The idea here is to equalize the free space that will be on each split
    1166                 :  * page, *after accounting for the inserted tuple*.  (If we fail to account
    1167                 :  * for it, we might find ourselves with too little room on the page that
    1168                 :  * it needs to go into!)
    1169                 :  *
    1170                 :  * If the page is the rightmost page on its level, we instead try to arrange
    1171                 :  * to leave the left split page fillfactor% full.  In this way, when we are
    1172                 :  * inserting successively increasing keys (consider sequences, timestamps,
    1173                 :  * etc) we will end up with a tree whose pages are about fillfactor% full,
    1174                 :  * instead of the 50% full result that we'd get without this special case.
    1175                 :  * This is the same as nbtsort.c produces for a newly-created tree.  Note
    1176                 :  * that leaf and nonleaf pages use different fillfactors.
    1177                 :  *
    1178                 :  * We are passed the intended insert position of the new tuple, expressed as
    1179                 :  * the offsetnumber of the tuple it must go in front of.  (This could be
    1180                 :  * maxoff+1 if the tuple is to go at the end.)
    1181                 :  *
    1182                 :  * We return the index of the first existing tuple that should go on the
    1183                 :  * righthand page, plus a boolean (stored through *newitemonleft) indicating
    1184                 :  * whether the new tuple goes on the left or right page.  The bool is necessary
    1185                 :  * to disambiguate the case where firstright == newitemoff.  If no feasible split was recorded, elog(ERROR) is raised.
    1186                 :  */
    1187                 : static OffsetNumber
    1188                 : _bt_findsplitloc(Relation rel,
    1189                 :                                  Page page,
    1190                 :                                  OffsetNumber newitemoff,
    1191                 :                                  Size newitemsz,
    1192                 :                                  bool *newitemonleft)
    1193             383 : {
    1194                 :         BTPageOpaque opaque;
    1195                 :         OffsetNumber offnum;
    1196                 :         OffsetNumber maxoff;
    1197                 :         ItemId          itemid;
    1198                 :         FindSplitData state;
    1199                 :         int                     leftspace,
    1200                 :                                 rightspace,
    1201                 :                                 goodenough,
    1202                 :                                 olddataitemstotal,
    1203                 :                                 olddataitemstoleft;
    1204                 :         bool            goodenoughfound;
    1205                 : 
    1206             383 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1207                 : 
    1208                 :         /* Passed-in newitemsz is MAXALIGNED but does not include line pointer; add the ItemIdData overhead here */
    1209             383 :         newitemsz += sizeof(ItemIdData);
    1210                 : 
    1211                 :         /* Total free space available on a btree page, after fixed overhead (page header and BTPageOpaqueData) */
    1212             383 :         leftspace = rightspace =
    1213                 :                 PageGetPageSize(page) - SizeOfPageHeaderData -
    1214                 :                 MAXALIGN(sizeof(BTPageOpaqueData));
    1215                 : 
    1216                 :         /* The right page will have the same high key as the old page, so deduct its size plus line pointer from rightspace */
    1217             383 :         if (!P_RIGHTMOST(opaque))
    1218                 :         {
    1219             221 :                 itemid = PageGetItemId(page, P_HIKEY);
    1220             221 :                 rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
    1221                 :                                                          sizeof(ItemIdData));
    1222                 :         }
    1223                 : 
    1224                 :         /* Count up total space in data items without actually scanning 'em: rightspace (high key already deducted) minus exact free space */
    1225             383 :         olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
    1226                 : 
    1227             383 :         state.newitemsz = newitemsz;
    1228             383 :         state.is_leaf = P_ISLEAF(opaque);
    1229             383 :         state.is_rightmost = P_RIGHTMOST(opaque);
    1230             383 :         state.have_split = false;
    1231             383 :         if (state.is_leaf)
    1232             382 :                 state.fillfactor = RelationGetFillFactor(rel,
    1233                 :                                                                                                  BTREE_DEFAULT_FILLFACTOR);
    1234                 :         else
    1235               1 :                 state.fillfactor = BTREE_NONLEAF_FILLFACTOR;
    1236             383 :         state.newitemonleft = false;    /* these just to keep compiler quiet */
    1237             383 :         state.firstright = 0;
    1238             383 :         state.best_delta = 0;
    1239             383 :         state.leftspace = leftspace;
    1240             383 :         state.rightspace = rightspace;
    1241             383 :         state.olddataitemstotal = olddataitemstotal;
    1242             383 :         state.newitemoff = newitemoff;
    1243                 : 
    1244                 :         /*
    1245                 :          * Finding the best possible split would require checking all the possible
    1246                 :          * split points, because of the high-key and left-key special cases.
    1247                 :          * That's probably more work than it's worth; instead, stop as soon as we
    1248                 :          * find a "good-enough" split, where good-enough is defined as an
    1249                 :          * imbalance in free space of no more than 1/16 of the usable page space
    1250                 :          * (an arbitrary threshold).  This should let us stop near the middle on
    1251                 :          * most pages, instead of plowing to the end.
    1252                 :          */
    1253             383 :         goodenough = leftspace / 16;
    1254                 : 
    1255                 :         /*
    1256                 :          * Scan through the data items and calculate space usage for a split at
    1257                 :          * each possible position; olddataitemstoleft sums the items before offnum.
    1258                 :          */
    1259             383 :         olddataitemstoleft = 0;
    1260             383 :         goodenoughfound = false;
    1261             383 :         maxoff = PageGetMaxOffsetNumber(page);
    1262                 : 
    1263             383 :         for (offnum = P_FIRSTDATAKEY(opaque);
    1264           55691 :                  offnum <= maxoff;
    1265           54925 :                  offnum = OffsetNumberNext(offnum))
    1266                 :         {
    1267                 :                 Size            itemsz;
    1268                 : 
    1269           55192 :                 itemid = PageGetItemId(page, offnum);
    1270           55192 :                 itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
    1271                 : 
    1272                 :                 /*
    1273                 :                  * Will the new item go to left or right of a split at offnum?
    1274                 :                  */
    1275           55192 :                 if (offnum > newitemoff)
    1276            7352 :                         _bt_checksplitloc(&state, offnum, true,
    1277                 :                                                           olddataitemstoleft, itemsz);
    1278                 : 
    1279           47840 :                 else if (offnum < newitemoff)
    1280           47684 :                         _bt_checksplitloc(&state, offnum, false,
    1281                 :                                                           olddataitemstoleft, itemsz);
    1282                 :                 else
    1283                 :                 {
    1284                 :                         /* offnum == newitemoff, so the new item could land on either side: need to try it both ways! */
    1285             156 :                         _bt_checksplitloc(&state, offnum, true,
    1286                 :                                                           olddataitemstoleft, itemsz);
    1287                 : 
    1288             156 :                         _bt_checksplitloc(&state, offnum, false,
    1289                 :                                                           olddataitemstoleft, itemsz);
    1290                 :                 }
    1291                 : 
    1292                 :                 /* Abort scan once we find a good-enough choice */
    1293           55192 :                 if (state.have_split && state.best_delta <= goodenough)
    1294                 :                 {
    1295             267 :                         goodenoughfound = true;
    1296             267 :                         break;
    1297                 :                 }
    1298                 : 
    1299           54925 :                 olddataitemstoleft += itemsz;
    1300                 :         }
    1301                 : 
    1302                 :         /*
    1303                 :          * If the new item goes as the last item, check for splitting so that all
    1304                 :          * the old items go to the left page and the new item goes to the right
    1305                 :          * page.
    1306                 :          */
    1307             383 :         if (newitemoff > maxoff && !goodenoughfound)
    1308              42 :                 _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
    1309                 : 
    1310                 :         /*
    1311                 :          * I believe it is not possible to fail to find a feasible split, but just
    1312                 :          * in case ...
    1313                 :          */
    1314             383 :         if (!state.have_split)
    1315               0 :                 elog(ERROR, "could not find a feasible split point for index \"%s\"",
    1316                 :                          RelationGetRelationName(rel));
    1317                 : 
    1318             383 :         *newitemonleft = state.newitemonleft;
    1319             383 :         return state.firstright;
    1320                 : }
    1321                 : 
    1322                 : /*
    1323                 :  * Subroutine to analyze a particular possible split choice (ie, firstright
    1324                 :  * and newitemonleft settings), and record the best split so far in *state.
    1325                 :  *
    1326                 :  * firstoldonright is the offset of the first item on the original page
    1327                 :  * that goes to the right page, and firstoldonrightsz is the size of that
    1328                 :  * tuple. firstoldonright can be > max offset, which means that all the old
    1329                 :  * items go to the left page and only the new item goes to the right page.
    1330                 :  * In that case, firstoldonrightsz is not used.
    1331                 :  *
    1332                 :  * olddataitemstoleft is the total size of all old items to the left of
    1333                 :  * firstoldonright.
                         :  *
                         :  * newitemonleft says whether this candidate places the new item on the
                         :  * left page (true) or on the right page (false).
                         :  *
                         :  * A candidate is feasible only if the free space computed for both pages
                         :  * is non-negative; an infeasible candidate leaves *state unchanged.  Among
                         :  * feasible candidates the one with the smallest absolute delta is kept:
                         :  * state->have_split, newitemonleft, firstright and best_delta are updated.
                         :  * On a tie the earlier candidate is retained (the comparison is strict).
    1334                 :  */
    1335                 : static void
    1336                 : _bt_checksplitloc(FindSplitData *state,
    1337                 :                                   OffsetNumber firstoldonright,
    1338                 :                                   bool newitemonleft,
    1339                 :                                   int olddataitemstoleft,
    1340                 :                                   Size firstoldonrightsz)
    1341           55390 : {
    1342                 :         int                     leftfree,
    1343                 :                                 rightfree;
    1344                 :         Size            firstrightitemsz;
    1345                 :         bool            newitemisfirstonright;
    1346                 : 
    1347                 :         /* Is the new item going to be the first item on the right page? */
    1348           55390 :         newitemisfirstonright = (firstoldonright == state->newitemoff
    1349                 :                                                          && !newitemonleft);
    1350                 : 
    1351           55390 :         if (newitemisfirstonright)
    1352             198 :                 firstrightitemsz = state->newitemsz;
    1353                 :         else
    1354           55192 :                 firstrightitemsz = firstoldonrightsz;
    1355                 : 
    1356                 :         /* Account for all the old tuples */
    1357           55390 :         leftfree = state->leftspace - olddataitemstoleft;
    1358           55390 :         rightfree = state->rightspace -
    1359                 :                 (state->olddataitemstotal - olddataitemstoleft);
    1360                 : 
    1361                 :         /*
    1362                 :          * The first item on the right page becomes the high key of the left page;
    1363                 :          * therefore it counts against left space as well as right space.
    1364                 :          */
    1365           55390 :         leftfree -= firstrightitemsz;
    1366                 : 
    1367                 :         /* account for the new item */
    1368           55390 :         if (newitemonleft)
    1369            7508 :                 leftfree -= (int) state->newitemsz;
    1370                 :         else
    1371           47882 :                 rightfree -= (int) state->newitemsz;
    1372                 : 
    1373                 :         /*
    1374                 :          * If we are not on the leaf level, we will be able to discard the key
    1375                 :          * data from the first item that winds up on the right page.
                         :          * So credit that item's size back to the right page, less a
                         :          * MAXALIGN'd IndexTupleData plus an ItemIdData (presumably what the
                         :          * truncated item still occupies).
    1376                 :          */
    1377           55390 :         if (!state->is_leaf)
    1378             103 :                 rightfree += (int) firstrightitemsz -
    1379                 :                         (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
    1380                 : 
    1381                 :         /*
    1382                 :          * If feasible split point (free space is non-negative on both sides),
                         :          * remember best delta.
    1383                 :          */
    1384           55390 :         if (leftfree >= 0 && rightfree >= 0)
    1385                 :         {
    1386                 :                 int                     delta;
    1387                 : 
    1388           54891 :                 if (state->is_rightmost)
    1389                 :                 {
    1390                 :                         /*
    1391                 :                          * If splitting a rightmost page, try to put (100-fillfactor)% of
    1392                 :                          * free space on left page. See comments for _bt_findsplitloc.
                         :                          * delta is zero when leftfree : rightfree equals
                         :                          * (100 - fillfactor) : fillfactor.
    1393                 :                          */
    1394           28422 :                         delta = (state->fillfactor * leftfree)
    1395                 :                                 - ((100 - state->fillfactor) * rightfree);
    1396                 :                 }
    1397                 :                 else
    1398                 :                 {
    1399                 :                         /* Otherwise, aim for equal free space on both sides */
    1400           26469 :                         delta = leftfree - rightfree;
    1401                 :                 }
    1402                 : 
    1403           54891 :                 if (delta < 0)
    1404            1228 :                         delta = -delta;
    1405           54891 :                 if (!state->have_split || delta < state->best_delta)
    1406                 :                 {
    1407           53751 :                         state->have_split = true;
    1408           53751 :                         state->newitemonleft = newitemonleft;
    1409           53751 :                         state->firstright = firstoldonright;
    1410           53751 :                         state->best_delta = delta;
    1411                 :                 }
    1412                 :         }
    1413           55390 : }
    1414                 : 
    1415                 : /*
    1416                 :  * _bt_insert_parent() -- Insert downlink into parent after a page split.
    1417                 :  *
    1418                 :  * On entry, buf and rbuf are the left and right split pages, which we
    1419                 :  * still hold write locks on per the L&Y algorithm.  We release the
    1420                 :  * write locks once we have write lock on the parent page.  (Any sooner,
    1421                 :  * and it'd be possible for some other process to try to split or delete
    1422                 :  * one of these pages, and get confused because it cannot find the downlink.)
    1423                 :  *
                         :  * rel - the index relation; passed through to the buffer and insert
                         :  *                      routines and named in error messages
    1424                 :  * stack - stack showing how we got here.  May be NULL in cases that don't
    1425                 :  *                      have to be efficient (concurrent ROOT split, WAL recovery)
    1426                 :  * is_root - we split the true root
    1427                 :  * is_only - we split a page alone on its level (might have been fast root)
    1428                 :  *
                         :  * Both buf and rbuf have been released via _bt_relbuf by the time this
                         :  * function returns normally.
                         :  *
    1429                 :  * This is exported so it can be called by nbtxlog.c.
    1430                 :  */
    1431                 : void
    1432                 : _bt_insert_parent(Relation rel,
    1433                 :                                   Buffer buf,
    1434                 :                                   Buffer rbuf,
    1435                 :                                   BTStack stack,
    1436                 :                                   bool is_root,
    1437                 :                                   bool is_only)
    1438             383 : {
    1439                 :         /*
    1440                 :          * Here we have to do something Lehman and Yao don't talk about: deal with
    1441                 :          * a root split and construction of a new root.  If our stack is empty
    1442                 :          * then we have just split a node on what had been the root level when we
    1443                 :          * descended the tree.  If it was still the root then we perform a
    1444                 :          * new-root construction.  If it *wasn't* the root anymore, search to find
    1445                 :          * the next higher level that someone constructed meanwhile, and find the
    1446                 :          * right place to insert as for the normal case.
    1447                 :          *
    1448                 :          * If we have to search for the parent level, we do so by re-descending
    1449                 :          * from the root.  This is not super-efficient, but it's rare enough not
    1450                 :          * to matter.  (This path is also taken when called from WAL recovery ---
    1451                 :          * we have no stack in that case.)
    1452                 :          */
    1453             383 :         if (is_root)
    1454                 :         {
    1455                 :                 Buffer          rootbuf;
    1456                 : 
    1457                 :                 Assert(stack == NULL);
    1458                 :                 Assert(is_only);
    1459                 :                 /* create a new root node and update the metapage */
    1460              14 :                 rootbuf = _bt_newroot(rel, buf, rbuf);
    1461                 :                 /* release the new root buffer and the two split buffers */
    1462              14 :                 _bt_relbuf(rel, rootbuf);
    1463              14 :                 _bt_relbuf(rel, rbuf);
    1464              14 :                 _bt_relbuf(rel, buf);
    1465                 :         }
    1466                 :         else
    1467                 :         {
    1468             369 :                 BlockNumber bknum = BufferGetBlockNumber(buf);
    1469             369 :                 BlockNumber rbknum = BufferGetBlockNumber(rbuf);
    1470             369 :                 Page            page = BufferGetPage(buf);
    1471                 :                 IndexTuple      new_item;
    1472                 :                 BTStackData fakestack;
    1473                 :                 IndexTuple      ritem;
    1474                 :                 Buffer          pbuf;
    1475                 : 
    1476             369 :                 if (stack == NULL)
    1477                 :                 {
    1478                 :                         BTPageOpaque lpageop;
    1479                 : 
    1480               0 :                         if (!InRecovery)
    1481               0 :                                 elog(DEBUG2, "concurrent ROOT page split");
    1482               0 :                         lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
    1483                 :                         /* Find the leftmost page at the next level up */
    1484               0 :                         pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false);
    1485                 :                         /* Set up a phony stack entry pointing there */
    1486               0 :                         stack = &fakestack;
    1487               0 :                         stack->bts_blkno = BufferGetBlockNumber(pbuf);
    1488               0 :                         stack->bts_offset = InvalidOffsetNumber;
    1489                 :                         /* bts_btentry will be initialized below */
    1490               0 :                         stack->bts_parent = NULL;
    1491               0 :                         _bt_relbuf(rel, pbuf);
    1492                 :                 }
    1493                 : 
    1494                 :                 /* get high key from left page == lowest key on new right page */
    1495             369 :                 ritem = (IndexTuple) PageGetItem(page,
    1496                 :                                                                                  PageGetItemId(page, P_HIKEY));
    1497                 : 
    1498                 :                 /* form an index tuple that points at the new right page */
    1499             369 :                 new_item = CopyIndexTuple(ritem);
    1500             369 :                 ItemPointerSet(&(new_item->t_tid), rbknum, P_HIKEY);
    1501                 : 
    1502                 :                 /*
    1503                 :                  * Find the parent buffer and get the parent page.
    1504                 :                  *
    1505                 :                  * If we were moved right then the saved stack item no longer points
    1506                 :                  * at the page we split.  Reset its downlink to the left page (bknum)
    1507                 :                  * so that _bt_getstackbuf looks for the parent item pointing to where
                         :                  * we are now.  (Original note by vadim, 05/27/97.)
    1508                 :                  */
    1509             369 :                 ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);
    1510                 : 
    1511             369 :                 pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
    1512                 : 
    1513                 :                 /* Now we can unlock the children */
    1514             369 :                 _bt_relbuf(rel, rbuf);
    1515             369 :                 _bt_relbuf(rel, buf);
    1516                 : 
    1517                 :                 /* Check for error only after releasing the children */
    1518             369 :                 if (pbuf == InvalidBuffer)
    1519               0 :                         elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
    1520                 :                                  RelationGetRelationName(rel), bknum, rbknum);
    1521                 : 
    1522                 :                 /*
                         :                  * Recursively update the parent: the new downlink goes at
                         :                  * bts_offset + 1, i.e. immediately after the position where
                         :                  * _bt_getstackbuf found the left page's downlink.
                         :                  */
    1523             369 :                 _bt_insertonpg(rel, pbuf, stack->bts_parent,
    1524                 :                                            new_item, stack->bts_offset + 1,
    1525                 :                                            is_only);
    1526                 : 
    1527                 :                 /* be tidy */
    1528             369 :                 pfree(new_item);
    1529                 :         }
    1530             383 : }
    1531                 : 
    1532                 : /*
    1533                 :  *      _bt_getstackbuf() -- Walk back up the tree one step, and find the item
    1534                 :  *                                               we last looked at in the parent.
    1535                 :  *
    1536                 :  *              This is possible because we save the downlink from the parent item,
    1537                 :  *              which is enough to uniquely identify it.  Insertions into the parent
    1538                 :  *              level could cause the item to move right; deletions could cause it
    1539                 :  *              to move left, but not left of the page we previously found it in.
    1540                 :  *
                         :  *              The search starts at stack->bts_blkno / stack->bts_offset and
                         :  *              compares each item against stack->bts_btentry with BTEntrySame.
                         :  *              access is handed to _bt_getbuf for every page visited (the caller
                         :  *              in this file passes BT_WRITE).  Pages that do not contain the item
                         :  *              are released with _bt_relbuf before moving to the right sibling.
                         :  *
    1541                 :  *              Adjusts bts_blkno & bts_offset if changed.
    1542                 :  *
    1543                 :  *              Returns InvalidBuffer if item not found (should not happen).
                         :  *              Otherwise returns the buffer, as obtained from _bt_getbuf, of the
                         :  *              page on which the item was found.
    1544                 :  */
    1545                 : Buffer
    1546                 : _bt_getstackbuf(Relation rel, BTStack stack, int access)
    1547             373 : {
    1548                 :         BlockNumber blkno;
    1549                 :         OffsetNumber start;
    1550                 : 
    1551             373 :         blkno = stack->bts_blkno;
    1552             373 :         start = stack->bts_offset;
    1553                 : 
    1554                 :         for (;;)
    1555                 :         {
    1556                 :                 Buffer          buf;
    1557                 :                 Page            page;
    1558                 :                 BTPageOpaque opaque;
    1559                 : 
    1560             373 :                 buf = _bt_getbuf(rel, blkno, access);
    1561             373 :                 page = BufferGetPage(buf);
    1562             373 :                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1563                 : 
    1564             373 :                 if (!P_IGNORE(opaque))
    1565                 :                 {
    1566                 :                         OffsetNumber offnum,
    1567                 :                                                 minoff,
    1568                 :                                                 maxoff;
    1569                 :                         ItemId          itemid;
    1570                 :                         IndexTuple      item;
    1571                 : 
    1572             373 :                         minoff = P_FIRSTDATAKEY(opaque);
    1573             373 :                         maxoff = PageGetMaxOffsetNumber(page);
    1574                 : 
    1575                 :                         /*
    1576                 :                          * start = InvalidOffsetNumber means "search the whole page". We
    1577                 :                          * need this test anyway due to possibility that page has a high
    1578                 :                          * key now when it didn't before.
    1579                 :                          */
    1580             373 :                         if (start < minoff)
    1581               0 :                                 start = minoff;
    1582                 : 
    1583                 :                         /*
    1584                 :                          * Need this check too, to guard against possibility that page
    1585                 :                          * split since we visited it originally.
                         :                          * Clamping to maxoff + 1 makes the rightward loop below
                         :                          * empty, so only the leftward scan runs.
    1586                 :                          */
    1587             373 :                         if (start > maxoff)
    1588               0 :                                 start = OffsetNumberNext(maxoff);
    1589                 : 
    1590                 :                         /*
    1591                 :                          * These loops will check every item on the page --- but in an
    1592                 :                          * order that's attuned to the probability of where it actually
    1593                 :                          * is.  Scan to the right first, then to the left.
    1594                 :                          */
    1595             373 :                         for (offnum = start;
    1596             746 :                                  offnum <= maxoff;
    1597               0 :                                  offnum = OffsetNumberNext(offnum))
    1598                 :                         {
    1599             373 :                                 itemid = PageGetItemId(page, offnum);
    1600             373 :                                 item = (IndexTuple) PageGetItem(page, itemid);
    1601             373 :                                 if (BTEntrySame(item, &stack->bts_btentry))
    1602                 :                                 {
    1603                 :                                         /* Return accurate pointer to where link is now */
    1604             373 :                                         stack->bts_blkno = blkno;
    1605             373 :                                         stack->bts_offset = offnum;
    1606             373 :                                         return buf;
    1607                 :                                 }
    1608                 :                         }
    1609                 : 
    1610               0 :                         for (offnum = OffsetNumberPrev(start);
    1611               0 :                                  offnum >= minoff;
    1612               0 :                                  offnum = OffsetNumberPrev(offnum))
    1613                 :                         {
    1614               0 :                                 itemid = PageGetItemId(page, offnum);
    1615               0 :                                 item = (IndexTuple) PageGetItem(page, itemid);
    1616               0 :                                 if (BTEntrySame(item, &stack->bts_btentry))
    1617                 :                                 {
    1618                 :                                         /* Return accurate pointer to where link is now */
    1619               0 :                                         stack->bts_blkno = blkno;
    1620               0 :                                         stack->bts_offset = offnum;
    1621               0 :                                         return buf;
    1622                 :                                 }
    1623                 :                         }
    1624                 :                 }
    1625                 : 
    1626                 :                 /*
    1627                 :                  * The item we're looking for moved right at least one page.
                         :                  * (We also get here without scanning when P_IGNORE is true for
                         :                  * this page.)  If there is no right sibling, release the page and
                         :                  * report failure; otherwise step right and search the whole of
                         :                  * the next page.
    1628                 :                  */
    1629               0 :                 if (P_RIGHTMOST(opaque))
    1630                 :                 {
    1631               0 :                         _bt_relbuf(rel, buf);
    1632               0 :                         return InvalidBuffer;
    1633                 :                 }
    1634               0 :                 blkno = opaque->btpo_next;
    1635               0 :                 start = InvalidOffsetNumber;
    1636               0 :                 _bt_relbuf(rel, buf);
    1637               0 :         }
    1638                 : }
    1639                 : 
    1640                 : /*
    1641                 :  *      _bt_newroot() -- Create a new root page for the index.
    1642                 :  *
    1643                 :  *              We've just split the old root page and need to create a new one.
    1644                 :  *              In order to do this, we add a new root page to the file, then lock
    1645                 :  *              the metadata page and update it.  This is guaranteed to be deadlock-
    1646                 :  *              free, because all readers release their locks on the metadata page
    1647                 :  *              before trying to lock the root, and all writers lock the root before
    1648                 :  *              trying to lock the metadata page.  We have a write lock on the old
    1649                 :  *              root page, so we have not introduced any cycles into the waits-for
    1650                 :  *              graph.
    1651                 :  *
    1652                 :  *              On entry, lbuf (the old root) and rbuf (its new peer) are write-
    1653                 :  *              locked. On exit, a new root page exists with entries for the
    1654                 :  *              two new children, metapage is updated and unlocked/unpinned.
    1655                 :  *              The new root buffer is returned to caller which has to unlock/unpin
    1656                 :  *              lbuf, rbuf & rootbuf.
                         :  *
                         :  *              The new root's level is one more than the old root's level.  A
                         :  *              failure to add either downlink to the new page is reported with
                         :  *              elog(PANIC).
    1657                 :  */
    1658                 : static Buffer
    1659                 : _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
    1660              14 : {
    1661                 :         Buffer          rootbuf;
    1662                 :         Page            lpage,
    1663                 :                                 rootpage;
    1664                 :         BlockNumber lbkno,
    1665                 :                                 rbkno;
    1666                 :         BlockNumber rootblknum;
    1667                 :         BTPageOpaque rootopaque;
    1668                 :         ItemId          itemid;
    1669                 :         IndexTuple      item;
    1670                 :         Size            itemsz;
    1671                 :         IndexTuple      new_item;
    1672                 :         Buffer          metabuf;
    1673                 :         Page            metapg;
    1674                 :         BTMetaPageData *metad;
    1675                 : 
    1676              14 :         lbkno = BufferGetBlockNumber(lbuf);
    1677              14 :         rbkno = BufferGetBlockNumber(rbuf);
    1678              14 :         lpage = BufferGetPage(lbuf);
    1679                 : 
    1680                 :         /* get a new root page */
    1681              14 :         rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    1682              14 :         rootpage = BufferGetPage(rootbuf);
    1683              14 :         rootblknum = BufferGetBlockNumber(rootbuf);
    1684                 : 
    1685                 :         /* acquire lock on the metapage */
    1686              14 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    1687              14 :         metapg = BufferGetPage(metabuf);
    1688              14 :         metad = BTPageGetMeta(metapg);
    1689                 : 
    1690                 :         /*
                         :          * NO EREPORT(ERROR) from here till newroot op is logged.
                         :          *
                         :          * NOTE(review): palloc() and CopyIndexTuple() are called below, inside
                         :          * this critical section.  If either can raise an error on allocation
                         :          * failure, that conflicts with the rule above --- confirm, and consider
                         :          * forming both downlink items before START_CRIT_SECTION().
                         :          */
    1691              14 :         START_CRIT_SECTION();
    1692                 : 
    1693                 :         /* set btree special data */
    1694              14 :         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
    1695              14 :         rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
    1696              14 :         rootopaque->btpo_flags = BTP_ROOT;
    1697              14 :         rootopaque->btpo.level =
    1698                 :                 ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
    1699              14 :         rootopaque->btpo_cycleid = 0;
    1700                 : 
    1701                 :         /* update metapage data: new page becomes both root and fast root */
    1702              14 :         metad->btm_root = rootblknum;
    1703              14 :         metad->btm_level = rootopaque->btpo.level;
    1704              14 :         metad->btm_fastroot = rootblknum;
    1705              14 :         metad->btm_fastlevel = rootopaque->btpo.level;
    1706                 : 
    1707                 :         /*
    1708                 :          * Create downlink item for left page (old root).  Since this will be the
    1709                 :          * first item in a non-leaf page, it implicitly has minus-infinity key
    1710                 :          * value, so we need not store any actual key in it.
    1711                 :          */
    1712              14 :         itemsz = sizeof(IndexTupleData);
    1713              14 :         new_item = (IndexTuple) palloc(itemsz);
    1714              14 :         new_item->t_info = itemsz;
    1715              14 :         ItemPointerSet(&(new_item->t_tid), lbkno, P_HIKEY);
    1716                 : 
    1717                 :         /*
    1718                 :          * Insert the left page pointer into the new root page.  The root page is
    1719                 :          * the rightmost page on its level so there is no "high key" in it; the
    1720                 :          * two items will go into positions P_HIKEY and P_FIRSTKEY.
    1721                 :          *
    1722                 :          * Note: we *must* insert the two items in item-number order, for the
    1723                 :          * benefit of _bt_restore_page().
    1724                 :          */
    1725              14 :         if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
    1726                 :                                         false, false) == InvalidOffsetNumber)
    1727               0 :                 elog(PANIC, "failed to add leftkey to new root page"
    1728                 :                          " while splitting block %u of index \"%s\"",
    1729                 :                          BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    1730              14 :         pfree(new_item);
    1731                 : 
    1732                 :         /*
    1733                 :          * Create downlink item for right page.  The key for it is obtained from
    1734                 :          * the "high key" position in the left page.
                         :          * itemsz is taken from the left page's item id for that high key.
    1735                 :          */
    1736              14 :         itemid = PageGetItemId(lpage, P_HIKEY);
    1737              14 :         itemsz = ItemIdGetLength(itemid);
    1738              14 :         item = (IndexTuple) PageGetItem(lpage, itemid);
    1739              14 :         new_item = CopyIndexTuple(item);
    1740              14 :         ItemPointerSet(&(new_item->t_tid), rbkno, P_HIKEY);
    1741                 : 
    1742                 :         /*
    1743                 :          * insert the right page pointer into the new root page.
    1744                 :          */
    1745              14 :         if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
    1746                 :                                         false, false) == InvalidOffsetNumber)
    1747               0 :                 elog(PANIC, "failed to add rightkey to new root page"
    1748                 :                          " while splitting block %u of index \"%s\"",
    1749                 :                          BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    1750              14 :         pfree(new_item);
    1751                 : 
    1752              14 :         MarkBufferDirty(rootbuf);
    1753              14 :         MarkBufferDirty(metabuf);
    1754                 : 
    1755                 :         /* XLOG stuff: skipped when rel->rd_istemp is set */
    1756              14 :         if (!rel->rd_istemp)
    1757                 :         {
    1758                 :                 xl_btree_newroot xlrec;
    1759                 :                 XLogRecPtr      recptr;
    1760                 :                 XLogRecData rdata[2];
    1761                 : 
    1762              14 :                 xlrec.node = rel->rd_node;
    1763              14 :                 xlrec.rootblk = rootblknum;
    1764              14 :                 xlrec.level = metad->btm_level;
    1765                 : 
    1766              14 :                 rdata[0].data = (char *) &xlrec;
    1767              14 :                 rdata[0].len = SizeOfBtreeNewroot;
    1768              14 :                 rdata[0].buffer = InvalidBuffer;
    1769              14 :                 rdata[0].next = &(rdata[1]);
    1770                 : 
    1771                 :                 /*
    1772                 :                  * Direct access to page is not good but faster - we should implement
    1773                 :                  * some new func in page API.
                         :                  * The second chunk is the new root page's bytes from pd_upper up
                         :                  * to pd_special, read straight from the page header.
    1774                 :                  */
    1775              14 :                 rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper;
    1776              14 :                 rdata[1].len = ((PageHeader) rootpage)->pd_special -
    1777                 :                         ((PageHeader) rootpage)->pd_upper;
    1778              14 :                 rdata[1].buffer = InvalidBuffer;
    1779              14 :                 rdata[1].next = NULL;
    1780                 : 
    1781              14 :                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata);
    1782                 : 
    1783              14 :                 PageSetLSN(rootpage, recptr);
    1784              14 :                 PageSetTLI(rootpage, ThisTimeLineID);
    1785              14 :                 PageSetLSN(metapg, recptr);
    1786              14 :                 PageSetTLI(metapg, ThisTimeLineID);
    1787                 :         }
    1788                 : 
    1789              14 :         END_CRIT_SECTION();
    1790                 : 
    1791                 :         /* send out relcache inval for metapage change */
    1792              14 :         CacheInvalidateRelcache(rel);
    1793                 : 
    1794                 :         /* done with metapage */
    1795              14 :         _bt_relbuf(rel, metabuf);
    1796                 : 
    1797              14 :         return rootbuf;
    1798                 : }
    1799                 : 
    1800                 : /*
    1801                 :  *      _bt_pgaddtup() -- add a tuple to a particular page in the index.
    1802                 :  *
    1803                 :  *              Adds itup (itemsize bytes) to page at offset itup_off by calling
    1804                 :  *              PageAddItem.  Pin/lock status is not affected; the caller must
    1805                 :  *              already hold a write lock and pin on the target buffer, and must
    1806                 :  *              write and release the buffer afterwards.
    1807                 :  *
    1808                 :  *              The main difference from a bare PageAddItem call is that this code
    1809                 :  *              knows that the leftmost index tuple on a non-leaf btree page doesn't
    1810                 :  *              need a key, so it strips such a tuple down to just the tuple header.
    1811                 :  *              CAUTION: this works ONLY if tuples are inserted in order, so that
    1812                 :  *              the given itup_off is the final position of the tuple!  If
    1813                 :  *              PageAddItem fails we PANIC; "where" names the page in that message.
    1814                 :  */
    1815                 : static void
    1816                 : _bt_pgaddtup(Relation rel,
    1817                 :                          Page page,
    1818                 :                          Size itemsize,
    1819                 :                          IndexTuple itup,
    1820                 :                          OffsetNumber itup_off,
    1821                 :                          const char *where)
    1822          156935 : {
    1823          156935 :         BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1824                 :         IndexTupleData trunctuple;
    1825                 : 
    1826          156935 :         if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))    /* first data item of a non-leaf page */
    1827                 :         {
    1828               2 :                 trunctuple = *itup;     /* local copy of the tuple header; caller's tuple is untouched */
    1829               2 :                 trunctuple.t_info = sizeof(IndexTupleData);
    1830               2 :                 itup = &trunctuple;
    1831               2 :                 itemsize = sizeof(IndexTupleData);      /* add the header only; key data is dropped */
    1832                 :         }
    1833                 : 
    1834          156935 :         if (PageAddItem(page, (Item) itup, itemsize, itup_off,
    1835                 :                                         false, false) == InvalidOffsetNumber)
    1836               0 :                 elog(PANIC, "failed to add item to the %s in index \"%s\"",
    1837                 :                          where, RelationGetRelationName(rel));
    1838          156935 : }
    1839                 : 
    1840                 : /*
    1841                 :  * _bt_isequal - used in _bt_doinsert when checking for duplicates.
    1842                 :  *
    1843                 :  * Very similar to _bt_compare, except for NULL handling: a NULL is never equal
    1844                 :  * to anything, not even another NULL.  Returns true iff all keysz keys match.
    1845                 :  */
    1846                 : static bool
    1847                 : _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
    1848                 :                         int keysz, ScanKey scankey)
    1849           20337 : {
    1850                 :         IndexTuple      itup;
    1851                 :         int                     i;
    1852                 : 
    1853                 :         /* Better be comparing to a leaf item */
    1854                 :         Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
    1855                 : 
    1856           20337 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    1857                 : 
    1858           34551 :         for (i = 1; i <= keysz; i++)
    1859                 :         {
    1860                 :                 AttrNumber      attno;
    1861                 :                 Datum           datum;
    1862                 :                 bool            isNull;
    1863                 :                 int32           result;
    1864                 : 
    1865           33248 :                 attno = scankey->sk_attno;
    1866                 :                 Assert(attno == i);     /* scankeys are expected in attribute order, starting at 1 */
    1867           33248 :                 datum = index_getattr(itup, attno, itupdesc, &isNull);
    1868                 : 
    1869                 :                 /* A NULL on either side (tuple attribute or scankey) means "not equal" */
    1870           33248 :                 if (isNull || (scankey->sk_flags & SK_ISNULL))
    1871              13 :                         return false;
    1872                 : 
    1873           33235 :                 result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
    1874                 :                                                                                          datum,
    1875                 :                                                                                          scankey->sk_argument));
    1876                 : 
    1877           33235 :                 if (result != 0)        /* nonzero comparison result: this attribute differs */
    1878           19021 :                         return false;
    1879                 : 
    1880           14214 :                 scankey++;
    1881                 :         }
    1882                 : 
    1883                 :         /* if we get here, all keysz attributes compared equal */
    1884            1303 :         return true;
    1885                 : }
    1886                 : 
    1887                 : /*
    1888                 :  * _bt_vacuum_one_page - vacuum just one index page.
    1889                 :  *
    1890                 :  * Collect the offsets of all LP_DEAD items on the given page and, if there are
    1891                 :  * any, pass them to _bt_delitems.  The passed buffer must be exclusive-locked,
    1892                 :  * but unlike a real VACUUM, no super-exclusive "cleanup" lock is needed (see nbtree/README).
    1893                 :  */
    1894                 : static void
    1895                 : _bt_vacuum_one_page(Relation rel, Buffer buffer)
    1896             106 : {
    1897                 :         OffsetNumber deletable[MaxOffsetNumber];        /* offsets of the dead items found */
    1898             106 :         int                     ndeletable = 0;
    1899                 :         OffsetNumber offnum,
    1900                 :                                 minoff,
    1901                 :                                 maxoff;
    1902             106 :         Page            page = BufferGetPage(buffer);
    1903             106 :         BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1904                 : 
    1905                 :         /*
    1906                 :          * Scan over all items, from the first data key through the last offset on
    1907                 :          * the page, and record those whose line pointer is marked LP_DEAD.
    1908                 :          */
    1909             106 :         minoff = P_FIRSTDATAKEY(opaque);
    1910             106 :         maxoff = PageGetMaxOffsetNumber(page);
    1911             106 :         for (offnum = minoff;
    1912           23832 :                  offnum <= maxoff;
    1913           23620 :                  offnum = OffsetNumberNext(offnum))
    1914                 :         {
    1915           23620 :                 ItemId          itemId = PageGetItemId(page, offnum);
    1916                 : 
    1917           23620 :                 if (ItemIdIsDead(itemId))
    1918             796 :                         deletable[ndeletable++] = offnum;
    1919                 :         }
    1920                 : 
    1921             106 :         if (ndeletable > 0)     /* skip the call entirely when nothing was found */
    1922             106 :                 _bt_delitems(rel, buffer, deletable, ndeletable);
    1923                 : 
    1924                 :         /*
    1925                 :          * Note: if we didn't find any LP_DEAD items, then the page's
    1926                 :          * BTP_HAS_GARBAGE hint bit is falsely set.  We do not bother expending a
    1927                 :          * separate write to clear it, however.  We will clear it when we split
    1928                 :          * the page.
    1929                 :          */
    1930             106 : }

Generated by: LTP GCOV extension version 1.5