LTP GCOV extension - code coverage report
Current view: directory - access/nbtree - nbtsearch.c
Test: unnamed
Date: 2008-07-03 | Instrumented lines: 371
Code covered: 81.9 % | Executed lines: 304
Legend: the second column is the hit count; 0 = not executed, nonzero = executed, blank = not instrumented

       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtsearch.c
       4                 :  *        Search code for postgres btrees.
       5                 :  *
       6                 :  *
       7                 :  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
       8                 :  * Portions Copyright (c) 1994, Regents of the University of California
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *        $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.115 2007/12/31 04:52:05 tgl Exp $
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : 
      16                 : #include "postgres.h"
      17                 : 
      18                 : #include "access/genam.h"
      19                 : #include "access/nbtree.h"
      20                 : #include "pgstat.h"
      21                 : #include "utils/lsyscache.h"
      22                 : 
      23                 : 
      24                 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
      25                 :                          OffsetNumber offnum);
      26                 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      27                 : static Buffer _bt_walk_left(Relation rel, Buffer buf);
      28                 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      29                 : 
      30                 : 
      31                 : /*
      32                 :  *      _bt_search() -- Search the tree for a particular scankey,
      33                 :  *              or more precisely for the first leaf page it could be on.
      34                 :  *
      35                 :  * The passed scankey must be an insertion-type scankey (see nbtree/README),
      36                 :  * but it can omit the rightmost column(s) of the index.
      37                 :  *
      38                 :  * When nextkey is false (the usual case), we are looking for the first
      39                 :  * item >= scankey.  When nextkey is true, we are looking for the first
      40                 :  * item strictly greater than scankey.
      41                 :  *
      42                 :  * Return value is a stack of parent-page pointers.  *bufP is set to the
      43                 :  * address of the leaf-page buffer, which is read-locked and pinned.
      44                 :  * No locks are held on the parent pages, however!
      45                 :  *
      46                 :  * NOTE that the returned buffer is read-locked regardless of the access
      47                 :  * parameter.  However, access = BT_WRITE will allow an empty root page
      48                 :  * to be created and returned.  When access = BT_READ, an empty index
      49                 :  * will result in *bufP being set to InvalidBuffer.
      50                 :  */
      51                 : BTStack
      52                 : _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
      53                 :                    Buffer *bufP, int access)
      54          233514 : {
      55          233514 :         BTStack         stack_in = NULL;        /* parent stack built so far; stays NULL if the root is a leaf */
      56                 : 
      57                 :         /* Get the root page to start with */
      58          233514 :         *bufP = _bt_getroot(rel, access);
      59                 : 
      60                 :         /* If index is empty and access = BT_READ, no root page is created. */
      61          233514 :         if (!BufferIsValid(*bufP))
      62             248 :                 return (BTStack) NULL;
      63                 : 
      64                 :         /* Loop iterates once per level descended in the tree */
      65                 :         for (;;)
      66                 :         {
      67                 :                 Page            page;
      68                 :                 BTPageOpaque opaque;
      69                 :                 OffsetNumber offnum;
      70                 :                 ItemId          itemid;
      71                 :                 IndexTuple      itup;
      72                 :                 BlockNumber blkno;
      73                 :                 BlockNumber par_blkno;
      74                 :                 BTStack         new_stack;
      75                 : 
      76                 :                 /*
      77                 :                  * Race -- the page we just grabbed may have split since we read its
      78                 :                  * pointer in the parent (or metapage).  If it has, we may need to
      79                 :                  * move right to its new sibling.  Do that.
      80                 :                  */
      81          405162 :                 *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
      82                 : 
      83                 :                 /* if this is a leaf page, we're done */
      84          405162 :                 page = BufferGetPage(*bufP);
      85          405162 :                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
      86          405162 :                 if (P_ISLEAF(opaque))
      87          233266 :                         break;
      88                 : 
      89                 :                 /*
      90                 :                  * Find the appropriate item on the internal page, and get the child
      91                 :                  * page that it points to.
      92                 :                  */
      93          171896 :                 offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
      94          171896 :                 itemid = PageGetItemId(page, offnum);
      95          171896 :                 itup = (IndexTuple) PageGetItem(page, itemid);
      96          171896 :                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
      97          171896 :                 par_blkno = BufferGetBlockNumber(*bufP);
      98                 : 
      99                 :                 /*
     100                 :                  * We need to save the location of the index entry we chose in the
     101                 :                  * parent page on a stack. In case we split the tree, we'll use the
     102                 :                  * stack to work back up to the parent page.  We also save the actual
     103                 :                  * downlink (TID) to uniquely identify the index entry, in case it
     104                 :                  * moves right while we're working lower in the tree.  See the paper
     105                 :                  * by Lehman and Yao for how this is detected and handled. (We use the
     106                 :                  * child link to disambiguate duplicate keys in the index -- Lehman
     107                 :                  * and Yao disallow duplicate keys.)
     108                 :                  */
     109          171896 :                 new_stack = (BTStack) palloc(sizeof(BTStackData));
     110          171896 :                 new_stack->bts_blkno = par_blkno;
     111          171896 :                 new_stack->bts_offset = offnum;
     112          171896 :                 memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));  /* fixed-size tuple header only; it carries the downlink TID */
     113          171896 :                 new_stack->bts_parent = stack_in;
     114                 : 
     115                 :                 /* drop the read lock on the parent page, acquire one on the child */
     116          171896 :                 *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
     117                 : 
     118                 :                 /* okay, all set to move down a level */
     119          171896 :                 stack_in = new_stack;
     120          171896 :         }
     121                 : 
     122          233266 :         return stack_in;        /* most recently visited parent is on top */
     123                 : }
     124                 : 
     125                 : /*
     126                 :  *      _bt_moveright() -- move right in the btree if necessary.
     127                 :  *
     128                 :  * When we follow a pointer to reach a page, it is possible that
     129                 :  * the page has changed in the meanwhile.  If this happens, we're
     130                 :  * guaranteed that the page has "split right" -- that is, that any
     131                 :  * data that appeared on the page originally is either on the page
     132                 :  * or strictly to the right of it.
     133                 :  *
     134                 :  * This routine decides whether or not we need to move right in the
     135                 :  * tree by examining the high key entry on the page.  If that entry
     136                 :  * is strictly less than the scankey, or <= the scankey in the nextkey=true
     137                 :  * case, then we followed the wrong link and we need to move right.
     138                 :  *
     139                 :  * The passed scankey must be an insertion-type scankey (see nbtree/README),
     140                 :  * but it can omit the rightmost column(s) of the index.
     141                 :  *
     142                 :  * When nextkey is false (the usual case), we are looking for the first
     143                 :  * item >= scankey.  When nextkey is true, we are looking for the first
     144                 :  * item strictly greater than scankey.
     145                 :  *
     146                 :  * On entry, we have the buffer pinned and a lock of the type specified by
     147                 :  * 'access'.  If we move right, we release the buffer and lock and acquire
     148                 :  * the same on the right sibling.  Return value is the buffer we stop at.
     149                 :  */
     150                 : Buffer
     151                 : _bt_moveright(Relation rel,
     152                 :                           Buffer buf,
     153                 :                           int keysz,
     154                 :                           ScanKey scankey,
     155                 :                           bool nextkey,
     156                 :                           int access)
     157          475271 : {
     158                 :         Page            page;
     159                 :         BTPageOpaque opaque;
     160                 :         int32           cmpval;
     161                 : 
     162          475271 :         page = BufferGetPage(buf);
     163          475271 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     164                 : 
     165                 :         /*
     166                 :          * When nextkey = false (normal case): if the scan key that brought us to
     167                 :          * this page is > the high key stored on the page, then the page has split
     168                 :          * and we need to move right.  (If the scan key is equal to the high key,
     169                 :          * we might or might not need to move right; have to scan the page first
     170                 :          * anyway.)
     171                 :          *
     172                 :          * When nextkey = true: move right if the scan key is >= page's high key.
     173                 :          *
     174                 :          * The page could even have split more than once, so scan as far as
     175                 :          * needed.
     176                 :          *
     177                 :          * We also have to move right if we followed a link that brought us to a
     178                 :          * dead page.
     179                 :          */
     180          475271 :         cmpval = nextkey ? 0 : 1;       /* smallest _bt_compare() result that forces a move right */
     181                 : 
     182          950545 :         while (!P_RIGHTMOST(opaque) &&
     183                 :                    (P_IGNORE(opaque) ||
     184                 :                         _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
     185                 :         {
     186                 :                 /* step right one page */
     187               3 :                 BlockNumber rblkno = opaque->btpo_next;
     188                 : 
     189               3 :                 buf = _bt_relandgetbuf(rel, buf, rblkno, access);
     190               3 :                 page = BufferGetPage(buf);
     191               3 :                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     192                 :         }
     193                 : 
     194          475271 :         if (P_IGNORE(opaque))   /* the loop can stop on such a page only if it is the rightmost one */
     195               0 :                 elog(ERROR, "fell off the end of index \"%s\"",
     196                 :                          RelationGetRelationName(rel));
     197                 : 
     198          475271 :         return buf;
     199                 : }
     200                 : 
     201                 : /*
     202                 :  *      _bt_binsrch() -- Do a binary search for a key on a particular page.
     203                 :  *
     204                 :  * The passed scankey must be an insertion-type scankey (see nbtree/README),
     205                 :  * but it can omit the rightmost column(s) of the index.
     206                 :  *
     207                 :  * When nextkey is false (the usual case), we are looking for the first
     208                 :  * item >= scankey.  When nextkey is true, we are looking for the first
     209                 :  * item strictly greater than scankey.
     210                 :  *
     211                 :  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
     212                 :  * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
     213                 :  * particular, this means it is possible to return a value 1 greater than the
     214                 :  * number of keys on the page, if the scankey is > all keys on the page.)
     215                 :  *
     216                 :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     217                 :  * of the last key < given scankey, or last key <= given scankey if nextkey
     218                 :  * is true.  (Since _bt_compare treats the first data key of such a page as
     219                 :  * minus infinity, there will be at least one key < scankey, so the result
     220                 :  * always points at one of the keys on the page.)  This key indicates the
     221                 :  * right place to descend to be sure we find all leaf keys >= given scankey
     222                 :  * (or leaf keys > given scankey when nextkey is true).
     223                 :  *
     224                 :  * This procedure is not responsible for walking right, it just examines
     225                 :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     226                 :  * on the buffer.
     227                 :  */
     228                 : OffsetNumber
     229                 : _bt_binsrch(Relation rel,
     230                 :                         Buffer buf,
     231                 :                         int keysz,
     232                 :                         ScanKey scankey,
     233                 :                         bool nextkey)
     234          404079 : {
     235                 :         Page            page;
     236                 :         BTPageOpaque opaque;
     237                 :         OffsetNumber low,
     238                 :                                 high;
     239                 :         int32           result,
     240                 :                                 cmpval;
     241                 : 
     242          404079 :         page = BufferGetPage(buf);
     243          404079 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     244                 : 
     245          404079 :         low = P_FIRSTDATAKEY(opaque);
     246          404079 :         high = PageGetMaxOffsetNumber(page);
     247                 : 
     248                 :         /*
     249                 :          * If there are no keys on the page, return the first available slot. Note
     250                 :          * this covers two cases: the page is really empty (no keys), or it
     251                 :          * contains only a high key.  The latter case is possible after vacuuming.
     252                 :          * This can never happen on an internal page, however, since they are
     253                 :          * never empty (an internal page must have children).
     254                 :          */
     255          404079 :         if (high < low)
     256             137 :                 return low;
     257                 : 
     258                 :         /*
     259                 :          * Binary search to find the first key on the page >= scan key, or first
     260                 :          * key > scankey when nextkey is true.
     261                 :          *
     262                 :          * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     263                 :          * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     264                 :          *
     265                 :          * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     266                 :          * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     267                 :          *
     268                 :          * We can fall out when high == low.
     269                 :          */
     270          403942 :         high++;                                         /* establish the loop invariant for high */
     271                 : 
     272          403942 :         cmpval = nextkey ? 0 : 1;       /* select comparison value */
     273                 : 
     274         3190539 :         while (high > low)
     275                 :         {
     276         2382655 :                 OffsetNumber mid = low + ((high - low) / 2);
     277                 : 
     278                 :                 /* We have low <= mid < high, so mid points at a real slot */
     279                 : 
     280         2382655 :                 result = _bt_compare(rel, keysz, scankey, page, mid);
     281                 : 
     282         2382655 :                 if (result >= cmpval)
     283         1451785 :                         low = mid + 1;          /* slot mid lies before the target position */
     284                 :                 else
     285          930870 :                         high = mid;                     /* slot mid lies at or after the target position */
     286                 :         }
     287                 : 
     288                 :         /*
     289                 :          * At this point we have high == low, but be careful: they could point
     290                 :          * past the last slot on the page.
     291                 :          *
     292                 :          * On a leaf page, we always return the first key >= scan key (resp. >
     293                 :          * scan key), which could be the last slot + 1.
     294                 :          */
     295          403942 :         if (P_ISLEAF(opaque))
     296          232046 :                 return low;
     297                 : 
     298                 :         /*
     299                 :          * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     300                 :          * There must be one if _bt_compare() is playing by the rules.
     301                 :          */
     302                 :         Assert(low > P_FIRSTDATAKEY(opaque));
     303                 : 
     304          171896 :         return OffsetNumberPrev(low);
     305                 : }
     306                 : 
     307                 : /*----------
     308                 :  *      _bt_compare() -- Compare scankey to a particular tuple on the page.
     309                 :  *
     310                 :  * The passed scankey must be an insertion-type scankey (see nbtree/README),
     311                 :  * but it can omit the rightmost column(s) of the index.
     312                 :  *
     313                 :  *      keysz: number of key conditions to be checked (might be less than the
     314                 :  *              number of index columns!)
     315                 :  *      page/offnum: location of btree item to be compared to.
     316                 :  *
     317                 :  *              This routine returns:
     318                 :  *                      <0 if scankey < tuple at offnum;
     319                 :  *                       0 if scankey == tuple at offnum;
     320                 :  *                      >0 if scankey > tuple at offnum.
     321                 :  *              NULLs in the keys are treated as sortable values.  Therefore
     322                 :  *              "equality" does not necessarily mean that the item should be
     323                 :  *              returned to the caller as a matching key!
     324                 :  *
     325                 :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     326                 :  * "minus infinity": this routine will always claim it is less than the
     327                 :  * scankey.  The actual key value stored (if any, which there probably isn't)
     328                 :  * does not matter.  This convention allows us to implement the Lehman and
     329                 :  * Yao convention that the first down-link pointer is before the first key.
     330                 :  * See backend/access/nbtree/README for details.
     331                 :  *----------
     332                 :  */
     333                 : int32
     334                 : _bt_compare(Relation rel,
     335                 :                         int keysz,
     336                 :                         ScanKey scankey,
     337                 :                         Page page,
     338                 :                         OffsetNumber offnum)
     339         2515955 : {
     340         2515955 :         TupleDesc       itupdesc = RelationGetDescr(rel);
     341         2515955 :         BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     342                 :         IndexTuple      itup;
     343                 :         int                     i;
     344                 : 
     345                 :         /*
     346                 :          * Force result ">" if target item is first data item on an internal page
     347                 :          * --- see NOTE above.
     348                 :          */
     349         2515955 :         if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     350           24968 :                 return 1;               /* scankey > "minus infinity" item */
     351                 : 
     352         2490987 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     353                 : 
     354                 :         /*
     355                 :          * The scan key is set up with the attribute number associated with each
     356                 :          * term in the key.  It is important that, if the index is multi-key, the
     357                 :          * scan contain the first k key attributes, and that they be in order.  If
     358                 :          * you think about how multi-key ordering works, you'll understand why
     359                 :          * this is.
     360                 :          *
     361                 :          * We don't test for violation of this condition here, however.  The
     362                 :          * initial setup for the index scan had better have gotten it right (see
     363                 :          * _bt_first).
     364                 :          */
     365                 : 
     366         3386047 :         for (i = 1; i <= keysz; i++)    /* i only counts iterations; scankey is advanced at loop bottom */
     367                 :         {
     368                 :                 Datum           datum;
     369                 :                 bool            isNull;
     370                 :                 int32           result;
     371                 : 
     372         3162600 :                 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     373                 : 
     374                 :                 /* see comments about NULL handling in btbuild */
     375         3162600 :                 if (scankey->sk_flags & SK_ISNULL)               /* key is NULL */
     376                 :                 {
     377              62 :                         if (isNull)
     378              14 :                                 result = 0;             /* NULL "=" NULL */
     379              48 :                         else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     380              20 :                                 result = -1;    /* NULL "<" NOT_NULL */
     381                 :                         else
     382              28 :                                 result = 1;             /* NULL ">" NOT_NULL */
     383                 :                 }
     384         3162538 :                 else if (isNull)                /* key is NOT_NULL and item is NULL */
     385                 :                 {
     386              18 :                         if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     387               0 :                                 result = 1;             /* NOT_NULL ">" NULL */
     388                 :                         else
     389              18 :                                 result = -1;    /* NOT_NULL "<" NULL */
     390                 :                 }
     391                 :                 else
     392                 :                 {
     393                 :                         /*
     394                 :                          * The sk_func needs to be passed the index value as left arg and
     395                 :                          * the sk_argument as right arg (they might be of different
     396                 :                          * types).  Since it is convenient for callers to think of
     397                 :                          * _bt_compare as comparing the scankey to the index item, we have
     398                 :                          * to flip the sign of the comparison result.  (Unless it's a DESC
     399                 :                          * column, in which case we *don't* flip the sign.)
     400                 :                          */
     401         3162520 :                         result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
     402                 :                                                                                                  datum,
     403                 :                                                                                                  scankey->sk_argument));
     404                 : 
     405         3162520 :                         if (!(scankey->sk_flags & SK_BT_DESC))
     406         3162520 :                                 result = -result;
     407                 :                 }
     408                 : 
     409                 :                 /* if the keys are unequal, return the difference */
     410         3162600 :                 if (result != 0)
     411         2267540 :                         return result;
     412                 : 
     413          895060 :                 scankey++;              /* this column compared equal; go on to the next key */
     414                 :         }
     415                 : 
     416                 :         /* if we get here, the keys are equal */
     417          223447 :         return 0;
     418                 : }
     419                 : 
     420                 : /*
     421                 :  *      _bt_first() -- Find the first item in a scan.
     422                 :  *
     423                 :  *              We need to be clever about the direction of scan, the search
     424                 :  *              conditions, and the tree ordering.  We find the first item (or,
     425                 :  *              if backwards scan, the last item) in the tree that satisfies the
     426                 :  *              qualifications in the scan key.  On success exit, the page containing
     427                 :  *              the current index tuple is pinned but not locked, and data about
     428                 :  *              the matching tuple(s) on the page has been loaded into so->currPos,
     429                 :  *              and scan->xs_ctup.t_self is set to the heap TID of the current tuple.
     430                 :  *
     431                 :  * If there are no matching items in the index, we return FALSE, with no
     432                 :  * pins or locks held.
     433                 :  *
     434                 :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     435                 :  * are both search-type scankeys (see nbtree/README for more about this).
     436                 :  * Within this routine, we build a temporary insertion-type scankey to use
     437                 :  * in locating the scan start position.
     438                 :  */
     439                 : bool
     440                 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     441          163467 : {
     442          163467 :         Relation        rel = scan->indexRelation;
     443          163467 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
     444                 :         Buffer          buf;            /* leaf buffer handed back by _bt_search */
     445                 :         BTStack         stack;          /* descent stack from _bt_search; freed at once */
     446                 :         OffsetNumber offnum;            /* starting offset found by _bt_binsrch */
     447                 :         StrategyNumber strat;           /* strategy of the boundary key just accepted */
     448                 :         bool            nextkey;        /* true: locate first item > key, else >= key */
     449                 :         bool            goback;         /* true: step back one item after locating */
     450                 :         ScanKey         startKeys[INDEX_MAX_KEYS];      /* chosen boundary key per column */
     451                 :         ScanKeyData scankeys[INDEX_MAX_KEYS];   /* insertion scankey built from startKeys[] */
     452          163467 :         int                     keysCount = 0;  /* entries in use in startKeys[]/scankeys[] */
     453                 :         int                     i;
     454                 :         StrategyNumber strat_total;     /* overall initial-positioning strategy */
     455                 : 
     456          163467 :         pgstat_count_index_scan(rel);
     457                 : 
     458                 :         /*
     459                 :          * Examine the scan keys and eliminate any redundant keys; also mark the
     460                 :          * keys that must be matched to continue the scan.
     461                 :          */
     462          163467 :         _bt_preprocess_keys(scan);
     463                 : 
     464                 :         /*
     465                 :          * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     466                 :          * never be satisfied (eg, x == 1 AND x > 2).
     467                 :          */
     468          163467 :         if (!so->qual_ok)
     469               3 :                 return false;
     470                 : 
     471                 :         /*----------
     472                 :          * Examine the scan keys to discover where we need to start the scan.
     473                 :          *
     474                 :          * We want to identify the keys that can be used as starting boundaries;
     475                 :          * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     476                 :          * a backwards scan.  We can use keys for multiple attributes so long as
     477                 :          * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     478                 :          * a > or < boundary or find an attribute with no boundary (which can be
     479                 :          * thought of as the same as "> -infinity"), we can't use keys for any
     480                 :          * attributes to its right, because it would break our simplistic notion
     481                 :          * of what initial positioning strategy to use.
     482                 :          *
     483                 :          * When the scan keys include cross-type operators, _bt_preprocess_keys
     484                 :          * may not be able to eliminate redundant keys; in such cases we will
     485                 :          * arbitrarily pick a usable one for each attribute.  This is correct
     486                 :          * but possibly not optimal behavior.  (For example, with keys like
     487                 :          * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     488                 :          * x=5 would be more efficient.)  Since the situation only arises given
     489                 :          * a poorly-worded query plus an incomplete opfamily, live with it.
     490                 :          *
     491                 :          * When both equality and inequality keys appear for a single attribute
     492                 :          * (again, only possible when cross-type operators appear), we *must*
     493                 :          * select one of the equality keys for the starting point, because
     494                 :          * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     495                 :          * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     496                 :          * start at x=4, we will fail and stop before reaching x=10.  If multiple
     497                 :          * equality quals survive preprocessing, however, it doesn't matter which
     498                 :          * one we use --- by definition, they are either redundant or
     499                 :          * contradictory.
     500                 :          *
     501                 :          * In this loop, row-comparison keys are treated the same as keys on their
     502                 :          * first (leftmost) columns.  We'll add on lower-order columns of the row
     503                 :          * comparison below, if possible.
     504                 :          *
     505                 :          * The selected scan keys (at most one per index column) are remembered by
     506                 :          * storing their addresses into the local startKeys[] array.
     507                 :          *----------
     508                 :          */
     509          163464 :         strat_total = BTEqualStrategyNumber;
     510          163464 :         if (so->numberOfKeys > 0)
     511                 :         {
     512                 :                 AttrNumber      curattr;
     513                 :                 ScanKey         chosen;
     514                 :                 ScanKey         cur;
     515                 : 
     516                 :                 /*
     517                 :                  * chosen is the so-far-chosen key for the current attribute, if any.
     518                 :                  * We don't cast the decision in stone until we reach keys for the
     519                 :                  * next attribute.
     520                 :                  */
     521          163425 :                 curattr = 1;
     522          163425 :                 chosen = NULL;
     523                 : 
     524                 :                 /*
     525                 :                  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
     526                 :                  * pass to handle after-last-key processing.  Actual exit from the
     527                 :                  * loop is at one of the "break" statements below.
     528                 :                  */
     529          404424 :                 for (cur = so->keyData, i = 0;; cur++, i++)
     530                 :                 {
     531          404424 :                         if (i >= so->numberOfKeys || cur->sk_attno != curattr)
     532                 :                         {
     533                 :                                 /*
     534                 :                                  * Done looking at keys for curattr.  If we didn't find a
     535                 :                                  * usable boundary key, quit; else save the boundary key
     536                 :                                  * pointer in startKeys.
     537                 :                                  */
     538          240990 :                                 if (chosen == NULL)
     539              22 :                                         break;
     540          240968 :                                 startKeys[keysCount++] = chosen;
     541                 : 
     542                 :                                 /*
     543                 :                                  * Adjust strat_total, and quit if we have stored a > or <
     544                 :                                  * key.
     545                 :                                  */
     546          240968 :                                 strat = chosen->sk_strategy;
     547          240968 :                                 if (strat != BTEqualStrategyNumber)
     548                 :                                 {
     549           10181 :                                         strat_total = strat;
     550           10181 :                                         if (strat == BTGreaterStrategyNumber ||
     551                 :                                                 strat == BTLessStrategyNumber)
     552            9821 :                                                 break;
     553                 :                                 }
     554                 : 
     555                 :                                 /*
     556                 :                                  * Done if that was the last attribute, or if next key is not
     557                 :                                  * in sequence (implying no boundary key is available for the
     558                 :                                  * next attribute).
     559                 :                                  */
     560          231147 :                                 if (i >= so->numberOfKeys ||
     561                 :                                         cur->sk_attno != curattr + 1)
     562                 :                                         break;
     563                 : 
     564                 :                                 /*
     565                 :                                  * Reset for next attr.
     566                 :                                  */
     567           77565 :                                 curattr = cur->sk_attno;
     568           77565 :                                 chosen = NULL;
     569                 :                         }
     570                 : 
     571                 :                         /* Can we use this key as a starting boundary for this attr? */
     572          240999 :                         switch (cur->sk_strategy)
     573                 :                         {
     574                 :                                 case BTLessStrategyNumber:
     575                 :                                 case BTLessEqualStrategyNumber:
     576              34 :                                         if (chosen == NULL && ScanDirectionIsBackward(dir))
     577               5 :                                                 chosen = cur;
     578                 :                                         break;
     579                 :                                 case BTEqualStrategyNumber:
     580                 :                                         /* override any non-equality choice */
     581          230787 :                                         chosen = cur;
     582          230787 :                                         break;
     583                 :                                 case BTGreaterEqualStrategyNumber:
     584                 :                                 case BTGreaterStrategyNumber:
     585           10178 :                                         if (chosen == NULL && ScanDirectionIsForward(dir))
     586           10176 :                                                 chosen = cur;
     587                 :                                         break;
     588                 :                         }
     589          240999 :                 }
     590                 :         }
     591                 : 
     592                 :         /*
     593                 :          * If we found no usable boundary keys, we have to start from one end of
     594                 :          * the tree.  Walk down that edge to the first or last key, and scan from
     595                 :          * there.
     596                 :          */
     597          163464 :         if (keysCount == 0)
     598              61 :                 return _bt_endpoint(scan, dir);
     599                 : 
     600                 :         /*
     601                 :          * We want to start the scan somewhere within the index.  Set up an
     602                 :          * insertion scankey we can use to search for the boundary point we
     603                 :          * identified above.  The insertion scankey is built in the local
     604                 :          * scankeys[] array, using the keys identified by startKeys[].
     605                 :          */
     606                 :         Assert(keysCount <= INDEX_MAX_KEYS);
     607          404370 :         for (i = 0; i < keysCount; i++)
     608                 :         {
     609          240968 :                 ScanKey         cur = startKeys[i];
     610                 : 
     611                 :                 Assert(cur->sk_attno == i + 1);
     612                 : 
     613          240968 :                 if (cur->sk_flags & SK_ROW_HEADER)
     614                 :                 {
     615                 :                         /*
     616                 :                          * Row comparison header: look to the first row member instead.
     617                 :                          *
     618                 :                          * The member scankeys are already in insertion format (ie, they
     619                 :                          * have sk_func = 3-way-comparison function), but we have to watch
     620                 :                          * out for nulls, which _bt_preprocess_keys didn't check. A null
     621                 :                          * in the first row member makes the condition unmatchable, just
     622                 :                          * like qual_ok = false.
     623                 :                          */
     624               1 :                         ScanKey         subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
     625                 : 
     626                 :                         Assert(subkey->sk_flags & SK_ROW_MEMBER);
     627               1 :                         if (subkey->sk_flags & SK_ISNULL)
     628               0 :                                 return false;
     629               1 :                         memcpy(scankeys + i, subkey, sizeof(ScanKeyData));      /* usable as-is */
     630                 : 
     631                 :                         /*
     632                 :                          * If the row comparison is the last positioning key we accepted,
     633                 :                          * try to add additional keys from the lower-order row members.
     634                 :                          * (If we accepted independent conditions on additional index
     635                 :                          * columns, we use those instead --- doesn't seem worth trying to
     636                 :                          * determine which is more restrictive.)  Note that this is OK
     637                 :                          * even if the row comparison is of ">" or "<" type, because the
     638                 :                          * condition applied to all but the last row member is effectively
     639                 :                          * ">=" or "<=", and so the extra keys don't break the positioning
     640                 :                          * scheme.  But, by the same token, if we aren't able to use all
     641                 :                          * the row members, then the part of the row comparison that we
     642                 :                          * did use has to be treated as just a ">=" or "<=" condition, and
     643                 :                          * so we'd better adjust strat_total accordingly.
     644                 :                          */
     645               1 :                         if (i == keysCount - 1)
     646                 :                         {
     647               1 :                                 bool            used_all_subkeys = false;
     648                 : 
     649                 :                                 Assert(!(subkey->sk_flags & SK_ROW_END));
     650                 :                                 for (;;)
     651                 :                                 {
     652               1 :                                         subkey++;       /* advance to the next row member */
     653                 :                                         Assert(subkey->sk_flags & SK_ROW_MEMBER);
     654               1 :                                         if (subkey->sk_attno != keysCount + 1)
     655               0 :                                                 break;  /* out-of-sequence, can't use it */
     656               1 :                                         if (subkey->sk_strategy != cur->sk_strategy)
     657               0 :                                                 break;  /* wrong direction, can't use it */
     658               1 :                                         if (subkey->sk_flags & SK_ISNULL)
     659               0 :                                                 break;  /* can't use null keys */
     660                 :                                         Assert(keysCount < INDEX_MAX_KEYS);
     661               1 :                                         memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
     662               1 :                                         keysCount++;
     663               1 :                                         if (subkey->sk_flags & SK_ROW_END)
     664                 :                                         {
     665               1 :                                                 used_all_subkeys = true;
     666               1 :                                                 break;
     667                 :                                         }
     668                 :                                 }
     669               1 :                                 if (!used_all_subkeys)
     670                 :                                 {
     671               0 :                                         switch (strat_total)
     672                 :                                         {
     673                 :                                                 case BTLessStrategyNumber:
     674               0 :                                                         strat_total = BTLessEqualStrategyNumber;
     675               0 :                                                         break;
     676                 :                                                 case BTGreaterStrategyNumber:
     677               0 :                                                         strat_total = BTGreaterEqualStrategyNumber;
     678                 :                                                         break;
     679                 :                                         }
     680                 :                                 }
     681                 :                                 break;                  /* done with outer loop */
     682                 :                         }
     683                 :                 }
     684                 :                 else
     685                 :                 {
     686                 :                         /*
     687                 :                          * Ordinary comparison key.  Transform the search-style scan key
     688                 :                          * to an insertion scan key by replacing the sk_func with the
     689                 :                          * appropriate btree comparison function.
     690                 :                          *
     691                 :                          * If scankey operator is not a cross-type comparison, we can use
     692                 :                          * the cached comparison function; otherwise we must look it up in
     693                 :                          * the catalogs.  (That can't lead to infinite recursion, since no
     694                 :                          * indexscan initiated by syscache lookup will use cross-data-type
     695                 :                          * operators.)
     696                 :                          *
     697                 :                          * We support the convention that sk_subtype == InvalidOid means
     698                 :                          * the opclass input type; this is a hack to simplify life for
     699                 :                          * ScanKeyInit().
     700                 :                          */
     701          481590 :                         if (cur->sk_subtype == rel->rd_opcintype[i] ||
     702                 :                                 cur->sk_subtype == InvalidOid)
     703                 :                         {
     704                 :                                 FmgrInfo   *procinfo;
     705                 : 
     706          240623 :                                 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
     707          240623 :                                 ScanKeyEntryInitializeWithInfo(scankeys + i,
     708                 :                                                                                            cur->sk_flags,
     709                 :                                                                                            cur->sk_attno,
     710                 :                                                                                            InvalidStrategy,
     711                 :                                                                                            cur->sk_subtype,
     712                 :                                                                                            procinfo,
     713                 :                                                                                            cur->sk_argument);
     714                 :                         }
     715                 :                         else
     716                 :                         {
     717                 :                                 RegProcedure cmp_proc;
     718                 : 
     719             344 :                                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
     720                 :                                                                                          rel->rd_opcintype[i],
     721                 :                                                                                          cur->sk_subtype,
     722                 :                                                                                          BTORDER_PROC);
     723             344 :                                 if (!RegProcedureIsValid(cmp_proc))
     724               0 :                                         elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
     725                 :                                                  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
     726                 :                                                  cur->sk_attno, RelationGetRelationName(rel));
     727             344 :                                 ScanKeyEntryInitialize(scankeys + i,
     728                 :                                                                            cur->sk_flags,
     729                 :                                                                            cur->sk_attno,
     730                 :                                                                            InvalidStrategy,
     731                 :                                                                            cur->sk_subtype,
     732                 :                                                                            cmp_proc,
     733                 :                                                                            cur->sk_argument);
     734                 :                         }
     735                 :                 }
     736                 :         }
     737                 : 
     738                 :         /*----------
     739                 :          * Examine the selected initial-positioning strategy to determine exactly
     740                 :          * where we need to start the scan, and set flag variables to control the
     741                 :          * code below.
     742                 :          *
     743                 :          * If nextkey = false, _bt_search and _bt_binsrch will locate the first
     744                 :          * item >= scan key.  If nextkey = true, they will locate the first
     745                 :          * item > scan key.
     746                 :          *
     747                 :          * If goback = true, we will then step back one item, while if
     748                 :          * goback = false, we will start the scan on the located item.
     749                 :          *----------
     750                 :          */
     751          163403 :         switch (strat_total)
     752                 :         {
     753                 :                 case BTLessStrategyNumber:
     754                 : 
     755                 :                         /*
     756                 :                          * Find first item >= scankey, then back up one to arrive at last
     757                 :                          * item < scankey.  (Note: this positioning strategy is only used
     758                 :                          * for a backward scan, so that is always the correct starting
     759                 :                          * position.)
     760                 :                          */
     761               5 :                         nextkey = false;
     762               5 :                         goback = true;
     763               5 :                         break;
     764                 : 
     765                 :                 case BTLessEqualStrategyNumber:
     766                 : 
     767                 :                         /*
     768                 :                          * Find first item > scankey, then back up one to arrive at last
     769                 :                          * item <= scankey.  (Note: this positioning strategy is only used
     770                 :                          * for a backward scan, so that is always the correct starting
     771                 :                          * position.)
     772                 :                          */
     773               0 :                         nextkey = true;
     774               0 :                         goback = true;
     775               0 :                         break;
     776                 : 
     777                 :                 case BTEqualStrategyNumber:
     778                 : 
     779                 :                         /*
     780                 :                          * If a backward scan was specified, need to start with last equal
     781                 :                          * item not first one.
     782                 :                          */
     783          153222 :                         if (ScanDirectionIsBackward(dir))
     784                 :                         {
     785                 :                                 /*
     786                 :                                  * This is the same as the <= strategy.  We will check at the
     787                 :                                  * end whether the found item is actually =.
     788                 :                                  */
     789               4 :                                 nextkey = true;
     790               4 :                                 goback = true;
     791                 :                         }
     792                 :                         else
     793                 :                         {
     794                 :                                 /*
     795                 :                                  * This is the same as the >= strategy.  We will check at the
     796                 :                                  * end whether the found item is actually =.
     797                 :                                  */
     798          153218 :                                 nextkey = false;
     799          153218 :                                 goback = false;
     800                 :                         }
     801                 :                         break;
     802                 : 
     803                 :                 case BTGreaterEqualStrategyNumber:
     804                 : 
     805                 :                         /*
     806                 :                          * Find first item >= scankey.  (This is only used for forward
     807                 :                          * scans.)
     808                 :                          */
     809             360 :                         nextkey = false;
     810             360 :                         goback = false;
     811             360 :                         break;
     812                 : 
     813                 :                 case BTGreaterStrategyNumber:
     814                 : 
     815                 :                         /*
     816                 :                          * Find first item > scankey.  (This is only used for forward
     817                 :                          * scans.)
     818                 :                          */
     819            9816 :                         nextkey = true;
     820            9816 :                         goback = false;
     821            9816 :                         break;
     822                 : 
     823                 :                 default:
     824                 :                         /* can't get here, but keep compiler quiet */
     825               0 :                         elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
     826               0 :                         return false;
     827                 :         }
     828                 : 
     829                 :         /*
     830                 :          * Use the manufactured insertion scan key to descend the tree and
     831                 :          * position ourselves on the target leaf page.
     832                 :          */
     833          163403 :         stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
     834                 : 
     835                 :         /* don't need to keep the stack around... */
     836          163403 :         _bt_freestack(stack);
     837                 : 
     838                 :         /* remember which buffer we have pinned, if any */
     839          163403 :         so->currPos.buf = buf;
     840                 : 
     841          163403 :         if (!BufferIsValid(buf))
     842                 :         {
     843                 :                 /* Only get here if index is completely empty */
     844             248 :                 return false;
     845                 :         }
     846                 : 
     847                 :         /* initialize moreLeft/moreRight appropriately for scan direction */
     848          163155 :         if (ScanDirectionIsForward(dir))
     849                 :         {
     850          163146 :                 so->currPos.moreLeft = false;
     851          163146 :                 so->currPos.moreRight = true;
     852                 :         }
     853                 :         else
     854                 :         {
     855               9 :                 so->currPos.moreLeft = true;
     856               9 :                 so->currPos.moreRight = false;
     857                 :         }
     858          163155 :         so->numKilled = 0;                   /* just paranoia */
     859          163155 :         so->markItemIndex = -1;              /* ditto */
     860                 : 
     861                 :         /* position to the precise item on the page */
     862          163155 :         offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
     863                 : 
     864                 :         /*
     865                 :          * If nextkey = false, we are positioned at the first item >= scan key, or
     866                 :          * possibly at the end of a page on which all the existing items are less
     867                 :          * than the scan key and we know that everything on later pages is greater
     868                 :          * than or equal to scan key.
     869                 :          *
     870                 :          * If nextkey = true, we are positioned at the first item > scan key, or
     871                 :          * possibly at the end of a page on which all the existing items are less
     872                 :          * than or equal to the scan key and we know that everything on later
     873                 :          * pages is greater than scan key.
     874                 :          *
     875                 :          * The actually desired starting point is either this item or the prior
     876                 :          * one, or in the end-of-page case it's the first item on the next page or
     877                 :          * the last item on this page.  Adjust the starting offset if needed. (If
     878                 :          * this results in an offset before the first item or after the last one,
     879                 :          * _bt_readpage will report no items found, and then we'll step to the
     880                 :          * next page as needed.)
     881                 :          */
     882          163155 :         if (goback)
     883               9 :                 offnum = OffsetNumberPrev(offnum);
     884                 : 
     885                 :         /*
     886                 :          * Now load data from the first page of the scan.
     887                 :          */
     888          163155 :         if (!_bt_readpage(scan, dir, offnum))
     889                 :         {
     890                 :                 /*
     891                 :                  * There's no actually-matching data on this page.  Try to advance to
     892                 :                  * the next page.  Return false if there's no matching data at all.
     893                 :                  */
     894           49794 :                 if (!_bt_steppage(scan, dir))
     895           49718 :                         return false;
     896                 :         }
     897                 : 
     898                 :         /* Drop the lock, but not pin, on the current page */
     899          113437 :         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
     900                 : 
     901                 :         /* OK, itemIndex says what to return: hand back that item's heap TID */
     902          113437 :         scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
     903                 : 
     904          113437 :         return true;
     905                 : }
     906                 : 
     907                 : /*
     908                 :  *      _bt_next() -- Get the next item in a scan.
     909                 :  *
     910                 :  *              On entry, so->currPos describes the current page, which is pinned
     911                 :  *              but not locked, and so->currPos.itemIndex identifies which item was
     912                 :  *              previously returned.
     913                 :  *
     914                 :  *              On successful exit, scan->xs_ctup.t_self is set to the TID of the
     915                 :  *              next heap tuple, and so->currPos is updated as needed.
     916                 :  *
     917                 :  *              On failure exit (no more tuples), we release pin and set
     918                 :  *              so->currPos.buf to InvalidBuffer.
                         :  *
                         :  *              Returns true if another item is available in direction dir, and
                         :  *              false once _bt_steppage reports that no further page has matches.
     919                 :  */
     920                 : bool
     921                 : _bt_next(IndexScanDesc scan, ScanDirection dir)
     922           85298 : {
     923           85298 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
     924                 : 
     925                 :         /*
     926                 :          * Advance to next tuple on current page; or if there's no more, try to
     927                 :          * step to the next page with data.
                         :          *
                         :          * Moving within the page only adjusts so->currPos.itemIndex against
                         :          * the saved firstItem/lastItem bounds, so the buffer lock is taken
                         :          * only on the paths that have to call _bt_steppage.
     928                 :          */
     929           85298 :         if (ScanDirectionIsForward(dir))
     930                 :         {
     931           84933 :                 if (++so->currPos.itemIndex > so->currPos.lastItem)
     932                 :                 {
     933                 :                         /* _bt_steppage expects the buffer read-locked, so lock it first */
     934                 :                         Assert(BufferIsValid(so->currPos.buf));
     935           17210 :                         LockBuffer(so->currPos.buf, BT_READ);
     936           17210 :                         if (!_bt_steppage(scan, dir))
     937           17061 :                                 return false;
     938                 :                         /* Keep the pin on the new page, but release its lock */
     939             149 :                         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
     940                 :                 }
     941                 :         }
     942                 :         else
     943                 :         {
     944             365 :                 if (--so->currPos.itemIndex < so->currPos.firstItem)
     945                 :                 {
     946                 :                         /* _bt_steppage expects the buffer read-locked, so lock it first */
     947                 :                         Assert(BufferIsValid(so->currPos.buf));
     948              12 :                         LockBuffer(so->currPos.buf, BT_READ);
     949              12 :                         if (!_bt_steppage(scan, dir))
     950              12 :                                 return false;
     951                 :                         /* Keep the pin on the new page, but release its lock */
     952               0 :                         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
     953                 :                 }
     954                 :         }
     955                 : 
     956                 :         /* itemIndex now identifies the saved item whose heap TID is returned */
     957           68225 :         scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
     958                 : 
     959           68225 :         return true;
     960                 : }
     961                 : 
     962                 : /*
     963                 :  *      _bt_readpage() -- Load data from current index page into so->currPos
     964                 :  *
     965                 :  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
     966                 :  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
     967                 :  * they are updated as appropriate.  All other fields of so->currPos are
     968                 :  * initialized from scratch here.
     969                 :  *
     970                 :  * We scan the current page starting at offnum and moving in the indicated
     971                 :  * direction.  All items matching the scan keys are loaded into currPos.items.
     972                 :  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
     973                 :  * that there can be no more matching tuples in the current scan direction.
     974                 :  *
                         :  * scan   - index scan whose so->currPos is filled in
                         :  * dir    - scan direction; selects the ascending or descending loop
                         :  * offnum - offset to start from.  A forward scan raises it to the first
                         :  *          data key if it is lower; a backward scan lowers it to the page
                         :  *          maximum if it is higher.  If it is still out of range after
                         :  *          that, the loop body never runs and no items are saved.
                         :  *
                         :  * A tuple accepted by _bt_checkkeys is saved before continuescan is tested,
                         :  * so the tuple on which the scan stops can itself be a saved match.
                         :  *
                         :  * For a backward scan the items are stored from the end of the items array
                         :  * downward, so firstItem..lastItem is always a contiguous range and
                         :  * currPos.itemIndex starts at the end that the scan direction returns first.
                         :  *
     975                 :  * Returns true if any matching items found on the page, false if none.
     976                 :  */
     977                 : static bool
     978                 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
     979          163887 : {
     980          163887 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
     981                 :         Page            page;
     982                 :         BTPageOpaque opaque;
     983                 :         OffsetNumber minoff;            /* P_FIRSTDATAKEY of this page */
     984                 :         OffsetNumber maxoff;            /* PageGetMaxOffsetNumber of this page */
     985                 :         int                     itemIndex;      /* next slot of currPos.items to fill */
     986                 :         bool            continuescan;   /* set by _bt_checkkeys; false stops the loop */
     987                 : 
     988                 :         /* we must have the buffer pinned and locked */
     989                 :         Assert(BufferIsValid(so->currPos.buf));
     990                 : 
     991          163887 :         page = BufferGetPage(so->currPos.buf);
     992          163887 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     993          163887 :         minoff = P_FIRSTDATAKEY(opaque);
     994          163887 :         maxoff = PageGetMaxOffsetNumber(page);
     995                 : 
     996                 :         /*
     997                 :          * we must save the page's right-link while scanning it; this tells us
     998                 :          * where to step right to after we're done with these items.  There is no
     999                 :          * corresponding need for the left-link, since splits always go right.
    1000                 :          */
    1001          163887 :         so->currPos.nextPage = opaque->btpo_next;
    1002                 : 
    1003          163887 :         if (ScanDirectionIsForward(dir))
    1004                 :         {
    1005                 :                 /* load items[] in ascending order, filling from slot 0 upward */
    1006          163869 :                 itemIndex = 0;
    1007                 : 
    1008          163869 :                 offnum = Max(offnum, minoff);   /* never start before the first data key */
    1009                 : 
    1010          814228 :                 while (offnum <= maxoff)
    1011                 :                 {
    1012          605296 :                         if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
    1013                 :                         {
    1014                 :                                 /* tuple passes all scan key conditions, so remember it */
    1015                 :                                 /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
    1016          433450 :                                 so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
    1017          433450 :                                 so->currPos.items[itemIndex].indexOffset = offnum;
    1018          433450 :                                 itemIndex++;
    1019                 :                         }
    1020          605296 :                         if (!continuescan)
    1021                 :                         {
    1022                 :                                 /* there can't be any more matches, so stop */
    1023          118806 :                                 so->currPos.moreRight = false;
    1024          118806 :                                 break;
    1025                 :                         }
    1026                 : 
    1027          486490 :                         offnum = OffsetNumberNext(offnum);
    1028                 :                 }
    1029                 : 
    1030                 :                 Assert(itemIndex <= MaxIndexTuplesPerPage);
    1031          163869 :                 so->currPos.firstItem = 0;
    1032          163869 :                 so->currPos.lastItem = itemIndex - 1;   /* -1 when nothing was saved */
    1033          163869 :                 so->currPos.itemIndex = 0;
    1034                 :         }
    1035                 :         else
    1036                 :         {
    1037                 :                 /* load items[] in descending order, filling from the last slot downward */
    1038              18 :                 itemIndex = MaxIndexTuplesPerPage;
    1039                 : 
    1040              18 :                 offnum = Min(offnum, maxoff);   /* never start past the last item */
    1041                 : 
    1042            2154 :                 while (offnum >= minoff)
    1043                 :                 {
    1044            2120 :                         if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
    1045                 :                         {
    1046                 :                                 /* tuple passes all scan key conditions, so remember it */
    1047                 :                                 /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
    1048            1969 :                                 itemIndex--;
    1049            1969 :                                 so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
    1050            1969 :                                 so->currPos.items[itemIndex].indexOffset = offnum;
    1051                 :                         }
    1052            2120 :                         if (!continuescan)
    1053                 :                         {
    1054                 :                                 /* there can't be any more matches, so stop */
    1055               2 :                                 so->currPos.moreLeft = false;
    1056               2 :                                 break;
    1057                 :                         }
    1058                 : 
    1059            2118 :                         offnum = OffsetNumberPrev(offnum);
    1060                 :                 }
    1061                 : 
    1062                 :                 Assert(itemIndex >= 0);
    1063              18 :                 so->currPos.firstItem = itemIndex;      /* MaxIndexTuplesPerPage when nothing was saved */
    1064              18 :                 so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
    1065              18 :                 so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
    1066                 :         }
    1067                 : 
    1068          163887 :         return (so->currPos.firstItem <= so->currPos.lastItem);  /* false when nothing was saved */
    1069                 : }
    1070                 : 
    1071                 : /*
    1072                 :  *      _bt_steppage() -- Step to next page containing valid data for scan
    1073                 :  *
    1074                 :  * On entry, so->currPos.buf must be pinned and read-locked.  We'll drop
    1075                 :  * the lock and pin before moving to next page.
    1076                 :  *
    1077                 :  * On success exit, we hold pin and read-lock on the next interesting page,
    1078                 :  * and so->currPos is updated to contain data from that page.
    1079                 :  *
    1080                 :  * If there are no more matching records in the given direction, we drop all
    1081                 :  * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
                         :  *
                         :  * scan - index scan; so->currPos, so->markPos, so->numKilled and
                         :  *        so->markItemIndex are consulted and updated here
                         :  * dir  - direction in which to step (forward walks right, otherwise left)
                         :  *
                         :  * Returns true if _bt_readpage found matching items on some later page.
    1082                 :  */
    1083                 : static bool
    1084                 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    1085           67018 : {
    1086           67018 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1087                 :         Relation        rel;
    1088                 :         Page            page;
    1089                 :         BTPageOpaque opaque;
    1090                 : 
    1091                 :         /* we must have the buffer pinned and locked */
    1092                 :         Assert(BufferIsValid(so->currPos.buf));
    1093                 : 
    1094                 :         /* Before leaving current page, deal with any killed items */
    1095           67018 :         if (so->numKilled > 0)
    1096             696 :                 _bt_killitems(scan, true);
    1097                 : 
    1098                 :         /*
    1099                 :          * Before we modify currPos, make a copy of the page data if there was a
    1100                 :          * mark position that needs it.
                         :          *
                         :          * The copy length is everything in BTScanPosData up to and including
                         :          * items[0] (that is what offsetof of items[1] measures), plus lastItem
                         :          * further items, i.e. items[0] through items[lastItem].  Afterwards the
                         :          * marked index is carried by markPos.itemIndex, so markItemIndex is
                         :          * reset to -1.
    1101                 :          */
    1102           67018 :         if (so->markItemIndex >= 0)
    1103                 :         {
    1104                 :                 /* bump pin on current buffer for assignment to mark buffer */
    1105               0 :                 IncrBufferRefCount(so->currPos.buf);
    1106               0 :                 memcpy(&so->markPos, &so->currPos,
    1107                 :                            offsetof(BTScanPosData, items[1]) +
    1108                 :                            so->currPos.lastItem * sizeof(BTScanPosItem));
    1109               0 :                 so->markPos.itemIndex = so->markItemIndex;
    1110               0 :                 so->markItemIndex = -1;
    1111                 :         }
    1112                 : 
    1113           67018 :         rel = scan->indexRelation;
    1114                 : 
    1115           67018 :         if (ScanDirectionIsForward(dir))
    1116                 :         {
    1117                 :                 /* Walk right to the next page with data */
    1118                 :                 /* We must rely on the previously saved nextPage link! */
    1119           67005 :                 BlockNumber blkno = so->currPos.nextPage;
    1120                 : 
    1121                 :                 /* Remember we left a page with data */
    1122           67005 :                 so->currPos.moreLeft = true;
    1123                 : 
    1124                 :                 for (;;)
    1125                 :                 {
    1126                 :                         /* if we're at end of scan, release the buffer and return */
    1127           67450 :                         if (blkno == P_NONE || !so->currPos.moreRight)
    1128                 :                         {
    1129           66779 :                                 _bt_relbuf(rel, so->currPos.buf);
    1130           66779 :                                 so->currPos.buf = InvalidBuffer;
    1131           66779 :                                 return false;
    1132                 :                         }
    1133                 :                         /* step right one page, trading the old buffer for block blkno */
    1134             671 :                         so->currPos.buf = _bt_relandgetbuf(rel, so->currPos.buf,
    1135                 :                                                                                            blkno, BT_READ);
    1136                 :                         /* check for deleted page */
    1137             671 :                         page = BufferGetPage(so->currPos.buf);
    1138             671 :                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1139             671 :                         if (!P_IGNORE(opaque))
    1140                 :                         {
    1141                 :                                 /* see if there are any matches on this page */
    1142                 :                                 /* note that this will clear moreRight if we can stop */
    1143             671 :                                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
    1144             226 :                                         break;
    1145                 :                         }
    1146                 :                         /* nope, keep going by following this page's own right-link */
    1147             445 :                         blkno = opaque->btpo_next;
    1148             445 :                 }
    1149                 :         }
    1150                 :         else
    1151                 :         {
    1152                 :                 /* Remember we left a page with data */
    1153              13 :                 so->currPos.moreRight = true;
    1154                 : 
    1155                 :                 /*
    1156                 :                  * Walk left to the next page with data.  This is much more complex
    1157                 :                  * than the walk-right case because of the possibility that the page
    1158                 :                  * to our left splits while we are in flight to it, plus the
    1159                 :                  * possibility that the page we were on gets deleted after we leave
    1160                 :                  * it.  See nbtree/README for details.
    1161                 :                  */
    1162                 :                 for (;;)
    1163                 :                 {
    1164                 :                         /* Done if we know there are no matching keys to the left */
    1165              13 :                         if (!so->currPos.moreLeft)
    1166                 :                         {
    1167               7 :                                 _bt_relbuf(rel, so->currPos.buf);
    1168               7 :                                 so->currPos.buf = InvalidBuffer;
    1169               7 :                                 return false;
    1170                 :                         }
    1171                 : 
    1172                 :                         /* Step to next physical page */
    1173               6 :                         so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
    1174                 : 
    1175                 :                         /*
                         :                          * if we're physically at end of index, return failure.  No
                         :                          * _bt_relbuf is needed on this path: _bt_walk_left released the
                         :                          * buffer itself before returning InvalidBuffer.
                         :                          */
    1176               6 :                         if (so->currPos.buf == InvalidBuffer)
    1177               6 :                                 return false;
    1178                 : 
    1179                 :                         /*
    1180                 :                          * Okay, we managed to move left to a non-deleted page. Done if
    1181                 :                          * it's not half-dead and contains matching tuples. Else loop back
    1182                 :                          * and do it all again.
    1183                 :                          */
    1184               0 :                         page = BufferGetPage(so->currPos.buf);
    1185               0 :                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1186               0 :                         if (!P_IGNORE(opaque))
    1187                 :                         {
    1188                 :                                 /* see if there are any matches on this page */
    1189                 :                                 /* note that this will clear moreLeft if we can stop */
    1190               0 :                                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
    1191               0 :                                         break;
    1192                 :                         }
    1193                 :                 }
    1194                 :         }
    1195                 : 
    1196             226 :         return true;
    1197                 : }
    1198                 : 
    1199                 : /*
    1200                 :  * _bt_walk_left() -- step left one page, if possible
    1201                 :  *
    1202                 :  * The given buffer must be pinned and read-locked.  This will be dropped
    1203                 :  * before stepping left.  On return, we have pin and read lock on the
    1204                 :  * returned page, instead.
    1205                 :  *
    1206                 :  * Returns InvalidBuffer if there is no page to the left (no lock is held
    1207                 :  * in that case).
    1208                 :  *
    1209                 :  * When working on a non-leaf level, it is possible for the returned page
    1210                 :  * to be half-dead; the caller should check that condition and step left
    1211                 :  * again if it's important.
                         :  *
                         :  * rel - index relation; passed to the buffer calls and named in error
                         :  *       messages
                         :  * buf - pinned and read-locked buffer of the page to step left from
                         :  *
                         :  * The only path that returns InvalidBuffer is the P_LEFTMOST test at the top
                         :  * of the outer loop; every other exit is either the return of the page found
                         :  * or an elog(ERROR) call.
    1212                 :  */
    1213                 : static Buffer
    1214                 : _bt_walk_left(Relation rel, Buffer buf)
    1215               6 : {
    1216                 :         Page            page;
    1217                 :         BTPageOpaque opaque;
    1218                 : 
    1219               6 :         page = BufferGetPage(buf);
    1220               6 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1221                 : 
    1222                 :         for (;;)
    1223                 :         {
    1224                 :                 BlockNumber obknum;             /* block we are stepping left from */
    1225                 :                 BlockNumber lblkno;             /* left-link read from that block */
    1226                 :                 BlockNumber blkno;              /* block to read next */
    1227                 :                 int                     tries;          /* rightward hops counted so far */
    1228                 : 
    1229                 :                 /* if we're at end of tree, release buf and return failure */
    1230               6 :                 if (P_LEFTMOST(opaque))
    1231                 :                 {
    1232               6 :                         _bt_relbuf(rel, buf);
    1233                 :                         break;
    1234                 :                 }
    1235                 :                 /* remember original page we are stepping left from */
    1236               0 :                 obknum = BufferGetBlockNumber(buf);
    1237                 :                 /* step left */
    1238               0 :                 blkno = lblkno = opaque->btpo_prev;
    1239               0 :                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1240               0 :                 page = BufferGetPage(buf);
    1241               0 :                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1242                 : 
    1243                 :                 /*
    1244                 :                  * If this isn't the page we want, walk right till we find what we
    1245                 :                  * want --- but go no more than four hops (an arbitrary limit). If we
    1246                 :                  * don't find the correct page by then, the most likely bet is that
    1247                 :                  * the original page got deleted and isn't in the sibling chain at all
    1248                 :                  * anymore, not that its left sibling got split more than four times.
    1249                 :                  *
    1250                 :                  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    1251                 :                  * because half-dead pages are still in the sibling chain.  Caller
    1252                 :                  * must reject half-dead pages if wanted.
                         :                  *
                         :                  * tries is pre-incremented and compared with 4 only after a page has
                         :                  * failed the match test, so up to five pages are examined: the left
                         :                  * sibling itself plus four hops to the right.
    1253                 :                  */
    1254               0 :                 tries = 0;
    1255                 :                 for (;;)
    1256                 :                 {
    1257               0 :                         if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
    1258                 :                         {
    1259                 :                                 /* Found desired page, return it */
    1260               0 :                                 return buf;
    1261                 :                         }
    1262               0 :                         if (P_RIGHTMOST(opaque) || ++tries > 4)
    1263                 :                                 break;
    1264               0 :                         blkno = opaque->btpo_next;
    1265               0 :                         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1266               0 :                         page = BufferGetPage(buf);
    1267               0 :                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1268               0 :                 }
    1269                 : 
    1270                 :                 /* Return to the original page to see what's up */
    1271               0 :                 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
    1272               0 :                 page = BufferGetPage(buf);
    1273               0 :                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1274               0 :                 if (P_ISDELETED(opaque))
    1275                 :                 {
    1276                 :                         /*
    1277                 :                          * It was deleted.  Move right to first nondeleted page (there
    1278                 :                          * must be one); that is the page that has acquired the deleted
    1279                 :                          * one's keyspace, so stepping left from it will take us where we
    1280                 :                          * want to be.
    1281                 :                          */
    1282                 :                         for (;;)
    1283                 :                         {
    1284               0 :                                 if (P_RIGHTMOST(opaque))
    1285               0 :                                         elog(ERROR, "fell off the end of index \"%s\"",
    1286                 :                                                  RelationGetRelationName(rel));
    1287               0 :                                 blkno = opaque->btpo_next;
    1288               0 :                                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1289               0 :                                 page = BufferGetPage(buf);
    1290               0 :                                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1291               0 :                                 if (!P_ISDELETED(opaque))
    1292               0 :                                         break;
    1293                 :                         }
    1294                 : 
    1295                 :                         /*
    1296                 :                          * Now return to top of loop, resetting obknum to point to this
    1297                 :                          * nondeleted page, and try again.
                         :                          * (Nothing is assigned here; the outer loop recomputes obknum
                         :                          * from buf on its next pass.)
    1298                 :                          */
    1299                 :                 }
    1300                 :                 else
    1301                 :                 {
    1302                 :                         /*
    1303                 :                          * It wasn't deleted; the explanation had better be that the page
    1304                 :                          * to the left got split or deleted. Without this check, we'd go
    1305                 :                          * into an infinite loop if there's anything wrong.
                         :                          * An unchanged left-link (still equal to lblkno) means that
                         :                          * explanation does not hold, so report an error.
    1306                 :                          */
    1307               0 :                         if (opaque->btpo_prev == lblkno)
    1308               0 :                                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    1309                 :                                          obknum, RelationGetRelationName(rel));
    1310                 :                         /* Okay to try again with new lblkno value */
    1311                 :                 }
    1312                 :         }
    1313                 : 
    1314               6 :         return InvalidBuffer;
    1315                 : }
    1316                 : 
    1317                 : /*
    1318                 :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    1319                 :  *
    1320                 :  * 'level' is the tree level wanted (0 = leaf); 'rightmost' selects the last
    1321                 :  * page on that level instead of the first.  If the index is empty, we return
    1322                 :  * InvalidBuffer; any other failure condition raises an ERROR.  We will not
    1323                 :  * return a dead page.  The returned buffer is pinned and read-locked.
    1324                 :  */
    1325                 : Buffer
    1326                 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
    1327              61 : {
    1328                 :         Buffer          buf;
    1329                 :         Page            page;
    1330                 :         BTPageOpaque opaque;
    1331                 :         OffsetNumber offnum;
    1332                 :         BlockNumber blkno;
    1333                 :         IndexTuple      itup;
    1334                 : 
    1335                 :         /*
    1336                 :          * If we are looking for a leaf page, okay to descend from fast root;
    1337                 :          * otherwise better descend from true root.  (There is no point in being
    1338                 :          * smarter about intermediate levels.)
    1339                 :          */
    1340              61 :         if (level == 0)
    1341              61 :                 buf = _bt_getroot(rel, BT_READ);
    1342                 :         else
    1343               0 :                 buf = _bt_gettrueroot(rel);
    1344                 : 
    1345              61 :         if (!BufferIsValid(buf))
    1346                 :         {
    1347                 :                 /* empty index: no starting page was returned, tell the caller */
    1348               0 :                 return InvalidBuffer;
    1349                 :         }
    1350                 : 
    1351              61 :         page = BufferGetPage(buf);
    1352              61 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1353                 : 
    1354                 :         for (;;)
    1355                 :         {
    1356                 :                 /*
    1357                 :                  * If we landed on a deleted page, step right to find a live page
    1358                 :                  * (there must be one).  Also, if we want the rightmost page, step
    1359                 :                  * right if needed to get to it (this could happen if the page split
    1360                 :                  * since we obtained a pointer to it).
    1361                 :                  */
    1362              93 :                 while (P_IGNORE(opaque) ||
    1363                 :                            (rightmost && !P_RIGHTMOST(opaque)))
    1364                 :                 {
    1365               0 :                         blkno = opaque->btpo_next;
    1366               0 :                         if (blkno == P_NONE)
    1367               0 :                                 elog(ERROR, "fell off the end of index \"%s\"",
    1368                 :                                          RelationGetRelationName(rel));
    1369               0 :                         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1370               0 :                         page = BufferGetPage(buf);
    1371               0 :                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1372                 :                 }
    1373                 : 
    1374                 :                 /* Done?  Stop as soon as we are on a page of the requested level */
    1375              93 :                 if (opaque->btpo.level == level)
    1376              61 :                         break;
    1377              32 :                 if (opaque->btpo.level < level)      /* page is below the wanted level */
    1378               0 :                         elog(ERROR, "btree level %u not found in index \"%s\"",
    1379                 :                                  level, RelationGetRelationName(rel));
    1380                 : 
    1381                 :                 /* Descend to leftmost or rightmost child page (block number is in the tuple's t_tid) */
    1382              32 :                 if (rightmost)
    1383               5 :                         offnum = PageGetMaxOffsetNumber(page);
    1384                 :                 else
    1385              27 :                         offnum = P_FIRSTDATAKEY(opaque);
    1386                 : 
    1387              32 :                 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    1388              32 :                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
    1389                 : 
    1390              32 :                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1391              32 :                 page = BufferGetPage(buf);
    1392              32 :                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1393              32 :         }
    1394                 : 
    1395              61 :         return buf;
    1396                 : }
    1397                 : 
    1398                 : /*
    1399                 :  *      _bt_endpoint() -- Find the first or last page in the index, and scan
    1400                 :  * from there to the first key satisfying all the quals.
    1401                 :  *
    1402                 :  * This is used by _bt_first() to set up a scan when we've determined
    1403                 :  * that the scan must start at the beginning or end of the index (for
    1404                 :  * a forward or backward scan respectively).  Exit conditions are the
    1405                 :  * same as for _bt_first().  As coded here: false if the index is empty or neither _bt_readpage() nor _bt_steppage() finds a match; else true with scan->xs_ctup.t_self set.
    1406                 :  */
    1407                 : static bool
    1408                 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    1409              61 : {
    1410              61 :         Relation        rel = scan->indexRelation;
    1411              61 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1412                 :         Buffer          buf;
    1413                 :         Page            page;
    1414                 :         BTPageOpaque opaque;
    1415                 :         OffsetNumber start;
    1416                 : 
    1417                 :         /*
    1418                 :          * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    1419                 :          * version of _bt_search().  We don't maintain a stack since we know we
    1420                 :          * won't need it.
    1421                 :          */
    1422              61 :         buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
    1423                 : 
    1424              61 :         if (!BufferIsValid(buf))
    1425                 :         {
    1426                 :                 /* empty index: record that no buffer is held, and report no match */
    1427               0 :                 so->currPos.buf = InvalidBuffer;
    1428               0 :                 return false;
    1429                 :         }
    1430                 : 
    1431              61 :         page = BufferGetPage(buf);
    1432              61 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1433                 :         Assert(P_ISLEAF(opaque));
    1434                 : 
    1435              61 :         if (ScanDirectionIsForward(dir))
    1436                 :         {
    1437                 :                 /* There could be dead pages to the left, so not this: */
    1438                 :                 /* Assert(P_LEFTMOST(opaque)); */
    1439                 : 
    1440              52 :                 start = P_FIRSTDATAKEY(opaque);
    1441                 :         }
    1442               9 :         else if (ScanDirectionIsBackward(dir))
    1443                 :         {
    1444                 :                 Assert(P_RIGHTMOST(opaque));
    1445                 : 
    1446               9 :                 start = PageGetMaxOffsetNumber(page);
    1447                 :         }
    1448                 :         else
    1449                 :         {
    1450               0 :                 elog(ERROR, "invalid scan direction: %d", (int) dir);
    1451               0 :                 start = 0;                              /* keep compiler quiet */
    1452                 :         }
    1453                 : 
    1454                 :         /* remember which buffer we have pinned */
    1455              61 :         so->currPos.buf = buf;
    1456                 : 
    1457                 :         /* initialize moreLeft/moreRight appropriately for scan direction */
    1458              61 :         if (ScanDirectionIsForward(dir))
    1459                 :         {
    1460              52 :                 so->currPos.moreLeft = false;
    1461              52 :                 so->currPos.moreRight = true;
    1462                 :         }
    1463                 :         else
    1464                 :         {
    1465               9 :                 so->currPos.moreLeft = true;
    1466               9 :                 so->currPos.moreRight = false;
    1467                 :         }
    1468              61 :         so->numKilled = 0;                   /* just paranoia */
    1469              61 :         so->markItemIndex = -1;              /* ditto */
    1470                 : 
    1471                 :         /*
    1472                 :          * Now load data from the first page of the scan.
    1473                 :          */
    1474              61 :         if (!_bt_readpage(scan, dir, start))
    1475                 :         {
    1476                 :                 /*
    1477                 :                  * There's no actually-matching data on this page.  Try to advance to
    1478                 :                  * the next page.  Return false if there's no matching data at all.
    1479                 :                  */
    1480               2 :                 if (!_bt_steppage(scan, dir))
    1481               1 :                         return false;
    1482                 :         }
    1483                 : 
    1484                 :         /* Drop the lock, but not pin, on the current page */
    1485              60 :         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    1486                 : 
    1487                 :         /* OK, itemIndex says what to return */
    1488              60 :         scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
    1489                 : 
    1490              60 :         return true;
    1491                 : }

Generated by: LTP GCOV extension version 1.5