1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.115 2007/12/31 04:52:05 tgl Exp $
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/genam.h"
19 : #include "access/nbtree.h"
20 : #include "pgstat.h"
21 : #include "utils/lsyscache.h"
22 :
23 :
24 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
25 : OffsetNumber offnum);
26 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
27 : static Buffer _bt_walk_left(Relation rel, Buffer buf);
28 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
29 :
30 :
31 : /*
32 : * _bt_search() -- Search the tree for a particular scankey,
33 : * or more precisely for the first leaf page it could be on.
34 : *
35 : * The passed scankey must be an insertion-type scankey (see nbtree/README),
36 : * but it can omit the rightmost column(s) of the index.
37 : *
38 : * When nextkey is false (the usual case), we are looking for the first
39 : * item >= scankey. When nextkey is true, we are looking for the first
40 : * item strictly greater than scankey.
41 : *
42 : * Return value is a palloc'd stack of parent-page pointers, one entry per internal page descended through; it is NULL if the root is itself a leaf
43 : * (or if the index is empty). *bufP is set to the leaf-page buffer, which is read-locked and pinned.
44 : * No locks are held on the parent pages, however!
45 : *
46 : * NOTE that the returned buffer is read-locked regardless of the access
47 : * parameter. However, access = BT_WRITE will allow an empty root page
48 : * to be created and returned. When access = BT_READ, an empty index
49 : * will result in *bufP being set to InvalidBuffer.
50 : */
51 : BTStack
52 : _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
53 : Buffer *bufP, int access)
54 233514 : {
55 233514 : BTStack stack_in = NULL;
56 :
57 : /* Get the root page to start with */
58 233514 : *bufP = _bt_getroot(rel, access);
59 :
60 : /* If index is empty and access = BT_READ, no root page is created. */
61 233514 : if (!BufferIsValid(*bufP))
62 248 : return (BTStack) NULL;
63 :
64 : /* Loop iterates once per level descended in the tree */
65 : for (;;)
66 : {
67 : Page page;
68 : BTPageOpaque opaque;
69 : OffsetNumber offnum;
70 : ItemId itemid;
71 : IndexTuple itup;
72 : BlockNumber blkno;
73 : BlockNumber par_blkno;
74 : BTStack new_stack;
75 :
76 : /*
77 : * Race -- the page we just grabbed may have split since we read its
78 : * pointer in the parent (or metapage). If it has, we may need to
79 : * move right to its new sibling. Do that.
80 : */
81 405162 : *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
82 :
83 : /* if this is a leaf page, we're done */
84 405162 : page = BufferGetPage(*bufP);
85 405162 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
86 405162 : if (P_ISLEAF(opaque))
87 233266 : break;
88 :
89 : /*
90 : * Find the appropriate item on the internal page, and get the child
91 : * page that it points to.
92 : */
93 171896 : offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
94 171896 : itemid = PageGetItemId(page, offnum);
95 171896 : itup = (IndexTuple) PageGetItem(page, itemid);
96 171896 : blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
97 171896 : par_blkno = BufferGetBlockNumber(*bufP);
98 :
99 : /*
100 : * We need to save the location of the index entry we chose in the
101 : * parent page on a stack. In case we split the tree, we'll use the
102 : * stack to work back up to the parent page. We also save the actual
103 : * downlink (TID) to uniquely identify the index entry, in case it
104 : * moves right while we're working lower in the tree. See the paper
105 : * by Lehman and Yao for how this is detected and handled. (We use the
106 : * child link to disambiguate duplicate keys in the index -- Lehman
107 : * and Yao disallow duplicate keys.)
108 : */
109 171896 : new_stack = (BTStack) palloc(sizeof(BTStackData));
110 171896 : new_stack->bts_blkno = par_blkno;
111 171896 : new_stack->bts_offset = offnum;
112 171896 : memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData)); /* copies only sizeof(IndexTupleData) bytes, i.e. the part holding t_tid, not the key data */
113 171896 : new_stack->bts_parent = stack_in;
114 :
115 : /* drop the read lock on the parent page, acquire one on the child */
116 171896 : *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
117 :
118 : /* okay, all set to move down a level */
119 171896 : stack_in = new_stack;
120 171896 : }
121 :
122 233266 : return stack_in;
123 : }
124 :
125 : /*
126 : * _bt_moveright() -- move right in the btree if necessary.
127 : *
128 : * When we follow a pointer to reach a page, it is possible that
129 : * the page has changed in the meanwhile. If this happens, we're
130 : * guaranteed that the page has "split right" -- that is, that any
131 : * data that appeared on the page originally is either on the page
132 : * or strictly to the right of it.
133 : *
134 : * This routine decides whether or not we need to move right in the
135 : * tree by examining the high key entry on the page. If that entry
136 : * is strictly less than the scankey, or <= the scankey in the nextkey=true
137 : * case, then we followed the wrong link and we need to move right.
138 : *
139 : * The passed scankey must be an insertion-type scankey (see nbtree/README),
140 : * but it can omit the rightmost column(s) of the index.
141 : *
142 : * When nextkey is false (the usual case), we are looking for the first
143 : * item >= scankey. When nextkey is true, we are looking for the first
144 : * item strictly greater than scankey.
145 : *
146 : * On entry, we have the buffer pinned and a lock of the type specified by
147 : * 'access'. If we move right, we release the buffer and lock and acquire
148 : * the same on the right sibling. Return value is the buffer we stop at. If the page we stop at still has to be ignored (P_IGNORE), we raise ERROR instead of returning.
149 : */
150 : Buffer
151 : _bt_moveright(Relation rel,
152 : Buffer buf,
153 : int keysz,
154 : ScanKey scankey,
155 : bool nextkey,
156 : int access)
157 475271 : {
158 : Page page;
159 : BTPageOpaque opaque;
160 : int32 cmpval;
161 :
162 475271 : page = BufferGetPage(buf);
163 475271 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
164 :
165 : /*
166 : * When nextkey = false (normal case): if the scan key that brought us to
167 : * this page is > the high key stored on the page, then the page has split
168 : * and we need to move right. (If the scan key is equal to the high key,
169 : * we might or might not need to move right; have to scan the page first
170 : * anyway.)
171 : *
172 : * When nextkey = true: move right if the scan key is >= page's high key.
173 : *
174 : * The page could even have split more than once, so scan as far as
175 : * needed.
176 : *
177 : * We also have to move right if we followed a link that brought us to a
178 : * dead page.
179 : */
180 475271 : cmpval = nextkey ? 0 : 1; /* move right while _bt_compare() against the high key is >= cmpval */
181 :
182 950545 : while (!P_RIGHTMOST(opaque) &&
183 : (P_IGNORE(opaque) ||
184 : _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
185 : {
186 : /* step right one page */
187 3 : BlockNumber rblkno = opaque->btpo_next;
188 :
189 3 : buf = _bt_relandgetbuf(rel, buf, rblkno, access);
190 3 : page = BufferGetPage(buf);
191 3 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
192 : }
193 :
194 475271 : if (P_IGNORE(opaque))
195 0 : elog(ERROR, "fell off the end of index \"%s\"",
196 : RelationGetRelationName(rel));
197 :
198 475271 : return buf;
199 : }
200 :
201 : /*
202 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
203 : *
204 : * The passed scankey must be an insertion-type scankey (see nbtree/README),
205 : * but it can omit the rightmost column(s) of the index.
206 : *
207 : * When nextkey is false (the usual case), we are looking for the first
208 : * item >= scankey. When nextkey is true, we are looking for the first
209 : * item strictly greater than scankey.
210 : *
211 : * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
212 : * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
213 : * particular, this means it is possible to return a value 1 greater than the
214 : * number of keys on the page, if the scankey is > all keys on the page.)
215 : *
216 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
217 : * of the last key < given scankey, or last key <= given scankey if nextkey
218 : * is true. (Since _bt_compare treats the first data key of such a page as
219 : * minus infinity, there will be at least one key < scankey, so the result
220 : * always points at one of the keys on the page.) This key indicates the
221 : * right place to descend to be sure we find all leaf keys >= given scankey
222 : * (or leaf keys > given scankey when nextkey is true).
223 : *
224 : * This procedure is not responsible for walking right, it just examines
225 : * the given page. _bt_binsrch() has no lock or refcount side effects
226 : * on the buffer.
227 : */
228 : OffsetNumber
229 : _bt_binsrch(Relation rel,
230 : Buffer buf,
231 : int keysz,
232 : ScanKey scankey,
233 : bool nextkey)
234 404079 : {
235 : Page page;
236 : BTPageOpaque opaque;
237 : OffsetNumber low,
238 : high;
239 : int32 result,
240 : cmpval;
241 :
242 404079 : page = BufferGetPage(buf);
243 404079 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
244 :
245 404079 : low = P_FIRSTDATAKEY(opaque);
246 404079 : high = PageGetMaxOffsetNumber(page);
247 :
248 : /*
249 : * If there are no keys on the page, return the first available slot. Note
250 : * this covers two cases: the page is really empty (no keys), or it
251 : * contains only a high key. The latter case is possible after vacuuming.
252 : * This can never happen on an internal page, however, since they are
253 : * never empty (an internal page must have children).
254 : */
255 404079 : if (high < low)
256 137 : return low; /* no data items: report the first data slot */
257 :
258 : /*
259 : * Binary search to find the first key on the page >= scan key, or first
260 : * key > scankey when nextkey is true.
261 : *
262 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
263 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
264 : *
265 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
266 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
267 : *
268 : * We can fall out when high == low.
269 : */
270 403942 : high++; /* establish the loop invariant for high */
271 :
272 403942 : cmpval = nextkey ? 0 : 1; /* select comparison value */
273 :
274 3190539 : while (high > low)
275 : {
276 2382655 : OffsetNumber mid = low + ((high - low) / 2);
277 :
278 : /* We have low <= mid < high, so mid points at a real slot */
279 :
280 2382655 : result = _bt_compare(rel, keysz, scankey, page, mid);
281 :
282 2382655 : if (result >= cmpval)
283 1451785 : low = mid + 1; /* slot mid sorts before the boundary: answer lies to its right */
284 : else
285 930870 : high = mid; /* slot mid is at or after the boundary: keep it as a candidate */
286 : }
287 :
288 : /*
289 : * At this point we have high == low, but be careful: they could point
290 : * past the last slot on the page.
291 : *
292 : * On a leaf page, we always return the first key >= scan key (resp. >
293 : * scan key), which could be the last slot + 1.
294 : */
295 403942 : if (P_ISLEAF(opaque))
296 232046 : return low;
297 :
298 : /*
299 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
300 : * There must be one if _bt_compare() is playing by the rules.
301 : */
302 : Assert(low > P_FIRSTDATAKEY(opaque));
303 :
304 171896 : return OffsetNumberPrev(low);
305 : }
306 :
307 : /*----------
308 : * _bt_compare() -- Compare scankey to a particular tuple on the page.
309 : *
310 : * The passed scankey must be an insertion-type scankey (see nbtree/README),
311 : * but it can omit the rightmost column(s) of the index.
312 : *
313 : * keysz: number of key conditions to be checked (might be less than the
314 : * number of index columns!)
315 : * page/offnum: location of btree item to be compared to.
316 : *
317 : * This routine returns:
318 : * <0 if scankey < tuple at offnum;
319 : * 0 if scankey == tuple at offnum;
320 : * >0 if scankey > tuple at offnum.
321 : * NULLs in the keys are treated as sortable values. Therefore
322 : * "equality" does not necessarily mean that the item should be
323 : * returned to the caller as a matching key!
324 : *
325 : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
326 : * "minus infinity": this routine will always claim it is less than the
327 : * scankey. The actual key value stored (if any, which there probably isn't)
328 : * does not matter. This convention allows us to implement the Lehman and
329 : * Yao convention that the first down-link pointer is before the first key.
330 : * See backend/access/nbtree/README for details.
331 : *----------
332 : */
333 : int32
334 : _bt_compare(Relation rel,
335 : int keysz,
336 : ScanKey scankey,
337 : Page page,
338 : OffsetNumber offnum)
339 2515955 : {
340 2515955 : TupleDesc itupdesc = RelationGetDescr(rel);
341 2515955 : BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
342 : IndexTuple itup;
343 : int i;
344 :
345 : /*
346 : * Force result ">" if target item is first data item on an internal page
347 : * --- see NOTE above.
348 : */
349 2515955 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
350 24968 : return 1;
351 :
352 2490987 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
353 :
354 : /*
355 : * The scan key is set up with the attribute number associated with each
356 : * term in the key. It is important that, if the index is multi-key, the
357 : * scan contain the first k key attributes, and that they be in order. If
358 : * you think about how multi-key ordering works, you'll understand why
359 : * this is.
360 : *
361 : * We don't test for violation of this condition here, however. The
362 : * initial setup for the index scan had better have gotten it right (see
363 : * _bt_first).
364 : */
365 :
366 3386047 : for (i = 1; i <= keysz; i++)
367 : {
368 : Datum datum;
369 : bool isNull;
370 : int32 result;
371 :
372 3162600 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
373 :
374 : /* see comments about NULLs handling in btbuild */
375 3162600 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
376 : {
377 62 : if (isNull)
378 14 : result = 0; /* NULL "=" NULL */
379 48 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
380 20 : result = -1; /* NULL "<" NOT_NULL */
381 : else
382 28 : result = 1; /* NULL ">" NOT_NULL */
383 : }
384 3162538 : else if (isNull) /* key is NOT_NULL and item is NULL */
385 : {
386 18 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
387 0 : result = 1; /* NOT_NULL ">" NULL */
388 : else
389 18 : result = -1; /* NOT_NULL "<" NULL */
390 : }
391 : else
392 : {
393 : /*
394 : * The sk_func needs to be passed the index value as left arg and
395 : * the sk_argument as right arg (they might be of different
396 : * types). Since it is convenient for callers to think of
397 : * _bt_compare as comparing the scankey to the index item, we have
398 : * to flip the sign of the comparison result. (Unless it's a DESC
399 : * column, in which case we *don't* flip the sign.) The flip must not be a plain negation: a comparison function may return INT_MIN (e.g. by passing through a memcmp()-style result), and negating that overflows. Only the sign matters to callers, so map any negative result to 1.
400 : */
401 3162520 : result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
402 : datum,
403 : scankey->sk_argument));
404 :
405 3162520 : if (!(scankey->sk_flags & SK_BT_DESC))
406 3162520 : result = (result < 0) ? 1 : -result;
407 : }
408 :
409 : /* if the keys are unequal, return the difference */
410 3162600 : if (result != 0)
411 2267540 : return result;
412 :
413 895060 : scankey++;
414 : }
415 :
416 : /* if we get here, the keys are equal */
417 223447 : return 0;
418 : }
419 :
420 : /*
421 : * _bt_first() -- Find the first item in a scan.
422 : *
423 : * We need to be clever about the direction of scan, the search
424 : * conditions, and the tree ordering. We find the first item (or,
425 : * if backwards scan, the last item) in the tree that satisfies the
426 : * qualifications in the scan key. On success exit, the page containing
427 : * the current index tuple is pinned but not locked, and data about
428 : * the matching tuple(s) on the page has been loaded into so->currPos,
429 : * and scan->xs_ctup.t_self is set to the heap TID of the current tuple.
430 : *
431 : * If there are no matching items in the index, we return FALSE, with no
432 : * pins or locks held.
433 : *
434 : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
435 : * are both search-type scankeys (see nbtree/README for more about this).
436 : * Within this routine, we build a temporary insertion-type scankey to use
437 : * in locating the scan start position.
438 : */
439 : bool
440 : _bt_first(IndexScanDesc scan, ScanDirection dir)
441 163467 : {
442 163467 : Relation rel = scan->indexRelation;
443 163467 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
444 : Buffer buf;
445 : BTStack stack;
446 : OffsetNumber offnum;
447 : StrategyNumber strat;
448 : bool nextkey;
449 : bool goback;
450 : ScanKey startKeys[INDEX_MAX_KEYS];
451 : ScanKeyData scankeys[INDEX_MAX_KEYS];
452 163467 : int keysCount = 0;
453 : int i;
454 : StrategyNumber strat_total;
455 :
456 163467 : pgstat_count_index_scan(rel);
457 :
458 : /*
459 : * Examine the scan keys and eliminate any redundant keys; also mark the
460 : * keys that must be matched to continue the scan.
461 : */
462 163467 : _bt_preprocess_keys(scan);
463 :
464 : /*
465 : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
466 : * never be satisfied (eg, x == 1 AND x > 2).
467 : */
468 163467 : if (!so->qual_ok)
469 3 : return false;
470 :
471 : /*----------
472 : * Examine the scan keys to discover where we need to start the scan.
473 : *
474 : * We want to identify the keys that can be used as starting boundaries;
475 : * these are =, >, or >= keys for a forward scan or =, <, <= keys for
476 : * a backwards scan. We can use keys for multiple attributes so long as
477 : * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
478 : * a > or < boundary or find an attribute with no boundary (which can be
479 : * thought of as the same as "> -infinity"), we can't use keys for any
480 : * attributes to its right, because it would break our simplistic notion
481 : * of what initial positioning strategy to use.
482 : *
483 : * When the scan keys include cross-type operators, _bt_preprocess_keys
484 : * may not be able to eliminate redundant keys; in such cases we will
485 : * arbitrarily pick a usable one for each attribute. This is correct
486 : * but possibly not optimal behavior. (For example, with keys like
487 : * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
488 : * x=5 would be more efficient.) Since the situation only arises given
489 : * a poorly-worded query plus an incomplete opfamily, live with it.
490 : *
491 : * When both equality and inequality keys appear for a single attribute
492 : * (again, only possible when cross-type operators appear), we *must*
493 : * select one of the equality keys for the starting point, because
494 : * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
495 : * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
496 : * start at x=4, we will fail and stop before reaching x=10. If multiple
497 : * equality quals survive preprocessing, however, it doesn't matter which
498 : * one we use --- by definition, they are either redundant or
499 : * contradictory.
500 : *
501 : * In this loop, row-comparison keys are treated the same as keys on their
502 : * first (leftmost) columns. We'll add on lower-order columns of the row
503 : * comparison below, if possible.
504 : *
505 : * The selected scan keys (at most one per index column) are remembered by
506 : * storing their addresses into the local startKeys[] array.
507 : *----------
508 : */
509 163464 : strat_total = BTEqualStrategyNumber;
510 163464 : if (so->numberOfKeys > 0)
511 : {
512 : AttrNumber curattr;
513 : ScanKey chosen;
514 : ScanKey cur;
515 :
516 : /*
517 : * chosen is the so-far-chosen key for the current attribute, if any.
518 : * We don't cast the decision in stone until we reach keys for the
519 : * next attribute.
520 : */
521 163425 : curattr = 1;
522 163425 : chosen = NULL;
523 :
524 : /*
525 : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
526 : * pass to handle after-last-key processing. Actual exit from the
527 : * loop is at one of the "break" statements below.
528 : */
529 404424 : for (cur = so->keyData, i = 0;; cur++, i++)
530 : {
531 404424 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
532 : {
533 : /*
534 : * Done looking at keys for curattr. If we didn't find a
535 : * usable boundary key, quit; else save the boundary key
536 : * pointer in startKeys.
537 : */
538 240990 : if (chosen == NULL)
539 22 : break;
540 240968 : startKeys[keysCount++] = chosen; /* one entry per consecutive attribute, starting at attribute 1 */
541 :
542 : /*
543 : * Adjust strat_total, and quit if we have stored a > or <
544 : * key.
545 : */
546 240968 : strat = chosen->sk_strategy;
547 240968 : if (strat != BTEqualStrategyNumber)
548 : {
549 10181 : strat_total = strat;
550 10181 : if (strat == BTGreaterStrategyNumber ||
551 : strat == BTLessStrategyNumber)
552 9821 : break;
553 : }
554 :
555 : /*
556 : * Done if that was the last attribute, or if next key is not
557 : * in sequence (implying no boundary key is available for the
558 : * next attribute).
559 : */
560 231147 : if (i >= so->numberOfKeys ||
561 : cur->sk_attno != curattr + 1)
562 : break;
563 :
564 : /*
565 : * Reset for next attr.
566 : */
567 77565 : curattr = cur->sk_attno;
568 77565 : chosen = NULL;
569 : }
570 :
571 : /* Can we use this key as a starting boundary for this attr? */
572 240999 : switch (cur->sk_strategy)
573 : {
574 : case BTLessStrategyNumber:
575 : case BTLessEqualStrategyNumber:
576 34 : if (chosen == NULL && ScanDirectionIsBackward(dir))
577 5 : chosen = cur;
578 : break;
579 : case BTEqualStrategyNumber:
580 : /* override any non-equality choice */
581 230787 : chosen = cur;
582 230787 : break;
583 : case BTGreaterEqualStrategyNumber:
584 : case BTGreaterStrategyNumber:
585 10178 : if (chosen == NULL && ScanDirectionIsForward(dir))
586 10176 : chosen = cur;
587 : break;
588 : }
589 240999 : }
590 : }
591 :
592 : /*
593 : * If we found no usable boundary keys, we have to start from one end of
594 : * the tree. Walk down that edge to the first or last key, and scan from
595 : * there.
596 : */
597 163464 : if (keysCount == 0)
598 61 : return _bt_endpoint(scan, dir);
599 :
600 : /*
601 : * We want to start the scan somewhere within the index. Set up an
602 : * insertion scankey we can use to search for the boundary point we
603 : * identified above. The insertion scankey is built in the local
604 : * scankeys[] array, using the keys identified by startKeys[].
605 : */
606 : Assert(keysCount <= INDEX_MAX_KEYS);
607 404370 : for (i = 0; i < keysCount; i++)
608 : {
609 240968 : ScanKey cur = startKeys[i];
610 :
611 : Assert(cur->sk_attno == i + 1);
612 :
613 240968 : if (cur->sk_flags & SK_ROW_HEADER)
614 : {
615 : /*
616 : * Row comparison header: look to the first row member instead.
617 : *
618 : * The member scankeys are already in insertion format (ie, they
619 : * have sk_func = 3-way-comparison function), but we have to watch
620 : * out for nulls, which _bt_preprocess_keys didn't check. A null
621 : * in the first row member makes the condition unmatchable, just
622 : * like qual_ok = false.
623 : */
624 1 : ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
625 :
626 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
627 1 : if (subkey->sk_flags & SK_ISNULL)
628 0 : return false;
629 1 : memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
630 :
631 : /*
632 : * If the row comparison is the last positioning key we accepted,
633 : * try to add additional keys from the lower-order row members.
634 : * (If we accepted independent conditions on additional index
635 : * columns, we use those instead --- doesn't seem worth trying to
636 : * determine which is more restrictive.) Note that this is OK
637 : * even if the row comparison is of ">" or "<" type, because the
638 : * condition applied to all but the last row member is effectively
639 : * ">=" or "<=", and so the extra keys don't break the positioning
640 : * scheme. But, by the same token, if we aren't able to use all
641 : * the row members, then the part of the row comparison that we
642 : * did use has to be treated as just a ">=" or "<=" condition, and
643 : * so we'd better adjust strat_total accordingly.
644 : */
645 1 : if (i == keysCount - 1)
646 : {
647 1 : bool used_all_subkeys = false;
648 :
649 : Assert(!(subkey->sk_flags & SK_ROW_END));
650 : for (;;)
651 : {
652 1 : subkey++;
653 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
654 1 : if (subkey->sk_attno != keysCount + 1)
655 0 : break; /* out-of-sequence, can't use it */
656 1 : if (subkey->sk_strategy != cur->sk_strategy)
657 0 : break; /* wrong direction, can't use it */
658 1 : if (subkey->sk_flags & SK_ISNULL)
659 0 : break; /* can't use null keys */
660 : Assert(keysCount < INDEX_MAX_KEYS);
661 1 : memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
662 1 : keysCount++;
663 1 : if (subkey->sk_flags & SK_ROW_END)
664 : {
665 1 : used_all_subkeys = true;
666 1 : break;
667 : }
668 : }
669 1 : if (!used_all_subkeys)
670 : {
671 0 : switch (strat_total)
672 : {
673 : case BTLessStrategyNumber:
674 0 : strat_total = BTLessEqualStrategyNumber;
675 0 : break;
676 : case BTGreaterStrategyNumber:
677 0 : strat_total = BTGreaterEqualStrategyNumber;
678 : break;
679 : }
680 : }
681 : break; /* done with outer loop */
682 : }
683 : }
684 : else
685 : {
686 : /*
687 : * Ordinary comparison key. Transform the search-style scan key
688 : * to an insertion scan key by replacing the sk_func with the
689 : * appropriate btree comparison function.
690 : *
691 : * If scankey operator is not a cross-type comparison, we can use
692 : * the cached comparison function; otherwise gotta look it up in
693 : * the catalogs. (That can't lead to infinite recursion, since no
694 : * indexscan initiated by syscache lookup will use cross-data-type
695 : * operators.)
696 : *
697 : * We support the convention that sk_subtype == InvalidOid means
698 : * the opclass input type; this is a hack to simplify life for
699 : * ScanKeyInit().
700 : */
701 481590 : if (cur->sk_subtype == rel->rd_opcintype[i] ||
702 : cur->sk_subtype == InvalidOid)
703 : {
704 : FmgrInfo *procinfo;
705 :
706 240623 : procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
707 240623 : ScanKeyEntryInitializeWithInfo(scankeys + i,
708 : cur->sk_flags,
709 : cur->sk_attno,
710 : InvalidStrategy,
711 : cur->sk_subtype,
712 : procinfo,
713 : cur->sk_argument);
714 : }
715 : else
716 : {
717 : RegProcedure cmp_proc;
718 :
719 344 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
720 : rel->rd_opcintype[i],
721 : cur->sk_subtype,
722 : BTORDER_PROC);
723 344 : if (!RegProcedureIsValid(cmp_proc))
724 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
725 : BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
726 : cur->sk_attno, RelationGetRelationName(rel));
727 344 : ScanKeyEntryInitialize(scankeys + i,
728 : cur->sk_flags,
729 : cur->sk_attno,
730 : InvalidStrategy,
731 : cur->sk_subtype,
732 : cmp_proc,
733 : cur->sk_argument);
734 : }
735 : }
736 : }
737 :
738 : /*----------
739 : * Examine the selected initial-positioning strategy to determine exactly
740 : * where we need to start the scan, and set flag variables to control the
741 : * code below.
742 : *
743 : * If nextkey = false, _bt_search and _bt_binsrch will locate the first
744 : * item >= scan key. If nextkey = true, they will locate the first
745 : * item > scan key.
746 : *
747 : * If goback = true, we will then step back one item, while if
748 : * goback = false, we will start the scan on the located item.
749 : *----------
750 : */
751 163403 : switch (strat_total)
752 : {
753 : case BTLessStrategyNumber:
754 :
755 : /*
756 : * Find first item >= scankey, then back up one to arrive at last
757 : * item < scankey. (Note: this positioning strategy is only used
758 : * for a backward scan, so that is always the correct starting
759 : * position.)
760 : */
761 5 : nextkey = false;
762 5 : goback = true;
763 5 : break;
764 :
765 : case BTLessEqualStrategyNumber:
766 :
767 : /*
768 : * Find first item > scankey, then back up one to arrive at last
769 : * item <= scankey. (Note: this positioning strategy is only used
770 : * for a backward scan, so that is always the correct starting
771 : * position.)
772 : */
773 0 : nextkey = true;
774 0 : goback = true;
775 0 : break;
776 :
777 : case BTEqualStrategyNumber:
778 :
779 : /*
780 : * If a backward scan was specified, need to start with last equal
781 : * item not first one.
782 : */
783 153222 : if (ScanDirectionIsBackward(dir))
784 : {
785 : /*
786 : * This is the same as the <= strategy. We will check at the
787 : * end whether the found item is actually =.
788 : */
789 4 : nextkey = true;
790 4 : goback = true;
791 : }
792 : else
793 : {
794 : /*
795 : * This is the same as the >= strategy. We will check at the
796 : * end whether the found item is actually =.
797 : */
798 153218 : nextkey = false;
799 153218 : goback = false;
800 : }
801 : break;
802 :
803 : case BTGreaterEqualStrategyNumber:
804 :
805 : /*
806 : * Find first item >= scankey. (This is only used for forward
807 : * scans.)
808 : */
809 360 : nextkey = false;
810 360 : goback = false;
811 360 : break;
812 :
813 : case BTGreaterStrategyNumber:
814 :
815 : /*
816 : * Find first item > scankey. (This is only used for forward
817 : * scans.)
818 : */
819 9816 : nextkey = true;
820 9816 : goback = false;
821 9816 : break;
822 :
823 : default:
824 : /* can't get here, but keep compiler quiet */
825 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
826 0 : return false;
827 : }
828 :
829 : /*
830 : * Use the manufactured insertion scan key to descend the tree and
831 : * position ourselves on the target leaf page.
832 : */
833 163403 : stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
834 :
835 : /* the parent stack exists to work back up the tree after a page split (see _bt_search); a scan has no use for it, so free it right away */
836 163403 : _bt_freestack(stack);
837 :
838 : /* remember which buffer we have pinned, if any */
839 163403 : so->currPos.buf = buf;
840 :
841 163403 : if (!BufferIsValid(buf))
842 : {
843 : /* Only get here if index is completely empty */
844 248 : return false;
845 : }
846 :
847 : /* initialize moreLeft/moreRight appropriately for scan direction */
848 163155 : if (ScanDirectionIsForward(dir))
849 : {
850 163146 : so->currPos.moreLeft = false;
851 163146 : so->currPos.moreRight = true;
852 : }
853 : else
854 : {
855 9 : so->currPos.moreLeft = true;
856 9 : so->currPos.moreRight = false;
857 : }
858 163155 : so->numKilled = 0; /* just paranoia */
859 163155 : so->markItemIndex = -1; /* ditto */
860 :
861 : /* position to the precise item on the page */
862 163155 : offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
863 :
864 : /*
865 : * If nextkey = false, we are positioned at the first item >= scan key, or
866 : * possibly at the end of a page on which all the existing items are less
867 : * than the scan key and we know that everything on later pages is greater
868 : * than or equal to scan key.
869 : *
870 : * If nextkey = true, we are positioned at the first item > scan key, or
871 : * possibly at the end of a page on which all the existing items are less
872 : * than or equal to the scan key and we know that everything on later
873 : * pages is greater than scan key.
874 : *
875 : * The actually desired starting point is either this item or the prior
876 : * one, or in the end-of-page case it's the first item on the next page or
877 : * the last item on this page. Adjust the starting offset if needed. (If
878 : * this results in an offset before the first item or after the last one,
879 : * _bt_readpage will report no items found, and then we'll step to the
880 : * next page as needed.)
881 : */
882 163155 : if (goback)
883 9 : offnum = OffsetNumberPrev(offnum);
884 :
885 : /*
886 : * Now load data from the first page of the scan.
887 : */
888 163155 : if (!_bt_readpage(scan, dir, offnum))
889 : {
890 : /*
891 : * There's no actually-matching data on this page. Try to advance to
892 : * the next page. Return false if there's no matching data at all.
893 : */
894 49794 : if (!_bt_steppage(scan, dir))
895 49718 : return false;
896 : }
897 :
898 : /* Drop the lock, but not pin, on the current page */
899 113437 : LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
900 :
901 : /* OK, itemIndex says what to return */
902 113437 : scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
903 :
904 113437 : return true;
905 : }
906 :
907 : /*
908 : * _bt_next() -- Get the next item in a scan.
909 : *
910 : * On entry, so->currPos describes the current page, which is pinned
911 : * but not locked, and so->currPos.itemIndex identifies which item was
912 : * previously returned.
913 : *
914 : * On successful exit, scan->xs_ctup.t_self is set to the TID of the
915 : * next heap tuple, and so->currPos is updated as needed.
916 : *
917 : * On failure exit (no more tuples), we release pin and set
918 : * so->currPos.buf to InvalidBuffer.
919 : */
920 : bool
921 : _bt_next(IndexScanDesc scan, ScanDirection dir)
922 85298 : {
923 85298 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
924 :
925 : /*
926 : * Advance to next tuple on current page; or if there's no more, try to
927 : * step to the next page with data.
928 : */
929 85298 : if (ScanDirectionIsForward(dir))
930 : {
931 84933 : if (++so->currPos.itemIndex > so->currPos.lastItem)
932 : {
933 : /* We must acquire lock before applying _bt_steppage */
934 : Assert(BufferIsValid(so->currPos.buf));
935 17210 : LockBuffer(so->currPos.buf, BT_READ);
936 17210 : if (!_bt_steppage(scan, dir))
937 17061 : return false;
938 : /* Drop the lock, but not pin, on the new page */
939 149 : LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
940 : }
941 : }
942 : else
943 : {
944 365 : if (--so->currPos.itemIndex < so->currPos.firstItem)
945 : {
946 : /* We must acquire lock before applying _bt_steppage */
947 : Assert(BufferIsValid(so->currPos.buf));
948 12 : LockBuffer(so->currPos.buf, BT_READ);
949 12 : if (!_bt_steppage(scan, dir))
950 12 : return false;
951 : /* Drop the lock, but not pin, on the new page */
952 0 : LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
953 : }
954 : }
955 :
956 : /* OK, itemIndex says what to return */
957 68225 : scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
958 :
959 68225 : return true;
960 : }
961 :
962 : /*
963 : * _bt_readpage() -- Load data from current index page into so->currPos
964 : *
965 : * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
966 : * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
967 : * they are updated as appropriate. All other fields of so->currPos are
968 : * initialized from scratch here.
969 : *
970 : * We scan the current page starting at offnum and moving in the indicated
971 : * direction. All items matching the scan keys are loaded into currPos.items.
972 : * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
973 : * that there can be no more matching tuples in the current scan direction.
974 : *
975 : * Returns true if any matching items found on the page, false if none.
976 : */
977 : static bool
978 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
979 163887 : {
980 163887 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
981 : Page page;
982 : BTPageOpaque opaque;
983 : OffsetNumber minoff;
984 : OffsetNumber maxoff;
985 : int itemIndex;
986 : bool continuescan;
987 :
988 : /* we must have the buffer pinned and locked */
989 : Assert(BufferIsValid(so->currPos.buf));
990 :
991 163887 : page = BufferGetPage(so->currPos.buf);
992 163887 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
993 163887 : minoff = P_FIRSTDATAKEY(opaque);
994 163887 : maxoff = PageGetMaxOffsetNumber(page);
995 :
996 : /*
997 : * we must save the page's right-link while scanning it; this tells us
998 : * where to step right to after we're done with these items. There is no
999 : * corresponding need for the left-link, since splits always go right.
1000 : */
1001 163887 : so->currPos.nextPage = opaque->btpo_next;
1002 :
1003 163887 : if (ScanDirectionIsForward(dir))
1004 : {
1005 : /* load items[] in ascending order */
1006 163869 : itemIndex = 0;
1007 :
1008 163869 : offnum = Max(offnum, minoff);
1009 :
1010 814228 : while (offnum <= maxoff)
1011 : {
1012 605296 : if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
1013 : {
1014 : /* tuple passes all scan key conditions, so remember it */
1015 : /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
1016 433450 : so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
1017 433450 : so->currPos.items[itemIndex].indexOffset = offnum;
1018 433450 : itemIndex++;
1019 : }
1020 605296 : if (!continuescan)
1021 : {
1022 : /* there can't be any more matches, so stop */
1023 118806 : so->currPos.moreRight = false;
1024 118806 : break;
1025 : }
1026 :
1027 486490 : offnum = OffsetNumberNext(offnum);
1028 : }
1029 :
1030 : Assert(itemIndex <= MaxIndexTuplesPerPage);
1031 163869 : so->currPos.firstItem = 0;
1032 163869 : so->currPos.lastItem = itemIndex - 1;
1033 163869 : so->currPos.itemIndex = 0;
1034 : }
1035 : else
1036 : {
1037 : /* load items[] in descending order */
1038 18 : itemIndex = MaxIndexTuplesPerPage;
1039 :
1040 18 : offnum = Min(offnum, maxoff);
1041 :
1042 2154 : while (offnum >= minoff)
1043 : {
1044 2120 : if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
1045 : {
1046 : /* tuple passes all scan key conditions, so remember it */
1047 : /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
1048 1969 : itemIndex--;
1049 1969 : so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
1050 1969 : so->currPos.items[itemIndex].indexOffset = offnum;
1051 : }
1052 2120 : if (!continuescan)
1053 : {
1054 : /* there can't be any more matches, so stop */
1055 2 : so->currPos.moreLeft = false;
1056 2 : break;
1057 : }
1058 :
1059 2118 : offnum = OffsetNumberPrev(offnum);
1060 : }
1061 :
1062 : Assert(itemIndex >= 0);
1063 18 : so->currPos.firstItem = itemIndex;
1064 18 : so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1065 18 : so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1066 : }
1067 :
1068 163887 : return (so->currPos.firstItem <= so->currPos.lastItem);
1069 : }
1070 :
1071 : /*
1072 : * _bt_steppage() -- Step to next page containing valid data for scan
1073 : *
1074 : * On entry, so->currPos.buf must be pinned and read-locked. We'll drop
1075 : * the lock and pin before moving to next page.
1076 : *
1077 : * On success exit, we hold pin and read-lock on the next interesting page,
1078 : * and so->currPos is updated to contain data from that page.
1079 : *
1080 : * If there are no more matching records in the given direction, we drop all
1081 : * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
1082 : */
1083 : static bool
1084 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1085 67018 : {
1086 67018 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1087 : Relation rel;
1088 : Page page;
1089 : BTPageOpaque opaque;
1090 :
1091 : /* we must have the buffer pinned and locked */
1092 : Assert(BufferIsValid(so->currPos.buf));
1093 :
1094 : /* Before leaving current page, deal with any killed items */
1095 67018 : if (so->numKilled > 0)
1096 696 : _bt_killitems(scan, true);
1097 :
1098 : /*
1099 : * Before we modify currPos, make a copy of the page data if there was a
1100 : * mark position that needs it.
1101 : */
1102 67018 : if (so->markItemIndex >= 0)
1103 : {
1104 : /* bump pin on current buffer for assignment to mark buffer */
1105 0 : IncrBufferRefCount(so->currPos.buf);
1106 0 : memcpy(&so->markPos, &so->currPos,
1107 : offsetof(BTScanPosData, items[1]) +
1108 : so->currPos.lastItem * sizeof(BTScanPosItem));
1109 0 : so->markPos.itemIndex = so->markItemIndex;
1110 0 : so->markItemIndex = -1;
1111 : }
1112 :
1113 67018 : rel = scan->indexRelation;
1114 :
1115 67018 : if (ScanDirectionIsForward(dir))
1116 : {
1117 : /* Walk right to the next page with data */
1118 : /* We must rely on the previously saved nextPage link! */
1119 67005 : BlockNumber blkno = so->currPos.nextPage;
1120 :
1121 : /* Remember we left a page with data */
1122 67005 : so->currPos.moreLeft = true;
1123 :
1124 : for (;;)
1125 : {
1126 : /* if we're at end of scan, release the buffer and return */
1127 67450 : if (blkno == P_NONE || !so->currPos.moreRight)
1128 : {
1129 66779 : _bt_relbuf(rel, so->currPos.buf);
1130 66779 : so->currPos.buf = InvalidBuffer;
1131 66779 : return false;
1132 : }
1133 : /* step right one page */
1134 671 : so->currPos.buf = _bt_relandgetbuf(rel, so->currPos.buf,
1135 : blkno, BT_READ);
1136 : /* check for deleted page */
1137 671 : page = BufferGetPage(so->currPos.buf);
1138 671 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1139 671 : if (!P_IGNORE(opaque))
1140 : {
1141 : /* see if there are any matches on this page */
1142 : /* note that this will clear moreRight if we can stop */
1143 671 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1144 226 : break;
1145 : }
1146 : /* nope, keep going */
1147 445 : blkno = opaque->btpo_next;
1148 445 : }
1149 : }
1150 : else
1151 : {
1152 : /* Remember we left a page with data */
1153 13 : so->currPos.moreRight = true;
1154 :
1155 : /*
1156 : * Walk left to the next page with data. This is much more complex
1157 : * than the walk-right case because of the possibility that the page
1158 : * to our left splits while we are in flight to it, plus the
1159 : * possibility that the page we were on gets deleted after we leave
1160 : * it. See nbtree/README for details.
1161 : */
1162 : for (;;)
1163 : {
1164 : /* Done if we know there are no matching keys to the left */
1165 13 : if (!so->currPos.moreLeft)
1166 : {
1167 7 : _bt_relbuf(rel, so->currPos.buf);
1168 7 : so->currPos.buf = InvalidBuffer;
1169 7 : return false;
1170 : }
1171 :
1172 : /* Step to next physical page */
1173 6 : so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
1174 :
1175 : /* if we're physically at end of index, return failure */
1176 6 : if (so->currPos.buf == InvalidBuffer)
1177 6 : return false;
1178 :
1179 : /*
1180 : * Okay, we managed to move left to a non-deleted page. Done if
1181 : * it's not half-dead and contains matching tuples. Else loop back
1182 : * and do it all again.
1183 : */
1184 0 : page = BufferGetPage(so->currPos.buf);
1185 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1186 0 : if (!P_IGNORE(opaque))
1187 : {
1188 : /* see if there are any matches on this page */
1189 : /* note that this will clear moreLeft if we can stop */
1190 0 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1191 0 : break;
1192 : }
1193 : }
1194 : }
1195 :
1196 226 : return true;
1197 : }
1198 :
1199 : /*
1200 : * _bt_walk_left() -- step left one page, if possible
1201 : *
1202 : * The given buffer must be pinned and read-locked. This will be dropped
1203 : * before stepping left. On return, we have pin and read lock on the
1204 : * returned page, instead.
1205 : *
1206 : * Returns InvalidBuffer if there is no page to the left (no lock is held
1207 : * in that case).
1208 : *
1209 : * When working on a non-leaf level, it is possible for the returned page
1210 : * to be half-dead; the caller should check that condition and step left
1211 : * again if it's important.
1212 : */
1213 : static Buffer
1214 : _bt_walk_left(Relation rel, Buffer buf)
1215 6 : {
1216 : Page page;
1217 : BTPageOpaque opaque;
1218 :
1219 6 : page = BufferGetPage(buf);
1220 6 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1221 :
1222 : for (;;)
1223 : {
1224 : BlockNumber obknum;
1225 : BlockNumber lblkno;
1226 : BlockNumber blkno;
1227 : int tries;
1228 :
1229 : /* if we're at end of tree, release buf and return failure */
1230 6 : if (P_LEFTMOST(opaque))
1231 : {
1232 6 : _bt_relbuf(rel, buf);
1233 : break;
1234 : }
1235 : /* remember original page we are stepping left from */
1236 0 : obknum = BufferGetBlockNumber(buf);
1237 : /* step left */
1238 0 : blkno = lblkno = opaque->btpo_prev;
1239 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1240 0 : page = BufferGetPage(buf);
1241 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1242 :
1243 : /*
1244 : * If this isn't the page we want, walk right till we find what we
1245 : * want --- but go no more than four hops (an arbitrary limit). If we
1246 : * don't find the correct page by then, the most likely bet is that
1247 : * the original page got deleted and isn't in the sibling chain at all
1248 : * anymore, not that its left sibling got split more than four times.
1249 : *
1250 : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1251 : * because half-dead pages are still in the sibling chain. Caller
1252 : * must reject half-dead pages if wanted.
1253 : */
1254 0 : tries = 0;
1255 : for (;;)
1256 : {
1257 0 : if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1258 : {
1259 : /* Found desired page, return it */
1260 0 : return buf;
1261 : }
1262 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
1263 : break;
1264 0 : blkno = opaque->btpo_next;
1265 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1266 0 : page = BufferGetPage(buf);
1267 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1268 0 : }
1269 :
1270 : /* Return to the original page to see what's up */
1271 0 : buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1272 0 : page = BufferGetPage(buf);
1273 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1274 0 : if (P_ISDELETED(opaque))
1275 : {
1276 : /*
1277 : * It was deleted. Move right to first nondeleted page (there
1278 : * must be one); that is the page that has acquired the deleted
1279 : * one's keyspace, so stepping left from it will take us where we
1280 : * want to be.
1281 : */
1282 : for (;;)
1283 : {
1284 0 : if (P_RIGHTMOST(opaque))
1285 0 : elog(ERROR, "fell off the end of index \"%s\"",
1286 : RelationGetRelationName(rel));
1287 0 : blkno = opaque->btpo_next;
1288 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1289 0 : page = BufferGetPage(buf);
1290 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1291 0 : if (!P_ISDELETED(opaque))
1292 0 : break;
1293 : }
1294 :
1295 : /*
1296 : * Now return to top of loop, resetting obknum to point to this
1297 : * nondeleted page, and try again.
1298 : */
1299 : }
1300 : else
1301 : {
1302 : /*
1303 : * It wasn't deleted; the explanation had better be that the page
1304 : * to the left got split or deleted. Without this check, we'd go
1305 : * into an infinite loop if there's anything wrong.
1306 : */
1307 0 : if (opaque->btpo_prev == lblkno)
1308 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1309 : obknum, RelationGetRelationName(rel));
1310 : /* Okay to try again with new lblkno value */
1311 : }
1312 : }
1313 :
1314 6 : return InvalidBuffer;
1315 : }
1316 :
1317 : /*
1318 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
1319 : *
1320 : * If the index is empty, we will return InvalidBuffer; any other failure
1321 : * condition causes ereport(). We will not return a dead page.
1322 : *
1323 : * The returned buffer is pinned and read-locked.
1324 : */
1325 : Buffer
1326 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
1327 61 : {
1328 : Buffer buf;
1329 : Page page;
1330 : BTPageOpaque opaque;
1331 : OffsetNumber offnum;
1332 : BlockNumber blkno;
1333 : IndexTuple itup;
1334 :
1335 : /*
1336 : * If we are looking for a leaf page, okay to descend from fast root;
1337 : * otherwise better descend from true root. (There is no point in being
1338 : * smarter about intermediate levels.)
1339 : */
1340 61 : if (level == 0)
1341 61 : buf = _bt_getroot(rel, BT_READ);
1342 : else
1343 0 : buf = _bt_gettrueroot(rel);
1344 :
1345 61 : if (!BufferIsValid(buf))
1346 : {
1347 : /* empty index... */
1348 0 : return InvalidBuffer;
1349 : }
1350 :
1351 61 : page = BufferGetPage(buf);
1352 61 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1353 :
1354 : for (;;)
1355 : {
1356 : /*
1357 : * If we landed on a deleted page, step right to find a live page
1358 : * (there must be one). Also, if we want the rightmost page, step
1359 : * right if needed to get to it (this could happen if the page split
1360 : * since we obtained a pointer to it).
1361 : */
1362 93 : while (P_IGNORE(opaque) ||
1363 : (rightmost && !P_RIGHTMOST(opaque)))
1364 : {
1365 0 : blkno = opaque->btpo_next;
1366 0 : if (blkno == P_NONE)
1367 0 : elog(ERROR, "fell off the end of index \"%s\"",
1368 : RelationGetRelationName(rel));
1369 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1370 0 : page = BufferGetPage(buf);
1371 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1372 : }
1373 :
1374 : /* Done? */
1375 93 : if (opaque->btpo.level == level)
1376 61 : break;
1377 32 : if (opaque->btpo.level < level)
1378 0 : elog(ERROR, "btree level %u not found in index \"%s\"",
1379 : level, RelationGetRelationName(rel));
1380 :
1381 : /* Descend to leftmost or rightmost child page */
1382 32 : if (rightmost)
1383 5 : offnum = PageGetMaxOffsetNumber(page);
1384 : else
1385 27 : offnum = P_FIRSTDATAKEY(opaque);
1386 :
1387 32 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1388 32 : blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1389 :
1390 32 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1391 32 : page = BufferGetPage(buf);
1392 32 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1393 32 : }
1394 :
1395 61 : return buf;
1396 : }
1397 :
1398 : /*
1399 : * _bt_endpoint() -- Find the first or last page in the index, and scan
1400 : * from there to the first key satisfying all the quals.
1401 : *
1402 : * This is used by _bt_first() to set up a scan when we've determined
1403 : * that the scan must start at the beginning or end of the index (for
1404 : * a forward or backward scan respectively). Exit conditions are the
1405 : * same as for _bt_first().
1406 : */
1407 : static bool
1408 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1409 61 : {
1410 61 : Relation rel = scan->indexRelation;
1411 61 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1412 : Buffer buf;
1413 : Page page;
1414 : BTPageOpaque opaque;
1415 : OffsetNumber start;
1416 :
1417 : /*
1418 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
1419 : * version of _bt_search(). We don't maintain a stack since we know we
1420 : * won't need it.
1421 : */
1422 61 : buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
1423 :
1424 61 : if (!BufferIsValid(buf))
1425 : {
1426 : /* empty index... */
1427 0 : so->currPos.buf = InvalidBuffer;
1428 0 : return false;
1429 : }
1430 :
1431 61 : page = BufferGetPage(buf);
1432 61 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1433 : Assert(P_ISLEAF(opaque));
1434 :
1435 61 : if (ScanDirectionIsForward(dir))
1436 : {
1437 : /* There could be dead pages to the left, so not this: */
1438 : /* Assert(P_LEFTMOST(opaque)); */
1439 :
1440 52 : start = P_FIRSTDATAKEY(opaque);
1441 : }
1442 9 : else if (ScanDirectionIsBackward(dir))
1443 : {
1444 : Assert(P_RIGHTMOST(opaque));
1445 :
1446 9 : start = PageGetMaxOffsetNumber(page);
1447 : }
1448 : else
1449 : {
1450 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
1451 0 : start = 0; /* keep compiler quiet */
1452 : }
1453 :
1454 : /* remember which buffer we have pinned */
1455 61 : so->currPos.buf = buf;
1456 :
1457 : /* initialize moreLeft/moreRight appropriately for scan direction */
1458 61 : if (ScanDirectionIsForward(dir))
1459 : {
1460 52 : so->currPos.moreLeft = false;
1461 52 : so->currPos.moreRight = true;
1462 : }
1463 : else
1464 : {
1465 9 : so->currPos.moreLeft = true;
1466 9 : so->currPos.moreRight = false;
1467 : }
1468 61 : so->numKilled = 0; /* just paranoia */
1469 61 : so->markItemIndex = -1; /* ditto */
1470 :
1471 : /*
1472 : * Now load data from the first page of the scan.
1473 : */
1474 61 : if (!_bt_readpage(scan, dir, start))
1475 : {
1476 : /*
1477 : * There's no actually-matching data on this page. Try to advance to
1478 : * the next page. Return false if there's no matching data at all.
1479 : */
1480 2 : if (!_bt_steppage(scan, dir))
1481 1 : return false;
1482 : }
1483 :
1484 : /* Drop the lock, but not pin, on the current page */
1485 60 : LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1486 :
1487 : /* OK, itemIndex says what to return */
1488 60 : scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
1489 :
1490 60 : return true;
1491 : }
|