1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.155 2007/05/30 20:11:53 tgl Exp $
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/genam.h"
22 : #include "access/nbtree.h"
23 : #include "catalog/index.h"
24 : #include "commands/vacuum.h"
25 : #include "storage/freespace.h"
26 : #include "storage/lmgr.h"
27 : #include "utils/memutils.h"
28 :
29 :
30 : /* Working state for btbuild and its callback */
31 : typedef struct
32 : {
33 : bool isUnique;
34 : bool haveDead;
35 : Relation heapRel;
36 : BTSpool *spool;
37 :
38 : /*
39 : * spool2 is needed only when the index is a unique index. Dead tuples
40 : * are put into spool2 instead of spool in order to avoid the uniqueness
41 : * check.
42 : */
43 : BTSpool *spool2;
44 : double indtuples;
45 : } BTBuildState;
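/*
 * Note on the fields above: haveDead becomes true as soon as any dead tuple
 * is written to spool2, which lets btbuild cheaply discard an unused spool2;
 * indtuples counts every tuple added to either spool and is reported back as
 * IndexBuildResult.index_tuples.
 */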
46 :
47 : /* Working state needed by btvacuumpage */
48 : typedef struct
49 : {
50 : IndexVacuumInfo *info;
51 : IndexBulkDeleteResult *stats;
52 : IndexBulkDeleteCallback callback;
53 : void *callback_state;
54 : BTCycleId cycleid;
55 : BlockNumber *freePages;
56 : int nFreePages; /* number of entries in freePages[] */
57 : int maxFreePages; /* allocated size of freePages[] */
58 : BlockNumber totFreePages; /* true total # of free pages */
59 : MemoryContext pagedelcontext;
60 : } BTVacState;
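/*
 * Note on the free-page fields above: freePages[] remembers at most
 * maxFreePages block numbers (the array is sized in btvacuumscan and never
 * exceeds MaxFSMPages), while totFreePages counts every recyclable page seen,
 * including any that did not fit in the array.  Both the array and the true
 * total are handed to RecordIndexFreeSpace at the end of the scan.
 */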
61 :
62 :
63 : static void btbuildCallback(Relation index,
64 : HeapTuple htup,
65 : Datum *values,
66 : bool *isnull,
67 : bool tupleIsAlive,
68 : void *state);
69 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
70 : IndexBulkDeleteCallback callback, void *callback_state,
71 : BTCycleId cycleid);
72 : static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
73 : BlockNumber orig_blkno);
74 :
75 :
76 : /*
77 : * btbuild() -- build a new btree index.
78 : */
79 : Datum
80 : btbuild(PG_FUNCTION_ARGS)
81 560 : {
82 560 : Relation heap = (Relation) PG_GETARG_POINTER(0);
83 560 : Relation index = (Relation) PG_GETARG_POINTER(1);
84 560 : IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
85 : IndexBuildResult *result;
86 : double reltuples;
87 : BTBuildState buildstate;
88 :
89 560 : buildstate.isUnique = indexInfo->ii_Unique;
90 560 : buildstate.haveDead = false;
91 560 : buildstate.heapRel = heap;
92 560 : buildstate.spool = NULL;
93 560 : buildstate.spool2 = NULL;
94 560 : buildstate.indtuples = 0;
95 :
96 : #ifdef BTREE_BUILD_STATS
97 : if (log_btree_build_stats)
98 : ResetUsage();
99 : #endif /* BTREE_BUILD_STATS */
100 :
101 : /*
102 : * We expect to be called exactly once for any index relation. If that's
103 : * not the case, we have big trouble.
104 : */
105 560 : if (RelationGetNumberOfBlocks(index) != 0)
106 0 : elog(ERROR, "index \"%s\" already contains data",
107 : RelationGetRelationName(index));
108 :
109 560 : buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);
110 :
111 : /*
112 : * If building a unique index, put dead tuples in a second spool to keep
113 : * them out of the uniqueness check.
114 : */
115 560 : if (indexInfo->ii_Unique)
116 503 : buildstate.spool2 = _bt_spoolinit(index, false, true);
117 :
118 : /* do the heap scan */
119 560 : reltuples = IndexBuildHeapScan(heap, index, indexInfo,
120 : btbuildCallback, (void *) &buildstate);
121 :
122 : /* okay, all heap tuples are indexed */
123 560 : if (buildstate.spool2 && !buildstate.haveDead)
124 : {
125 : /* spool2 turns out to be unnecessary */
126 500 : _bt_spooldestroy(buildstate.spool2);
127 500 : buildstate.spool2 = NULL;
128 : }
129 :
130 : /*
131 : * Finish the build by (1) completing the sort of the spool file, (2)
132 : * inserting the sorted tuples into btree pages and (3) building the upper
133 : * levels.
134 : */
135 560 : _bt_leafbuild(buildstate.spool, buildstate.spool2);
136 557 : _bt_spooldestroy(buildstate.spool);
137 557 : if (buildstate.spool2)
138 3 : _bt_spooldestroy(buildstate.spool2);
139 :
140 : #ifdef BTREE_BUILD_STATS
141 : if (log_btree_build_stats)
142 : {
143 : ShowUsage("BTREE BUILD STATS");
144 : ResetUsage();
145 : }
146 : #endif /* BTREE_BUILD_STATS */
147 :
148 : /*
149 : * If we are reindexing a pre-existing index, it is critical to send out a
150 : * relcache invalidation SI message to ensure all backends re-read the
151 : * index metapage. We expect that the caller will ensure that happens
152 : * (typically as a side effect of updating index stats, but it must happen
153 : * even if the stats don't change!)
154 : */
155 :
156 : /*
157 : * Return statistics
158 : */
159 557 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
160 :
161 557 : result->heap_tuples = reltuples;
162 557 : result->index_tuples = buildstate.indtuples;
163 :
164 557 : PG_RETURN_POINTER(result);
165 : }
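/*
 * Illustrative sketch (not part of this file): like the other index AM entry
 * points here, btbuild is reached through the fmgr interface rather than by a
 * direct C call.  A caller such as index_build() invokes it approximately as
 *
 *     result = (IndexBuildResult *)
 *         DatumGetPointer(OidFunctionCall3(procedure,
 *                                          PointerGetDatum(heapRel),
 *                                          PointerGetDatum(indexRel),
 *                                          PointerGetDatum(indexInfo)));
 *
 * where "procedure" stands for the pg_am ambuild regproc; the exact call
 * sequence in catalog/index.c may differ, so treat this only as a sketch.
 */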
166 :
167 : /*
168 : * Per-tuple callback from IndexBuildHeapScan
169 : */
170 : static void
171 : btbuildCallback(Relation index,
172 : HeapTuple htup,
173 : Datum *values,
174 : bool *isnull,
175 : bool tupleIsAlive,
176 : void *state)
177 173938 : {
178 173938 : BTBuildState *buildstate = (BTBuildState *) state;
179 : IndexTuple itup;
180 :
181 : /* form an index tuple and point it at the heap tuple */
182 173938 : itup = index_form_tuple(RelationGetDescr(index), values, isnull);
183 173938 : itup->t_tid = htup->t_self;
184 :
185 : /*
186 : * insert the index tuple into the appropriate spool file for subsequent
187 : * processing
188 : */
189 347865 : if (tupleIsAlive || buildstate->spool2 == NULL)
190 173927 : _bt_spool(itup, buildstate->spool);
191 : else
192 : {
193 : /* dead tuples are put into spool2 */
194 11 : buildstate->haveDead = true;
195 11 : _bt_spool(itup, buildstate->spool2);
196 : }
197 :
198 173938 : buildstate->indtuples += 1;
199 :
200 173938 : pfree(itup);
201 173938 : }
202 :
203 : /*
204 : * btinsert() -- insert an index tuple into a btree.
205 : *
206 : * Descend the tree recursively, find the appropriate location for our
207 : * new tuple, and put it there.
208 : */
209 : Datum
210 : btinsert(PG_FUNCTION_ARGS)
211 70109 : {
212 70109 : Relation rel = (Relation) PG_GETARG_POINTER(0);
213 70109 : Datum *values = (Datum *) PG_GETARG_POINTER(1);
214 70109 : bool *isnull = (bool *) PG_GETARG_POINTER(2);
215 70109 : ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
216 70109 : Relation heapRel = (Relation) PG_GETARG_POINTER(4);
217 70109 : bool checkUnique = PG_GETARG_BOOL(5);
218 : IndexTuple itup;
219 :
220 : /* generate an index tuple */
221 70109 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
222 70109 : itup->t_tid = *ht_ctid;
223 :
224 70109 : _bt_doinsert(rel, itup, checkUnique, heapRel);
225 :
226 70087 : pfree(itup);
227 :
228 70087 : PG_RETURN_BOOL(true);
229 : }
230 :
231 : /*
232 : * btgettuple() -- Get the next tuple in the scan.
233 : */
234 : Datum
235 : btgettuple(PG_FUNCTION_ARGS)
236 248122 : {
237 248122 : IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
238 248122 : ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
239 248122 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
240 : bool res;
241 :
242 : /*
243 : * If we've already initialized this scan, we can just advance it in the
244 : * appropriate direction. If we haven't done so yet, we call a routine to
245 : * get the first item in the scan.
246 : */
247 333105 : if (BTScanPosIsValid(so->currPos))
248 : {
249 : /*
250 : * Check to see if we should kill the previously-fetched tuple.
251 : */
252 84983 : if (scan->kill_prior_tuple)
253 : {
254 : /*
255 : * Yes, remember it for later. (We'll deal with all such tuples
256 : * at once right before leaving the index page.) The test for
257 : * numKilled overrun is not just paranoia: if the caller reverses
258 : * direction in the indexscan then the same item might get entered
259 : * multiple times. It's not worth trying to optimize that, so we
260 : * don't detect it, but instead just forget any excess entries.
261 : */
262 1697 : if (so->killedItems == NULL)
263 930 : so->killedItems = (int *)
264 : palloc(MaxIndexTuplesPerPage * sizeof(int));
265 1697 : if (so->numKilled < MaxIndexTuplesPerPage)
266 1697 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
267 : }
268 :
269 : /*
270 : * Now continue the scan.
271 : */
272 84983 : res = _bt_next(scan, dir);
273 : }
274 : else
275 163139 : res = _bt_first(scan, dir);
276 :
277 248122 : PG_RETURN_BOOL(res);
278 : }
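/*
 * Illustrative sketch (not part of this file): a typical caller drives
 * btgettuple in a loop through the fmgr interface, roughly
 *
 *     while (DatumGetBool(FunctionCall2(&gettuple_finfo,
 *                                       PointerGetDatum(scan),
 *                                       Int32GetDatum(ForwardScanDirection))))
 *         process(scan->xs_ctup.t_self);      -- matching heap TID
 *
 * "gettuple_finfo" and "process" are hypothetical names used only for this
 * sketch; real callers go through index_getnext() in access/index/indexam.c.
 */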
279 :
280 : /*
281 : * btgetmulti() -- get multiple tuples at once
282 : *
283 : * In the current implementation there seems to be no strong reason to stop at
284 : * index page boundaries; we just press on until we fill the caller's buffer
285 : * or run out of matches.
286 : */
287 : Datum
288 : btgetmulti(PG_FUNCTION_ARGS)
289 328 : {
290 328 : IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
291 328 : ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
292 328 : int32 max_tids = PG_GETARG_INT32(2);
293 328 : int32 *returned_tids = (int32 *) PG_GETARG_POINTER(3);
294 328 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
295 328 : bool res = true;
296 328 : int32 ntids = 0;
297 :
298 328 : if (max_tids <= 0) /* behave correctly in boundary case */
299 0 : PG_RETURN_BOOL(true);
300 :
301 : /* If we haven't started the scan yet, fetch the first page & tuple. */
302 328 : if (!BTScanPosIsValid(so->currPos))
303 : {
304 328 : res = _bt_first(scan, ForwardScanDirection);
305 328 : if (!res)
306 : {
307 : /* empty scan */
308 18 : *returned_tids = ntids;
309 18 : PG_RETURN_BOOL(res);
310 : }
311 : /* Save tuple ID, and continue scanning */
312 310 : tids[ntids] = scan->xs_ctup.t_self;
313 310 : ntids++;
314 : }
315 :
316 2430 : while (ntids < max_tids)
317 : {
318 : /*
319 : * Advance to next tuple within page. This is the same as the easy
320 : * case in _bt_next().
321 : */
322 2430 : if (++so->currPos.itemIndex > so->currPos.lastItem)
323 : {
324 : /* let _bt_next do the heavy lifting */
325 315 : res = _bt_next(scan, ForwardScanDirection);
326 315 : if (!res)
327 310 : break;
328 : }
329 :
330 : /* Save tuple ID, and continue scanning */
331 2120 : tids[ntids] = so->currPos.items[so->currPos.itemIndex].heapTid;
332 2120 : ntids++;
333 : }
334 :
335 310 : *returned_tids = ntids;
336 310 : PG_RETURN_BOOL(res);
337 : }
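/*
 * Illustrative note: btgetmulti implements the amgetmulti interface used by
 * bitmap index scans.  A caller invokes it repeatedly, consuming up to
 * max_tids TIDs per call, until the returned boolean is false (the scan is
 * exhausted); a false return can still hand back some TIDs via
 * *returned_tids.
 */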
338 :
339 : /*
340 : * btbeginscan() -- start a scan on a btree index
341 : */
342 : Datum
343 : btbeginscan(PG_FUNCTION_ARGS)
344 118464 : {
345 118464 : Relation rel = (Relation) PG_GETARG_POINTER(0);
346 118464 : int keysz = PG_GETARG_INT32(1);
347 118464 : ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2);
348 : IndexScanDesc scan;
349 :
350 : /* get the scan */
351 118464 : scan = RelationGetIndexScan(rel, keysz, scankey);
352 :
353 118464 : PG_RETURN_POINTER(scan);
354 : }
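/*
 * Note that btbeginscan only creates the IndexScanDesc; the BTScanOpaque
 * state is allocated lazily by the first btrescan call (see the so == NULL
 * branch below).  The overall lifecycle is btbeginscan -> btrescan ->
 * btgettuple/btgetmulti ... -> btendscan, with btmarkpos/btrestrpos usable in
 * between.
 */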
355 :
356 : /*
357 : * btrescan() -- rescan an index relation
358 : */
359 : Datum
360 : btrescan(PG_FUNCTION_ARGS)
361 167793 : {
362 167793 : IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
363 167793 : ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
364 : BTScanOpaque so;
365 :
366 167793 : so = (BTScanOpaque) scan->opaque;
367 :
368 167793 : if (so == NULL) /* if called from btbeginscan */
369 : {
370 118464 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
371 118464 : so->currPos.buf = so->markPos.buf = InvalidBuffer;
372 118464 : if (scan->numberOfKeys > 0)
373 118424 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
374 : else
375 40 : so->keyData = NULL;
376 118464 : so->killedItems = NULL; /* until needed */
377 118464 : so->numKilled = 0;
378 118464 : scan->opaque = so;
379 : }
380 :
381 : /* we aren't holding any read locks, but gotta drop the pins */
382 167793 : if (BTScanPosIsValid(so->currPos))
383 : {
384 : /* Before leaving current page, deal with any killed items */
385 26340 : if (so->numKilled > 0)
386 0 : _bt_killitems(scan, false);
387 26340 : ReleaseBuffer(so->currPos.buf);
388 26340 : so->currPos.buf = InvalidBuffer;
389 : }
390 :
391 167793 : if (BTScanPosIsValid(so->markPos))
392 : {
393 0 : ReleaseBuffer(so->markPos.buf);
394 0 : so->markPos.buf = InvalidBuffer;
395 : }
396 167793 : so->markItemIndex = -1;
397 :
398 : /*
399 : * Reset the scan keys. Note that the key-ordering work has moved to _bt_first.
400 : * - vadim 05/05/97
401 : */
402 167793 : if (scankey && scan->numberOfKeys > 0)
403 167753 : memmove(scan->keyData,
404 : scankey,
405 : scan->numberOfKeys * sizeof(ScanKeyData));
406 167793 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
407 :
408 167793 : PG_RETURN_VOID();
409 : }
410 :
411 : /*
412 : * btendscan() -- close down a scan
413 : */
414 : Datum
415 : btendscan(PG_FUNCTION_ARGS)
416 118438 : {
417 118438 : IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
418 118438 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
419 :
420 : /* we aren't holding any read locks, but gotta drop the pins */
421 118438 : if (BTScanPosIsValid(so->currPos))
422 : {
423 : /* Before leaving current page, deal with any killed items */
424 70075 : if (so->numKilled > 0)
425 238 : _bt_killitems(scan, false);
426 70075 : ReleaseBuffer(so->currPos.buf);
427 70075 : so->currPos.buf = InvalidBuffer;
428 : }
429 :
430 118438 : if (BTScanPosIsValid(so->markPos))
431 : {
432 0 : ReleaseBuffer(so->markPos.buf);
433 0 : so->markPos.buf = InvalidBuffer;
434 : }
435 118438 : so->markItemIndex = -1;
436 :
437 118438 : if (so->killedItems != NULL)
438 928 : pfree(so->killedItems);
439 118438 : if (so->keyData != NULL)
440 118400 : pfree(so->keyData);
441 118438 : pfree(so);
442 :
443 118438 : PG_RETURN_VOID();
444 : }
445 :
446 : /*
447 : * btmarkpos() -- save current scan position
448 : */
449 : Datum
450 : btmarkpos(PG_FUNCTION_ARGS)
451 0 : {
452 0 : IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
453 0 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
454 :
455 : /* we aren't holding any read locks, but gotta drop the pin */
456 0 : if (BTScanPosIsValid(so->markPos))
457 : {
458 0 : ReleaseBuffer(so->markPos.buf);
459 0 : so->markPos.buf = InvalidBuffer;
460 : }
461 :
462 : /*
463 : * Just record the current itemIndex. If we later step to next page
464 : * before releasing the marked position, _bt_steppage makes a full copy of
465 : * the currPos struct in markPos. If (as often happens) the mark is moved
466 : * before we leave the page, we don't have to do that work.
467 : */
468 0 : if (BTScanPosIsValid(so->currPos))
469 0 : so->markItemIndex = so->currPos.itemIndex;
470 : else
471 0 : so->markItemIndex = -1;
472 :
473 0 : PG_RETURN_VOID();
474 : }
475 :
476 : /*
477 : * btrestrpos() -- restore scan to last saved position
478 : */
479 : Datum
480 : btrestrpos(PG_FUNCTION_ARGS)
481 0 : {
482 0 : IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
483 0 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
484 :
485 0 : if (so->markItemIndex >= 0)
486 : {
487 : /*
488 : * The mark position is on the same page we are currently on. Just
489 : * restore the itemIndex.
490 : */
491 0 : so->currPos.itemIndex = so->markItemIndex;
492 : }
493 : else
494 : {
495 : /* we aren't holding any read locks, but gotta drop the pin */
496 0 : if (BTScanPosIsValid(so->currPos))
497 : {
498 : /* Before leaving current page, deal with any killed items */
499 0 : if (so->numKilled > 0 &&
500 : so->currPos.buf != so->markPos.buf)
501 0 : _bt_killitems(scan, false);
502 0 : ReleaseBuffer(so->currPos.buf);
503 0 : so->currPos.buf = InvalidBuffer;
504 : }
505 :
506 0 : if (BTScanPosIsValid(so->markPos))
507 : {
508 : /* bump pin on mark buffer for assignment to current buffer */
509 0 : IncrBufferRefCount(so->markPos.buf);
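			/*
			 * The memcpy below copies just the valid prefix of markPos: the
			 * fixed-size header plus items[0 .. lastItem].  offsetof(...,
			 * items[1]) is the struct size with exactly one item, so adding
			 * lastItem further BTScanPosItems covers the used part of the
			 * array.
			 */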
510 0 : memcpy(&so->currPos, &so->markPos,
511 : offsetof(BTScanPosData, items[1]) +
512 : so->markPos.lastItem * sizeof(BTScanPosItem));
513 : }
514 : }
515 :
516 0 : PG_RETURN_VOID();
517 : }
518 :
519 : /*
520 : * Bulk deletion of all index entries pointing to a set of heap tuples.
521 : * The set of target tuples is specified via a callback routine that tells
522 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
523 : *
524 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
525 : */
526 : Datum
527 : btbulkdelete(PG_FUNCTION_ARGS)
528 70 : {
529 70 : IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
530 70 : IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
531 70 : IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
532 70 : void *callback_state = (void *) PG_GETARG_POINTER(3);
533 70 : Relation rel = info->index;
534 : BTCycleId cycleid;
535 :
536 : /* allocate stats if first time through, else re-use existing struct */
537 70 : if (stats == NULL)
538 70 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
539 :
540 : /* Establish the vacuum cycle ID to use for this scan */
541 70 : PG_TRY();
542 : {
543 70 : cycleid = _bt_start_vacuum(rel);
544 :
545 70 : btvacuumscan(info, stats, callback, callback_state, cycleid);
546 :
547 70 : _bt_end_vacuum(rel);
548 : }
549 0 : PG_CATCH();
550 : {
551 : /* Make sure shared memory gets cleaned up */
552 0 : _bt_end_vacuum(rel);
553 0 : PG_RE_THROW();
554 : }
555 70 : PG_END_TRY();
556 :
557 70 : PG_RETURN_POINTER(stats);
558 : }
559 :
560 : /*
561 : * Post-VACUUM cleanup.
562 : *
563 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
564 : */
565 : Datum
566 : btvacuumcleanup(PG_FUNCTION_ARGS)
567 366 : {
568 366 : IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
569 366 : IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
570 :
571 : /*
572 : * If btbulkdelete was called, we need not do anything, just return the
573 : * stats from the latest btbulkdelete call. If it wasn't called, we must
574 : * still do a pass over the index, to recycle any newly-recyclable pages
575 : * and to obtain index statistics.
576 : *
577 : * Since we aren't going to actually delete any leaf items, there's no
578 : * need to go through all the vacuum-cycle-ID pushups.
579 : */
580 366 : if (stats == NULL)
581 : {
582 301 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
583 301 : btvacuumscan(info, stats, NULL, NULL, 0);
584 : }
585 :
586 : /*
587 : * During a non-FULL vacuum it's quite possible for us to be fooled by
588 : * concurrent page splits into double-counting some index tuples, so
589 : * disbelieve any total that exceeds the underlying heap's count. (We
590 : * can't check this during btbulkdelete.)
591 : */
592 366 : if (!info->vacuum_full)
593 : {
594 263 : if (stats->num_index_tuples > info->num_heap_tuples)
595 0 : stats->num_index_tuples = info->num_heap_tuples;
596 : }
597 :
598 366 : PG_RETURN_POINTER(stats);
599 : }
600 :
601 : /*
602 : * btvacuumscan --- scan the index for VACUUMing purposes
603 : *
604 : * This combines the functions of looking for leaf tuples that are deletable
605 : * according to the vacuum callback, looking for empty pages that can be
606 : * deleted, and looking for old deleted pages that can be recycled. Both
607 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
608 : * btbulkdelete call occurred).
609 : *
610 : * The caller is responsible for initially allocating/zeroing a stats struct
611 : * and for obtaining a vacuum cycle ID if necessary.
612 : */
613 : static void
614 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
615 : IndexBulkDeleteCallback callback, void *callback_state,
616 : BTCycleId cycleid)
617 371 : {
618 371 : Relation rel = info->index;
619 : BTVacState vstate;
620 : BlockNumber num_pages;
621 : BlockNumber blkno;
622 : bool needLock;
623 :
624 : /*
625 : * Reset counts that will be incremented during the scan; needed in case
626 : * of multiple scans during a single VACUUM command
627 : */
628 371 : stats->num_index_tuples = 0;
629 371 : stats->pages_deleted = 0;
630 :
631 : /* Set up info to pass down to btvacuumpage */
632 371 : vstate.info = info;
633 371 : vstate.stats = stats;
634 371 : vstate.callback = callback;
635 371 : vstate.callback_state = callback_state;
636 371 : vstate.cycleid = cycleid;
637 371 : vstate.freePages = NULL; /* temporarily */
638 371 : vstate.nFreePages = 0;
639 371 : vstate.maxFreePages = 0;
640 371 : vstate.totFreePages = 0;
641 :
642 : /* Create a temporary memory context to run _bt_pagedel in */
643 371 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
644 : "_bt_pagedel",
645 : ALLOCSET_DEFAULT_MINSIZE,
646 : ALLOCSET_DEFAULT_INITSIZE,
647 : ALLOCSET_DEFAULT_MAXSIZE);
648 :
649 : /*
650 : * The outer loop iterates over all index pages except the metapage, in
651 : * physical order (we hope the kernel will cooperate in providing
652 : * read-ahead for speed). It is critical that we visit all leaf pages,
653 : * including ones added after we start the scan, else we might fail to
654 : * delete some deletable tuples. Hence, we must repeatedly check the
655 : * relation length. We must acquire the relation-extension lock while
656 : * doing so to avoid a race condition: if someone else is extending the
657 : * relation, there is a window where bufmgr/smgr have created a new
658 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
659 : * we manage to scan such a page here, we'll improperly assume it can be
660 : * recycled. Taking the lock synchronizes things enough to prevent a
661 : * problem: either num_pages won't include the new page, or _bt_getbuf
662 : * already has write lock on the buffer and it will be fully initialized
663 : * before we can examine it. (See also vacuumlazy.c, which has the same
664 : * issue.) Also, we need not worry if a page is added immediately after
665 : * we look; the page splitting code already has write-lock on the left
666 : * page before it adds a right page, so we must already have processed any
667 : * tuples due to be moved into such a page.
668 : *
669 : * We can skip locking for new or temp relations, however, since no one
670 : * else could be accessing them.
671 : */
672 371 : needLock = !RELATION_IS_LOCAL(rel);
673 :
674 371 : blkno = BTREE_METAPAGE + 1;
675 : for (;;)
676 : {
677 : /* Get the current relation length */
678 632 : if (needLock)
679 632 : LockRelationForExtension(rel, ExclusiveLock);
680 632 : num_pages = RelationGetNumberOfBlocks(rel);
681 632 : if (needLock)
682 632 : UnlockRelationForExtension(rel, ExclusiveLock);
683 :
684 : /* Allocate freePages after we read num_pages the first time */
685 632 : if (vstate.freePages == NULL)
686 : {
687 : /* No point in remembering more than MaxFSMPages pages */
688 371 : vstate.maxFreePages = MaxFSMPages;
689 371 : if ((BlockNumber) vstate.maxFreePages > num_pages)
690 371 : vstate.maxFreePages = (int) num_pages;
691 371 : vstate.freePages = (BlockNumber *)
692 : palloc(vstate.maxFreePages * sizeof(BlockNumber));
693 : }
694 :
695 : /* Quit if we've scanned the whole relation */
696 632 : if (blkno >= num_pages)
697 371 : break;
698 : /* Iterate over pages, then loop back to recheck length */
699 1347 : for (; blkno < num_pages; blkno++)
700 : {
701 1347 : btvacuumpage(&vstate, blkno, blkno);
702 : }
703 : }
704 :
705 : /*
706 : * During VACUUM FULL, we truncate off any recyclable pages at the end of
707 : * the index. In a normal vacuum it'd be unsafe to do this except by
708 : * acquiring exclusive lock on the index and then rechecking all the
709 : * pages, which doesn't seem worth it.
710 : */
711 371 : if (info->vacuum_full && vstate.nFreePages > 0)
712 : {
713 0 : BlockNumber new_pages = num_pages;
714 :
715 0 : while (vstate.nFreePages > 0 &&
716 : vstate.freePages[vstate.nFreePages - 1] == new_pages - 1)
717 : {
718 0 : new_pages--;
719 0 : stats->pages_deleted--;
720 0 : vstate.nFreePages--;
721 0 : vstate.totFreePages = vstate.nFreePages; /* can't be more */
722 : }
723 0 : if (new_pages != num_pages)
724 : {
725 : /*
726 : * Okay to truncate.
727 : */
728 0 : RelationTruncate(rel, new_pages);
729 :
730 : /* update statistics */
731 0 : stats->pages_removed += num_pages - new_pages;
732 :
733 0 : num_pages = new_pages;
734 : }
735 : }
736 :
737 : /*
738 : * Update the shared Free Space Map with the info we now have about free
739 : * pages in the index, discarding any old info the map may have. We do not
740 : * need to sort the page numbers; they're in order already.
741 : */
742 371 : RecordIndexFreeSpace(&rel->rd_node, vstate.totFreePages,
743 : vstate.nFreePages, vstate.freePages);
744 :
745 371 : pfree(vstate.freePages);
746 :
747 371 : MemoryContextDelete(vstate.pagedelcontext);
748 :
749 : /* update statistics */
750 371 : stats->num_pages = num_pages;
751 371 : stats->pages_free = vstate.totFreePages;
752 371 : }
753 :
754 : /*
755 : * btvacuumpage --- VACUUM one page
756 : *
757 : * This processes a single page for btvacuumscan(). In some cases we
758 : * must go back and re-examine previously-scanned pages; this routine
759 : * recurses when necessary to handle that case.
760 : *
761 : * blkno is the page to process. orig_blkno is the highest block number
762 : * reached by the outer btvacuumscan loop (the same as blkno, unless we
763 : * are recursing to re-examine a previous page).
764 : */
765 : static void
766 : btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
767 1347 : {
768 1347 : IndexVacuumInfo *info = vstate->info;
769 1347 : IndexBulkDeleteResult *stats = vstate->stats;
770 1347 : IndexBulkDeleteCallback callback = vstate->callback;
771 1347 : void *callback_state = vstate->callback_state;
772 1347 : Relation rel = info->index;
773 : bool delete_now;
774 : BlockNumber recurse_to;
775 : Buffer buf;
776 : Page page;
777 : BTPageOpaque opaque;
778 :
779 1347 : restart:
780 1347 : delete_now = false;
781 1347 : recurse_to = P_NONE;
782 :
783 : /* call vacuum_delay_point while not holding any buffer lock */
784 1347 : vacuum_delay_point();
785 :
786 : /*
787 : * We can't use _bt_getbuf() here because it always applies
788 : * _bt_checkpage(), which will barf on an all-zero page. We want to
789 : * recycle all-zero pages, not fail. Also, we want to use a nondefault
790 : * buffer access strategy.
791 : */
792 1347 : buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
793 1347 : LockBuffer(buf, BT_READ);
794 1347 : page = BufferGetPage(buf);
795 1347 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
796 1347 : if (!PageIsNew(page))
797 1347 : _bt_checkpage(rel, buf);
798 :
799 : /*
800 : * If we are recursing, the only case we want to do anything with is a
801 : * live leaf page having the current vacuum cycle ID. Any other state
802 : * implies we already saw the page (eg, deleted it as being empty). In
803 : * particular, we don't want to risk adding it to freePages twice.
804 : */
805 1347 : if (blkno != orig_blkno)
806 : {
807 0 : if (_bt_page_recyclable(page) ||
808 : P_IGNORE(opaque) ||
809 : !P_ISLEAF(opaque) ||
810 : opaque->btpo_cycleid != vstate->cycleid)
811 : {
812 0 : _bt_relbuf(rel, buf);
813 0 : return;
814 : }
815 : }
816 :
817 : /* Page is valid, see what to do with it */
818 1347 : if (_bt_page_recyclable(page))
819 : {
820 : /* Okay to recycle this page */
821 0 : if (vstate->nFreePages < vstate->maxFreePages)
822 0 : vstate->freePages[vstate->nFreePages++] = blkno;
823 0 : vstate->totFreePages++;
824 0 : stats->pages_deleted++;
825 : }
826 1347 : else if (P_ISDELETED(opaque))
827 : {
828 : /* Already deleted, but can't recycle yet */
829 0 : stats->pages_deleted++;
830 : }
831 1347 : else if (P_ISHALFDEAD(opaque))
832 : {
833 : /* Half-dead, try to delete */
834 0 : delete_now = true;
835 : }
836 1347 : else if (P_ISLEAF(opaque))
837 : {
838 : OffsetNumber deletable[MaxOffsetNumber];
839 : int ndeletable;
840 : OffsetNumber offnum,
841 : minoff,
842 : maxoff;
843 :
844 : /*
845 : * Trade in the initial read lock for a super-exclusive write lock on
846 : * this page. We must get such a lock on every leaf page over the
847 : * course of the vacuum scan, whether or not it actually contains any
848 : * deletable tuples --- see nbtree/README.
849 : */
850 1275 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
851 1275 : LockBufferForCleanup(buf);
852 :
853 : /*
854 : * Check whether we need to recurse back to earlier pages. What we
855 : * are concerned about is a page split that happened since we started
856 : * the vacuum scan. If the split moved some tuples to a lower page
857 : * then we might have missed them. If so, set up for tail recursion.
858 : * (Must do this before possibly clearing btpo_cycleid below!)
859 : */
860 1275 : if (vstate->cycleid != 0 &&
861 : opaque->btpo_cycleid == vstate->cycleid &&
862 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
863 : !P_RIGHTMOST(opaque) &&
864 : opaque->btpo_next < orig_blkno)
865 0 : recurse_to = opaque->btpo_next;
866 :
867 : /*
868 : * Scan over all items to see which ones need to be deleted according to the
869 : * callback function.
870 : */
871 1275 : ndeletable = 0;
872 1275 : minoff = P_FIRSTDATAKEY(opaque);
873 1275 : maxoff = PageGetMaxOffsetNumber(page);
874 1275 : if (callback)
875 : {
876 454 : for (offnum = minoff;
877 65399 : offnum <= maxoff;
878 64491 : offnum = OffsetNumberNext(offnum))
879 : {
880 : IndexTuple itup;
881 : ItemPointer htup;
882 :
883 64491 : itup = (IndexTuple) PageGetItem(page,
884 : PageGetItemId(page, offnum));
885 64491 : htup = &(itup->t_tid);
886 64491 : if (callback(htup, callback_state))
887 6952 : deletable[ndeletable++] = offnum;
888 : }
889 : }
890 :
891 : /*
892 : * Apply any needed deletes. We issue just one _bt_delitems() call
893 : * per page, so as to minimize WAL traffic.
894 : */
895 1275 : if (ndeletable > 0)
896 : {
897 151 : _bt_delitems(rel, buf, deletable, ndeletable);
898 151 : stats->tuples_removed += ndeletable;
899 : /* must recompute maxoff */
900 151 : maxoff = PageGetMaxOffsetNumber(page);
901 : }
902 : else
903 : {
904 : /*
905 : * If the page has been split during this vacuum cycle, it seems
906 : * worth expending a write to clear btpo_cycleid even if we don't
907 : * have any deletions to do. (If we do, _bt_delitems takes care
908 : * of this.) This ensures we won't process the page again.
909 : *
910 : * We treat this like a hint-bit update because there's no need to
911 : * WAL-log it.
912 : */
913 1124 : if (vstate->cycleid != 0 &&
914 : opaque->btpo_cycleid == vstate->cycleid)
915 : {
916 0 : opaque->btpo_cycleid = 0;
917 0 : SetBufferCommitInfoNeedsSave(buf);
918 : }
919 : }
920 :
921 : /*
922 : * If it's now empty, try to delete; else count the live tuples. We
923 : * don't delete when recursing, though, to avoid putting entries into
924 : * freePages out-of-order (doesn't seem worth any extra code to handle
925 : * the case).
926 : */
927 1275 : if (minoff > maxoff)
928 8 : delete_now = (blkno == orig_blkno);
929 : else
930 1267 : stats->num_index_tuples += maxoff - minoff + 1;
931 : }
932 :
933 1347 : if (delete_now)
934 : {
935 : MemoryContext oldcontext;
936 : int ndel;
937 :
938 : /* Run pagedel in a temp context to avoid memory leakage */
939 8 : MemoryContextReset(vstate->pagedelcontext);
940 16 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
941 :
942 8 : ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);
943 :
944 : /* count only this page, else may double-count parent */
945 8 : if (ndel)
946 2 : stats->pages_deleted++;
947 :
948 : /*
949 : * During VACUUM FULL it's okay to recycle deleted pages immediately,
950 : * since there can be no other transactions scanning the index. Note
951 : * that we will only recycle the current page and not any parent pages
952 : * that _bt_pagedel might have recursed to; this seems reasonable in
953 : * the name of simplicity. (Trying to do otherwise would mean we'd
954 : * have to sort the list of recyclable pages we're building.)
955 : */
956 8 : if (ndel && info->vacuum_full)
957 : {
958 0 : if (vstate->nFreePages < vstate->maxFreePages)
959 0 : vstate->freePages[vstate->nFreePages++] = blkno;
960 0 : vstate->totFreePages++;
961 : }
962 :
963 : MemoryContextSwitchTo(oldcontext);
964 : /* pagedel released buffer, so we shouldn't */
965 : }
966 : else
967 1339 : _bt_relbuf(rel, buf);
968 :
969 : /*
970 : * This is really tail recursion, but if the compiler is too stupid to
971 : * optimize it as such, we'd eat an uncomfortably large amount of stack
972 : * space per recursion level (due to the deletable[] array). A failure is
973 : * improbable since the number of levels isn't likely to be large ... but
974 : * just in case, let's hand-optimize into a loop.
975 : */
976 1347 : if (recurse_to != P_NONE)
977 : {
978 0 : blkno = recurse_to;
979 0 : goto restart;
980 : }
981 : }
|