LTP GCOV extension - code coverage report
Current view: access/nbtree/nbtree.c
Test: unnamed
Date: 2008-07-03
Instrumented lines: 306
Executed lines: 244
Code covered: 79.7 %

       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtree.c
       4                 :  *        Implementation of Lehman and Yao's btree management algorithm for
       5                 :  *        Postgres.
       6                 :  *
       7                 :  * NOTES
       8                 :  *        This file contains only the public interface routines.
       9                 :  *
      10                 :  *
      11                 :  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
      12                 :  * Portions Copyright (c) 1994, Regents of the University of California
      13                 :  *
      14                 :  * IDENTIFICATION
      15                 :  *        $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.155 2007/05/30 20:11:53 tgl Exp $
      16                 :  *
      17                 :  *-------------------------------------------------------------------------
      18                 :  */
      19                 : #include "postgres.h"
      20                 : 
      21                 : #include "access/genam.h"
      22                 : #include "access/nbtree.h"
      23                 : #include "catalog/index.h"
      24                 : #include "commands/vacuum.h"
      25                 : #include "storage/freespace.h"
      26                 : #include "storage/lmgr.h"
      27                 : #include "utils/memutils.h"
      28                 : 
      29                 : 
      30                 : /* Working state for btbuild and its callback */
      31                 : typedef struct
      32                 : {
      33                 :         bool            isUnique;
      34                 :         bool            haveDead;
      35                 :         Relation        heapRel;
      36                 :         BTSpool    *spool;
      37                 : 
      38                 :         /*
       39                 :          * spool2 is needed only when the index is a unique index. Dead
       40                 :          * tuples are put into spool2 instead of spool in order to avoid the
       41                 :          * uniqueness check.
      42                 :          */
      43                 :         BTSpool    *spool2;
      44                 :         double          indtuples;
      45                 : } BTBuildState;
      46                 : 
      47                 : /* Working state needed by btvacuumpage */
      48                 : typedef struct
      49                 : {
      50                 :         IndexVacuumInfo *info;
      51                 :         IndexBulkDeleteResult *stats;
      52                 :         IndexBulkDeleteCallback callback;
      53                 :         void       *callback_state;
      54                 :         BTCycleId       cycleid;
      55                 :         BlockNumber *freePages;
      56                 :         int                     nFreePages;             /* number of entries in freePages[] */
      57                 :         int                     maxFreePages;   /* allocated size of freePages[] */
      58                 :         BlockNumber totFreePages;       /* true total # of free pages */
      59                 :         MemoryContext pagedelcontext;
      60                 : } BTVacState;
      61                 : 
      62                 : 
      63                 : static void btbuildCallback(Relation index,
      64                 :                                 HeapTuple htup,
      65                 :                                 Datum *values,
      66                 :                                 bool *isnull,
      67                 :                                 bool tupleIsAlive,
      68                 :                                 void *state);
      69                 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
      70                 :                          IndexBulkDeleteCallback callback, void *callback_state,
      71                 :                          BTCycleId cycleid);
      72                 : static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
      73                 :                          BlockNumber orig_blkno);
      74                 : 
      75                 : 
      76                 : /*
      77                 :  *      btbuild() -- build a new btree index.
      78                 :  */
      79                 : Datum
      80                 : btbuild(PG_FUNCTION_ARGS)
      81             560 : {
      82             560 :         Relation        heap = (Relation) PG_GETARG_POINTER(0);
      83             560 :         Relation        index = (Relation) PG_GETARG_POINTER(1);
      84             560 :         IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
      85                 :         IndexBuildResult *result;
      86                 :         double          reltuples;
      87                 :         BTBuildState buildstate;
      88                 : 
      89             560 :         buildstate.isUnique = indexInfo->ii_Unique;
      90             560 :         buildstate.haveDead = false;
      91             560 :         buildstate.heapRel = heap;
      92             560 :         buildstate.spool = NULL;
      93             560 :         buildstate.spool2 = NULL;
      94             560 :         buildstate.indtuples = 0;
      95                 : 
      96                 : #ifdef BTREE_BUILD_STATS
      97                 :         if (log_btree_build_stats)
      98                 :                 ResetUsage();
      99                 : #endif   /* BTREE_BUILD_STATS */
     100                 : 
     101                 :         /*
     102                 :          * We expect to be called exactly once for any index relation. If that's
      103                 :          * not the case, we're in big trouble.
     104                 :          */
     105             560 :         if (RelationGetNumberOfBlocks(index) != 0)
     106               0 :                 elog(ERROR, "index \"%s\" already contains data",
     107                 :                          RelationGetRelationName(index));
     108                 : 
     109             560 :         buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);
     110                 : 
     111                 :         /*
     112                 :          * If building a unique index, put dead tuples in a second spool to keep
     113                 :          * them out of the uniqueness check.
     114                 :          */
     115             560 :         if (indexInfo->ii_Unique)
     116             503 :                 buildstate.spool2 = _bt_spoolinit(index, false, true);
     117                 : 
     118                 :         /* do the heap scan */
     119             560 :         reltuples = IndexBuildHeapScan(heap, index, indexInfo,
     120                 :                                                                    btbuildCallback, (void *) &buildstate);
     121                 : 
     122                 :         /* okay, all heap tuples are indexed */
     123             560 :         if (buildstate.spool2 && !buildstate.haveDead)
     124                 :         {
     125                 :                 /* spool2 turns out to be unnecessary */
     126             500 :                 _bt_spooldestroy(buildstate.spool2);
     127             500 :                 buildstate.spool2 = NULL;
     128                 :         }
     129                 : 
     130                 :         /*
     131                 :          * Finish the build by (1) completing the sort of the spool file, (2)
     132                 :          * inserting the sorted tuples into btree pages and (3) building the upper
     133                 :          * levels.
     134                 :          */
     135             560 :         _bt_leafbuild(buildstate.spool, buildstate.spool2);
     136             557 :         _bt_spooldestroy(buildstate.spool);
     137             557 :         if (buildstate.spool2)
     138               3 :                 _bt_spooldestroy(buildstate.spool2);
     139                 : 
     140                 : #ifdef BTREE_BUILD_STATS
     141                 :         if (log_btree_build_stats)
     142                 :         {
     143                 :                 ShowUsage("BTREE BUILD STATS");
     144                 :                 ResetUsage();
     145                 :         }
     146                 : #endif   /* BTREE_BUILD_STATS */
     147                 : 
     148                 :         /*
     149                 :          * If we are reindexing a pre-existing index, it is critical to send out a
     150                 :          * relcache invalidation SI message to ensure all backends re-read the
     151                 :          * index metapage.      We expect that the caller will ensure that happens
     152                 :          * (typically as a side effect of updating index stats, but it must happen
     153                 :          * even if the stats don't change!)
     154                 :          */
     155                 : 
     156                 :         /*
     157                 :          * Return statistics
     158                 :          */
     159             557 :         result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     160                 : 
     161             557 :         result->heap_tuples = reltuples;
     162             557 :         result->index_tuples = buildstate.indtuples;
     163                 : 
     164             557 :         PG_RETURN_POINTER(result);
     165                 : }
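
btbuild's two-spool trick is worth isolating: dead tuples bypass the spool
that feeds the uniqueness check, and the second spool is discarded unexamined
if it was never used. A minimal plain-C sketch of that routing, with
hypothetical stand-in types rather than the real BTSpool API:

    #include <stdbool.h>
    #include <stdlib.h>

    /* hypothetical stand-in for a BTSpool: a growable array of items */
    typedef struct { int *items; int n, cap; } Spool;

    static void spool_add(Spool *s, int item)
    {
        if (s->n == s->cap)
        {
            s->cap = s->cap ? s->cap * 2 : 16;
            s->items = realloc(s->items, s->cap * sizeof(int));
        }
        s->items[s->n++] = item;
    }

    /*
     * Route one tuple: live tuples go to the main spool (and so through
     * the uniqueness check); dead ones go to spool2.  have_dead records
     * whether spool2 was ever actually used, so the caller can destroy
     * it unexamined in the common all-live case.
     */
    static void spool_route(Spool *spool, Spool *spool2, bool *have_dead,
                            int item, bool is_alive)
    {
        if (is_alive || spool2 == NULL)
            spool_add(spool, item);
        else
        {
            *have_dead = true;
            spool_add(spool2, item);
        }
    }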
     166                 : 
     167                 : /*
     168                 :  * Per-tuple callback from IndexBuildHeapScan
     169                 :  */
     170                 : static void
     171                 : btbuildCallback(Relation index,
     172                 :                                 HeapTuple htup,
     173                 :                                 Datum *values,
     174                 :                                 bool *isnull,
     175                 :                                 bool tupleIsAlive,
     176                 :                                 void *state)
     177          173938 : {
     178          173938 :         BTBuildState *buildstate = (BTBuildState *) state;
     179                 :         IndexTuple      itup;
     180                 : 
     181                 :         /* form an index tuple and point it at the heap tuple */
     182          173938 :         itup = index_form_tuple(RelationGetDescr(index), values, isnull);
     183          173938 :         itup->t_tid = htup->t_self;
     184                 : 
     185                 :         /*
     186                 :          * insert the index tuple into the appropriate spool file for subsequent
     187                 :          * processing
     188                 :          */
     189          347865 :         if (tupleIsAlive || buildstate->spool2 == NULL)
     190          173927 :                 _bt_spool(itup, buildstate->spool);
     191                 :         else
     192                 :         {
     193                 :                 /* dead tuples are put into spool2 */
     194              11 :                 buildstate->haveDead = true;
     195              11 :                 _bt_spool(itup, buildstate->spool2);
     196                 :         }
     197                 : 
     198          173938 :         buildstate->indtuples += 1;
     199                 : 
     200          173938 :         pfree(itup);
     201          173938 : }
     202                 : 
     203                 : /*
     204                 :  *      btinsert() -- insert an index tuple into a btree.
     205                 :  *
     206                 :  *              Descend the tree recursively, find the appropriate location for our
     207                 :  *              new tuple, and put it there.
     208                 :  */
     209                 : Datum
     210                 : btinsert(PG_FUNCTION_ARGS)
     211           70109 : {
     212           70109 :         Relation        rel = (Relation) PG_GETARG_POINTER(0);
     213           70109 :         Datum      *values = (Datum *) PG_GETARG_POINTER(1);
     214           70109 :         bool       *isnull = (bool *) PG_GETARG_POINTER(2);
     215           70109 :         ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
     216           70109 :         Relation        heapRel = (Relation) PG_GETARG_POINTER(4);
     217           70109 :         bool            checkUnique = PG_GETARG_BOOL(5);
     218                 :         IndexTuple      itup;
     219                 : 
     220                 :         /* generate an index tuple */
     221           70109 :         itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
     222           70109 :         itup->t_tid = *ht_ctid;
     223                 : 
     224           70109 :         _bt_doinsert(rel, itup, checkUnique, heapRel);
     225                 : 
     226           70087 :         pfree(itup);
     227                 : 
     228           70087 :         PG_RETURN_BOOL(true);
     229                 : }
     230                 : 
     231                 : /*
     232                 :  *      btgettuple() -- Get the next tuple in the scan.
     233                 :  */
     234                 : Datum
     235                 : btgettuple(PG_FUNCTION_ARGS)
     236          248122 : {
     237          248122 :         IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
     238          248122 :         ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
     239          248122 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
     240                 :         bool            res;
     241                 : 
     242                 :         /*
     243                 :          * If we've already initialized this scan, we can just advance it in the
     244                 :          * appropriate direction.  If we haven't done so yet, we call a routine to
     245                 :          * get the first item in the scan.
     246                 :          */
     247          333105 :         if (BTScanPosIsValid(so->currPos))
     248                 :         {
     249                 :                 /*
     250                 :                  * Check to see if we should kill the previously-fetched tuple.
     251                 :                  */
     252           84983 :                 if (scan->kill_prior_tuple)
     253                 :                 {
     254                 :                         /*
     255                 :                          * Yes, remember it for later.  (We'll deal with all such tuples
     256                 :                          * at once right before leaving the index page.)  The test for
     257                 :                          * numKilled overrun is not just paranoia: if the caller reverses
     258                 :                          * direction in the indexscan then the same item might get entered
     259                 :                          * multiple times.      It's not worth trying to optimize that, so we
     260                 :                          * don't detect it, but instead just forget any excess entries.
     261                 :                          */
     262            1697 :                         if (so->killedItems == NULL)
     263             930 :                                 so->killedItems = (int *)
     264                 :                                         palloc(MaxIndexTuplesPerPage * sizeof(int));
     265            1697 :                         if (so->numKilled < MaxIndexTuplesPerPage)
     266            1697 :                                 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
     267                 :                 }
     268                 : 
     269                 :                 /*
     270                 :                  * Now continue the scan.
     271                 :                  */
     272           84983 :                 res = _bt_next(scan, dir);
     273                 :         }
     274                 :         else
     275          163139 :                 res = _bt_first(scan, dir);
     276                 : 
     277          248122 :         PG_RETURN_BOOL(res);
     278                 : }
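
The killedItems handling above is a small pattern of its own: a fixed-capacity
list allocated only on first use, which silently drops overflow because
completeness is not required for correctness. An illustrative plain-C version
(the names are hypothetical, not the BTScanOpaque fields):

    #include <stdlib.h>

    #define MAX_ITEMS_PER_PAGE 256  /* stand-in for MaxIndexTuplesPerPage */

    typedef struct
    {
        int *killed;                /* NULL until first kill on this page */
        int  nkilled;
    } KillList;

    /*
     * Remember an item to mark dead later.  If the list is full (possible
     * when the caller reverses direction and revisits items), just forget
     * the excess: these are hints, so dropping some is harmless.
     */
    static void remember_killed(KillList *kl, int item_index)
    {
        if (kl->killed == NULL)
            kl->killed = malloc(MAX_ITEMS_PER_PAGE * sizeof(int));
        if (kl->nkilled < MAX_ITEMS_PER_PAGE)
            kl->killed[kl->nkilled++] = item_index;
    }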
     279                 : 
     280                 : /*
     281                 :  * btgetmulti() -- get multiple tuples at once
     282                 :  *
     283                 :  * In the current implementation there seems no strong reason to stop at
     284                 :  * index page boundaries; we just press on until we fill the caller's buffer
     285                 :  * or run out of matches.
     286                 :  */
     287                 : Datum
     288                 : btgetmulti(PG_FUNCTION_ARGS)
     289             328 : {
     290             328 :         IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
     291             328 :         ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
     292             328 :         int32           max_tids = PG_GETARG_INT32(2);
     293             328 :         int32      *returned_tids = (int32 *) PG_GETARG_POINTER(3);
     294             328 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
     295             328 :         bool            res = true;
     296             328 :         int32           ntids = 0;
     297                 : 
     298             328 :         if (max_tids <= 0)                   /* behave correctly in boundary case */
     299               0 :                 PG_RETURN_BOOL(true);
     300                 : 
     301                 :         /* If we haven't started the scan yet, fetch the first page & tuple. */
     302             328 :         if (!BTScanPosIsValid(so->currPos))
     303                 :         {
     304             328 :                 res = _bt_first(scan, ForwardScanDirection);
     305             328 :                 if (!res)
     306                 :                 {
     307                 :                         /* empty scan */
     308              18 :                         *returned_tids = ntids;
     309              18 :                         PG_RETURN_BOOL(res);
     310                 :                 }
     311                 :                 /* Save tuple ID, and continue scanning */
     312             310 :                 tids[ntids] = scan->xs_ctup.t_self;
     313             310 :                 ntids++;
     314                 :         }
     315                 : 
     316            2430 :         while (ntids < max_tids)
     317                 :         {
     318                 :                 /*
     319                 :                  * Advance to next tuple within page.  This is the same as the easy
     320                 :                  * case in _bt_next().
     321                 :                  */
     322            2430 :                 if (++so->currPos.itemIndex > so->currPos.lastItem)
     323                 :                 {
     324                 :                         /* let _bt_next do the heavy lifting */
     325             315 :                         res = _bt_next(scan, ForwardScanDirection);
     326             315 :                         if (!res)
     327             310 :                                 break;
     328                 :                 }
     329                 : 
     330                 :                 /* Save tuple ID, and continue scanning */
     331            2120 :                 tids[ntids] = so->currPos.items[so->currPos.itemIndex].heapTid;
     332            2120 :                 ntids++;
     333                 :         }
     334                 : 
     335             310 :         *returned_tids = ntids;
     336             310 :         PG_RETURN_BOOL(res);
     337                 : }
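
btgetmulti's loop has a shape worth seeing without the index machinery: step
within the current page cheaply, delegate page boundaries to the heavyweight
single-step path, and stop when the caller's buffer fills or the matches run
out. A schematic plain-C version (next_page and current_item are assumed
helpers with stub bodies, not real APIs):

    #include <stdbool.h>

    typedef struct { int item, last_item; } Pos;

    /* assumed helper: load the next page and position on its first item;
     * this stub just reports that there are no more pages */
    static bool next_page(Pos *pos) { (void) pos; return false; }

    static int current_item(const Pos *pos) { return pos->item; }

    /* Fill out[] with up to max items; returns how many were stored. */
    static int get_multi(Pos *pos, int *out, int max)
    {
        int n = 0;

        while (n < max)
        {
            /* easy case: advance within the page */
            if (++pos->item > pos->last_item)
            {
                /* page exhausted: let the heavyweight path advance */
                if (!next_page(pos))
                    break;
            }
            out[n++] = current_item(pos);
        }
        return n;
    }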
     338                 : 
     339                 : /*
     340                 :  *      btbeginscan() -- start a scan on a btree index
     341                 :  */
     342                 : Datum
     343                 : btbeginscan(PG_FUNCTION_ARGS)
     344          118464 : {
     345          118464 :         Relation        rel = (Relation) PG_GETARG_POINTER(0);
     346          118464 :         int                     keysz = PG_GETARG_INT32(1);
     347          118464 :         ScanKey         scankey = (ScanKey) PG_GETARG_POINTER(2);
     348                 :         IndexScanDesc scan;
     349                 : 
     350                 :         /* get the scan */
     351          118464 :         scan = RelationGetIndexScan(rel, keysz, scankey);
     352                 : 
     353          118464 :         PG_RETURN_POINTER(scan);
     354                 : }
     355                 : 
     356                 : /*
     357                 :  *      btrescan() -- rescan an index relation
     358                 :  */
     359                 : Datum
     360                 : btrescan(PG_FUNCTION_ARGS)
     361          167793 : {
     362          167793 :         IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
     363          167793 :         ScanKey         scankey = (ScanKey) PG_GETARG_POINTER(1);
     364                 :         BTScanOpaque so;
     365                 : 
     366          167793 :         so = (BTScanOpaque) scan->opaque;
     367                 : 
     368          167793 :         if (so == NULL)                         /* if called from btbeginscan */
     369                 :         {
     370          118464 :                 so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
     371          118464 :                 so->currPos.buf = so->markPos.buf = InvalidBuffer;
     372          118464 :                 if (scan->numberOfKeys > 0)
     373          118424 :                         so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
     374                 :                 else
     375              40 :                         so->keyData = NULL;
     376          118464 :                 so->killedItems = NULL; /* until needed */
     377          118464 :                 so->numKilled = 0;
     378          118464 :                 scan->opaque = so;
     379                 :         }
     380                 : 
     381                 :         /* we aren't holding any read locks, but gotta drop the pins */
     382          167793 :         if (BTScanPosIsValid(so->currPos))
     383                 :         {
     384                 :                 /* Before leaving current page, deal with any killed items */
     385           26340 :                 if (so->numKilled > 0)
     386               0 :                         _bt_killitems(scan, false);
     387           26340 :                 ReleaseBuffer(so->currPos.buf);
     388           26340 :                 so->currPos.buf = InvalidBuffer;
     389                 :         }
     390                 : 
     391          167793 :         if (BTScanPosIsValid(so->markPos))
     392                 :         {
     393               0 :                 ReleaseBuffer(so->markPos.buf);
     394               0 :                 so->markPos.buf = InvalidBuffer;
     395                 :         }
     396          167793 :         so->markItemIndex = -1;
     397                 : 
     398                 :         /*
      399                 :          * Reset the scan keys. Note that the key-ordering logic moved to _bt_first.
     400                 :          * - vadim 05/05/97
     401                 :          */
     402          167793 :         if (scankey && scan->numberOfKeys > 0)
     403          167753 :                 memmove(scan->keyData,
     404                 :                                 scankey,
     405                 :                                 scan->numberOfKeys * sizeof(ScanKeyData));
     406          167793 :         so->numberOfKeys = 0;                /* until _bt_preprocess_keys sets it */
     407                 : 
     408          167793 :         PG_RETURN_VOID();
     409                 : }
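
Note the double duty above: btrescan is also the real initializer, building
the per-scan state when btbeginscan left scan->opaque NULL. That
lazy-init-then-reset shape, reduced to plain C with hypothetical fields:

    #include <stdlib.h>
    #include <string.h>

    typedef struct
    {
        int    nkeys;
        double keys[8];             /* stand-in for the ScanKey array;
                                     * callers assumed to pass nkeys <= 8 */
    } ScanState;

    /*
     * Reset a scan for reuse, creating the state on the first call
     * (the "called from beginscan" path).  Later calls reuse the same
     * allocation and just copy in the new keys.
     */
    static ScanState *rescan(ScanState *state, const double *keys, int nkeys)
    {
        if (state == NULL)
            state = calloc(1, sizeof(ScanState));   /* first-time setup */
        if (keys != NULL && nkeys > 0)
            memmove(state->keys, keys, nkeys * sizeof(double));
        state->nkeys = nkeys;
        return state;
    }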
     410                 : 
     411                 : /*
     412                 :  *      btendscan() -- close down a scan
     413                 :  */
     414                 : Datum
     415                 : btendscan(PG_FUNCTION_ARGS)
     416          118438 : {
     417          118438 :         IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
     418          118438 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
     419                 : 
     420                 :         /* we aren't holding any read locks, but gotta drop the pins */
     421          118438 :         if (BTScanPosIsValid(so->currPos))
     422                 :         {
     423                 :                 /* Before leaving current page, deal with any killed items */
     424           70075 :                 if (so->numKilled > 0)
     425             238 :                         _bt_killitems(scan, false);
     426           70075 :                 ReleaseBuffer(so->currPos.buf);
     427           70075 :                 so->currPos.buf = InvalidBuffer;
     428                 :         }
     429                 : 
     430          118438 :         if (BTScanPosIsValid(so->markPos))
     431                 :         {
     432               0 :                 ReleaseBuffer(so->markPos.buf);
     433               0 :                 so->markPos.buf = InvalidBuffer;
     434                 :         }
     435          118438 :         so->markItemIndex = -1;
     436                 : 
     437          118438 :         if (so->killedItems != NULL)
     438             928 :                 pfree(so->killedItems);
     439          118438 :         if (so->keyData != NULL)
     440          118400 :                 pfree(so->keyData);
     441          118438 :         pfree(so);
     442                 : 
     443          118438 :         PG_RETURN_VOID();
     444                 : }
     445                 : 
     446                 : /*
     447                 :  *      btmarkpos() -- save current scan position
     448                 :  */
     449                 : Datum
     450                 : btmarkpos(PG_FUNCTION_ARGS)
     451               0 : {
     452               0 :         IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
     453               0 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
     454                 : 
     455                 :         /* we aren't holding any read locks, but gotta drop the pin */
     456               0 :         if (BTScanPosIsValid(so->markPos))
     457                 :         {
     458               0 :                 ReleaseBuffer(so->markPos.buf);
     459               0 :                 so->markPos.buf = InvalidBuffer;
     460                 :         }
     461                 : 
     462                 :         /*
      463                 :          * Just record the current itemIndex.  If we later step to the next page
     464                 :          * before releasing the marked position, _bt_steppage makes a full copy of
     465                 :          * the currPos struct in markPos.  If (as often happens) the mark is moved
     466                 :          * before we leave the page, we don't have to do that work.
     467                 :          */
     468               0 :         if (BTScanPosIsValid(so->currPos))
     469               0 :                 so->markItemIndex = so->currPos.itemIndex;
     470                 :         else
     471               0 :                 so->markItemIndex = -1;
     472                 : 
     473               0 :         PG_RETURN_VOID();
     474                 : }
     475                 : 
     476                 : /*
     477                 :  *      btrestrpos() -- restore scan to last saved position
     478                 :  */
     479                 : Datum
     480                 : btrestrpos(PG_FUNCTION_ARGS)
     481               0 : {
     482               0 :         IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
     483               0 :         BTScanOpaque so = (BTScanOpaque) scan->opaque;
     484                 : 
     485               0 :         if (so->markItemIndex >= 0)
     486                 :         {
     487                 :                 /*
     488                 :                  * The mark position is on the same page we are currently on. Just
     489                 :                  * restore the itemIndex.
     490                 :                  */
     491               0 :                 so->currPos.itemIndex = so->markItemIndex;
     492                 :         }
     493                 :         else
     494                 :         {
     495                 :                 /* we aren't holding any read locks, but gotta drop the pin */
     496               0 :                 if (BTScanPosIsValid(so->currPos))
     497                 :                 {
     498                 :                         /* Before leaving current page, deal with any killed items */
     499               0 :                         if (so->numKilled > 0 &&
     500                 :                                 so->currPos.buf != so->markPos.buf)
     501               0 :                                 _bt_killitems(scan, false);
     502               0 :                         ReleaseBuffer(so->currPos.buf);
     503               0 :                         so->currPos.buf = InvalidBuffer;
     504                 :                 }
     505                 : 
     506               0 :                 if (BTScanPosIsValid(so->markPos))
     507                 :                 {
     508                 :                         /* bump pin on mark buffer for assignment to current buffer */
     509               0 :                         IncrBufferRefCount(so->markPos.buf);
     510               0 :                         memcpy(&so->currPos, &so->markPos,
     511                 :                                    offsetof(BTScanPosData, items[1]) +
     512                 :                                    so->markPos.lastItem * sizeof(BTScanPosItem));
     513                 :                 }
     514                 :         }
     515                 : 
     516               0 :         PG_RETURN_VOID();
     517                 : }
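
The mark/restore pair above leans on an asymmetry: marking records only an
item index while the mark stays on the current page, and only a cross-page
restore pays for a full position copy. In schematic plain C (hypothetical
structs, not BTScanPosData):

    #include <string.h>

    typedef struct { int page, item, last_item; } Pos;

    typedef struct
    {
        Pos cur;
        Pos mark;                   /* filled in only when we leave the page */
        int mark_item;              /* >= 0: mark is on the current page */
    } Scan;

    static void mark_pos(Scan *s)
    {
        s->mark_item = s->cur.item;         /* cheap: no copy yet */
    }

    static void restore_pos(Scan *s)
    {
        if (s->mark_item >= 0)
            s->cur.item = s->mark_item;     /* same page: trivial restore */
        else
            memcpy(&s->cur, &s->mark, sizeof(Pos));  /* cross-page copy */
    }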
     518                 : 
     519                 : /*
     520                 :  * Bulk deletion of all index entries pointing to a set of heap tuples.
     521                 :  * The set of target tuples is specified via a callback routine that tells
     522                 :  * whether any given heap tuple (identified by ItemPointer) is being deleted.
     523                 :  *
     524                 :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     525                 :  */
     526                 : Datum
     527                 : btbulkdelete(PG_FUNCTION_ARGS)
     528              70 : {
     529              70 :         IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
     530              70 :         IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
     531              70 :         IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
     532              70 :         void       *callback_state = (void *) PG_GETARG_POINTER(3);
     533              70 :         Relation        rel = info->index;
     534                 :         BTCycleId       cycleid;
     535                 : 
     536                 :         /* allocate stats if first time through, else re-use existing struct */
     537              70 :         if (stats == NULL)
     538              70 :                 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
     539                 : 
     540                 :         /* Establish the vacuum cycle ID to use for this scan */
     541              70 :         PG_TRY();
     542                 :         {
     543              70 :                 cycleid = _bt_start_vacuum(rel);
     544                 : 
     545              70 :                 btvacuumscan(info, stats, callback, callback_state, cycleid);
     546                 : 
     547              70 :                 _bt_end_vacuum(rel);
     548                 :         }
     549               0 :         PG_CATCH();
     550                 :         {
     551                 :                 /* Make sure shared memory gets cleaned up */
     552               0 :                 _bt_end_vacuum(rel);
     553               0 :                 PG_RE_THROW();
     554                 :         }
     555              70 :         PG_END_TRY();
     556                 : 
     557              70 :         PG_RETURN_POINTER(stats);
     558                 : }
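
The PG_TRY/PG_CATCH block above guarantees _bt_end_vacuum runs on the error
path as well as the normal one. PostgreSQL builds that construct on sigsetjmp;
the same guarantee can be sketched with plain setjmp/longjmp (toy functions,
error path simulated):

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf err_env;

    static void start_vacuum(void) { puts("vacuum cycle started"); }
    static void end_vacuum(void)   { puts("vacuum cycle ended"); }
    static void scan_index(void)   { longjmp(err_env, 1); } /* simulate error */

    static int bulkdelete(void)
    {
        if (setjmp(err_env) == 0)
        {
            start_vacuum();
            scan_index();
            end_vacuum();           /* normal path */
            return 0;
        }
        /* error path: shared state still gets cleaned up, then re-throw */
        end_vacuum();
        return -1;
    }

    int main(void) { return bulkdelete() == -1 ? 0 : 1; }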
     559                 : 
     560                 : /*
     561                 :  * Post-VACUUM cleanup.
     562                 :  *
     563                 :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     564                 :  */
     565                 : Datum
     566                 : btvacuumcleanup(PG_FUNCTION_ARGS)
     567             366 : {
     568             366 :         IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
     569             366 :         IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
     570                 : 
     571                 :         /*
     572                 :          * If btbulkdelete was called, we need not do anything, just return the
     573                 :          * stats from the latest btbulkdelete call.  If it wasn't called, we must
     574                 :          * still do a pass over the index, to recycle any newly-recyclable pages
     575                 :          * and to obtain index statistics.
     576                 :          *
     577                 :          * Since we aren't going to actually delete any leaf items, there's no
     578                 :          * need to go through all the vacuum-cycle-ID pushups.
     579                 :          */
     580             366 :         if (stats == NULL)
     581                 :         {
     582             301 :                 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
     583             301 :                 btvacuumscan(info, stats, NULL, NULL, 0);
     584                 :         }
     585                 : 
     586                 :         /*
     587                 :          * During a non-FULL vacuum it's quite possible for us to be fooled by
     588                 :          * concurrent page splits into double-counting some index tuples, so
     589                 :          * disbelieve any total that exceeds the underlying heap's count. (We
     590                 :          * can't check this during btbulkdelete.)
     591                 :          */
     592             366 :         if (!info->vacuum_full)
     593                 :         {
     594             263 :                 if (stats->num_index_tuples > info->num_heap_tuples)
     595               0 :                         stats->num_index_tuples = info->num_heap_tuples;
     596                 :         }
     597                 : 
     598             366 :         PG_RETURN_POINTER(stats);
     599                 : }
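
The "disbelieve any total" rule above is a one-way clamp: concurrent splits
can only inflate the index-tuple count, never deflate it, so capping at the
heap count is always safe. Reduced to a helper (assumed names):

    /* Concurrent page splits can double-count index tuples but never
     * lose them, so the heap count is a trustworthy upper bound. */
    static double clamp_index_tuples(double index_tuples, double heap_tuples)
    {
        return (index_tuples > heap_tuples) ? heap_tuples : index_tuples;
    }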
     600                 : 
     601                 : /*
     602                 :  * btvacuumscan --- scan the index for VACUUMing purposes
     603                 :  *
     604                 :  * This combines the functions of looking for leaf tuples that are deletable
     605                 :  * according to the vacuum callback, looking for empty pages that can be
     606                 :  * deleted, and looking for old deleted pages that can be recycled.  Both
     607                 :  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
     608                 :  * btbulkdelete call occurred).
     609                 :  *
     610                 :  * The caller is responsible for initially allocating/zeroing a stats struct
     611                 :  * and for obtaining a vacuum cycle ID if necessary.
     612                 :  */
     613                 : static void
     614                 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     615                 :                          IndexBulkDeleteCallback callback, void *callback_state,
     616                 :                          BTCycleId cycleid)
     617             371 : {
     618             371 :         Relation        rel = info->index;
     619                 :         BTVacState      vstate;
     620                 :         BlockNumber num_pages;
     621                 :         BlockNumber blkno;
     622                 :         bool            needLock;
     623                 : 
     624                 :         /*
     625                 :          * Reset counts that will be incremented during the scan; needed in case
     626                 :          * of multiple scans during a single VACUUM command
     627                 :          */
     628             371 :         stats->num_index_tuples = 0;
     629             371 :         stats->pages_deleted = 0;
     630                 : 
     631                 :         /* Set up info to pass down to btvacuumpage */
     632             371 :         vstate.info = info;
     633             371 :         vstate.stats = stats;
     634             371 :         vstate.callback = callback;
     635             371 :         vstate.callback_state = callback_state;
     636             371 :         vstate.cycleid = cycleid;
     637             371 :         vstate.freePages = NULL;        /* temporarily */
     638             371 :         vstate.nFreePages = 0;
     639             371 :         vstate.maxFreePages = 0;
     640             371 :         vstate.totFreePages = 0;
     641                 : 
     642                 :         /* Create a temporary memory context to run _bt_pagedel in */
     643             371 :         vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
     644                 :                                                                                                   "_bt_pagedel",
     645                 :                                                                                                   ALLOCSET_DEFAULT_MINSIZE,
     646                 :                                                                                                   ALLOCSET_DEFAULT_INITSIZE,
     647                 :                                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
     648                 : 
     649                 :         /*
     650                 :          * The outer loop iterates over all index pages except the metapage, in
     651                 :          * physical order (we hope the kernel will cooperate in providing
     652                 :          * read-ahead for speed).  It is critical that we visit all leaf pages,
     653                 :          * including ones added after we start the scan, else we might fail to
     654                 :          * delete some deletable tuples.  Hence, we must repeatedly check the
     655                 :          * relation length.  We must acquire the relation-extension lock while
     656                 :          * doing so to avoid a race condition: if someone else is extending the
     657                 :          * relation, there is a window where bufmgr/smgr have created a new
     658                 :          * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
     659                 :          * we manage to scan such a page here, we'll improperly assume it can be
     660                 :          * recycled.  Taking the lock synchronizes things enough to prevent a
     661                 :          * problem: either num_pages won't include the new page, or _bt_getbuf
     662                 :          * already has write lock on the buffer and it will be fully initialized
     663                 :          * before we can examine it.  (See also vacuumlazy.c, which has the same
     664                 :          * issue.)      Also, we need not worry if a page is added immediately after
     665                 :          * we look; the page splitting code already has write-lock on the left
     666                 :          * page before it adds a right page, so we must already have processed any
     667                 :          * tuples due to be moved into such a page.
     668                 :          *
     669                 :          * We can skip locking for new or temp relations, however, since no one
     670                 :          * else could be accessing them.
     671                 :          */
     672             371 :         needLock = !RELATION_IS_LOCAL(rel);
     673                 : 
     674             371 :         blkno = BTREE_METAPAGE + 1;
     675                 :         for (;;)
     676                 :         {
     677                 :                 /* Get the current relation length */
     678             632 :                 if (needLock)
     679             632 :                         LockRelationForExtension(rel, ExclusiveLock);
     680             632 :                 num_pages = RelationGetNumberOfBlocks(rel);
     681             632 :                 if (needLock)
     682             632 :                         UnlockRelationForExtension(rel, ExclusiveLock);
     683                 : 
     684                 :                 /* Allocate freePages after we read num_pages the first time */
     685             632 :                 if (vstate.freePages == NULL)
     686                 :                 {
     687                 :                         /* No point in remembering more than MaxFSMPages pages */
     688             371 :                         vstate.maxFreePages = MaxFSMPages;
     689             371 :                         if ((BlockNumber) vstate.maxFreePages > num_pages)
     690             371 :                                 vstate.maxFreePages = (int) num_pages;
     691             371 :                         vstate.freePages = (BlockNumber *)
     692                 :                                 palloc(vstate.maxFreePages * sizeof(BlockNumber));
     693                 :                 }
     694                 : 
     695                 :                 /* Quit if we've scanned the whole relation */
     696             632 :                 if (blkno >= num_pages)
     697             371 :                         break;
     698                 :                 /* Iterate over pages, then loop back to recheck length */
     699            1347 :                 for (; blkno < num_pages; blkno++)
     700                 :                 {
     701            1347 :                         btvacuumpage(&vstate, blkno, blkno);
     702                 :                 }
     703                 :         }
     704                 : 
     705                 :         /*
     706                 :          * During VACUUM FULL, we truncate off any recyclable pages at the end of
     707                 :          * the index.  In a normal vacuum it'd be unsafe to do this except by
     708                 :          * acquiring exclusive lock on the index and then rechecking all the
     709                 :          * pages; doesn't seem worth it.
     710                 :          */
     711             371 :         if (info->vacuum_full && vstate.nFreePages > 0)
     712                 :         {
     713               0 :                 BlockNumber new_pages = num_pages;
     714                 : 
     715               0 :                 while (vstate.nFreePages > 0 &&
     716                 :                            vstate.freePages[vstate.nFreePages - 1] == new_pages - 1)
     717                 :                 {
     718               0 :                         new_pages--;
     719               0 :                         stats->pages_deleted--;
     720               0 :                         vstate.nFreePages--;
     721               0 :                         vstate.totFreePages = vstate.nFreePages;        /* can't be more */
     722                 :                 }
     723               0 :                 if (new_pages != num_pages)
     724                 :                 {
     725                 :                         /*
     726                 :                          * Okay to truncate.
     727                 :                          */
     728               0 :                         RelationTruncate(rel, new_pages);
     729                 : 
     730                 :                         /* update statistics */
     731               0 :                         stats->pages_removed += num_pages - new_pages;
     732                 : 
     733               0 :                         num_pages = new_pages;
     734                 :                 }
     735                 :         }
     736                 : 
     737                 :         /*
     738                 :          * Update the shared Free Space Map with the info we now have about free
     739                 :          * pages in the index, discarding any old info the map may have. We do not
     740                 :          * need to sort the page numbers; they're in order already.
     741                 :          */
     742             371 :         RecordIndexFreeSpace(&rel->rd_node, vstate.totFreePages,
     743                 :                                                  vstate.nFreePages, vstate.freePages);
     744                 : 
     745             371 :         pfree(vstate.freePages);
     746                 : 
     747             371 :         MemoryContextDelete(vstate.pagedelcontext);
     748                 : 
     749                 :         /* update statistics */
     750             371 :         stats->num_pages = num_pages;
     751             371 :         stats->pages_free = vstate.totFreePages;
     752             371 : }
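
The outer loop above has a shape independent of btree details: process up to
the last known block, then re-read the relation length under the extension
lock and keep going if the relation grew meanwhile. In isolation (the lock
and length helpers are stubs):

    typedef unsigned int BlockNo;

    static BlockNo relation_len = 100;  /* stub: would come from storage layer */

    static void    lock_extension(void)   { /* serialize against extenders */ }
    static void    unlock_extension(void) { }
    static BlockNo nblocks(void)          { return relation_len; }
    static void    process(BlockNo b)     { (void) b; }

    static void scan_all(BlockNo first)
    {
        BlockNo blkno = first;

        for (;;)
        {
            BlockNo npages;

            /* take the lock so a half-created page can't be seen in a
             * recyclable-looking all-zero state */
            lock_extension();
            npages = nblocks();
            unlock_extension();

            if (blkno >= npages)
                break;              /* caught up with the end of the index */
            for (; blkno < npages; blkno++)
                process(blkno);     /* then loop back and recheck the length */
        }
    }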
     753                 : 
     754                 : /*
     755                 :  * btvacuumpage --- VACUUM one page
     756                 :  *
     757                 :  * This processes a single page for btvacuumscan().  In some cases we
     758                 :  * must go back and re-examine previously-scanned pages; this routine
     759                 :  * recurses when necessary to handle that case.
     760                 :  *
     761                 :  * blkno is the page to process.  orig_blkno is the highest block number
     762                 :  * reached by the outer btvacuumscan loop (the same as blkno, unless we
     763                 :  * are recursing to re-examine a previous page).
     764                 :  */
     765                 : static void
     766                 : btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
     767            1347 : {
     768            1347 :         IndexVacuumInfo *info = vstate->info;
     769            1347 :         IndexBulkDeleteResult *stats = vstate->stats;
     770            1347 :         IndexBulkDeleteCallback callback = vstate->callback;
     771            1347 :         void       *callback_state = vstate->callback_state;
     772            1347 :         Relation        rel = info->index;
     773                 :         bool            delete_now;
     774                 :         BlockNumber recurse_to;
     775                 :         Buffer          buf;
     776                 :         Page            page;
     777                 :         BTPageOpaque opaque;
     778                 : 
     779            1347 : restart:
     780            1347 :         delete_now = false;
     781            1347 :         recurse_to = P_NONE;
     782                 : 
     783                 :         /* call vacuum_delay_point while not holding any buffer lock */
     784            1347 :         vacuum_delay_point();
     785                 : 
     786                 :         /*
     787                 :          * We can't use _bt_getbuf() here because it always applies
     788                 :          * _bt_checkpage(), which will barf on an all-zero page. We want to
     789                 :          * recycle all-zero pages, not fail.  Also, we want to use a nondefault
     790                 :          * buffer access strategy.
     791                 :          */
     792            1347 :         buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
     793            1347 :         LockBuffer(buf, BT_READ);
     794            1347 :         page = BufferGetPage(buf);
     795            1347 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     796            1347 :         if (!PageIsNew(page))
     797            1347 :                 _bt_checkpage(rel, buf);
     798                 : 
     799                 :         /*
     800                 :          * If we are recursing, the only case we want to do anything with is a
     801                 :          * live leaf page having the current vacuum cycle ID.  Any other state
     802                 :          * implies we already saw the page (eg, deleted it as being empty). In
     803                 :          * particular, we don't want to risk adding it to freePages twice.
     804                 :          */
     805            1347 :         if (blkno != orig_blkno)
     806                 :         {
     807               0 :                 if (_bt_page_recyclable(page) ||
     808                 :                         P_IGNORE(opaque) ||
     809                 :                         !P_ISLEAF(opaque) ||
     810                 :                         opaque->btpo_cycleid != vstate->cycleid)
     811                 :                 {
     812               0 :                         _bt_relbuf(rel, buf);
     813               0 :                         return;
     814                 :                 }
     815                 :         }
     816                 : 
     817                 :         /* Page is valid, see what to do with it */
     818            1347 :         if (_bt_page_recyclable(page))
     819                 :         {
     820                 :                 /* Okay to recycle this page */
     821               0 :                 if (vstate->nFreePages < vstate->maxFreePages)
     822               0 :                         vstate->freePages[vstate->nFreePages++] = blkno;
     823               0 :                 vstate->totFreePages++;
     824               0 :                 stats->pages_deleted++;
     825                 :         }
     826            1347 :         else if (P_ISDELETED(opaque))
     827                 :         {
     828                 :                 /* Already deleted, but can't recycle yet */
     829               0 :                 stats->pages_deleted++;
     830                 :         }
     831            1347 :         else if (P_ISHALFDEAD(opaque))
     832                 :         {
     833                 :                 /* Half-dead, try to delete */
     834               0 :                 delete_now = true;
     835                 :         }
     836            1347 :         else if (P_ISLEAF(opaque))
     837                 :         {
     838                 :                 OffsetNumber deletable[MaxOffsetNumber];
     839                 :                 int                     ndeletable;
     840                 :                 OffsetNumber offnum,
     841                 :                                         minoff,
     842                 :                                         maxoff;
     843                 : 
     844                 :                 /*
     845                 :                  * Trade in the initial read lock for a super-exclusive write lock on
     846                 :                  * this page.  We must get such a lock on every leaf page over the
     847                 :                  * course of the vacuum scan, whether or not it actually contains any
     848                 :                  * deletable tuples --- see nbtree/README.
     849                 :                  */
     850            1275 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     851            1275 :                 LockBufferForCleanup(buf);
     852                 : 
     853                 :                 /*
     854                 :                  * Check whether we need to recurse back to earlier pages.      What we
     855                 :                  * are concerned about is a page split that happened since we started
     856                 :                  * the vacuum scan.  If the split moved some tuples to a lower page
     857                 :                  * then we might have missed 'em.  If so, set up for tail recursion.
     858                 :                  * (Must do this before possibly clearing btpo_cycleid below!)
     859                 :                  */
     860            1275 :                 if (vstate->cycleid != 0 &&
     861                 :                         opaque->btpo_cycleid == vstate->cycleid &&
     862                 :                         !(opaque->btpo_flags & BTP_SPLIT_END) &&
     863                 :                         !P_RIGHTMOST(opaque) &&
     864                 :                         opaque->btpo_next < orig_blkno)
     865               0 :                         recurse_to = opaque->btpo_next;
     866                 : 
     867                 :                 /*
      868                 :                  * Scan over all items to see which ones need to be deleted
      869                 :                  * according to the callback function.
     870                 :                  */
     871            1275 :                 ndeletable = 0;
     872            1275 :                 minoff = P_FIRSTDATAKEY(opaque);
     873            1275 :                 maxoff = PageGetMaxOffsetNumber(page);
     874            1275 :                 if (callback)
     875                 :                 {
     876             454 :                         for (offnum = minoff;
     877           65399 :                                  offnum <= maxoff;
     878           64491 :                                  offnum = OffsetNumberNext(offnum))
     879                 :                         {
     880                 :                                 IndexTuple      itup;
     881                 :                                 ItemPointer htup;
     882                 : 
     883           64491 :                                 itup = (IndexTuple) PageGetItem(page,
     884                 :                                                                                                 PageGetItemId(page, offnum));
     885           64491 :                                 htup = &(itup->t_tid);
     886           64491 :                                 if (callback(htup, callback_state))
     887            6952 :                                         deletable[ndeletable++] = offnum;
     888                 :                         }
     889                 :                 }
     890                 : 
     891                 :                 /*
     892                 :                  * Apply any needed deletes.  We issue just one _bt_delitems() call
     893                 :                  * per page, so as to minimize WAL traffic.
     894                 :                  */
     895            1275 :                 if (ndeletable > 0)
     896                 :                 {
     897             151 :                         _bt_delitems(rel, buf, deletable, ndeletable);
     898             151 :                         stats->tuples_removed += ndeletable;
     899                 :                         /* must recompute maxoff */
     900             151 :                         maxoff = PageGetMaxOffsetNumber(page);
     901                 :                 }
     902                 :                 else
     903                 :                 {
     904                 :                         /*
     905                 :                          * If the page has been split during this vacuum cycle, it seems
     906                 :                          * worth expending a write to clear btpo_cycleid even if we don't
     907                 :                          * have any deletions to do.  (If we do, _bt_delitems takes care
     908                 :                          * of this.)  This ensures we won't process the page again.
     909                 :                          *
     910                 :                          * We treat this like a hint-bit update because there's no need to
     911                 :                          * WAL-log it.
     912                 :                          */
     913            1124 :                         if (vstate->cycleid != 0 &&
     914                 :                                 opaque->btpo_cycleid == vstate->cycleid)
     915                 :                         {
     916               0 :                                 opaque->btpo_cycleid = 0;
     917               0 :                                 SetBufferCommitInfoNeedsSave(buf);
     918                 :                         }
     919                 :                 }
     920                 : 
     921                 :                 /*
     922                 :                  * If it's now empty, try to delete; else count the live tuples. We
     923                 :                  * don't delete when recursing, though, to avoid putting entries into
      924                 :                  * freePages out-of-order (it doesn't seem worth extra code to
      925                 :                  * handle that case).
     926                 :                  */
     927            1275 :                 if (minoff > maxoff)
     928               8 :                         delete_now = (blkno == orig_blkno);
     929                 :                 else
     930            1267 :                         stats->num_index_tuples += maxoff - minoff + 1;
     931                 :         }
     932                 : 
     933            1347 :         if (delete_now)
     934                 :         {
     935                 :                 MemoryContext oldcontext;
     936                 :                 int                     ndel;
     937                 : 
     938                 :                 /* Run pagedel in a temp context to avoid memory leakage */
     939               8 :                 MemoryContextReset(vstate->pagedelcontext);
     940              16 :                 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
     941                 : 
     942               8 :                 ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);
     943                 : 
     944                 :                 /* count only this page, else may double-count parent */
     945               8 :                 if (ndel)
     946               2 :                         stats->pages_deleted++;
     947                 : 
     948                 :                 /*
     949                 :                  * During VACUUM FULL it's okay to recycle deleted pages immediately,
     950                 :                  * since there can be no other transactions scanning the index.  Note
     951                 :                  * that we will only recycle the current page and not any parent pages
     952                 :                  * that _bt_pagedel might have recursed to; this seems reasonable in
     953                 :                  * the name of simplicity.      (Trying to do otherwise would mean we'd
     954                 :                  * have to sort the list of recyclable pages we're building.)
     955                 :                  */
     956               8 :                 if (ndel && info->vacuum_full)
     957                 :                 {
     958               0 :                         if (vstate->nFreePages < vstate->maxFreePages)
     959               0 :                                 vstate->freePages[vstate->nFreePages++] = blkno;
     960               0 :                         vstate->totFreePages++;
     961                 :                 }
     962                 : 
     963                 :                 MemoryContextSwitchTo(oldcontext);
     964                 :                 /* pagedel released buffer, so we shouldn't */
     965                 :         }
     966                 :         else
     967            1339 :                 _bt_relbuf(rel, buf);
     968                 : 
     969                 :         /*
     970                 :          * This is really tail recursion, but if the compiler is too stupid to
     971                 :          * optimize it as such, we'd eat an uncomfortably large amount of stack
     972                 :          * space per recursion level (due to the deletable[] array). A failure is
     973                 :          * improbable since the number of levels isn't likely to be large ... but
     974                 :          * just in case, let's hand-optimize into a loop.
     975                 :          */
     976            1347 :         if (recurse_to != P_NONE)
     977                 :         {
     978               0 :                 blkno = recurse_to;
     979               0 :                 goto restart;
     980                 :         }
     981                 : }
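
The goto above is manual tail-call elimination: written recursively, each
level would keep its own page-sized deletable[] array on the stack, while the
loop form reuses one frame. The two shapes side by side, schematically (visit
is a hypothetical stand-in for the body of this function):

    #define NO_BLOCK ((unsigned) -1)

    /* stand-in for the body above: process a block, maybe name another
     * earlier block that must be revisited */
    static unsigned visit(unsigned blkno)
    {
        char deletable[8192];       /* the large per-call buffer */
        (void) deletable;
        (void) blkno;
        return NO_BLOCK;
    }

    /* recursive form: one 8 KB frame per level, and nothing forces the
     * compiler to turn the tail call into a jump */
    static void vacuum_page_rec(unsigned blkno)
    {
        unsigned next = visit(blkno);

        if (next != NO_BLOCK)
            vacuum_page_rec(next);
    }

    /* hand-optimized form: same behavior, constant stack */
    static void vacuum_page_loop(unsigned blkno)
    {
    restart:
        blkno = visit(blkno);
        if (blkno != NO_BLOCK)
            goto restart;
    }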

Generated by: LTP GCOV extension version 1.5