/*-------------------------------------------------------------------------
 *
 * nbtsort.c
 *		Build a btree from sorted input by loading leaf pages sequentially.
 *
 * NOTES
 *
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it in its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
 *
 * This code is moderately slow (~10% slower) compared to the regular
 * btree (insertion) build code on sorted or well-clustered data.  On
 * random data, however, the insertion build code is unusable -- the
 * difference on a 60MB heap is a factor of 15 because the random
 * probes into the btree thrash the buffer pool.  (NOTE: the above
 * "10%" estimate is probably obsolete, since it refers to an old and
 * not very good external sort implementation that used to exist in
 * this module.  tuplesort.c is almost certainly faster.)
 *
 * It is not wise to pack the pages entirely full, since then *any*
 * insertion would cause a split (and not only of the leaf page; the need
 * for a split would cascade right up the tree).  The steady-state load
 * factor for btrees is usually estimated at 70%.  We choose to pack leaf
 * pages to the user-controllable fill factor (default 90%) while upper pages
 * are always packed to 70%.  This gives us reasonable density (there aren't
 * many upper pages if the keys are reasonable-size) without risking a lot of
 * cascading splits during early insertions.
 *
 * Formerly the index pages being built were kept in shared buffers, but
 * that is of no value (since other backends have no interest in them yet)
 * and it created locking problems for CHECKPOINT, because the upper-level
 * pages were held exclusive-locked for long periods.  Now we just build
 * the pages in local memory and smgrwrite or smgrextend them as we finish
 * them.  They will need to be re-read into shared buffers on first use after
 * the build finishes.
 *
 * Since the index will never be used unless it is completely built,
 * from a crash-recovery point of view there is no need to WAL-log the
 * steps of the build.  After completing the index build, we can just sync
 * the whole file to disk using smgrimmedsync() before exiting this module.
 * This can be seen to be sufficient for crash recovery by considering that
 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
 * just after the index build.  However, it is clearly not sufficient if the
 * DBA is using the WAL log for PITR or replication purposes, since another
 * machine would not be able to reconstruct the index from WAL.  Therefore,
 * we log the completed index pages to WAL if and only if WAL archiving is
 * active.
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.113 2007/09/20 17:56:30 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "utils/tuplesort.h"


/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
struct BTSpool
{
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    index;
    bool        isunique;
};

/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
    Page        btps_page;      /* workspace for page building */
    BlockNumber btps_blkno;     /* block # to write this page at */
    IndexTuple  btps_minkey;    /* copy of minimum key (first item) on page */
    OffsetNumber btps_lastoff;  /* last item offset loaded */
    uint32      btps_level;     /* tree level (0 = leaf) */
    Size        btps_full;      /* "full" if less than this much free space */
    struct BTPageState *btps_next;      /* link to parent level, if any */
} BTPageState;

/*
 * Overall status record for index writing phase.
 */
typedef struct BTWriteState
{
    Relation    index;
    bool        btws_use_wal;   /* dump pages to WAL? */
    BlockNumber btws_pages_alloced;     /* # pages allocated */
    BlockNumber btws_pages_written;     /* # pages written out */
    Page        btws_zeropage;  /* workspace for filling zeroes */
} BTWriteState;


static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
               IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
             IndexTuple itup);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
         BTSpool *btspool, BTSpool *btspool2);


/*
 * Interface routines
 */

/*
 * create and initialize a spool structure
 */
BTSpool *
_bt_spoolinit(Relation index, bool isunique, bool isdead)
{
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    int         btKbytes;

    btspool->index = index;
    btspool->isunique = isunique;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation.  This should be OK since a single backend can't
     * run multiple index creations in parallel.  Note that creation of a
     * unique index actually requires two BTSpool objects.  We expect that the
     * second one (for dead tuples) won't get very full, so we give it only
     * work_mem.
     */
    btKbytes = isdead ? work_mem : maintenance_work_mem;
    btspool->sortstate = tuplesort_begin_index(index, isunique,
                                               btKbytes, false);

    return btspool;
}

/*
 * clean up a spool structure and its substructures.
 */
void
_bt_spooldestroy(BTSpool *btspool)
{
    tuplesort_end(btspool->sortstate);
    pfree(btspool);
}

/*
 * spool an index entry into the sort file.
 */
void
_bt_spool(IndexTuple itup, BTSpool *btspool)
{
    tuplesort_putindextuple(btspool->sortstate, itup);
}

/*
 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
 */
void
_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
{
    BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Spool) STATISTICS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
        tuplesort_performsort(btspool2->sortstate);

    wstate.index = btspool->index;

    /*
     * We need to log index creation in WAL iff WAL archiving is enabled AND
     * it's not a temp index.
     */
    wstate.btws_use_wal = XLogArchivingActive() && !wstate.index->rd_istemp;

    /* reserve the metapage */
    wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
    wstate.btws_pages_written = 0;
    wstate.btws_zeropage = NULL;    /* until needed */

    _bt_load(&wstate, btspool, btspool2);
}
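
#ifdef NOT_USED
/*
 * Illustrative sketch (not part of the original file): a minimal driver
 * showing how a caller is expected to use the spool interface above.  In
 * the real backend, btbuild() feeds tuples through IndexBuildHeapScan()'s
 * callback rather than from an array; the array and function name here are
 * hypothetical.
 */
static void
example_build_from_sorted_tuples(Relation index, IndexTuple *itups, int ntups)
{
    BTSpool    *spool = _bt_spoolinit(index, false, false);
    int         i;

    /* spool every index tuple into the tuplesort */
    for (i = 0; i < ntups; i++)
        _bt_spool(itups[i], spool);

    /* sort, then stream the tuples out as finished btree pages */
    _bt_leafbuild(spool, NULL);
    _bt_spooldestroy(spool);
}
#endif   /* NOT_USED */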


/*
 * Internal routines.
 */


/*
 * allocate workspace for a new, clean btree page, not linked to any siblings.
 */
static Page
_bt_blnewpage(uint32 level)
{
    Page        page;
    BTPageOpaque opaque;

    page = (Page) palloc(BLCKSZ);

    /* Zero the page and set up standard page header info */
    _bt_pageinit(page, BLCKSZ);

    /* Initialize BT opaque state */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_prev = opaque->btpo_next = P_NONE;
    opaque->btpo.level = level;
    opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
    opaque->btpo_cycleid = 0;

    /* Make the P_HIKEY line pointer appear allocated */
    ((PageHeader) page)->pd_lower += sizeof(ItemIdData);

    return page;
}

/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
    /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
    RelationOpenSmgr(wstate->index);

    /* XLOG stuff */
    if (wstate->btws_use_wal)
    {
        /* We use the heap NEWPAGE record type for this */
        log_newpage(&wstate->index->rd_node, blkno, page);
    }
    else
    {
        /* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
        PageSetTLI(page, ThisTimeLineID);
    }

    /*
     * If we have to write pages nonsequentially, fill in the space with
     * zeroes until we come back and overwrite.  This is not logically
     * necessary on standard Unix filesystems (unwritten space will read as
     * zeroes anyway), but it should help to avoid fragmentation.  The dummy
     * pages aren't WAL-logged though.
     */
    while (blkno > wstate->btws_pages_written)
    {
        if (!wstate->btws_zeropage)
            wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
        smgrextend(wstate->index->rd_smgr, wstate->btws_pages_written++,
                   (char *) wstate->btws_zeropage,
                   true);
    }

    /*
     * Now write the page.  We say isTemp = true even if it's not a temp
     * index, because there's no need for smgr to schedule an fsync for this
     * write; we'll do it ourselves before ending the build.
     */
    if (blkno == wstate->btws_pages_written)
    {
        /* extending the file... */
        smgrextend(wstate->index->rd_smgr, blkno, (char *) page, true);
        wstate->btws_pages_written++;
    }
    else
    {
        /* overwriting a block we zero-filled before */
        smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);
    }

    pfree(page);
}
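
/*
 * Worked example (illustrative comment, not part of the original file):
 * block 0 is reserved for the metapage, and upper-level pages claim block
 * numbers when created but are written only when finished, so writes arrive
 * out of order.  Say leaf block 1 fills first: we zero-fill block 0, then
 * smgrextend block 1 (btws_pages_written becomes 2).  Its parent was
 * assigned block 2 but isn't finished yet, so writing the next leaf
 * (block 3) zero-fills block 2 first.  When the parent and the metapage
 * are finally emitted, they take the smgrwrite path and overwrite their
 * zero-filled blocks.
 */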

/*
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
 */
static BTPageState *
_bt_pagestate(BTWriteState *wstate, uint32 level)
{
    BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));

    /* create initial page for level */
    state->btps_page = _bt_blnewpage(level);

    /* and assign it a page position */
    state->btps_blkno = wstate->btws_pages_alloced++;

    state->btps_minkey = NULL;
    /* initialize lastoff so first item goes into P_FIRSTKEY */
    state->btps_lastoff = P_HIKEY;
    state->btps_level = level;
    /* set "full" threshold based on level.  See notes at head of file. */
    if (level > 0)
        state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
    else
        state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
                                                   BTREE_DEFAULT_FILLFACTOR);
    /* no parent level, yet */
    state->btps_next = NULL;

    return state;
}
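
/*
 * Worked example (illustrative comment, not part of the original file),
 * assuming the default BLCKSZ of 8192: an upper page (with
 * BTREE_NONLEAF_FILLFACTOR = 70) gets btps_full = 8192 * (100 - 70) / 100
 * = 2457 bytes, so it is considered full once less than 2457 bytes remain
 * free, i.e. it is packed to roughly 70%.  A leaf page at the default
 * fillfactor of 90 gets 8192 * 10 / 100 = 819 bytes, i.e. roughly 90%
 * packed.
 */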

/*
 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
 * that we have built an ItemId array in what has turned out to be a
 * P_RIGHTMOST page.
 */
static void
_bt_slideleft(Page page)
{
    OffsetNumber off;
    OffsetNumber maxoff;
    ItemId      previi;
    ItemId      thisii;

    if (!PageIsEmpty(page))
    {
        maxoff = PageGetMaxOffsetNumber(page);
        previi = PageGetItemId(page, P_HIKEY);
        for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
        {
            thisii = PageGetItemId(page, off);
            *previi = *thisii;
            previi = thisii;
        }
        ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
    }
}

/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
 */
static void
_bt_sortaddtup(Page page,
               Size itemsize,
               IndexTuple itup,
               OffsetNumber itup_off)
{
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTupleData trunctuple;

    if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
    {
        trunctuple = *itup;
        trunctuple.t_info = sizeof(IndexTupleData);
        itup = &trunctuple;
        itemsize = sizeof(IndexTupleData);
    }

    if (PageAddItem(page, (Item) itup, itemsize, itup_off,
                    false, false) == InvalidOffsetNumber)
        elog(ERROR, "failed to add item to the index page");
}

/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *   stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |          ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
    Page        npage;
    BlockNumber nblkno;
    OffsetNumber last_off;
    Size        pgspc;
    Size        itupsz;

    /*
     * This is a handy place to check for cancel interrupts during the btree
     * load phase of index creation.
     */
    CHECK_FOR_INTERRUPTS();

    npage = state->btps_page;
    nblkno = state->btps_blkno;
    last_off = state->btps_lastoff;

    pgspc = PageGetFreeSpace(npage);
    itupsz = IndexTupleDSize(*itup);
    itupsz = MAXALIGN(itupsz);

    /*
     * Check whether the item can fit on a btree page at all.  (Eventually, we
     * ought to try to apply TOAST methods if not.)  We actually need to be
     * able to fit three items on every page, so restrict any one item to 1/3
     * the per-page available space.  Note that at this point, itupsz doesn't
     * include the ItemId.
     *
     * NOTE: similar code appears in _bt_insertonpg() to defend against
     * oversize items being inserted into an already-existing index.  But
     * during creation of an index, we don't go through there.
     */
    if (itupsz > BTMaxItemSize(npage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %lu exceeds btree maximum, %lu",
                        (unsigned long) itupsz,
                        (unsigned long) BTMaxItemSize(npage)),
                 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
                         "Consider a function index of an MD5 hash of the value, "
                         "or use full text indexing.")));

    /*
     * Check to see if page is "full".  It's definitely full if the item won't
     * fit.  Otherwise, compare to the target freespace derived from the
     * fillfactor.  However, we must put at least two items on each page, so
     * disregard fillfactor if we don't have that many.
     */
    if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
    {
        /*
         * Finish off the page and write it out.
         */
        Page        opage = npage;
        BlockNumber oblkno = nblkno;
        ItemId      ii;
        ItemId      hii;
        IndexTuple  oitup;

        /* Create new page of same level */
        npage = _bt_blnewpage(state->btps_level);

        /* and assign it a page position */
        nblkno = wstate->btws_pages_alloced++;

        /*
         * We copy the last item on the page into the new page, and then
         * rearrange the old page so that the 'last item' becomes its high key
         * rather than a true data item.  There had better be at least two
         * items on the page already, else the page would be empty of useful
         * data.
         */
        Assert(last_off > P_FIRSTKEY);
        ii = PageGetItemId(opage, last_off);
        oitup = (IndexTuple) PageGetItem(opage, ii);
        _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

        /*
         * Move 'last' into the high key position on opage
         */
        hii = PageGetItemId(opage, P_HIKEY);
        *hii = *ii;
        ItemIdSetUnused(ii);    /* redundant */
        ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

        /*
         * Link the old page into its parent, using its minimum key.  If we
         * don't have a parent, we have to create one; this adds a new btree
         * level.
         */
        if (state->btps_next == NULL)
            state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

        Assert(state->btps_minkey != NULL);
        ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
        _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
        pfree(state->btps_minkey);

        /*
         * Save a copy of the minimum key for the new page.  We have to copy
         * it off the old page, not the new one, in case we are not at leaf
         * level.
         */
        state->btps_minkey = CopyIndexTuple(oitup);

        /*
         * Set the sibling links for both pages.
         */
        {
            BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
            BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

            oopaque->btpo_next = nblkno;
            nopaque->btpo_prev = oblkno;
            nopaque->btpo_next = P_NONE;        /* redundant */
        }

        /*
         * Write out the old page.  We never need to touch it again, so we can
         * free the opage workspace too.
         */
        _bt_blwritepage(wstate, opage, oblkno);

        /*
         * Reset last_off to point to new page
         */
        last_off = P_FIRSTKEY;
    }

    /*
     * If the new item is the first for its page, stash a copy for later.  Note
     * this will only happen for the first item on a level; on later pages,
     * the first item for a page is copied from the prior page in the code
     * above.
     */
    if (last_off == P_HIKEY)
    {
        Assert(state->btps_minkey == NULL);
        state->btps_minkey = CopyIndexTuple(itup);
    }

    /*
     * Add the new item into the current page.
     */
    last_off = OffsetNumberNext(last_off);
    _bt_sortaddtup(npage, itupsz, itup, last_off);

    state->btps_page = npage;
    state->btps_blkno = nblkno;
    state->btps_lastoff = last_off;
}
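
/*
 * Worked example (illustrative comment, not part of the original file):
 * suppose a leaf page already holds items 1..N and item N+1 doesn't fit.
 * We allocate a new page, copy item N onto it as its first entry, then
 * demote item N on the old page to the high-key slot (linp0).  The old
 * page's saved minimum key gets the old block number poked into its t_tid
 * and is added one level up (creating that level if needed), the sibling
 * links are set, the old page is written out, and only then does item N+1
 * go onto the new page in the usual way.
 */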

/*
 * Finish writing out the completed btree.
 */
static void
_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
{
    BTPageState *s;
    BlockNumber rootblkno = P_NONE;
    uint32      rootlevel = 0;
    Page        metapage;

    /*
     * Each iteration of this loop completes one more level of the tree.
     */
    for (s = state; s != NULL; s = s->btps_next)
    {
        BlockNumber blkno;
        BTPageOpaque opaque;

        blkno = s->btps_blkno;
        opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

        /*
         * We have to link the last page on this level to somewhere.
         *
         * If we're at the top, it's the root, so attach it to the metapage.
         * Otherwise, add an entry for it to its parent using its minimum key.
         * This may cause the last page of the parent level to split, but
         * that's not a problem -- we haven't gotten to it yet.
         */
        if (s->btps_next == NULL)
        {
            opaque->btpo_flags |= BTP_ROOT;
            rootblkno = blkno;
            rootlevel = s->btps_level;
        }
        else
        {
            Assert(s->btps_minkey != NULL);
            ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
            _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
            pfree(s->btps_minkey);
            s->btps_minkey = NULL;
        }

        /*
         * This is the rightmost page, so the ItemId array needs to be slid
         * back one slot.  Then we can dump out the page.
         */
        _bt_slideleft(s->btps_page);
        _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
        s->btps_page = NULL;    /* writepage freed the workspace */
    }

    /*
     * As the last step in the process, construct the metapage and make it
     * point to the new root (unless we had no data at all, in which case it's
     * set to point to "P_NONE").  This changes the index to the "valid" state
     * by filling in a valid magic number in the metapage.
     */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, rootblkno, rootlevel);
    _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}

/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
    BTPageState *state = NULL;
    bool        merge = (btspool2 != NULL);
    IndexTuple  itup,
                itup2 = NULL;
    bool        should_free,
                should_free2,
                load1;
    TupleDesc   tupdes = RelationGetDescr(wstate->index);
    int         i,
                keysz = RelationGetNumberOfAttributes(wstate->index);
    ScanKey     indexScanKey = NULL;

    if (merge)
    {
        /*
         * Another BTSpool for dead tuples exists.  Now we have to merge
         * btspool and btspool2.
         */

        /* the preparation of merge */
        itup = tuplesort_getindextuple(btspool->sortstate,
                                       true, &should_free);
        itup2 = tuplesort_getindextuple(btspool2->sortstate,
                                        true, &should_free2);
        indexScanKey = _bt_mkscankey_nodata(wstate->index);

        for (;;)
        {
            load1 = true;       /* load BTSpool next ? */
            if (itup2 == NULL)
            {
                if (itup == NULL)
                    break;
            }
            else if (itup != NULL)
            {
                for (i = 1; i <= keysz; i++)
                {
                    ScanKey     entry;
                    Datum       attrDatum1,
                                attrDatum2;
                    bool        isNull1,
                                isNull2;
                    int32       compare;

                    entry = indexScanKey + i - 1;
                    attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
                    attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
                    if (isNull1)
                    {
                        if (isNull2)
                            compare = 0;        /* NULL "=" NULL */
                        else if (entry->sk_flags & SK_BT_NULLS_FIRST)
                            compare = -1;       /* NULL "<" NOT_NULL */
                        else
                            compare = 1;        /* NULL ">" NOT_NULL */
                    }
                    else if (isNull2)
                    {
                        if (entry->sk_flags & SK_BT_NULLS_FIRST)
                            compare = 1;        /* NOT_NULL ">" NULL */
                        else
                            compare = -1;       /* NOT_NULL "<" NULL */
                    }
                    else
                    {
                        compare = DatumGetInt32(FunctionCall2(&entry->sk_func,
                                                              attrDatum1,
                                                              attrDatum2));

                        if (entry->sk_flags & SK_BT_DESC)
                            compare = -compare;
                    }
                    if (compare > 0)
                    {
                        load1 = false;
                        break;
                    }
                    else if (compare < 0)
                        break;
                }
            }
            else
                load1 = false;

            /* When we see first tuple, create first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            if (load1)
            {
                _bt_buildadd(wstate, state, itup);
                if (should_free)
                    pfree(itup);
                itup = tuplesort_getindextuple(btspool->sortstate,
                                               true, &should_free);
            }
            else
            {
                _bt_buildadd(wstate, state, itup2);
                if (should_free2)
                    pfree(itup2);
                itup2 = tuplesort_getindextuple(btspool2->sortstate,
                                                true, &should_free2);
            }
        }
        _bt_freeskey(indexScanKey);
    }
    else
    {
        /* merge is unnecessary */
        while ((itup = tuplesort_getindextuple(btspool->sortstate,
                                               true, &should_free)) != NULL)
        {
            /* When we see first tuple, create first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            _bt_buildadd(wstate, state, itup);
            if (should_free)
                pfree(itup);
        }
    }

    /* Close down final pages and write the metapage */
    _bt_uppershutdown(wstate, state);

    /*
     * If the index isn't temp, we must fsync it down to disk before it's safe
     * to commit the transaction.  (For a temp index we don't care since the
     * index will be uninteresting after a crash anyway.)
     *
     * It's obvious that we must do this when not WAL-logging the build.  It's
     * less obvious that we have to do it even if we did WAL-log the index
     * pages.  The reason is that since we're building outside shared buffers,
     * a CHECKPOINT occurring during the build has no way to flush the
     * previously written data to disk (indeed it won't know the index even
     * exists).  A crash later on would replay WAL from the checkpoint,
     * therefore it wouldn't replay our earlier WAL entries.  If we do not
     * fsync those pages here, they might still not be on disk when the crash
     * occurs.
     */
    if (!wstate->index->rd_istemp)
    {
        RelationOpenSmgr(wstate->index);
        smgrimmedsync(wstate->index->rd_smgr);
    }
}
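
#ifdef NOT_USED
/*
 * Illustrative sketch (not part of the original file): the per-attribute
 * ordering rule the merge loop in _bt_load() applies, factored out as a
 * standalone helper.  The function name is hypothetical.  NULLs sort after
 * non-NULLs unless SK_BT_NULLS_FIRST is set, and SK_BT_DESC inverts the
 * result of the ordering support function.
 */
static int32
example_compare_attr(ScanKey entry, Datum d1, bool isNull1,
                     Datum d2, bool isNull2)
{
    int32       compare;

    if (isNull1)
    {
        if (isNull2)
            compare = 0;        /* NULL "=" NULL */
        else if (entry->sk_flags & SK_BT_NULLS_FIRST)
            compare = -1;       /* NULL "<" NOT_NULL */
        else
            compare = 1;        /* NULL ">" NOT_NULL */
    }
    else if (isNull2)
        compare = (entry->sk_flags & SK_BT_NULLS_FIRST) ? 1 : -1;
    else
    {
        /* apply the btree ordering proc, then flip for DESC columns */
        compare = DatumGetInt32(FunctionCall2(&entry->sk_func, d1, d2));
        if (entry->sk_flags & SK_BT_DESC)
            compare = -compare;
    }
    return compare;
}
#endif   /* NOT_USED */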