/*-------------------------------------------------------------------------
 *
 * hashpage.c
 *    Hash table page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.71 2007/11/15 21:14:32 momjian Exp $
 *
 * NOTES
 *    Postgres hash pages look like ordinary relation pages.  The opaque
 *    data at high addresses describes the page: whether it is an overflow
 *    page or a true bucket page, the bucket number, and the block numbers
 *    of the preceding and following pages in the same bucket.
 *
 *    The first page in a hash relation, page zero, is special -- it stores
 *    information describing the hash table; it is referred to as the
 *    "meta page."  Pages one and higher store the actual data.
 *
 *    There are also bitmap pages, which are not manipulated here;
 *    see hashovfl.c.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/hash.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/lsyscache.h"


static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
                    uint32 nblocks);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
                  Bucket obucket, Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask, uint32 lowmask);


/*
 * We use high-concurrency locking on hash indexes (see README for an overview
 * of the locking rules).  However, we can skip taking lmgr locks when the
 * index is local to the current backend (ie, either temp or new in the
 * current transaction).  No one else can see it, so there's no reason to
 * take locks.  We still take buffer-level locks, but not lmgr locks.
 */
#define USELOCKING(rel)		(!RELATION_IS_LOCAL(rel))


/*
 * _hash_getlock() -- Acquire an lmgr lock.
 *
 * 'whichlock' should be zero to acquire the split-control lock, or the
 * block number of a bucket's primary bucket page to acquire the per-bucket
 * lock.  (See README for details of the use of these locks.)
 *
 * 'access' must be HASH_SHARE or HASH_EXCLUSIVE.
 */
void
_hash_getlock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        LockPage(rel, whichlock, access);
}

/*
 * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free.
 *
 * Same as above, except that we return FALSE without blocking if the lock
 * isn't free.
 */
bool
_hash_try_getlock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        return ConditionalLockPage(rel, whichlock, access);
    else
        return true;
}

/*
 * _hash_droplock() -- Release an lmgr lock.
 */
void
_hash_droplock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        UnlockPage(rel, whichlock, access);
}
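
/*
 * Usage sketch (hypothetical; the function name is invented for
 * illustration): a reader following the protocol above takes the
 * per-bucket lmgr lock before touching the bucket's primary page and
 * drops it when done.  This roughly mirrors the pattern used by the
 * hash scan code.
 */
#ifdef NOT_USED
static void
example_visit_bucket(Relation rel, BlockNumber bucket_blkno)
{
    Buffer      buf;

    /* per-bucket lock, share mode (see README) */
    _hash_getlock(rel, bucket_blkno, HASH_SHARE);

    /* pin and share-lock the bucket's primary page */
    buf = _hash_getbuf(rel, bucket_blkno, HASH_READ, LH_BUCKET_PAGE);

    /* ... examine the page here ... */

    _hash_relbuf(rel, buf);
    _hash_droplock(rel, bucket_blkno, HASH_SHARE);
}
#endif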

/*
 * _hash_getbuf() -- Get a buffer by block number for read or write.
 *
 * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
 * 'flags' is a bitwise OR of the allowed page types.
 *
 * This must be used only to fetch pages that are expected to be valid
 * already.  _hash_checkpage() is applied using the given flags.
 *
 * When this routine returns, the appropriate lock is set on the
 * requested buffer and its reference count has been incremented
 * (ie, the buffer is "locked and pinned").
 *
 * P_NEW is disallowed because this routine can only be used
 * to access pages that are known to be before the filesystem EOF.
 * Extending the index should be done with _hash_getnewbuf.
 */
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
{
    Buffer      buf;

    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");

    buf = ReadBuffer(rel, blkno);

    if (access != HASH_NOLOCK)
        LockBuffer(buf, access);

    /* ref count and lock type are correct */

    _hash_checkpage(rel, buf, flags);

    return buf;
}
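
/*
 * Usage sketch (hypothetical helper, invented for illustration): because
 * 'flags' is a bitwise OR of acceptable page types, a caller that follows
 * a bucket chain and does not know whether a block is the primary bucket
 * page or an overflow page can simply pass both flags.
 */
#ifdef NOT_USED
static Buffer
example_read_chain_page(Relation rel, BlockNumber blkno)
{
    return _hash_getbuf(rel, blkno, HASH_READ,
                        LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
}
#endif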

/*
 * _hash_getinitbuf() -- Get and initialize a buffer by block number.
 *
 * This must be used only to fetch pages that are known to be before
 * the index's filesystem EOF, but are to be filled from scratch.
 * _hash_pageinit() is applied automatically.  Otherwise it has
 * effects similar to _hash_getbuf() with access = HASH_WRITE.
 *
 * When this routine returns, a write lock is set on the
 * requested buffer and its reference count has been incremented
 * (ie, the buffer is "locked and pinned").
 *
 * P_NEW is disallowed because this routine can only be used
 * to access pages that are known to be before the filesystem EOF.
 * Extending the index should be done with _hash_getnewbuf.
 */
Buffer
_hash_getinitbuf(Relation rel, BlockNumber blkno)
{
    Buffer      buf;

    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");

    buf = ReadOrZeroBuffer(rel, blkno);

    LockBuffer(buf, HASH_WRITE);

    /* ref count and lock type are correct */

    /* initialize the page */
    _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));

    return buf;
}

/*
 * _hash_getnewbuf() -- Get a new page at the end of the index.
 *
 * This has the same API as _hash_getinitbuf, except that we are adding
 * a page to the index, and hence expect the page to be past the
 * logical EOF.  (However, we have to support the case where it isn't,
 * since a prior try might have crashed after extending the filesystem
 * EOF but before updating the metapage to reflect the added page.)
 *
 * It is the caller's responsibility to ensure that only one process can
 * extend the index at a time.
 */
Buffer
_hash_getnewbuf(Relation rel, BlockNumber blkno)
{
    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
    Buffer      buf;

    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");
    if (blkno > nblocks)
        elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
             RelationGetRelationName(rel));

    /* smgr insists we use P_NEW to extend the relation */
    if (blkno == nblocks)
    {
        buf = ReadBuffer(rel, P_NEW);
        if (BufferGetBlockNumber(buf) != blkno)
            elog(ERROR, "unexpected hash relation size: %u, should be %u",
                 BufferGetBlockNumber(buf), blkno);
    }
    else
        buf = ReadOrZeroBuffer(rel, blkno);

    LockBuffer(buf, HASH_WRITE);

    /* ref count and lock type are correct */

    /* initialize the page */
    _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));

    return buf;
}

/*
 * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
 *
 * This is identical to _hash_getbuf() but also allows a buffer access
 * strategy to be specified.  We use this for VACUUM operations.
 */
Buffer
_hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
                           int access, int flags,
                           BufferAccessStrategy bstrategy)
{
    Buffer      buf;

    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");

    buf = ReadBufferWithStrategy(rel, blkno, bstrategy);

    if (access != HASH_NOLOCK)
        LockBuffer(buf, access);

    /* ref count and lock type are correct */

    _hash_checkpage(rel, buf, flags);

    return buf;
}
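
/*
 * Usage sketch (hypothetical helper, invented for illustration): bulk
 * maintenance code such as VACUUM passes a buffer access strategy so that
 * walking the whole index does not flush the shared buffer cache.
 */
#ifdef NOT_USED
static void
example_vacuum_page(Relation rel, BlockNumber blkno,
                    BufferAccessStrategy bstrategy)
{
    Buffer      buf;

    buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                     LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                     bstrategy);

    /* ... remove dead tuples from the page here ... */

    /* mark dirty, then release lock and pin */
    _hash_wrtbuf(rel, buf);
}
#endif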

/*
 * _hash_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_hash_relbuf(Relation rel, Buffer buf)
{
    UnlockReleaseBuffer(buf);
}

/*
 * _hash_dropbuf() -- release an unlocked buffer.
 *
 * This is used to unpin a buffer on which we hold no lock.
 */
void
_hash_dropbuf(Relation rel, Buffer buf)
{
    ReleaseBuffer(buf);
}

/*
 * _hash_wrtbuf() -- write a hash page to disk.
 *
 * This routine releases the lock held on the buffer and our refcount
 * for it.  It is an error to call _hash_wrtbuf() without a write lock
 * and a pin on the buffer.
 *
 * NOTE: this routine should go away when/if hash indexes are WAL-ified.
 * The correct sequence of operations is to mark the buffer dirty, then
 * write the WAL record, then release the lock and pin; so marking dirty
 * can't be combined with releasing.
 */
void
_hash_wrtbuf(Relation rel, Buffer buf)
{
    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);
}

/*
 * _hash_chgbufaccess() -- Change the lock type on a buffer, without
 *          dropping our pin on it.
 *
 * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK,
 * the last indicating that no buffer-level lock is held or wanted.
 *
 * When from_access == HASH_WRITE, we assume the buffer is dirty and tell
 * bufmgr it must be written out.  If the caller wants to release a write
 * lock on a page that's not been modified, it's okay to pass from_access
 * as HASH_READ (a bit ugly, but handy in some places).
 */
void
_hash_chgbufaccess(Relation rel,
                   Buffer buf,
                   int from_access,
                   int to_access)
{
    if (from_access == HASH_WRITE)
        MarkBufferDirty(buf);
    if (from_access != HASH_NOLOCK)
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    if (to_access != HASH_NOLOCK)
        LockBuffer(buf, to_access);
}
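
/*
 * A common idiom (shown here as a sketch, not a new requirement): keep a
 * pin on the metapage for the life of an operation and take the buffer
 * lock only while its contents are actually being read or changed:
 *
 *      _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
 *      ... copy out the metapage fields we need ...
 *      _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 *
 * _hash_expandtable() below uses the same idiom with HASH_WRITE when it
 * updates the metapage.
 */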


/*
 * _hash_metapinit() -- Initialize the metadata page of a hash index,
 *              the two buckets that we begin with and the initial
 *              bitmap page.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
void
_hash_metapinit(Relation rel)
{
    HashMetaPage metap;
    HashPageOpaque pageopaque;
    Buffer      metabuf;
    Buffer      buf;
    Page        pg;
    int32       data_width;
    int32       item_width;
    int32       ffactor;
    uint16      i;

    /* safety check */
    if (RelationGetNumberOfBlocks(rel) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Determine the target fill factor (in tuples per bucket) for this index.
     * The idea is to make the fill factor correspond to pages about as full
     * as the user-settable fillfactor parameter says.  We can compute it
     * exactly if the index datatype is fixed-width, but for var-width there's
     * some guessing involved.
     */
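    /*
     * Worked example (approximate figures, assuming an int4 key column,
     * 8-byte maximum alignment, 8K pages, and the default fillfactor of
     * 75%): data_width is 4, item_width is MAXALIGN(8) + MAXALIGN(4) + 4
     * = 20 bytes, the target page usage is about 6144 bytes, and so
     * ffactor comes out to roughly 6144 / 20 = 307 tuples per bucket.
     */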
    data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
                                 RelationGetDescr(rel)->attrs[0]->atttypmod);
    item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
        sizeof(ItemIdData);     /* include the line pointer */
    ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;

    /*
     * We initialize the metapage, the first two bucket pages, and the first
     * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     * calls to occur.  This ensures that the smgr level has the right idea of
     * the physical index length.
     */
    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE);
    pg = BufferGetPage(metabuf);

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = -1;
    pageopaque->hasho_flag = LH_META_PAGE;
    pageopaque->hasho_page_id = HASHO_PAGE_ID;

    metap = (HashMetaPage) pg;

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = BufferGetPageSize(metabuf);
    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
    {
        if ((1 << i) <= (metap->hashm_bsize -
                         (MAXALIGN(sizeof(PageHeaderData)) +
                          MAXALIGN(sizeof(HashPageOpaqueData)))))
            break;
    }
    Assert(i > 0);
    metap->hashm_bmsize = 1 << i;
    metap->hashm_bmshift = i + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
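    /*
     * Worked example (assuming the default 8K block size): _hash_log2(8192)
     * is 13, and 8192 bytes do not fit once the page header and hash opaque
     * data are subtracted, so the loop settles on i = 12.  That yields
     * hashm_bmsize = 4096 bytes and hashm_bmshift = 15, i.e. each bitmap
     * page tracks 4096 * 8 = 32768 overflow-page bits.
     */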

    /*
     * Label the index with its primary hash support function's OID.  This is
     * pretty useless for normal operation (in fact, hashm_procid is not used
     * anywhere), but it might be handy for forensic purposes so we keep it.
     */
    metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

    /*
     * We initialize the index with two buckets, 0 and 1, occupying physical
     * blocks 1 and 2.  The first freespace bitmap page is in block 3.
     */
    metap->hashm_maxbucket = metap->hashm_lowmask = 1;  /* nbuckets - 1 */
    metap->hashm_highmask = 3;  /* (nbuckets << 1) - 1 */

    MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    metap->hashm_spares[1] = 1; /* the first bitmap page is the only spare */
    metap->hashm_ovflpoint = 1;
    metap->hashm_firstfree = 0;

    /*
     * Initialize the first two buckets
     */
    for (i = 0; i <= 1; i++)
    {
        buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
        pg = BufferGetPage(buf);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
        pageopaque->hasho_prevblkno = InvalidBlockNumber;
        pageopaque->hasho_nextblkno = InvalidBlockNumber;
        pageopaque->hasho_bucket = i;
        pageopaque->hasho_flag = LH_BUCKET_PAGE;
        pageopaque->hasho_page_id = HASHO_PAGE_ID;
        _hash_wrtbuf(rel, buf);
    }

    /*
     * Initialize first bitmap page
     */
    _hash_initbitmap(rel, metap, 3);

    /* all done */
    _hash_wrtbuf(rel, metabuf);
}

/*
 * _hash_pageinit() -- Initialize a new hash index page.
 */
void
_hash_pageinit(Page page, Size size)
{
    Assert(PageIsNew(page));
    PageInit(page, size, sizeof(HashPageOpaqueData));
}

/*
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Bucket      old_bucket;
    Bucket      new_bucket;
    uint32      spare_ndx;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32      maxbucket;
    uint32      highmask;
    uint32      lowmask;

    /*
     * Obtain the page-zero lock to assert the right to begin a split (see
     * README).
     *
     * Note: deadlock should be impossible here. Our own backend could only be
     * holding bucket sharelocks due to stopped indexscans; those will not
     * block other holders of the page-zero lock, who are only interested in
     * acquiring bucket sharelocks themselves.  Exclusive bucket locks are
     * only taken here and in hashbulkdelete, and neither of these operations
     * needs any additional locks to complete.  (If, due to some flaw in this
     * reasoning, we manage to deadlock anyway, it's okay to error out; the
     * index will be left in a consistent state.)
     */
    _hash_getlock(rel, 0, HASH_EXCLUSIVE);

    /* Write-lock the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = (HashMetaPage) BufferGetPage(metabuf);

    /*
     * Check to see if split is still needed; someone else might have already
     * done one while we waited for the lock.
     *
     * Make sure this stays in sync with _hash_doinsert()
     */
    if (metap->hashm_ntuples <=
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;

    /*
     * Can't split anymore if maxbucket has reached its maximum possible
     * value.
     *
     * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     * to half that because of overflow looping in _hash_log2() and
     * insufficient space in hashm_spares[].  It's moot anyway because an
     * index with 2^32 buckets would certainly overflow BlockNumber and hence
     * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     * than a disk block then this would be an independent constraint.
     */
    if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
        goto fail;

    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.  If we can't get the lock, give up.
     *
     * The lock protects us against other backends, but not against our own
     * backend.  Must check for active scans separately.
     */
    new_bucket = metap->hashm_maxbucket + 1;

    old_bucket = (new_bucket & metap->hashm_lowmask);

    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

    if (_hash_has_active_scan(rel, old_bucket))
        goto fail;

    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
        goto fail;

    /*
     * Likewise lock the new bucket (should never fail).
     *
     * Note: it is safe to compute the new bucket's blkno here, even though we
     * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     * where we are going to put a new splitpoint's worth of buckets.
     */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

    if (_hash_has_active_scan(rel, new_bucket))
        elog(ERROR, "scan in progress on supposedly new bucket");

    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
        elog(ERROR, "could not get lock on supposedly new bucket");

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to allocate a new batch of bucket pages.
     */
    spare_ndx = _hash_log2(new_bucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);

        /*
         * The number of buckets in the new splitpoint is equal to the total
         * number already in existence, i.e. new_bucket.  Currently this maps
         * one-to-one to blocks required, but someday we may need a more
         * complicated calculation here.
         */
        if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
        {
            /* can't split due to BlockNumber overflow */
            _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
            _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
            goto fail;
        }
    }

    /*
     * Okay to proceed with split.  Update the metapage bucket mapping info.
     *
     * Since we are scribbling on the metapage data right in the shared
     * buffer, any failure in this next little bit leaves us with a big
     * problem: the metapage is effectively corrupt but could get written back
     * to disk.  We don't really expect any failure, but just to be sure,
     * establish a critical section.
     */
    START_CRIT_SECTION();

    metap->hashm_maxbucket = new_bucket;

    if (new_bucket > metap->hashm_highmask)
    {
        /* Starting a new doubling */
        metap->hashm_lowmask = metap->hashm_highmask;
        metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
    }

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to adjust the hashm_spares[] array and
     * hashm_ovflpoint so that future overflow pages will be created beyond
     * this new batch of bucket pages.
     */
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
        metap->hashm_ovflpoint = spare_ndx;
    }

    /* Done mucking with metapage */
    END_CRIT_SECTION();

    /*
     * Copy bucket mapping info now; this saves re-accessing the meta page
     * inside _hash_splitbucket's inner loop.  Note that once we drop the
     * split lock, other splits could begin, so these values might be out of
     * date before _hash_splitbucket finishes.  That's okay, since all it
     * needs is to tell which of these two buckets to map hashkeys into.
     */
    maxbucket = metap->hashm_maxbucket;
    highmask = metap->hashm_highmask;
    lowmask = metap->hashm_lowmask;

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Release split lock; okay for other splits to occur now */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);

    /* Relocate records to the new bucket */
    _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
                      start_oblkno, start_nblkno,
                      maxbucket, highmask, lowmask);

    /* Release bucket locks, allowing others to access them */
    _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
    _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

    return;

    /* Here if we decide not to split or fail to acquire the old bucket lock */
fail:

    /* We didn't write the metapage, so just drop lock */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Release split lock */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
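
/*
 * Caller sketch (hypothetical, simplified; see _hash_doinsert() in
 * hashinsert.c for the real logic): the insert path holds a pin on the
 * metapage, decides under the metapage lock whether the fill factor has
 * been exceeded, releases the lock, and only then attempts the split.
 */
#ifdef NOT_USED
static void
example_maybe_expand(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    bool        do_expand;

    /* read-lock the metapage; we already hold a pin on it */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
    metap = (HashMetaPage) BufferGetPage(metabuf);

    /* same test as above, with the comparison inverted */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    /* drop lock but keep pin, as _hash_expandtable expects */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    if (do_expand)
        _hash_expandtable(rel, metabuf);
}
#endif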


/*
 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
 *
 * This does not need to initialize the new bucket pages; we'll do that as
 * each one is used by _hash_expandtable().  But we have to extend the logical
 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
 * sync with ours, so that we don't get complaints from smgr.
 *
 * We do this by writing a page of zeroes at the end of the splitpoint range.
 * We expect that the filesystem will ensure that the intervening pages read
 * as zeroes too.  On many filesystems this "hole" will not be allocated
 * immediately, which means that the index file may end up more fragmented
 * than if we forced it all to be allocated now; but since we don't scan
 * hash indexes sequentially anyway, that probably doesn't matter.
 *
 * XXX It's annoying that this code is executed with the metapage lock held.
 * We need to interlock against _hash_getovflpage() adding a new overflow page
 * concurrently, but it'd likely be better to use LockRelationForExtension
 * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
 * so it may not be worth worrying about.
 *
 * Returns TRUE if successful, or FALSE if allocation failed due to
 * BlockNumber overflow.
 */
static bool
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
    BlockNumber lastblock;
    char        zerobuf[BLCKSZ];

    lastblock = firstblock + nblocks - 1;

    /*
     * Check for overflow in block number calculation; if so, we cannot extend
     * the index anymore.
     */
    if (lastblock < firstblock || lastblock == InvalidBlockNumber)
        return false;

    MemSet(zerobuf, 0, sizeof(zerobuf));

    RelationOpenSmgr(rel);
    smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);

    return true;
}


/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Bucket      bucket;
    Buffer      obuf;
    Buffer      nbuf;
    BlockNumber oblkno;
    BlockNumber nblkno;
    bool        null;
    Datum       datum;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;
    IndexTuple  itup;
    Size        itemsz;
    OffsetNumber ooffnum;
    OffsetNumber noffnum;
    OffsetNumber omaxoffnum;
    Page        opage;
    Page        npage;
    TupleDesc   itupdesc = RelationGetDescr(rel);

    /*
     * It should be okay to simultaneously write-lock pages from each bucket,
     * since no one else can be trying to acquire buffer lock on pages of
     * either bucket.
     */
    oblkno = start_oblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
    opage = BufferGetPage(obuf);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    nblkno = start_nblkno;
    nbuf = _hash_getnewbuf(rel, nblkno);
    npage = BufferGetPage(nbuf);

    /* initialize the new bucket's primary page */
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_page_id = HASHO_PAGE_ID;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain and
     * adding overflow pages to the new bucket as needed.
     */
    ooffnum = FirstOffsetNumber;
    omaxoffnum = PageGetMaxOffsetNumber(opage);
    for (;;)
    {
        /*
         * at each iteration through this loop, each of these variables should
         * be up-to-date: obuf opage oopaque ooffnum omaxoffnum
         */

        /* check if we're at the end of the page */
        if (ooffnum > omaxoffnum)
        {
            /* at end of page, but check for an(other) overflow page */
            oblkno = oopaque->hasho_nextblkno;
            if (!BlockNumberIsValid(oblkno))
                break;

            /*
             * we ran out of tuples on this particular page, but we have more
             * overflow pages; advance to next page.
             */
            _hash_wrtbuf(rel, obuf);

            obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            opage = BufferGetPage(obuf);
            oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
            ooffnum = FirstOffsetNumber;
            omaxoffnum = PageGetMaxOffsetNumber(opage);
            continue;
        }

        /*
         * Re-hash the tuple to determine which bucket it now belongs in.
         *
         * It is annoying to call the hash function while holding locks, but
         * releasing and relocking the page for each tuple is unappealing too.
         */
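        /*
         * Worked example: every tuple in the old bucket has
         * hashkey & lowmask == obucket, so re-hashing can only yield
         * obucket or nbucket.  E.g. with lowmask = 3, highmask = 7,
         * obucket = 1 and nbucket = 5, a hash value ending in binary 001
         * stays in bucket 1, while one ending in 101 moves to bucket 5.
         */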
        itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
        datum = index_getattr(itup, 1, itupdesc, &null);
        Assert(!null);

        bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                      maxbucket, highmask, lowmask);

        if (bucket == nbucket)
        {
            /*
             * insert the tuple into the new bucket.  if it doesn't fit on the
             * current page in the new bucket, we must allocate a new overflow
             * page and place the tuple on that page instead.
             */
            itemsz = IndexTupleDSize(*itup);
            itemsz = MAXALIGN(itemsz);

            if (PageGetFreeSpace(npage) < itemsz)
            {
                /* write out nbuf and drop lock, but keep pin */
                _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                /* chain to a new overflow page */
                nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                npage = BufferGetPage(nbuf);
                /* we don't need nopaque within the loop */
            }

            noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
            if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false, false)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));

            /*
             * now delete the tuple from the old bucket.  after this section
             * of code, 'ooffnum' will actually point to the ItemId to which
             * we would point if we had advanced it before the deletion
             * (PageIndexTupleDelete repacks the ItemId array).  this also
             * means that 'omaxoffnum' is exactly one less than it used to be,
             * so we really can just decrement it instead of calling
             * PageGetMaxOffsetNumber.
             */
            PageIndexTupleDelete(opage, ooffnum);
            omaxoffnum = OffsetNumberPrev(omaxoffnum);
        }
        else
        {
            /*
             * the tuple stays on this page.  we didn't move anything, so we
             * didn't delete anything and therefore we don't have to change
             * 'omaxoffnum'.
             */
            Assert(bucket == obucket);
            ooffnum = OffsetNumberNext(ooffnum);
        }
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, obuf);
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}