/*-------------------------------------------------------------------------
 *
 * hio.c
 *	  POSTGRES heap access method input/output code.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.67 2007/09/20 17:56:30 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/hio.h"
#include "storage/freespace.h"


/*
 * RelationPutHeapTuple - place tuple at specified page
 *
 * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!  Must PANIC on failure!!!
 *
 * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer.
 */
void
RelationPutHeapTuple(Relation relation,
					 Buffer buffer,
					 HeapTuple tuple)
{
	Page		pageHeader;
	OffsetNumber offnum;
	ItemId		itemId;
	Item		item;

	/* Add the tuple to the page */
	pageHeader = BufferGetPage(buffer);

	offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
						 tuple->t_len, InvalidOffsetNumber, false, true);

	if (offnum == InvalidOffsetNumber)
		elog(PANIC, "failed to add tuple to page");

	/* Update tuple->t_self to the actual position where it was stored */
	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);

	/* Insert the correct position into CTID of the stored tuple, too */
	itemId = PageGetItemId(pageHeader, offnum);
	item = PageGetItem(pageHeader, itemId);
	((HeapTupleHeader) item)->t_ctid = tuple->t_self;
}
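
/*
 * Typical usage (a simplified sketch of the heap_insert code path; buffer
 * dirtying, WAL logging, and error handling are omitted):
 *
 *		buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
 *										   InvalidBuffer, use_fsm);
 *		START_CRIT_SECTION();
 *		RelationPutHeapTuple(relation, buffer, heaptup);
 *		... mark buffer dirty, write WAL ...
 *		END_CRIT_SECTION();
 *		UnlockReleaseBuffer(buffer);
 *
 * Note that RelationGetBufferForTuple is called before the critical section,
 * since it is allowed to ereport(ERROR), whereas RelationPutHeapTuple must
 * not (see the comments on each routine).
 */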

/*
 * RelationGetBufferForTuple
 *
 * Returns pinned and exclusive-locked buffer of a page in given relation
 * with free space >= given len.
 *
 * If otherBuffer is not InvalidBuffer, then it references a previously
 * pinned buffer of another page in the same relation; on return, this
 * buffer will also be exclusive-locked.  (This case is used by heap_update;
 * the otherBuffer contains the tuple being updated.)
 *
 * The reason for passing otherBuffer is that if two backends are doing
 * concurrent heap_update operations, a deadlock could occur if they try
 * to lock the same two buffers in opposite orders.  To ensure that this
 * can't happen, we impose the rule that buffers of a relation must be
 * locked in increasing page number order.  This is most conveniently done
 * by having RelationGetBufferForTuple lock them both, with suitable care
 * for ordering.
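 *
 * For example, if one backend is updating a tuple on block 2 and placing the
 * new version on block 7, while another backend is updating a tuple on block
 * 7 and placing its new version on block 2, both backends will lock block 2
 * before block 7, so they cannot deadlock against each other.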
 *
 * NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 * same buffer we select for insertion of the new tuple (this could only
 * happen if space is freed in that page after heap_update finds there's not
 * enough there).  In that case, the page will be pinned and locked only once.
 *
 * If use_fsm is true (the normal case), we use FSM to help us find free
 * space.  If use_fsm is false, we always append a new empty page to the
 * end of the relation if the tuple won't fit on the current target page.
 * This can save some cycles when we know the relation is new and doesn't
 * contain useful amounts of free space.
 *
 * The use_fsm = false case is also useful for non-WAL-logged additions to a
 * relation, if the caller holds exclusive lock and is careful to invalidate
 * relation->rd_targblock before the first insertion --- that ensures that
 * all insertions will occur into newly added pages and not be intermixed
 * with tuples from other transactions.  That way, a crash can't risk losing
 * any committed data of other transactions.  (See heap_insert's comments
 * for additional constraints needed for safe usage of this behavior.)
 *
 * We always try to avoid filling existing pages further than the fillfactor.
 * This is OK since this routine is not consulted when updating a tuple and
 * keeping it on the same page, which is the scenario fillfactor is meant
 * to reserve space for.
 *
 * ereport(ERROR) is allowed here, so this routine *must* be called
 * before any (unlogged) changes are made in buffer pool.
 */
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
						  Buffer otherBuffer, bool use_fsm)
{
	Buffer		buffer = InvalidBuffer;
	Page		pageHeader;
	Size		pageFreeSpace,
				saveFreeSpace;
	BlockNumber targetBlock,
				otherBlock;
	bool		needLock;

	len = MAXALIGN(len);		/* be conservative */

	/*
	 * If we're gonna fail for oversize tuple, do it right away
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %lu, maximum size %lu",
						(unsigned long) len,
						(unsigned long) MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
												   HEAP_DEFAULT_FILLFACTOR);
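
	/*
	 * For example, with the default 8 kB block size and a fillfactor of 70,
	 * this reserves BLCKSZ * (100 - 70) / 100 = 2457 bytes per page; with
	 * the heap default fillfactor of 100, no extra space is reserved.
	 */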

	if (otherBuffer != InvalidBuffer)
		otherBlock = BufferGetBlockNumber(otherBuffer);
	else
		otherBlock = InvalidBlockNumber;	/* just to keep compiler quiet */

	/*
	 * We first try to put the tuple on the same page we last inserted a tuple
	 * on, as cached in the relcache entry.  If that doesn't work, we ask the
	 * shared Free Space Map to locate a suitable page.  Since the FSM's info
	 * might be out of date, we have to be prepared to loop around and retry
	 * multiple times.  (To ensure this isn't an infinite loop, we must update
	 * the FSM with the correct amount of free space on each page that proves
	 * not to be suitable.)  If the FSM has no record of a page with enough
	 * free space, we give up and extend the relation.
	 *
	 * When use_fsm is false, we either put the tuple onto the existing target
	 * page or extend the relation.
	 */
	if (len + saveFreeSpace <= MaxHeapTupleSize)
		targetBlock = relation->rd_targblock;
	else
	{
		/* can't fit, don't screw up FSM request tracking by trying */
		targetBlock = InvalidBlockNumber;
		use_fsm = false;
	}

	if (targetBlock == InvalidBlockNumber && use_fsm)
	{
		/*
		 * We have no cached target page, so ask the FSM for an initial
		 * target.
		 */
		targetBlock = GetPageWithFreeSpace(&relation->rd_node,
										   len + saveFreeSpace);

		/*
		 * If the FSM knows nothing of the rel, try the last page before we
		 * give up and extend.  This avoids one-tuple-per-page syndrome during
		 * bootstrapping or in a recently-started system.
		 */
		if (targetBlock == InvalidBlockNumber)
		{
			BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

			if (nblocks > 0)
				targetBlock = nblocks - 1;
		}
	}

	while (targetBlock != InvalidBlockNumber)
	{
		/*
		 * Read and exclusive-lock the target block, as well as the other
		 * block if one was given, taking suitable care with lock ordering and
		 * the possibility they are the same block.
		 */
		if (otherBuffer == InvalidBuffer)
		{
			/* easy case */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock == targetBlock)
		{
			/* also easy case */
			buffer = otherBuffer;
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock < targetBlock)
		{
			/* lock other buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else
		{
			/* lock target buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}
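
		/*
		 * Whichever case applied, both buffers (or the single shared buffer)
		 * are now exclusive-locked, and any locks were taken in increasing
		 * block number order as required by the deadlock-avoidance rule
		 * described above.
		 */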

		/*
		 * Now we can check to see if there's enough free space here. If so,
		 * we're done.
		 */
		pageHeader = (Page) BufferGetPage(buffer);
		pageFreeSpace = PageGetHeapFreeSpace(pageHeader);
		if (len + saveFreeSpace <= pageFreeSpace)
		{
			/* use this page as future insert target, too */
			relation->rd_targblock = targetBlock;
			return buffer;
		}

		/*
		 * Not enough space, so we must give up our page locks and pin (if
		 * any) and prepare to look elsewhere.  We don't care which order we
		 * unlock the two buffers in, so this can be slightly simpler than the
		 * code above.
		 */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (otherBuffer == InvalidBuffer)
			ReleaseBuffer(buffer);
		else if (otherBlock != targetBlock)
		{
			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
		}

		/* Without FSM, always fall out of the loop and extend */
		if (!use_fsm)
			break;

		/*
		 * Update FSM as to condition of this page, and ask for another page
		 * to try.
		 */
		targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node,
													targetBlock,
													pageFreeSpace,
													len + saveFreeSpace);
	}

	/*
	 * Have to extend the relation.
	 *
	 * We have to use a lock to ensure no one else is extending the rel at the
	 * same time, else we will both try to initialize the same new page.  We
	 * can skip locking for new or temp relations, however, since no one else
	 * could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(relation);

	if (needLock)
		LockRelationForExtension(relation, ExclusiveLock);

	/*
	 * XXX This does an lseek - rather expensive - but at the moment it is the
	 * only way to accurately determine how many blocks are in a relation.  Is
	 * it worth keeping an accurate file length in shared memory someplace,
	 * rather than relying on the kernel to do it for us?
	 */
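	/*
	 * P_NEW tells ReadBuffer to allocate a new block at the end of the
	 * relation and return it pinned.
	 */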
	buffer = ReadBuffer(relation, P_NEW);

	/*
	 * We can be certain that locking the otherBuffer first is OK, since it
	 * must have a lower page number.
	 */
	if (otherBuffer != InvalidBuffer)
		LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * Now acquire lock on the new page.
	 */
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * Release the file-extension lock; it's now OK for someone else to extend
	 * the relation some more.  Note that we cannot release this lock before
	 * we have buffer lock on the new page, or we risk a race condition
	 * against vacuumlazy.c --- see comments therein.
	 */
	if (needLock)
		UnlockRelationForExtension(relation, ExclusiveLock);

	/*
	 * We need to initialize the empty new page.  Double-check that it really
	 * is empty (this should never happen, but if it does we don't want to
	 * risk wiping out valid data).
	 */
	pageHeader = (Page) BufferGetPage(buffer);

	if (!PageIsNew((PageHeader) pageHeader))
		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
			 BufferGetBlockNumber(buffer),
			 RelationGetRelationName(relation));

	PageInit(pageHeader, BufferGetPageSize(buffer), 0);

	if (len > PageGetHeapFreeSpace(pageHeader))
	{
		/* We should not get here given the test at the top */
		elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
	}

	/*
	 * Remember the new page as our target for future insertions.
	 *
	 * XXX should we enter the new page into the free space map immediately,
	 * or just keep it for this backend's exclusive use in the short run
	 * (until VACUUM sees it)?  Seems to depend on whether you expect the
	 * current backend to make more insertions or not, which is probably a
	 * good bet most of the time.  So for now, don't add it to FSM yet.
	 */
	relation->rd_targblock = BufferGetBlockNumber(buffer);

	return buffer;
}