/*-------------------------------------------------------------------------
 *
 * syncscan.c
 *	  heap scan synchronization support
 *
 * When multiple backends run a sequential scan on the same table, we try
 * to keep them synchronized to reduce the overall I/O needed.  The goal is
 * to read each page into shared buffer cache only once, and let all backends
 * that take part in the shared scan process the page before it falls out of
 * the cache.
 *
 * Since the "leader" in a pack of backends doing a seqscan will have to wait
 * for I/O, while the "followers" don't, there is a strong self-synchronizing
 * effect once we can get the backends examining approximately the same part
 * of the table at the same time.  Hence all that is really needed is to
 * start a new backend's seqscan close to where other backends are already
 * reading.  We can scan the table circularly, from block X up to the end
 * and then from block 0 to X-1, to ensure we visit all rows while still
 * participating in the common scan.  (A sketch of this pattern follows
 * this header comment.)
 *
 * To accomplish that, we keep track of the scan position of each table, and
 * start new scans close to where the previous scan(s) are.  We don't try to
 * do any extra synchronization to keep the scans together afterwards; some
 * scans might progress much more slowly than others, for example if the
 * results need to be transferred to the client over a slow network, and we
 * don't want such queries to slow down others.
 *
 * There can realistically only be a few large sequential scans on different
 * tables in progress at any time.  Therefore we just keep the scan positions
 * in a small LRU list which we scan every time we need to look up or update
 * a scan position.  The whole mechanism is only applied for tables exceeding
 * a threshold size (but that is not the concern of this module).
 *
 * INTERFACE ROUTINES
 *		ss_get_location		- return current scan location of a relation
 *		ss_report_location	- update current scan location
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/access/heap/syncscan.c,v 1.3 2007/11/15 22:25:15 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
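
/*
 * A minimal sketch of how a caller is expected to drive these routines
 * (illustrative only, not the actual heap-scan code; "rel", "nblocks",
 * "page", and "startblock" are hypothetical local variables):
 *
 *		startblock = ss_get_location(rel, nblocks);
 *		page = startblock;
 *		for (;;)
 *		{
 *			... read and process block "page" ...
 *			ss_report_location(rel, page);
 *			if (++page >= nblocks)
 *				page = 0;			-- wrap around circularly
 *			if (page == startblock)
 *				break;				-- every block has been visited
 *		}
 */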
#include "postgres.h"

#include "access/heapam.h"
#include "miscadmin.h"


/* GUC variables */
#ifdef TRACE_SYNCSCAN
bool		trace_syncscan = false;
#endif


/*
 * Size of the LRU list.
 *
 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
 *
 * XXX: What's a good value?  It should be large enough to hold the
 * maximum number of large tables scanned simultaneously.  But a larger value
 * means more traversing of the LRU list when starting a new scan.
 */
#define SYNC_SCAN_NELEM 20

/*
 * Interval between reports of the location of the current scan, in pages.
 *
 * Note: This should be smaller than the ring size (see buffer/freelist.c)
 * we use for bulk reads.  Otherwise a scan joining other scans might start
 * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
 * there's no guarantee that the new scan will read the page before it leaves
 * the buffer cache anyway, and on the other hand the page is most likely
 * still in the OS cache.
 */
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
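
/*
 * With the default BLCKSZ of 8192, SYNC_SCAN_REPORT_INTERVAL works out to
 * 16 pages, i.e. the scan position is reported once per 128 kB read.
 */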


/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointers, but designed to hold a fixed maximum number of elements
 * in fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
	RelFileNode relfilenode;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev;
	struct ss_lru_item_t *next;
	ss_scan_location_t location;
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;
	ss_lru_item_t *tail;
	ss_lru_item_t items[1];		/* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;

#define SizeOfScanLocations(N) offsetof(ss_scan_locations_t, items[N])
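
/*
 * Note: "items[1]" above is the classic pre-C99 variable-length-struct
 * trick; SizeOfScanLocations(N) is the size of the struct header plus room
 * for N items, and the shared-memory chunk is allocated at that size.
 */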

/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
		  BlockNumber location, bool set);


/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
	return SizeOfScanLocations(SYNC_SCAN_NELEM);
}

/*
 * SyncScanShmemInit --- initialize this module's shared memory
 */
void
SyncScanShmemInit(void)
{
	int			i;
	bool		found;

	scan_locations = (ss_scan_locations_t *)
		ShmemInitStruct("Sync Scan Locations List",
						SizeOfScanLocations(SYNC_SCAN_NELEM),
						&found);

	if (!IsUnderPostmaster)
	{
		/* Initialize shared memory area */
		Assert(!found);

		scan_locations->head = &scan_locations->items[0];
		scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

		for (i = 0; i < SYNC_SCAN_NELEM; i++)
		{
			ss_lru_item_t *item = &scan_locations->items[i];

			/*
			 * Initialize all slots with invalid values. As scans are started,
			 * these invalid entries will fall off the LRU list and get
			 * replaced with real entries.
			 */
			item->location.relfilenode.spcNode = InvalidOid;
			item->location.relfilenode.dbNode = InvalidOid;
			item->location.relfilenode.relNode = InvalidOid;
			item->location.location = InvalidBlockNumber;

			item->prev = (i > 0) ?
				(&scan_locations->items[i - 1]) : NULL;
			item->next = (i < SYNC_SCAN_NELEM - 1) ?
				(&scan_locations->items[i + 1]) : NULL;
		}
	}
	else
		Assert(found);
}

/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
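 *
 * For example, if the list holds entries for relations A, B, C (A at the
 * head) and B is looked up, B is moved to the front, giving B, A, C.
 * Looking up an unlisted relation D instead takes over the tail entry (C)
 * and moves it to the front, giving D, A, B.  (Relation names here are
 * purely illustrative.)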
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of the list and no match was found,
			 * take over the last entry
			 */
			if (!match)
			{
				item->location.relfilenode = relfilenode;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/* unlink */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}

/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect that the caller has just called RelationGetNumberOfBlocks(),
 * so that number is passed in rather than being computed again.  The
 * result is guaranteed to be less than relnblocks (assuming relnblocks
 * is > 0).
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
	BlockNumber startloc;

	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
	startloc = ss_search(rel->rd_node, 0, false);
	LWLockRelease(SyncScanLock);

	/*
	 * If the location is not a valid block number for this scan, start at 0.
	 *
	 * This can happen if for instance a VACUUM truncated the table since the
	 * location was saved.
	 */
	if (startloc >= relnblocks)
		startloc = 0;

#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
		elog(LOG,
			 "SYNC_SCAN: start \"%s\" (size %u) at %u",
			 RelationGetRelationName(rel), relnblocks, startloc);
#endif

	return startloc;
}

/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
	{
		if ((location % 1024) == 0)
			elog(LOG,
				 "SYNC_SCAN: scanning \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	}
#endif
	/*
	 * To reduce lock contention, only report scan progress every N pages.
	 * For the same reason, don't block if the lock isn't immediately
	 * available.  Missing a few updates isn't critical; it just means that
	 * a new scan that wants to join the pack will start a little bit behind
	 * the head of the scan.  Hopefully the pages are still in the OS cache
	 * and the scan catches up quickly.
	 */
	if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
	{
		if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
		{
			(void) ss_search(rel->rd_node, location, true);
			LWLockRelease(SyncScanLock);
		}
#ifdef TRACE_SYNCSCAN
		else if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: missed update for \"%s\" at %u",
				 RelationGetRelationName(rel), location);
#endif
	}
}