1 : /*-------------------------------------------------------------------------
2 : *
3 : * twophase.c
4 : * Two-phase commit support functions.
5 : *
6 : * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.38 2007/11/15 21:14:32 momjian Exp $
11 : *
12 : * NOTES
13 : * Each global transaction is associated with a global transaction
14 : * identifier (GID). The client assigns a GID to a postgres
15 : * transaction with the PREPARE TRANSACTION command.
16 : *
17 : * We keep all active global transactions in a shared memory array.
18 : * When the PREPARE TRANSACTION command is issued, the GID is
19 : * reserved for the transaction in the array. This is done before
20 : * a WAL entry is made, because the reservation checks for duplicate
21 : * GIDs and aborts the transaction if there already is a global
22 : * transaction in prepared state with the same GID.
23 : *
24 : * A global transaction (gxact) also has a dummy PGPROC that is entered
25 : * into the ProcArray array; this is what keeps the XID considered
26 : * running by TransactionIdIsInProgress. It is also convenient as a
27 : * PGPROC to hook the gxact's locks to.
28 : *
29 : * In order to survive crashes and shutdowns, all prepared
30 : * transactions must be stored in permanent storage. This includes
31 : * locking information, pending notifications etc. All that state
32 : * information is written to the per-transaction state file in
33 : * the pg_twophase directory.
34 : *
35 : *-------------------------------------------------------------------------
36 : */
37 : #include "postgres.h"
38 :
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
44 :
45 : #include "access/heapam.h"
46 : #include "access/subtrans.h"
47 : #include "access/transam.h"
48 : #include "access/twophase.h"
49 : #include "access/twophase_rmgr.h"
50 : #include "access/xact.h"
51 : #include "catalog/pg_type.h"
52 : #include "funcapi.h"
53 : #include "miscadmin.h"
54 : #include "pgstat.h"
55 : #include "storage/fd.h"
56 : #include "storage/procarray.h"
57 : #include "storage/smgr.h"
58 : #include "utils/builtins.h"
59 :
60 :
/*
 * Subdirectory of PGDATA in which the two-phase commit state files are kept
 */
#define TWOPHASE_DIR "pg_twophase"

/* GUC variable; its value cannot be changed once the server has started */
int			max_prepared_xacts = 5;
68 :
69 : /*
70 : * This struct describes one global transaction that is in prepared state
71 : * or attempting to become prepared.
72 : *
73 : * The first component of the struct is a dummy PGPROC that is inserted
74 : * into the global ProcArray so that the transaction appears to still be
75 : * running and holding locks. It must be first because we cast pointers
76 : * to PGPROC and pointers to GlobalTransactionData back and forth.
77 : *
78 : * The lifecycle of a global transaction is:
79 : *
80 : * 1. After checking that the requested GID is not in use, set up an
81 : * entry in the TwoPhaseState->prepXacts array with the correct XID and GID,
82 : * with locking_xid = my own XID and valid = false.
83 : *
84 : * 2. After successfully completing prepare, set valid = true and enter the
85 : * contained PGPROC into the global ProcArray.
86 : *
87 : * 3. To begin COMMIT PREPARED or ROLLBACK PREPARED, check that the entry
88 : * is valid and its locking_xid is no longer active, then store my current
89 : * XID into locking_xid. This prevents concurrent attempts to commit or
90 : * rollback the same prepared xact.
91 : *
92 : * 4. On completion of COMMIT PREPARED or ROLLBACK PREPARED, remove the entry
93 : * from the ProcArray and the TwoPhaseState->prepXacts array and return it to
94 : * the freelist.
95 : *
96 : * Note that if the preparing transaction fails between steps 1 and 2, the
97 : * entry will remain in prepXacts until recycled. We can detect recyclable
98 : * entries by checking for valid = false and locking_xid no longer active.
99 : *
100 : * typedef struct GlobalTransactionData *GlobalTransaction appears in
101 : * twophase.h
102 : */
103 : #define GIDSIZE 200
104 :
105 : typedef struct GlobalTransactionData
106 : {
107 : PGPROC proc; /* dummy proc */
108 : TimestampTz prepared_at; /* time of preparation */
109 : XLogRecPtr prepare_lsn; /* XLOG offset of prepare record */
110 : Oid owner; /* ID of user that executed the xact */
111 : TransactionId locking_xid; /* top-level XID of backend working on xact */
112 : bool valid; /* TRUE if fully prepared */
113 : char gid[GIDSIZE]; /* The GID assigned to the prepared xact */
114 : } GlobalTransactionData;
115 :
116 : /*
117 : * Two Phase Commit shared state. Access to this struct is protected
118 : * by TwoPhaseStateLock.
119 : */
120 : typedef struct TwoPhaseStateData
121 : {
122 : /* Head of linked list of free GlobalTransactionData structs */
123 : SHMEM_OFFSET freeGXacts;
124 :
125 : /* Number of valid prepXacts entries. */
126 : int numPrepXacts;
127 :
128 : /*
129 : * There are max_prepared_xacts items in this array, but C wants a
130 : * fixed-size array.
131 : */
132 : GlobalTransaction prepXacts[1]; /* VARIABLE LENGTH ARRAY */
133 : } TwoPhaseStateData; /* VARIABLE LENGTH STRUCT */
134 :
135 : static TwoPhaseStateData *TwoPhaseState;
136 :
137 :
138 : static void RecordTransactionCommitPrepared(TransactionId xid,
139 : int nchildren,
140 : TransactionId *children,
141 : int nrels,
142 : RelFileNode *rels);
143 : static void RecordTransactionAbortPrepared(TransactionId xid,
144 : int nchildren,
145 : TransactionId *children,
146 : int nrels,
147 : RelFileNode *rels);
148 : static void ProcessRecords(char *bufptr, TransactionId xid,
149 : const TwoPhaseCallback callbacks[]);
150 :
151 :
152 : /*
153 : * Initialization of shared memory
154 : */
155 : Size
156 : TwoPhaseShmemSize(void)
157 34 : {
158 : Size size;
159 :
160 : /* Need the fixed struct, the array of pointers, and the GTD structs */
161 34 : size = offsetof(TwoPhaseStateData, prepXacts);
162 34 : size = add_size(size, mul_size(max_prepared_xacts,
163 : sizeof(GlobalTransaction)));
164 34 : size = MAXALIGN(size);
165 34 : size = add_size(size, mul_size(max_prepared_xacts,
166 : sizeof(GlobalTransactionData)));
167 :
168 34 : return size;
169 : }
170 :
171 : void
172 : TwoPhaseShmemInit(void)
173 16 : {
174 : bool found;
175 :
176 16 : TwoPhaseState = ShmemInitStruct("Prepared Transaction Table",
177 : TwoPhaseShmemSize(),
178 : &found);
179 16 : if (!IsUnderPostmaster)
180 : {
181 : GlobalTransaction gxacts;
182 : int i;
183 :
184 : Assert(!found);
185 16 : TwoPhaseState->freeGXacts = INVALID_OFFSET;
186 16 : TwoPhaseState->numPrepXacts = 0;
187 :
188 : /*
189 : * Initialize the linked list of free GlobalTransactionData structs
190 : */
191 16 : gxacts = (GlobalTransaction)
192 : ((char *) TwoPhaseState +
193 : MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) +
194 : sizeof(GlobalTransaction) * max_prepared_xacts));
195 96 : for (i = 0; i < max_prepared_xacts; i++)
196 : {
197 80 : gxacts[i].proc.links.next = TwoPhaseState->freeGXacts;
198 80 : TwoPhaseState->freeGXacts = MAKE_OFFSET(&gxacts[i]);
199 : }
200 : }
201 : else
202 : Assert(found);
203 16 : }
204 :
205 :
206 : /*
207 : * MarkAsPreparing
208 : * Reserve the GID for the given transaction.
209 : *
210 : * Internally, this creates a gxact struct and puts it into the active array.
211 : * NOTE: this is also used when reloading a gxact after a crash; so avoid
212 : * assuming that we can use very much backend context.
213 : */
214 : GlobalTransaction
215 : MarkAsPreparing(TransactionId xid, const char *gid,
216 : TimestampTz prepared_at, Oid owner, Oid databaseid)
217 6 : {
218 : GlobalTransaction gxact;
219 : int i;
220 :
221 6 : if (strlen(gid) >= GIDSIZE)
222 0 : ereport(ERROR,
223 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
224 : errmsg("transaction identifier \"%s\" is too long",
225 : gid)));
226 :
227 6 : LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
228 :
229 : /*
230 : * First, find and recycle any gxacts that failed during prepare. We do
231 : * this partly to ensure we don't mistakenly say their GIDs are still
232 : * reserved, and partly so we don't fail on out-of-slots unnecessarily.
233 : */
234 8 : for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
235 : {
236 2 : gxact = TwoPhaseState->prepXacts[i];
237 2 : if (!gxact->valid && !TransactionIdIsActive(gxact->locking_xid))
238 : {
239 : /* It's dead Jim ... remove from the active array */
240 0 : TwoPhaseState->numPrepXacts--;
241 0 : TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts];
242 : /* and put it back in the freelist */
243 0 : gxact->proc.links.next = TwoPhaseState->freeGXacts;
244 0 : TwoPhaseState->freeGXacts = MAKE_OFFSET(gxact);
245 : /* Back up index count too, so we don't miss scanning one */
246 0 : i--;
247 : }
248 : }
249 :
250 : /* Check for conflicting GID */
251 7 : for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
252 : {
253 2 : gxact = TwoPhaseState->prepXacts[i];
254 2 : if (strcmp(gxact->gid, gid) == 0)
255 : {
256 1 : ereport(ERROR,
257 : (errcode(ERRCODE_DUPLICATE_OBJECT),
258 : errmsg("transaction identifier \"%s\" is already in use",
259 : gid)));
260 : }
261 : }
262 :
263 : /* Get a free gxact from the freelist */
264 5 : if (TwoPhaseState->freeGXacts == INVALID_OFFSET)
265 0 : ereport(ERROR,
266 : (errcode(ERRCODE_OUT_OF_MEMORY),
267 : errmsg("maximum number of prepared transactions reached"),
268 : errhint("Increase max_prepared_transactions (currently %d).",
269 : max_prepared_xacts)));
270 5 : gxact = (GlobalTransaction) MAKE_PTR(TwoPhaseState->freeGXacts);
271 5 : TwoPhaseState->freeGXacts = gxact->proc.links.next;
272 :
273 : /* Initialize it */
274 5 : MemSet(&gxact->proc, 0, sizeof(PGPROC));
275 5 : SHMQueueElemInit(&(gxact->proc.links));
276 5 : gxact->proc.waitStatus = STATUS_OK;
277 : /* We set up the gxact's VXID as InvalidBackendId/XID */
278 5 : gxact->proc.lxid = (LocalTransactionId) xid;
279 5 : gxact->proc.xid = xid;
280 5 : gxact->proc.xmin = InvalidTransactionId;
281 5 : gxact->proc.pid = 0;
282 5 : gxact->proc.backendId = InvalidBackendId;
283 5 : gxact->proc.databaseId = databaseid;
284 5 : gxact->proc.roleId = owner;
285 5 : gxact->proc.inCommit = false;
286 5 : gxact->proc.vacuumFlags = 0;
287 5 : gxact->proc.lwWaiting = false;
288 5 : gxact->proc.lwExclusive = false;
289 5 : gxact->proc.lwWaitLink = NULL;
290 5 : gxact->proc.waitLock = NULL;
291 5 : gxact->proc.waitProcLock = NULL;
292 85 : for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
293 80 : SHMQueueInit(&(gxact->proc.myProcLocks[i]));
294 : /* subxid data must be filled later by GXactLoadSubxactData */
295 5 : gxact->proc.subxids.overflowed = false;
296 5 : gxact->proc.subxids.nxids = 0;
297 :
298 5 : gxact->prepared_at = prepared_at;
299 : /* initialize LSN to 0 (start of WAL) */
300 5 : gxact->prepare_lsn.xlogid = 0;
301 5 : gxact->prepare_lsn.xrecoff = 0;
302 5 : gxact->owner = owner;
303 5 : gxact->locking_xid = xid;
304 5 : gxact->valid = false;
305 5 : strcpy(gxact->gid, gid);
306 :
307 : /* And insert it into the active array */
308 : Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts);
309 5 : TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact;
310 :
311 5 : LWLockRelease(TwoPhaseStateLock);
312 :
313 5 : return gxact;
314 : }
315 :
316 : /*
317 : * GXactLoadSubxactData
318 : *
319 : * If the transaction being persisted had any subtransactions, this must
320 : * be called before MarkAsPrepared() to load information into the dummy
321 : * PGPROC.
322 : */
323 : static void
324 : GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts,
325 : TransactionId *children)
326 1 : {
327 : /* We need no extra lock since the GXACT isn't valid yet */
328 1 : if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS)
329 : {
330 0 : gxact->proc.subxids.overflowed = true;
331 0 : nsubxacts = PGPROC_MAX_CACHED_SUBXIDS;
332 : }
333 1 : if (nsubxacts > 0)
334 : {
335 1 : memcpy(gxact->proc.subxids.xids, children,
336 : nsubxacts * sizeof(TransactionId));
337 1 : gxact->proc.subxids.nxids = nsubxacts;
338 : }
339 1 : }
340 :
341 : /*
342 : * MarkAsPrepared
343 : * Mark the GXACT as fully valid, and enter it into the global ProcArray.
344 : */
345 : static void
346 : MarkAsPrepared(GlobalTransaction gxact)
347 5 : {
348 : /* Lock here may be overkill, but I'm not convinced of that ... */
349 5 : LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
350 : Assert(!gxact->valid);
351 5 : gxact->valid = true;
352 5 : LWLockRelease(TwoPhaseStateLock);
353 :
354 : /*
355 : * Put it into the global ProcArray so TransactionIdIsInProgress considers
356 : * the XID as still running.
357 : */
358 5 : ProcArrayAdd(&gxact->proc);
359 5 : }
360 :
361 : /*
362 : * LockGXact
363 : * Locate the prepared transaction and mark it busy for COMMIT or PREPARE.
364 : */
365 : static GlobalTransaction
366 : LockGXact(const char *gid, Oid user)
367 5 : {
368 : int i;
369 :
370 5 : LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
371 :
372 5 : for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
373 : {
374 5 : GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
375 :
376 : /* Ignore not-yet-valid GIDs */
377 5 : if (!gxact->valid)
378 0 : continue;
379 5 : if (strcmp(gxact->gid, gid) != 0)
380 0 : continue;
381 :
382 : /* Found it, but has someone else got it locked? */
383 5 : if (TransactionIdIsValid(gxact->locking_xid))
384 : {
385 5 : if (TransactionIdIsActive(gxact->locking_xid))
386 0 : ereport(ERROR,
387 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
388 : errmsg("prepared transaction with identifier \"%s\" is busy",
389 : gid)));
390 5 : gxact->locking_xid = InvalidTransactionId;
391 : }
392 :
393 5 : if (user != gxact->owner && !superuser_arg(user))
394 0 : ereport(ERROR,
395 : (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
396 : errmsg("permission denied to finish prepared transaction"),
397 : errhint("Must be superuser or the user that prepared the transaction.")));
398 :
399 : /*
400 : * Note: it probably would be possible to allow committing from
401 : * another database; but at the moment NOTIFY is known not to work and
402 : * there may be some other issues as well. Hence disallow until
403 : * someone gets motivated to make it work.
404 : */
405 5 : if (MyDatabaseId != gxact->proc.databaseId)
406 0 : ereport(ERROR,
407 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
408 : errmsg("prepared transaction belongs to another database"),
409 : errhint("Connect to the database where the transaction was prepared to finish it.")));
410 :
411 : /* OK for me to lock it */
412 5 : gxact->locking_xid = GetTopTransactionId();
413 :
414 5 : LWLockRelease(TwoPhaseStateLock);
415 :
416 5 : return gxact;
417 : }
418 :
419 0 : LWLockRelease(TwoPhaseStateLock);
420 :
421 0 : ereport(ERROR,
422 : (errcode(ERRCODE_UNDEFINED_OBJECT),
423 : errmsg("prepared transaction with identifier \"%s\" does not exist",
424 : gid)));
425 :
426 : /* NOTREACHED */
427 0 : return NULL;
428 : }
429 :
430 : /*
431 : * RemoveGXact
432 : * Remove the prepared transaction from the shared memory array.
433 : *
434 : * NB: caller should have already removed it from ProcArray
435 : */
436 : static void
437 : RemoveGXact(GlobalTransaction gxact)
438 5 : {
439 : int i;
440 :
441 5 : LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
442 :
443 5 : for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
444 : {
445 5 : if (gxact == TwoPhaseState->prepXacts[i])
446 : {
447 : /* remove from the active array */
448 5 : TwoPhaseState->numPrepXacts--;
449 5 : TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts];
450 :
451 : /* and put it back in the freelist */
452 5 : gxact->proc.links.next = TwoPhaseState->freeGXacts;
453 5 : TwoPhaseState->freeGXacts = MAKE_OFFSET(gxact);
454 :
455 5 : LWLockRelease(TwoPhaseStateLock);
456 :
457 5 : return;
458 : }
459 : }
460 :
461 0 : LWLockRelease(TwoPhaseStateLock);
462 :
463 0 : elog(ERROR, "failed to find %p in GlobalTransaction array", gxact);
464 : }
465 :
466 : /*
467 : * TransactionIdIsPrepared
468 : * True iff transaction associated with the identifier is prepared
469 : * for two-phase commit
470 : *
471 : * Note: only gxacts marked "valid" are considered; but notice we do not
472 : * check the locking status.
473 : *
474 : * This is not currently exported, because it is only needed internally.
475 : */
476 : static bool
477 : TransactionIdIsPrepared(TransactionId xid)
478 0 : {
479 0 : bool result = false;
480 : int i;
481 :
482 0 : LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
483 :
484 0 : for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
485 : {
486 0 : GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
487 :
488 0 : if (gxact->valid && gxact->proc.xid == xid)
489 : {
490 0 : result = true;
491 0 : break;
492 : }
493 : }
494 :
495 0 : LWLockRelease(TwoPhaseStateLock);
496 :
497 0 : return result;
498 : }
499 :
500 : /*
501 : * Returns an array of all prepared transactions for the user-level
502 : * function pg_prepared_xact.
503 : *
504 : * The returned array and all its elements are copies of internal data
505 : * structures, to minimize the time we need to hold the TwoPhaseStateLock.
506 : *
507 : * WARNING -- we return even those transactions that are not fully prepared
508 : * yet. The caller should filter them out if he doesn't want them.
509 : *
510 : * The returned array is palloc'd.
511 : */
512 : static int
513 : GetPreparedTransactionList(GlobalTransaction *gxacts)
514 7 : {
515 : GlobalTransaction array;
516 : int num;
517 : int i;
518 :
519 7 : LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
520 :
521 7 : if (TwoPhaseState->numPrepXacts == 0)
522 : {
523 2 : LWLockRelease(TwoPhaseStateLock);
524 :
525 2 : *gxacts = NULL;
526 2 : return 0;
527 : }
528 :
529 5 : num = TwoPhaseState->numPrepXacts;
530 5 : array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num);
531 5 : *gxacts = array;
532 12 : for (i = 0; i < num; i++)
533 7 : memcpy(array + i, TwoPhaseState->prepXacts[i],
534 : sizeof(GlobalTransactionData));
535 :
536 5 : LWLockRelease(TwoPhaseStateLock);
537 :
538 5 : return num;
539 : }
540 :
541 :
542 : /* Working status for pg_prepared_xact */
543 : typedef struct
544 : {
545 : GlobalTransaction array;
546 : int ngxacts;
547 : int currIdx;
548 : } Working_State;
549 :
550 : /*
551 : * pg_prepared_xact
552 : * Produce a view with one row per prepared transaction.
553 : *
554 : * This function is here so we don't have to export the
555 : * GlobalTransactionData struct definition.
556 : */
557 : Datum
558 : pg_prepared_xact(PG_FUNCTION_ARGS)
559 14 : {
560 : FuncCallContext *funcctx;
561 : Working_State *status;
562 :
563 14 : if (SRF_IS_FIRSTCALL())
564 : {
565 : TupleDesc tupdesc;
566 : MemoryContext oldcontext;
567 :
568 : /* create a function context for cross-call persistence */
569 7 : funcctx = SRF_FIRSTCALL_INIT();
570 :
571 : /*
572 : * Switch to memory context appropriate for multiple function calls
573 : */
574 7 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
575 :
576 : /* build tupdesc for result tuples */
577 : /* this had better match pg_prepared_xacts view in system_views.sql */
578 7 : tupdesc = CreateTemplateTupleDesc(5, false);
579 7 : TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction",
580 : XIDOID, -1, 0);
581 7 : TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid",
582 : TEXTOID, -1, 0);
583 7 : TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared",
584 : TIMESTAMPTZOID, -1, 0);
585 7 : TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid",
586 : OIDOID, -1, 0);
587 7 : TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid",
588 : OIDOID, -1, 0);
589 :
590 7 : funcctx->tuple_desc = BlessTupleDesc(tupdesc);
591 :
592 : /*
593 : * Collect all the 2PC status information that we will format and send
594 : * out as a result set.
595 : */
596 7 : status = (Working_State *) palloc(sizeof(Working_State));
597 7 : funcctx->user_fctx = (void *) status;
598 :
599 7 : status->ngxacts = GetPreparedTransactionList(&status->array);
600 7 : status->currIdx = 0;
601 :
602 7 : MemoryContextSwitchTo(oldcontext);
603 : }
604 :
605 14 : funcctx = SRF_PERCALL_SETUP();
606 14 : status = (Working_State *) funcctx->user_fctx;
607 :
608 28 : while (status->array != NULL && status->currIdx < status->ngxacts)
609 : {
610 7 : GlobalTransaction gxact = &status->array[status->currIdx++];
611 : Datum values[5];
612 : bool nulls[5];
613 : HeapTuple tuple;
614 : Datum result;
615 :
616 7 : if (!gxact->valid)
617 0 : continue;
618 :
619 : /*
620 : * Form tuple with appropriate data.
621 : */
622 7 : MemSet(values, 0, sizeof(values));
623 7 : MemSet(nulls, 0, sizeof(nulls));
624 :
625 7 : values[0] = TransactionIdGetDatum(gxact->proc.xid);
626 7 : values[1] = DirectFunctionCall1(textin, CStringGetDatum(gxact->gid));
627 7 : values[2] = TimestampTzGetDatum(gxact->prepared_at);
628 7 : values[3] = ObjectIdGetDatum(gxact->owner);
629 7 : values[4] = ObjectIdGetDatum(gxact->proc.databaseId);
630 :
631 7 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
632 7 : result = HeapTupleGetDatum(tuple);
633 7 : SRF_RETURN_NEXT(funcctx, result);
634 : }
635 :
636 7 : SRF_RETURN_DONE(funcctx);
637 : }
638 :
639 : /*
640 : * TwoPhaseGetDummyProc
641 : * Get the PGPROC that represents a prepared transaction specified by XID
642 : */
643 : PGPROC *
644 : TwoPhaseGetDummyProc(TransactionId xid)
645 22 : {
646 22 : PGPROC *result = NULL;
647 : int i;
648 :
649 : static TransactionId cached_xid = InvalidTransactionId;
650 : static PGPROC *cached_proc = NULL;
651 :
652 : /*
653 : * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called
654 : * repeatedly for the same XID. We can save work with a simple cache.
655 : */
656 22 : if (xid == cached_xid)
657 15 : return cached_proc;
658 :
659 7 : LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
660 :
661 8 : for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
662 : {
663 8 : GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
664 :
665 8 : if (gxact->proc.xid == xid)
666 : {
667 7 : result = &gxact->proc;
668 7 : break;
669 : }
670 : }
671 :
672 7 : LWLockRelease(TwoPhaseStateLock);
673 :
674 7 : if (result == NULL) /* should not happen */
675 0 : elog(ERROR, "failed to find dummy PGPROC for xid %u", xid);
676 :
677 7 : cached_xid = xid;
678 7 : cached_proc = result;
679 :
680 7 : return result;
681 : }
682 :
683 : /************************************************************************/
684 : /* State file support */
685 : /************************************************************************/
686 :
/*
 * Format the state file path for "xid" into "path": TWOPHASE_DIR, a slash,
 * then the XID as eight uppercase hex digits.  At most MAXPGPATH bytes are
 * written.
 */
#define TwoPhaseFilePath(path, xid) \
	snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X", xid)
689 :
690 : /*
691 : * 2PC state file format:
692 : *
693 : * 1. TwoPhaseFileHeader
694 : * 2. TransactionId[] (subtransactions)
695 : * 3. RelFileNode[] (files to be deleted at commit)
696 : * 4. RelFileNode[] (files to be deleted at abort)
697 : * 5. TwoPhaseRecordOnDisk
698 : * 6. ...
699 : * 7. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID)
700 : * 8. CRC32
701 : *
702 : * Each segment except the final CRC32 is MAXALIGN'd.
703 : */
704 :
705 : /*
706 : * Header for a 2PC state file
707 : */
708 : #define TWOPHASE_MAGIC 0x57F94531 /* format identifier */
709 :
710 : typedef struct TwoPhaseFileHeader
711 : {
712 : uint32 magic; /* format identifier */
713 : uint32 total_len; /* actual file length */
714 : TransactionId xid; /* original transaction XID */
715 : Oid database; /* OID of database it was in */
716 : TimestampTz prepared_at; /* time of preparation */
717 : Oid owner; /* user running the transaction */
718 : int32 nsubxacts; /* number of following subxact XIDs */
719 : int32 ncommitrels; /* number of delete-on-commit rels */
720 : int32 nabortrels; /* number of delete-on-abort rels */
721 : char gid[GIDSIZE]; /* GID for transaction */
722 : } TwoPhaseFileHeader;
723 :
724 : /*
725 : * Header for each record in a state file
726 : *
727 : * NOTE: len counts only the rmgr data, not the TwoPhaseRecordOnDisk header.
728 : * The rmgr data will be stored starting on a MAXALIGN boundary.
729 : */
730 : typedef struct TwoPhaseRecordOnDisk
731 : {
732 : uint32 len; /* length of rmgr data */
733 : TwoPhaseRmgrId rmid; /* resource manager for this record */
734 : uint16 info; /* flag bits for use by rmgr */
735 : } TwoPhaseRecordOnDisk;
736 :
737 : /*
738 : * During prepare, the state file is assembled in memory before writing it
739 : * to WAL and the actual state file. We use a chain of XLogRecData blocks
740 : * so that we will be able to pass the state file contents directly to
741 : * XLogInsert.
742 : */
743 : static struct xllist
744 : {
745 : XLogRecData *head; /* first data block in the chain */
746 : XLogRecData *tail; /* last block in chain */
747 : uint32 bytes_free; /* free bytes left in tail block */
748 : uint32 total_len; /* total data bytes in chain */
749 : } records;
750 :
751 :
752 : /*
753 : * Append a block of data to records data structure.
754 : *
755 : * NB: each block is padded to a MAXALIGN multiple. This must be
756 : * accounted for when the file is later read!
757 : *
758 : * The data is copied, so the caller is free to modify it afterwards.
759 : */
760 : static void
761 : save_state_data(const void *data, uint32 len)
762 206 : {
763 206 : uint32 padlen = MAXALIGN(len);
764 :
765 206 : if (padlen > records.bytes_free)
766 : {
767 5 : records.tail->next = palloc0(sizeof(XLogRecData));
768 5 : records.tail = records.tail->next;
769 5 : records.tail->buffer = InvalidBuffer;
770 5 : records.tail->len = 0;
771 5 : records.tail->next = NULL;
772 :
773 5 : records.bytes_free = Max(padlen, 512);
774 5 : records.tail->data = palloc(records.bytes_free);
775 : }
776 :
777 206 : memcpy(((char *) records.tail->data) + records.tail->len, data, len);
778 206 : records.tail->len += padlen;
779 206 : records.bytes_free -= padlen;
780 206 : records.total_len += padlen;
781 206 : }
782 :
783 : /*
784 : * Start preparing a state file.
785 : *
786 : * Initializes data structure and inserts the 2PC file header record.
787 : */
788 : void
789 : StartPrepare(GlobalTransaction gxact)
790 5 : {
791 5 : TransactionId xid = gxact->proc.xid;
792 : TwoPhaseFileHeader hdr;
793 : TransactionId *children;
794 : RelFileNode *commitrels;
795 : RelFileNode *abortrels;
796 :
797 : /* Initialize linked list */
798 5 : records.head = palloc0(sizeof(XLogRecData));
799 5 : records.head->buffer = InvalidBuffer;
800 5 : records.head->len = 0;
801 5 : records.head->next = NULL;
802 :
803 5 : records.bytes_free = Max(sizeof(TwoPhaseFileHeader), 512);
804 5 : records.head->data = palloc(records.bytes_free);
805 :
806 5 : records.tail = records.head;
807 :
808 5 : records.total_len = 0;
809 :
810 : /* Create header */
811 5 : hdr.magic = TWOPHASE_MAGIC;
812 5 : hdr.total_len = 0; /* EndPrepare will fill this in */
813 5 : hdr.xid = xid;
814 5 : hdr.database = gxact->proc.databaseId;
815 5 : hdr.prepared_at = gxact->prepared_at;
816 5 : hdr.owner = gxact->owner;
817 5 : hdr.nsubxacts = xactGetCommittedChildren(&children);
818 5 : hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels, NULL);
819 5 : hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels, NULL);
820 5 : StrNCpy(hdr.gid, gxact->gid, GIDSIZE);
821 :
822 5 : save_state_data(&hdr, sizeof(TwoPhaseFileHeader));
823 :
824 : /* Add the additional info about subxacts and deletable files */
825 5 : if (hdr.nsubxacts > 0)
826 : {
827 1 : save_state_data(children, hdr.nsubxacts * sizeof(TransactionId));
828 : /* While we have the child-xact data, stuff it in the gxact too */
829 1 : GXactLoadSubxactData(gxact, hdr.nsubxacts, children);
830 1 : pfree(children);
831 : }
832 5 : if (hdr.ncommitrels > 0)
833 : {
834 1 : save_state_data(commitrels, hdr.ncommitrels * sizeof(RelFileNode));
835 1 : pfree(commitrels);
836 : }
837 5 : if (hdr.nabortrels > 0)
838 : {
839 2 : save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileNode));
840 2 : pfree(abortrels);
841 : }
842 5 : }
843 :
844 : /*
845 : * Finish preparing state file.
846 : *
847 : * Calculates CRC and writes state file to WAL and in pg_twophase directory.
848 : */
849 : void
850 : EndPrepare(GlobalTransaction gxact)
851 5 : {
852 5 : TransactionId xid = gxact->proc.xid;
853 : TwoPhaseFileHeader *hdr;
854 : char path[MAXPGPATH];
855 : XLogRecData *record;
856 : pg_crc32 statefile_crc;
857 : pg_crc32 bogus_crc;
858 : int fd;
859 :
860 : /* Add the end sentinel to the list of 2PC records */
861 5 : RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
862 : NULL, 0);
863 :
864 : /* Go back and fill in total_len in the file header record */
865 5 : hdr = (TwoPhaseFileHeader *) records.head->data;
866 : Assert(hdr->magic == TWOPHASE_MAGIC);
867 5 : hdr->total_len = records.total_len + sizeof(pg_crc32);
868 :
869 : /*
870 : * Create the 2PC state file.
871 : *
872 : * Note: because we use BasicOpenFile(), we are responsible for ensuring
873 : * the FD gets closed in any error exit path. Once we get into the
874 : * critical section, though, it doesn't matter since any failure causes
875 : * PANIC anyway.
876 : */
877 5 : TwoPhaseFilePath(path, xid);
878 :
879 5 : fd = BasicOpenFile(path,
880 : O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
881 : S_IRUSR | S_IWUSR);
882 5 : if (fd < 0)
883 0 : ereport(ERROR,
884 : (errcode_for_file_access(),
885 : errmsg("could not create two-phase state file \"%s\": %m",
886 : path)));
887 :
888 : /* Write data to file, and calculate CRC as we pass over it */
889 5 : INIT_CRC32(statefile_crc);
890 :
891 15 : for (record = records.head; record != NULL; record = record->next)
892 : {
893 10 : COMP_CRC32(statefile_crc, record->data, record->len);
894 10 : if ((write(fd, record->data, record->len)) != record->len)
895 : {
896 0 : close(fd);
897 0 : ereport(ERROR,
898 : (errcode_for_file_access(),
899 : errmsg("could not write two-phase state file: %m")));
900 : }
901 : }
902 :
903 5 : FIN_CRC32(statefile_crc);
904 :
905 : /*
906 : * Write a deliberately bogus CRC to the state file; this is just paranoia
907 : * to catch the case where four more bytes will run us out of disk space.
908 : */
909 5 : bogus_crc = ~statefile_crc;
910 :
911 5 : if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
912 : {
913 0 : close(fd);
914 0 : ereport(ERROR,
915 : (errcode_for_file_access(),
916 : errmsg("could not write two-phase state file: %m")));
917 : }
918 :
919 : /* Back up to prepare for rewriting the CRC */
920 5 : if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0)
921 : {
922 0 : close(fd);
923 0 : ereport(ERROR,
924 : (errcode_for_file_access(),
925 : errmsg("could not seek in two-phase state file: %m")));
926 : }
927 :
928 : /*
929 : * The state file isn't valid yet, because we haven't written the correct
930 : * CRC yet. Before we do that, insert entry in WAL and flush it to disk.
931 : *
932 : * Between the time we have written the WAL entry and the time we write
933 : * out the correct state file CRC, we have an inconsistency: the xact is
934 : * prepared according to WAL but not according to our on-disk state. We
935 : * use a critical section to force a PANIC if we are unable to complete
936 : * the write --- then, WAL replay should repair the inconsistency. The
937 : * odds of a PANIC actually occurring should be very tiny given that we
938 : * were able to write the bogus CRC above.
939 : *
940 : * We have to set inCommit here, too; otherwise a checkpoint starting
941 : * immediately after the WAL record is inserted could complete without
942 : * fsync'ing our state file. (This is essentially the same kind of race
943 : * condition as the COMMIT-to-clog-write case that RecordTransactionCommit
944 : * uses inCommit for; see notes there.)
945 : *
946 : * We save the PREPARE record's location in the gxact for later use by
947 : * CheckPointTwoPhase.
948 : */
949 5 : START_CRIT_SECTION();
950 :
951 5 : MyProc->inCommit = true;
952 :
953 5 : gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
954 : records.head);
955 5 : XLogFlush(gxact->prepare_lsn);
956 :
957 : /* If we crash now, we have prepared: WAL replay will fix things */
958 :
959 : /* write correct CRC and close file */
960 5 : if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
961 : {
962 0 : close(fd);
963 0 : ereport(ERROR,
964 : (errcode_for_file_access(),
965 : errmsg("could not write two-phase state file: %m")));
966 : }
967 :
968 5 : if (close(fd) != 0)
969 0 : ereport(ERROR,
970 : (errcode_for_file_access(),
971 : errmsg("could not close two-phase state file: %m")));
972 :
973 : /*
974 : * Mark the prepared transaction as valid. As soon as xact.c marks MyProc
975 : * as not running our XID (which it will do immediately after this
976 : * function returns), others can commit/rollback the xact.
977 : *
978 : * NB: a side effect of this is to make a dummy ProcArray entry for the
979 : * prepared XID. This must happen before we clear the XID from MyProc,
980 : * else there is a window where the XID is not running according to
981 : * TransactionIdIsInProgress, and onlookers would be entitled to assume
982 : * the xact crashed. Instead we have a window where the same XID appears
983 : * twice in ProcArray, which is OK.
984 : */
985 5 : MarkAsPrepared(gxact);
986 :
987 : /*
988 : * Now we can mark ourselves as out of the commit critical section: a
989 : * checkpoint starting after this will certainly see the gxact as a
990 : * candidate for fsyncing.
991 : */
992 5 : MyProc->inCommit = false;
993 :
994 5 : END_CRIT_SECTION();
995 :
996 5 : records.tail = records.head = NULL;
997 5 : }
998 :
999 : /*
1000 : * Register a 2PC record to be written to state file.
1001 : */
1002 : void
1003 : RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
1004 : const void *data, uint32 len)
1005 101 : {
1006 : TwoPhaseRecordOnDisk record;
1007 :
1008 101 : record.rmid = rmid;
1009 101 : record.info = info;
1010 101 : record.len = len;
1011 101 : save_state_data(&record, sizeof(TwoPhaseRecordOnDisk));
1012 101 : if (len > 0)
1013 96 : save_state_data(data, len);
1014 101 : }
1015 :
1016 :
1017 : /*
1018 : * Read and validate the state file for xid.
1019 : *
1020 : * If it looks OK (has a valid magic number and CRC), return the palloc'd
1021 : * contents of the file. Otherwise return NULL.
1022 : */
1023 : static char *
1024 : ReadTwoPhaseFile(TransactionId xid)
1025 5 : {
1026 : char path[MAXPGPATH];
1027 : char *buf;
1028 : TwoPhaseFileHeader *hdr;
1029 : int fd;
1030 : struct stat stat;
1031 : uint32 crc_offset;
1032 : pg_crc32 calc_crc,
1033 : file_crc;
1034 :
1035 5 : TwoPhaseFilePath(path, xid);
1036 :
1037 5 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
1038 5 : if (fd < 0)
1039 : {
1040 0 : ereport(WARNING,
1041 : (errcode_for_file_access(),
1042 : errmsg("could not open two-phase state file \"%s\": %m",
1043 : path)));
1044 0 : return NULL;
1045 : }
1046 :
1047 : /*
1048 : * Check file length. We can determine a lower bound pretty easily. We
1049 : * set an upper bound mainly to avoid palloc() failure on a corrupt file.
1050 : */
1051 5 : if (fstat(fd, &stat))
1052 : {
1053 0 : close(fd);
1054 0 : ereport(WARNING,
1055 : (errcode_for_file_access(),
1056 : errmsg("could not stat two-phase state file \"%s\": %m",
1057 : path)));
1058 0 : return NULL;
1059 : }
1060 :
1061 5 : if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
1062 : MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
1063 : sizeof(pg_crc32)) ||
1064 : stat.st_size > 10000000)
1065 : {
1066 0 : close(fd);
1067 0 : return NULL;
1068 : }
1069 :
1070 5 : crc_offset = stat.st_size - sizeof(pg_crc32);
1071 5 : if (crc_offset != MAXALIGN(crc_offset))
1072 : {
1073 0 : close(fd);
1074 0 : return NULL;
1075 : }
1076 :
1077 : /*
1078 : * OK, slurp in the file.
1079 : */
1080 5 : buf = (char *) palloc(stat.st_size);
1081 :
1082 5 : if (read(fd, buf, stat.st_size) != stat.st_size)
1083 : {
1084 0 : close(fd);
1085 0 : ereport(WARNING,
1086 : (errcode_for_file_access(),
1087 : errmsg("could not read two-phase state file \"%s\": %m",
1088 : path)));
1089 0 : pfree(buf);
1090 0 : return NULL;
1091 : }
1092 :
1093 5 : close(fd);
1094 :
1095 5 : hdr = (TwoPhaseFileHeader *) buf;
1096 5 : if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size)
1097 : {
1098 0 : pfree(buf);
1099 0 : return NULL;
1100 : }
1101 :
1102 5 : INIT_CRC32(calc_crc);
1103 5 : COMP_CRC32(calc_crc, buf, crc_offset);
1104 5 : FIN_CRC32(calc_crc);
1105 :
1106 5 : file_crc = *((pg_crc32 *) (buf + crc_offset));
1107 :
1108 5 : if (!EQ_CRC32(calc_crc, file_crc))
1109 : {
1110 0 : pfree(buf);
1111 0 : return NULL;
1112 : }
1113 :
1114 5 : return buf;
1115 : }
1116 :
1117 :
1118 : /*
1119 : * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED
1120 : */
1121 : void
1122 : FinishPreparedTransaction(const char *gid, bool isCommit)
1123 5 : {
1124 : GlobalTransaction gxact;
1125 : TransactionId xid;
1126 : char *buf;
1127 : char *bufptr;
1128 : TwoPhaseFileHeader *hdr;
1129 : TransactionId latestXid;
1130 : TransactionId *children;
1131 : RelFileNode *commitrels;
1132 : RelFileNode *abortrels;
1133 : int i;
1134 :
1135 : /*
1136 : * Validate the GID, and lock the GXACT to ensure that two backends do not
1137 : * try to commit the same GID at once.
1138 : */
1139 5 : gxact = LockGXact(gid, GetUserId());
1140 5 : xid = gxact->proc.xid;
1141 :
1142 : /*
1143 : * Read and validate the state file
1144 : */
1145 5 : buf = ReadTwoPhaseFile(xid);
1146 5 : if (buf == NULL)
1147 0 : ereport(ERROR,
1148 : (errcode(ERRCODE_DATA_CORRUPTED),
1149 : errmsg("two-phase state file for transaction %u is corrupt",
1150 : xid)));
1151 :
1152 : /*
1153 : * Disassemble the header area
1154 : */
1155 5 : hdr = (TwoPhaseFileHeader *) buf;
1156 : Assert(TransactionIdEquals(hdr->xid, xid));
1157 5 : bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
1158 5 : children = (TransactionId *) bufptr;
1159 5 : bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
1160 5 : commitrels = (RelFileNode *) bufptr;
1161 5 : bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));
1162 5 : abortrels = (RelFileNode *) bufptr;
1163 5 : bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));
1164 :
1165 : /* compute latestXid among all children */
1166 5 : latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children);
1167 :
1168 : /*
1169 : * The order of operations here is critical: make the XLOG entry for
1170 : * commit or abort, then mark the transaction committed or aborted in
1171 : * pg_clog, then remove its PGPROC from the global ProcArray (which means
1172 : * TransactionIdIsInProgress will stop saying the prepared xact is in
1173 : * progress), then run the post-commit or post-abort callbacks. The
1174 : * callbacks will release the locks the transaction held.
1175 : */
1176 5 : if (isCommit)
1177 3 : RecordTransactionCommitPrepared(xid,
1178 : hdr->nsubxacts, children,
1179 : hdr->ncommitrels, commitrels);
1180 : else
1181 2 : RecordTransactionAbortPrepared(xid,
1182 : hdr->nsubxacts, children,
1183 : hdr->nabortrels, abortrels);
1184 :
1185 5 : ProcArrayRemove(&gxact->proc, latestXid);
1186 :
1187 : /*
1188 : * In case we fail while running the callbacks, mark the gxact invalid so
1189 : * no one else will try to commit/rollback, and so it can be recycled
1190 : * properly later. It is still locked by our XID so it won't go away yet.
1191 : *
1192 : * (We assume it's safe to do this without taking TwoPhaseStateLock.)
1193 : */
1194 5 : gxact->valid = false;
1195 :
1196 : /*
1197 : * We have to remove any files that were supposed to be dropped. For
1198 : * consistency with the regular xact.c code paths, must do this before
1199 : * releasing locks, so do it before running the callbacks.
1200 : *
1201 : * NB: this code knows that we couldn't be dropping any temp rels ...
1202 : */
1203 5 : if (isCommit)
1204 : {
1205 4 : for (i = 0; i < hdr->ncommitrels; i++)
1206 1 : smgrdounlink(smgropen(commitrels[i]), false, false);
1207 : }
1208 : else
1209 : {
1210 2 : for (i = 0; i < hdr->nabortrels; i++)
1211 0 : smgrdounlink(smgropen(abortrels[i]), false, false);
1212 : }
1213 :
1214 : /* And now do the callbacks */
1215 5 : if (isCommit)
1216 3 : ProcessRecords(bufptr, xid, twophase_postcommit_callbacks);
1217 : else
1218 2 : ProcessRecords(bufptr, xid, twophase_postabort_callbacks);
1219 :
1220 : /* Count the prepared xact as committed or aborted */
1221 5 : AtEOXact_PgStat(isCommit);
1222 :
1223 : /*
1224 : * And now we can clean up our mess.
1225 : */
1226 5 : RemoveTwoPhaseFile(xid, true);
1227 :
1228 5 : RemoveGXact(gxact);
1229 :
1230 5 : pfree(buf);
1231 5 : }
1232 :
1233 : /*
1234 : * Scan a 2PC state file (already read into memory by ReadTwoPhaseFile)
1235 : * and call the indicated callbacks for each 2PC record.
1236 : */
1237 : static void
1238 : ProcessRecords(char *bufptr, TransactionId xid,
1239 : const TwoPhaseCallback callbacks[])
1240 101 : {
1241 : for (;;)
1242 : {
1243 101 : TwoPhaseRecordOnDisk *record = (TwoPhaseRecordOnDisk *) bufptr;
1244 :
1245 : Assert(record->rmid <= TWOPHASE_RM_MAX_ID);
1246 101 : if (record->rmid == TWOPHASE_RM_END_ID)
1247 5 : break;
1248 :
1249 96 : bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk));
1250 :
1251 96 : if (callbacks[record->rmid] != NULL)
1252 96 : callbacks[record->rmid] (xid, record->info,
1253 : (void *) bufptr, record->len);
1254 :
1255 96 : bufptr += MAXALIGN(record->len);
1256 96 : }
1257 5 : }
1258 :
1259 : /*
1260 : * Remove the 2PC file for the specified XID.
1261 : *
1262 : * If giveWarning is false, do not complain about file-not-present;
1263 : * this is an expected case during WAL replay.
1264 : */
1265 : void
1266 : RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
1267 5 : {
1268 : char path[MAXPGPATH];
1269 :
1270 5 : TwoPhaseFilePath(path, xid);
1271 5 : if (unlink(path))
1272 0 : if (errno != ENOENT || giveWarning)
1273 0 : ereport(WARNING,
1274 : (errcode_for_file_access(),
1275 : errmsg("could not remove two-phase state file \"%s\": %m",
1276 : path)));
1277 5 : }
1278 :
1279 : /*
1280 : * Recreates a state file. This is used in WAL replay.
1281 : *
1282 : * Note: content and len don't include CRC.
1283 : */
1284 : void
1285 : RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
1286 0 : {
1287 : char path[MAXPGPATH];
1288 : pg_crc32 statefile_crc;
1289 : int fd;
1290 :
1291 : /* Recompute CRC */
1292 0 : INIT_CRC32(statefile_crc);
1293 0 : COMP_CRC32(statefile_crc, content, len);
1294 0 : FIN_CRC32(statefile_crc);
1295 :
1296 0 : TwoPhaseFilePath(path, xid);
1297 :
1298 0 : fd = BasicOpenFile(path,
1299 : O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY,
1300 : S_IRUSR | S_IWUSR);
1301 0 : if (fd < 0)
1302 0 : ereport(ERROR,
1303 : (errcode_for_file_access(),
1304 : errmsg("could not recreate two-phase state file \"%s\": %m",
1305 : path)));
1306 :
1307 : /* Write content and CRC */
1308 0 : if (write(fd, content, len) != len)
1309 : {
1310 0 : close(fd);
1311 0 : ereport(ERROR,
1312 : (errcode_for_file_access(),
1313 : errmsg("could not write two-phase state file: %m")));
1314 : }
1315 0 : if (write(fd, &statefile_crc, sizeof(pg_crc32)) != sizeof(pg_crc32))
1316 : {
1317 0 : close(fd);
1318 0 : ereport(ERROR,
1319 : (errcode_for_file_access(),
1320 : errmsg("could not write two-phase state file: %m")));
1321 : }
1322 :
1323 : /*
1324 : * We must fsync the file because the end-of-replay checkpoint will not do
1325 : * so, there being no GXACT in shared memory yet to tell it to.
1326 : */
1327 0 : if (pg_fsync(fd) != 0)
1328 : {
1329 0 : close(fd);
1330 0 : ereport(ERROR,
1331 : (errcode_for_file_access(),
1332 : errmsg("could not fsync two-phase state file: %m")));
1333 : }
1334 :
1335 0 : if (close(fd) != 0)
1336 0 : ereport(ERROR,
1337 : (errcode_for_file_access(),
1338 : errmsg("could not close two-phase state file: %m")));
1339 0 : }
1340 :
1341 : /*
1342 : * CheckPointTwoPhase -- handle 2PC component of checkpointing.
1343 : *
1344 : * We must fsync the state file of any GXACT that is valid and has a PREPARE
1345 : * LSN <= the checkpoint's redo horizon. (If the gxact isn't valid yet or
1346 : * has a later LSN, this checkpoint is not responsible for fsyncing it.)
1347 : *
1348 : * This is deliberately run as late as possible in the checkpoint sequence,
1349 : * because GXACTs ordinarily have short lifespans, and so it is quite
1350 : * possible that GXACTs that were valid at checkpoint start will no longer
1351 : * exist if we wait a little bit.
1352 : *
1353 : * If a GXACT remains valid across multiple checkpoints, it'll be fsynced
1354 : * each time. This is considered unusual enough that we don't bother to
1355 : * expend any extra code to avoid the redundant fsyncs. (They should be
1356 : * reasonably cheap anyway, since they won't cause I/O.)
1357 : */
1358 : void
1359 : CheckPointTwoPhase(XLogRecPtr redo_horizon)
1360 19 : {
1361 : TransactionId *xids;
1362 : int nxids;
1363 : char path[MAXPGPATH];
1364 : int i;
1365 :
1366 : /*
1367 : * We don't want to hold the TwoPhaseStateLock while doing I/O, so we grab
1368 : * it just long enough to make a list of the XIDs that require fsyncing,
1369 : * and then do the I/O afterwards.
1370 : *
1371 : * This approach creates a race condition: someone else could delete a
1372 : * GXACT between the time we release TwoPhaseStateLock and the time we try
1373 : * to open its state file. We handle this by special-casing ENOENT
1374 : * failures: if we see that, we verify that the GXACT is no longer valid,
1375 : * and if so ignore the failure.
1376 : */
1377 19 : if (max_prepared_xacts <= 0)
1378 0 : return; /* nothing to do */
1379 19 : xids = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId));
1380 19 : nxids = 0;
1381 :
1382 19 : LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
1383 :
1384 19 : for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
1385 : {
1386 0 : GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
1387 :
1388 0 : if (gxact->valid &&
1389 : XLByteLE(gxact->prepare_lsn, redo_horizon))
1390 0 : xids[nxids++] = gxact->proc.xid;
1391 : }
1392 :
1393 19 : LWLockRelease(TwoPhaseStateLock);
1394 :
1395 19 : for (i = 0; i < nxids; i++)
1396 : {
1397 0 : TransactionId xid = xids[i];
1398 : int fd;
1399 :
1400 0 : TwoPhaseFilePath(path, xid);
1401 :
1402 0 : fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
1403 0 : if (fd < 0)
1404 : {
1405 0 : if (errno == ENOENT)
1406 : {
1407 : /* OK if gxact is no longer valid */
1408 0 : if (!TransactionIdIsPrepared(xid))
1409 0 : continue;
1410 : /* Restore errno in case it was changed */
1411 0 : errno = ENOENT;
1412 : }
1413 0 : ereport(ERROR,
1414 : (errcode_for_file_access(),
1415 : errmsg("could not open two-phase state file \"%s\": %m",
1416 : path)));
1417 : }
1418 :
1419 0 : if (pg_fsync(fd) != 0)
1420 : {
1421 0 : close(fd);
1422 0 : ereport(ERROR,
1423 : (errcode_for_file_access(),
1424 : errmsg("could not fsync two-phase state file \"%s\": %m",
1425 : path)));
1426 : }
1427 :
1428 0 : if (close(fd) != 0)
1429 0 : ereport(ERROR,
1430 : (errcode_for_file_access(),
1431 : errmsg("could not close two-phase state file \"%s\": %m",
1432 : path)));
1433 : }
1434 :
1435 19 : pfree(xids);
1436 : }
1437 :
1438 : /*
1439 : * PrescanPreparedTransactions
1440 : *
1441 : * Scan the pg_twophase directory and determine the range of valid XIDs
1442 : * present. This is run during database startup, after we have completed
1443 : * reading WAL. ShmemVariableCache->nextXid has been set to one more than
1444 : * the highest XID for which evidence exists in WAL.
1445 : *
1446 : * We throw away any prepared xacts with main XID beyond nextXid --- if any
1447 : * are present, it suggests that the DBA has done a PITR recovery to an
1448 : * earlier point in time without cleaning out pg_twophase. We dare not
1449 : * try to recover such prepared xacts since they likely depend on database
1450 : * state that doesn't exist now.
1451 : *
1452 : * However, we will advance nextXid beyond any subxact XIDs belonging to
1453 : * valid prepared xacts. We need to do this since subxact commit doesn't
1454 : * write a WAL entry, and so there might be no evidence in WAL of those
1455 : * subxact XIDs.
1456 : *
1457 : * Our other responsibility is to determine and return the oldest valid XID
1458 : * among the prepared xacts (if none, return ShmemVariableCache->nextXid).
1459 : * This is needed to synchronize pg_subtrans startup properly.
1460 : */
1461 : TransactionId
1462 : PrescanPreparedTransactions(void)
1463 14 : {
1464 14 : TransactionId origNextXid = ShmemVariableCache->nextXid;
1465 14 : TransactionId result = origNextXid;
1466 : DIR *cldir;
1467 : struct dirent *clde;
1468 :
1469 14 : cldir = AllocateDir(TWOPHASE_DIR);
1470 56 : while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
1471 : {
1472 28 : if (strlen(clde->d_name) == 8 &&
1473 0 : strspn(clde->d_name, "0123456789ABCDEF") == 8)
1474 : {
1475 : TransactionId xid;
1476 : char *buf;
1477 : TwoPhaseFileHeader *hdr;
1478 : TransactionId *subxids;
1479 : int i;
1480 :
1481 0 : xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
1482 :
1483 : /* Reject XID if too new */
1484 0 : if (TransactionIdFollowsOrEquals(xid, origNextXid))
1485 : {
1486 0 : ereport(WARNING,
1487 : (errmsg("removing future two-phase state file \"%s\"",
1488 : clde->d_name)));
1489 0 : RemoveTwoPhaseFile(xid, true);
1490 0 : continue;
1491 : }
1492 :
1493 : /*
1494 : * Note: we can't check if already processed because clog
1495 : * subsystem isn't up yet.
1496 : */
1497 :
1498 : /* Read and validate file */
1499 0 : buf = ReadTwoPhaseFile(xid);
1500 0 : if (buf == NULL)
1501 : {
1502 0 : ereport(WARNING,
1503 : (errmsg("removing corrupt two-phase state file \"%s\"",
1504 : clde->d_name)));
1505 0 : RemoveTwoPhaseFile(xid, true);
1506 0 : continue;
1507 : }
1508 :
1509 : /* Deconstruct header */
1510 0 : hdr = (TwoPhaseFileHeader *) buf;
1511 0 : if (!TransactionIdEquals(hdr->xid, xid))
1512 : {
1513 0 : ereport(WARNING,
1514 : (errmsg("removing corrupt two-phase state file \"%s\"",
1515 : clde->d_name)));
1516 0 : RemoveTwoPhaseFile(xid, true);
1517 0 : pfree(buf);
1518 0 : continue;
1519 : }
1520 :
1521 : /*
1522 : * OK, we think this file is valid. Incorporate xid into the
1523 : * running-minimum result.
1524 : */
1525 0 : if (TransactionIdPrecedes(xid, result))
1526 0 : result = xid;
1527 :
1528 : /*
1529 : * Examine subtransaction XIDs ... they should all follow main
1530 : * XID, and they may force us to advance nextXid.
1531 : */
1532 0 : subxids = (TransactionId *)
1533 : (buf + MAXALIGN(sizeof(TwoPhaseFileHeader)));
1534 0 : for (i = 0; i < hdr->nsubxacts; i++)
1535 : {
1536 0 : TransactionId subxid = subxids[i];
1537 :
1538 : Assert(TransactionIdFollows(subxid, xid));
1539 0 : if (TransactionIdFollowsOrEquals(subxid,
1540 : ShmemVariableCache->nextXid))
1541 : {
1542 0 : ShmemVariableCache->nextXid = subxid;
1543 0 : TransactionIdAdvance(ShmemVariableCache->nextXid);
1544 : }
1545 : }
1546 :
1547 0 : pfree(buf);
1548 : }
1549 : }
1550 14 : FreeDir(cldir);
1551 :
1552 14 : return result;
1553 : }
1554 :
1555 : /*
1556 : * RecoverPreparedTransactions
1557 : *
1558 : * Scan the pg_twophase directory and reload shared-memory state for each
1559 : * prepared transaction (reacquire locks, etc). This is run during database
1560 : * startup.
1561 : */
1562 : void
1563 : RecoverPreparedTransactions(void)
1564 14 : {
1565 : char dir[MAXPGPATH];
1566 : DIR *cldir;
1567 : struct dirent *clde;
1568 :
1569 14 : snprintf(dir, MAXPGPATH, "%s", TWOPHASE_DIR);
1570 :
1571 14 : cldir = AllocateDir(dir);
1572 56 : while ((clde = ReadDir(cldir, dir)) != NULL)
1573 : {
1574 28 : if (strlen(clde->d_name) == 8 &&
1575 0 : strspn(clde->d_name, "0123456789ABCDEF") == 8)
1576 : {
1577 : TransactionId xid;
1578 : char *buf;
1579 : char *bufptr;
1580 : TwoPhaseFileHeader *hdr;
1581 : TransactionId *subxids;
1582 : GlobalTransaction gxact;
1583 : int i;
1584 :
1585 0 : xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
1586 :
1587 : /* Already processed? */
1588 0 : if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
1589 : {
1590 0 : ereport(WARNING,
1591 : (errmsg("removing stale two-phase state file \"%s\"",
1592 : clde->d_name)));
1593 0 : RemoveTwoPhaseFile(xid, true);
1594 0 : continue;
1595 : }
1596 :
1597 : /* Read and validate file */
1598 0 : buf = ReadTwoPhaseFile(xid);
1599 0 : if (buf == NULL)
1600 : {
1601 0 : ereport(WARNING,
1602 : (errmsg("removing corrupt two-phase state file \"%s\"",
1603 : clde->d_name)));
1604 0 : RemoveTwoPhaseFile(xid, true);
1605 0 : continue;
1606 : }
1607 :
1608 0 : ereport(LOG,
1609 : (errmsg("recovering prepared transaction %u", xid)));
1610 :
1611 : /* Deconstruct header */
1612 0 : hdr = (TwoPhaseFileHeader *) buf;
1613 : Assert(TransactionIdEquals(hdr->xid, xid));
1614 0 : bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
1615 0 : subxids = (TransactionId *) bufptr;
1616 0 : bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
1617 0 : bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));
1618 0 : bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));
1619 :
1620 : /*
1621 : * Reconstruct subtrans state for the transaction --- needed
1622 : * because pg_subtrans is not preserved over a restart. Note that
1623 : * we are linking all the subtransactions directly to the
1624 : * top-level XID; there may originally have been a more complex
1625 : * hierarchy, but there's no need to restore that exactly.
1626 : */
1627 0 : for (i = 0; i < hdr->nsubxacts; i++)
1628 0 : SubTransSetParent(subxids[i], xid);
1629 :
1630 : /*
1631 : * Recreate its GXACT and dummy PGPROC
1632 : *
1633 : * Note: since we don't have the PREPARE record's WAL location at
1634 : * hand, we leave prepare_lsn zeroes. This means the GXACT will
1635 : * be fsync'd on every future checkpoint. We assume this
1636 : * situation is infrequent enough that the performance cost is
1637 : * negligible (especially since we know the state file has already
1638 : * been fsynced).
1639 : */
1640 0 : gxact = MarkAsPreparing(xid, hdr->gid,
1641 : hdr->prepared_at,
1642 : hdr->owner, hdr->database);
1643 0 : GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids);
1644 0 : MarkAsPrepared(gxact);
1645 :
1646 : /*
1647 : * Recover other state (notably locks) using resource managers
1648 : */
1649 0 : ProcessRecords(bufptr, xid, twophase_recover_callbacks);
1650 :
1651 0 : pfree(buf);
1652 : }
1653 : }
1654 14 : FreeDir(cldir);
1655 14 : }
1656 :
1657 : /*
1658 : * RecordTransactionCommitPrepared
1659 : *
1660 : * This is basically the same as RecordTransactionCommit: in particular,
1661 : * we must set the inCommit flag to avoid a race condition.
1662 : *
1663 : * We know the transaction made at least one XLOG entry (its PREPARE),
1664 : * so it is never possible to optimize out the commit record.
1665 : */
1666 : static void
1667 : RecordTransactionCommitPrepared(TransactionId xid,
1668 : int nchildren,
1669 : TransactionId *children,
1670 : int nrels,
1671 : RelFileNode *rels)
1672 3 : {
1673 : XLogRecData rdata[3];
1674 3 : int lastrdata = 0;
1675 : xl_xact_commit_prepared xlrec;
1676 : XLogRecPtr recptr;
1677 :
1678 3 : START_CRIT_SECTION();
1679 :
1680 : /* See notes in RecordTransactionCommit */
1681 3 : MyProc->inCommit = true;
1682 :
1683 : /* Emit the XLOG commit record */
1684 3 : xlrec.xid = xid;
1685 3 : xlrec.crec.xact_time = GetCurrentTimestamp();
1686 3 : xlrec.crec.nrels = nrels;
1687 3 : xlrec.crec.nsubxacts = nchildren;
1688 3 : rdata[0].data = (char *) (&xlrec);
1689 3 : rdata[0].len = MinSizeOfXactCommitPrepared;
1690 3 : rdata[0].buffer = InvalidBuffer;
1691 : /* dump rels to delete */
1692 3 : if (nrels > 0)
1693 : {
1694 1 : rdata[0].next = &(rdata[1]);
1695 1 : rdata[1].data = (char *) rels;
1696 1 : rdata[1].len = nrels * sizeof(RelFileNode);
1697 1 : rdata[1].buffer = InvalidBuffer;
1698 1 : lastrdata = 1;
1699 : }
1700 : /* dump committed child Xids */
1701 3 : if (nchildren > 0)
1702 : {
1703 1 : rdata[lastrdata].next = &(rdata[2]);
1704 1 : rdata[2].data = (char *) children;
1705 1 : rdata[2].len = nchildren * sizeof(TransactionId);
1706 1 : rdata[2].buffer = InvalidBuffer;
1707 1 : lastrdata = 2;
1708 : }
1709 3 : rdata[lastrdata].next = NULL;
1710 :
1711 3 : recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_PREPARED, rdata);
1712 :
1713 : /*
1714 : * We don't currently try to sleep before flush here ... nor is there any
1715 : * support for async commit of a prepared xact (the very idea is probably
1716 : * a contradiction)
1717 : */
1718 :
1719 : /* Flush XLOG to disk */
1720 3 : XLogFlush(recptr);
1721 :
1722 : /* Mark the transaction committed in pg_clog */
1723 3 : TransactionIdCommit(xid);
1724 : /* to avoid race conditions, the parent must commit first */
1725 3 : TransactionIdCommitTree(nchildren, children);
1726 :
1727 : /* Checkpoint can proceed now */
1728 3 : MyProc->inCommit = false;
1729 :
1730 3 : END_CRIT_SECTION();
1731 3 : }
1732 :
1733 : /*
1734 : * RecordTransactionAbortPrepared
1735 : *
1736 : * This is basically the same as RecordTransactionAbort.
1737 : *
1738 : * We know the transaction made at least one XLOG entry (its PREPARE),
1739 : * so it is never possible to optimize out the abort record.
1740 : */
1741 : static void
1742 : RecordTransactionAbortPrepared(TransactionId xid,
1743 : int nchildren,
1744 : TransactionId *children,
1745 : int nrels,
1746 : RelFileNode *rels)
1747 2 : {
1748 : XLogRecData rdata[3];
1749 2 : int lastrdata = 0;
1750 : xl_xact_abort_prepared xlrec;
1751 : XLogRecPtr recptr;
1752 :
1753 : /*
1754 : * Catch the scenario where we aborted partway through
1755 : * RecordTransactionCommitPrepared ...
1756 : */
1757 2 : if (TransactionIdDidCommit(xid))
1758 0 : elog(PANIC, "cannot abort transaction %u, it was already committed",
1759 : xid);
1760 :
1761 2 : START_CRIT_SECTION();
1762 :
1763 : /* Emit the XLOG abort record */
1764 2 : xlrec.xid = xid;
1765 2 : xlrec.arec.xact_time = GetCurrentTimestamp();
1766 2 : xlrec.arec.nrels = nrels;
1767 2 : xlrec.arec.nsubxacts = nchildren;
1768 2 : rdata[0].data = (char *) (&xlrec);
1769 2 : rdata[0].len = MinSizeOfXactAbortPrepared;
1770 2 : rdata[0].buffer = InvalidBuffer;
1771 : /* dump rels to delete */
1772 2 : if (nrels > 0)
1773 : {
1774 0 : rdata[0].next = &(rdata[1]);
1775 0 : rdata[1].data = (char *) rels;
1776 0 : rdata[1].len = nrels * sizeof(RelFileNode);
1777 0 : rdata[1].buffer = InvalidBuffer;
1778 0 : lastrdata = 1;
1779 : }
1780 : /* dump committed child Xids */
1781 2 : if (nchildren > 0)
1782 : {
1783 0 : rdata[lastrdata].next = &(rdata[2]);
1784 0 : rdata[2].data = (char *) children;
1785 0 : rdata[2].len = nchildren * sizeof(TransactionId);
1786 0 : rdata[2].buffer = InvalidBuffer;
1787 0 : lastrdata = 2;
1788 : }
1789 2 : rdata[lastrdata].next = NULL;
1790 :
1791 2 : recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT_PREPARED, rdata);
1792 :
1793 : /* Always flush, since we're about to remove the 2PC state file */
1794 2 : XLogFlush(recptr);
1795 :
1796 : /*
1797 : * Mark the transaction aborted in clog. This is not absolutely necessary
1798 : * but we may as well do it while we are here.
1799 : */
1800 2 : TransactionIdAbort(xid);
1801 2 : TransactionIdAbortTree(nchildren, children);
1802 :
1803 2 : END_CRIT_SECTION();
1804 2 : }
|