LTP GCOV extension - code coverage report
Current view: directory - access/transam - xlog.c
Test: unnamed
Date: 2008-07-03 Instrumented lines: 2010
Code covered: 48.3 % Executed lines: 971
Legend: not executed executed

       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * xlog.c
       4                 :  *              PostgreSQL transaction log manager
       5                 :  *
       6                 :  *
       7                 :  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
       8                 :  * Portions Copyright (c) 1994, Regents of the University of California
       9                 :  *
      10                 :  * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.292 2008/01/21 11:17:46 petere Exp $
      11                 :  *
      12                 :  *-------------------------------------------------------------------------
      13                 :  */
      14                 : 
      15                 : #include "postgres.h"
      16                 : 
      17                 : #include <ctype.h>
      18                 : #include <fcntl.h>
      19                 : #include <signal.h>
      20                 : #include <time.h>
      21                 : #include <sys/stat.h>
      22                 : #include <sys/time.h>
      23                 : #include <sys/wait.h>
      24                 : #include <unistd.h>
      25                 : 
      26                 : #include "access/clog.h"
      27                 : #include "access/heapam.h"
      28                 : #include "access/multixact.h"
      29                 : #include "access/subtrans.h"
      30                 : #include "access/transam.h"
      31                 : #include "access/tuptoaster.h"
      32                 : #include "access/twophase.h"
      33                 : #include "access/xact.h"
      34                 : #include "access/xlog_internal.h"
      35                 : #include "access/xlogdefs.h"
      36                 : #include "access/xlogutils.h"
      37                 : #include "catalog/catversion.h"
      38                 : #include "catalog/pg_control.h"
      39                 : #include "catalog/pg_type.h"
      40                 : #include "funcapi.h"
      41                 : #include "miscadmin.h"
      42                 : #include "pgstat.h"
      43                 : #include "postmaster/bgwriter.h"
      44                 : #include "storage/bufpage.h"
      45                 : #include "storage/fd.h"
      46                 : #include "storage/pmsignal.h"
      47                 : #include "storage/procarray.h"
      48                 : #include "storage/smgr.h"
      49                 : #include "storage/spin.h"
      50                 : #include "utils/builtins.h"
      51                 : #include "utils/pg_locale.h"
      52                 : #include "utils/ps_status.h"
      53                 : 
      54                 : 
      55                 : /* File path names (all relative to $PGDATA) */
      56                 : #define BACKUP_LABEL_FILE               "backup_label"
      57                 : #define BACKUP_LABEL_OLD                "backup_label.old"
      58                 : #define RECOVERY_COMMAND_FILE   "recovery.conf"
      59                 : #define RECOVERY_COMMAND_DONE   "recovery.done"
      60                 : 
      61                 : 
      62                 : /* User-settable parameters (the initializers here are the starting values) */
      63                 : int                     CheckPointSegments = 3; /* XLOGfileslop is derived from this */
      64                 : int                     XLOGbuffers = 8;
      65                 : int                     XLogArchiveTimeout = 0;
      66                 : bool            XLogArchiveMode = false;
      67                 : char       *XLogArchiveCommand = NULL;
      68                 : char       *XLOG_sync_method = NULL;
      69                 : const char      XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
      70                 : bool            fullPageWrites = true;
      71                 : bool            log_checkpoints = false;
      72                 : 
      73                 : #ifdef WAL_DEBUG
      74                 : bool            XLOG_DEBUG = false;
      75                 : #endif
      76                 : 
      77                 : /*
      78                 :  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
      79                 :  * When we are done with an old XLOG segment file, we will recycle it as a
      80                 :  * future XLOG segment as long as there aren't already XLOGfileslop future
      81                 :  * segments; else we'll delete it.  This could be made a separate GUC
      82                 :  * variable, but at present I think it's sufficient to hardwire it as
      83                 :  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
      84                 :  * no more than 2*CheckPointSegments log segments, and we want to recycle all
      85                 :  * of them; the +1 allows boundary cases to happen without wasting a
      86                 :  * delete/create-segment cycle.
      87                 :  */
      88                 : #define XLOGfileslop    (2*CheckPointSegments + 1)
      89                 : 
      90                 : 
      91                 : /* these are derived from XLOG_sync_method by assign_xlog_sync_method */
      92                 : int                     sync_method = DEFAULT_SYNC_METHOD;
      93                 : static int      open_sync_bit = DEFAULT_SYNC_FLAGBIT;
      94                 : 
      95                 : #define XLOG_SYNC_BIT  (enableFsync ? open_sync_bit : 0)
      96                 : 
      97                 : 
      98                 : /*
      99                 :  * Statistics for current checkpoint are collected in this global struct.
     100                 :  * Because only the background writer or a stand-alone backend can perform
     101                 :  * checkpoints, this will be unused in normal backends.
     102                 :  */
     103                 : CheckpointStatsData CheckpointStats;
     104                 : 
     105                 : /*
     106                 :  * ThisTimeLineID will be same in all backends --- it identifies current
     107                 :  * WAL timeline for the database system.
     108                 :  */
     109                 : TimeLineID      ThisTimeLineID = 0;
     110                 : 
     111                 : /* Are we doing recovery from XLOG? */
     112                 : bool            InRecovery = false;
     113                 : 
     114                 : /* Are we recovering using offline XLOG archives? */
     115                 : static bool InArchiveRecovery = false;
     116                 : 
     117                 : /* Was the last xlog file restored from archive, or local? */
     118                 : static bool restoredFromArchive = false;
     119                 : 
     120                 : /* options taken from recovery.conf */
     121                 : static char *recoveryRestoreCommand = NULL;
     122                 : static bool recoveryTarget = false;
     123                 : static bool recoveryTargetExact = false;
     124                 : static bool recoveryTargetInclusive = true;
     125                 : static bool recoveryLogRestartpoints = false;
     126                 : static TransactionId recoveryTargetXid;
     127                 : static TimestampTz recoveryTargetTime;
     128                 : static TimestampTz recoveryLastXTime = 0;
     129                 : 
     130                 : /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
     131                 : static TransactionId recoveryStopXid;
     132                 : static TimestampTz recoveryStopTime;
     133                 : static bool recoveryStopAfter;
     134                 : 
     135                 : /*
     136                 :  * During normal operation, the only timeline we care about is ThisTimeLineID.
     137                 :  * During recovery, however, things are more complicated.  To simplify life
     138                 :  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
     139                 :  * scan through the WAL history (that is, it is the line that was active when
     140                 :  * the currently-scanned WAL record was generated).  We also need these
     141                 :  * timeline values:
     142                 :  *
     143                 :  * recoveryTargetTLI: the desired timeline that we want to end in.
     144                 :  *
     145                 :  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
     146                 :  * its known parents, newest first (so recoveryTargetTLI is always the
     147                 :  * first list member).  Only these TLIs are expected to be seen in the WAL
     148                 :  * segments we read, and indeed only these TLIs will be considered as
     149                 :  * candidate WAL files to open at all.
     150                 :  *
     151                 :  * curFileTLI: the TLI appearing in the name of the current input WAL file.
     152                 :  * (This is not necessarily the same as ThisTimeLineID, because we could
     153                 :  * be scanning data that was copied from an ancestor timeline when the current
     154                 :  * file was created.)  During a sequential scan we do not allow this value
     155                 :  * to decrease.
     156                 :  */
     157                 : static TimeLineID recoveryTargetTLI;
     158                 : static List *expectedTLIs;
     159                 : static TimeLineID curFileTLI;
     160                 : 
     161                 : /*
     162                 :  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
     163                 :  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
     164                 :  * end+1 of the last record, and is reset when we end a top-level transaction,
     165                 :  * or start a new one; so it can be used to tell if the current transaction has
     166                 :  * created any XLOG records.
     167                 :  */
     168                 : static XLogRecPtr ProcLastRecPtr = {0, 0};
     169                 : 
     170                 : XLogRecPtr      XactLastRecEnd = {0, 0};
     171                 : 
     172                 : /*
     173                 :  * RedoRecPtr is this backend's local copy of the REDO record pointer
     174                 :  * (which is almost but not quite the same as a pointer to the most recent
     175                 :  * CHECKPOINT record).  We update this from the shared-memory copy,
     176                 :  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
     177                 :  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
     178                 :  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
     179                 :  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
     180                 :  * InitXLOGAccess.
     181                 :  */
     182                 : static XLogRecPtr RedoRecPtr;
     183                 : 
     184                 : /*----------
     185                 :  * Shared-memory data structures for XLOG control
     186                 :  *
     187                 :  * LogwrtRqst indicates a byte position that we need to write and/or fsync
     188                 :  * the log up to (all records before that point must be written or fsynced).
     189                 :  * LogwrtResult indicates the byte positions we have already written/fsynced.
     190                 :  * These structs are identical but are declared separately to indicate their
     191                 :  * slightly different functions.
     192                 :  *
     193                 :  * We do a lot of pushups to minimize the amount of access to lockable
     194                 :  * shared memory values.  There are actually three shared-memory copies of
     195                 :  * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
     196                 :  *              XLogCtl->LogwrtResult is protected by info_lck
     197                 :  *              XLogCtl->Write.LogwrtResult is protected by WALWriteLock
     198                 :  *              XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
     199                 :  * One must hold the associated lock to read or write any of these, but
     200                 :  * of course no lock is needed to read/write the unshared LogwrtResult.
     201                 :  *
     202                 :  * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
     203                 :  * right", since both are updated by a write or flush operation before
     204                 :  * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
     205                 :  * is that it can be examined/modified by code that already holds WALWriteLock
     206                 :  * without needing to grab info_lck as well.
     207                 :  *
     208                 :  * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
     209                 :  * but is updated when convenient.  Again, it exists for the convenience of
     210                 :  * code that is already holding WALInsertLock but not the other locks.
     211                 :  *
     212                 :  * The unshared LogwrtResult may lag behind any or all of these, and again
     213                 :  * is updated when convenient.
     214                 :  *
     215                 :  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
     216                 :  * (protected by info_lck), but we don't need to cache any copies of it.
     217                 :  *
     218                 :  * Note that this all works because the request and result positions can only
     219                 :  * advance forward, never back up, and so we can easily determine which of two
     220                 :  * values is "more up to date".
     221                 :  *
     222                 :  * info_lck is only held long enough to read/update the protected variables,
     223                 :  * so it's a plain spinlock.  The other locks are held longer (potentially
     224                 :  * over I/O operations), so we use LWLocks for them.  These locks are:
     225                 :  *
     226                 :  * WALInsertLock: must be held to insert a record into the WAL buffers.
     227                 :  *
     228                 :  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
     229                 :  * XLogFlush).
     230                 :  *
     231                 :  * ControlFileLock: must be held to read/update control file or create
     232                 :  * new log file.
     233                 :  *
     234                 :  * CheckpointLock: must be held to do a checkpoint (ensures only one
     235                 :  * checkpointer at a time; currently, with all checkpoints done by the
     236                 :  * bgwriter, this is just pro forma).
     237                 :  *
     238                 :  *----------
     239                 :  */
     240                 : 
     241                 : typedef struct XLogwrtRqst      /* byte positions the log needs to be written/fsynced up to */
     242                 : {
     243                 :         XLogRecPtr      Write;                  /* last byte + 1 to write out */
     244                 :         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
     245                 : } XLogwrtRqst;
     246                 : 
     247                 : typedef struct XLogwrtResult    /* byte positions already written/fsynced; same layout as XLogwrtRqst */
     248                 : {
     249                 :         XLogRecPtr      Write;                  /* last byte + 1 written out */
     250                 :         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
     251                 : } XLogwrtResult;
     252                 : 
     253                 : /*
     254                 :  * Shared state data for XLogInsert (stored as XLogCtl->Insert, protected by WALInsertLock).
     255                 :  */
     256                 : typedef struct XLogCtlInsert
     257                 : {
     258                 :         XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult; may lag behind the other shared copies */
     259                 :         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
     260                 :         int                     curridx;                /* current block index in cache */
     261                 :         XLogPageHeader currpage;        /* points to header of block in cache */
     262                 :         char       *currpos;            /* current insertion point in cache; INSERT_FREESPACE measures it from currpage */
     263                 :         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions; backends refresh their local RedoRecPtr from it */
     264                 :         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
     265                 : } XLogCtlInsert;
     266                 : 
     267                 : /*
     268                 :  * Shared state data for XLogWrite/XLogFlush (stored as XLogCtl->Write, protected by WALWriteLock).
     269                 :  */
     270                 : typedef struct XLogCtlWrite
     271                 : {
     272                 :         XLogwrtResult LogwrtResult; /* current value of LogwrtResult; updated before WALWriteLock is released */
     273                 :         int                     curridx;                /* cache index of next block to write */
     274                 :         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
     275                 : } XLogCtlWrite;
     276                 : 
     277                 : /*
     278                 :  * Total shared-memory state for XLOG (accessed through the XLogCtl pointer).
     279                 :  */
     280                 : typedef struct XLogCtlData
     281                 : {
     282                 :         /* Protected by WALInsertLock: */
     283                 :         XLogCtlInsert Insert;
     284                 : 
     285                 :         /* Protected by info_lck: */
     286                 :         XLogwrtRqst LogwrtRqst;
     287                 :         XLogwrtResult LogwrtResult;
     288                 :         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
     289                 :         TransactionId ckptXid;          /* NOTE(review): presumably the nextXID half of the pair above -- confirm */
     290                 :         XLogRecPtr      asyncCommitLSN; /* LSN of newest async commit */
     291                 : 
     292                 :         /* Protected by WALWriteLock: */
     293                 :         XLogCtlWrite Write;
     294                 : 
     295                 :         /*
     296                 :          * These values do not change after startup, although the pointed-to pages
     297                 :          * and xlblocks values certainly do.  Permission to read/write the pages
     298                 :          * and xlblocks values depends on WALInsertLock and WALWriteLock.
     299                 :          */
     300                 :         char       *pages;                      /* buffers for unwritten XLOG pages */
     301                 :         XLogRecPtr *xlblocks;           /* per buffer: position of the page's 1st byte + XLOG_BLCKSZ */
     302                 :         Size            XLogCacheByte;  /* # bytes in xlog buffers */
     303                 :         int                     XLogCacheBlck;  /* highest allocated xlog buffer index (Prev/NextBufIdx wrap here) */
     304                 :         TimeLineID      ThisTimeLineID; /* NOTE(review): presumably a shared copy of the global ThisTimeLineID -- confirm */
     305                 : 
     306                 :         slock_t         info_lck;               /* spinlock for the fields marked "Protected by info_lck" above */
     307                 : } XLogCtlData;
     308                 : 
     309                 : static XLogCtlData *XLogCtl = NULL;
     310                 : 
     311                 : /*
     312                 :  * We maintain an image of pg_control in shared memory.
     313                 :  */
     314                 : static ControlFileData *ControlFile = NULL;
     315                 : 
     316                 : /*
     317                 :  * Macros for managing XLogInsert state.  In most cases, the calling routine
     318                 :  * has a local pointer to XLogCtl->Insert and/or a copy of XLogCtl->Insert.curridx,
     319                 :  * so these are passed as parameters instead of being fetched via XLogCtl.
     320                 :  */
     321                 : 
     322                 : /* Free space remaining in the current xlog page buffer: XLOG_BLCKSZ minus the bytes between currpage and currpos */
     323                 : #define INSERT_FREESPACE(Insert)  \
     324                 :         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
     325                 : 
     326                 : /* Construct XLogRecPtr value for current insertion point: xlblocks[curridx] backed up by INSERT_FREESPACE; reads XLogCtl directly */
     327                 : #define INSERT_RECPTR(recptr,Insert,curridx)  \
     328                 :         ( \
     329                 :           (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
     330                 :           (recptr).xrecoff = \
     331                 :                 XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
     332                 :         )
     333                 : 
     334                 : #define PrevBufIdx(idx)         \
     335                 :                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))   /* previous buffer index; 0 wraps to XLogCacheBlck */
     336                 : 
     337                 : #define NextBufIdx(idx)         \
     338                 :                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))   /* next buffer index; XLogCacheBlck wraps to 0 */
     339                 : 
     340                 : /*
     341                 :  * Private, possibly out-of-date copy of shared LogwrtResult.
     342                 :  * See discussion above.
     343                 :  */
     344                 : static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
     345                 : 
     346                 : /*
     347                 :  * openLogFile is -1 or a kernel FD for an open log file segment.
     348                 :  * When it's open, openLogOff is the current seek offset in the file.
     349                 :  * openLogId/openLogSeg identify the segment.  These variables are only
     350                 :  * used to write the XLOG, and so will normally refer to the active segment.
     351                 :  */
     352                 : static int      openLogFile = -1;
     353                 : static uint32 openLogId = 0;
     354                 : static uint32 openLogSeg = 0;
     355                 : static uint32 openLogOff = 0;
     356                 : 
     357                 : /*
     358                 :  * These variables are used similarly to the ones above, but for reading
     359                 :  * the XLOG.  Note, however, that readOff generally represents the offset
     360                 :  * of the page just read, not the seek position of the FD itself, which
     361                 :  * will be just past that page.
     362                 :  */
     363                 : static int      readFile = -1;
     364                 : static uint32 readId = 0;
     365                 : static uint32 readSeg = 0;
     366                 : static uint32 readOff = 0;
     367                 : 
     368                 : /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
     369                 : static char *readBuf = NULL;
     370                 : 
     371                 : /* Buffer for current ReadRecord result (expandable) */
     372                 : static char *readRecordBuf = NULL;
     373                 : static uint32 readRecordBufSize = 0;
     374                 : 
     375                 : /* State information for XLOG reading */
     376                 : static XLogRecPtr ReadRecPtr;   /* start of last record read */
     377                 : static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
     378                 : static XLogRecord *nextRecord = NULL;
     379                 : static TimeLineID lastPageTLI = 0;
     380                 : 
     381                 : static bool InRedo = false;
     382                 : 
     383                 : 
     384                 : static void XLogArchiveNotify(const char *xlog);
     385                 : static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
     386                 : static bool XLogArchiveCheckDone(const char *xlog);
     387                 : static void XLogArchiveCleanup(const char *xlog);
     388                 : static void readRecoveryCommandFile(void);
     389                 : static void exitArchiveRecovery(TimeLineID endTLI,
     390                 :                                         uint32 endLogId, uint32 endLogSeg);
     391                 : static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
     392                 : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
     393                 : 
     394                 : static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
     395                 :                                 XLogRecPtr *lsn, BkpBlock *bkpb);
     396                 : static bool AdvanceXLInsertBuffer(bool new_segment);
     397                 : static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
     398                 : static int XLogFileInit(uint32 log, uint32 seg,
     399                 :                          bool *use_existent, bool use_lock);
     400                 : static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
     401                 :                                            bool find_free, int *max_advance,
     402                 :                                            bool use_lock);
     403                 : static int      XLogFileOpen(uint32 log, uint32 seg);
     404                 : static int      XLogFileRead(uint32 log, uint32 seg, int emode);
     405                 : static void XLogFileClose(void);
     406                 : static bool RestoreArchivedFile(char *path, const char *xlogfname,
     407                 :                                         const char *recovername, off_t expectedSize);
     408                 : static void PreallocXlogFiles(XLogRecPtr endptr);
     409                 : static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
     410                 : static void CleanupBackupHistory(void);
     411                 : static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
     412                 : static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
     413                 : static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
     414                 : static List *readTimeLineHistory(TimeLineID targetTLI);
     415                 : static bool existsTimeLineHistory(TimeLineID probeTLI);
     416                 : static TimeLineID findNewestTimeLine(TimeLineID startTLI);
     417                 : static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
     418                 :                                          TimeLineID endTLI,
     419                 :                                          uint32 endLogId, uint32 endLogSeg);
     420                 : static void WriteControlFile(void);
     421                 : static void ReadControlFile(void);
     422                 : static char *str_time(pg_time_t tnow);
     423                 : static void issue_xlog_fsync(void);
     424                 : 
     425                 : #ifdef WAL_DEBUG
     426                 : static void xlog_outrec(StringInfo buf, XLogRecord *record);
     427                 : #endif
     428                 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
     429                 :                                   XLogRecPtr *minRecoveryLoc);
     430                 : static void rm_redo_error_callback(void *arg);
     431                 : 
     432                 : 
     433                 : /*
     434                 :  * Insert an XLOG record having the specified RMID and info bytes,
     435                 :  * with the body of the record being the data chunk(s) described by
     436                 :  * the rdata chain (see xlog.h for notes about rdata).
     437                 :  *
     438                 :  * Returns XLOG pointer to end of record (beginning of next record).
     439                 :  * This can be used as LSN for data pages affected by the logged action.
     440                 :  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
     441                 :  * before the data page can be written out.  This implements the basic
     442                 :  * WAL rule "write the log before the data".)
     443                 :  *
     444                 :  * NB: this routine feels free to scribble on the XLogRecData structs,
     445                 :  * though not on the data they reference.  This is OK since the XLogRecData
     446                 :  * structs are always just temporaries in the calling code.
     447                 :  */
     448                 : XLogRecPtr
     449                 : XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
     450          297091 : {
     451          297091 :         XLogCtlInsert *Insert = &XLogCtl->Insert;
     452                 :         XLogRecord *record;
     453                 :         XLogContRecord *contrecord;
     454                 :         XLogRecPtr      RecPtr;
     455                 :         XLogRecPtr      WriteRqst;
     456                 :         uint32          freespace;
     457                 :         int                     curridx;
     458                 :         XLogRecData *rdt;
     459                 :         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
     460                 :         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
     461                 :         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
     462                 :         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
     463                 :         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
     464                 :         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
     465                 :         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
     466                 :         pg_crc32        rdata_crc;
     467                 :         uint32          len,
     468                 :                                 write_len;
     469                 :         unsigned        i;
     470                 :         bool            updrqst;
     471                 :         bool            doPageWrites;
     472          297091 :         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
     473                 : 
     474                 :         /* info's high bits are reserved for use by me */
     475          297091 :         if (info & XLR_INFO_MASK)
     476               0 :                 elog(PANIC, "invalid xlog info mask %02X", info);
     477                 : 
     478                 :         /*
     479                 :          * In bootstrap mode, we don't actually log anything but XLOG resources;
     480                 :          * return a phony record pointer.
     481                 :          */
     482          297091 :         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
     483                 :         {
     484            9623 :                 RecPtr.xlogid = 0;
     485            9623 :                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
     486            9623 :                 return RecPtr;
     487                 :         }
     488                 : 
     489                 :         /*
     490                 :          * Here we scan the rdata chain, determine which buffers must be backed
     491                 :          * up, and compute the CRC values for the data.  Note that the record
     492                 :          * header isn't added into the CRC initially since we don't know the final
     493                 :          * length or info bits quite yet.  Thus, the CRC will represent the CRC of
     494                 :          * the whole record in the order "rdata, then backup blocks, then record
     495                 :          * header".
     496                 :          *
     497                 :          * We may have to loop back to here if a race condition is detected below.
     498                 :          * We could prevent the race by doing all this work while holding the
     499                 :          * insert lock, but it seems better to avoid doing CRC calculations while
     500                 :          * holding the lock.  This means we have to be careful about modifying the
     501                 :          * rdata chain until we know we aren't going to loop back again.  The only
     502                 :          * change we allow ourselves to make earlier is to set rdt->data = NULL in
     503                 :          * chain items we have decided we will have to back up the whole buffer
     504                 :          * for.  This is OK because we will certainly decide the same thing again
     505                 :          * for those items if we do it over; doing it here saves an extra pass
     506                 :          * over the chain later.
     507                 :          */
     508          287468 : begin:;
     509         1149872 :         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
     510                 :         {
     511          862404 :                 dtbuf[i] = InvalidBuffer;
     512          862404 :                 dtbuf_bkp[i] = false;
     513                 :         }
     514                 : 
     515                 :         /*
     516                 :          * Decide if we need to do full-page writes in this XLOG record: true if
     517                 :          * full_page_writes is on or we have a PITR request for it.  Since we
     518                 :          * don't yet have the insert lock, forcePageWrites could change under us,
     519                 :          * but we'll recheck it once we have the lock.
     520                 :          */
     521          287468 :         doPageWrites = fullPageWrites || Insert->forcePageWrites;
     522                 : 
     523          287468 :         INIT_CRC32(rdata_crc);
     524          287468 :         len = 0;
     525          287468 :         for (rdt = rdata;;)
     526                 :         {
     527          765272 :                 if (rdt->buffer == InvalidBuffer)
     528                 :                 {
     529                 :                         /* Simple data, just include it */
     530          298642 :                         len += rdt->len;
     531          298642 :                         COMP_CRC32(rdata_crc, rdt->data, rdt->len);
     532                 :                 }
     533                 :                 else
     534                 :                 {
     535                 :                         /* Find info for buffer */
     536          471367 :                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
     537                 :                         {
     538          471367 :                                 if (rdt->buffer == dtbuf[i])
     539                 :                                 {
     540                 :                                         /* Buffer already referenced by earlier chain item */
     541          188485 :                                         if (dtbuf_bkp[i])
     542             144 :                                                 rdt->data = NULL;
     543          188341 :                                         else if (rdt->data)
     544                 :                                         {
     545          187419 :                                                 len += rdt->len;
     546          187419 :                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
     547                 :                                         }
     548                 :                                         break;
     549                 :                                 }
     550          282882 :                                 if (dtbuf[i] == InvalidBuffer)
     551                 :                                 {
     552                 :                                         /* OK, put it in this slot */
     553          278145 :                                         dtbuf[i] = rdt->buffer;
     554          278145 :                                         if (XLogCheckBuffer(rdt, doPageWrites,
     555                 :                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
     556                 :                                         {
     557             562 :                                                 dtbuf_bkp[i] = true;
     558             562 :                                                 rdt->data = NULL;
     559                 :                                         }
     560          277583 :                                         else if (rdt->data)
     561                 :                                         {
     562          244227 :                                                 len += rdt->len;
     563          244227 :                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
     564                 :                                         }
     565                 :                                         break;
     566                 :                                 }
     567                 :                         }
     568          466630 :                         if (i >= XLR_MAX_BKP_BLOCKS)
     569               0 :                                 elog(PANIC, "can backup at most %d blocks per xlog record",
     570                 :                                          XLR_MAX_BKP_BLOCKS);
     571                 :                 }
     572                 :                 /* Break out of loop when rdt points to last chain item */
     573          765272 :                 if (rdt->next == NULL)
     574          287468 :                         break;
     575          477804 :                 rdt = rdt->next;
     576          477804 :         }
     577                 : 
     578                 :         /*
     579                 :          * Now add the backup block headers and data into the CRC
     580                 :          */
     581         1149872 :         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
     582                 :         {
     583          862404 :                 if (dtbuf_bkp[i])
     584                 :                 {
     585             562 :                         BkpBlock   *bkpb = &(dtbuf_xlg[i]);
     586                 :                         char       *page;
     587                 : 
     588             562 :                         COMP_CRC32(rdata_crc,
     589                 :                                            (char *) bkpb,
     590                 :                                            sizeof(BkpBlock));
     591             562 :                         page = (char *) BufferGetBlock(dtbuf[i]);
     592             562 :                         if (bkpb->hole_length == 0)
     593                 :                         {
     594               2 :                                 COMP_CRC32(rdata_crc,
     595                 :                                                    page,
     596                 :                                                    BLCKSZ);
     597                 :                         }
     598                 :                         else
     599                 :                         {
     600                 :                                 /* must skip the hole */
     601             560 :                                 COMP_CRC32(rdata_crc,
     602                 :                                                    page,
     603                 :                                                    bkpb->hole_offset);
     604             560 :                                 COMP_CRC32(rdata_crc,
     605                 :                                                    page + (bkpb->hole_offset + bkpb->hole_length),
     606                 :                                                    BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
     607                 :                         }
     608                 :                 }
     609                 :         }
     610                 : 
     611                 :         /*
     612                 :          * NOTE: We disallow len == 0 because it provides a useful bit of extra
     613                 :          * error checking in ReadRecord.  This means that all callers of
     614                 :          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
     615                 :          * make an exception for XLOG SWITCH records because we don't want them to
     616                 :          * ever cross a segment boundary.
     617                 :          */
     618          287468 :         if (len == 0 && !isLogSwitch)
     619               0 :                 elog(PANIC, "invalid xlog record length %u", len);
     620                 : 
     621          287468 :         START_CRIT_SECTION();
     622                 : 
     623                 :         /* Now wait to get insert lock */
     624          287468 :         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
     625                 : 
     626                 :         /*
     627                 :          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
     628                 :          * back and recompute everything.  This can only happen just after a
     629                 :          * checkpoint, so it's better to be slow in this case and fast otherwise.
     630                 :          *
     631                 :          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
     632                 :          * affect the contents of the XLOG record, so we'll update our local copy
     633                 :          * but not force a recomputation.
     634                 :          */
     635          287468 :         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
     636                 :         {
     637                 :                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
     638               3 :                 RedoRecPtr = Insert->RedoRecPtr;
     639                 : 
     640               3 :                 if (doPageWrites)
     641                 :                 {
     642              12 :                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
     643                 :                         {
     644               9 :                                 if (dtbuf[i] == InvalidBuffer)
     645               9 :                                         continue;
     646               0 :                                 if (dtbuf_bkp[i] == false &&
     647                 :                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
     648                 :                                 {
     649                 :                                         /*
     650                 :                                          * Oops, this buffer now needs to be backed up, but we
     651                 :                                          * didn't think so above.  Start over.
     652                 :                                          */
     653               0 :                                         LWLockRelease(WALInsertLock);
     654               0 :                                         END_CRIT_SECTION();
     655               0 :                                         goto begin;
     656                 :                                 }
     657                 :                         }
     658                 :                 }
     659                 :         }
     660                 : 
     661                 :         /*
     662                 :          * Also check to see if forcePageWrites was just turned on; if we weren't
     663                 :          * already doing full-page writes then go back and recompute. (If it was
     664                 :          * just turned off, we could recompute the record without full pages, but
     665                 :          * we choose not to bother.)
     666                 :          */
     667          287468 :         if (Insert->forcePageWrites && !doPageWrites)
     668                 :         {
     669                 :                 /* Oops, must redo it with full-page data */
     670               0 :                 LWLockRelease(WALInsertLock);
     671               0 :                 END_CRIT_SECTION();
     672               0 :                 goto begin;
     673                 :         }
     674                 : 
     675                 :         /*
     676                 :          * Make additional rdata chain entries for the backup blocks, so that we
     677                 :          * don't need to special-case them in the write loop.  Note that we have
     678                 :          * now irrevocably changed the input rdata chain.  At the exit of this
     679                 :          * loop, write_len includes the backup block data.
     680                 :          *
     681                 :          * Also set the appropriate info bits to show which buffers were backed
     682                 :          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
     683                 :          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
     684                 :          */
     685          287468 :         write_len = len;
     686         1149872 :         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
     687                 :         {
     688                 :                 BkpBlock   *bkpb;
     689                 :                 char       *page;
     690                 : 
     691          862404 :                 if (!dtbuf_bkp[i])
     692          861842 :                         continue;
     693                 : 
     694             562 :                 info |= XLR_SET_BKP_BLOCK(i);
     695                 : 
     696             562 :                 bkpb = &(dtbuf_xlg[i]);
     697             562 :                 page = (char *) BufferGetBlock(dtbuf[i]);
     698                 : 
     699             562 :                 rdt->next = &(dtbuf_rdt1[i]);
     700             562 :                 rdt = rdt->next;
     701                 : 
     702             562 :                 rdt->data = (char *) bkpb;
     703             562 :                 rdt->len = sizeof(BkpBlock);
     704             562 :                 write_len += sizeof(BkpBlock);
     705                 : 
     706             562 :                 rdt->next = &(dtbuf_rdt2[i]);
     707             562 :                 rdt = rdt->next;
     708                 : 
     709             562 :                 if (bkpb->hole_length == 0)
     710                 :                 {
     711               2 :                         rdt->data = page;
     712               2 :                         rdt->len = BLCKSZ;
     713               2 :                         write_len += BLCKSZ;
     714               2 :                         rdt->next = NULL;
     715                 :                 }
     716                 :                 else
     717                 :                 {
     718                 :                         /* must skip the hole */
     719             560 :                         rdt->data = page;
     720             560 :                         rdt->len = bkpb->hole_offset;
     721             560 :                         write_len += bkpb->hole_offset;
     722                 : 
     723             560 :                         rdt->next = &(dtbuf_rdt3[i]);
     724             560 :                         rdt = rdt->next;
     725                 : 
     726             560 :                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
     727             560 :                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
     728             560 :                         write_len += rdt->len;
     729             560 :                         rdt->next = NULL;
     730                 :                 }
     731                 :         }
     732                 : 
     733                 :         /*
     734                 :          * If we backed up any full blocks and online backup is not in progress,
     735                 :          * mark the backup blocks as removable.  This allows the WAL archiver to
     736                 :          * know whether it is safe to compress archived WAL data by transforming
     737                 :          * full-block records into the non-full-block format.
     738                 :          *
     739                 :          * Note: we could just set the flag whenever !forcePageWrites, but
     740                 :          * defining it like this leaves the info bit free for some potential other
     741                 :          * use in records without any backup blocks.
     742                 :          */
     743          287468 :         if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
     744             558 :                 info |= XLR_BKP_REMOVABLE;
     745                 : 
     746                 :         /*
     747                 :          * If there isn't enough space on the current XLOG page for a record
     748                 :          * header, advance to the next page (leaving the unused space as zeroes).
     749                 :          */
     750          287468 :         updrqst = false;
     751          287468 :         freespace = INSERT_FREESPACE(Insert);
     752          287468 :         if (freespace < SizeOfXLogRecord)
     753                 :         {
     754            1433 :                 updrqst = AdvanceXLInsertBuffer(false);
     755            1433 :                 freespace = INSERT_FREESPACE(Insert);
     756                 :         }
     757                 : 
     758                 :         /* Compute record's XLOG location */
     759          287468 :         curridx = Insert->curridx;
     760          287468 :         INSERT_RECPTR(RecPtr, Insert, curridx);
     761                 : 
     762                 :         /*
     763                 :          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
     764                 :          * segment, we need not insert it (and don't want to because we'd like
     765                 :          * consecutive switch requests to be no-ops).  Instead, make sure
     766                 :          * everything is written and flushed through the end of the prior segment,
     767                 :          * and return the prior segment's end address.
     768                 :          */
     769          287468 :         if (isLogSwitch &&
     770                 :                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
     771                 :         {
     772                 :                 /* We can release insert lock immediately */
     773               0 :                 LWLockRelease(WALInsertLock);
     774                 : 
     775               0 :                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
     776               0 :                 if (RecPtr.xrecoff == 0)
     777                 :                 {
     778                 :                         /* crossing a logid boundary */
     779               0 :                         RecPtr.xlogid -= 1;
     780               0 :                         RecPtr.xrecoff = XLogFileSize;
     781                 :                 }
     782                 : 
     783               0 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
     784               0 :                 LogwrtResult = XLogCtl->Write.LogwrtResult;
     785               0 :                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
     786                 :                 {
     787                 :                         XLogwrtRqst FlushRqst;
     788                 : 
     789               0 :                         FlushRqst.Write = RecPtr;
     790               0 :                         FlushRqst.Flush = RecPtr;
     791               0 :                         XLogWrite(FlushRqst, false, false);
     792                 :                 }
     793               0 :                 LWLockRelease(WALWriteLock);
     794                 : 
     795               0 :                 END_CRIT_SECTION();
     796                 : 
     797               0 :                 return RecPtr;
     798                 :         }
     799                 : 
     800                 :         /* Insert record header */
     801                 : 
     802          287468 :         record = (XLogRecord *) Insert->currpos;
     803          287468 :         record->xl_prev = Insert->PrevRecord;
     804          287468 :         record->xl_xid = GetCurrentTransactionIdIfAny();
     805          287468 :         record->xl_tot_len = SizeOfXLogRecord + write_len;
     806          287468 :         record->xl_len = len;                /* doesn't include backup blocks */
     807          287468 :         record->xl_info = info;
     808          287468 :         record->xl_rmid = rmid;
     809                 : 
     810                 :         /* Now we can finish computing the record's CRC */
     811          287468 :         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
     812                 :                            SizeOfXLogRecord - sizeof(pg_crc32));
     813          287468 :         FIN_CRC32(rdata_crc);
     814          287468 :         record->xl_crc = rdata_crc;
     815                 : 
     816                 : #ifdef WAL_DEBUG
     817                 :         if (XLOG_DEBUG)
     818                 :         {
     819                 :                 StringInfoData buf;
     820                 : 
     821                 :                 initStringInfo(&buf);
     822                 :                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
     823                 :                                                  RecPtr.xlogid, RecPtr.xrecoff);
     824                 :                 xlog_outrec(&buf, record);
     825                 :                 if (rdata->data != NULL)
     826                 :                 {
     827                 :                         appendStringInfo(&buf, " - ");
     828                 :                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
     829                 :                 }
     830                 :                 elog(LOG, "%s", buf.data);
     831                 :                 pfree(buf.data);
     832                 :         }
     833                 : #endif
     834                 : 
     835                 :         /* Remember the start location of this record in the appropriate places */
     836          287468 :         ProcLastRecPtr = RecPtr;
     837          287468 :         Insert->PrevRecord = RecPtr;
     838                 : 
     839          287468 :         Insert->currpos += SizeOfXLogRecord;
     840          287468 :         freespace -= SizeOfXLogRecord;
     841                 : 
     842                 :         /*
     843                 :          * Append the data, including backup blocks if any
     844                 :          */
     845         1309823 :         while (write_len)
     846                 :         {
     847          747409 :                 while (rdata->data == NULL)
     848           12522 :                         rdata = rdata->next;
     849                 : 
     850          734887 :                 if (freespace > 0)
     851                 :                 {
     852          734708 :                         if (rdata->len > freespace)
     853                 :                         {
     854            2736 :                                 memcpy(Insert->currpos, rdata->data, freespace);
     855            2736 :                                 rdata->data += freespace;
     856            2736 :                                 rdata->len -= freespace;
     857            2736 :                                 write_len -= freespace;
     858                 :                         }
     859                 :                         else
     860                 :                         {
     861          731972 :                                 memcpy(Insert->currpos, rdata->data, rdata->len);
     862          731972 :                                 freespace -= rdata->len;
     863          731972 :                                 write_len -= rdata->len;
     864          731972 :                                 Insert->currpos += rdata->len;
     865          731972 :                                 rdata = rdata->next;
     866          731972 :                                 continue;
     867                 :                         }
     868                 :                 }
     869                 : 
     870                 :                 /* Use next buffer */
     871            2915 :                 updrqst = AdvanceXLInsertBuffer(false);
     872            2915 :                 curridx = Insert->curridx;
     873                 :                 /* Insert cont-record header */
     874            2915 :                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
     875            2915 :                 contrecord = (XLogContRecord *) Insert->currpos;
     876            2915 :                 contrecord->xl_rem_len = write_len;
     877            2915 :                 Insert->currpos += SizeOfXLogContRecord;
     878            2915 :                 freespace = INSERT_FREESPACE(Insert);
     879                 :         }
     880                 : 
     881                 :         /* Ensure next record will be properly aligned */
     882          287468 :         Insert->currpos = (char *) Insert->currpage +
     883                 :                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
     884          287468 :         freespace = INSERT_FREESPACE(Insert);
     885                 : 
     886                 :         /*
     887                 :          * The recptr I return is the beginning of the *next* record. This will be
     888                 :          * stored as LSN for changed data pages...
     889                 :          */
     890          287468 :         INSERT_RECPTR(RecPtr, Insert, curridx);
     891                 : 
     892                 :         /*
     893                 :          * If the record is an XLOG_SWITCH, we must now write and flush all the
     894                 :          * existing data, and then forcibly advance to the start of the next
     895                 :          * segment.  It's not good to do this I/O while holding the insert lock,
     896                 :          * but there seems too much risk of confusion if we try to release the
     897                 :          * lock sooner.  Fortunately xlog switch needn't be a high-performance
     898                 :          * operation anyway...
     899                 :          */
     900          287468 :         if (isLogSwitch)
     901                 :         {
     902               0 :                 XLogCtlWrite *Write = &XLogCtl->Write;
     903                 :                 XLogwrtRqst FlushRqst;
     904                 :                 XLogRecPtr      OldSegEnd;
     905                 : 
     906               0 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
     907                 : 
     908                 :                 /*
     909                 :                  * Flush through the end of the page containing XLOG_SWITCH, and
     910                 :                  * perform end-of-segment actions (eg, notifying archiver).
     911                 :                  */
     912               0 :                 WriteRqst = XLogCtl->xlblocks[curridx];
     913               0 :                 FlushRqst.Write = WriteRqst;
     914               0 :                 FlushRqst.Flush = WriteRqst;
     915               0 :                 XLogWrite(FlushRqst, false, true);
     916                 : 
     917                 :                 /* Set up the next buffer as first page of next segment */
     918                 :                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
     919               0 :                 (void) AdvanceXLInsertBuffer(true);
     920                 : 
     921                 :                 /* There should be no unwritten data */
     922               0 :                 curridx = Insert->curridx;
     923                 :                 Assert(curridx == Write->curridx);
     924                 : 
     925                 :                 /* Compute end address of old segment */
     926               0 :                 OldSegEnd = XLogCtl->xlblocks[curridx];
     927               0 :                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
     928               0 :                 if (OldSegEnd.xrecoff == 0)
     929                 :                 {
     930                 :                         /* crossing a logid boundary */
     931               0 :                         OldSegEnd.xlogid -= 1;
     932               0 :                         OldSegEnd.xrecoff = XLogFileSize;
     933                 :                 }
     934                 : 
     935                 :                 /* Make it look like we've written and synced all of old segment */
     936               0 :                 LogwrtResult.Write = OldSegEnd;
     937               0 :                 LogwrtResult.Flush = OldSegEnd;
     938                 : 
     939                 :                 /*
     940                 :                  * Update shared-memory status --- this code should match XLogWrite
     941                 :                  */
     942                 :                 {
     943                 :                         /* use volatile pointer to prevent code rearrangement */
     944               0 :                         volatile XLogCtlData *xlogctl = XLogCtl;
     945                 : 
     946               0 :                         SpinLockAcquire(&xlogctl->info_lck);
     947               0 :                         xlogctl->LogwrtResult = LogwrtResult;
     948               0 :                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
     949               0 :                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
     950               0 :                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
     951               0 :                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
     952               0 :                         SpinLockRelease(&xlogctl->info_lck);
     953                 :                 }
     954                 : 
     955               0 :                 Write->LogwrtResult = LogwrtResult;
     956                 : 
     957               0 :                 LWLockRelease(WALWriteLock);
     958                 : 
     959               0 :                 updrqst = false;                /* done already */
     960                 :         }
     961                 :         else
     962                 :         {
     963                 :                 /* normal case, ie not xlog switch */
     964                 : 
     965                 :                 /* Need to update shared LogwrtRqst if some block was filled up */
     966          287468 :                 if (freespace < SizeOfXLogRecord)
     967                 :                 {
     968                 :                         /* curridx is filled and available for writing out */
     969            1434 :                         updrqst = true;
     970                 :                 }
     971                 :                 else
     972                 :                 {
     973                 :                         /* if updrqst already set, write through end of previous buf */
     974          286034 :                         curridx = PrevBufIdx(curridx);
     975                 :                 }
     976          287468 :                 WriteRqst = XLogCtl->xlblocks[curridx];
     977                 :         }
     978                 : 
     979          287468 :         LWLockRelease(WALInsertLock);
     980                 : 
     981          287468 :         if (updrqst)
     982                 :         {
     983                 :                 /* use volatile pointer to prevent code rearrangement */
     984            3377 :                 volatile XLogCtlData *xlogctl = XLogCtl;
     985                 : 
     986            6754 :                 SpinLockAcquire(&xlogctl->info_lck);
     987                 :                 /* advance global request to include new block(s) */
     988            3377 :                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
     989            2840 :                         xlogctl->LogwrtRqst.Write = WriteRqst;
     990                 :                 /* update local result copy while I have the chance */
     991            3377 :                 LogwrtResult = xlogctl->LogwrtResult;
     992            3377 :                 SpinLockRelease(&xlogctl->info_lck);
     993                 :         }
     994                 : 
     995          287468 :         XactLastRecEnd = RecPtr;
     996                 : 
     997          287468 :         END_CRIT_SECTION();
     998                 : 
     999          287468 :         return RecPtr;
    1000                 : }
    1001                 : 
    1002                 : /*
    1003                 :  * Determine whether the buffer referenced by an XLogRecData item has to
    1004                 :  * be backed up, and if so fill a BkpBlock struct for it.  In any case
    1005                 :  * save the buffer's LSN at *lsn.
                         :  *
                         :  * rdata: item whose ->buffer page is examined; ->buffer_std says whether
                         :  *              the pd_lower..pd_upper gap may be treated as an omittable "hole".
                         :  * doPageWrites: when false, this function never asks for a backup.
                         :  * lsn: output; always set to the page's pd_lsn.
                         :  * bkpb: output; node, block, hole_offset and hole_length are filled in
                         :  *              only on the "true" return path.
                         :  *
                         :  * Returns true when doPageWrites is set and XLByteLE(pd_lsn, RedoRecPtr)
                         :  * holds, i.e. the page needs a backup block; false otherwise.
    1006                 :  */
    1007                 : static bool
    1008                 : XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
    1009                 :                                 XLogRecPtr *lsn, BkpBlock *bkpb)
    1010          278145 : {
    1011                 :         PageHeader      page;
    1012                 : 
    1013          278145 :         page = (PageHeader) BufferGetBlock(rdata->buffer);
    1014                 : 
    1015                 :         /*
    1016                 :          * XXX We assume page LSN is first data on *every* page that can be passed
    1017                 :          * to XLogInsert, whether it otherwise has the standard page layout or
    1018                 :          * not.
    1019                 :          */
    1020          278145 :         *lsn = page->pd_lsn;
    1021                 : 
    1022          278145 :         if (doPageWrites &&
    1023                 :                 XLByteLE(page->pd_lsn, RedoRecPtr))
    1024                 :         {
    1025                 :                 /*
    1026                 :                  * The page needs to be backed up, so set up *bkpb
    1027                 :                  */
    1028             562 :                 bkpb->node = BufferGetFileNode(rdata->buffer);
    1029             562 :                 bkpb->block = BufferGetBlockNumber(rdata->buffer);
    1030                 : 
    1031             562 :                 if (rdata->buffer_std)
    1032                 :                 {
    1033                 :                         /*
                         :                          * Assume we can omit data between pd_lower and pd_upper, but
                         :                          * only if the two offsets pass the sanity check below.
                         :                          */
    1034             562 :                         uint16          lower = page->pd_lower;
    1035             562 :                         uint16          upper = page->pd_upper;
    1036                 : 
    1037            1122 :                         if (lower >= SizeOfPageHeaderData &&
    1038                 :                                 upper > lower &&
    1039                 :                                 upper <= BLCKSZ)
    1040                 :                         {
    1041             560 :                                 bkpb->hole_offset = lower;
    1042             560 :                                 bkpb->hole_length = upper - lower;
    1043                 :                         }
    1044                 :                         else
    1045                 :                         {
    1046                 :                                 /* Offsets look bogus or leave no gap: no "hole" to compress out */
    1047               2 :                                 bkpb->hole_offset = 0;
    1048               2 :                                 bkpb->hole_length = 0;
    1049                 :                         }
    1050                 :                 }
    1051                 :                 else
    1052                 :                 {
    1053                 :                         /* Not a standard page header, don't try to eliminate "hole" */
    1054               0 :                         bkpb->hole_offset = 0;
    1055               0 :                         bkpb->hole_length = 0;
    1056                 :                 }
    1057                 : 
    1058             562 :                 return true;                    /* buffer requires backup */
    1059                 :         }
    1060                 : 
    1061          277583 :         return false;                           /* buffer does not need to be backed up */
    1062                 : }
    1063                 : 
    1064                 : /*
    1065                 :  * XLogArchiveNotify
    1066                 :  *
    1067                 :  * Create an archive notification file
    1068                 :  *
    1069                 :  * The name of the notification file is the message that will be picked up
    1070                 :  * by the archiver, e.g. we write 0000000100000001000000C6.ready
    1071                 :  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
    1072                 :  * then when complete, rename it to 0000000100000001000000C6.done
                         :  *
                         :  * xlog: file name from which the status file path is derived
                         :  *              (StatusFilePath with suffix ".ready").
                         :  *
                         :  * If the status file cannot be created, or FreeFile() reports a failure
                         :  * when closing it, the problem is reported with ereport(LOG, ...) and we
                         :  * return without signaling anyone.  The postmaster is signaled with
                         :  * PMSIGNAL_WAKEN_ARCHIVER only when IsUnderPostmaster is set.
    1073                 :  */
    1074                 : static void
    1075                 : XLogArchiveNotify(const char *xlog)
    1076               0 : {
    1077                 :         char            archiveStatusPath[MAXPGPATH];
    1078                 :         FILE       *fd;
    1079                 : 
    1080                 :         /* insert an otherwise empty file called <XLOG>.ready */
    1081               0 :         StatusFilePath(archiveStatusPath, xlog, ".ready");
    1082               0 :         fd = AllocateFile(archiveStatusPath, "w");
    1083               0 :         if (fd == NULL)
    1084                 :         {
    1085               0 :                 ereport(LOG,
    1086                 :                                 (errcode_for_file_access(),
    1087                 :                                  errmsg("could not create archive status file \"%s\": %m",
    1088                 :                                                 archiveStatusPath)));
    1089                 :                 return;
    1090                 :         }
    1091               0 :         if (FreeFile(fd))
    1092                 :         {
    1093               0 :                 ereport(LOG,
    1094                 :                                 (errcode_for_file_access(),
    1095                 :                                  errmsg("could not write archive status file \"%s\": %m",
    1096                 :                                                 archiveStatusPath)));
    1097                 :                 return;
    1098                 :         }
    1099                 : 
    1100                 :         /* Notify archiver that it's got something to do (skipped unless under postmaster) */
    1101               0 :         if (IsUnderPostmaster)
    1102               0 :                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
    1103                 : }
    1104                 : 
    1105                 : /*
    1106                 :  * Convenience routine to notify using log/seg representation of filename
                         :  *
                         :  * The file name is built by XLogFileName from ThisTimeLineID, log and seg,
                         :  * and then handed to XLogArchiveNotify.
    1107                 :  */
    1108                 : static void
    1109                 : XLogArchiveNotifySeg(uint32 log, uint32 seg)
    1110               0 : {
    1111                 :         char            xlog[MAXFNAMELEN];
    1112                 : 
    1113               0 :         XLogFileName(xlog, ThisTimeLineID, log, seg);
    1114               0 :         XLogArchiveNotify(xlog);
    1115               0 : }
    1116                 : 
    1117                 : /*
    1118                 :  * XLogArchiveCheckDone
    1119                 :  *
    1120                 :  * This is called when we are ready to delete or recycle an old XLOG segment
    1121                 :  * file or backup history file.  If it is okay to delete it then return true.
    1122                 :  * If it is not time to delete it, make sure a .ready file exists, and return
    1123                 :  * false.
    1124                 :  *
    1125                 :  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
    1126                 :  * then return false; else create <XLOG>.ready and return false.
    1127                 :  *
    1128                 :  * The reason we do things this way is so that if the original attempt to
    1129                 :  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
                         :  *
                         :  * When XLogArchivingActive() is false we return true without looking at
                         :  * the status files at all.  Existence of a status file is tested with
                         :  * stat() == 0, so any stat() failure is treated the same as "not there".
    1130                 :  */
    1131                 : static bool
    1132                 : XLogArchiveCheckDone(const char *xlog)
    1133               2 : {
    1134                 :         char            archiveStatusPath[MAXPGPATH];
    1135                 :         struct stat stat_buf;
    1136                 : 
    1137                 :         /* Always deletable if archiving is off */
    1138               2 :         if (!XLogArchivingActive())
    1139               2 :                 return true;
    1140                 : 
    1141                 :         /* First check for .done --- this means archiver is done with it */
    1142               0 :         StatusFilePath(archiveStatusPath, xlog, ".done");
    1143               0 :         if (stat(archiveStatusPath, &stat_buf) == 0)
    1144               0 :                 return true;
    1145                 : 
    1146                 :         /* check for .ready --- this means archiver is still busy with it */
    1147               0 :         StatusFilePath(archiveStatusPath, xlog, ".ready");
    1148               0 :         if (stat(archiveStatusPath, &stat_buf) == 0)
    1149               0 :                 return false;
    1150                 : 
    1151                 :         /* Race condition --- maybe archiver just finished, so recheck */
    1152               0 :         StatusFilePath(archiveStatusPath, xlog, ".done");
    1153               0 :         if (stat(archiveStatusPath, &stat_buf) == 0)
    1154               0 :                 return true;
    1155                 : 
    1156                 :         /* Neither file exists: retry creation of the .ready file */
    1157               0 :         XLogArchiveNotify(xlog);
    1158               0 :         return false;
    1159                 : }
    1160                 : 
    1161                 : /*
    1162                 :  * XLogArchiveCleanup
    1163                 :  *
    1164                 :  * Cleanup archive notification file(s) for a particular xlog segment
                         :  *
                         :  * Both <xlog>.done and <xlog>.ready are unlink()ed.  The unlink() return
                         :  * values are not checked, so nothing is reported if either file is missing
                         :  * or cannot be removed.
    1165                 :  */
    1166                 : static void
    1167                 : XLogArchiveCleanup(const char *xlog)
    1168               2 : {
    1169                 :         char            archiveStatusPath[MAXPGPATH];
    1170                 : 
    1171                 :         /* Remove the .done file */
    1172               2 :         StatusFilePath(archiveStatusPath, xlog, ".done");
    1173               2 :         unlink(archiveStatusPath);
    1174                 :         /* should we complain about failure? (result is currently ignored) */
    1175                 : 
    1176                 :         /* Remove the .ready file if present --- normally it shouldn't be */
    1177               2 :         StatusFilePath(archiveStatusPath, xlog, ".ready");
    1178               2 :         unlink(archiveStatusPath);
    1179                 :         /* should we complain about failure? (result is currently ignored) */
    1180               2 : }
    1181                 : 
    1182                 : /*
    1183                 :  * Advance the Insert state to the next buffer page, writing out the next
    1184                 :  * buffer if it still contains unwritten data.
    1185                 :  *
    1186                 :  * If new_segment is TRUE then we set up the next buffer page as the first
    1187                 :  * page of the next xlog segment file, possibly but not usually the next
    1188                 :  * consecutive file page.
    1189                 :  *
    1190                 :  * The global LogwrtRqst.Write pointer needs to be advanced to include the
    1191                 :  * just-filled page.  If we can do this for free (without an extra lock),
    1192                 :  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
    1193                 :  * request update still needs to be done, FALSE if we did it internally.
    1194                 :  *
    1195                 :  * Must be called with WALInsertLock held.
                         :  *
                         :  * Outline: (1) if the page currently held in the next slot is not yet
                         :  * covered by LogwrtResult.Write, publish the just-finished page in the
                         :  * shared LogwrtRqst.Write under info_lck and, if the old page is still
                         :  * unwritten after rechecking, take WALWriteLock and XLogWrite() it;
                         :  * (2) compute the new page's end pointer, store it in xlblocks[nextidx],
                         :  * make that slot the current insert page, zero it and fill in its header
                         :  * (a long header when the page address is a multiple of XLogSegSize).
    1196                 :  */
    1197                 : static bool
    1198                 : AdvanceXLInsertBuffer(bool new_segment)
    1199            4349 : {
    1200            4349 :         XLogCtlInsert *Insert = &XLogCtl->Insert;
    1201            4349 :         XLogCtlWrite *Write = &XLogCtl->Write;
    1202            4349 :         int                     nextidx = NextBufIdx(Insert->curridx);
    1203            4349 :         bool            update_needed = true;
    1204                 :         XLogRecPtr      OldPageRqstPtr;
    1205                 :         XLogwrtRqst WriteRqst;
    1206                 :         XLogRecPtr      NewPageEndPtr;
    1207                 :         XLogPageHeader NewPage;
    1208                 : 
    1209                 :         /* Use Insert->LogwrtResult copy if it's more fresh */
    1210            4349 :         if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
    1211             112 :                 LogwrtResult = Insert->LogwrtResult;
    1212                 : 
    1213                 :         /*
    1214                 :          * Get ending-offset of the buffer page we need to replace (this may be
    1215                 :          * zero if the buffer hasn't been used yet).  Fall through if it's already
    1216                 :          * written out.
    1217                 :          */
    1218            4349 :         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
    1219            4349 :         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
    1220                 :         {
    1221                 :                 /* nope, got work to do... */
    1222                 :                 XLogRecPtr      FinishedPageRqstPtr;
    1223                 : 
    1224            2399 :                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
    1225                 : 
    1226                 :                 /* Before waiting, get info_lck and update LogwrtResult */
    1227                 :                 {
    1228                 :                         /* use volatile pointer to prevent code rearrangement */
    1229            2399 :                         volatile XLogCtlData *xlogctl = XLogCtl;
    1230                 : 
    1231            4798 :                         SpinLockAcquire(&xlogctl->info_lck);
    1232            2399 :                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
    1233            1503 :                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
    1234            2399 :                         LogwrtResult = xlogctl->LogwrtResult;
    1235            2399 :                         SpinLockRelease(&xlogctl->info_lck);
    1236                 :                 }
    1237                 : 
    1238            2399 :                 update_needed = false;  /* Did the shared-request update */
    1239                 : 
    1240            2489 :                 if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
    1241                 :                 {
    1242                 :                         /* OK, someone wrote it already */
    1243              90 :                         Insert->LogwrtResult = LogwrtResult;
    1244                 :                 }
    1245                 :                 else
    1246                 :                 {
    1247                 :                         /* Must acquire write lock, then recheck using Write's copy */
    1248            2309 :                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    1249            2309 :                         LogwrtResult = Write->LogwrtResult;
    1250            2310 :                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
    1251                 :                         {
    1252                 :                                 /* OK, someone wrote it already */
    1253               1 :                                 LWLockRelease(WALWriteLock);
    1254               1 :                                 Insert->LogwrtResult = LogwrtResult;
    1255                 :                         }
    1256                 :                         else
    1257                 :                         {
    1258                 :                                 /*
    1259                 :                                  * Have to write buffers while holding insert lock. This is
    1260                 :                                  * not good, so only write as much as we absolutely must.
                         :                                  *
                         :                                  * The request covers just the old page (Write =
                         :                                  * OldPageRqstPtr); the Flush part is zeroed, presumably
                         :                                  * meaning no fsync is asked for --- confirm in XLogWrite.
    1261                 :                                  */
    1262            2308 :                                 WriteRqst.Write = OldPageRqstPtr;
    1263            2308 :                                 WriteRqst.Flush.xlogid = 0;
    1264            2308 :                                 WriteRqst.Flush.xrecoff = 0;
    1265            2308 :                                 XLogWrite(WriteRqst, false, false);
    1266            2308 :                                 LWLockRelease(WALWriteLock);
    1267            2308 :                                 Insert->LogwrtResult = LogwrtResult;
    1268                 :                         }
    1269                 :                 }
    1270                 :         }
    1271                 : 
    1272                 :         /*
    1273                 :          * Now the next buffer slot is free and we can set it up to be the next
    1274                 :          * output page.
                         :          *
                         :          * NewPageEndPtr starts out as the end of the current page (which is
                         :          * the start address of the new page) and is turned into the new page's
                         :          * end address below.
    1275                 :          */
    1276            4349 :         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
    1277                 : 
    1278            4349 :         if (new_segment)
    1279                 :         {
    1280                 :                 /*
                         :                  * force it to a segment start point: round xrecoff up to a
                         :                  * multiple of XLogSegSize
                         :                  */
    1281               0 :                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
    1282               0 :                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
    1283                 :         }
    1284                 : 
    1285            4349 :         if (NewPageEndPtr.xrecoff >= XLogFileSize)
    1286                 :         {
    1287                 :                 /* crossing a logid boundary: new page is the first block of the next logid */
    1288               0 :                 NewPageEndPtr.xlogid += 1;
    1289               0 :                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
    1290                 :         }
    1291                 :         else
    1292            4349 :                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
    1293            4349 :         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
    1294            4349 :         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
    1295                 : 
    1296            4349 :         Insert->curridx = nextidx;
    1297            4349 :         Insert->currpage = NewPage;
    1298                 : 
    1299            4349 :         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
    1300                 : 
    1301                 :         /*
    1302                 :          * Be sure to re-zero the buffer so that bytes beyond what we've written
    1303                 :          * will look like zeroes and not valid XLOG records...
    1304                 :          */
    1305            4349 :         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
    1306                 : 
    1307                 :         /*
    1308                 :          * Fill the new page's header
    1309                 :          */
    1310            4349 :         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
    1311                 : 
    1312                 :         /* NewPage->xlp_info = 0; */ /* done by memset */
    1313            4349 :         NewPage   ->xlp_tli = ThisTimeLineID;
    1314            4349 :         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
    1315            4349 :         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
    1316                 : 
    1317                 :         /*
    1318                 :          * If first page of an XLOG segment file, make it a long header.
                         :          * In that case currpos is moved past the long header instead of the
                         :          * short one set above.
    1319                 :          */
    1320            4349 :         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
    1321                 :         {
    1322               2 :                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
    1323                 : 
    1324               2 :                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
    1325               2 :                 NewLongPage->xlp_seg_size = XLogSegSize;
    1326               2 :                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    1327               2 :                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
    1328                 : 
    1329               2 :                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
    1330                 :         }
    1331                 : 
    1332            4349 :         return update_needed;
    1333                 : }
    1334                 : 
    1335                 : /*
    1336                 :  * Check whether we've consumed enough xlog space that a checkpoint is needed.
    1337                 :  *
    1338                 :  * Caller must have just finished filling the open log file (so that
    1339                 :  * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
    1340                 :  * to the open log file and see if that exceeds CheckPointSegments.
    1341                 :  *
    1342                 :  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
                         :  *
                         :  * Returns true if the high-order bits of the log id differ between
                         :  * RedoRecPtr and the open file, or if
                         :  * new_segno >= old_segno + (CheckPointSegments - 1); otherwise false.
    1343                 :  */
    1344                 : static bool
    1345                 : XLogCheckpointNeeded(void)
    1346               2 : {
    1347                 :         /*
    1348                 :          * A straight computation of segment number could overflow 32 bits. Rather
    1349                 :          * than assuming we have working 64-bit arithmetic, we compare the
    1350                 :          * highest-order bits separately, and force a checkpoint immediately when
    1351                 :          * they change.
                         :          *
                         :          * Concretely, the segment number is built from (xlogid % XLogSegSize)
                         :          * and the segment within the file, while the "high bits" are
                         :          * (xlogid / XLogSegSize).
    1352                 :          */
    1353                 :         uint32          old_segno,
    1354                 :                                 new_segno;
    1355                 :         uint32          old_highbits,
    1356                 :                                 new_highbits;
    1357                 : 
    1358               2 :         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
    1359                 :                 (RedoRecPtr.xrecoff / XLogSegSize);
    1360               2 :         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
    1361               2 :         new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
    1362               2 :         new_highbits = openLogId / XLogSegSize;
    1363               2 :         if (new_highbits != old_highbits ||
    1364                 :                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
    1365               0 :                 return true;
    1366               2 :         return false;
    1367                 : }
    1368                 : 
    1369                 : /*
    1370                 :  * Write and/or fsync the log at least as far as WriteRqst indicates.
    1371                 :  *
    1372                 :  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
    1373                 :  * may stop at any convenient boundary (such as a cache or logfile boundary).
    1374                 :  * This option allows us to avoid uselessly issuing multiple writes when a
    1375                 :  * single one would do.
    1376                 :  *
    1377                 :  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
    1378                 :  * perform end-of-segment actions after writing the last page, even if
    1379                 :  * it's not physically the end of its segment.  (NB: this will work properly
    1380                 :  * only if caller specifies WriteRqst == page-end and flexible == false,
    1381                 :  * and there is some data to write.)
    1382                 :  *
    1383                 :  * Must be called with WALWriteLock held.
    1384                 :  */
    1385                 : static void
    1386                 : XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
    1387            6795 : {
    1388            6795 :         XLogCtlWrite *Write = &XLogCtl->Write;
    1389                 :         bool            ispartialpage;
    1390                 :         bool            last_iteration;
    1391                 :         bool            finishing_seg;
    1392                 :         bool            use_existent;
    1393                 :         int                     curridx;
    1394                 :         int                     npages;
    1395                 :         int                     startidx;
    1396                 :         uint32          startoffset;
    1397                 : 
    1398                 :         /* We should always be inside a critical section here */
    1399                 :         Assert(CritSectionCount > 0);
    1400                 : 
    1401                 :         /*
    1402                 :          * Update local LogwrtResult (caller probably did this already, but...)
    1403                 :          */
    1404            6795 :         LogwrtResult = Write->LogwrtResult;
    1405                 : 
    1406                 :         /*
    1407                 :          * Since successive pages in the xlog cache are consecutively allocated,
    1408                 :          * we can usually gather multiple pages together and issue just one
    1409                 :          * write() call.  npages is the number of pages we have determined can be
    1410                 :          * written together; startidx is the cache block index of the first one,
    1411                 :          * and startoffset is the file offset at which it should go. The latter
    1412                 :          * two variables are only valid when npages > 0, but we must initialize
    1413                 :          * all of them to keep the compiler quiet.
    1414                 :          */
    1415            6795 :         npages = 0;
    1416            6795 :         startidx = 0;
    1417            6795 :         startoffset = 0;
    1418                 : 
    1419                 :         /*
    1420                 :          * Within the loop, curridx is the cache block index of the page to
    1421                 :          * consider writing.  We advance Write->curridx only after successfully
    1422                 :          * writing pages.  (Right now, this refinement is useless since we are
    1423                 :          * going to PANIC if any error occurs anyway; but someday it may come in
    1424                 :          * useful.)
    1425                 :          */
    1426            6795 :         curridx = Write->curridx;
    1427                 : 
    1428           17911 :         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
    1429                 :         {
    1430                 :                 /*
    1431                 :                  * Make sure we're not ahead of the insert process.  This could happen
    1432                 :                  * if we're passed a bogus WriteRqst.Write that is past the end of the
    1433                 :                  * last page that's been initialized by AdvanceXLInsertBuffer.
    1434                 :                  */
    1435            8788 :                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
    1436               0 :                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
    1437                 :                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
    1438                 :                                  XLogCtl->xlblocks[curridx].xlogid,
    1439                 :                                  XLogCtl->xlblocks[curridx].xrecoff);
    1440                 : 
    1441                 :                 /* Advance LogwrtResult.Write to end of current buffer page */
    1442            8788 :                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
    1443            8788 :                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
    1444                 : 
    1445            8788 :                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
    1446                 :                 {
    1447                 :                         /*
    1448                 :                          * Switch to new logfile segment.  We cannot have any pending
    1449                 :                          * pages here (since we dump what we have at segment end).
    1450                 :                          */
    1451                 :                         Assert(npages == 0);
    1452              78 :                         if (openLogFile >= 0)
    1453              11 :                                 XLogFileClose();
    1454              78 :                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
    1455                 : 
    1456                 :                         /* create/use new log file */
    1457              78 :                         use_existent = true;
    1458              78 :                         openLogFile = XLogFileInit(openLogId, openLogSeg,
    1459                 :                                                                            &use_existent, true);
    1460              78 :                         openLogOff = 0;
    1461                 :                 }
    1462                 : 
    1463                 :                 /* Make sure we have the current logfile open */
    1464            8788 :                 if (openLogFile < 0)
    1465                 :                 {
    1466              51 :                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
    1467              51 :                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
    1468              51 :                         openLogOff = 0;
    1469                 :                 }
    1470                 : 
    1471                 :                 /* Add current page to the set of pending pages-to-dump */
    1472            8788 :                 if (npages == 0)
    1473                 :                 {
    1474                 :                         /* first of group */
    1475            7031 :                         startidx = curridx;
    1476            7031 :                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
    1477                 :                 }
    1478            8788 :                 npages++;
    1479                 : 
    1480                 :                 /*
    1481                 :                  * Dump the set if this will be the last loop iteration, or if we are
    1482                 :                  * at the last page of the cache area (since the next page won't be
    1483                 :                  * contiguous in memory), or if we are at the end of the logfile
    1484                 :                  * segment.
    1485                 :                  */
    1486            8788 :                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
    1487                 : 
    1488            8788 :                 finishing_seg = !ispartialpage &&
    1489                 :                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
    1490                 : 
    1491            8788 :                 if (last_iteration ||
    1492                 :                         curridx == XLogCtl->XLogCacheBlck ||
    1493                 :                         finishing_seg)
    1494                 :                 {
    1495                 :                         char       *from;
    1496                 :                         Size            nbytes;
    1497                 : 
    1498                 :                         /* Need to seek in the file? */
    1499            7031 :                         if (openLogOff != startoffset)
    1500                 :                         {
    1501            4121 :                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
    1502               0 :                                         ereport(PANIC,
    1503                 :                                                         (errcode_for_file_access(),
    1504                 :                                                          errmsg("could not seek in log file %u, "
    1505                 :                                                                         "segment %u to offset %u: %m",
    1506                 :                                                                         openLogId, openLogSeg, startoffset)));
    1507            4121 :                                 openLogOff = startoffset;
    1508                 :                         }
    1509                 : 
    1510                 :                         /* OK to write the page(s) */
    1511            7031 :                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
    1512            7031 :                         nbytes = npages * (Size) XLOG_BLCKSZ;
    1513            7031 :                         errno = 0;
    1514            7031 :                         if (write(openLogFile, from, nbytes) != nbytes)
    1515                 :                         {
    1516                 :                                 /* if write didn't set errno, assume no disk space */
    1517               0 :                                 if (errno == 0)
    1518               0 :                                         errno = ENOSPC;
    1519               0 :                                 ereport(PANIC,
    1520                 :                                                 (errcode_for_file_access(),
    1521                 :                                                  errmsg("could not write to log file %u, segment %u "
    1522                 :                                                                 "at offset %u, length %lu: %m",
    1523                 :                                                                 openLogId, openLogSeg,
    1524                 :                                                                 openLogOff, (unsigned long) nbytes)));
    1525                 :                         }
    1526                 : 
    1527                 :                         /* Update state for write */
    1528            7031 :                         openLogOff += nbytes;
    1529            7031 :                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
    1530            7031 :                         npages = 0;
    1531                 : 
    1532                 :                         /*
    1533                 :                          * If we just wrote the whole last page of a logfile segment,
    1534                 :                          * fsync the segment immediately.  This avoids having to go back
    1535                 :                          * and re-open prior segments when an fsync request comes along
    1536                 :                          * later. Doing it here ensures that one and only one backend will
    1537                 :                          * perform this fsync.
    1538                 :                          *
    1539                 :                          * We also do this if this is the last page written for an xlog
    1540                 :                          * switch.
    1541                 :                          *
    1542                 :                          * This is also the right place to notify the Archiver that the
    1543                 :                          * segment is ready to copy to archival storage, and to update the
    1544                 :                          * timer for archive_timeout, and to signal for a checkpoint if
    1545                 :                          * too many logfile segments have been used since the last
    1546                 :                          * checkpoint.
    1547                 :                          */
    1548            7031 :                         if (finishing_seg || (xlog_switch && last_iteration))
    1549                 :                         {
    1550               2 :                                 issue_xlog_fsync();
    1551               2 :                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
    1552                 : 
    1553               2 :                                 if (XLogArchivingActive())
    1554               0 :                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
    1555                 : 
    1556               2 :                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
    1557                 : 
    1558                 :                                 /*
    1559                 :                                  * Signal bgwriter to start a checkpoint if we've consumed too
    1560                 :                                  * much xlog since the last one.  For speed, we first check
    1561                 :                                  * using the local copy of RedoRecPtr, which might be out of
    1562                 :                                  * date; if it looks like a checkpoint is needed, forcibly
    1563                 :                                  * update RedoRecPtr and recheck.
    1564                 :                                  */
    1565               2 :                                 if (IsUnderPostmaster &&
    1566                 :                                         XLogCheckpointNeeded())
    1567                 :                                 {
    1568               0 :                                         (void) GetRedoRecPtr();
    1569               0 :                                         if (XLogCheckpointNeeded())
    1570               0 :                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    1571                 :                                 }
    1572                 :                         }
    1573                 :                 }
    1574                 : 
    1575            8788 :                 if (ispartialpage)
    1576                 :                 {
    1577                 :                         /* Only asked to write a partial page */
    1578            4439 :                         LogwrtResult.Write = WriteRqst.Write;
    1579            4439 :                         break;
    1580                 :                 }
    1581            4349 :                 curridx = NextBufIdx(curridx);
    1582                 : 
    1583                 :                 /* If flexible, break out of loop as soon as we wrote something */
    1584            4349 :                 if (flexible && npages == 0)
    1585              28 :                         break;
    1586                 :         }
    1587                 : 
    1588                 :         Assert(npages == 0);
    1589                 :         Assert(curridx == Write->curridx);
    1590                 : 
    1591                 :         /*
    1592                 :          * If asked to flush, do so
    1593                 :          */
    1594            6795 :         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
    1595                 :                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
    1596                 :         {
    1597                 :                 /*
    1598                 :                  * Could get here without iterating above loop, in which case we might
    1599                 :                  * have no open file or the wrong one.  However, we do not need to
    1600                 :                  * fsync more than one file.
    1601                 :                  */
    1602            4487 :                 if (sync_method != SYNC_METHOD_OPEN)
    1603                 :                 {
    1604            4487 :                         if (openLogFile >= 0 &&
    1605                 :                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
    1606               0 :                                 XLogFileClose();
    1607            4487 :                         if (openLogFile < 0)
    1608                 :                         {
    1609               0 :                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
    1610               0 :                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
    1611               0 :                                 openLogOff = 0;
    1612                 :                         }
    1613            4487 :                         issue_xlog_fsync();
    1614                 :                 }
    1615            4487 :                 LogwrtResult.Flush = LogwrtResult.Write;
    1616                 :         }
    1617                 : 
    1618                 :         /*
    1619                 :          * Update shared-memory status
    1620                 :          *
    1621                 :          * We make sure that the shared 'request' values do not fall behind the
    1622                 :          * 'result' values.  This is not absolutely essential, but it saves some
    1623                 :          * code in a couple of places.
    1624                 :          */
    1625                 :         {
    1626                 :                 /* use volatile pointer to prevent code rearrangement */
    1627            6795 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    1628                 : 
    1629           13590 :                 SpinLockAcquire(&xlogctl->info_lck);
    1630            6795 :                 xlogctl->LogwrtResult = LogwrtResult;
    1631            6795 :                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
    1632            4435 :                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
    1633            6795 :                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
    1634            4488 :                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
    1635            6795 :                 SpinLockRelease(&xlogctl->info_lck);
    1636                 :         }
    1637                 : 
    1638            6795 :         Write->LogwrtResult = LogwrtResult;
    1639            6795 : }
    1640                 : 
    1641                 : /*
    1642                 :  * Record the LSN for an asynchronous transaction commit.
    1643                 :  * (This should not be called for aborts, nor for synchronous commits.)
                         :  *
                         :  * asyncCommitLSN: log position of the commit record to remember.
                         :  *
                         :  * The shared XLogCtl->asyncCommitLSN is examined and updated while holding
                         :  * info_lck.  It is overwritten only when the stored value compares lower
                         :  * than the new one, so a call with an older LSN leaves it unchanged.
    1644                 :  */
    1645                 : void
    1646                 : XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
    1647               0 : {
    1648                 :         /* use volatile pointer to prevent code rearrangement */
    1649               0 :         volatile XLogCtlData *xlogctl = XLogCtl;
    1650                 : 
    1651               0 :         SpinLockAcquire(&xlogctl->info_lck);
    1652               0 :         if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))  /* stored value is behind */
    1653               0 :                 xlogctl->asyncCommitLSN = asyncCommitLSN;
    1654               0 :         SpinLockRelease(&xlogctl->info_lck);
    1655               0 : }
    1656                 : 
    1657                 : /*
    1658                 :  * Ensure that all XLOG data through the given position is flushed to disk.
    1659                 :  *
    1660                 :  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
    1661                 :  * already held, and we try to avoid acquiring it if possible.
                         :  *
                         :  * record: the log position that must be flushed before we return.
                         :  *
                         :  * Returns immediately while InRedo is set, or when the local copy of
                         :  * LogwrtResult.Flush already covers the request.  Otherwise the shared
                         :  * state is re-read, first under info_lck and then under WALWriteLock, and
                         :  * XLogWrite is called only if the request is still unsatisfied at that
                         :  * point.  If the flush pointer still falls short of the request afterwards,
                         :  * this is reported at WARNING level when InRecovery is set and at ERROR
                         :  * level otherwise.
    1662                 :  */
    1663                 : void
    1664                 : XLogFlush(XLogRecPtr record)
    1665           12794 : {
    1666                 :         XLogRecPtr      WriteRqstPtr;
    1667                 :         XLogwrtRqst WriteRqst;
    1668                 : 
    1669                 :         /* Disabled during REDO */
    1670           12794 :         if (InRedo)
    1671               0 :                 return;
    1672                 : 
    1673                 :         /* Quick exit if already known flushed */
    1674           12794 :         if (XLByteLE(record, LogwrtResult.Flush))
    1675                 :                 return;
    1676                 : 
    1677                 : #ifdef WAL_DEBUG
    1678                 :         if (XLOG_DEBUG)
    1679                 :                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
    1680                 :                          record.xlogid, record.xrecoff,
    1681                 :                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
    1682                 :                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
    1683                 : #endif
    1684                 : 
    1685            4474 :         START_CRIT_SECTION();
    1686                 : 
    1687                 :         /*
    1688                 :          * Since fsync is usually a horribly expensive operation, we try to
    1689                 :          * piggyback as much data as we can on each fsync: if we see any more data
    1690                 :          * entered into the xlog buffer, we'll write and fsync that too, so that
    1691                 :          * the final value of LogwrtResult.Flush is as large as possible. This
    1692                 :          * gives us some chance of avoiding another fsync immediately after.
    1693                 :          */
    1694                 : 
    1695                 :         /* initialize to given target; may increase below */
    1696            4474 :         WriteRqstPtr = record;
    1697                 : 
    1698                 :         /* read LogwrtResult and update local state */
    1699                 :         {
    1700                 :                 /* use volatile pointer to prevent code rearrangement */
    1701            4474 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    1702                 : 
    1703            8948 :                 SpinLockAcquire(&xlogctl->info_lck);
    1704            4474 :                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
    1705              38 :                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;  /* adopt the later shared write request */
    1706            4474 :                 LogwrtResult = xlogctl->LogwrtResult;
    1707            4474 :                 SpinLockRelease(&xlogctl->info_lck);
    1708                 :         }
    1709                 : 
    1710                 :         /* done already? */
    1711            4474 :         if (!XLByteLE(record, LogwrtResult.Flush))
    1712                 :         {
    1713                 :                 /* now wait for the write lock */
    1714            4461 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    1715            4461 :                 LogwrtResult = XLogCtl->Write.LogwrtResult;  /* refresh again now that the lock is held */
    1716            4461 :                 if (!XLByteLE(record, LogwrtResult.Flush))
    1717                 :                 {
    1718                 :                         /* try to write/flush later additions to XLOG as well */
    1719            4459 :                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
    1720                 :                         {
    1721            4457 :                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
    1722            4457 :                                 uint32          freespace = INSERT_FREESPACE(Insert);
    1723                 : 
    1724            4457 :                                 if (freespace < SizeOfXLogRecord)            /* buffer is full */
    1725              20 :                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];  /* request through the page's end pointer */
    1726                 :                                 else
    1727                 :                                 {
    1728            4437 :                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
    1729            4437 :                                         WriteRqstPtr.xrecoff -= freespace;  /* page end pointer minus the free space left on it */
    1730                 :                                 }
    1731            4457 :                                 LWLockRelease(WALInsertLock);
    1732            4457 :                                 WriteRqst.Write = WriteRqstPtr;
    1733            4457 :                                 WriteRqst.Flush = WriteRqstPtr;
    1734                 :                         }
    1735                 :                         else
    1736                 :                         {
    1737               2 :                                 WriteRqst.Write = WriteRqstPtr;  /* insert lock unavailable: use the target computed above */
    1738               2 :                                 WriteRqst.Flush = record;        /* ... but flush only as far as the caller asked */
    1739                 :                         }
    1740            4459 :                         XLogWrite(WriteRqst, false, false);
    1741                 :                 }
    1742            4461 :                 LWLockRelease(WALWriteLock);
    1743                 :         }
    1744                 : 
    1745            4474 :         END_CRIT_SECTION();
    1746                 : 
    1747                 :         /*
    1748                 :          * If we still haven't flushed to the request point then we have a
    1749                 :          * problem; most likely, the requested flush point is past end of XLOG.
    1750                 :          * This has been seen to occur when a disk page has a corrupted LSN.
    1751                 :          *
    1752                 :          * Formerly we treated this as a PANIC condition, but that hurts the
    1753                 :          * system's robustness rather than helping it: we do not want to take down
    1754                 :          * the whole system due to corruption on one data page.  In particular, if
    1755                 :          * the bad page is encountered again during recovery then we would be
    1756                 :          * unable to restart the database at all!  (This scenario has actually
    1757                 :          * happened in the field several times with 7.1 releases. Note that we
    1758                 :          * cannot get here while InRedo is true, but if the bad page is brought in
    1759                 :          * and marked dirty during recovery then CreateCheckPoint will try to
    1760                 :          * flush it at the end of recovery.)
    1761                 :          *
    1762                 :          * The current approach is to ERROR under normal conditions, but only
    1763                 :          * WARNING during recovery, so that the system can be brought up even if
    1764                 :          * there's a corrupt LSN.  Note that for calls from xact.c, the ERROR will
    1765                 :          * be promoted to PANIC since xact.c calls this routine inside a critical
    1766                 :          * section.  However, calls from bufmgr.c are not within critical sections
    1767                 :          * and so we will not force a restart for a bad LSN on a data page.
    1768                 :          */
    1769            4474 :         if (XLByteLT(LogwrtResult.Flush, record))
    1770               0 :                 elog(InRecovery ? WARNING : ERROR,
    1771                 :                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
    1772                 :                          record.xlogid, record.xrecoff,
    1773                 :                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
    1774                 : }
    1775                 : 
    1776                 : /*
    1777                 :  * Flush xlog, but without specifying exactly where to flush to.
    1778                 :  *
    1779                 :  * We normally flush only completed blocks; but if there is nothing to do on
    1780                 :  * that basis, we check for unflushed async commits in the current incomplete
    1781                 :  * block, and flush through the latest one of those.  Thus, if async commits
    1782                 :  * are not being used, we will flush complete blocks only.  We can guarantee
    1783                 :  * that async commits reach disk after at most three cycles; normally only
    1784                 :  * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
    1785                 :  * at the end of the buffer ring; this makes a difference only with very high
    1786                 :  * load or long wal_writer_delay, but imposes one extra cycle for the worst
    1787                 :  * case for async commits.)
    1788                 :  *
    1789                 :  * This routine is invoked periodically by the background walwriter process.
                         :  *
                         :  * The flush target is chosen from shared state read under info_lck.  The
                         :  * critical section is entered and WALWriteLock is taken only when that
                         :  * target lies beyond the local copy of LogwrtResult.Flush; otherwise the
                         :  * routine returns without touching either.
    1790                 :  */
    1791                 : void
    1792                 : XLogBackgroundFlush(void)
    1793             243 : {
    1794                 :         XLogRecPtr      WriteRqstPtr;
    1795             243 :         bool            flexible = true;  /* passed to XLogWrite; cleared below for async-commit targets */
    1796                 : 
    1797                 :         /* read LogwrtResult and update local state */
    1798                 :         {
    1799                 :                 /* use volatile pointer to prevent code rearrangement */
    1800             243 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    1801                 : 
    1802             243 :                 SpinLockAcquire(&xlogctl->info_lck);
    1803             243 :                 LogwrtResult = xlogctl->LogwrtResult;
    1804             243 :                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
    1805             243 :                 SpinLockRelease(&xlogctl->info_lck);
    1806                 :         }
    1807                 : 
    1808                 :         /* back off to last completed page boundary */
    1809             243 :         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
    1810                 : 
    1811                 :         /* if we have already flushed that far, consider async commit records */
    1812             243 :         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
    1813                 :         {
    1814                 :                 /* use volatile pointer to prevent code rearrangement */
    1815             213 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    1816                 : 
    1817             213 :                 SpinLockAcquire(&xlogctl->info_lck);
    1818             213 :                 WriteRqstPtr = xlogctl->asyncCommitLSN;  /* new target: the recorded async commit LSN */
    1819             213 :                 SpinLockRelease(&xlogctl->info_lck);
    1820             213 :                 flexible = false;               /* ensure it all gets written */
    1821                 :         }
    1822                 : 
    1823                 :         /* Done if already known flushed */
    1824             243 :         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
    1825                 :                 return;
    1826                 : 
    1827                 : #ifdef WAL_DEBUG
    1828                 :         if (XLOG_DEBUG)
    1829                 :                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
    1830                 :                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
    1831                 :                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
    1832                 :                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
    1833                 : #endif
    1834                 : 
    1835              30 :         START_CRIT_SECTION();
    1836                 : 
    1837                 :         /* now wait for the write lock */
    1838              30 :         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    1839              30 :         LogwrtResult = XLogCtl->Write.LogwrtResult;  /* refresh again now that the lock is held */
    1840              30 :         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
    1841                 :         {
    1842                 :                 XLogwrtRqst WriteRqst;
    1843                 : 
    1844              28 :                 WriteRqst.Write = WriteRqstPtr;  /* write and flush through the same target */
    1845              28 :                 WriteRqst.Flush = WriteRqstPtr;
    1846              28 :                 XLogWrite(WriteRqst, flexible, false);
    1847                 :         }
    1848              30 :         LWLockRelease(WALWriteLock);
    1849                 : 
    1850              30 :         END_CRIT_SECTION();
    1851                 : }
    1852                 : 
    1853                 : /*
    1854                 :  * Flush any previous asynchronously-committed transactions' commit records.
    1855                 :  *
    1856                 :  * NOTE: it is unwise to assume that this provides any strong guarantees.
    1857                 :  * In particular, because of the inexact LSN bookkeeping used by clog.c,
    1858                 :  * we cannot assume that hint bits will be settable for these transactions.
                         :  *
                         :  * Reads the shared asyncCommitLSN while holding info_lck, releases the
                         :  * spinlock, and then hands that position to XLogFlush.
    1859                 :  */
    1860                 : void
    1861                 : XLogAsyncCommitFlush(void)
    1862              67 : {
    1863                 :         XLogRecPtr      WriteRqstPtr;
    1864                 : 
    1865                 :         /* use volatile pointer to prevent code rearrangement */
    1866              67 :         volatile XLogCtlData *xlogctl = XLogCtl;
    1867                 : 
    1868              67 :         SpinLockAcquire(&xlogctl->info_lck);
    1869              67 :         WriteRqstPtr = xlogctl->asyncCommitLSN;
    1870              67 :         SpinLockRelease(&xlogctl->info_lck);
    1871                 : 
    1872              67 :         XLogFlush(WriteRqstPtr);  /* called after the spinlock has been released */
    1873              67 : }
    1874                 : 
    1875                 : /*
    1876                 :  * Test whether XLOG data has been flushed up to (at least) the given position.
    1877                 :  *
    1878                 :  * Returns true if a flush is still needed.  (It may be that someone else
    1879                 :  * is already in process of flushing that far, however.)
                         :  *
                         :  * record: the log position to test.
                         :  *
                         :  * The local copy of LogwrtResult is consulted first.  Only when it does not
                         :  * already cover the request is it refreshed from shared memory (under
                         :  * info_lck) and checked a second time, so a call may update the local
                         :  * LogwrtResult as a side effect.  No flush is performed here.
    1880                 :  */
    1881                 : bool
    1882                 : XLogNeedsFlush(XLogRecPtr record)
    1883          226382 : {
    1884                 :         /* Quick exit if already known flushed */
    1885          226382 :         if (XLByteLE(record, LogwrtResult.Flush))
    1886          226377 :                 return false;
    1887                 : 
    1888                 :         /* read LogwrtResult and update local state */
    1889                 :         {
    1890                 :                 /* use volatile pointer to prevent code rearrangement */
    1891               5 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    1892                 : 
    1893               5 :                 SpinLockAcquire(&xlogctl->info_lck);
    1894               5 :                 LogwrtResult = xlogctl->LogwrtResult;
    1895               5 :                 SpinLockRelease(&xlogctl->info_lck);
    1896                 :         }
    1897                 : 
    1898                 :         /* check again */
    1899               5 :         if (XLByteLE(record, LogwrtResult.Flush))
    1900               0 :                 return false;
    1901                 : 
    1902               5 :         return true;  /* shared state still shows the flush pointer short of record */
    1903                 : }
    1904                 : 
    1905                 : /*
    1906                 :  * Create a new XLOG file segment, or open a pre-existing one.
    1907                 :  *
    1908                 :  * log, seg: identify segment to be created/opened.
    1909                 :  *
    1910                 :  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
    1911                 :  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
    1912                 :  * file was used.
    1913                 :  *
    1914                 :  * use_lock: if TRUE, acquire ControlFileLock while moving file into
    1915                 :  * place.  This should be TRUE except during bootstrap log creation.  The
    1916                 :  * caller must *not* hold the lock at call.
    1917                 :  *
    1918                 :  * Returns FD of opened file.
    1919                 :  *
                         :  * Outline:
                         :  *  1. If *use_existent, try to open the segment.  Success returns the
                         :  *     descriptor at once; a failure other than ENOENT is reported as an
                         :  *     error; ENOENT falls through to step 2.
                         :  *  2. Create XLOGDIR/xlogtemp.<pid> (any leftover file of that name is
                         :  *     unlinked first, result ignored), write XLogSegSize bytes of zeroes
                         :  *     in XLOG_BLCKSZ chunks, then fsync and close it.
                         :  *  3. Hand the temp file to InstallXLogFileSegment().  If that returns
                         :  *     false the temp file is unlinked.
                         :  *  4. Set *use_existent = false and open the target path; that
                         :  *     descriptor is the return value.
                         :  *
    1920                 :  * Note: errors here are ERROR not PANIC because we might or might not be
    1921                 :  * inside a critical section (eg, during checkpoint there is no reason to
    1922                 :  * take down the system on failure).  They will promote to PANIC if we are
    1923                 :  * in a critical section.
    1924                 :  */
    1925                 : static int
    1926                 : XLogFileInit(uint32 log, uint32 seg,
    1927                 :                          bool *use_existent, bool use_lock)
    1928              79 : {
    1929                 :         char            path[MAXPGPATH];
    1930                 :         char            tmppath[MAXPGPATH];
    1931                 :         char       *zbuffer;
    1932                 :         uint32          installed_log;
    1933                 :         uint32          installed_seg;
    1934                 :         int                     max_advance;
    1935                 :         int                     fd;
    1936                 :         int                     nbytes;
    1937                 : 
    1938              79 :         XLogFilePath(path, ThisTimeLineID, log, seg);
    1939                 : 
    1940                 :         /*
    1941                 :          * Try to use existent file (checkpoint maker may have created it already)
    1942                 :          */
    1943              79 :         if (*use_existent)
    1944                 :         {
    1945              78 :                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
    1946                 :                                                    S_IRUSR | S_IWUSR);
    1947              78 :                 if (fd < 0)
    1948                 :                 {
    1949               2 :                         if (errno != ENOENT)
    1950               0 :                                 ereport(ERROR,
    1951                 :                                                 (errcode_for_file_access(),
    1952                 :                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
    1953                 :                                                                 path, log, seg)));
    1954                 :                 }
    1955                 :                 else
    1956              76 :                         return fd;
    1957                 :         }
    1958                 : 
    1959                 :         /*
    1960                 :          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
    1961                 :          * another process is doing the same thing.  If so, we will end up
    1962                 :          * pre-creating an extra log segment.  That seems OK, and better than
    1963                 :          * holding the lock throughout this lengthy process.
    1964                 :          */
    1965               3 :         elog(DEBUG2, "creating and filling new WAL file");
    1966                 : 
    1967               3 :         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    1968                 : 
    1969               3 :         unlink(tmppath);
    1970                 : 
    1971                 :         /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
    1972               3 :         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
    1973                 :                                            S_IRUSR | S_IWUSR);
    1974               3 :         if (fd < 0)
    1975               0 :                 ereport(ERROR,
    1976                 :                                 (errcode_for_file_access(),
    1977                 :                                  errmsg("could not create file \"%s\": %m", tmppath)));
    1978                 : 
    1979                 :         /*
    1980                 :          * Zero-fill the file.  We have to do this the hard way to ensure that all
    1981                 :          * the file space has really been allocated --- on platforms that allow
    1982                 :          * "holes" in files, just seeking to the end doesn't allocate intermediate
    1983                 :          * space.  This way, we know that we have all the space and (after the
    1984                 :          * fsync below) that all the indirect blocks are down on disk.  Therefore,
    1985                 :          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
    1986                 :          * log file.
    1987                 :          *
    1988                 :          * Note: palloc zbuffer, instead of just using a local char array, to
    1989                 :          * ensure it is reasonably well-aligned; this may save a few cycles
    1990                 :          * transferring data to the kernel.
                         :          *
                         :          * A write that transfers fewer than XLOG_BLCKSZ bytes is treated the
                         :          * same as a failed write; errno is cleared beforehand so that a short
                         :          * write which leaves errno untouched can be told apart below.
    1991                 :          */
    1992               3 :         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
    1993            6147 :         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
    1994                 :         {
    1995            6144 :                 errno = 0;
    1996            6144 :                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
    1997                 :                 {
    1998               0 :                         int                     save_errno = errno;
    1999                 : 
    2000                 :                         /*
    2001                 :                          * If we fail to make the file, delete it to release disk space
                         :                          *
                         :                          * NOTE(review): fd is not closed on this path before
                         :                          * ereport(ERROR), and zbuffer is not pfree'd.  Presumably the
                         :                          * palloc'd buffer is reclaimed by memory-context cleanup, but
                         :                          * confirm whether anything closes a BasicOpenFile descriptor
                         :                          * after an ERROR.
    2002                 :                          */
    2003               0 :                         unlink(tmppath);
    2004                 :                         /* if write didn't set errno, assume problem is no disk space */
    2005               0 :                         errno = save_errno ? save_errno : ENOSPC;
    2006                 : 
    2007               0 :                         ereport(ERROR,
    2008                 :                                         (errcode_for_file_access(),
    2009                 :                                          errmsg("could not write to file \"%s\": %m", tmppath)));
    2010                 :                 }
    2011                 :         }
    2012               3 :         pfree(zbuffer);
    2013                 : 
    2014               3 :         if (pg_fsync(fd) != 0)
    2015               0 :                 ereport(ERROR,
    2016                 :                                 (errcode_for_file_access(),
    2017                 :                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    2018                 : 
    2019               3 :         if (close(fd))
    2020               0 :                 ereport(ERROR,
    2021                 :                                 (errcode_for_file_access(),
    2022                 :                                  errmsg("could not close file \"%s\": %m", tmppath)));
    2023                 : 
    2024                 :         /*
    2025                 :          * Now move the segment into place with its final name.
    2026                 :          *
    2027                 :          * If caller didn't want to use a pre-existing file, get rid of any
    2028                 :          * pre-existing file.  Otherwise, cope with possibility that someone else
    2029                 :          * has created the file while we were filling ours: if so, use ours to
    2030                 :          * pre-create a future log segment.
                         :          *
                         :          * installed_log/installed_seg and max_advance are scratch copies
                         :          * that InstallXLogFileSegment() may modify; their final values are
                         :          * not used here, so "log" and "seg" still name the requested
                         :          * segment when it is opened at the end.
    2031                 :          */
    2032               3 :         installed_log = log;
    2033               3 :         installed_seg = seg;
    2034               3 :         max_advance = XLOGfileslop;
    2035               3 :         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
    2036                 :                                                                 *use_existent, &max_advance,
    2037                 :                                                                 use_lock))
    2038                 :         {
    2039                 :                 /* No need for any more future segments... */
    2040               0 :                 unlink(tmppath);
    2041                 :         }
    2042                 : 
    2043               3 :         elog(DEBUG2, "done creating and filling new WAL file");
    2044                 : 
    2045                 :         /* Set flag to tell caller there was no existent file */
    2046               3 :         *use_existent = false;
    2047                 : 
    2048                 :         /* Now open original target segment (might not be file I just made) */
    2049               3 :         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
    2050                 :                                            S_IRUSR | S_IWUSR);
    2051               3 :         if (fd < 0)
    2052               0 :                 ereport(ERROR,
    2053                 :                                 (errcode_for_file_access(),
    2054                 :                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
    2055                 :                                   path, log, seg)));
    2056                 : 
    2057               3 :         return fd;
    2058                 : }
    2059                 : 
    2060                 : /*
    2061                 :  * Create a new XLOG file segment by copying a pre-existing one.
    2062                 :  *
    2063                 :  * log, seg: identify segment to be created.
    2064                 :  *
    2065                 :  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
    2066                 :  *              a different timeline)
    2067                 :  *
    2068                 :  * Currently this is only used during recovery, and so there are no locking
    2069                 :  * considerations.  But we should be just as careful as XLogFileInit to avoid
    2070                 :  * installing a bogus file.
                         :  *
                         :  * The copy is made under a temporary name (XLOGDIR/xlogtemp.<pid>), fsynced
                         :  * and closed, and only then moved to its final name by
                         :  * InstallXLogFileSegment() with find_free = false and use_lock = false.
                         :  * XLogSegSize bytes are copied in XLOG_BLCKSZ-sized chunks; a short read of
                         :  * the source is reported as an error.  There is no return value: every
                         :  * failure is reported through ereport/elog at ERROR level.
    2071                 :  */
    2072                 : static void
    2073                 : XLogFileCopy(uint32 log, uint32 seg,
    2074                 :                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
    2075               0 : {
    2076                 :         char            path[MAXPGPATH];
    2077                 :         char            tmppath[MAXPGPATH];
    2078                 :         char            buffer[XLOG_BLCKSZ];
    2079                 :         int                     srcfd;
    2080                 :         int                     fd;
    2081                 :         int                     nbytes;
    2082                 : 
    2083                 :         /*
    2084                 :          * Open the source file
    2085                 :          */
    2086               0 :         XLogFilePath(path, srcTLI, srclog, srcseg);
    2087               0 :         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
    2088               0 :         if (srcfd < 0)
    2089               0 :                 ereport(ERROR,
    2090                 :                                 (errcode_for_file_access(),
    2091                 :                                  errmsg("could not open file \"%s\": %m", path)));
    2092                 : 
    2093                 :         /*
    2094                 :          * Copy into a temp file name.
    2095                 :          */
    2096               0 :         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    2097                 : 
    2098               0 :         unlink(tmppath);
    2099                 : 
    2100                 :         /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
    2101               0 :         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
    2102                 :                                            S_IRUSR | S_IWUSR);
    2103               0 :         if (fd < 0)
    2104               0 :                 ereport(ERROR,
    2105                 :                                 (errcode_for_file_access(),
    2106                 :                                  errmsg("could not create file \"%s\": %m", tmppath)));
    2107                 : 
    2108                 :         /*
    2109                 :          * Do the data copying.
                         :          *
                         :          * errno is cleared before each read and write so that a short
                         :          * transfer which leaves errno untouched can be told apart from a
                         :          * failing call: a short read is reported as "not enough data", and a
                         :          * short write is assumed to mean ENOSPC.
    2110                 :          */
    2111               0 :         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
    2112                 :         {
    2113               0 :                 errno = 0;
    2114               0 :                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
    2115                 :                 {
    2116               0 :                         if (errno != 0)
    2117               0 :                                 ereport(ERROR,
    2118                 :                                                 (errcode_for_file_access(),
    2119                 :                                                  errmsg("could not read file \"%s\": %m", path)));
    2120                 :                         else
    2121               0 :                                 ereport(ERROR,
    2122                 :                                                 (errmsg("not enough data in file \"%s\"", path)));
    2123                 :                 }
    2124               0 :                 errno = 0;
    2125               0 :                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
    2126                 :                 {
    2127               0 :                         int                     save_errno = errno;
    2128                 : 
    2129                 :                         /*
    2130                 :                          * If we fail to make the file, delete it to release disk space
                         :                          *
                         :                          * NOTE(review): neither srcfd nor fd is closed on this or the
                         :                          * other error paths in this function, and the read-failure
                         :                          * paths above do not unlink tmppath.  Confirm whether anything
                         :                          * cleans up BasicOpenFile descriptors after an ERROR.
    2131                 :                          */
    2132               0 :                         unlink(tmppath);
    2133                 :                         /* if write didn't set errno, assume problem is no disk space */
    2134               0 :                         errno = save_errno ? save_errno : ENOSPC;
    2135                 : 
    2136               0 :                         ereport(ERROR,
    2137                 :                                         (errcode_for_file_access(),
    2138                 :                                          errmsg("could not write to file \"%s\": %m", tmppath)));
    2139                 :                 }
    2140                 :         }
    2141                 : 
    2142               0 :         if (pg_fsync(fd) != 0)
    2143               0 :                 ereport(ERROR,
    2144                 :                                 (errcode_for_file_access(),
    2145                 :                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    2146                 : 
    2147               0 :         if (close(fd))
    2148               0 :                 ereport(ERROR,
    2149                 :                                 (errcode_for_file_access(),
    2150                 :                                  errmsg("could not close file \"%s\": %m", tmppath)));
    2151                 : 
    2152               0 :         close(srcfd);
    2153                 : 
    2154                 :         /*
    2155                 :          * Now move the segment into place with its final name.
                         :          *
                         :          * find_free is false, so any existing file at the target name is
                         :          * replaced and max_advance may be passed as NULL.
    2156                 :          */
    2157               0 :         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
    2158               0 :                 elog(ERROR, "InstallXLogFileSegment should not have failed");
    2159               0 : }
    2160                 : 
    2161                 : /*
    2162                 :  * Install a new XLOG segment file as a current or future log segment.
    2163                 :  *
    2164                 :  * This is used both to install a newly-created segment (which has a temp
    2165                 :  * filename while it's being created) and to recycle an old segment.
    2166                 :  *
    2167                 :  * *log, *seg: identify segment to install as (or first possible target).
    2168                 :  * When find_free is TRUE, these are modified on return to indicate the
    2169                 :  * actual installation location or last segment searched.
                         :  * The target file name is always built using ThisTimeLineID.
    2170                 :  *
    2171                 :  * tmppath: initial name of file to install.  It will be renamed into place.
    2172                 :  *
    2173                 :  * find_free: if TRUE, install the new segment at the first empty log/seg
    2174                 :  * number at or after the passed numbers.  If FALSE, install the new segment
    2175                 :  * exactly where specified, deleting any existing segment file there.
    2176                 :  *
    2177                 :  * *max_advance: maximum number of log/seg slots to advance past the starting
    2178                 :  * point.  Fail if no free slot is found in this range.  On return, reduced
    2179                 :  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
    2180                 :  * when find_free is FALSE.)
    2181                 :  *
    2182                 :  * use_lock: if TRUE, acquire ControlFileLock while moving file into
    2183                 :  * place.  This should be TRUE except during bootstrap log creation.  The
    2184                 :  * caller must *not* hold the lock at call.
    2185                 :  *
    2186                 :  * Returns TRUE if file installed, FALSE if not installed because of
    2187                 :  * exceeding max_advance limit.  On Windows, we also return FALSE if we
    2188                 :  * can't rename the file into place because someone's got it open.
    2189                 :  * (Any other kind of failure causes ereport().)
                         :  *
                         :  * When FALSE is returned the file is still present under tmppath; this
                         :  * function does not remove it.
    2190                 :  */
    2191                 : static bool
    2192                 : InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
    2193                 :                                            bool find_free, int *max_advance,
    2194                 :                                            bool use_lock)
    2195               5 : {
    2196                 :         char            path[MAXPGPATH];
    2197                 :         struct stat stat_buf;
    2198                 : 
    2199               5 :         XLogFilePath(path, ThisTimeLineID, *log, *seg);
    2200                 : 
    2201                 :         /*
    2202                 :          * We want to be sure that only one process does this at a time.
    2203                 :          */
    2204               5 :         if (use_lock)
    2205               4 :                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    2206                 : 
    2207               5 :         if (!find_free)
    2208                 :         {
    2209                 :                 /* Force installation: get rid of any pre-existing segment file */
    2210               1 :                 unlink(path);
    2211                 :         }
    2212                 :         else
    2213                 :         {
    2214                 :                 /* Find a free slot to put it in: advance while the name is taken */
    2215               5 :                 while (stat(path, &stat_buf) == 0)
    2216                 :                 {
    2217               1 :                         if (*max_advance <= 0)
    2218                 :                         {
    2219                 :                                 /* Failed to find a free slot within specified range */
    2220               0 :                                 if (use_lock)
    2221               0 :                                         LWLockRelease(ControlFileLock);
    2222               0 :                                 return false;
    2223                 :                         }
    2224               1 :                         NextLogSeg(*log, *seg);
    2225               1 :                         (*max_advance)--;
    2226               1 :                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
    2227                 :                 }
    2228                 :         }
    2229                 : 
    2230                 :         /*
    2231                 :          * Prefer link() to rename() here just to be really sure that we don't
    2232                 :          * overwrite an existing logfile.  However, there shouldn't be one, so
    2233                 :          * rename() is an acceptable substitute except for the truly paranoid.
                         :          *
                         :          * NOTE(review): when use_lock is true, the ereport(ERROR) calls below
                         :          * are reached with ControlFileLock still held.  Presumably error
                         :          * cleanup releases it -- confirm.
    2234                 :          */
    2235                 : #if HAVE_WORKING_LINK
    2236               5 :         if (link(tmppath, path) < 0)
    2237               0 :                 ereport(ERROR,
    2238                 :                                 (errcode_for_file_access(),
    2239                 :                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
    2240                 :                                                 tmppath, path, *log, *seg)));
    2241               5 :         unlink(tmppath);
    2242                 : #else
    2243                 :         if (rename(tmppath, path) < 0)
    2244                 :         {
    2245                 : #ifdef WIN32
    2246                 : #if !defined(__CYGWIN__)
    2247                 :                 if (GetLastError() == ERROR_ACCESS_DENIED)
    2248                 : #else
    2249                 :                 if (errno == EACCES)
    2250                 : #endif
    2251                 :                 {
    2252                 :                         if (use_lock)
    2253                 :                                 LWLockRelease(ControlFileLock);
    2254                 :                         return false;
    2255                 :                 }
    2256                 : #endif   /* WIN32 */
    2257                 : 
    2258                 :                 ereport(ERROR,
    2259                 :                                 (errcode_for_file_access(),
    2260                 :                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
    2261                 :                                                 tmppath, path, *log, *seg)));
    2262                 :         }
    2263                 : #endif
    2264                 : 
    2265               5 :         if (use_lock)
    2266               4 :                 LWLockRelease(ControlFileLock);
    2267                 : 
    2268               5 :         return true;
    2269                 : }
    2270                 : 
    2271                 : /*
    2272                 :  * Open a pre-existing logfile segment for writing.
                         :  *
                         :  * log, seg: identify the segment; the file name is built using
                         :  * ThisTimeLineID.
                         :  *
                         :  * The file is opened O_RDWR with XLOG_SYNC_BIT and is never created here.
                         :  * Returns the open file descriptor.  Any failure to open, including a
                         :  * missing file, is reported at PANIC level.
    2273                 :  */
    2274                 : static int
    2275                 : XLogFileOpen(uint32 log, uint32 seg)
    2276              65 : {
    2277                 :         char            path[MAXPGPATH];
    2278                 :         int                     fd;
    2279                 : 
    2280              65 :         XLogFilePath(path, ThisTimeLineID, log, seg);
    2281                 : 
    2282              65 :         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
    2283                 :                                            S_IRUSR | S_IWUSR);
    2284              65 :         if (fd < 0)
    2285               0 :                 ereport(PANIC,
    2286                 :                                 (errcode_for_file_access(),
    2287                 :                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
    2288                 :                                   path, log, seg)));
    2289                 : 
    2290              65 :         return fd;
    2291                 : }
    2292                 : 
    2293                 : /*
    2294                 :  * Open a logfile segment for reading (during recovery).
                         :  *
                         :  * log, seg: identify the segment to open.
                         :  *
                         :  * emode: error level passed to ereport() when no timeline in expectedTLIs
                         :  * yields the file.
                         :  *
                         :  * Returns the open file descriptor on success, after setting curFileTLI
                         :  * to the timeline whose file was opened.  If the file is not found under
                         :  * any candidate timeline, the failure is reported at emode and, should
                         :  * ereport() return, -1 is returned.  An open failure other than ENOENT is
                         :  * reported at PANIC regardless of emode.
                         :  *
                         :  * In archive recovery the file is first requested through
                         :  * RestoreArchivedFile() and restoredFromArchive is set from its result.
                         :  * The PS display is updated to show which file is being waited for or
                         :  * recovered.
    2295                 :  */
    2296                 : static int
    2297                 : XLogFileRead(uint32 log, uint32 seg, int emode)
    2298              14 : {
    2299                 :         char            path[MAXPGPATH];
    2300                 :         char            xlogfname[MAXFNAMELEN];
    2301                 :         char            activitymsg[MAXFNAMELEN + 16];
    2302                 :         ListCell   *cell;
    2303                 :         int                     fd;
    2304                 : 
    2305                 :         /*
    2306                 :          * Loop looking for a suitable timeline ID: we might need to read any of
    2307                 :          * the timelines listed in expectedTLIs.
    2308                 :          *
    2309                 :          * We expect curFileTLI on entry to be the TLI of the preceding file in
    2310                 :          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
    2311                 :          * to go backwards; this prevents us from picking up the wrong file when a
    2312                 :          * parent timeline extends to higher segment numbers than the child we
    2313                 :          * want to read.
                         :          *
                         :          * NOTE(review): the early "break" below presumes expectedTLIs is
                         :          * ordered newest timeline first -- confirm where the list is built.
    2314                 :          */
    2315              28 :         foreach(cell, expectedTLIs)
    2316                 :         {
    2317              14 :                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
    2318                 : 
    2319              14 :                 if (tli < curFileTLI)
    2320               0 :                         break;                          /* don't bother looking at too-old TLIs */
    2321                 : 
    2322              14 :                 XLogFileName(xlogfname, tli, log, seg);
    2323                 : 
    2324              14 :                 if (InArchiveRecovery)
    2325                 :                 {
    2326                 :                         /* Report recovery progress in PS display */
    2327               0 :                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
    2328                 :                                          xlogfname);
    2329               0 :                         set_ps_display(activitymsg, false);
    2330                 : 
    2331               0 :                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
    2332                 :                                                                                                           "RECOVERYXLOG",
    2333                 :                                                                                                           XLogSegSize);
    2334                 :                 }
    2335                 :                 else
    2336              14 :                         XLogFilePath(path, tli, log, seg);
    2337                 : 
    2338              14 :                 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
    2339              14 :                 if (fd >= 0)
    2340                 :                 {
    2341                 :                         /* Success! */
    2342              14 :                         curFileTLI = tli;
    2343                 : 
    2344                 :                         /* Report recovery progress in PS display */
    2345              14 :                         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
    2346                 :                                          xlogfname);
    2347              14 :                         set_ps_display(activitymsg, false);
    2348                 : 
    2349              14 :                         return fd;
    2350                 :                 }
    2351               0 :                 if (errno != ENOENT)    /* unexpected failure? */
    2352               0 :                         ereport(PANIC,
    2353                 :                                         (errcode_for_file_access(),
    2354                 :                         errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
    2355                 :                                    path, log, seg)));
    2356                 :         }
    2357                 : 
    2358                 :         /* Couldn't find it.  For simplicity, complain about front timeline */
    2359               0 :         XLogFilePath(path, recoveryTargetTLI, log, seg);
    2360               0 :         errno = ENOENT;
    2361               0 :         ereport(emode,
    2362                 :                         (errcode_for_file_access(),
    2363                 :                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
    2364                 :                                   path, log, seg)));
    2365               0 :         return -1;
    2366                 : }
    2367                 : 
    2368                 : /*
    2369                 :  * Close the current logfile segment for writing.
                         :  *
                         :  * The caller must have a segment open: openLogFile >= 0 is asserted.  If
                         :  * close() fails, the error is reported at PANIC level together with
                         :  * openLogId and openLogSeg.  Afterwards openLogFile is set to -1 so that it
                         :  * no longer refers to the closed descriptor.
    2370                 :  */
    2371                 : static void
    2372                 : XLogFileClose(void)
    2373              11 : {
    2374                 :         Assert(openLogFile >= 0);
    2375                 : 
    2376                 :         /*
    2377                 :          * posix_fadvise is problematic on many platforms: on older x86 Linux it
    2378                 :          * just dumps core, and there are reports of problems on PPC platforms as
    2379                 :          * well.  The following is therefore disabled for the time being. We could
    2380                 :          * consider some kind of configure test to see if it's safe to use, but
    2381                 :          * since we lack hard evidence that there's any useful performance gain to
    2382                 :          * be had, spending time on that seems unprofitable for now.
    2383                 :          */
    2384                 : #ifdef NOT_USED
    2385                 : 
    2386                 :         /*
    2387                 :          * WAL segment files will not be re-read in normal operation, so we advise
    2388                 :          * OS to release any cached pages.  But do not do so if WAL archiving is
    2389                 :          * active, because archiver process could use the cache to read the WAL
    2390                 :          * segment.
    2391                 :          *
    2392                 :          * While O_DIRECT works for O_SYNC, posix_fadvise() works for fsync() and
    2393                 :          * O_SYNC, and some platforms only have posix_fadvise().
    2394                 :          */
    2395                 : #if defined(HAVE_DECL_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    2396                 :         if (!XLogArchivingActive())
    2397                 :                 posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
    2398                 : #endif
    2399                 : #endif   /* NOT_USED */
    2400                 : 
    2401              11 :         if (close(openLogFile))
    2402               0 :                 ereport(PANIC,
    2403                 :                                 (errcode_for_file_access(),
    2404                 :                                  errmsg("could not close log file %u, segment %u: %m",
    2405                 :                                                 openLogId, openLogSeg)));
    2406              11 :         openLogFile = -1;
    2407              11 : }
    2408                 : 
    2409                 : /*
    2410                 :  * Attempt to retrieve the specified file from off-line archival storage.
    2411                 :  * If successful, fill "path" with its complete path (note that this will be
    2412                 :  * a temp file name that doesn't follow the normal naming convention), and
    2413                 :  * return TRUE.
    2414                 :  *
    2415                 :  * If not successful, fill "path" with the name of the normal on-line file
    2416                 :  * (which may or may not actually exist, but we'll try to use it), and return
    2417                 :  * FALSE.
    2418                 :  *
    2419                 :  * For fixed-size files, the caller may pass the expected size as an
    2420                 :  * additional crosscheck on successful recovery.  If the file size is not
    2421                 :  * known, set expectedSize = 0.
                         :  *
                         :  * "path" must have room for MAXPGPATH bytes.  xlogfname is the name
                         :  * substituted for %f in the restore command; recovername is the name,
                         :  * relative to XLOGDIR, under which the restored copy is expected to appear
                         :  * (any existing file of that name is removed before the command is run).
                         :  *
                         :  * Reports FATAL if stat() on the temp file fails for a reason other than
                         :  * ENOENT, if an existing temp file cannot be removed, if the restored file
                         :  * has the wrong size, or if the restore command was killed by a signal or
                         :  * exited with a status above 125.  Any other command failure is logged at
                         :  * DEBUG2 and leads to the "not successful" result described above.
                         :  *
                         :  * NOTE(review): the expanded command is built in a MAXPGPATH-sized buffer
                         :  * and characters that do not fit are dropped without any message, so an
                         :  * over-long restore command would be executed in truncated form.  Worth
                         :  * confirming whether that case should be reported instead.
    2422                 :  */
    2423                 : static bool
    2424                 : RestoreArchivedFile(char *path, const char *xlogfname,
    2425                 :                                         const char *recovername, off_t expectedSize)
    2426               0 : {
    2427                 :         char            xlogpath[MAXPGPATH];
    2428                 :         char            xlogRestoreCmd[MAXPGPATH];
    2429                 :         char            lastRestartPointFname[MAXPGPATH];
    2430                 :         char       *dp;
    2431                 :         char       *endp;
    2432                 :         const char *sp;
    2433                 :         int                     rc;
    2434                 :         bool            signaled;
    2435                 :         struct stat stat_buf;
    2436                 :         uint32          restartLog;
    2437                 :         uint32          restartSeg;
    2438                 : 
    2439                 :         /*
    2440                 :          * When doing archive recovery, we always prefer an archived log file even
    2441                 :          * if a file of the same name exists in XLOGDIR.  The reason is that the
    2442                 :          * file in XLOGDIR could be an old, un-filled or partly-filled version
    2443                 :          * that was copied and restored as part of backing up $PGDATA.
    2444                 :          *
    2445                 :          * We could try to optimize this slightly by checking the local copy
    2446                 :          * lastchange timestamp against the archived copy, but we have no API to
    2447                 :          * do this, nor can we guarantee that the lastchange timestamp was
    2448                 :          * preserved correctly when we copied to archive. Our aim is robustness,
    2449                 :          * so we elect not to do this.
    2450                 :          *
    2451                 :          * If we cannot obtain the log file from the archive, however, we will try
    2452                 :          * to use the XLOGDIR file if it exists.  This is so that we can make use
    2453                 :          * of log segments that weren't yet transferred to the archive.
    2454                 :          *
    2455                 :          * Notice that we don't actually overwrite any files when we copy back
    2456                 :          * from archive because the recoveryRestoreCommand may inadvertently
    2457                 :          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
    2458                 :          * fall back to the segments remaining in current XLOGDIR later. The
    2459                 :          * copy-from-archive filename is always the same, ensuring that we don't
    2460                 :          * run out of disk space on long recoveries.
    2461                 :          */
    2462               0 :         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
    2463                 : 
    2464                 :         /*
    2465                 :          * Make sure there is no existing file named recovername.
    2466                 :          */
    2467               0 :         if (stat(xlogpath, &stat_buf) != 0)
    2468                 :         {
    2469               0 :                 if (errno != ENOENT)
    2470               0 :                         ereport(FATAL,
    2471                 :                                         (errcode_for_file_access(),
    2472                 :                                          errmsg("could not stat file \"%s\": %m",
    2473                 :                                                         xlogpath)));
    2474                 :         }
    2475                 :         else
    2476                 :         {
    2477               0 :                 if (unlink(xlogpath) != 0)
    2478               0 :                         ereport(FATAL,
    2479                 :                                         (errcode_for_file_access(),
    2480                 :                                          errmsg("could not remove file \"%s\": %m",
    2481                 :                                                         xlogpath)));
    2482                 :         }
    2483                 : 
    2484                 :         /*
    2485                 :          * construct the command to be executed
                         :          *
                         :          * recoveryRestoreCommand is copied into xlogRestoreCmd with %p, %f, %r
                         :          * and %% expanded (see the cases below); any other character, including
                         :          * a '%' followed by something else, is copied through unchanged.  endp
                         :          * points at the last byte of the buffer, which is pre-set to '\0', and
                         :          * the single-character copies are skipped once dp has reached it.
    2486                 :          */
    2487               0 :         dp = xlogRestoreCmd;
    2488               0 :         endp = xlogRestoreCmd + MAXPGPATH - 1;
    2489               0 :         *endp = '\0';
    2490                 : 
    2491               0 :         for (sp = recoveryRestoreCommand; *sp; sp++)
    2492                 :         {
    2493               0 :                 if (*sp == '%')
    2494                 :                 {
    2495               0 :                         switch (sp[1])
    2496                 :                         {
    2497                 :                                 case 'p':
    2498                 :                                         /* %p: relative path of target file */
    2499               0 :                                         sp++;
    2500               0 :                                         StrNCpy(dp, xlogpath, endp - dp);
    2501               0 :                                         make_native_path(dp);
    2502               0 :                                         dp += strlen(dp);
    2503               0 :                                         break;
    2504                 :                                 case 'f':
    2505                 :                                         /* %f: filename of desired file */
    2506               0 :                                         sp++;
    2507               0 :                                         StrNCpy(dp, xlogfname, endp - dp);
    2508               0 :                                         dp += strlen(dp);
    2509               0 :                                         break;
    2510                 :                                 case 'r':
    2511                 :                                         /* %r: filename of last restartpoint */
    2512               0 :                                         sp++;
    2513               0 :                                         XLByteToSeg(ControlFile->checkPointCopy.redo,
    2514                 :                                                                 restartLog, restartSeg);
    2515               0 :                                         XLogFileName(lastRestartPointFname,
    2516                 :                                                                  ControlFile->checkPointCopy.ThisTimeLineID,
    2517                 :                                                                  restartLog, restartSeg);
    2518               0 :                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
    2519               0 :                                         dp += strlen(dp);
    2520               0 :                                         break;
    2521                 :                                 case '%':
    2522                 :                                         /* convert %% to a single % */
    2523               0 :                                         sp++;
    2524               0 :                                         if (dp < endp)
    2525               0 :                                                 *dp++ = *sp;
    2526                 :                                         break;
    2527                 :                                 default:
    2528                 :                                         /* otherwise treat the % as not special */
    2529               0 :                                         if (dp < endp)
    2530               0 :                                                 *dp++ = *sp;
    2531                 :                                         break;
    2532                 :                         }
    2533                 :                 }
    2534                 :                 else
    2535                 :                 {
    2536               0 :                         if (dp < endp)
    2537               0 :                                 *dp++ = *sp;
    2538                 :                 }
    2539                 :         }
    2540               0 :         *dp = '\0';
    2541                 : 
    2542               0 :         ereport(DEBUG3,
    2543                 :                         (errmsg_internal("executing restore command \"%s\"",
    2544                 :                                                          xlogRestoreCmd)));
    2545                 : 
    2546                 :         /*
    2547                 :          * Copy xlog from archival storage to XLOGDIR
    2548                 :          */
    2549               0 :         rc = system(xlogRestoreCmd);
    2550               0 :         if (rc == 0)
    2551                 :         {
    2552                 :                 /*
    2553                 :                  * command apparently succeeded, but let's make sure the file is
    2554                 :                  * really there now and has the correct size.
    2555                 :                  *
    2556                 :                  * XXX I made wrong-size a fatal error to ensure the DBA would notice
    2557                 :                  * it, but is that too strong?  We could try to plow ahead with a
    2558                 :                  * local copy of the file ... but the problem is that there probably
    2559                 :                  * isn't one, and we'd incorrectly conclude we've reached the end of
    2560                 :                  * WAL and we're done recovering ...
    2561                 :                  */
    2562               0 :                 if (stat(xlogpath, &stat_buf) == 0)
    2563                 :                 {
    2564               0 :                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
    2565               0 :                                 ereport(FATAL,
    2566                 :                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
    2567                 :                                                                 xlogfname,
    2568                 :                                                                 (unsigned long) stat_buf.st_size,
    2569                 :                                                                 (unsigned long) expectedSize)));
    2570                 :                         else
    2571                 :                         {
    2572               0 :                                 ereport(LOG,
    2573                 :                                                 (errmsg("restored log file \"%s\" from archive",
    2574                 :                                                                 xlogfname)));
    2575               0 :                                 strcpy(path, xlogpath);
    2576               0 :                                 return true;
    2577                 :                         }
    2578                 :                 }
    2579                 :                 else
    2580                 :                 {
    2581                 :                         /* stat failed */
    2582               0 :                         if (errno != ENOENT)
    2583               0 :                                 ereport(FATAL,
    2584                 :                                                 (errcode_for_file_access(),
    2585                 :                                                  errmsg("could not stat file \"%s\": %m",
    2586                 :                                                                 xlogpath)));
    2587                 :                 }
    2588                 :         }
    2589                 : 
    2590                 :         /*
    2591                 :          * Remember, we roll forward UNTIL the restore fails so failure here is
    2592                 :          * just part of the process... that makes it difficult to determine
    2593                 :          * whether the restore failed because there isn't an archive to restore,
    2594                 :          * or because the administrator has specified the restore program
    2595                 :          * incorrectly.  We have to assume the former.
    2596                 :          *
    2597                 :          * However, if the failure was due to any sort of signal, it's best to
    2598                 :          * punt and abort recovery.  (If we "return false" here, upper levels will
    2599                 :          * assume that recovery is complete and start up the database!) It's
    2600                 :          * essential to abort on child SIGINT and SIGQUIT, because per spec
    2601                 :          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
    2602                 :          * those it's a good bet we should have gotten it too.  Aborting on other
    2603                 :          * signals such as SIGTERM seems a good idea as well.
    2604                 :          *
    2605                 :          * Per the Single Unix Spec, shells report exit status > 128 when a called
    2606                 :          * command died on a signal.  Also, 126 and 127 are used to report
    2607                 :          * problems such as an unfindable command; treat those as fatal errors
    2608                 :          * too.
                         :          *
                         :          * Note that the "return code" printed below is the raw value returned by
                         :          * system(), not the decoded exit status.
    2609                 :          */
    2610               0 :         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
    2611                 : 
    2612               0 :         ereport(signaled ? FATAL : DEBUG2,
    2613                 :                 (errmsg("could not restore file \"%s\" from archive: return code %d",
    2614                 :                                 xlogfname, rc)));
    2615                 : 
    2616                 :         /*
    2617                 :          * if an archived file is not available, there might still be a version of
    2618                 :          * this file in XLOGDIR, so return that as the filename to open.
    2619                 :          *
    2620                 :          * In many recovery scenarios we expect this to fail also, but if so that
    2621                 :          * just means we've reached the end of WAL.
    2622                 :          */
    2623               0 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
    2624               0 :         return false;
    2625                 : }
    2626                 : 
    2627                 : /*
    2628                 :  * Preallocate log files beyond the specified log endpoint.
    2629                 :  *
    2630                 :  * XXX this is currently extremely conservative, since it forces only one
    2631                 :  * future log segment to exist, and even that only if we are 75% done with
    2632                 :  * the current one.  This is only appropriate for very low-WAL-volume systems.
    2633                 :  * High-volume systems will be OK once they've built up a sufficient set of
    2634                 :  * recycled log segments, but the startup transient is likely to include
    2635                 :  * a lot of segment creations by foreground processes, which is not so good.
                         :  *
                         :  * endptr is the log location to preallocate beyond.  The log id and
                         :  * segment are derived from it with XLByteToPrevSeg.  Only when the offset
                         :  * of the byte just before endptr, taken modulo XLogSegSize, is at or past
                         :  * 75% of XLogSegSize does the function step to the next segment, call
                         :  * XLogFileInit for it with use_existent = true, and close the returned
                         :  * descriptor again.  ckpt_segs_added is incremented when use_existent
                         :  * comes back false -- presumably XLogFileInit clears it when it had to
                         :  * create the file rather than reuse an existing one; confirm against
                         :  * XLogFileInit.
                         :  *
                         :  * NOTE(review): the result of close(lf) is not checked.
    2636                 :  */
    2637                 : static void
    2638                 : PreallocXlogFiles(XLogRecPtr endptr)
    2639              19 : {
    2640                 :         uint32          _logId;
    2641                 :         uint32          _logSeg;
    2642                 :         int                     lf;
    2643                 :         bool            use_existent;
    2644                 : 
    2645              19 :         XLByteToPrevSeg(endptr, _logId, _logSeg);
    2646              19 :         if ((endptr.xrecoff - 1) % XLogSegSize >=
    2647                 :                 (uint32) (0.75 * XLogSegSize))
    2648                 :         {
    2649               0 :                 NextLogSeg(_logId, _logSeg);
    2650               0 :                 use_existent = true;
    2651               0 :                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
    2652               0 :                 close(lf);
    2653               0 :                 if (!use_existent)
    2654               0 :                         CheckpointStats.ckpt_segs_added++;
    2655                 :         }
    2656              19 : }
    2657                 : 
    2658                 : /*
    2659                 :  * Recycle or remove all log files older or equal to passed log/seg#
    2660                 :  *
    2661                 :  * endptr is current (or recent) end of xlog; this is used to determine
    2662                 :  * whether we want to recycle rather than delete no-longer-wanted log files.
                         :  *
                         :  * log/seg identify the cutoff segment.  A directory entry is considered
                         :  * only if its name is exactly 24 characters from [0-9A-F] and, ignoring
                         :  * the first 8 characters (the timeline part), sorts at or before the
                         :  * name built for the cutoff, and only once XLogArchiveCheckDone() returns
                         :  * true for it.  Such a file is first offered to InstallXLogFileSegment()
                         :  * for reuse as a future segment; if that is refused it is unlinked.
                         :  * XLogArchiveCleanup() is called for the file in both cases, and the
                         :  * ckpt_segs_recycled / ckpt_segs_removed counters in CheckpointStats are
                         :  * incremented accordingly.
                         :  *
                         :  * Reports ERROR if XLOGDIR cannot be opened.
                         :  *
                         :  * NOTE(review): the result of unlink() is not checked, so a file that
                         :  * could not be removed is still counted in ckpt_segs_removed and nothing
                         :  * is logged about the failure.
    2663                 :  */
    2664                 : static void
    2665                 : RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
    2666               2 : {
    2667                 :         uint32          endlogId;
    2668                 :         uint32          endlogSeg;
    2669                 :         int                     max_advance;
    2670                 :         DIR                *xldir;
    2671                 :         struct dirent *xlde;
    2672                 :         char            lastoff[MAXFNAMELEN];
    2673                 :         char            path[MAXPGPATH];
    2674                 : 
    2675                 :         /*
    2676                 :          * Initialize info about where to try to recycle to.  We allow recycling
    2677                 :          * segments up to XLOGfileslop segments beyond the current XLOG location.
    2678                 :          */
    2679               2 :         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
    2680               2 :         max_advance = XLOGfileslop;
    2681                 : 
    2682               2 :         xldir = AllocateDir(XLOGDIR);
    2683               2 :         if (xldir == NULL)
    2684               0 :                 ereport(ERROR,
    2685                 :                                 (errcode_for_file_access(),
    2686                 :                                  errmsg("could not open transaction log directory \"%s\": %m",
    2687                 :                                                 XLOGDIR)));
    2688                 : 
    2689               2 :         XLogFileName(lastoff, ThisTimeLineID, log, seg);
    2690                 : 
    2691              16 :         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    2692                 :         {
    2693                 :                 /*
    2694                 :                  * We ignore the timeline part of the XLOG segment identifiers in
    2695                 :                  * deciding whether a segment is still needed.  This ensures that we
    2696                 :                  * won't prematurely remove a segment from a parent timeline. We could
    2697                 :                  * probably be a little more proactive about removing segments of
    2698                 :                  * non-parent timelines, but that would be a whole lot more
    2699                 :                  * complicated.
    2700                 :                  *
    2701                 :                  * We use the alphanumeric sorting property of the filenames to decide
    2702                 :                  * which ones are earlier than the lastoff segment.
    2703                 :                  */
    2704              24 :                 if (strlen(xlde->d_name) == 24 &&
    2705               6 :                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
    2706               6 :                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
    2707                 :                 {
    2708               2 :                         if (XLogArchiveCheckDone(xlde->d_name))
    2709                 :                         {
    2710               2 :                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
    2711                 : 
    2712                 :                                 /*
    2713                 :                                  * Before deleting the file, see if it can be recycled as a
    2714                 :                                  * future log segment.
    2715                 :                                  */
    2716               2 :                                 if (InstallXLogFileSegment(&endlogId, &endlogSeg, path,
    2717                 :                                                                                    true, &max_advance,
    2718                 :                                                                                    true))
    2719                 :                                 {
    2720               2 :                                         ereport(DEBUG2,
    2721                 :                                                         (errmsg("recycled transaction log file \"%s\"",
    2722                 :                                                                         xlde->d_name)));
    2723               2 :                                         CheckpointStats.ckpt_segs_recycled++;
    2724                 :                                         /* Needn't recheck that slot on future iterations */
    2725               2 :                                         if (max_advance > 0)
    2726                 :                                         {
    2727               2 :                                                 NextLogSeg(endlogId, endlogSeg);
    2728               2 :                                                 max_advance--;
    2729                 :                                         }
    2730                 :                                 }
    2731                 :                                 else
    2732                 :                                 {
    2733                 :                                         /* No need for any more future segments... */
    2734               0 :                                         ereport(DEBUG2,
    2735                 :                                                         (errmsg("removing transaction log file \"%s\"",
    2736                 :                                                                         xlde->d_name)));
    2737               0 :                                         unlink(path);
    2738               0 :                                         CheckpointStats.ckpt_segs_removed++;
    2739                 :                                 }
    2740                 : 
    2741               2 :                                 XLogArchiveCleanup(xlde->d_name);
    2742                 :                         }
    2743                 :                 }
    2744                 :         }
    2745                 : 
    2746               2 :         FreeDir(xldir);
    2747               2 : }
    2748                 : 
    2749                 : /*
    2750                 :  * Remove previous backup history files.  This also retries creation of
    2751                 :  * .ready files for any backup history files for which XLogArchiveNotify
    2752                 :  * failed earlier.
                         :  *
                         :  * Scans XLOGDIR for entries whose names are longer than 24 characters,
                         :  * begin with 24 characters from [0-9A-F] and end in ".backup".  Each
                         :  * such file for which XLogArchiveCheckDone() returns true is unlinked
                         :  * and then passed to XLogArchiveCleanup().
                         :  *
                         :  * Reports ERROR if XLOGDIR cannot be opened.
                         :  *
                         :  * NOTE(review): the result of unlink() is not checked, so a failure to
                         :  * remove a history file goes unreported.
    2753                 :  */
    2754                 : static void
    2755                 : CleanupBackupHistory(void)
    2756               0 : {
    2757                 :         DIR                *xldir;
    2758                 :         struct dirent *xlde;
    2759                 :         char            path[MAXPGPATH];
    2760                 : 
    2761               0 :         xldir = AllocateDir(XLOGDIR);
    2762               0 :         if (xldir == NULL)
    2763               0 :                 ereport(ERROR,
    2764                 :                                 (errcode_for_file_access(),
    2765                 :                                  errmsg("could not open transaction log directory \"%s\": %m",
    2766                 :                                                 XLOGDIR)));
    2767                 : 
    2768               0 :         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    2769                 :         {
    2770               0 :                 if (strlen(xlde->d_name) > 24 &&
    2771               0 :                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
    2772               0 :                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
    2773                 :                                    ".backup") == 0)
    2774                 :                 {
    2775               0 :                         if (XLogArchiveCheckDone(xlde->d_name))
    2776                 :                         {
    2777               0 :                                 ereport(DEBUG2,
    2778                 :                                 (errmsg("removing transaction log backup history file \"%s\"",
    2779                 :                                                 xlde->d_name)));
    2780               0 :                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
    2781               0 :                                 unlink(path);
    2782               0 :                                 XLogArchiveCleanup(xlde->d_name);
    2783                 :                         }
    2784                 :                 }
    2785                 :         }
    2786                 : 
    2787               0 :         FreeDir(xldir);
    2788               0 : }
    2789                 : 
    2790                 : /*
    2791                 :  * Restore the backup blocks present in an XLOG record, if any.
    2792                 :  *
    2793                 :  * We assume all of the record has been read into memory at *record.
    2794                 :  *
    2795                 :  * Note: when a backup block is available in XLOG, we restore it
    2796                 :  * unconditionally, even if the page in the database appears newer.
    2797                 :  * This is to protect ourselves against database pages that were partially
    2798                 :  * or incorrectly written during a crash.  We assume that the XLOG data
    2799                 :  * must be good because it has passed a CRC check, while the database
    2800                 :  * page might not be.  This will force us to replay all subsequent
    2801                 :  * modifications of the page that appear in XLOG, rather than possibly
    2802                 :  * ignoring them as already applied, but that's not a huge drawback.
                         :  *
                         :  * Parameters:
                         :  *	record - the XLOG record to scan.  Backup block data is taken to
                         :  *			 start record->xl_len bytes past XLogRecGetData(record).
                         :  *	lsn    - value stamped on every restored page via PageSetLSN().
                         :  *
                         :  * For each slot i below XLR_MAX_BKP_BLOCKS whose XLR_SET_BKP_BLOCK(i)
                         :  * bit is set in record->xl_info, the function copies out a BkpBlock
                         :  * header, fetches the target buffer, overwrites the whole page with
                         :  * the stored image (zero-filling the "hole" when hole_length is
                         :  * nonzero), sets the page's LSN and TLI (ThisTimeLineID), marks the
                         :  * buffer dirty and releases it.  Nothing is returned.
                         :  *
                         :  * This function does not range-check hole_offset/hole_length itself.
                         :  * RecordIsValid() in this file rejects records whose
                         :  * hole_offset + hole_length exceeds BLCKSZ; presumably callers only
                         :  * pass records that have already passed that check -- TODO confirm.
    2803                 :  */
    2804                 : static void
    2805                 : RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
    2806               0 : {
    2807                 :         Relation        reln;
    2808                 :         Buffer          buffer;
    2809                 :         Page            page;
    2810                 :         BkpBlock        bkpb;
    2811                 :         char       *blk;
    2812                 :         int                     i;
    2813                 : 
    2814               0 :         blk = (char *) XLogRecGetData(record) + record->xl_len;	/* first byte after the xl_len bytes of record data */
    2815               0 :         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
    2816                 :         {
    2817               0 :                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
    2818               0 :                         continue;	/* no backup block stored for slot i; blk is not advanced */
    2819                 : 
    2820               0 :                 memcpy(&bkpb, blk, sizeof(BkpBlock));	/* copy header to a local; presumably blk may be unaligned -- confirm */
    2821               0 :                 blk += sizeof(BkpBlock);	/* blk now points at the stored page image */
    2822                 : 
    2823               0 :                 reln = XLogOpenRelation(bkpb.node);
    2824               0 :                 buffer = XLogReadBuffer(reln, bkpb.block, true);	/* NOTE(review): third argument presumably requests page init rather than a read -- confirm */
    2825                 :                 Assert(BufferIsValid(buffer));	/* only an assertion; presumably compiled out in non-assert builds */
    2826               0 :                 page = (Page) BufferGetPage(buffer);
    2827                 : 
    2828               0 :                 if (bkpb.hole_length == 0)
    2829                 :                 {
    2830               0 :                         memcpy((char *) page, blk, BLCKSZ);	/* no hole: a full BLCKSZ image is stored */
    2831                 :                 }
    2832                 :                 else
    2833                 :                 {
    2834                 :                         /* must zero-fill the hole */
    2835               0 :                         MemSet((char *) page, 0, BLCKSZ);
    2836               0 :                         memcpy((char *) page, blk, bkpb.hole_offset);	/* part of the page before the hole */
    2837               0 :                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),	/* part of the page after the hole */
    2838                 :                                    blk + bkpb.hole_offset,
    2839                 :                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
    2840                 :                 }
    2841                 : 
    2842               0 :                 PageSetLSN(page, lsn);
    2843               0 :                 PageSetTLI(page, ThisTimeLineID);
    2844               0 :                 MarkBufferDirty(buffer);
    2845               0 :                 UnlockReleaseBuffer(buffer);
    2846                 : 
    2847               0 :                 blk += BLCKSZ - bkpb.hole_length;	/* the stored image omits the hole bytes */
    2848                 :         }
    2849               0 : }
    2850                 : 
    2851                 : /*
    2852                 :  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
    2853                 :  * record (other than to the minimal extent of computing the amount of
    2854                 :  * data to read in) until we've checked the CRCs.
    2855                 :  *
    2856                 :  * We assume all of the record has been read into memory at *record.
                         :  *
                         :  * Parameters:
                         :  *	record - the record to check.
                         :  *	recptr - used only to format the xlogid/xrecoff in error messages.
                         :  *	emode  - level passed to ereport() for each failure report.
                         :  *
                         :  * Returns true when every check passes.  On failure a message is
                         :  * reported at level emode and false is returned (presumably ereport
                         :  * does not return at all for sufficiently severe levels -- confirm).
                         :  *
                         :  * The CRC is accumulated in this order: the xl_len bytes of record
                         :  * data, then each backup block present (BkpBlock header plus
                         :  * BLCKSZ - hole_length bytes), then the record header starting
                         :  * sizeof(pg_crc32) bytes in.  Besides the CRC comparison, two
                         :  * structural checks are made: hole_offset + hole_length must not
                         :  * exceed BLCKSZ, and the end of the last backup block must land
                         :  * exactly at record + xl_tot_len.
                         :  *
                         :  * NOTE(review): the xl_tot_len check runs only after the loop, so the
                         :  * loop reads BkpBlock headers and checksums block data at offsets
                         :  * derived from xl_info and hole_length before anything here bounds
                         :  * them.  Confirm that callers guarantee the buffer is large enough
                         :  * for XLR_MAX_BKP_BLOCKS full blocks, or consider tracking the
                         :  * remaining length inside the loop.
    2857                 :  */
    2858                 : static bool
    2859                 : RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
    2860              28 : {
    2861                 :         pg_crc32        crc;
    2862                 :         int                     i;
    2863              28 :         uint32          len = record->xl_len;
    2864                 :         BkpBlock        bkpb;
    2865                 :         char       *blk;
    2866                 : 
    2867                 :         /* First the rmgr data */
    2868              28 :         INIT_CRC32(crc);
    2869              28 :         COMP_CRC32(crc, XLogRecGetData(record), len);
    2870                 : 
    2871                 :         /* Add in the backup blocks, if any */
    2872              28 :         blk = (char *) XLogRecGetData(record) + len;
    2873             112 :         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
    2874                 :         {
    2875                 :                 uint32          blen;
    2876                 : 
    2877              84 :                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
    2878              84 :                         continue;	/* slot i has no backup block; blk is not advanced */
    2879                 : 
    2880               0 :                 memcpy(&bkpb, blk, sizeof(BkpBlock));	/* header is copied out before any length check -- see NOTE(review) above */
    2881               0 :                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
    2882                 :                 {
    2883               0 :                         ereport(emode,
    2884                 :                                         (errmsg("incorrect hole size in record at %X/%X",
    2885                 :                                                         recptr.xlogid, recptr.xrecoff)));
    2886               0 :                         return false;
    2887                 :                 }
    2888               0 :                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;	/* header plus page image with the hole omitted */
    2889               0 :                 COMP_CRC32(crc, blk, blen);
    2890               0 :                 blk += blen;
    2891                 :         }
    2892                 : 
    2893                 :         /* Check that xl_tot_len agrees with our calculation */
    2894              28 :         if (blk != (char *) record + record->xl_tot_len)
    2895                 :         {
    2896               0 :                 ereport(emode,
    2897                 :                                 (errmsg("incorrect total length in record at %X/%X",
    2898                 :                                                 recptr.xlogid, recptr.xrecoff)));
    2899               0 :                 return false;
    2900                 :         }
    2901                 : 
    2902                 :         /* Finally include the record header */
    2903              28 :         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),	/* skips the first sizeof(pg_crc32) bytes; presumably the xl_crc field itself -- confirm layout */
    2904                 :                            SizeOfXLogRecord - sizeof(pg_crc32));
    2905              28 :         FIN_CRC32(crc);
    2906                 : 
    2907              28 :         if (!EQ_CRC32(record->xl_crc, crc))	/* one CRC covers rmgr data, backup blocks and header, despite the message wording below */
    2908                 :         {
    2909               0 :                 ereport(emode,
    2910                 :                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
    2911                 :                                 recptr.xlogid, recptr.xrecoff)));
    2912               0 :                 return false;
    2913                 :         }
    2914                 : 
    2915              28 :         return true;
    2916                 : }
    2917                 : 
    2918                 : /*
    2919                 :  * Attempt to read an XLOG record.
    2920                 :  *
    2921                 :  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
    2922                 :  * try to read a record just after the last one previously read.
    2923                 :  *
    2924                 :  * If no valid record is available, returns NULL, or fails if emode is PANIC.
    2925                 :  * (emode must be either PANIC or LOG.)
    2926                 :  *
    2927                 :  * The record is copied into readRecordBuf, so that on successful return,
    2928                 :  * the returned record pointer always points there.
    2929                 :  */
    2930                 : static XLogRecord *
    2931                 : ReadRecord(XLogRecPtr *RecPtr, int emode)
    2932              28 : {
    2933                 :         XLogRecord *record;
    2934                 :         char       *buffer;
    2935              28 :         XLogRecPtr      tmpRecPtr = EndRecPtr;
    2936              28 :         bool            randAccess = false;
    2937                 :         uint32          len,
    2938                 :                                 total_len;
    2939                 :         uint32          targetPageOff;
    2940                 :         uint32          targetRecOff;
    2941                 :         uint32          pageHeaderSize;
    2942                 : 
    2943              28 :         if (readBuf == NULL)
    2944                 :         {
    2945                 :                 /*
    2946                 :                  * First time through, permanently allocate readBuf.  We do it this
    2947                 :                  * way, rather than just making a static array, for two reasons: (1)
    2948                 :                  * no need to waste the storage in most instantiations of the backend;
    2949                 :                  * (2) a static char array isn't guaranteed to have any particular
    2950                 :                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
    2951                 :                  */
    2952              14 :                 readBuf = (char *) malloc(XLOG_BLCKSZ);
    2953                 :                 Assert(readBuf != NULL);
    2954                 :         }
    2955                 : 
    2956              28 :         if (RecPtr == NULL)
    2957                 :         {
    2958               0 :                 RecPtr = &tmpRecPtr;
    2959                 :                 /* fast case if next record is on same page */
    2960               0 :                 if (nextRecord != NULL)
    2961                 :                 {
    2962               0 :                         record = nextRecord;
    2963               0 :                         goto got_record;
    2964                 :                 }
    2965                 :                 /* align old recptr to next page */
    2966               0 :                 if (tmpRecPtr.xrecoff % XLOG_BLCKSZ != 0)
    2967               0 :                         tmpRecPtr.xrecoff += (XLOG_BLCKSZ - tmpRecPtr.xrecoff % XLOG_BLCKSZ);
    2968               0 :                 if (tmpRecPtr.xrecoff >= XLogFileSize)
    2969                 :                 {
    2970               0 :                         (tmpRecPtr.xlogid)++;
    2971               0 :                         tmpRecPtr.xrecoff = 0;
    2972                 :                 }
    2973                 :                 /* We will account for page header size below */
    2974                 :         }
    2975                 :         else
    2976                 :         {
    2977              28 :                 if (!XRecOffIsValid(RecPtr->xrecoff))
    2978               0 :                         ereport(PANIC,
    2979                 :                                         (errmsg("invalid record offset at %X/%X",
    2980                 :                                                         RecPtr->xlogid, RecPtr->xrecoff)));
    2981                 : 
    2982                 :                 /*
    2983                 :                  * Since we are going to a random position in WAL, forget any prior
    2984                 :                  * state about what timeline we were in, and allow it to be any
    2985                 :                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
    2986                 :                  * to go backwards (but we can't reset that variable right here, since
    2987                 :                  * we might not change files at all).
    2988                 :                  */
    2989              28 :                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
    2990              28 :                 randAccess = true;              /* allow curFileTLI to go backwards too */
    2991                 :         }
    2992                 : 
    2993              28 :         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
    2994                 :         {
    2995               0 :                 close(readFile);
    2996               0 :                 readFile = -1;
    2997                 :         }
    2998              28 :         XLByteToSeg(*RecPtr, readId, readSeg);
    2999              28 :         if (readFile < 0)
    3000                 :         {
    3001                 :                 /* Now it's okay to reset curFileTLI if random fetch */
    3002              14 :                 if (randAccess)
    3003              14 :                         curFileTLI = 0;
    3004                 : 
    3005              14 :                 readFile = XLogFileRead(readId, readSeg, emode);
    3006              14 :                 if (readFile < 0)
    3007               0 :                         goto next_record_is_invalid;
    3008                 : 
    3009                 :                 /*
    3010                 :                  * Whenever switching to a new WAL segment, we read the first page of
    3011                 :                  * the file and validate its header, even if that's not where the
    3012                 :                  * target record is.  This is so that we can check the additional
    3013                 :                  * identification info that is present in the first page's "long"
    3014                 :                  * header.
    3015                 :                  */
    3016              14 :                 readOff = 0;
    3017              14 :                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    3018                 :                 {
    3019               0 :                         ereport(emode,
    3020                 :                                         (errcode_for_file_access(),
    3021                 :                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
    3022                 :                                                         readId, readSeg, readOff)));
    3023                 :                         goto next_record_is_invalid;
    3024                 :                 }
    3025              14 :                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
    3026               0 :                         goto next_record_is_invalid;
    3027                 :         }
    3028                 : 
    3029              28 :         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
    3030              28 :         if (readOff != targetPageOff)
    3031                 :         {
    3032              12 :                 readOff = targetPageOff;
    3033              12 :                 if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
    3034                 :                 {
    3035               0 :                         ereport(emode,
    3036                 :                                         (errcode_for_file_access(),
    3037                 :                                          errmsg("could not seek in log file %u, segment %u to offset %u: %m",
    3038                 :                                                         readId, readSeg, readOff)));
    3039                 :                         goto next_record_is_invalid;
    3040                 :                 }
    3041              12 :                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    3042                 :                 {
    3043               0 :                         ereport(emode,
    3044                 :                                         (errcode_for_file_access(),
    3045                 :                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
    3046                 :                                                         readId, readSeg, readOff)));
    3047                 :                         goto next_record_is_invalid;
    3048                 :                 }
    3049              12 :                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
    3050               0 :                         goto next_record_is_invalid;
    3051                 :         }
    3052              28 :         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
    3053              28 :         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
    3054              28 :         if (targetRecOff == 0)
    3055                 :         {
    3056                 :                 /*
    3057                 :                  * Can only get here in the continuing-from-prev-page case, because
    3058                 :                  * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
    3059                 :                  * to skip over the new page's header.
    3060                 :                  */
    3061               0 :                 tmpRecPtr.xrecoff += pageHeaderSize;
    3062               0 :                 targetRecOff = pageHeaderSize;
    3063                 :         }
    3064              28 :         else if (targetRecOff < pageHeaderSize)
    3065                 :         {
    3066               0 :                 ereport(emode,
    3067                 :                                 (errmsg("invalid record offset at %X/%X",
    3068                 :                                                 RecPtr->xlogid, RecPtr->xrecoff)));
    3069                 :                 goto next_record_is_invalid;
    3070                 :         }
    3071              28 :         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
    3072                 :                 targetRecOff == pageHeaderSize)
    3073                 :         {
    3074               0 :                 ereport(emode,
    3075                 :                                 (errmsg("contrecord is requested by %X/%X",
    3076                 :                                                 RecPtr->xlogid, RecPtr->xrecoff)));
    3077                 :                 goto next_record_is_invalid;
    3078                 :         }
    3079              28 :         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
    3080                 : 
    3081              28 : got_record:;
    3082                 : 
    3083                 :         /*
    3084                 :          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
    3085                 :          * required.
    3086                 :          */
    3087              28 :         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
    3088                 :         {
    3089               0 :                 if (record->xl_len != 0)
    3090                 :                 {
    3091               0 :                         ereport(emode,
    3092                 :                                         (errmsg("invalid xlog switch record at %X/%X",
    3093                 :                                                         RecPtr->xlogid, RecPtr->xrecoff)));
    3094                 :                         goto next_record_is_invalid;
    3095                 :                 }
    3096                 :         }
    3097              28 :         else if (record->xl_len == 0)
    3098                 :         {
    3099               0 :                 ereport(emode,
    3100                 :                                 (errmsg("record with zero length at %X/%X",
    3101                 :                                                 RecPtr->xlogid, RecPtr->xrecoff)));
    3102                 :                 goto next_record_is_invalid;
    3103                 :         }
    3104              28 :         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
    3105                 :                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
    3106                 :                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
    3107                 :         {
    3108               0 :                 ereport(emode,
    3109                 :                                 (errmsg("invalid record length at %X/%X",
    3110                 :                                                 RecPtr->xlogid, RecPtr->xrecoff)));
    3111                 :                 goto next_record_is_invalid;
    3112                 :         }
    3113              28 :         if (record->xl_rmid > RM_MAX_ID)
    3114                 :         {
    3115               0 :                 ereport(emode,
    3116                 :                                 (errmsg("invalid resource manager ID %u at %X/%X",
    3117                 :                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
    3118                 :                 goto next_record_is_invalid;
    3119                 :         }
    3120              28 :         if (randAccess)
    3121                 :         {
    3122                 :                 /*
    3123                 :                  * We can't exactly verify the prev-link, but surely it should be less
    3124                 :                  * than the record's own address.
    3125                 :                  */
    3126              28 :                 if (!XLByteLT(record->xl_prev, *RecPtr))
    3127                 :                 {
    3128               0 :                         ereport(emode,
    3129                 :                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
    3130                 :                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
    3131                 :                                                         RecPtr->xlogid, RecPtr->xrecoff)));
    3132                 :                         goto next_record_is_invalid;
    3133                 :                 }
    3134                 :         }
    3135                 :         else
    3136                 :         {
    3137                 :                 /*
    3138                 :                  * Record's prev-link should exactly match our previous location. This
    3139                 :                  * check guards against torn WAL pages where a stale but valid-looking
    3140                 :                  * WAL record starts on a sector boundary.
    3141                 :                  */
    3142               0 :                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
    3143                 :                 {
    3144               0 :                         ereport(emode,
    3145                 :                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
    3146                 :                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
    3147                 :                                                         RecPtr->xlogid, RecPtr->xrecoff)));
    3148                 :                         goto next_record_is_invalid;
    3149                 :                 }
    3150                 :         }
    3151                 : 
    3152                 :         /*
    3153                 :          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
    3154                 :          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
    3155                 :          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
    3156                 :          * enough for all "normal" records, but very large commit or abort records
    3157                 :          * might need more space.)
    3158                 :          */
    3159              28 :         total_len = record->xl_tot_len;
    3160              28 :         if (total_len > readRecordBufSize)
    3161                 :         {
    3162              14 :                 uint32          newSize = total_len;
    3163                 : 
    3164              14 :                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
    3165              14 :                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
    3166              14 :                 if (readRecordBuf)
    3167               0 :                         free(readRecordBuf);
    3168              14 :                 readRecordBuf = (char *) malloc(newSize);
    3169              14 :                 if (!readRecordBuf)
    3170                 :                 {
    3171               0 :                         readRecordBufSize = 0;
    3172                 :                         /* We treat this as a "bogus data" condition */
    3173               0 :                         ereport(emode,
    3174                 :                                         (errmsg("record length %u at %X/%X too long",
    3175                 :                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
    3176                 :                         goto next_record_is_invalid;
    3177                 :                 }
    3178              14 :                 readRecordBufSize = newSize;
    3179                 :         }
    3180                 : 
    3181              28 :         buffer = readRecordBuf;
    3182              28 :         nextRecord = NULL;
    3183              28 :         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
    3184              28 :         if (total_len > len)
    3185                 :         {
    3186                 :                 /* Need to reassemble record */
    3187                 :                 XLogContRecord *contrecord;
    3188               0 :                 uint32          gotlen = len;
    3189                 : 
    3190               0 :                 memcpy(buffer, record, len);
    3191               0 :                 record = (XLogRecord *) buffer;
    3192               0 :                 buffer += len;
    3193                 :                 for (;;)
    3194                 :                 {
    3195               0 :                         readOff += XLOG_BLCKSZ;
    3196               0 :                         if (readOff >= XLogSegSize)
    3197                 :                         {
    3198               0 :                                 close(readFile);
    3199               0 :                                 readFile = -1;
    3200               0 :                                 NextLogSeg(readId, readSeg);
    3201               0 :                                 readFile = XLogFileRead(readId, readSeg, emode);
    3202               0 :                                 if (readFile < 0)
    3203               0 :                                         goto next_record_is_invalid;
    3204               0 :                                 readOff = 0;
    3205                 :                         }
    3206               0 :                         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    3207                 :                         {
    3208               0 :                                 ereport(emode,
    3209                 :                                                 (errcode_for_file_access(),
    3210                 :                                                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
    3211                 :                                                                 readId, readSeg, readOff)));
    3212                 :                                 goto next_record_is_invalid;
    3213                 :                         }
    3214               0 :                         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
    3215               0 :                                 goto next_record_is_invalid;
    3216               0 :                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
    3217                 :                         {
    3218               0 :                                 ereport(emode,
    3219                 :                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
    3220                 :                                                                 readId, readSeg, readOff)));
    3221                 :                                 goto next_record_is_invalid;
    3222                 :                         }
    3223               0 :                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
    3224               0 :                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
    3225               0 :                         if (contrecord->xl_rem_len == 0 ||
    3226                 :                                 total_len != (contrecord->xl_rem_len + gotlen))
    3227                 :                         {
    3228               0 :                                 ereport(emode,
    3229                 :                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
    3230                 :                                                                 contrecord->xl_rem_len,
    3231                 :                                                                 readId, readSeg, readOff)));
    3232                 :                                 goto next_record_is_invalid;
    3233                 :                         }
    3234               0 :                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
    3235               0 :                         if (contrecord->xl_rem_len > len)
    3236                 :                         {
    3237               0 :                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
    3238               0 :                                 gotlen += len;
    3239               0 :                                 buffer += len;
    3240               0 :                                 continue;
    3241                 :                         }
    3242               0 :                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
    3243                 :                                    contrecord->xl_rem_len);
    3244                 :                         break;
    3245                 :                 }
    3246               0 :                 if (!RecordIsValid(record, *RecPtr, emode))
    3247               0 :                         goto next_record_is_invalid;
    3248               0 :                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
    3249               0 :                 if (XLOG_BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
    3250                 :                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
    3251                 :                 {
    3252               0 :                         nextRecord = (XLogRecord *) ((char *) contrecord +
    3253                 :                                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
    3254                 :                 }
    3255               0 :                 EndRecPtr.xlogid = readId;
    3256               0 :                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
    3257                 :                         pageHeaderSize +
    3258                 :                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
    3259               0 :                 ReadRecPtr = *RecPtr;
    3260                 :                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
    3261               0 :                 return record;
    3262                 :         }
    3263                 : 
    3264                 :         /* Record does not cross a page boundary */
    3265              28 :         if (!RecordIsValid(record, *RecPtr, emode))
    3266               0 :                 goto next_record_is_invalid;
    3267              28 :         if (XLOG_BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % XLOG_BLCKSZ +
    3268                 :                 MAXALIGN(total_len))
    3269              28 :                 nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
    3270              28 :         EndRecPtr.xlogid = RecPtr->xlogid;
    3271              28 :         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
    3272              28 :         ReadRecPtr = *RecPtr;
    3273              28 :         memcpy(buffer, record, total_len);
    3274                 : 
    3275                 :         /*
    3276                 :          * Special processing if it's an XLOG SWITCH record
    3277                 :          */
    3278              28 :         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
    3279                 :         {
    3280                 :                 /* Pretend it extends to end of segment */
    3281               0 :                 EndRecPtr.xrecoff += XLogSegSize - 1;
    3282               0 :                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
    3283               0 :                 nextRecord = NULL;              /* definitely not on same page */
    3284                 : 
    3285                 :                 /*
    3286                 :                  * Pretend that readBuf contains the last page of the segment. This is
    3287                 :                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
    3288                 :                  * segment.
    3289                 :                  */
    3290               0 :                 readOff = XLogSegSize - XLOG_BLCKSZ;
    3291                 :         }
    3292              28 :         return (XLogRecord *) buffer;
    3293                 : 
    3294               0 : next_record_is_invalid:;
    3295               0 :         close(readFile);
    3296               0 :         readFile = -1;
    3297               0 :         nextRecord = NULL;
    3298               0 :         return NULL;
    3299                 : }
    3300                 : 
    3301                 : /*
    3302                 :  * Check whether the xlog header of a page just read in looks valid.
    3303                 :  *
    3304                 :  * This is just a convenience subroutine to avoid duplicated code in
    3305                 :  * ReadRecord.  It's not intended for use from anywhere else.
                         :  *
                         :  *      hdr: header of the page to check; it is examined as a long header
                         :  *              when its XLP_LONG_HEADER flag is set
                         :  *      emode: elevel handed to ereport() for every failure message
                         :  *
                         :  * Returns true if every check passes.  Otherwise the first failing check
                         :  * is reported through ereport() and false is returned (assuming emode is
                         :  * a level at which ereport returns to the caller).
                         :  *
                         :  * The location the page is expected to describe is not passed in; it is
                         :  * taken from readId, readSeg and readOff.  On success lastPageTLI is set
                         :  * to this page's timeline ID.
    3306                 :  */
    3307                 : static bool
    3308                 : ValidXLOGHeader(XLogPageHeader hdr, int emode)
    3309              26 : {
    3310                 :         XLogRecPtr      recaddr;
    3311                 : 
    3312              26 :         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
    3313                 :         {
    3314               0 :                 ereport(emode,
    3315                 :                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
    3316                 :                                                 hdr->xlp_magic, readId, readSeg, readOff)));
    3317               0 :                 return false;
    3318                 :         }
    3319              26 :         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)      /* any bit outside XLP_ALL_FLAGS set? */
    3320                 :         {
    3321               0 :                 ereport(emode,
    3322                 :                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
    3323                 :                                                 hdr->xlp_info, readId, readSeg, readOff)));
    3324               0 :                 return false;
    3325                 :         }
    3326              26 :         if (hdr->xlp_info & XLP_LONG_HEADER)
    3327                 :         {
    3328              14 :                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
    3329                 : 
    3330              14 :                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
    3331                 :                 {
    3332                 :                         char            fhdrident_str[32];
    3333                 :                         char            sysident_str[32];
    3334                 : 
    3335                 :                         /*
    3336                 :                          * Format sysids separately to keep platform-dependent format code
    3337                 :                          * out of the translatable message string.
    3338                 :                          */
    3339               0 :                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
    3340                 :                                          longhdr->xlp_sysid);
    3341               0 :                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
    3342                 :                                          ControlFile->system_identifier);
    3343               0 :                         ereport(emode,
    3344                 :                                         (errmsg("WAL file is from different system"),
    3345                 :                                          errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
    3346                 :                                                            fhdrident_str, sysident_str)));
    3347               0 :                         return false;
    3348                 :                 }
    3349              14 :                 if (longhdr->xlp_seg_size != XLogSegSize)
    3350                 :                 {
    3351               0 :                         ereport(emode,
    3352                 :                                         (errmsg("WAL file is from different system"),
    3353                 :                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
    3354               0 :                         return false;
    3355                 :                 }
    3356              14 :                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
    3357                 :                 {
    3358               0 :                         ereport(emode,
    3359                 :                                         (errmsg("WAL file is from different system"),
    3360                 :                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
    3361               0 :                         return false;
    3362                 :                 }
    3363                 :         }
    3364              12 :         else if (readOff == 0)
    3365                 :         {
    3366                 :                 /* hmm, first page of file doesn't have a long header?
                         :                  * (reported with the same message text as the info-bits
                         :                  * check above) */
    3367               0 :                 ereport(emode,
    3368                 :                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
    3369                 :                                                 hdr->xlp_info, readId, readSeg, readOff)));
    3370               0 :                 return false;
    3371                 :         }
    3372                 : 
    3373              26 :         recaddr.xlogid = readId;
    3374              26 :         recaddr.xrecoff = readSeg * XLogSegSize + readOff;      /* address implied by the current read position */
    3375              26 :         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
    3376                 :         {
    3377               0 :                 ereport(emode,
    3378                 :                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
    3379                 :                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
    3380                 :                                                 readId, readSeg, readOff)));
    3381               0 :                 return false;
    3382                 :         }
    3383                 : 
    3384                 :         /*
    3385                 :          * Check page TLI is one of the expected values.
    3386                 :          */
    3387              26 :         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
    3388                 :         {
    3389               0 :                 ereport(emode,
    3390                 :                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
    3391                 :                                                 hdr->xlp_tli,
    3392                 :                                                 readId, readSeg, readOff)));
    3393               0 :                 return false;
    3394                 :         }
    3395                 : 
    3396                 :         /*
    3397                 :          * Since child timelines are always assigned a TLI greater than their
    3398                 :          * immediate parent's TLI, we should never see TLI go backwards across
    3399                 :          * successive pages of a consistent WAL sequence.
    3400                 :          *
    3401                 :          * Of course this check should only be applied when advancing sequentially
    3402                 :          * across pages; therefore ReadRecord resets lastPageTLI to zero when
    3403                 :          * going to a random page.
    3404                 :          */
    3405              26 :         if (hdr->xlp_tli < lastPageTLI)
    3406                 :         {
    3407               0 :                 ereport(emode,
    3408                 :                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
    3409                 :                                                 hdr->xlp_tli, lastPageTLI,
    3410                 :                                                 readId, readSeg, readOff)));
    3411               0 :                 return false;
    3412                 :         }
    3413              26 :         lastPageTLI = hdr->xlp_tli;     /* remembered for the next page's ordering check */
    3414              26 :         return true;
    3415                 : }
    3416                 : 
    3417                 : /*
    3418                 :  * Try to read a timeline's history file.
    3419                 :  *
    3420                 :  * If successful, return the list of component TLIs (the given TLI followed by
    3421                 :  * its ancestor TLIs).  If we can't find the history file, assume that the
    3422                 :  * timeline has no parents, and return a list of just the specified timeline
    3423                 :  * ID.
                         :  *
                         :  * When InArchiveRecovery is set the file is requested through
                         :  * RestoreArchivedFile (its result is not checked here; presumably it
                         :  * fills in the path to open -- confirm against that function); otherwise
                         :  * TLHistoryFilePath supplies the path.  Only an ENOENT failure of the
                         :  * open is treated as a missing file; any other open failure is FATAL.
                         :  *
                         :  * Every line that is neither blank nor starts with '#' must begin with a
                         :  * numeric timeline ID; the remainder of the line is ignored.  The IDs
                         :  * must be strictly increasing from line to line and all be less than
                         :  * targetTLI, otherwise FATAL is reported.  The returned integer list has
                         :  * targetTLI first, followed by the file's IDs from last line to first.
    3424                 :  */
    3425                 : static List *
    3426                 : readTimeLineHistory(TimeLineID targetTLI)
    3427              14 : {
    3428                 :         List       *result;
    3429                 :         char            path[MAXPGPATH];
    3430                 :         char            histfname[MAXFNAMELEN];
    3431                 :         char            fline[MAXPGPATH];
    3432                 :         FILE       *fd;
    3433                 : 
    3434              14 :         if (InArchiveRecovery)
    3435                 :         {
    3436               0 :                 TLHistoryFileName(histfname, targetTLI);
    3437               0 :                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
    3438                 :         }
    3439                 :         else
    3440              14 :                 TLHistoryFilePath(path, targetTLI);
    3441                 : 
    3442              14 :         fd = AllocateFile(path, "r");
    3443              14 :         if (fd == NULL)
    3444                 :         {
    3445              14 :                 if (errno != ENOENT)
    3446               0 :                         ereport(FATAL,
    3447                 :                                         (errcode_for_file_access(),
    3448                 :                                          errmsg("could not open file \"%s\": %m", path)));
    3449                 :                 /* Not there, so assume no parents */
    3450              14 :                 return list_make1_int((int) targetTLI);
    3451                 :         }
    3452                 : 
    3453               0 :         result = NIL;
    3454                 : 
    3455                 :         /*
    3456                 :          * Parse the file...
    3457                 :          */
    3458               0 :         while (fgets(fline, sizeof(fline), fd) != NULL)
    3459                 :         {
    3460                 :                 /* skip leading whitespace and check for # comment */
    3461                 :                 char       *ptr;
    3462                 :                 char       *endptr;
    3463                 :                 TimeLineID      tli;
    3464                 : 
    3465               0 :                 for (ptr = fline; *ptr; ptr++)
    3466                 :                 {
    3467               0 :                         if (!isspace((unsigned char) *ptr))
    3468               0 :                                 break;
    3469                 :                 }
    3470               0 :                 if (*ptr == '\0' || *ptr == '#')        /* blank line or comment line */
    3471               0 :                         continue;
    3472                 : 
    3473                 :                 /* expect a numeric timeline ID as first field of line */
    3474               0 :                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);    /* base 0: decimal, 0x hex or leading-0 octal */
    3475               0 :                 if (endptr == ptr)      /* strtoul consumed nothing */
    3476               0 :                         ereport(FATAL,
    3477                 :                                         (errmsg("syntax error in history file: %s", fline),
    3478                 :                                          errhint("Expected a numeric timeline ID.")));
    3479                 : 
    3480               0 :                 if (result &&
    3481                 :                         tli <= (TimeLineID) linitial_int(result))       /* list head is the ID from the previous line */
    3482               0 :                         ereport(FATAL,
    3483                 :                                         (errmsg("invalid data in history file: %s", fline),
    3484                 :                                    errhint("Timeline IDs must be in increasing sequence.")));
    3485                 : 
    3486                 :                 /* Build list with newest item first */
    3487               0 :                 result = lcons_int((int) tli, result);
    3488                 : 
    3489                 :                 /* we ignore the remainder of each line */
    3490                 :         }
    3491                 : 
    3492               0 :         FreeFile(fd);
    3493                 : 
    3494               0 :         if (result &&
    3495                 :                 targetTLI <= (TimeLineID) linitial_int(result)) /* list head is the last ID read from the file */
    3496               0 :                 ereport(FATAL,
    3497                 :                                 (errmsg("invalid data in history file \"%s\"", path),
    3498                 :                         errhint("Timeline IDs must be less than child timeline's ID.")));
    3499                 : 
    3500               0 :         result = lcons_int((int) targetTLI, result);    /* the requested timeline itself goes first */
    3501                 : 
    3502               0 :         ereport(DEBUG3,
    3503                 :                         (errmsg_internal("history of timeline %u is %s",
    3504                 :                                                          targetTLI, nodeToString(result))));
    3505                 : 
    3506               0 :         return result;
    3507                 : }
    3508                 : 
    3509                 : /*
    3510                 :  * Probe whether a timeline history file exists for the given timeline ID
                         :  *
                         :  * When InArchiveRecovery is set the file is requested through
                         :  * RestoreArchivedFile (its result is not checked here); otherwise
                         :  * TLHistoryFilePath supplies the path.
                         :  *
                         :  * Returns true if the file can be opened for reading and false if the
                         :  * open fails with ENOENT.  Any other open failure is reported as FATAL.
    3511                 :  */
    3512                 : static bool
    3513                 : existsTimeLineHistory(TimeLineID probeTLI)
    3514               0 : {
    3515                 :         char            path[MAXPGPATH];
    3516                 :         char            histfname[MAXFNAMELEN];
    3517                 :         FILE       *fd;
    3518                 : 
    3519               0 :         if (InArchiveRecovery)
    3520                 :         {
    3521               0 :                 TLHistoryFileName(histfname, probeTLI);
    3522               0 :                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
    3523                 :         }
    3524                 :         else
    3525               0 :                 TLHistoryFilePath(path, probeTLI);
    3526                 : 
    3527               0 :         fd = AllocateFile(path, "r");
    3528               0 :         if (fd != NULL)
    3529                 :         {
    3530               0 :                 FreeFile(fd);   /* opened only as an existence test; contents are not read */
    3531               0 :                 return true;
    3532                 :         }
    3533                 :         else
    3534                 :         {
    3535               0 :                 if (errno != ENOENT)    /* anything but a missing file is a hard failure */
    3536               0 :                         ereport(FATAL,
    3537                 :                                         (errcode_for_file_access(),
    3538                 :                                          errmsg("could not open file \"%s\": %m", path)));
    3539               0 :                 return false;
    3540                 :         }
    3541                 : }
    3542                 : 
    3543                 : /*
    3544                 :  * Find the newest existing timeline, assuming that startTLI exists.
    3545                 :  *
    3546                 :  * Note: while this is somewhat heuristic, it does positively guarantee
    3547                 :  * that (result + 1) is not a known timeline, and therefore it should
    3548                 :  * be safe to assign that ID to a new timeline.
                         :  *
                         :  * Candidate IDs are probed one at a time starting at startTLI + 1, and
                         :  * the search stops at the first ID for which existsTimeLineHistory
                         :  * returns false.  If even startTLI + 1 has no history file, startTLI
                         :  * itself is returned.
    3549                 :  */
    3550                 : static TimeLineID
    3551                 : findNewestTimeLine(TimeLineID startTLI)
    3552               0 : {
    3553                 :         TimeLineID      newestTLI;
    3554                 :         TimeLineID      probeTLI;
    3555                 : 
    3556                 :         /*
    3557                 :          * The algorithm is just to probe for the existence of timeline history
    3558                 :          * files.  XXX is it useful to allow gaps in the sequence?
    3559                 :          */
    3560               0 :         newestTLI = startTLI;
    3561                 : 
    3562               0 :         for (probeTLI = startTLI + 1;; probeTLI++)      /* no loop condition: exits only via the break below */
    3563                 :         {
    3564               0 :                 if (existsTimeLineHistory(probeTLI))
    3565                 :                 {
    3566               0 :                         newestTLI = probeTLI;           /* probeTLI exists */
    3567                 :                 }
    3568                 :                 else
    3569                 :                 {
    3570                 :                         /* doesn't exist, assume we're done */
    3571               0 :                         break;
    3572                 :                 }
    3573               0 :         }
    3574                 : 
    3575               0 :         return newestTLI;
    3576                 : }
    3577                 : 
    3578                 : /*
    3579                 :  * Create a new timeline history file.
    3580                 :  *
    3581                 :  *      newTLI: ID of the new timeline
    3582                 :  *      parentTLI: ID of its immediate parent
    3583                 :  *      endTLI et al: ID of the last used WAL file, for annotation purposes
    3584                 :  *
    3585                 :  * Currently this is only used during recovery, and so there are no locking
    3586                 :  * considerations.      But we should be just as tense as XLogFileInit to avoid
    3587                 :  * emplacing a bogus file.
                         :  *
                         :  * The annotation line additionally records recoveryStopAfter,
                         :  * recoveryStopXid and recoveryStopTime, which are not parameters but are
                         :  * read from variables defined outside this function.
                         :  *
                         :  * Sequence: create a temp file named after our PID, copy the parent's
                         :  * history file into it if one can be opened, append one line describing
                         :  * this split, fsync and close the temp file, link (or rename) it to the
                         :  * new timeline's history file name, and finally call XLogArchiveNotify
                         :  * for it.  Every failure along the way is reported at ERROR level; a
                         :  * missing parent history file (ENOENT) is not a failure.
    3588                 :  */
    3589                 : static void
    3590                 : writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
    3591                 :                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
    3592               0 : {
    3593                 :         char            path[MAXPGPATH];
    3594                 :         char            tmppath[MAXPGPATH];
    3595                 :         char            histfname[MAXFNAMELEN];
    3596                 :         char            xlogfname[MAXFNAMELEN];
    3597                 :         char            buffer[BLCKSZ];
    3598                 :         int                     srcfd;
    3599                 :         int                     fd;
    3600                 :         int                     nbytes;
    3601                 : 
    3602                 :         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
    3603                 : 
    3604                 :         /*
    3605                 :          * Write into a temp file name.
    3606                 :          */
    3607               0 :         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3608                 : 
    3609               0 :         unlink(tmppath);        /* drop any leftover so the O_EXCL create below can succeed; result ignored */
    3610                 : 
    3611                 :         /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
    3612               0 :         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
    3613                 :                                            S_IRUSR | S_IWUSR);
    3614               0 :         if (fd < 0)
    3615               0 :                 ereport(ERROR,
    3616                 :                                 (errcode_for_file_access(),
    3617                 :                                  errmsg("could not create file \"%s\": %m", tmppath)));
    3618                 : 
    3619                 :         /*
    3620                 :          * If a history file exists for the parent, copy it verbatim
    3621                 :          */
    3622               0 :         if (InArchiveRecovery)
    3623                 :         {
    3624               0 :                 TLHistoryFileName(histfname, parentTLI);
    3625               0 :                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
    3626                 :         }
    3627                 :         else
    3628               0 :                 TLHistoryFilePath(path, parentTLI);
    3629                 : 
    3630               0 :         srcfd = BasicOpenFile(path, O_RDONLY, 0);
    3631               0 :         if (srcfd < 0)
    3632                 :         {
    3633               0 :                 if (errno != ENOENT)
    3634               0 :                         ereport(ERROR,
    3635                 :                                         (errcode_for_file_access(),
    3636                 :                                          errmsg("could not open file \"%s\": %m", path)));
    3637                 :                 /* Not there, so assume parent has no parents */
    3638                 :         }
    3639                 :         else
    3640                 :         {
    3641                 :                 for (;;)
    3642                 :                 {
    3643               0 :                         errno = 0;      /* cleared so the errno test below only sees this read's result */
    3644               0 :                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
    3645               0 :                         if (nbytes < 0 || errno != 0)
    3646               0 :                                 ereport(ERROR,
    3647                 :                                                 (errcode_for_file_access(),
    3648                 :                                                  errmsg("could not read file \"%s\": %m", path)));
    3649               0 :                         if (nbytes == 0)        /* end of the parent's file */
    3650               0 :                                 break;
    3651               0 :                         errno = 0;      /* lets the failure path tell whether write set errno */
    3652               0 :                         if ((int) write(fd, buffer, nbytes) != nbytes)
    3653                 :                         {
    3654               0 :                                 int                     save_errno = errno;
    3655                 : 
    3656                 :                                 /*
    3657                 :                                  * If we fail to make the file, delete it to release disk
    3658                 :                                  * space
    3659                 :                                  */
    3660               0 :                                 unlink(tmppath);
    3661                 : 
    3662                 :                                 /*
    3663                 :                                  * if write didn't set errno, assume problem is no disk space
    3664                 :                                  */
    3665               0 :                                 errno = save_errno ? save_errno : ENOSPC;
    3666                 : 
    3667               0 :                                 ereport(ERROR,
    3668                 :                                                 (errcode_for_file_access(),
    3669                 :                                          errmsg("could not write to file \"%s\": %m", tmppath)));
    3670                 :                         }
    3671                 :                 }
    3672               0 :                 close(srcfd);
    3673                 :         }
    3674                 : 
    3675                 :         /*
    3676                 :          * Append one line with the details of this timeline split.
    3677                 :          *
    3678                 :          * If we did have a parent file, insert an extra newline just in case the
    3679                 :          * parent file failed to end with one.
    3680                 :          */
    3681               0 :         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
    3682                 : 
    3683               0 :         snprintf(buffer, sizeof(buffer),
    3684                 :                          "%s%u\t%s\t%s transaction %u at %s\n",
    3685                 :                          (srcfd < 0) ? "" : "\n",       /* srcfd < 0 means no parent file was copied */
    3686                 :                          parentTLI,
    3687                 :                          xlogfname,
    3688                 :                          recoveryStopAfter ? "after" : "before",
    3689                 :                          recoveryStopXid,
    3690                 :                          timestamptz_to_str(recoveryStopTime));
    3691                 : 
    3692               0 :         nbytes = strlen(buffer);
    3693               0 :         errno = 0;
    3694               0 :         if ((int) write(fd, buffer, nbytes) != nbytes)
    3695                 :         {
    3696               0 :                 int                     save_errno = errno;
    3697                 : 
    3698                 :                 /*
    3699                 :                  * If we fail to make the file, delete it to release disk space
    3700                 :                  */
    3701               0 :                 unlink(tmppath);
    3702                 :                 /* if write didn't set errno, assume problem is no disk space */
    3703               0 :                 errno = save_errno ? save_errno : ENOSPC;
    3704                 : 
    3705               0 :                 ereport(ERROR,
    3706                 :                                 (errcode_for_file_access(),
    3707                 :                                  errmsg("could not write to file \"%s\": %m", tmppath)));
    3708                 :         }
    3709                 : 
    3710               0 :         if (pg_fsync(fd) != 0)
    3711               0 :                 ereport(ERROR,
    3712                 :                                 (errcode_for_file_access(),
    3713                 :                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3714                 : 
    3715               0 :         if (close(fd))
    3716               0 :                 ereport(ERROR,
    3717                 :                                 (errcode_for_file_access(),
    3718                 :                                  errmsg("could not close file \"%s\": %m", tmppath)));
    3719                 : 
    3720                 : 
    3721                 :         /*
    3722                 :          * Now move the completed history file into place with its final name.
    3723                 :          */
    3724               0 :         TLHistoryFilePath(path, newTLI);        /* path is reused: now the destination name */
    3725                 : 
    3726                 :         /*
    3727                 :          * Prefer link() to rename() here just to be really sure that we don't
    3728                 :          * overwrite an existing logfile.  However, there shouldn't be one, so
    3729                 :          * rename() is an acceptable substitute except for the truly paranoid.
    3730                 :          */
    3731                 : #if HAVE_WORKING_LINK
    3732               0 :         if (link(tmppath, path) < 0)
    3733               0 :                 ereport(ERROR,
    3734                 :                                 (errcode_for_file_access(),
    3735                 :                                  errmsg("could not link file \"%s\" to \"%s\": %m",
    3736                 :                                                 tmppath, path)));
    3737               0 :         unlink(tmppath);        /* remove the temp name; the file remains under its final name */
    3738                 : #else
    3739                 :         if (rename(tmppath, path) < 0)
    3740                 :                 ereport(ERROR,
    3741                 :                                 (errcode_for_file_access(),
    3742                 :                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
    3743                 :                                                 tmppath, path)));
    3744                 : #endif
    3745                 : 
    3746                 :         /* The history file can be archived immediately. */
    3747               0 :         TLHistoryFileName(histfname, newTLI);
    3748               0 :         XLogArchiveNotify(histfname);
    3749               0 : }
    3750                 : 
    3751                 : /*
    3752                 :  * I/O routines for pg_control
    3753                 :  *
    3754                 :  * *ControlFile is a buffer in shared memory that holds an image of the
    3755                 :  * contents of pg_control.      WriteControlFile() initializes pg_control
    3756                 :  * given a preloaded buffer, ReadControlFile() loads the buffer from
    3757                 :  * the pg_control file (during postmaster or standalone-backend startup),
    3758                 :  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
    3759                 :  *
    3760                 :  * For simplicity, WriteControlFile() initializes the fields of pg_control
    3761                 :  * that are related to checking backend/database compatibility, and
    3762                 :  * ReadControlFile() verifies they are correct.  We could split out the
    3763                 :  * I/O and compatibility-check functions, but there seems no need currently.
    3764                 :  */
                           /*
                            * WriteControlFile: create pg_control from the *ControlFile image.
                            *
                            * Stamps the compatibility fields with this build's compile-time constants
                            * and with the process's current LC_COLLATE/LC_CTYPE names, recomputes the
                            * CRC, then creates the file, writes PG_CONTROL_SIZE bytes, fsyncs and
                            * closes it.  Takes no arguments and returns nothing; every failure in
                            * this function is raised at PANIC level.
                            */
    3765                 : static void
    3766                 : WriteControlFile(void)
    3767               1 : {
    3768                 :         int                     fd;
    3769                 :         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
    3770                 :         char       *localeptr;
    3771                 : 
    3772                 :         /*
    3773                 :          * Initialize version and compatibility-check fields
    3774                 :          */
    3775               1 :         ControlFile->pg_control_version = PG_CONTROL_VERSION;
    3776               1 :         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
    3777                 : 
    3778               1 :         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
    3779               1 :         ControlFile->floatFormat = FLOATFORMAT_VALUE;
    3780                 : 
    3781               1 :         ControlFile->blcksz = BLCKSZ;
    3782               1 :         ControlFile->relseg_size = RELSEG_SIZE;
    3783               1 :         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
    3784               1 :         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
    3785                 : 
    3786               1 :         ControlFile->nameDataLen = NAMEDATALEN;
    3787               1 :         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
    3788                 : 
    3789               1 :         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
    3790                 : 
    3791                 : #ifdef HAVE_INT64_TIMESTAMP
    3792                 :         ControlFile->enableIntTimes = TRUE;
    3793                 : #else
    3794               1 :         ControlFile->enableIntTimes = FALSE;
    3795                 : #endif
    3796                 : 
                                   /*
                                    * setlocale() with a NULL locale argument only queries the current
                                    * setting; the names returned are what gets recorded in the file.
                                    */
    3797               1 :         ControlFile->localeBuflen = LOCALE_NAME_BUFLEN;
    3798               1 :         localeptr = setlocale(LC_COLLATE, NULL);
    3799               1 :         if (!localeptr)
    3800               0 :                 ereport(PANIC,
    3801                 :                                 (errmsg("invalid LC_COLLATE setting")));
    3802               1 :         StrNCpy(ControlFile->lc_collate, localeptr, LOCALE_NAME_BUFLEN);
    3803               1 :         localeptr = setlocale(LC_CTYPE, NULL);
    3804               1 :         if (!localeptr)
    3805               0 :                 ereport(PANIC,
    3806                 :                                 (errmsg("invalid LC_CTYPE setting")));
    3807               1 :         StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
    3808                 : 
    3809                 :         /* Contents are protected with a CRC */
                                   /* The CRC covers the bytes that precede the crc field itself. */
    3810               1 :         INIT_CRC32(ControlFile->crc);
    3811               1 :         COMP_CRC32(ControlFile->crc,
    3812                 :                            (char *) ControlFile,
    3813                 :                            offsetof(ControlFileData, crc));
    3814               1 :         FIN_CRC32(ControlFile->crc);
    3815                 : 
    3816                 :         /*
    3817                 :          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
    3818                 :          * excess over sizeof(ControlFileData).  This reduces the odds of
    3819                 :          * premature-EOF errors when reading pg_control.  We'll still fail when we
    3820                 :          * check the contents of the file, but hopefully with a more specific
    3821                 :          * error than "couldn't read pg_control".
    3822                 :          */
    3823                 :         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
    3824                 :                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
    3825                 : 
    3826               1 :         memset(buffer, 0, PG_CONTROL_SIZE);
    3827               1 :         memcpy(buffer, ControlFile, sizeof(ControlFileData));
    3828                 : 
                                   /* O_EXCL: the open fails if a control file is already present. */
    3829               1 :         fd = BasicOpenFile(XLOG_CONTROL_FILE,
    3830                 :                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
    3831                 :                                            S_IRUSR | S_IWUSR);
    3832               1 :         if (fd < 0)
    3833               0 :                 ereport(PANIC,
    3834                 :                                 (errcode_for_file_access(),
    3835                 :                                  errmsg("could not create control file \"%s\": %m",
    3836                 :                                                 XLOG_CONTROL_FILE)));
    3837                 : 
                                   /*
                                    * Clear errno first so that a short write which sets no error code
                                    * can be recognized (and mapped to ENOSPC) below.
                                    */
    3838               1 :         errno = 0;
    3839               1 :         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
    3840                 :         {
    3841                 :                 /* if write didn't set errno, assume problem is no disk space */
    3842               0 :                 if (errno == 0)
    3843               0 :                         errno = ENOSPC;
    3844               0 :                 ereport(PANIC,
    3845                 :                                 (errcode_for_file_access(),
    3846                 :                                  errmsg("could not write to control file: %m")));
    3847                 :         }
    3848                 : 
    3849               1 :         if (pg_fsync(fd) != 0)
    3850               0 :                 ereport(PANIC,
    3851                 :                                 (errcode_for_file_access(),
    3852                 :                                  errmsg("could not fsync control file: %m")));
    3853                 : 
    3854               1 :         if (close(fd))
    3855               0 :                 ereport(PANIC,
    3856                 :                                 (errcode_for_file_access(),
    3857                 :                                  errmsg("could not close control file: %m")));
    3858               1 : }
    3859                 : 
                           /*
                            * ReadControlFile: load pg_control into *ControlFile and validate it.
                            *
                            * Reads sizeof(ControlFileData) bytes from the file, checks the format
                            * version and the CRC, then compares each stored compatibility field with
                            * the value this executable was compiled with.  Finally the stored
                            * LC_COLLATE/LC_CTYPE names are applied with pg_perm_setlocale() and
                            * published as the "lc_collate" and "lc_ctype" configuration options.
                            *
                            * Failure to open or read the file is reported at PANIC; every validation
                            * failure is reported at FATAL.
                            */
    3860                 : static void
    3861                 : ReadControlFile(void)
    3862              27 : {
    3863                 :         pg_crc32        crc;
    3864                 :         int                     fd;
    3865                 : 
    3866                 :         /*
    3867                 :          * Read data...
    3868                 :          */
                                   /*
                                    * NOTE(review): the file is opened O_RDWR although this function only
                                    * applies read() and close() to the descriptor.
                                    */
    3869              27 :         fd = BasicOpenFile(XLOG_CONTROL_FILE,
    3870                 :                                            O_RDWR | PG_BINARY,
    3871                 :                                            S_IRUSR | S_IWUSR);
    3872              27 :         if (fd < 0)
    3873               0 :                 ereport(PANIC,
    3874                 :                                 (errcode_for_file_access(),
    3875                 :                                  errmsg("could not open control file \"%s\": %m",
    3876                 :                                                 XLOG_CONTROL_FILE)));
    3877                 : 
                                   /*
                                    * NOTE(review): a short read (fewer bytes than requested, no error)
                                    * also takes this branch, and read() does not set errno in that case,
                                    * so the %m text may describe an unrelated earlier error.
                                    */
    3878              27 :         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
    3879               0 :                 ereport(PANIC,
    3880                 :                                 (errcode_for_file_access(),
    3881                 :                                  errmsg("could not read from control file: %m")));
    3882                 : 
                                   /* The return value of close() is ignored here. */
    3883              27 :         close(fd);
    3884                 : 
    3885                 :         /*
    3886                 :          * Check for expected pg_control format version.  If this is wrong, the
    3887                 :          * CRC check will likely fail because we'll be checking the wrong number
    3888                 :          * of bytes.  Complaining about wrong version will probably be more
    3889                 :          * enlightening than complaining about wrong CRC.
    3890                 :          */
    3891                 : 
                                   /*
                                    * First test: the stored version differs and is a nonzero multiple of
                                    * 65536.  Presumably that is what a small version number looks like
                                    * when read with the opposite byte order, hence the byte-ordering hint.
                                    */
    3892              27 :         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
    3893               0 :                 ereport(FATAL,
    3894                 :                                 (errmsg("database files are incompatible with server"),
    3895                 :                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
    3896                 :                                                    " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
    3897                 :                                                    ControlFile->pg_control_version, ControlFile->pg_control_version,
    3898                 :                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
    3899                 :                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
    3900                 : 
                                   /* Any other version mismatch gets the plain message. */
    3901              27 :         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
    3902               0 :                 ereport(FATAL,
    3903                 :                                 (errmsg("database files are incompatible with server"),
    3904                 :                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
    3905                 :                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
    3906                 :                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
    3907                 :                                  errhint("It looks like you need to initdb.")));
    3908                 : 
    3909                 :         /* Now check the CRC. */
                                   /* Recomputed over the bytes preceding the crc field, then compared. */
    3910              27 :         INIT_CRC32(crc);
    3911              27 :         COMP_CRC32(crc,
    3912                 :                            (char *) ControlFile,
    3913                 :                            offsetof(ControlFileData, crc));
    3914              27 :         FIN_CRC32(crc);
    3915                 : 
    3916              27 :         if (!EQ_CRC32(crc, ControlFile->crc))
    3917               0 :                 ereport(FATAL,
    3918                 :                                 (errmsg("incorrect checksum in control file")));
    3919                 : 
    3920                 :         /*
    3921                 :          * Do compatibility checking immediately.  We do this here for 2 reasons:
    3922                 :          *
    3923                 :          * (1) if the database isn't compatible with the backend executable, we
    3924                 :          * want to abort before we can possibly do any damage;
    3925                 :          *
    3926                 :          * (2) this code is executed in the postmaster, so the setlocale() will
    3927                 :          * propagate to forked backends, which aren't going to read this file for
    3928                 :          * themselves.  (These locale settings are considered critical
    3929                 :          * compatibility items because they can affect sort order of indexes.)
    3930                 :          */
    3931              27 :         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
    3932               0 :                 ereport(FATAL,
    3933                 :                                 (errmsg("database files are incompatible with server"),
    3934                 :                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
    3935                 :                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
    3936                 :                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
    3937                 :                                  errhint("It looks like you need to initdb.")));
    3938              27 :         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
    3939               0 :                 ereport(FATAL,
    3940                 :                                 (errmsg("database files are incompatible with server"),
    3941                 :                    errdetail("The database cluster was initialized with MAXALIGN %d,"
    3942                 :                                          " but the server was compiled with MAXALIGN %d.",
    3943                 :                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
    3944                 :                                  errhint("It looks like you need to initdb.")));
    3945              27 :         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
    3946               0 :                 ereport(FATAL,
    3947                 :                                 (errmsg("database files are incompatible with server"),
    3948                 :                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
    3949                 :                                  errhint("It looks like you need to initdb.")));
    3950              27 :         if (ControlFile->blcksz != BLCKSZ)
    3951               0 :                 ereport(FATAL,
    3952                 :                                 (errmsg("database files are incompatible with server"),
    3953                 :                          errdetail("The database cluster was initialized with BLCKSZ %d,"
    3954                 :                                            " but the server was compiled with BLCKSZ %d.",
    3955                 :                                            ControlFile->blcksz, BLCKSZ),
    3956                 :                                  errhint("It looks like you need to recompile or initdb.")));
    3957              27 :         if (ControlFile->relseg_size != RELSEG_SIZE)
    3958               0 :                 ereport(FATAL,
    3959                 :                                 (errmsg("database files are incompatible with server"),
    3960                 :                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
    3961                 :                                   " but the server was compiled with RELSEG_SIZE %d.",
    3962                 :                                   ControlFile->relseg_size, RELSEG_SIZE),
    3963                 :                                  errhint("It looks like you need to recompile or initdb.")));
    3964              27 :         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
    3965               0 :                 ereport(FATAL,
    3966                 :                                 (errmsg("database files are incompatible with server"),
    3967                 :                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
    3968                 :                                   " but the server was compiled with XLOG_BLCKSZ %d.",
    3969                 :                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
    3970                 :                                  errhint("It looks like you need to recompile or initdb.")));
    3971              27 :         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
    3972               0 :                 ereport(FATAL,
    3973                 :                                 (errmsg("database files are incompatible with server"),
    3974                 :                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
    3975                 :                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
    3976                 :                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
    3977                 :                                  errhint("It looks like you need to recompile or initdb.")));
    3978              27 :         if (ControlFile->nameDataLen != NAMEDATALEN)
    3979               0 :                 ereport(FATAL,
    3980                 :                                 (errmsg("database files are incompatible with server"),
    3981                 :                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
    3982                 :                                   " but the server was compiled with NAMEDATALEN %d.",
    3983                 :                                   ControlFile->nameDataLen, NAMEDATALEN),
    3984                 :                                  errhint("It looks like you need to recompile or initdb.")));
    3985              27 :         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
    3986               0 :                 ereport(FATAL,
    3987                 :                                 (errmsg("database files are incompatible with server"),
    3988                 :                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
    3989                 :                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
    3990                 :                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
    3991                 :                                  errhint("It looks like you need to recompile or initdb.")));
    3992              27 :         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
    3993               0 :                 ereport(FATAL,
    3994                 :                                 (errmsg("database files are incompatible with server"),
    3995                 :                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
    3996                 :                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
    3997                 :                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
    3998                 :                                  errhint("It looks like you need to recompile or initdb.")));
    3999                 : 
                           /* Only the branch matching this build's timestamp representation is compiled. */
    4000                 : #ifdef HAVE_INT64_TIMESTAMP
    4001                 :         if (ControlFile->enableIntTimes != TRUE)
    4002                 :                 ereport(FATAL,
    4003                 :                                 (errmsg("database files are incompatible with server"),
    4004                 :                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
    4005                 :                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
    4006                 :                                  errhint("It looks like you need to recompile or initdb.")));
    4007                 : #else
    4008              27 :         if (ControlFile->enableIntTimes != FALSE)
    4009               0 :                 ereport(FATAL,
    4010                 :                                 (errmsg("database files are incompatible with server"),
    4011                 :                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
    4012                 :                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
    4013                 :                                  errhint("It looks like you need to recompile or initdb.")));
    4014                 : #endif
    4015                 : 
    4016              27 :         if (ControlFile->localeBuflen != LOCALE_NAME_BUFLEN)
    4017               0 :                 ereport(FATAL,
    4018                 :                                 (errmsg("database files are incompatible with server"),
    4019                 :                                  errdetail("The database cluster was initialized with LOCALE_NAME_BUFLEN %d,"
    4020                 :                                   " but the server was compiled with LOCALE_NAME_BUFLEN %d.",
    4021                 :                                                    ControlFile->localeBuflen, LOCALE_NAME_BUFLEN),
    4022                 :                                  errhint("It looks like you need to recompile or initdb.")));
                                   /*
                                    * Apply the stored locale names to this process; a NULL result is
                                    * treated as "locale not recognized" and is fatal.
                                    */
    4023              27 :         if (pg_perm_setlocale(LC_COLLATE, ControlFile->lc_collate) == NULL)
    4024               0 :                 ereport(FATAL,
    4025                 :                         (errmsg("database files are incompatible with operating system"),
    4026                 :                          errdetail("The database cluster was initialized with LC_COLLATE \"%s\","
    4027                 :                                            " which is not recognized by setlocale().",
    4028                 :                                            ControlFile->lc_collate),
    4029                 :                          errhint("It looks like you need to initdb or install locale support.")));
    4030              27 :         if (pg_perm_setlocale(LC_CTYPE, ControlFile->lc_ctype) == NULL)
    4031               0 :                 ereport(FATAL,
    4032                 :                         (errmsg("database files are incompatible with operating system"),
    4033                 :                 errdetail("The database cluster was initialized with LC_CTYPE \"%s\","
    4034                 :                                   " which is not recognized by setlocale().",
    4035                 :                                   ControlFile->lc_ctype),
    4036                 :                          errhint("It looks like you need to initdb or install locale support.")));
    4037                 : 
    4038                 :         /* Make the fixed locale settings visible as GUC variables, too */
    4039              27 :         SetConfigOption("lc_collate", ControlFile->lc_collate,
    4040                 :                                         PGC_INTERNAL, PGC_S_OVERRIDE);
    4041              27 :         SetConfigOption("lc_ctype", ControlFile->lc_ctype,
    4042                 :                                         PGC_INTERNAL, PGC_S_OVERRIDE);
    4043              27 : }
    4044                 : 
                           /*
                            * UpdateControlFile: rewrite pg_control from the current *ControlFile.
                            *
                            * Recomputes the CRC over the bytes preceding the crc field, opens the
                            * existing file (no O_CREAT is passed), writes sizeof(ControlFileData)
                            * bytes, fsyncs and closes.  Takes no arguments and returns nothing; any
                            * failure is reported at PANIC.
                            */
    4045                 : void
    4046                 : UpdateControlFile(void)
    4047              47 : {
    4048                 :         int                     fd;
    4049                 : 
    4050              47 :         INIT_CRC32(ControlFile->crc);
    4051              47 :         COMP_CRC32(ControlFile->crc,
    4052                 :                            (char *) ControlFile,
    4053                 :                            offsetof(ControlFileData, crc));
    4054              47 :         FIN_CRC32(ControlFile->crc);
    4055                 : 
    4056              47 :         fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4057                 :                                            O_RDWR | PG_BINARY,
    4058                 :                                            S_IRUSR | S_IWUSR);
    4059              47 :         if (fd < 0)
    4060               0 :                 ereport(PANIC,
    4061                 :                                 (errcode_for_file_access(),
    4062                 :                                  errmsg("could not open control file \"%s\": %m",
    4063                 :                                                 XLOG_CONTROL_FILE)));
    4064                 : 
                                   /*
                                    * Only the struct itself is written here; unlike the initial creation
                                    * of the file, no zero padding up to PG_CONTROL_SIZE is emitted.
                                    * errno is cleared first so a short write that sets no error code can
                                    * be mapped to ENOSPC below.
                                    */
    4065              47 :         errno = 0;
    4066              47 :         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
    4067                 :         {
    4068                 :                 /* if write didn't set errno, assume problem is no disk space */
    4069               0 :                 if (errno == 0)
    4070               0 :                         errno = ENOSPC;
    4071               0 :                 ereport(PANIC,
    4072                 :                                 (errcode_for_file_access(),
    4073                 :                                  errmsg("could not write to control file: %m")));
    4074                 :         }
    4075                 : 
    4076              47 :         if (pg_fsync(fd) != 0)
    4077               0 :                 ereport(PANIC,
    4078                 :                                 (errcode_for_file_access(),
    4079                 :                                  errmsg("could not fsync control file: %m")));
    4080                 : 
    4081              47 :         if (close(fd))
    4082               0 :                 ereport(PANIC,
    4083                 :                                 (errcode_for_file_access(),
    4084                 :                                  errmsg("could not close control file: %m")));
    4085              47 : }
    4086                 : 
    4087                 : /*
    4088                 :  * Initialization of shared memory for XLOG
    4089                 :  */
                           /*
                            * XLOGShmemSize: compute the size in bytes of the XLOG control area.
                            *
                            * The result is the sum of: the XLogCtlData struct, one XLogRecPtr per
                            * XLOG buffer (the xlblocks array), ALIGNOF_XLOG_BUFFER bytes of alignment
                            * slack, and XLOGbuffers page buffers of XLOG_BLCKSZ bytes each.  The
                            * arithmetic goes through add_size()/mul_size() (presumably
                            * overflow-checked helpers; they are defined elsewhere).
                            */
    4090                 : Size
    4091                 : XLOGShmemSize(void)
    4092              34 : {
    4093                 :         Size            size;
    4094                 : 
    4095                 :         /* XLogCtl */
    4096              34 :         size = sizeof(XLogCtlData);
    4097                 :         /* xlblocks array */
    4098              34 :         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
    4099                 :         /* extra alignment padding for XLOG I/O buffers */
    4100              34 :         size = add_size(size, ALIGNOF_XLOG_BUFFER);
    4101                 :         /* and the buffers themselves */
    4102              34 :         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
    4103                 : 
    4104                 :         /*
    4105                 :          * Note: we don't count ControlFileData, it comes out of the "slop factor"
    4106                 :          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
    4107                 :          * routine again below to compute the actual allocation size.
    4108                 :          */
    4109                 : 
    4110              34 :         return size;
    4111                 : }
    4112                 : 
                           /*
                            * XLOGShmemInit: obtain and, if needed, initialize the XLOG shared memory.
                            *
                            * Requests the "Control File" and "XLOG Ctl" structures from
                            * ShmemInitStruct().  If either found-flag comes back set, the function
                            * returns without touching the contents.  Otherwise it zeroes XLogCtl,
                            * lays out the xlblocks array and the aligned page buffers in the space
                            * that follows the XLogCtlData struct (same order as XLOGShmemSize() adds
                            * them up), fills in the basic cache bookkeeping, and, unless running in
                            * bootstrap mode, reads and validates pg_control.
                            */
    4113                 : void
    4114                 : XLOGShmemInit(void)
    4115              16 : {
    4116                 :         bool            foundCFile,
    4117                 :                                 foundXLog;
    4118                 :         char       *allocptr;
    4119                 : 
    4120              16 :         ControlFile = (ControlFileData *)
    4121                 :                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
    4122              16 :         XLogCtl = (XLogCtlData *)
    4123                 :                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
    4124                 : 
    4125              16 :         if (foundCFile || foundXLog)
    4126                 :         {
    4127                 :                 /* both should be present or neither */
    4128                 :                 Assert(foundCFile && foundXLog);
    4129                 :                 return;
    4130                 :         }
    4131                 : 
                                   /* Only the struct is cleared here; the trailing areas are cleared below. */
    4132              16 :         memset(XLogCtl, 0, sizeof(XLogCtlData));
    4133                 : 
    4134                 :         /*
    4135                 :          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
    4136                 :          * multiple of the alignment for same, so no extra alignment padding is
    4137                 :          * needed here.
    4138                 :          */
    4139              16 :         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
    4140              16 :         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
    4141              16 :         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
    4142              16 :         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
    4143                 : 
    4144                 :         /*
    4145                 :          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
    4146                 :          */
    4147              16 :         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
    4148              16 :         XLogCtl->pages = allocptr;
    4149              16 :         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
    4150                 : 
    4151                 :         /*
    4152                 :          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
    4153                 :          * in additional info.)
    4154                 :          */
                                   /* Total size of the page buffers in bytes (the '*' is a multiplication). */
    4155              16 :         XLogCtl->XLogCacheByte = (Size) XLOG_BLCKSZ *XLOGbuffers;
    4156                 : 
                                   /* Index of the last buffer; the insert position starts at the first page. */
    4157              16 :         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    4158              16 :         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
    4159              16 :         SpinLockInit(&XLogCtl->info_lck);
    4160                 : 
    4161                 :         /*
    4162                 :          * If we are not in bootstrap mode, pg_control should already exist. Read
    4163                 :          * and validate it immediately (see comments in ReadControlFile() for the
    4164                 :          * reasons why).
    4165                 :          */
    4166              16 :         if (!IsBootstrapProcessingMode())
    4167              13 :                 ReadControlFile();
    4168                 : }
    4169                 : 
    4170                 : /*
    4171                 :  * BootStrapXLOG: must be called exactly ONCE, on system install.  It writes the
    4172                 :  * initial XLOG segment, creates pg_control, and bootstraps CLOG/SUBTRANS/MultiXact.
    4173                 :  */
    4174                 : void
    4175                 : BootStrapXLOG(void)
    4176               1 : {
    4177                 :         CheckPoint      checkPoint;
    4178                 :         char       *buffer;
    4179                 :         XLogPageHeader page;
    4180                 :         XLogLongPageHeader longpage;
    4181                 :         XLogRecord *record;
    4182                 :         bool            use_existent;
    4183                 :         uint64          sysidentifier;
    4184                 :         struct timeval tv;
    4185                 :         pg_crc32        crc;
    4186                 : 
    4187                 :         /*
    4188                 :          * Select a hopefully-unique system identifier code for this installation.
    4189                 :          * We use the result of gettimeofday(), including the fractional seconds
    4190                 :          * field, as being about as unique as we can easily get.  (Think not to
    4191                 :          * use random(), since it hasn't been seeded and there's no portable way
    4192                 :          * to seed it other than the system clock value...)  The upper half of the
    4193                 :          * uint64 value is just the tv_sec part, while the lower half is the XOR
    4194                 :          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
    4195                 :          * unnecessarily if "uint64" is really only 32 bits wide.  A person
    4196                 :          * knowing this encoding can determine the initialization time of the
    4197                 :          * installation, which could perhaps be useful sometimes.  (XOR, unlike OR, keeps tv_usec recoverable.)
    4198                 :          */
    4199               1 :         gettimeofday(&tv, NULL);
    4200               1 :         sysidentifier = ((uint64) tv.tv_sec) << 32;
    4201               1 :         sysidentifier |= (uint32) (tv.tv_sec ^ tv.tv_usec);
    4202                 : 
    4203                 :         /* First timeline ID is always 1 */
    4204               1 :         ThisTimeLineID = 1;
    4205                 : 
    4206                 :         /* page buffer must be aligned suitably for O_DIRECT; over-allocate so the aligned page still fits */
    4207               1 :         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
    4208               1 :         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
    4209               1 :         memset(page, 0, XLOG_BLCKSZ);
    4210                 : 
    4211                 :         /* Set up information for the initial checkpoint record (redo points just past the long page header) */
    4212               1 :         checkPoint.redo.xlogid = 0;
    4213               1 :         checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
    4214               1 :         checkPoint.ThisTimeLineID = ThisTimeLineID;
    4215               1 :         checkPoint.nextXidEpoch = 0;
    4216               1 :         checkPoint.nextXid = FirstNormalTransactionId;
    4217               1 :         checkPoint.nextOid = FirstBootstrapObjectId;
    4218               1 :         checkPoint.nextMulti = FirstMultiXactId;
    4219               1 :         checkPoint.nextMultiOffset = 0;
    4220               1 :         checkPoint.time = (pg_time_t) time(NULL);
    4221                 : 
    4222               1 :         ShmemVariableCache->nextXid = checkPoint.nextXid;
    4223               1 :         ShmemVariableCache->nextOid = checkPoint.nextOid;
    4224               1 :         ShmemVariableCache->oidCount = 0;
    4225               1 :         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    4226                 : 
    4227                 :         /* Set up the XLOG page header (long form, since this is the first page of the segment file) */
    4228               1 :         page->xlp_magic = XLOG_PAGE_MAGIC;
    4229               1 :         page->xlp_info = XLP_LONG_HEADER;
    4230               1 :         page->xlp_tli = ThisTimeLineID;
    4231               1 :         page->xlp_pageaddr.xlogid = 0;
    4232               1 :         page->xlp_pageaddr.xrecoff = 0;
    4233               1 :         longpage = (XLogLongPageHeader) page;
    4234               1 :         longpage->xlp_sysid = sysidentifier;
    4235               1 :         longpage->xlp_seg_size = XLogSegSize;
    4236               1 :         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    4237                 : 
    4238                 :         /* Insert the initial checkpoint record at the offset recorded in checkPoint.redo above */
    4239               1 :         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
    4240               1 :         record->xl_prev.xlogid = 0;
    4241               1 :         record->xl_prev.xrecoff = 0;
    4242               1 :         record->xl_xid = InvalidTransactionId;
    4243               1 :         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
    4244               1 :         record->xl_len = sizeof(checkPoint);
    4245               1 :         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    4246               1 :         record->xl_rmid = RM_XLOG_ID;
    4247               1 :         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
    4248                 : 
    4249               1 :         INIT_CRC32(crc);        /* CRC: payload first, then the header minus its first sizeof(pg_crc32) bytes */
    4250               1 :         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
    4251               1 :         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
    4252                 :                            SizeOfXLogRecord - sizeof(pg_crc32));
    4253               1 :         FIN_CRC32(crc);
    4254               1 :         record->xl_crc = crc;
    4255                 : 
    4256                 :         /* Create first XLOG segment file */
    4257               1 :         use_existent = false;
    4258               1 :         openLogFile = XLogFileInit(0, 0, &use_existent, false);
    4259                 : 
    4260                 :         /* Write the first page with the initial record */
    4261               1 :         errno = 0;
    4262               1 :         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    4263                 :         {
    4264                 :                 /* if write didn't set errno, assume problem is no disk space */
    4265               0 :                 if (errno == 0)
    4266               0 :                         errno = ENOSPC;
    4267               0 :                 ereport(PANIC,
    4268                 :                                 (errcode_for_file_access(),
    4269                 :                           errmsg("could not write bootstrap transaction log file: %m")));
    4270                 :         }
    4271                 : 
    4272               1 :         if (pg_fsync(openLogFile) != 0)
    4273               0 :                 ereport(PANIC,
    4274                 :                                 (errcode_for_file_access(),
    4275                 :                           errmsg("could not fsync bootstrap transaction log file: %m")));
    4276                 : 
    4277               1 :         if (close(openLogFile))
    4278               0 :                 ereport(PANIC,
    4279                 :                                 (errcode_for_file_access(),
    4280                 :                           errmsg("could not close bootstrap transaction log file: %m")));
    4281                 : 
    4282               1 :         openLogFile = -1;
    4283                 : 
    4284                 :         /* Now create pg_control */
    4285                 : 
    4286               1 :         memset(ControlFile, 0, sizeof(ControlFileData));
    4287                 :         /* Initialize pg_control status fields */
    4288               1 :         ControlFile->system_identifier = sysidentifier;
    4289               1 :         ControlFile->state = DB_SHUTDOWNED;
    4290               1 :         ControlFile->time = checkPoint.time;
    4291               1 :         ControlFile->checkPoint = checkPoint.redo;
    4292               1 :         ControlFile->checkPointCopy = checkPoint;
    4293                 :         /* some additional ControlFile fields are set in WriteControlFile() */
    4294                 : 
    4295               1 :         WriteControlFile();
    4296                 : 
    4297                 :         /* Bootstrap the commit log, too */
    4298               1 :         BootStrapCLOG();
    4299               1 :         BootStrapSUBTRANS();
    4300               1 :         BootStrapMultiXact();
    4301                 : 
    4302               1 :         pfree(buffer);          /* free the original allocation, not the aligned "page" pointer */
    4303               1 : }
    4304                 : /* str_time: format tnow with the strftime-style pattern below, in log_timezone. */
    4305                 : static char *
    4306                 : str_time(pg_time_t tnow)
    4307               1 : {
    4308                 :         static char buf[128];   /* static: stays valid after return, but every call overwrites it */
    4309                 : 
    4310               1 :         pg_strftime(buf, sizeof(buf),
    4311                 :                                 "%Y-%m-%d %H:%M:%S %Z",
    4312                 :                                 pg_localtime(&tnow, log_timezone));
    4313                 : 
    4314               1 :         return buf;             /* caller must not free this, and must copy it to keep it past the next call */
    4315                 : }
    4316                 : 
    4317                 : /*
    4318                 :  * See if there is a recovery command file (recovery.conf), and if so
    4319                 :  * read in parameters for archive recovery.  A missing file just means no
    4320                 :  * archive recovery; bad syntax, unknown names or invalid numbers are FATAL.
    4321                 :  * XXX longer term intention is to expand this to
    4322                 :  * cater for additional parameters and controls
    4323                 :  * possibly use a flex lexer similar to the GUC one
    4324                 :  */
    4325                 : static void
    4326                 : readRecoveryCommandFile(void)
    4327              14 : {
    4328                 :         FILE       *fd;
    4329                 :         char            cmdline[MAXPGPATH];
    4330              14 :         TimeLineID      rtli = 0;
    4331              14 :         bool            rtliGiven = false;
    4332              14 :         bool            syntaxError = false;
    4333                 : 
    4334              14 :         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
    4335              14 :         if (fd == NULL)
    4336                 :         {
    4337              14 :                 if (errno == ENOENT)
    4338              14 :                         return;                         /* not there, so no archive recovery */
    4339               0 :                 ereport(FATAL,
    4340                 :                                 (errcode_for_file_access(),
    4341                 :                                  errmsg("could not open recovery command file \"%s\": %m",
    4342                 :                                                 RECOVERY_COMMAND_FILE)));
    4343                 :         }
    4344                 : 
    4345               0 :         ereport(LOG,
    4346                 :                         (errmsg("starting archive recovery")));
    4347                 : 
    4348                 :         /*
    4349                 :          * Parse the file, one "parameter = 'value'" line at a time...
    4350                 :          */
    4351               0 :         while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
    4352                 :         {
    4353                 :                 /* skip leading whitespace and check for # comment */
    4354                 :                 char       *ptr;
    4355                 :                 char       *tok1;
    4356                 :                 char       *tok2;
    4357                 :                 char       *endptr;        /* where strtoul() stopped parsing */
    4358               0 :                 for (ptr = cmdline; *ptr; ptr++)
    4359                 :                 {
    4360               0 :                         if (!isspace((unsigned char) *ptr))
    4361               0 :                                 break;
    4362                 :                 }
    4363               0 :                 if (*ptr == '\0' || *ptr == '#')
    4364               0 :                         continue;
    4365                 : 
    4366                 :                 /* identify the quoted parameter value (strtok overwrites the opening quote with NUL) */
    4367               0 :                 tok1 = strtok(ptr, "'");
    4368               0 :                 if (!tok1)
    4369                 :                 {
    4370               0 :                         syntaxError = true;
    4371               0 :                         break;
    4372                 :                 }
    4373               0 :                 tok2 = strtok(NULL, "'");
    4374               0 :                 if (!tok2)
    4375                 :                 {
    4376               0 :                         syntaxError = true;
    4377               0 :                         break;
    4378                 :                 }
    4379                 :                 /* reparse to get just the parameter name; the text before the value now ends at that NUL */
    4380               0 :                 tok1 = strtok(ptr, " \t=");
    4381               0 :                 if (!tok1)
    4382                 :                 {
    4383               0 :                         syntaxError = true;
    4384               0 :                         break;
    4385                 :                 }
    4386                 : 
    4387               0 :                 if (strcmp(tok1, "restore_command") == 0)
    4388                 :                 {
    4389               0 :                         recoveryRestoreCommand = pstrdup(tok2);
    4390               0 :                         ereport(LOG,
    4391                 :                                         (errmsg("restore_command = '%s'",
    4392                 :                                                         recoveryRestoreCommand)));
    4393                 :                 }
    4394               0 :                 else if (strcmp(tok1, "recovery_target_timeline") == 0)
    4395                 :                 {
    4396               0 :                         rtliGiven = true;
    4397               0 :                         if (strcmp(tok2, "latest") == 0)
    4398               0 :                                 rtli = 0;
    4399                 :                         else
    4400                 :                         {
    4401               0 :                                 errno = 0;      /* errno catches overflow; endptr catches empty or trailing-garbage input */
    4402               0 :                                 rtli = (TimeLineID) strtoul(tok2, &endptr, 0);
    4403               0 :                                 if (errno == EINVAL || errno == ERANGE || endptr == tok2 || *endptr != '\0')
    4404               0 :                                         ereport(FATAL,
    4405                 :                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
    4406                 :                                                                         tok2)));
    4407                 :                         }
    4408               0 :                         if (rtli)
    4409               0 :                                 ereport(LOG,
    4410                 :                                                 (errmsg("recovery_target_timeline = %u", rtli)));
    4411                 :                         else
    4412               0 :                                 ereport(LOG,
    4413                 :                                                 (errmsg("recovery_target_timeline = latest")));
    4414                 :                 }
    4415               0 :                 else if (strcmp(tok1, "recovery_target_xid") == 0)
    4416                 :                 {
    4417               0 :                         errno = 0;      /* errno catches overflow; endptr catches empty or trailing-garbage input */
    4418               0 :                         recoveryTargetXid = (TransactionId) strtoul(tok2, &endptr, 0);
    4419               0 :                         if (errno == EINVAL || errno == ERANGE || endptr == tok2 || *endptr != '\0')
    4420               0 :                                 ereport(FATAL,
    4421                 :                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
    4422                 :                                                  tok2)));
    4423               0 :                         ereport(LOG,
    4424                 :                                         (errmsg("recovery_target_xid = %u",
    4425                 :                                                         recoveryTargetXid)));
    4426               0 :                         recoveryTarget = true;
    4427               0 :                         recoveryTargetExact = true;
    4428                 :                 }
    4429               0 :                 else if (strcmp(tok1, "recovery_target_time") == 0)
    4430                 :                 {
    4431                 :                         /*
    4432                 :                          * if recovery_target_xid specified, then this overrides
    4433                 :                          * recovery_target_time
    4434                 :                          */
    4435               0 :                         if (recoveryTargetExact)
    4436               0 :                                 continue;
    4437               0 :                         recoveryTarget = true;
    4438               0 :                         recoveryTargetExact = false;
    4439                 : 
    4440                 :                         /*
    4441                 :                          * Convert the time string given by the user to TimestampTz form.
    4442                 :                          */
    4443               0 :                         recoveryTargetTime =
    4444                 :                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
    4445                 :                                                                                                                 CStringGetDatum(tok2),
    4446                 :                                                                                                 ObjectIdGetDatum(InvalidOid),
    4447                 :                                                                                                                 Int32GetDatum(-1)));
    4448               0 :                         ereport(LOG,
    4449                 :                                         (errmsg("recovery_target_time = '%s'",
    4450                 :                                                         timestamptz_to_str(recoveryTargetTime))));
    4451                 :                 }
    4452               0 :                 else if (strcmp(tok1, "recovery_target_inclusive") == 0)
    4453                 :                 {
    4454                 :                         /*
    4455                 :                          * does nothing if a recovery_target is not also set
    4456                 :                          */
    4457               0 :                         if (strcmp(tok2, "true") == 0)
    4458               0 :                                 recoveryTargetInclusive = true;
    4459                 :                         else
    4460                 :                         {
    4461               0 :                                 recoveryTargetInclusive = false;
    4462               0 :                                 tok2 = "false";
    4463                 :                         }
    4464               0 :                         ereport(LOG,
    4465                 :                                         (errmsg("recovery_target_inclusive = %s", tok2)));
    4466                 :                 }
    4467               0 :                 else if (strcmp(tok1, "log_restartpoints") == 0)
    4468                 :                 {
    4469                 :                         /*
    4470                 :                          * anything other than the exact string "true" is treated as false
    4471                 :                          */
    4472               0 :                         if (strcmp(tok2, "true") == 0)
    4473               0 :                                 recoveryLogRestartpoints = true;
    4474                 :                         else
    4475                 :                         {
    4476               0 :                                 recoveryLogRestartpoints = false;
    4477               0 :                                 tok2 = "false";
    4478                 :                         }
    4479               0 :                         ereport(LOG,
    4480                 :                                         (errmsg("log_restartpoints = %s", tok2)));
    4481                 :                 }
    4482                 :                 else
    4483               0 :                         ereport(FATAL,
    4484                 :                                         (errmsg("unrecognized recovery parameter \"%s\"",
    4485                 :                                                         tok1)));
    4486                 :         }
    4487                 : 
    4488               0 :         FreeFile(fd);
    4489                 : 
    4490               0 :         if (syntaxError)        /* NOTE: strtok() may already have cut cmdline short at the first quote */
    4491               0 :                 ereport(FATAL,
    4492                 :                                 (errmsg("syntax error in recovery command file: %s",
    4493                 :                                                 cmdline),
    4494                 :                           errhint("Lines should have the format parameter = 'value'.")));
    4495                 : 
    4496                 :         /* Check that required parameters were supplied */
    4497               0 :         if (recoveryRestoreCommand == NULL)
    4498               0 :                 ereport(FATAL,
    4499                 :                                 (errmsg("recovery command file \"%s\" did not specify restore_command",
    4500                 :                                                 RECOVERY_COMMAND_FILE)));
    4501                 : 
    4502                 :         /* Enable fetching from archive recovery area */
    4503               0 :         InArchiveRecovery = true;
    4504                 : 
    4505                 :         /*
    4506                 :          * If user specified recovery_target_timeline, validate it or compute the
    4507                 :          * "latest" value.    We can't do this until after we've gotten the restore
    4508                 :          * command and set InArchiveRecovery, because we need to fetch timeline
    4509                 :          * history files from the archive.
    4510                 :          */
    4511               0 :         if (rtliGiven)
    4512                 :         {
    4513               0 :                 if (rtli)
    4514                 :                 {
    4515                 :                         /* Timeline 1 does not have a history file, all else should */
    4516               0 :                         if (rtli != 1 && !existsTimeLineHistory(rtli))
    4517               0 :                                 ereport(FATAL,
    4518                 :                                                 (errmsg("recovery target timeline %u does not exist",
    4519                 :                                                                 rtli)));
    4520               0 :                         recoveryTargetTLI = rtli;
    4521                 :                 }
    4522                 :                 else
    4523                 :                 {
    4524                 :                         /* We start the "latest" search from pg_control's timeline */
    4525               0 :                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
    4526                 :                 }
    4527                 :         }
    4528                 : }
    4529                 : 
    4530                 : /*
    4531                 :  * Exit archive-recovery state.  endTLI is the timeline of the last segment read; endLogId/endLogSeg identify that segment.
    4532                 :  */
    4533                 : static void
    4534                 : exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
    4535               0 : {
    4536                 :         char            recoveryPath[MAXPGPATH];
    4537                 :         char            xlogpath[MAXPGPATH];
    4538                 : 
    4539                 :         /*
    4540                 :          * We are no longer in archive recovery state.
    4541                 :          */
    4542               0 :         InArchiveRecovery = false;
    4543                 : 
    4544                 :         /*
    4545                 :          * We should have the ending log segment currently open.  Verify, and then
    4546                 :          * close it (to avoid problems on Windows with trying to rename or delete
    4547                 :          * an open file).
    4548                 :          */
    4549                 :         Assert(readFile >= 0);
    4550                 :         Assert(readId == endLogId);
    4551                 :         Assert(readSeg == endLogSeg);
    4552                 : 
    4553               0 :         close(readFile);        /* return value deliberately not checked */
    4554               0 :         readFile = -1;
    4555                 : 
    4556                 :         /*
    4557                 :          * If the segment was fetched from archival storage, we want to replace
    4558                 :          * the existing xlog segment (if any) with the archival version.  This is
    4559                 :          * because whatever is in XLOGDIR is very possibly older than what we have
    4560                 :          * from the archives, since it could have come from restoring a PGDATA
    4561                 :          * backup.      In any case, the archival version certainly is more
    4562                 :          * descriptive of what our current database state is, because that is what
    4563                 :          * we replayed from.
    4564                 :          *
    4565                 :          * Note that if we are establishing a new timeline, ThisTimeLineID is
    4566                 :          * already set to the new value, and so we will create a new file instead
    4567                 :          * of overwriting any existing file.  (This is, in fact, always the case
    4568                 :          * at present.)
    4569                 :          */
    4570               0 :         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
    4571               0 :         XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
    4572                 : 
    4573               0 :         if (restoredFromArchive)
    4574                 :         {
    4575               0 :                 ereport(DEBUG3,
    4576                 :                                 (errmsg_internal("moving last restored xlog to \"%s\"",
    4577                 :                                                                  xlogpath)));
    4578               0 :                 unlink(xlogpath);               /* might or might not exist */
    4579               0 :                 if (rename(recoveryPath, xlogpath) != 0)
    4580               0 :                         ereport(FATAL,
    4581                 :                                         (errcode_for_file_access(),
    4582                 :                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
    4583                 :                                                         recoveryPath, xlogpath)));
    4584                 :                 /* XXX might we need to fix permissions on the file? */
    4585                 :         }
    4586                 :         else
    4587                 :         {
    4588                 :                 /*
    4589                 :                  * If the latest segment is not archival, but there's still a
    4590                 :                  * RECOVERYXLOG lying about, get rid of it.
    4591                 :                  */
    4592               0 :                 unlink(recoveryPath);   /* ignore any error */
    4593                 : 
    4594                 :                 /*
    4595                 :                  * If we are establishing a new timeline, we have to copy data from
    4596                 :                  * the last WAL segment of the old timeline to create a starting WAL
    4597                 :                  * segment for the new timeline.
    4598                 :                  */
    4599               0 :                 if (endTLI != ThisTimeLineID)
    4600               0 :                         XLogFileCopy(endLogId, endLogSeg,
    4601                 :                                                  endTLI, endLogId, endLogSeg);
    4602                 :         }
    4603                 : 
    4604                 :         /*
    4605                 :          * Let's just make real sure there are not .ready or .done flags posted
    4606                 :          * for the new segment.  (xlogpath is reused here; presumably it now holds the bare segment file name.)
    4607                 :          */
    4608               0 :         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
    4609               0 :         XLogArchiveCleanup(xlogpath);
    4610                 : 
    4611                 :         /* Get rid of any remaining recovered timeline-history file, too */
    4612               0 :         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
    4613               0 :         unlink(recoveryPath);           /* ignore any error */
    4614                 : 
    4615                 :         /*
    4616                 :          * Rename the config file out of the way, so that we don't accidentally
    4617                 :          * re-enter archive recovery mode in a subsequent crash.
    4618                 :          */
    4619               0 :         unlink(RECOVERY_COMMAND_DONE);  /* clear any stale target first; result not checked */
    4620               0 :         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
    4621               0 :                 ereport(FATAL,
    4622                 :                                 (errcode_for_file_access(),
    4623                 :                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
    4624                 :                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
    4625                 : 
    4626               0 :         ereport(LOG,
    4627                 :                         (errmsg("archive recovery complete")));
    4628               0 : }
    4629                 : 
    4630                 : /*
    4631                 :  * For point-in-time recovery, this function decides whether we want to
    4632                 :  * stop applying the XLOG at or after the current record.  Only RM_XACT_ID records of type XLOG_XACT_COMMIT or XLOG_XACT_ABORT are candidates; any other record returns FALSE.
    4633                 :  *
    4634                 :  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
    4635                 :  * *includeThis is set TRUE if we should apply this record before stopping; on FALSE return *includeThis is left unmodified.
    4636                 :  * Also, some information is saved in recoveryStopXid et al for use in
    4637                 :  * annotating the new timeline's history file.  recoveryLastXTime is updated for every COMMIT/ABORT record examined, whether or not we stop.
    4638                 :  */
    4639                 : static bool
    4640                 : recoveryStopsHere(XLogRecord *record, bool *includeThis)
    4641               0 : {
    4642                 :         bool            stopsHere;
    4643                 :         uint8           record_info;
    4644                 :         TimestampTz recordXtime;
    4645                 : 
    4646                 :         /* We only consider stopping at COMMIT or ABORT records */
    4647               0 :         if (record->xl_rmid != RM_XACT_ID)
    4648               0 :                 return false;
    4649               0 :         record_info = record->xl_info & ~XLR_INFO_MASK;       /* drop the XLR_INFO_MASK bits before comparing against XLOG_XACT_* */
    4650               0 :         if (record_info == XLOG_XACT_COMMIT)
    4651                 :         {
    4652                 :                 xl_xact_commit *recordXactCommitData;
    4653                 : 
    4654               0 :                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
    4655               0 :                 recordXtime = recordXactCommitData->xact_time;
    4656                 :         }
    4657               0 :         else if (record_info == XLOG_XACT_ABORT)
    4658                 :         {
    4659                 :                 xl_xact_abort *recordXactAbortData;
    4660                 : 
    4661               0 :                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
    4662               0 :                 recordXtime = recordXactAbortData->xact_time;
    4663                 :         }
    4664                 :         else
    4665               0 :                 return false;
    4666                 : 
    4667                 :         /* Remember the most recent COMMIT/ABORT time for logging purposes */
    4668               0 :         recoveryLastXTime = recordXtime;
    4669                 : 
    4670                 :         /* Do we have a PITR target at all? */
    4671               0 :         if (!recoveryTarget)
    4672               0 :                 return false;
    4673                 : 
    4674               0 :         if (recoveryTargetExact)
    4675                 :         {
    4676                 :                 /*
    4677                 :                  * there can be only one transaction end record with this exact
    4678                 :                  * transactionid
    4679                 :                  *
    4680                 :                  * when testing for an xid, we MUST test for equality only, since
    4681                 :                  * transactions are numbered in the order they start, not the order
    4682                 :                  * they complete. A higher numbered xid will complete before the
    4683                 :                  * target xid about 50% of the time...
    4684                 :                  */
    4685               0 :                 stopsHere = (record->xl_xid == recoveryTargetXid);
    4686               0 :                 if (stopsHere)
    4687               0 :                         *includeThis = recoveryTargetInclusive;
    4688                 :         }
    4689                 :         else
    4690                 :         {
    4691                 :                 /*
    4692                 :                  * there can be many transactions that share the same commit time, so
    4693                 :                  * we stop after the last one, if we are inclusive, or stop at the
    4694                 :                  * first one if we are exclusive.  Either way, the record that triggers the stop is itself not applied (*includeThis is set false).
    4695                 :                  */
    4696               0 :                 if (recoveryTargetInclusive)
    4697               0 :                         stopsHere = (recordXtime > recoveryTargetTime);
    4698                 :                 else
    4699               0 :                         stopsHere = (recordXtime >= recoveryTargetTime);
    4700               0 :                 if (stopsHere)
    4701               0 :                         *includeThis = false;
    4702                 :         }
    4703                 : 
    4704               0 :         if (stopsHere)
    4705                 :         {
    4706               0 :                 recoveryStopXid = record->xl_xid;
    4707               0 :                 recoveryStopTime = recordXtime;
    4708               0 :                 recoveryStopAfter = *includeThis;     /* true means this record is applied before stopping */
    4709                 : 
    4710               0 :                 if (record_info == XLOG_XACT_COMMIT)
    4711                 :                 {
    4712               0 :                         if (recoveryStopAfter)
    4713               0 :                                 ereport(LOG,
    4714                 :                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
    4715                 :                                                                 recoveryStopXid,
    4716                 :                                                                 timestamptz_to_str(recoveryStopTime))));
    4717                 :                         else
    4718               0 :                                 ereport(LOG,
    4719                 :                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
    4720                 :                                                                 recoveryStopXid,
    4721                 :                                                                 timestamptz_to_str(recoveryStopTime))));
    4722                 :                 }
    4723                 :                 else
    4724                 :                 {
    4725               0 :                         if (recoveryStopAfter)
    4726               0 :                                 ereport(LOG,
    4727                 :                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
    4728                 :                                                                 recoveryStopXid,
    4729                 :                                                                 timestamptz_to_str(recoveryStopTime))));
    4730                 :                         else
    4731               0 :                                 ereport(LOG,
    4732                 :                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
    4733                 :                                                                 recoveryStopXid,
    4734                 :                                                                 timestamptz_to_str(recoveryStopTime))));
    4735                 :                 }
    4736                 :         }
    4737                 : 
    4738               0 :         return stopsHere;
    4739                 : }
    4740                 : 
    4741                 : /*
    4742                 :  * This must be called ONCE during postmaster or standalone-backend startup
    4743                 :  */
    4744                 : void
    4745                 : StartupXLOG(void)
    4746              14 : {
    4747                 :         XLogCtlInsert *Insert;
    4748                 :         CheckPoint      checkPoint;
    4749                 :         bool            wasShutdown;
    4750              14 :         bool            reachedStopPoint = false;
    4751              14 :         bool            haveBackupLabel = false;
    4752                 :         XLogRecPtr      RecPtr,
    4753                 :                                 LastRec,
    4754                 :                                 checkPointLoc,
    4755                 :                                 minRecoveryLoc,
    4756                 :                                 EndOfLog;
    4757                 :         uint32          endLogId;
    4758                 :         uint32          endLogSeg;
    4759                 :         XLogRecord *record;
    4760                 :         uint32          freespace;
    4761                 :         TransactionId oldestActiveXID;
    4762                 : 
    4763                 :         /*
    4764                 :          * Read control file and check XLOG status looks valid.
    4765                 :          *
    4766                 :          * Note: in most control paths, *ControlFile is already valid and we need
    4767                 :          * not do ReadControlFile() here, but might as well do it to be sure.
    4768                 :          */
    4769              14 :         ReadControlFile();
    4770                 : 
    4771              14 :         if (ControlFile->state < DB_SHUTDOWNED ||
    4772                 :                 ControlFile->state > DB_IN_PRODUCTION ||
    4773                 :                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
    4774               0 :                 ereport(FATAL,
    4775                 :                                 (errmsg("control file contains invalid data")));
    4776                 : 
    4777              14 :         if (ControlFile->state == DB_SHUTDOWNED)
    4778              14 :                 ereport(LOG,
    4779                 :                                 (errmsg("database system was shut down at %s",
    4780                 :                                                 str_time(ControlFile->time))));
    4781               0 :         else if (ControlFile->state == DB_SHUTDOWNING)
    4782               0 :                 ereport(LOG,
    4783                 :                                 (errmsg("database system shutdown was interrupted; last known up at %s",
    4784                 :                                                 str_time(ControlFile->time))));
    4785               0 :         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
    4786               0 :                 ereport(LOG,
    4787                 :                    (errmsg("database system was interrupted while in recovery at %s",
    4788                 :                                    str_time(ControlFile->time)),
    4789                 :                         errhint("This probably means that some data is corrupted and"
    4790                 :                                         " you will have to use the last backup for recovery.")));
    4791               0 :         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
    4792               0 :                 ereport(LOG,
    4793                 :                                 (errmsg("database system was interrupted while in recovery at log time %s",
    4794                 :                                                 str_time(ControlFile->checkPointCopy.time)),
    4795                 :                                  errhint("If this has occurred more than once some data might be corrupted"
    4796                 :                           " and you might need to choose an earlier recovery target.")));
    4797               0 :         else if (ControlFile->state == DB_IN_PRODUCTION)
    4798               0 :                 ereport(LOG,
    4799                 :                           (errmsg("database system was interrupted; last known up at %s",
    4800                 :                                           str_time(ControlFile->time))));
    4801                 : 
    4802                 :         /* This is just to allow attaching to startup process with a debugger */
    4803                 : #ifdef XLOG_REPLAY_DELAY
    4804                 :         if (ControlFile->state != DB_SHUTDOWNED)
    4805                 :                 pg_usleep(60000000L);
    4806                 : #endif
    4807                 : 
    4808                 :         /*
    4809                 :          * Initialize on the assumption we want to recover to the same timeline
    4810                 :          * that's active according to pg_control.
    4811                 :          */
    4812              14 :         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
    4813                 : 
    4814                 :         /*
    4815                 :          * Check for recovery control file, and if so set up state for offline
    4816                 :          * recovery
    4817                 :          */
    4818              14 :         readRecoveryCommandFile();
    4819                 : 
    4820                 :         /* Now we can determine the list of expected TLIs */
    4821              14 :         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
    4822                 : 
    4823                 :         /*
    4824                 :          * If pg_control's timeline is not in expectedTLIs, then we cannot
    4825                 :          * proceed: the backup is not part of the history of the requested
    4826                 :          * timeline.
    4827                 :          */
    4828              14 :         if (!list_member_int(expectedTLIs,
    4829                 :                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
    4830               0 :                 ereport(FATAL,
    4831                 :                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
    4832                 :                                                 recoveryTargetTLI,
    4833                 :                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
    4834                 : 
    4835              14 :         if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
    4836                 :         {
    4837                 :                 /*
    4838                 :                  * When a backup_label file is present, we want to roll forward from
    4839                 :                  * the checkpoint it identifies, rather than using pg_control.
    4840                 :                  */
    4841               0 :                 record = ReadCheckpointRecord(checkPointLoc, 0);
    4842               0 :                 if (record != NULL)
    4843                 :                 {
    4844               0 :                         ereport(DEBUG1,
    4845                 :                                         (errmsg("checkpoint record is at %X/%X",
    4846                 :                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
    4847               0 :                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
    4848                 :                 }
    4849                 :                 else
    4850                 :                 {
    4851               0 :                         ereport(PANIC,
    4852                 :                                         (errmsg("could not locate required checkpoint record"),
    4853                 :                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
    4854                 :                 }
    4855                 :                 /* set flag to delete it later */
    4856               0 :                 haveBackupLabel = true;
    4857                 :         }
    4858                 :         else
    4859                 :         {
    4860                 :                 /*
    4861                 :                  * Get the last valid checkpoint record.  If the latest one according
    4862                 :                  * to pg_control is broken, try the next-to-last one.
    4863                 :                  */
    4864              14 :                 checkPointLoc = ControlFile->checkPoint;
    4865              14 :                 record = ReadCheckpointRecord(checkPointLoc, 1);
    4866              14 :                 if (record != NULL)
    4867                 :                 {
    4868              14 :                         ereport(DEBUG1,
    4869                 :                                         (errmsg("checkpoint record is at %X/%X",
    4870                 :                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
    4871                 :                 }
    4872                 :                 else
    4873                 :                 {
    4874               0 :                         checkPointLoc = ControlFile->prevCheckPoint;
    4875               0 :                         record = ReadCheckpointRecord(checkPointLoc, 2);
    4876               0 :                         if (record != NULL)
    4877                 :                         {
    4878               0 :                                 ereport(LOG,
    4879                 :                                                 (errmsg("using previous checkpoint record at %X/%X",
    4880                 :                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
    4881               0 :                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
    4882                 :                         }
    4883                 :                         else
    4884               0 :                                 ereport(PANIC,
    4885                 :                                          (errmsg("could not locate a valid checkpoint record")));
    4886                 :                 }
    4887                 :         }
    4888                 : 
    4889              14 :         LastRec = RecPtr = checkPointLoc;
    4890              14 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    4891              14 :         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
    4892                 : 
    4893              14 :         ereport(DEBUG1,
    4894                 :                         (errmsg("redo record is at %X/%X; shutdown %s",
    4895                 :                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
    4896                 :                                         wasShutdown ? "TRUE" : "FALSE")));
    4897              14 :         ereport(DEBUG1,
    4898                 :                         (errmsg("next transaction ID: %u/%u; next OID: %u",
    4899                 :                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
    4900                 :                                         checkPoint.nextOid)));
    4901              14 :         ereport(DEBUG1,
    4902                 :                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
    4903                 :                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
    4904              14 :         if (!TransactionIdIsNormal(checkPoint.nextXid))
    4905               0 :                 ereport(PANIC,
    4906                 :                                 (errmsg("invalid next transaction ID")));
    4907                 : 
    4908              14 :         ShmemVariableCache->nextXid = checkPoint.nextXid;
    4909              14 :         ShmemVariableCache->nextOid = checkPoint.nextOid;
    4910              14 :         ShmemVariableCache->oidCount = 0;
    4911              14 :         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    4912                 : 
    4913                 :         /*
    4914                 :          * We must replay WAL entries using the same TimeLineID they were created
    4915                 :          * under, so temporarily adopt the TLI indicated by the checkpoint (see
    4916                 :          * also xlog_redo()).
    4917                 :          */
    4918              14 :         ThisTimeLineID = checkPoint.ThisTimeLineID;
    4919                 : 
    4920              14 :         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    4921                 : 
    4922              14 :         if (XLByteLT(RecPtr, checkPoint.redo))
    4923               0 :                 ereport(PANIC,
    4924                 :                                 (errmsg("invalid redo in checkpoint record")));
    4925                 : 
    4926                 :         /*
    4927                 :          * Check whether we need to force recovery from WAL.  If it appears to
    4928                 :          * have been a clean shutdown and we did not have a recovery.conf file,
    4929                 :          * then assume no recovery needed.
    4930                 :          */
    4931              14 :         if (XLByteLT(checkPoint.redo, RecPtr))
    4932                 :         {
    4933               0 :                 if (wasShutdown)
    4934               0 :                         ereport(PANIC,
    4935                 :                                         (errmsg("invalid redo record in shutdown checkpoint")));
    4936               0 :                 InRecovery = true;
    4937                 :         }
    4938              14 :         else if (ControlFile->state != DB_SHUTDOWNED)
    4939               0 :                 InRecovery = true;
    4940              14 :         else if (InArchiveRecovery)
    4941                 :         {
    4942                 :                 /* force recovery due to presence of recovery.conf */
    4943               0 :                 InRecovery = true;
    4944                 :         }
    4945                 : 
    4946                 :         /* REDO */
    4947              14 :         if (InRecovery)
    4948                 :         {
    4949                 :                 int                     rmid;
    4950                 : 
    4951                 :                 /*
    4952                 :                  * Update pg_control to show that we are recovering and to show the
    4953                 :                  * selected checkpoint as the place we are starting from. We also mark
    4954                 :                  * pg_control with any minimum recovery stop point obtained from a
    4955                 :                  * backup history file.
    4956                 :                  */
    4957               0 :                 if (InArchiveRecovery)
    4958                 :                 {
    4959               0 :                         ereport(LOG,
    4960                 :                                         (errmsg("automatic recovery in progress")));
    4961               0 :                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    4962                 :                 }
    4963                 :                 else
    4964                 :                 {
    4965               0 :                         ereport(LOG,
    4966                 :                                         (errmsg("database system was not properly shut down; "
    4967                 :                                                         "automatic recovery in progress")));
    4968               0 :                         ControlFile->state = DB_IN_CRASH_RECOVERY;
    4969                 :                 }
    4970               0 :                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
    4971               0 :                 ControlFile->checkPoint = checkPointLoc;
    4972               0 :                 ControlFile->checkPointCopy = checkPoint;
    4973               0 :                 if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
    4974               0 :                         ControlFile->minRecoveryPoint = minRecoveryLoc;
    4975               0 :                 ControlFile->time = (pg_time_t) time(NULL);
    4976               0 :                 UpdateControlFile();
    4977                 : 
    4978                 :                 /*
    4979                 :                  * If there was a backup label file, it's done its job and the info
    4980                 :                  * has now been propagated into pg_control.  We must get rid of the
    4981                 :                  * label file so that if we crash during recovery, we'll pick up at
    4982                 :                  * the latest recovery restartpoint instead of going all the way back
    4983                 :                  * to the backup start point.  It seems prudent though to just rename
    4984                 :                  * the file out of the way rather than delete it completely.
    4985                 :                  */
    4986               0 :                 if (haveBackupLabel)
    4987                 :                 {
    4988               0 :                         unlink(BACKUP_LABEL_OLD);
    4989               0 :                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
    4990               0 :                                 ereport(FATAL,
    4991                 :                                                 (errcode_for_file_access(),
    4992                 :                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
    4993                 :                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
    4994                 :                 }
    4995                 : 
    4996                 :                 /* Start up the recovery environment */
    4997               0 :                 XLogInitRelationCache();
    4998                 : 
    4999               0 :                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    5000                 :                 {
    5001               0 :                         if (RmgrTable[rmid].rm_startup != NULL)
    5002               0 :                                 RmgrTable[rmid].rm_startup();
    5003                 :                 }
    5004                 : 
    5005                 :                 /*
    5006                 :                  * Find the first record that logically follows the checkpoint --- it
    5007                 :                  * might physically precede it, though.
    5008                 :                  */
    5009               0 :                 if (XLByteLT(checkPoint.redo, RecPtr))
    5010                 :                 {
    5011                 :                         /* back up to find the record */
    5012               0 :                         record = ReadRecord(&(checkPoint.redo), PANIC);
    5013                 :                 }
    5014                 :                 else
    5015                 :                 {
    5016                 :                         /* just have to read next record after CheckPoint */
    5017               0 :                         record = ReadRecord(NULL, LOG);
    5018                 :                 }
    5019                 : 
    5020               0 :                 if (record != NULL)
    5021                 :                 {
    5022               0 :                         bool            recoveryContinue = true;
    5023               0 :                         bool            recoveryApply = true;
    5024                 :                         ErrorContextCallback errcontext;
    5025                 : 
    5026               0 :                         InRedo = true;
    5027               0 :                         ereport(LOG,
    5028                 :                                         (errmsg("redo starts at %X/%X",
    5029                 :                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
    5030                 : 
    5031                 :                         /*
    5032                 :                          * main redo apply loop
    5033                 :                          */
    5034                 :                         do
    5035                 :                         {
    5036                 : #ifdef WAL_DEBUG
    5037                 :                                 if (XLOG_DEBUG)
    5038                 :                                 {
    5039                 :                                         StringInfoData buf;
    5040                 : 
    5041                 :                                         initStringInfo(&buf);
    5042                 :                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
    5043                 :                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
    5044                 :                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
    5045                 :                                         xlog_outrec(&buf, record);
    5046                 :                                         appendStringInfo(&buf, " - ");
    5047                 :                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
    5048                 :                                                                                                            record->xl_info,
    5049                 :                                                                                                          XLogRecGetData(record));
    5050                 :                                         elog(LOG, "%s", buf.data);
    5051                 :                                         pfree(buf.data);
    5052                 :                                 }
    5053                 : #endif
    5054                 : 
    5055                 :                                 /*
    5056                 :                                  * Have we reached our recovery target?
    5057                 :                                  */
    5058               0 :                                 if (recoveryStopsHere(record, &recoveryApply))
    5059                 :                                 {
    5060               0 :                                         reachedStopPoint = true;        /* see below */
    5061               0 :                                         recoveryContinue = false;
    5062               0 :                                         if (!recoveryApply)
    5063               0 :                                                 break;
    5064                 :                                 }
    5065                 : 
    5066                 :                                 /* Setup error traceback support for ereport() */
    5067               0 :                                 errcontext.callback = rm_redo_error_callback;
    5068               0 :                                 errcontext.arg = (void *) record;
    5069               0 :                                 errcontext.previous = error_context_stack;
    5070               0 :                                 error_context_stack = &errcontext;
    5071                 : 
    5072                 :                                 /* nextXid must be beyond record's xid */
    5073               0 :                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
    5074                 :                                                                                                  ShmemVariableCache->nextXid))
    5075                 :                                 {
    5076               0 :                                         ShmemVariableCache->nextXid = record->xl_xid;
    5077               0 :                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
    5078                 :                                 }
    5079                 : 
    5080               0 :                                 if (record->xl_info & XLR_BKP_BLOCK_MASK)
    5081               0 :                                         RestoreBkpBlocks(record, EndRecPtr);
    5082                 : 
    5083               0 :                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
    5084                 : 
    5085                 :                                 /* Pop the error context stack */
    5086               0 :                                 error_context_stack = errcontext.previous;
    5087                 : 
    5088               0 :                                 LastRec = ReadRecPtr;
    5089                 : 
    5090               0 :                                 record = ReadRecord(NULL, LOG);
    5091               0 :                         } while (record != NULL && recoveryContinue);
    5092                 : 
    5093                 :                         /*
    5094                 :                          * end of main redo apply loop
    5095                 :                          */
    5096                 : 
    5097               0 :                         ereport(LOG,
    5098                 :                                         (errmsg("redo done at %X/%X",
    5099                 :                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
    5100               0 :                         if (recoveryLastXTime)
    5101               0 :                                 ereport(LOG,
    5102                 :                                          (errmsg("last completed transaction was at log time %s",
    5103                 :                                                          timestamptz_to_str(recoveryLastXTime))));
    5104               0 :                         InRedo = false;
    5105                 :                 }
    5106                 :                 else
    5107                 :                 {
    5108                 :                         /* there are no WAL records following the checkpoint */
    5109               0 :                         ereport(LOG,
    5110                 :                                         (errmsg("redo is not required")));
    5111                 :                 }
    5112                 :         }
    5113                 : 
    5114                 :         /*
    5115                 :          * Re-fetch the last valid or last applied record, so we can identify the
    5116                 :          * exact endpoint of what we consider the valid portion of WAL.
    5117                 :          */
    5118              14 :         record = ReadRecord(&LastRec, PANIC);
    5119              14 :         EndOfLog = EndRecPtr;
    5120              14 :         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
    5121                 : 
    5122                 :         /*
    5123                 :          * Complain if we did not roll forward far enough to render the backup
    5124                 :          * dump consistent.
    5125                 :          */
    5126              14 :         if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
    5127                 :         {
    5128               0 :                 if (reachedStopPoint)   /* stopped because of stop request */
    5129               0 :                         ereport(FATAL,
    5130                 :                                         (errmsg("requested recovery stop point is before end time of backup dump")));
    5131                 :                 else    /* ran off end of WAL */
    5132               0 :                         ereport(FATAL,
    5133                 :                                         (errmsg("WAL ends before end time of backup dump")));
    5134                 :         }
    5135                 : 
    5136                 :         /*
    5137                 :          * Consider whether we need to assign a new timeline ID.
    5138                 :          *
    5139                 :          * If we are doing an archive recovery, we always assign a new ID.      This
    5140                 :          * handles a couple of issues.  If we stopped short of the end of WAL
    5141                 :          * during recovery, then we are clearly generating a new timeline and must
    5142                 :          * assign it a unique new ID.  Even if we ran to the end, modifying the
    5143                 :          * current last segment is problematic because it may result in trying to
    5144                 :          * overwrite an already-archived copy of that segment, and we encourage
    5145                 :          * DBAs to make their archive_commands reject that.  We can dodge the
    5146                 :          * problem by making the new active segment have a new timeline ID.
    5147                 :          *
    5148                 :          * In a normal crash recovery, we can just extend the timeline we were in.
    5149                 :          */
    5150              14 :         if (InArchiveRecovery)
    5151                 :         {
    5152               0 :                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
    5153               0 :                 ereport(LOG,
    5154                 :                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
    5155               0 :                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
    5156                 :                                                          curFileTLI, endLogId, endLogSeg);
    5157                 :         }
    5158                 : 
    5159                 :         /* Save the selected TimeLineID in shared memory, too */
    5160              14 :         XLogCtl->ThisTimeLineID = ThisTimeLineID;
    5161                 : 
    5162                 :         /*
    5163                 :          * We are now done reading the old WAL.  Turn off archive fetching if it
    5164                 :          * was active, and make a writable copy of the last WAL segment. (Note
    5165                 :          * that we also have a copy of the last block of the old WAL in readBuf;
    5166                 :          * we will use that below.)
    5167                 :          */
    5168              14 :         if (InArchiveRecovery)
    5169               0 :                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
    5170                 : 
    5171                 :         /*
    5172                 :          * Prepare to write WAL starting at EndOfLog position, and init xlog
    5173                 :          * buffer cache using the block containing the last record from the
    5174                 :          * previous incarnation.
    5175                 :          */
    5176              14 :         openLogId = endLogId;
    5177              14 :         openLogSeg = endLogSeg;
    5178              14 :         openLogFile = XLogFileOpen(openLogId, openLogSeg);
    5179              14 :         openLogOff = 0;
    5180              14 :         Insert = &XLogCtl->Insert;
    5181              14 :         Insert->PrevRecord = LastRec;
    5182              14 :         XLogCtl->xlblocks[0].xlogid = openLogId;
    5183              14 :         XLogCtl->xlblocks[0].xrecoff =
    5184                 :                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
    5185                 : 
    5186                 :         /*
    5187                 :          * Tricky point here: readBuf contains the *last* block that the LastRec
    5188                 :          * record spans, not the one it starts in.      The last block is indeed the
    5189                 :          * one we want to use.
    5190                 :          */
    5191                 :         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
    5192              14 :         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
    5193              14 :         Insert->currpos = (char *) Insert->currpage +
    5194                 :                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
    5195                 : 
    5196              14 :         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
    5197                 : 
    5198              14 :         XLogCtl->Write.LogwrtResult = LogwrtResult;
    5199              14 :         Insert->LogwrtResult = LogwrtResult;
    5200              14 :         XLogCtl->LogwrtResult = LogwrtResult;
    5201                 : 
    5202              14 :         XLogCtl->LogwrtRqst.Write = EndOfLog;
    5203              14 :         XLogCtl->LogwrtRqst.Flush = EndOfLog;
    5204                 : 
    5205              14 :         freespace = INSERT_FREESPACE(Insert);
    5206              14 :         if (freespace > 0)
    5207                 :         {
    5208                 :                 /* Make sure rest of page is zero */
    5209              14 :                 MemSet(Insert->currpos, 0, freespace);
    5210              14 :                 XLogCtl->Write.curridx = 0;
    5211                 :         }
    5212                 :         else
    5213                 :         {
    5214                 :                 /*
    5215                 :                  * Whenever Write.LogwrtResult points to exactly the end of a page,
    5216                 :                  * Write.curridx must point to the *next* page (see XLogWrite()).
    5217                 :                  *
    5218                 :                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
    5219                 :                  * this is sufficient.  The first actual attempt to insert a log
    5220                 :                  * record will advance the insert state.
    5221                 :                  */
    5222               0 :                 XLogCtl->Write.curridx = NextBufIdx(0);
    5223                 :         }
    5224                 : 
    5225                 :         /* Pre-scan prepared transactions to find out the range of XIDs present */
    5226              14 :         oldestActiveXID = PrescanPreparedTransactions();
    5227                 : 
    5228              14 :         if (InRecovery)
    5229                 :         {
    5230                 :                 int                     rmid;
    5231                 : 
    5232                 :                 /*
    5233                 :                  * Allow resource managers to do any required cleanup.
    5234                 :                  */
    5235               0 :                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    5236                 :                 {
    5237               0 :                         if (RmgrTable[rmid].rm_cleanup != NULL)
    5238               0 :                                 RmgrTable[rmid].rm_cleanup();
    5239                 :                 }
    5240                 : 
    5241                 :                 /*
    5242                 :                  * Check to see if the XLOG sequence contained any unresolved
    5243                 :                  * references to uninitialized pages.
    5244                 :                  */
    5245               0 :                 XLogCheckInvalidPages();
    5246                 : 
    5247                 :                 /*
    5248                 :                  * Reset pgstat data, because it may be invalid after recovery.
    5249                 :                  */
    5250               0 :                 pgstat_reset_all();
    5251                 : 
    5252                 :                 /*
    5253                 :                  * Perform a checkpoint to update all our recovery activity to disk.
    5254                 :                  *
    5255                 :                  * Note that we write a shutdown checkpoint rather than an on-line
    5256                 :                  * one. This is not particularly critical, but since we may be
    5257                 :                  * assigning a new TLI, using a shutdown checkpoint allows us to have
    5258                 :                  * the rule that TLI only changes in shutdown checkpoints, which
    5259                 :                  * allows some extra error checking in xlog_redo.
    5260                 :                  */
    5261               0 :                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    5262                 : 
    5263                 :                 /*
    5264                 :                  * Close down recovery environment
    5265                 :                  */
    5266               0 :                 XLogCloseRelationCache();
    5267                 :         }
    5268                 : 
    5269                 :         /*
    5270                 :          * Preallocate additional log files, if wanted.
    5271                 :          */
    5272              14 :         PreallocXlogFiles(EndOfLog);
    5273                 : 
    5274                 :         /*
    5275                 :          * Okay, we're officially UP.
    5276                 :          */
    5277              14 :         InRecovery = false;
    5278                 : 
    5279              14 :         ControlFile->state = DB_IN_PRODUCTION;
    5280              14 :         ControlFile->time = (pg_time_t) time(NULL);
    5281              14 :         UpdateControlFile();
    5282                 : 
    5283                 :         /* start the archive_timeout timer running */
    5284              14 :         XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
    5285                 : 
    5286                 :         /* initialize shared-memory copy of latest checkpoint XID/epoch */
    5287              14 :         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
    5288              14 :         XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
    5289                 : 
    5290                 :         /* also initialize latestCompletedXid, to nextXid - 1 */
    5291              14 :         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
    5292              20 :         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
    5293                 : 
    5294                 :         /* Start up the commit log and related stuff, too */
    5295              14 :         StartupCLOG();
    5296              14 :         StartupSUBTRANS(oldestActiveXID);
    5297              14 :         StartupMultiXact();
    5298                 : 
    5299                 :         /* Reload shared-memory state for prepared transactions */
    5300              14 :         RecoverPreparedTransactions();
    5301                 : 
    5302                 :         /* Shut down readFile facility, free space */
    5303              14 :         if (readFile >= 0)
    5304                 :         {
    5305              14 :                 close(readFile);
    5306              14 :                 readFile = -1;
    5307                 :         }
    5308              14 :         if (readBuf)
    5309                 :         {
    5310              14 :                 free(readBuf);
    5311              14 :                 readBuf = NULL;
    5312                 :         }
    5313              14 :         if (readRecordBuf)
    5314                 :         {
    5315              14 :                 free(readRecordBuf);
    5316              14 :                 readRecordBuf = NULL;
    5317              14 :                 readRecordBufSize = 0;
    5318                 :         }
    5319              14 : }
    5320                 : 
    5321                 : /*
    5322                 :  * Subroutine to try to fetch and validate a prior checkpoint record at RecPtr.
    5323                 :  * Returns the record, or NULL after reporting (at LOG level) the first check that failed.
    5324                 :  * whichChkpt identifies the checkpoint (merely for reporting purposes).
    5325                 :  * 1 for "primary", 2 for "secondary", 0 (or any other value) for "other" (backup_label)
    5326                 :  */
    5327                 : static XLogRecord *
    5328                 : ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
    5329              14 : {
    5330                 :         XLogRecord *record;
    5331                 : 
    5332              14 :         if (!XRecOffIsValid(RecPtr.xrecoff)) /* bad offset: give up before reading anything */
    5333                 :         {
    5334               0 :                 switch (whichChkpt)
    5335                 :                 {
    5336                 :                         case 1:
    5337               0 :                                 ereport(LOG,
    5338                 :                                 (errmsg("invalid primary checkpoint link in control file")));
    5339                 :                                 break;
    5340                 :                         case 2:
    5341               0 :                                 ereport(LOG,
    5342                 :                                                 (errmsg("invalid secondary checkpoint link in control file")));
    5343                 :                                 break;
    5344                 :                         default:
    5345               0 :                                 ereport(LOG,
    5346                 :                                    (errmsg("invalid checkpoint link in backup_label file")));
    5347                 :                                 break;
    5348                 :                 }
    5349               0 :                 return NULL;
    5350                 :         }
    5351                 : 
    5352              14 :         record = ReadRecord(&RecPtr, LOG); /* may yield NULL; checked immediately below */
    5353                 : 
    5354              14 :         if (record == NULL)
    5355                 :         {
    5356               0 :                 switch (whichChkpt)
    5357                 :                 {
    5358                 :                         case 1:
    5359               0 :                                 ereport(LOG,
    5360                 :                                                 (errmsg("invalid primary checkpoint record")));
    5361                 :                                 break;
    5362                 :                         case 2:
    5363               0 :                                 ereport(LOG,
    5364                 :                                                 (errmsg("invalid secondary checkpoint record")));
    5365                 :                                 break;
    5366                 :                         default:
    5367               0 :                                 ereport(LOG,
    5368                 :                                                 (errmsg("invalid checkpoint record")));
    5369                 :                                 break;
    5370                 :                 }
    5371               0 :                 return NULL;
    5372                 :         }
    5373              14 :         if (record->xl_rmid != RM_XLOG_ID) /* record must carry the XLOG resource manager ID */
    5374                 :         {
    5375               0 :                 switch (whichChkpt)
    5376                 :                 {
    5377                 :                         case 1:
    5378               0 :                                 ereport(LOG,
    5379                 :                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
    5380                 :                                 break;
    5381                 :                         case 2:
    5382               0 :                                 ereport(LOG,
    5383                 :                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
    5384                 :                                 break;
    5385                 :                         default:
    5386               0 :                                 ereport(LOG,
    5387                 :                                 (errmsg("invalid resource manager ID in checkpoint record")));
    5388                 :                                 break;
    5389                 :                 }
    5390               0 :                 return NULL;
    5391                 :         }
    5392              14 :         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
    5393                 :                 record->xl_info != XLOG_CHECKPOINT_ONLINE) /* xl_info must equal one of these two values */
    5394                 :         {
    5395               0 :                 switch (whichChkpt)
    5396                 :                 {
    5397                 :                         case 1:
    5398               0 :                                 ereport(LOG,
    5399                 :                                    (errmsg("invalid xl_info in primary checkpoint record")));
    5400                 :                                 break;
    5401                 :                         case 2:
    5402               0 :                                 ereport(LOG,
    5403                 :                                  (errmsg("invalid xl_info in secondary checkpoint record")));
    5404                 :                                 break;
    5405                 :                         default:
    5406               0 :                                 ereport(LOG,
    5407                 :                                                 (errmsg("invalid xl_info in checkpoint record")));
    5408                 :                                 break;
    5409                 :                 }
    5410               0 :                 return NULL;
    5411                 :         }
    5412              14 :         if (record->xl_len != sizeof(CheckPoint) ||
    5413                 :                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint)) /* header plus exactly one CheckPoint */
    5414                 :         {
    5415               0 :                 switch (whichChkpt)
    5416                 :                 {
    5417                 :                         case 1:
    5418               0 :                                 ereport(LOG,
    5419                 :                                         (errmsg("invalid length of primary checkpoint record")));
    5420                 :                                 break;
    5421                 :                         case 2:
    5422               0 :                                 ereport(LOG,
    5423                 :                                   (errmsg("invalid length of secondary checkpoint record")));
    5424                 :                                 break;
    5425                 :                         default:
    5426               0 :                                 ereport(LOG,
    5427                 :                                                 (errmsg("invalid length of checkpoint record")));
    5428                 :                                 break;
    5429                 :                 }
    5430               0 :                 return NULL;
    5431                 :         }
    5432              14 :         return record; /* every check above passed */
    5433                 : }
    5434                 : 
    5435                 : /*
    5436                 :  * InitXLOGAccess -- initialize this process's local copies of ThisTimeLineID
    5437                 :  * and RedoRecPtr from shared memory (XLogCtl).  This must be called during
    5438                 :  * startup of a backend process, except that it need not be called in a
    5439                 :  * standalone backend (which does StartupXLOG instead).
    5440                 :  *
    5441                 :  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
    5442                 :  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
    5443                 :  * unnecessary, however, since the postmaster itself never touches XLOG anyway.
    5444                 :  */
    5445                 : void
    5446                 : InitXLOGAccess(void)
    5447             139 : {
    5448                 :         /* ThisTimeLineID doesn't change so we need no lock to copy it */
    5449             139 :         ThisTimeLineID = XLogCtl->ThisTimeLineID;
    5450                 :         /* Use GetRedoRecPtr to copy the RedoRecPtr safely; its return value is not needed */
    5451             139 :         (void) GetRedoRecPtr();
    5452             139 : }
    5453                 : 
    5454                 : /*
    5455                 :  * GetRedoRecPtr -- update this process's local RedoRecPtr from
    5456                 :  * XLogCtl->Insert.RedoRecPtr while holding info_lck, and return the new value.
    5457                 :  * A backend must hold the insert lock or info_lck to do so; it happens in XLogInsert() or here.
    5458                 :  */
    5459                 : XLogRecPtr
    5460                 : GetRedoRecPtr(void)
    5461             275 : {
    5462                 :         /* use volatile pointer to prevent code rearrangement */
    5463             275 :         volatile XLogCtlData *xlogctl = XLogCtl;
    5464                 : 
    5465             550 :         SpinLockAcquire(&xlogctl->info_lck);
    5466                 :         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr)); /* local copy is expected not to be ahead of the shared one */
    5467             275 :         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
    5468             275 :         SpinLockRelease(&xlogctl->info_lck);
    5469                 : 
    5470             275 :         return RedoRecPtr;
    5471                 : }
    5472                 : 
    5473                 : /*
    5474                 :  * GetInsertRecPtr -- Returns the current insert position.
    5475                 :  *
    5476                 :  * NOTE: The value *actually* returned is the position of the last full
    5477                 :  * xlog page. It lags behind the real insert position by at most 1 page.
    5478                 :  * In exchange, we don't need to acquire WALInsertLock, which can be quite
    5479                 :  * heavily contended; an approximation is enough for the current
    5480                 :  * usage of this function.
    5481                 :  */
    5482                 : XLogRecPtr
    5483                 : GetInsertRecPtr(void)
    5484               3 : {
    5485                 :         /* use volatile pointer to prevent code rearrangement */
    5486               3 :         volatile XLogCtlData *xlogctl = XLogCtl;
    5487                 :         XLogRecPtr      recptr;
    5488                 : 
    5489               3 :         SpinLockAcquire(&xlogctl->info_lck);
    5490               3 :         recptr = xlogctl->LogwrtRqst.Write; /* the write-request pointer stands in for the insert position */
    5491               3 :         SpinLockRelease(&xlogctl->info_lck);
    5492                 : 
    5493               3 :         return recptr;
    5494                 : }
    5495                 : 
    5496                 : /*
    5497                 :  * Get the time of the last xlog segment switch (XLogCtl->Write.lastSegSwitchTime)
    5498                 :  */
    5499                 : pg_time_t
    5500                 : GetLastSegSwitchTime(void)
    5501               0 : {
    5502                 :         pg_time_t       result;
    5503                 : 
    5504                 :         /* Need WALWriteLock, but we only read the field, so a shared lock is sufficient */
    5505               0 :         LWLockAcquire(WALWriteLock, LW_SHARED);
    5506               0 :         result = XLogCtl->Write.lastSegSwitchTime;
    5507               0 :         LWLockRelease(WALWriteLock);
    5508                 : 
    5509               0 :         return result;
    5510                 : }
    5511                 : 
    5512                 : /*
    5513                 :  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
    5514                 :  * (stored into *xid and *epoch respectively; there is no return value)
    5515                 :  * This is exported for use by code that would like to have 64-bit XIDs.
    5516                 :  * We don't really support such things, but all XIDs within the system
    5517                 :  * can be presumed "close to" the result, and thus the epoch associated
    5518                 :  * with them can be determined.
    5519                 :  */
    5520                 : void
    5521                 : GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
    5522               4 : {
    5523                 :         uint32          ckptXidEpoch;
    5524                 :         TransactionId ckptXid;
    5525                 :         TransactionId nextXid;
    5526                 : 
    5527                 :         /* Must read checkpoint info first (before nextXid), else have race condition */
    5528                 :         {
    5529                 :                 /* use volatile pointer to prevent code rearrangement */
    5530               4 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    5531                 : 
    5532               4 :                 SpinLockAcquire(&xlogctl->info_lck);
    5533               4 :                 ckptXidEpoch = xlogctl->ckptXidEpoch;
    5534               4 :                 ckptXid = xlogctl->ckptXid;
    5535               4 :                 SpinLockRelease(&xlogctl->info_lck);
    5536                 :         }
    5537                 : 
    5538                 :         /* Now fetch current nextXid */
    5539               4 :         nextXid = ReadNewTransactionId();
    5540                 : 
    5541                 :         /*
    5542                 :          * nextXid is certainly logically later than ckptXid.  So if it's
    5543                 :          * numerically less, it must have wrapped into the next epoch.
    5544                 :          */
    5545               4 :         if (nextXid < ckptXid)
    5546               0 :                 ckptXidEpoch++; /* adjusts the local copy only; shared state is not written */
    5547                 : 
    5548               4 :         *xid = nextXid;
    5549               4 :         *epoch = ckptXidEpoch;
    5550               4 : }
    5551                 : 
    5552                 : /*
    5553                 :  * ShutdownXLOG -- this must be called ONCE during postmaster or standalone-backend shutdown
    5554                 :  */
    5555                 : void
    5556                 : ShutdownXLOG(int code, Datum arg) /* neither argument is used in the body */
    5557              13 : {
    5558              13 :         ereport(LOG,
    5559                 :                         (errmsg("shutting down")));
    5560                 : 
    5561              13 :         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); /* shutdown checkpoint comes first */
    5562              13 :         ShutdownCLOG(); /* then the clog, subtrans and multixact modules, in that order */
    5563              13 :         ShutdownSUBTRANS();
    5564              13 :         ShutdownMultiXact();
    5565                 : 
    5566              13 :         ereport(LOG,
    5567                 :                         (errmsg("database system is shut down")));
    5568              13 : }
    5569                 : 
    5570                 : /*
    5571                 :  * Log start of a checkpoint.
    5572                 :  */
    5573                 : static void
    5574                 : LogCheckpointStart(int flags)
    5575               0 : {
    5576               0 :         elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
    5577                 :                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    5578                 :                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
    5579                 :                  (flags & CHECKPOINT_FORCE) ? " force" : "",
    5580                 :                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
    5581                 :                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
    5582                 :                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
    5583               0 : }
    5584                 : 
    5585                 : /*
    5586                 :  * Log end of a checkpoint.
    5587                 :  */
    5588                 : static void
    5589                 : LogCheckpointEnd(void)
    5590               0 : {
    5591                 :         long            write_secs,
    5592                 :                                 sync_secs,
    5593                 :                                 total_secs;
    5594                 :         int                     write_usecs,
    5595                 :                                 sync_usecs,
    5596                 :                                 total_usecs;
    5597                 : 
    5598               0 :         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
    5599                 : 
    5600               0 :         TimestampDifference(CheckpointStats.ckpt_start_t,
    5601                 :                                                 CheckpointStats.ckpt_end_t,
    5602                 :                                                 &total_secs, &total_usecs);
    5603                 : 
    5604               0 :         TimestampDifference(CheckpointStats.ckpt_write_t,
    5605                 :                                                 CheckpointStats.ckpt_sync_t,
    5606                 :                                                 &write_secs, &write_usecs);
    5607                 : 
    5608               0 :         TimestampDifference(CheckpointStats.ckpt_sync_t,
    5609                 :                                                 CheckpointStats.ckpt_sync_end_t,
    5610                 :                                                 &sync_secs, &sync_usecs);
    5611                 : 
    5612               0 :         elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
    5613                 :                  "%d transaction log file(s) added, %d removed, %d recycled; "
    5614                 :                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
    5615                 :                  CheckpointStats.ckpt_bufs_written,
    5616                 :                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    5617                 :                  CheckpointStats.ckpt_segs_added,
    5618                 :                  CheckpointStats.ckpt_segs_removed,
    5619                 :                  CheckpointStats.ckpt_segs_recycled,
    5620                 :                  write_secs, write_usecs / 1000,
    5621                 :                  sync_secs, sync_usecs / 1000,
    5622                 :                  total_secs, total_usecs / 1000);
    5623               0 : }
    5624                 : 
    5625                 : /*
    5626                 :  * Perform a checkpoint --- either during shutdown, or on-the-fly
    5627                 :  *
    5628                 :  * flags is a bitwise OR of the following:
    5629                 :  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
    5630                 :  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
    5631                 :  *              ignoring checkpoint_completion_target parameter.
    5632                 :  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
    5633                 :  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
    5634                 :  *
    5635                 :  * Note: flags contains other bits, of interest here only for logging purposes.
    5636                 :  * In particular note that this routine is synchronous and does not pay
    5637                 :  * attention to CHECKPOINT_WAIT.
    5638                 :  */
    5639                 : void
    5640                 : CreateCheckPoint(int flags)
    5641              19 : {
    5642              19 :         bool            shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0;
    5643                 :         CheckPoint      checkPoint;
    5644                 :         XLogRecPtr      recptr;
    5645              19 :         XLogCtlInsert *Insert = &XLogCtl->Insert;
    5646                 :         XLogRecData rdata;
    5647                 :         uint32          freespace;
    5648                 :         uint32          _logId;
    5649                 :         uint32          _logSeg;
    5650                 :         TransactionId *inCommitXids;
    5651                 :         int                     nInCommit;
    5652                 : 
    5653                 :         /*
    5654                 :          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
    5655                 :          * (This is just pro forma, since in the present system structure there is
    5656                 :          * only one process that is allowed to issue checkpoints at any given
    5657                 :          * time.)
    5658                 :          */
    5659              19 :         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
    5660                 : 
    5661                 :         /*
    5662                 :          * Prepare to accumulate statistics.
    5663                 :          *
    5664                 :          * Note: because it is possible for log_checkpoints to change while a
    5665                 :          * checkpoint proceeds, we always accumulate stats, even if
    5666                 :          * log_checkpoints is currently off.
    5667                 :          */
    5668              19 :         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    5669              19 :         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    5670                 : 
    5671                 :         /*
    5672                 :          * Use a critical section to force system panic if we have trouble.
    5673                 :          */
    5674              19 :         START_CRIT_SECTION();
    5675                 : 
    5676              19 :         if (shutdown)
    5677                 :         {
    5678              14 :                 ControlFile->state = DB_SHUTDOWNING;
    5679              14 :                 ControlFile->time = (pg_time_t) time(NULL);
    5680              14 :                 UpdateControlFile();
    5681                 :         }
    5682                 : 
    5683                 :         /*
    5684                 :          * Let smgr prepare for checkpoint; this has to happen before we determine
    5685                 :          * the REDO pointer.  Note that smgr must not do anything that'd have to
    5686                 :          * be undone if we decide no checkpoint is needed.
    5687                 :          */
    5688              19 :         smgrpreckpt();
    5689                 : 
    5690                 :         /* Begin filling in the checkpoint WAL record */
    5691              19 :         MemSet(&checkPoint, 0, sizeof(checkPoint));
    5692              19 :         checkPoint.ThisTimeLineID = ThisTimeLineID;
    5693              19 :         checkPoint.time = (pg_time_t) time(NULL);
    5694                 : 
    5695                 :         /*
    5696                 :          * We must hold WALInsertLock while examining insert state to determine
    5697                 :          * the checkpoint REDO pointer.
    5698                 :          */
    5699              19 :         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    5700                 : 
    5701                 :         /*
    5702                 :          * If this isn't a shutdown or forced checkpoint, and we have not inserted
    5703                 :          * any XLOG records since the start of the last checkpoint, skip the
    5704                 :          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
    5705                 :          * when the system is idle. That wastes log space, and more importantly it
    5706                 :          * exposes us to possible loss of both current and previous checkpoint
    5707                 :          * records if the machine crashes just as we're writing the update.
    5708                 :          * (Perhaps it'd make even more sense to checkpoint only when the previous
    5709                 :          * checkpoint record is in a different xlog page?)
    5710                 :          *
    5711                 :          * We have to make two tests to determine that nothing has happened since
    5712                 :          * the start of the last checkpoint: current insertion point must match
    5713                 :          * the end of the last checkpoint record, and its redo pointer must point
    5714                 :          * to itself.
    5715                 :          */
    5716              19 :         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
    5717                 :         {
    5718                 :                 XLogRecPtr      curInsert;
    5719                 : 
    5720               0 :                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
    5721               0 :                 if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
    5722                 :                         curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
    5723                 :                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
    5724                 :                         ControlFile->checkPoint.xlogid ==
    5725                 :                         ControlFile->checkPointCopy.redo.xlogid &&
    5726                 :                         ControlFile->checkPoint.xrecoff ==
    5727                 :                         ControlFile->checkPointCopy.redo.xrecoff)
    5728                 :                 {
    5729               0 :                         LWLockRelease(WALInsertLock);
    5730               0 :                         LWLockRelease(CheckpointLock);
    5731               0 :                         END_CRIT_SECTION();
    5732               0 :                         return;
    5733                 :                 }
    5734                 :         }
    5735                 : 
    5736                 :         /*
    5737                 :          * Compute new REDO record ptr = location of next XLOG record.
    5738                 :          *
    5739                 :          * NB: this is NOT necessarily where the checkpoint record itself will be,
    5740                 :          * since other backends may insert more XLOG records while we're off doing
    5741                 :          * the buffer flush work.  Those XLOG records are logically after the
    5742                 :          * checkpoint, even though physically before it.  Got that?
    5743                 :          */
    5744              19 :         freespace = INSERT_FREESPACE(Insert);
    5745              19 :         if (freespace < SizeOfXLogRecord)
    5746                 :         {
    5747               1 :                 (void) AdvanceXLInsertBuffer(false);
    5748                 :                 /* OK to ignore update return flag, since we will do flush anyway */
    5749               1 :                 freespace = INSERT_FREESPACE(Insert);
    5750                 :         }
    5751              19 :         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
    5752                 : 
    5753                 :         /*
    5754                 :          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
    5755                 :          * must be done while holding the insert lock AND the info_lck.
    5756                 :          *
    5757                 :          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
    5758                 :          * pointing past where it really needs to point.  This is okay; the only
    5759                 :          * consequence is that XLogInsert might back up whole buffers that it
    5760                 :          * didn't really need to.  We can't postpone advancing RedoRecPtr because
    5761                 :          * XLogInserts that happen while we are dumping buffers must assume that
    5762                 :          * their buffer changes are not included in the checkpoint.
    5763                 :          */
    5764                 :         {
    5765                 :                 /* use volatile pointer to prevent code rearrangement */
    5766              19 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    5767                 : 
    5768              38 :                 SpinLockAcquire(&xlogctl->info_lck);
    5769              19 :                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
    5770              19 :                 SpinLockRelease(&xlogctl->info_lck);
    5771                 :         }
    5772                 : 
    5773                 :         /*
    5774                 :          * Now we can release WAL insert lock, allowing other xacts to proceed
    5775                 :          * while we are flushing disk buffers.
    5776                 :          */
    5777              19 :         LWLockRelease(WALInsertLock);
    5778                 : 
    5779                 :         /*
    5780                 :          * If enabled, log checkpoint start.  We postpone this until now so as not
    5781                 :          * to log anything if we decided to skip the checkpoint.
    5782                 :          */
    5783              19 :         if (log_checkpoints)
    5784               0 :                 LogCheckpointStart(flags);
    5785                 : 
    5786                 :         /*
    5787                 :          * Before flushing data, we must wait for any transactions that are
    5788                 :          * currently in their commit critical sections.  If an xact inserted its
    5789                 :          * commit record into XLOG just before the REDO point, then a crash
    5790                 :          * restart from the REDO point would not replay that record, which means
    5791                 :          * that our flushing had better include the xact's update of pg_clog.  So
    5792                 :          * we wait till he's out of his commit critical section before proceeding.
    5793                 :          * See notes in RecordTransactionCommit().
    5794                 :          *
    5795                 :          * Because we've already released WALInsertLock, this test is a bit fuzzy:
    5796                 :          * it is possible that we will wait for xacts we didn't really need to
    5797                 :          * wait for.  But the delay should be short and it seems better to make
    5798                 :          * checkpoint take a bit longer than to hold locks longer than necessary.
    5799                 :          * (In fact, the whole reason we have this issue is that xact.c does
    5800                 :          * commit record XLOG insertion and clog update as two separate steps
    5801                 :          * protected by different locks, but again that seems best on grounds of
    5802                 :          * minimizing lock contention.)
    5803                 :          *
    5804                 :          * A transaction that has not yet set inCommit when we look cannot be at
    5805                 :          * risk, since he's not inserted his commit record yet; and one that's
    5806                 :          * already cleared it is not at risk either, since he's done fixing clog
    5807                 :          * and we will correctly flush the update below.  So we cannot miss any
    5808                 :          * xacts we need to wait for.
    5809                 :          */
    5810              19 :         nInCommit = GetTransactionsInCommit(&inCommitXids);
    5811              19 :         if (nInCommit > 0)
    5812                 :         {
    5813                 :                 do
    5814                 :                 {
    5815               0 :                         pg_usleep(10000L);      /* wait for 10 msec */
    5816               0 :                 } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
    5817                 :         }
    5818              19 :         pfree(inCommitXids);
    5819                 : 
    5820                 :         /*
    5821                 :          * Get the other info we need for the checkpoint record.
    5822                 :          */
    5823              19 :         LWLockAcquire(XidGenLock, LW_SHARED);
    5824              19 :         checkPoint.nextXid = ShmemVariableCache->nextXid;
    5825              19 :         LWLockRelease(XidGenLock);
    5826                 : 
    5827                 :         /* Increase XID epoch if we've wrapped around since last checkpoint */
    5828              19 :         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
    5829              19 :         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
    5830               0 :                 checkPoint.nextXidEpoch++;
    5831                 : 
    5832              19 :         LWLockAcquire(OidGenLock, LW_SHARED);
    5833              19 :         checkPoint.nextOid = ShmemVariableCache->nextOid;
    5834              19 :         if (!shutdown)
    5835               5 :                 checkPoint.nextOid += ShmemVariableCache->oidCount;
    5836              19 :         LWLockRelease(OidGenLock);
    5837                 : 
    5838              19 :         MultiXactGetCheckptMulti(shutdown,
    5839                 :                                                          &checkPoint.nextMulti,
    5840                 :                                                          &checkPoint.nextMultiOffset);
    5841                 : 
    5842                 :         /*
    5843                 :          * Having constructed the checkpoint record, ensure all shmem disk buffers
    5844                 :          * and commit-log buffers are flushed to disk.
    5845                 :          *
    5846                 :          * This I/O could fail for various reasons.  If so, we will fail to
    5847                 :          * complete the checkpoint, but there is no reason to force a system
    5848                 :          * panic. Accordingly, exit critical section while doing it.
    5849                 :          */
    5850              19 :         END_CRIT_SECTION();
    5851                 : 
    5852              19 :         CheckPointGuts(checkPoint.redo, flags);
    5853                 : 
    5854              19 :         START_CRIT_SECTION();
    5855                 : 
    5856                 :         /*
    5857                 :          * Now insert the checkpoint record into XLOG.
    5858                 :          */
    5859              19 :         rdata.data = (char *) (&checkPoint);
    5860              19 :         rdata.len = sizeof(checkPoint);
    5861              19 :         rdata.buffer = InvalidBuffer;
    5862              19 :         rdata.next = NULL;
    5863                 : 
    5864              19 :         recptr = XLogInsert(RM_XLOG_ID,
    5865                 :                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
    5866                 :                                                 XLOG_CHECKPOINT_ONLINE,
    5867                 :                                                 &rdata);
    5868                 : 
    5869              19 :         XLogFlush(recptr);
    5870                 : 
    5871                 :         /*
    5872                 :          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
    5873                 :          * = end of actual checkpoint record.
    5874                 :          */
    5875              19 :         if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
    5876               0 :                 ereport(PANIC,
    5877                 :                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
    5878                 : 
    5879                 :         /*
    5880                 :          * Select point at which we can truncate the log, which we base on the
    5881                 :          * prior checkpoint's earliest info.
    5882                 :          */
    5883              19 :         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
    5884                 : 
    5885                 :         /*
    5886                 :          * Update the control file.
    5887                 :          */
    5888              19 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    5889              19 :         if (shutdown)
    5890              14 :                 ControlFile->state = DB_SHUTDOWNED;
    5891              19 :         ControlFile->prevCheckPoint = ControlFile->checkPoint;
    5892              19 :         ControlFile->checkPoint = ProcLastRecPtr;
    5893              19 :         ControlFile->checkPointCopy = checkPoint;
    5894              19 :         ControlFile->time = (pg_time_t) time(NULL);
    5895              19 :         UpdateControlFile();
    5896              19 :         LWLockRelease(ControlFileLock);
    5897                 : 
    5898                 :         /* Update shared-memory copy of checkpoint XID/epoch */
    5899                 :         {
    5900                 :                 /* use volatile pointer to prevent code rearrangement */
    5901              19 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    5902                 : 
    5903              38 :                 SpinLockAcquire(&xlogctl->info_lck);
    5904              19 :                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
    5905              19 :                 xlogctl->ckptXid = checkPoint.nextXid;
    5906              19 :                 SpinLockRelease(&xlogctl->info_lck);
    5907                 :         }
    5908                 : 
    5909                 :         /*
    5910                 :          * We are now done with critical updates; no need for system panic if we
    5911                 :          * have trouble while fooling with old log segments.
    5912                 :          */
    5913              19 :         END_CRIT_SECTION();
    5914                 : 
    5915                 :         /*
    5916                 :          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
    5917                 :          */
    5918              19 :         smgrpostckpt();
    5919                 : 
    5920                 :         /*
    5921                 :          * Delete old log files (those no longer needed even for previous
    5922                 :          * checkpoint).
    5923                 :          */
    5924              19 :         if (_logId || _logSeg)
    5925                 :         {
    5926               2 :                 PrevLogSeg(_logId, _logSeg);
    5927               2 :                 RemoveOldXlogFiles(_logId, _logSeg, recptr);
    5928                 :         }
    5929                 : 
    5930                 :         /*
    5931                 :          * Make more log segments if needed.  (Do this after recycling old log
    5932                 :          * segments, since that may supply some of the needed files.)
    5933                 :          */
    5934              19 :         if (!shutdown)
    5935               5 :                 PreallocXlogFiles(recptr);
    5936                 : 
    5937                 :         /*
    5938                 :          * Truncate pg_subtrans if possible.  We can throw away all data before
    5939                 :          * the oldest XMIN of any running transaction.  No future transaction will
    5940                 :          * attempt to reference any pg_subtrans entry older than that (see Asserts
    5941                 :          * in subtrans.c).      During recovery, though, we mustn't do this because
    5942                 :          * StartupSUBTRANS hasn't been called yet.
    5943                 :          */
    5944              19 :         if (!InRecovery)
    5945              19 :                 TruncateSUBTRANS(GetOldestXmin(true, false));
    5946                 : 
    5947                 :         /* All real work is done, but log before releasing lock. */
    5948              19 :         if (log_checkpoints)
    5949               0 :                 LogCheckpointEnd();
    5950                 : 
    5951              19 :         LWLockRelease(CheckpointLock);
    5952                 : }
    5953                 : 
    5954                 : /*
    5955                 :  * Flush all data in shared memory to disk, and fsync
    5956                 :  *
    5957                 :  * This is the common code shared between regular checkpoints and
    5958                 :  * recovery restartpoints.
    5959                 :  */
    5960                 : static void
    5961                 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
    5962              19 : {
    5963              19 :         CheckPointCLOG();
    5964              19 :         CheckPointSUBTRANS();
    5965              19 :         CheckPointMultiXact();
    5966              19 :         CheckPointBuffers(flags);       /* performs all required fsyncs */
    5967                 :         /* We deliberately delay 2PC checkpointing as long as possible */
    5968              19 :         CheckPointTwoPhase(checkPointRedo);
    5969              19 : }
    5970                 : 
    5971                 : /*
    5972                 :  * Set a recovery restart point if appropriate
    5973                 :  *
    5974                 :  * This is similar to CreateCheckPoint, but is used during WAL recovery
    5975                 :  * to establish a point from which recovery can roll forward without
    5976                 :  * replaying the entire recovery log.  This function is called each time
    5977                 :  * a checkpoint record is read from XLOG; it must determine whether a
    5978                 :  * restartpoint is needed or not.
    5979                 :  */
    5980                 : static void
    5981                 : RecoveryRestartPoint(const CheckPoint *checkPoint)
    5982               0 : {
    5983                 :         int                     elapsed_secs;
    5984                 :         int                     rmid;
    5985                 : 
    5986                 :         /*
    5987                 :          * Do nothing if the elapsed time since the last restartpoint is less than
    5988                 :          * half of checkpoint_timeout.  (We use a value less than
    5989                 :          * checkpoint_timeout so that variations in the timing of checkpoints on
    5990                 :          * the master, or speed of transmission of WAL segments to a slave, won't
    5991                 :          * make the slave skip a restartpoint once it's synced with the master.)
    5992                 :          * Checking true elapsed time keeps us from doing restartpoints too often
    5993                 :          * while rapidly scanning large amounts of WAL.
    5994                 :          */
    5995               0 :         elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
    5996               0 :         if (elapsed_secs < CheckPointTimeout / 2)
    5997               0 :                 return;
    5998                 : 
    5999                 :         /*
    6000                 :          * Is it safe to checkpoint?  We must ask each of the resource managers
    6001                 :          * whether they have any partial state information that might prevent a
    6002                 :          * correct restart from this point.  If so, we skip this opportunity, and
    6003                 :          * try again when the next checkpoint record is read.
    6004                 :          */
    6005               0 :         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    6006                 :         {
    6007               0 :                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
    6008               0 :                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
    6009                 :                         {
    6010               0 :                                 elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
    6011                 :                                          rmid,
    6012                 :                                          checkPoint->redo.xlogid,
    6013                 :                                          checkPoint->redo.xrecoff);
    6014               0 :                                 return;
    6015                 :                         }
    6016                 :         }
    6017                 : 
    6018                 :         /*
    6019                 :          * OK, force data out to disk
    6020                 :          */
    6021               0 :         CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
    6022                 : 
    6023                 :         /*
    6024                 :          * Update pg_control so that any subsequent crash will restart from this
    6025                 :          * checkpoint.  Note: ReadRecPtr gives the XLOG address of the checkpoint
    6026                 :          * record itself.
    6027                 :          */
    6028               0 :         ControlFile->prevCheckPoint = ControlFile->checkPoint;
    6029               0 :         ControlFile->checkPoint = ReadRecPtr;
    6030               0 :         ControlFile->checkPointCopy = *checkPoint;
    6031               0 :         ControlFile->time = (pg_time_t) time(NULL);
    6032               0 :         UpdateControlFile();
    6033                 : 
    6034               0 :         ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
    6035                 :                         (errmsg("recovery restart point at %X/%X",
    6036                 :                                         checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
    6037               0 :         if (recoveryLastXTime)
    6038               0 :                 ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
    6039                 :                                 (errmsg("last completed transaction was at log time %s",
    6040                 :                                                 timestamptz_to_str(recoveryLastXTime))));
    6041                 : }
    6042                 : 
    6043                 : /*
    6044                 :  * Write a NEXTOID log record (an XLOG_NEXTOID record whose payload is nextOid)
    6045                 :  */
    6046                 : void
    6047                 : XLogPutNextOid(Oid nextOid)
    6048              12 : {
    6049                 :         XLogRecData rdata;
    6050                 :         /* One-element data chain: just the Oid itself, with no buffer attached */
    6051              12 :         rdata.data = (char *) (&nextOid);
    6052              12 :         rdata.len = sizeof(Oid);
    6053              12 :         rdata.buffer = InvalidBuffer;
    6054              12 :         rdata.next = NULL;
    6055              12 :         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
    6056                 : 
    6057                 :         /*
    6058                 :          * We need not flush the NEXTOID record immediately, because any of the
    6059                 :          * just-allocated OIDs could only reach disk as part of a tuple insert or
    6060                 :          * update that would have its own XLOG record that must follow the NEXTOID
    6061                 :          * record.  Therefore, the standard buffer LSN interlock applied to those
    6062                 :          * records will ensure no such OID reaches disk before the NEXTOID record
    6063                 :          * does.
    6064                 :          *
    6065                 :          * Note, however, that the above statement only covers state "within" the
    6066                 :          * database.  When we use a generated OID as a file or directory name, we
    6067                 :          * are in a sense violating the basic WAL rule, because that filesystem
    6068                 :          * change may reach disk before the NEXTOID WAL record does.  The impact
    6069                 :          * of this is that if a database crash occurs immediately afterward, we
    6070                 :          * might after restart re-generate the same OID and find that it conflicts
    6071                 :          * with the leftover file or directory.  But since for safety's sake we
    6072                 :          * always loop until finding a nonconflicting filename, this poses no real
    6073                 :          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
    6074                 :          */
    6075              12 : }
    6076                 : 
    6077                 : /*
    6078                 :  * Write an XLOG SWITCH record (XLOG_SWITCH under RM_XLOG_ID, carrying no data).
    6079                 :  *
    6080                 :  * Here we just blindly issue an XLogInsert request for the record.
    6081                 :  * All the magic happens inside XLogInsert.
    6082                 :  *
    6083                 :  * The return value is either the end+1 address of the switch record,
    6084                 :  * or the end+1 address of the prior segment if we did not need to
    6085                 :  * write a switch record because we are already at segment start.
    6086                 :  */
    6087                 : XLogRecPtr
    6088                 : RequestXLogSwitch(void)
    6089               0 : {
    6090                 :         XLogRecPtr      RecPtr;
    6091                 :         XLogRecData rdata;
    6092                 : 
    6093                 :         /* XLOG SWITCH, alone among xlog record types, has no data */
    6094               0 :         rdata.buffer = InvalidBuffer;
    6095               0 :         rdata.data = NULL;
    6096               0 :         rdata.len = 0;
    6097               0 :         rdata.next = NULL;
    6098                 :         /* Hand the empty record to XLogInsert; its result is returned unchanged */
    6099               0 :         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
    6100                 : 
    6101               0 :         return RecPtr;
    6102                 : }
    6103                 : 
    6104                 : /*
    6105                 :  * XLOG resource manager's routines: xlog_redo applies one record's effects to the shared counters and ControlFile
    6106                 :  */
    6107                 : void
    6108                 : xlog_redo(XLogRecPtr lsn, XLogRecord *record)
    6109               0 : {
    6110               0 :         uint8           info = record->xl_info & ~XLR_INFO_MASK;
    6111                 :         /* Dispatch on the record type: xl_info with the XLR_INFO_MASK bits cleared */
    6112               0 :         if (info == XLOG_NEXTOID)
    6113                 :         {
    6114                 :                 Oid                     nextOid;
    6115                 :                 /* Only ever move nextOid forward; a logged value that is not larger is ignored */
    6116               0 :                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    6117               0 :                 if (ShmemVariableCache->nextOid < nextOid)
    6118                 :                 {
    6119               0 :                         ShmemVariableCache->nextOid = nextOid;
    6120               0 :                         ShmemVariableCache->oidCount = 0;
    6121                 :                 }
    6122                 :         }
    6123               0 :         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    6124                 :         {
    6125                 :                 CheckPoint      checkPoint;
    6126                 : 
    6127               0 :                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    6128                 :                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
    6129               0 :                 ShmemVariableCache->nextXid = checkPoint.nextXid;
    6130               0 :                 ShmemVariableCache->nextOid = checkPoint.nextOid;
    6131               0 :                 ShmemVariableCache->oidCount = 0;
    6132               0 :                 MultiXactSetNextMXact(checkPoint.nextMulti,
    6133                 :                                                           checkPoint.nextMultiOffset);
    6134                 : 
    6135                 :                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    6136               0 :                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
    6137               0 :                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    6138                 : 
    6139                 :                 /*
    6140                 :                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
    6141                 :                  */
    6142               0 :                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
    6143                 :                 {
    6144               0 :                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
    6145                 :                                 !list_member_int(expectedTLIs,
    6146                 :                                                                  (int) checkPoint.ThisTimeLineID))
    6147               0 :                                 ereport(PANIC,
    6148                 :                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
    6149                 :                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
    6150                 :                         /* Following WAL records should be run with new TLI */
    6151               0 :                         ThisTimeLineID = checkPoint.ThisTimeLineID;
    6152                 :                 }
    6153                 : 
    6154               0 :                 RecoveryRestartPoint(&checkPoint);
    6155                 :         }
    6156               0 :         else if (info == XLOG_CHECKPOINT_ONLINE)
    6157                 :         {
    6158                 :                 CheckPoint      checkPoint;
    6159                 : 
    6160               0 :                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    6161                 :                 /* In an ONLINE checkpoint, treat the counters like NEXTOID */
    6162               0 :                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
    6163                 :                                                                   checkPoint.nextXid))
    6164               0 :                         ShmemVariableCache->nextXid = checkPoint.nextXid;
    6165               0 :                 if (ShmemVariableCache->nextOid < checkPoint.nextOid)
    6166                 :                 {
    6167               0 :                         ShmemVariableCache->nextOid = checkPoint.nextOid;
    6168               0 :                         ShmemVariableCache->oidCount = 0;
    6169                 :                 }
    6170               0 :                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
    6171                 :                                                                   checkPoint.nextMultiOffset);
    6172                 : 
    6173                 :                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    6174               0 :                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
    6175               0 :                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    6176                 : 
    6177                 :                 /* TLI should not change in an on-line checkpoint */
    6178               0 :                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
    6179               0 :                         ereport(PANIC,
    6180                 :                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
    6181                 :                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
    6182                 : 
    6183               0 :                 RecoveryRestartPoint(&checkPoint);
    6184                 :         }
    6185                 :         else if (info == XLOG_NOOP)
    6186                 :         {
    6187                 :                 /* nothing to do here */
    6188                 :         }
    6189                 :         else if (info == XLOG_SWITCH)
    6190                 :         {
    6191                 :                 /* nothing to do here; any other info value also falls through untouched */
    6192                 :         }
    6193               0 : }
    6194                 : 
    6195                 : void
    6196                 : xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
    6197               0 : {
    6198               0 :         uint8           info = xl_info & ~XLR_INFO_MASK;
    6199                 :         /* Append to buf a text description of the record data in rec, chosen by record type */
    6200               0 :         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
    6201                 :                 info == XLOG_CHECKPOINT_ONLINE)
    6202                 :         {
    6203               0 :                 CheckPoint *checkpoint = (CheckPoint *) rec;
    6204                 :                 /* NOTE(review): rec is read in place via the cast; assumes it is aligned for CheckPoint -- confirm */
    6205               0 :                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
    6206                 :                                                  "tli %u; xid %u/%u; oid %u; multi %u; offset %u; %s",
    6207                 :                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
    6208                 :                                                  checkpoint->ThisTimeLineID,
    6209                 :                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
    6210                 :                                                  checkpoint->nextOid,
    6211                 :                                                  checkpoint->nextMulti,
    6212                 :                                                  checkpoint->nextMultiOffset,
    6213                 :                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
    6214                 :         }
    6215               0 :         else if (info == XLOG_NOOP)
    6216                 :         {
    6217               0 :                 appendStringInfo(buf, "xlog no-op");
    6218                 :         }
    6219               0 :         else if (info == XLOG_NEXTOID)
    6220                 :         {
    6221                 :                 Oid                     nextOid;
    6222                 : 
    6223               0 :                 memcpy(&nextOid, rec, sizeof(Oid));
    6224               0 :                 appendStringInfo(buf, "nextOid: %u", nextOid);
    6225                 :         }
    6226               0 :         else if (info == XLOG_SWITCH)
    6227                 :         {
    6228               0 :                 appendStringInfo(buf, "xlog switch");
    6229                 :         }
    6230                 :         else
    6231               0 :                 appendStringInfo(buf, "UNKNOWN");
    6232               0 : }
    6233                 : 
    6234                 : #ifdef WAL_DEBUG
    6235                 : /* Compiled only with WAL_DEBUG: append a record's prev pointer, xid, BKP_BLOCK flags and rmgr name to buf */
    6236                 : static void
    6237                 : xlog_outrec(StringInfo buf, XLogRecord *record)
    6238                 : {
    6239                 :         int                     i;
    6240                 : 
    6241                 :         appendStringInfo(buf, "prev %X/%X; xid %u",
    6242                 :                                          record->xl_prev.xlogid, record->xl_prev.xrecoff,
    6243                 :                                          record->xl_xid);
    6244                 :         /* Report each XLR_SET_BKP_BLOCK(i) bit found in xl_info, numbered from 1 in the output */
    6245                 :         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
    6246                 :         {
    6247                 :                 if (record->xl_info & XLR_SET_BKP_BLOCK(i))
    6248                 :                         appendStringInfo(buf, "; bkpb%d", i + 1);
    6249                 :         }
    6250                 : 
    6251                 :         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
    6252                 : }
    6253                 : #endif   /* WAL_DEBUG */
    6254                 : 
    6255                 : 
    6256                 : /*
    6257                 :  * GUC support: check a sync method name and, if doit, adopt it; returns NULL for an unknown name, else method
    6258                 :  */
    6259                 : const char *
    6260                 : assign_xlog_sync_method(const char *method, bool doit, GucSource source)
    6261              18 : {
    6262                 :         int                     new_sync_method;
    6263                 :         int                     new_sync_bit;
    6264                 :         /* Map the name (case-insensitively) to a method code and open flag bit; options not compiled in are unknown */
    6265              18 :         if (pg_strcasecmp(method, "fsync") == 0)
    6266                 :         {
    6267               0 :                 new_sync_method = SYNC_METHOD_FSYNC;
    6268               0 :                 new_sync_bit = 0;
    6269                 :         }
    6270                 : #ifdef HAVE_FSYNC_WRITETHROUGH
    6271                 :         else if (pg_strcasecmp(method, "fsync_writethrough") == 0)
    6272                 :         {
    6273                 :                 new_sync_method = SYNC_METHOD_FSYNC_WRITETHROUGH;
    6274                 :                 new_sync_bit = 0;
    6275                 :         }
    6276                 : #endif
    6277                 : #ifdef HAVE_FDATASYNC
    6278              18 :         else if (pg_strcasecmp(method, "fdatasync") == 0)
    6279                 :         {
    6280              18 :                 new_sync_method = SYNC_METHOD_FDATASYNC;
    6281              18 :                 new_sync_bit = 0;
    6282                 :         }
    6283                 : #endif
    6284                 : #ifdef OPEN_SYNC_FLAG
    6285               0 :         else if (pg_strcasecmp(method, "open_sync") == 0)
    6286                 :         {
    6287               0 :                 new_sync_method = SYNC_METHOD_OPEN;
    6288               0 :                 new_sync_bit = OPEN_SYNC_FLAG;
    6289                 :         }
    6290                 : #endif
    6291                 : #ifdef OPEN_DATASYNC_FLAG
    6292                 :         else if (pg_strcasecmp(method, "open_datasync") == 0)
    6293                 :         {
    6294                 :                 new_sync_method = SYNC_METHOD_OPEN;
    6295                 :                 new_sync_bit = OPEN_DATASYNC_FLAG;
    6296                 :         }
    6297                 : #endif
    6298                 :         else
    6299               0 :                 return NULL;
    6300                 :         /* With doit false, accept the name without changing any state */
    6301              18 :         if (!doit)
    6302               0 :                 return method;
    6303                 : 
    6304              18 :         if (sync_method != new_sync_method || open_sync_bit != new_sync_bit)
    6305                 :         {
    6306                 :                 /*
    6307                 :                  * To ensure that no blocks escape unsynced, force an fsync on the
    6308                 :                  * currently open log segment (if any).  Also, if the open flag is
    6309                 :                  * changing, close the log file so it will be reopened (with new flag
    6310                 :                  * bit) at next use.
    6311                 :                  */
    6312               0 :                 if (openLogFile >= 0)
    6313                 :                 {
    6314               0 :                         if (pg_fsync(openLogFile) != 0)
    6315               0 :                                 ereport(PANIC,
    6316                 :                                                 (errcode_for_file_access(),
    6317                 :                                                  errmsg("could not fsync log file %u, segment %u: %m",
    6318                 :                                                                 openLogId, openLogSeg)));
    6319               0 :                         if (open_sync_bit != new_sync_bit)
    6320               0 :                                 XLogFileClose();
    6321                 :                 }
    6322               0 :                 sync_method = new_sync_method;
    6323               0 :                 open_sync_bit = new_sync_bit;
    6324                 :         }
    6325                 : 
    6326              18 :         return method;
    6327                 : }
    6328                 : 
    6329                 : 
    6330                 : /*
    6331                 :  * Issue appropriate kind of fsync (if any) on the current XLOG output file; a failed sync or unknown method PANICs
    6332                 :  */
    6333                 : static void
    6334                 : issue_xlog_fsync(void)
    6335            4489 : {
    6336            4489 :         switch (sync_method)
    6337                 :         {
    6338                 :                 case SYNC_METHOD_FSYNC:
    6339               0 :                         if (pg_fsync_no_writethrough(openLogFile) != 0)
    6340               0 :                                 ereport(PANIC,
    6341                 :                                                 (errcode_for_file_access(),
    6342                 :                                                  errmsg("could not fsync log file %u, segment %u: %m",
    6343                 :                                                                 openLogId, openLogSeg)));
    6344                 :                         break;
    6345                 : #ifdef HAVE_FSYNC_WRITETHROUGH
    6346                 :                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
    6347                 :                         if (pg_fsync_writethrough(openLogFile) != 0)
    6348                 :                                 ereport(PANIC,
    6349                 :                                                 (errcode_for_file_access(),
    6350                 :                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
    6351                 :                                                                 openLogId, openLogSeg)));
    6352                 :                         break;
    6353                 : #endif
    6354                 : #ifdef HAVE_FDATASYNC
    6355                 :                 case SYNC_METHOD_FDATASYNC:
    6356            4489 :                         if (pg_fdatasync(openLogFile) != 0)
    6357               0 :                                 ereport(PANIC,
    6358                 :                                                 (errcode_for_file_access(),
    6359                 :                                         errmsg("could not fdatasync log file %u, segment %u: %m",
    6360                 :                                                    openLogId, openLogSeg)));
    6361                 :                         break;
    6362                 : #endif
    6363                 :                 case SYNC_METHOD_OPEN:
    6364                 :                         /* write synced it already */
    6365                 :                         break;
    6366                 :                 default:
    6367               0 :                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
    6368                 :                         break;
    6369                 :         }
    6370            4489 : }
    6371                 : 
    6372                 : 
    6373                 : /*
    6374                 :  * pg_start_backup: set up for taking an on-line backup dump
    6375                 :  *
    6376                 :  * Essentially what this does is to create a backup label file in $PGDATA,
    6377                 :  * where it will be archived as part of the backup dump.  The label file
    6378                 :  * contains the user-supplied label string (typically this would be used
    6379                 :  * to tell where the backup dump will be stored) and the starting time and
    6380                 :  * starting WAL location for the dump.
    6381                 :  */
    6382                 : Datum
    6383                 : pg_start_backup(PG_FUNCTION_ARGS)
    6384               0 : {
    6385               0 :         text       *backupid = PG_GETARG_TEXT_P(0);
    6386                 :         text       *result;
    6387                 :         char       *backupidstr;
    6388                 :         XLogRecPtr      checkpointloc;
    6389                 :         XLogRecPtr      startpoint;
    6390                 :         pg_time_t       stamp_time;
    6391                 :         char            strfbuf[128];
    6392                 :         char            xlogfilename[MAXFNAMELEN];
    6393                 :         uint32          _logId;
    6394                 :         uint32          _logSeg;
    6395                 :         struct stat stat_buf;
    6396                 :         FILE       *fp;
    6397                 : 
    6398               0 :         if (!superuser())
    6399               0 :                 ereport(ERROR,
    6400                 :                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
    6401                 :                                  errmsg("must be superuser to run a backup")));
    6402                 : 
    6403               0 :         if (!XLogArchivingActive())
    6404               0 :                 ereport(ERROR,
    6405                 :                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6406                 :                                  errmsg("WAL archiving is not active"),
    6407                 :                                  errhint("archive_mode must be enabled at server start.")));
    6408                 : 
    6409               0 :         if (!XLogArchiveCommandSet())
    6410               0 :                 ereport(ERROR,
    6411                 :                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6412                 :                                  errmsg("WAL archiving is not active"),
    6413                 :                                  errhint("archive_command must be defined before "
    6414                 :                                                  "online backups can be made safely.")));
    6415                 : 
    6416               0 :         backupidstr = DatumGetCString(DirectFunctionCall1(textout,
    6417                 :                                                                                                  PointerGetDatum(backupid)));
    6418                 : 
    6419                 :         /*
    6420                 :          * Mark backup active in shared memory.  We must do full-page WAL writes
    6421                 :          * during an on-line backup even if not doing so at other times, because
    6422                 :          * it's quite possible for the backup dump to obtain a "torn" (partially
    6423                 :          * written) copy of a database page if it reads the page concurrently with
    6424                 :          * our write to the same page.  This can be fixed as long as the first
    6425                 :          * write to the page in the WAL sequence is a full-page write. Hence, we
    6426                 :          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
    6427                 :          * are no dirty pages in shared memory that might get dumped while the
    6428                 :          * backup is in progress without having a corresponding WAL record.  (Once
    6429                 :          * the backup is complete, we need not force full-page writes anymore,
    6430                 :          * since we expect that any pages not modified during the backup interval
    6431                 :          * must have been correctly captured by the backup.)
    6432                 :          *
    6433                 :          * We must hold WALInsertLock to change the value of forcePageWrites, to
    6434                 :          * ensure adequate interlocking against XLogInsert().
    6435                 :          */
    6436               0 :         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    6437               0 :         if (XLogCtl->Insert.forcePageWrites)
    6438                 :         {
    6439               0 :                 LWLockRelease(WALInsertLock);
    6440               0 :                 ereport(ERROR,
    6441                 :                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6442                 :                                  errmsg("a backup is already in progress"),
    6443                 :                                  errhint("Run pg_stop_backup() and try again.")));
    6444                 :         }
    6445               0 :         XLogCtl->Insert.forcePageWrites = true;
    6446               0 :         LWLockRelease(WALInsertLock);
    6447                 : 
    6448                 :         /* Use a TRY block to ensure we release forcePageWrites if fail below */
    6449               0 :         PG_TRY();
    6450                 :         {
    6451                 :                 /*
    6452                 :                  * Force a CHECKPOINT.  Aside from being necessary to prevent torn
    6453                 :                  * page problems, this guarantees that two successive backup runs will
    6454                 :                  * have different checkpoint positions and hence different history
    6455                 :                  * file names, even if nothing happened in between.
    6456                 :                  *
    6457                 :                  * We don't use CHECKPOINT_IMMEDIATE, hence this can take awhile.
    6458                 :                  */
    6459               0 :                 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT);
    6460                 : 
    6461                 :                 /*
    6462                 :                  * Now we need to fetch the checkpoint record location, and also its
    6463                 :                  * REDO pointer.  The oldest point in WAL that would be needed to
    6464                 :                  * restore starting from the checkpoint is precisely the REDO pointer.
    6465                 :                  */
    6466               0 :                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6467               0 :                 checkpointloc = ControlFile->checkPoint;
    6468               0 :                 startpoint = ControlFile->checkPointCopy.redo;
    6469               0 :                 LWLockRelease(ControlFileLock);
    6470                 : 
    6471               0 :                 XLByteToSeg(startpoint, _logId, _logSeg);
    6472               0 :                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
    6473                 : 
    6474                 :                 /* Use the log timezone here, not the session timezone */
    6475               0 :                 stamp_time = (pg_time_t) time(NULL);
    6476               0 :                 pg_strftime(strfbuf, sizeof(strfbuf),
    6477                 :                                         "%Y-%m-%d %H:%M:%S %Z",
    6478                 :                                         pg_localtime(&stamp_time, log_timezone));
    6479                 : 
    6480                 :                 /*
    6481                 :                  * Check for existing backup label --- implies a backup is already
    6482                 :                  * running.  (XXX given that we checked forcePageWrites above, maybe
    6483                 :                  * it would be OK to just unlink any such label file?)
    6484                 :                  */
    6485               0 :                 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
    6486                 :                 {
    6487               0 :                         if (errno != ENOENT)
    6488               0 :                                 ereport(ERROR,
    6489                 :                                                 (errcode_for_file_access(),
    6490                 :                                                  errmsg("could not stat file \"%s\": %m",
    6491                 :                                                                 BACKUP_LABEL_FILE)));
    6492                 :                 }
    6493                 :                 else
    6494               0 :                         ereport(ERROR,
    6495                 :                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6496                 :                                          errmsg("a backup is already in progress"),
    6497                 :                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
    6498                 :                                                          BACKUP_LABEL_FILE)));
    6499                 : 
    6500                 :                 /*
    6501                 :                  * Okay, write the file
    6502                 :                  */
    6503               0 :                 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
    6504               0 :                 if (!fp)
    6505               0 :                         ereport(ERROR,
    6506                 :                                         (errcode_for_file_access(),
    6507                 :                                          errmsg("could not create file \"%s\": %m",
    6508                 :                                                         BACKUP_LABEL_FILE)));
    6509               0 :                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
    6510                 :                                 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
    6511               0 :                 fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
    6512                 :                                 checkpointloc.xlogid, checkpointloc.xrecoff);
    6513               0 :                 fprintf(fp, "START TIME: %s\n", strfbuf);
    6514               0 :                 fprintf(fp, "LABEL: %s\n", backupidstr);
    6515               0 :                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
    6516               0 :                         ereport(ERROR,
    6517                 :                                         (errcode_for_file_access(),
    6518                 :                                          errmsg("could not write file \"%s\": %m",
    6519                 :                                                         BACKUP_LABEL_FILE)));
    6520                 :         }
    6521               0 :         PG_CATCH();
    6522                 :         {
    6523                 :                 /* Turn off forcePageWrites on failure */
    6524               0 :                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    6525               0 :                 XLogCtl->Insert.forcePageWrites = false;
    6526               0 :                 LWLockRelease(WALInsertLock);
    6527                 : 
    6528               0 :                 PG_RE_THROW();
    6529                 :         }
    6530               0 :         PG_END_TRY();
    6531                 : 
    6532                 :         /*
    6533                 :          * We're done.  As a convenience, return the starting WAL location.
    6534                 :          */
    6535               0 :         snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
    6536                 :                          startpoint.xlogid, startpoint.xrecoff);
    6537               0 :         result = DatumGetTextP(DirectFunctionCall1(textin,
    6538                 :                                                                                          CStringGetDatum(xlogfilename)));
    6539               0 :         PG_RETURN_TEXT_P(result);
    6540                 : }
    6541                 : 
    6542                 : /*
    6543                 :  * pg_stop_backup: finish taking an on-line backup dump
    6544                 :  *
    6545                 :  * We remove the backup label file created by pg_start_backup, and instead
    6546                 :  * create a backup history file in pg_xlog (whence it will immediately be
    6547                 :  * archived).  The backup history file contains the same info found in
    6548                 :  * the label file, plus the backup-end time and WAL location.
                         :  *
                         :  * Only a superuser may call this.  Note the ordering: forcePageWrites
                         :  * is cleared and an xlog switch is requested before the label file is
                         :  * looked for, so both happen even when the call then fails with
                         :  * "a backup is not in progress".
                         :  *
                         :  * Returns the backup ending WAL location as text, formatted "%X/%X".
    6549                 :  */
    6550                 : Datum
    6551                 : pg_stop_backup(PG_FUNCTION_ARGS)
    6552               0 : {
    6553                 :         text       *result;
    6554                 :         XLogRecPtr      startpoint;
    6555                 :         XLogRecPtr      stoppoint;
    6556                 :         pg_time_t       stamp_time;
    6557                 :         char            strfbuf[128];
    6558                 :         char            histfilepath[MAXPGPATH];
    6559                 :         char            startxlogfilename[MAXFNAMELEN];
    6560                 :         char            stopxlogfilename[MAXFNAMELEN];
    6561                 :         uint32          _logId;
    6562                 :         uint32          _logSeg;
    6563                 :         FILE       *lfp;
    6564                 :         FILE       *fp;
    6565                 :         char            ch;             /* character read after the START line; must be a newline */
    6566                 :         int                     ich;    /* int rather than char so that EOF from fgetc is distinguishable */
    6567                 : 
    6568               0 :         if (!superuser())
    6569               0 :                 ereport(ERROR,
    6570                 :                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
    6571                 :                                  (errmsg("must be superuser to run a backup"))));
    6572                 : 
    6573                 :         /*
    6574                 :          * OK to clear forcePageWrites
                         :          * (the flag is changed while holding WALInsertLock exclusively)
    6575                 :          */
    6576               0 :         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    6577               0 :         XLogCtl->Insert.forcePageWrites = false;
    6578               0 :         LWLockRelease(WALInsertLock);
    6579                 : 
    6580                 :         /*
    6581                 :          * Force a switch to a new xlog segment file, so that the backup is valid
    6582                 :          * as soon as archiver moves out the current segment file. We'll report
    6583                 :          * the end address of the XLOG SWITCH record as the backup stopping point.
    6584                 :          */
    6585               0 :         stoppoint = RequestXLogSwitch();
    6586                 : 
    6587               0 :         XLByteToSeg(stoppoint, _logId, _logSeg);
    6588               0 :         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
    6589                 : 
    6590                 :         /* Use the log timezone here, not the session timezone */
    6591               0 :         stamp_time = (pg_time_t) time(NULL);
    6592               0 :         pg_strftime(strfbuf, sizeof(strfbuf),
    6593                 :                                 "%Y-%m-%d %H:%M:%S %Z",
    6594                 :                                 pg_localtime(&stamp_time, log_timezone));
    6595                 : 
    6596                 :         /*
    6597                 :          * Open the existing label file
                         :          *
                         :          * A missing file (ENOENT) is reported as "a backup is not in
                         :          * progress"; any other open failure is reported as a read error.
                         :          * NOTE(review): this relies on ereport(ERROR) not returning, so
                         :          * that the second report is reached only for ENOENT -- confirm.
    6598                 :          */
    6599               0 :         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
    6600               0 :         if (!lfp)
    6601                 :         {
    6602               0 :                 if (errno != ENOENT)
    6603               0 :                         ereport(ERROR,
    6604                 :                                         (errcode_for_file_access(),
    6605                 :                                          errmsg("could not read file \"%s\": %m",
    6606                 :                                                         BACKUP_LABEL_FILE)));
    6607               0 :                 ereport(ERROR,
    6608                 :                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6609                 :                                  errmsg("a backup is not in progress")));
    6610                 :         }
    6611                 : 
    6612                 :         /*
    6613                 :          * Read and parse the START WAL LOCATION line (this code is pretty crude,
    6614                 :          * but we are not expecting any variability in the file format).
                         :          *
                         :          * %24s caps the file name at 24 characters, and the trailing %c
                         :          * must pick up a newline or the file is rejected as invalid.
    6615                 :          */
    6616               0 :         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
    6617                 :                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
    6618                 :                            &ch) != 4 || ch != '\n')
    6619               0 :                 ereport(ERROR,
    6620                 :                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6621                 :                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    6622                 : 
    6623                 :         /*
    6624                 :          * Write the backup history file
                         :          *
                         :          * The path is built by BackupHistoryFilePath from the log id and
                         :          * segment that XLByteToSeg computes for the start point, plus
                         :          * startpoint.xrecoff modulo XLogSegSize.  The START line is
                         :          * rewritten from the parsed values, a STOP line is added, the rest
                         :          * of the label file is copied through unchanged, and STOP TIME is
                         :          * appended last.
    6625                 :          */
    6626               0 :         XLByteToSeg(startpoint, _logId, _logSeg);
    6627               0 :         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
    6628                 :                                                   startpoint.xrecoff % XLogSegSize);
    6629               0 :         fp = AllocateFile(histfilepath, "w");
    6630               0 :         if (!fp)
    6631               0 :                 ereport(ERROR,
    6632                 :                                 (errcode_for_file_access(),
    6633                 :                                  errmsg("could not create file \"%s\": %m",
    6634                 :                                                 histfilepath)));
    6635               0 :         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
    6636                 :                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
    6637               0 :         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
    6638                 :                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
    6639                 :         /* transfer remaining lines from label to history file */
    6640               0 :         while ((ich = fgetc(lfp)) != EOF)
    6641               0 :                 fputc(ich, fp);
    6642               0 :         fprintf(fp, "STOP TIME: %s\n", strfbuf);
    6643               0 :         if (fflush(fp) || ferror(fp) || FreeFile(fp))
    6644               0 :                 ereport(ERROR,
    6645                 :                                 (errcode_for_file_access(),
    6646                 :                                  errmsg("could not write file \"%s\": %m",
    6647                 :                                                 histfilepath)));
    6648                 : 
    6649                 :         /*
    6650                 :          * Close and remove the backup label file
                         :          *
                         :          * The ferror() test catches a read failure that ended the copy
                         :          * loop above before the real end of the label file.
    6651                 :          */
    6652               0 :         if (ferror(lfp) || FreeFile(lfp))
    6653               0 :                 ereport(ERROR,
    6654                 :                                 (errcode_for_file_access(),
    6655                 :                                  errmsg("could not read file \"%s\": %m",
    6656                 :                                                 BACKUP_LABEL_FILE)));
    6657               0 :         if (unlink(BACKUP_LABEL_FILE) != 0)
    6658               0 :                 ereport(ERROR,
    6659                 :                                 (errcode_for_file_access(),
    6660                 :                                  errmsg("could not remove file \"%s\": %m",
    6661                 :                                                 BACKUP_LABEL_FILE)));
    6662                 : 
    6663                 :         /*
    6664                 :          * Clean out any no-longer-needed history files.  As a side effect, this
    6665                 :          * will post a .ready file for the newly created history file, notifying
    6666                 :          * the archiver that history file may be archived immediately.
    6667                 :          */
    6668               0 :         CleanupBackupHistory();
    6669                 : 
    6670                 :         /*
    6671                 :          * We're done.  As a convenience, return the ending WAL location.
    6672                 :          */
    6673               0 :         snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
    6674                 :                          stoppoint.xlogid, stoppoint.xrecoff);
    6675               0 :         result = DatumGetTextP(DirectFunctionCall1(textin,
    6676                 :                                                                                  CStringGetDatum(stopxlogfilename)));
    6677               0 :         PG_RETURN_TEXT_P(result);
    6678                 : }
    6679                 : 
    6680                 : /*
    6681                 :  * pg_switch_xlog: switch to next xlog file
                         :  *
                         :  * Superuser only.  Calls RequestXLogSwitch() and returns the location
                         :  * it reports as text, formatted "%X/%X".
    6682                 :  */
    6683                 : Datum
    6684                 : pg_switch_xlog(PG_FUNCTION_ARGS)
    6685               0 : {
    6686                 :         text       *result;
    6687                 :         XLogRecPtr      switchpoint;
    6688                 :         char            location[MAXFNAMELEN];
    6689                 : 
    6690               0 :         if (!superuser())
    6691               0 :                 ereport(ERROR,
    6692                 :                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
    6693                 :                          (errmsg("must be superuser to switch transaction log files"))));
    6694                 : 
    6695               0 :         switchpoint = RequestXLogSwitch();
    6696                 : 
    6697                 :         /*
    6698                 :          * As a convenience, return the WAL location of the switch record
    6699                 :          */
    6700               0 :         snprintf(location, sizeof(location), "%X/%X",
    6701                 :                          switchpoint.xlogid, switchpoint.xrecoff);
    6702               0 :         result = DatumGetTextP(DirectFunctionCall1(textin,
    6703                 :                                                                                            CStringGetDatum(location)));
    6704               0 :         PG_RETURN_TEXT_P(result);
    6705                 : }
    6706                 : 
    6707                 : /*
    6708                 :  * Report the current WAL write location (same format as pg_start_backup etc)
    6709                 :  *
    6710                 :  * This is useful for determining how much of WAL is visible to an external
    6711                 :  * archiving process.  Note that the data before this point is written out
    6712                 :  * to the kernel, but is not necessarily synced to disk.
                         :  *
                         :  * The local LogwrtResult is first refreshed from XLogCtl while holding
                         :  * the info_lck spinlock; the value returned is LogwrtResult.Write as
                         :  * text, formatted "%X/%X".  No superuser check is made here.
    6713                 :  */
    6714                 : Datum
    6715                 : pg_current_xlog_location(PG_FUNCTION_ARGS)
    6716               0 : {
    6717                 :         text       *result;
    6718                 :         char            location[MAXFNAMELEN];
    6719                 : 
    6720                 :         /* Make sure we have an up-to-date local LogwrtResult */
    6721                 :         {
    6722                 :                 /* use volatile pointer to prevent code rearrangement */
    6723               0 :                 volatile XLogCtlData *xlogctl = XLogCtl;
    6724                 : 
    6725               0 :                 SpinLockAcquire(&xlogctl->info_lck);
    6726               0 :                 LogwrtResult = xlogctl->LogwrtResult;
    6727               0 :                 SpinLockRelease(&xlogctl->info_lck);
    6728                 :         }
    6729                 : 
    6730               0 :         snprintf(location, sizeof(location), "%X/%X",
    6731                 :                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
    6732                 : 
    6733               0 :         result = DatumGetTextP(DirectFunctionCall1(textin,
    6734                 :                                                                                            CStringGetDatum(location)));
    6735               0 :         PG_RETURN_TEXT_P(result);
    6736                 : }
    6737                 : 
    6738                 : /*
    6739                 :  * Report the current WAL insert location (same format as pg_start_backup etc)
    6740                 :  *
    6741                 :  * This function is mostly for debugging purposes.
                         :  *
                         :  * The position is computed by INSERT_RECPTR from XLogCtl->Insert while
                         :  * WALInsertLock is held in shared mode, and is returned as text
                         :  * formatted "%X/%X".  No superuser check is made here.
    6742                 :  */
    6743                 : Datum
    6744                 : pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
    6745               0 : {
    6746                 :         text       *result;
    6747               0 :         XLogCtlInsert *Insert = &XLogCtl->Insert;
    6748                 :         XLogRecPtr      current_recptr;
    6749                 :         char            location[MAXFNAMELEN];
    6750                 : 
    6751                 :         /*
    6752                 :          * Get the current end-of-WAL position ... shared lock is sufficient
    6753                 :          */
    6754               0 :         LWLockAcquire(WALInsertLock, LW_SHARED);
    6755               0 :         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
    6756               0 :         LWLockRelease(WALInsertLock);
    6757                 : 
    6758               0 :         snprintf(location, sizeof(location), "%X/%X",
    6759                 :                          current_recptr.xlogid, current_recptr.xrecoff);
    6760                 : 
    6761               0 :         result = DatumGetTextP(DirectFunctionCall1(textin,
    6762                 :                                                                                            CStringGetDatum(location)));
    6763               0 :         PG_RETURN_TEXT_P(result);
    6764                 : }
    6765                 : 
    6766                 : /*
    6767                 :  * Compute an xlog file name and decimal byte offset given a WAL location,
    6768                 :  * such as is returned by pg_stop_backup() or pg_switch_xlog().
    6769                 :  *
    6770                 :  * Note that a location exactly at a segment boundary is taken to be in
    6771                 :  * the previous segment.  This is usually the right thing, since the
    6772                 :  * expected usage is to determine which xlog file(s) are ready to archive.
                         :  *
                         :  * Argument 0 is the location as text in "%X/%X" form; input from which
                         :  * sscanf cannot read both hex fields is rejected with an error.  The
                         :  * result is a two-column row: file_name (text) and file_offset (int4).
                         :  * The file name is built with ThisTimeLineID.
    6773                 :  */
    6774                 : Datum
    6775                 : pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
    6776               0 : {
    6777               0 :         text       *location = PG_GETARG_TEXT_P(0);
    6778                 :         char       *locationstr;
    6779                 :         unsigned int uxlogid;   /* sscanf %X target, copied into locationpoint below */
    6780                 :         unsigned int uxrecoff;  /* sscanf %X target, copied into locationpoint below */
    6781                 :         uint32          xlogid;
    6782                 :         uint32          xlogseg;
    6783                 :         uint32          xrecoff;
    6784                 :         XLogRecPtr      locationpoint;
    6785                 :         char            xlogfilename[MAXFNAMELEN];
    6786                 :         Datum           values[2];
    6787                 :         bool            isnull[2];
    6788                 :         TupleDesc       resultTupleDesc;
    6789                 :         HeapTuple       resultHeapTuple;
    6790                 :         Datum           result;
    6791                 : 
    6792                 :         /*
    6793                 :          * Read input and parse
    6794                 :          */
    6795               0 :         locationstr = DatumGetCString(DirectFunctionCall1(textout,
    6796                 :                                                                                                  PointerGetDatum(location)));
    6797                 : 
    6798               0 :         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
    6799               0 :                 ereport(ERROR,
    6800                 :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6801                 :                                  errmsg("could not parse transaction log location \"%s\"",
    6802                 :                                                 locationstr)));
    6803                 : 
    6804               0 :         locationpoint.xlogid = uxlogid;
    6805               0 :         locationpoint.xrecoff = uxrecoff;
    6806                 : 
    6807                 :         /*
    6808                 :          * Construct a tuple descriptor for the result row.  This must match this
    6809                 :          * function's pg_proc entry!
    6810                 :          */
    6811               0 :         resultTupleDesc = CreateTemplateTupleDesc(2, false);
    6812               0 :         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
    6813                 :                                            TEXTOID, -1, 0);
    6814               0 :         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
    6815                 :                                            INT4OID, -1, 0);
    6816                 : 
    6817               0 :         resultTupleDesc = BlessTupleDesc(resultTupleDesc);
    6818                 : 
    6819                 :         /*
    6820                 :          * xlogfilename
    6821                 :          */
    6822               0 :         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
    6823               0 :         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
    6824                 : 
    6825               0 :         values[0] = DirectFunctionCall1(textin,
    6826                 :                                                                         CStringGetDatum(xlogfilename));
    6827               0 :         isnull[0] = false;
    6828                 : 
    6829                 :         /*
    6830                 :          * offset
                         :          * (xrecoff of the location minus xlogseg * XLogSegSize, using the
                         :          * segment number computed by XLByteToPrevSeg above)
    6831                 :          */
    6832               0 :         xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;
    6833                 : 
    6834               0 :         values[1] = UInt32GetDatum(xrecoff);
    6835               0 :         isnull[1] = false;
    6836                 : 
    6837                 :         /*
    6838                 :          * Tuple jam: Having first prepared your Datums, then squash together
    6839                 :          */
    6840               0 :         resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
    6841                 : 
    6842               0 :         result = HeapTupleGetDatum(resultHeapTuple);
    6843                 : 
    6844               0 :         PG_RETURN_DATUM(result);
    6845                 : }
    6846                 : 
    6847                 : /*
    6848                 :  * Compute an xlog file name given a WAL location,
    6849                 :  * such as is returned by pg_stop_backup() or pg_switch_xlog().
                         :  *
                         :  * Argument 0 is the location as text in "%X/%X" form; input from which
                         :  * sscanf cannot read both hex fields is rejected with an error.  The
                         :  * segment is chosen with XLByteToPrevSeg, the same macro that
                         :  * pg_xlogfile_name_offset uses, and the name is built with
                         :  * ThisTimeLineID.  Returns the file name as text.
    6850                 :  */
    6851                 : Datum
    6852                 : pg_xlogfile_name(PG_FUNCTION_ARGS)
    6853               0 : {
    6854               0 :         text       *location = PG_GETARG_TEXT_P(0);
    6855                 :         text       *result;
    6856                 :         char       *locationstr;
    6857                 :         unsigned int uxlogid;   /* sscanf %X target, copied into locationpoint below */
    6858                 :         unsigned int uxrecoff;  /* sscanf %X target, copied into locationpoint below */
    6859                 :         uint32          xlogid;
    6860                 :         uint32          xlogseg;
    6861                 :         XLogRecPtr      locationpoint;
    6862                 :         char            xlogfilename[MAXFNAMELEN];
    6863                 : 
    6864               0 :         locationstr = DatumGetCString(DirectFunctionCall1(textout,
    6865                 :                                                                                                  PointerGetDatum(location)));
    6866                 : 
    6867               0 :         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
    6868               0 :                 ereport(ERROR,
    6869                 :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6870                 :                                  errmsg("could not parse transaction log location \"%s\"",
    6871                 :                                                 locationstr)));
    6872                 : 
    6873               0 :         locationpoint.xlogid = uxlogid;
    6874               0 :         locationpoint.xrecoff = uxrecoff;
    6875                 : 
    6876               0 :         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
    6877               0 :         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
    6878                 : 
    6879               0 :         result = DatumGetTextP(DirectFunctionCall1(textin,
    6880                 :                                                                                          CStringGetDatum(xlogfilename)));
    6881               0 :         PG_RETURN_TEXT_P(result);
    6882                 : }
    6883                 : 
    6884                 : /*
    6885                 :  * read_backup_label: check to see if a backup_label file is present
    6886                 :  *
    6887                 :  * If we see a backup_label during recovery, we assume that we are recovering
    6888                 :  * from a backup dump file, and we therefore roll forward from the checkpoint
    6889                 :  * identified by the label file, NOT what pg_control says.  This avoids the
    6890                 :  * problem that pg_control might have been archived one or more checkpoints
    6891                 :  * later than the start of the dump, and so if we rely on it as the start
    6892                 :  * point, we will fail to restore a consistent database state.
    6893                 :  *
    6894                 :  * We also attempt to retrieve the corresponding backup history file.
    6895                 :  * If successful, set *minRecoveryLoc to constrain valid PITR stopping
    6896                 :  * points.
    6897                 :  *
    6898                 :  * Returns TRUE if a backup_label was found (and fills the checkpoint
    6899                 :  * location into *checkPointLoc); returns FALSE if not.
    6900                 :  */
    6901                 : static bool
    6902                 : read_backup_label(XLogRecPtr *checkPointLoc, XLogRecPtr *minRecoveryLoc)
    6903              14 : {
    6904                 :         XLogRecPtr      startpoint;
    6905                 :         XLogRecPtr      stoppoint;
    6906                 :         char            histfilename[MAXFNAMELEN];
    6907                 :         char            histfilepath[MAXPGPATH];
    6908                 :         char            startxlogfilename[MAXFNAMELEN];
    6909                 :         char            stopxlogfilename[MAXFNAMELEN];
    6910                 :         TimeLineID      tli;
    6911                 :         uint32          _logId;
    6912                 :         uint32          _logSeg;
    6913                 :         FILE       *lfp;
    6914                 :         FILE       *fp;
    6915                 :         char            ch;
    6916                 : 
    6917                 :         /* Default is to not constrain recovery stop point */
    6918              14 :         minRecoveryLoc->xlogid = 0;
    6919              14 :         minRecoveryLoc->xrecoff = 0;
    6920                 : 
    6921                 :         /*
    6922                 :          * See if label file is present
    6923                 :          */
    6924              14 :         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
    6925              14 :         if (!lfp)
    6926                 :         {
    6927              14 :                 if (errno != ENOENT)
    6928               0 :                         ereport(FATAL,
    6929                 :                                         (errcode_for_file_access(),
    6930                 :                                          errmsg("could not read file \"%s\": %m",
    6931                 :                                                         BACKUP_LABEL_FILE)));
    6932              14 :                 return false;                   /* it's not there, all is fine */
    6933                 :         }
    6934                 : 
    6935                 :         /*
    6936                 :          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
    6937                 :          * is pretty crude, but we are not expecting any variability in the file
    6938                 :          * format).
    6939                 :          */
    6940               0 :         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
    6941                 :                            &startpoint.xlogid, &startpoint.xrecoff, &tli,
    6942                 :                            startxlogfilename, &ch) != 5 || ch != '\n')
    6943               0 :                 ereport(FATAL,
    6944                 :                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6945                 :                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    6946               0 :         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
    6947                 :                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
    6948                 :                            &ch) != 3 || ch != '\n')
    6949               0 :                 ereport(FATAL,
    6950                 :                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6951                 :                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    6952               0 :         if (ferror(lfp) || FreeFile(lfp))
    6953               0 :                 ereport(FATAL,
    6954                 :                                 (errcode_for_file_access(),
    6955                 :                                  errmsg("could not read file \"%s\": %m",
    6956                 :                                                 BACKUP_LABEL_FILE)));
    6957                 : 
    6958                 :         /*
    6959                 :          * Try to retrieve the backup history file (no error if we can't)
    6960                 :          */
    6961               0 :         XLByteToSeg(startpoint, _logId, _logSeg);
    6962               0 :         BackupHistoryFileName(histfilename, tli, _logId, _logSeg,
    6963                 :                                                   startpoint.xrecoff % XLogSegSize);
    6964                 : 
    6965               0 :         if (InArchiveRecovery)
    6966               0 :                 RestoreArchivedFile(histfilepath, histfilename, "RECOVERYHISTORY", 0);
    6967                 :         else
    6968               0 :                 BackupHistoryFilePath(histfilepath, tli, _logId, _logSeg,
    6969                 :                                                           startpoint.xrecoff % XLogSegSize);
    6970                 : 
    6971               0 :         fp = AllocateFile(histfilepath, "r");
    6972               0 :         if (fp)
    6973                 :         {
    6974                 :                 /*
    6975                 :                  * Parse history file to identify stop point.
    6976                 :                  */
    6977               0 :                 if (fscanf(fp, "START WAL LOCATION: %X/%X (file %24s)%c",
    6978                 :                                    &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
    6979                 :                                    &ch) != 4 || ch != '\n')
    6980               0 :                         ereport(FATAL,
    6981                 :                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6982                 :                                          errmsg("invalid data in file \"%s\"", histfilename)));
    6983               0 :                 if (fscanf(fp, "STOP WAL LOCATION: %X/%X (file %24s)%c",
    6984                 :                                    &stoppoint.xlogid, &stoppoint.xrecoff, stopxlogfilename,
    6985                 :                                    &ch) != 4 || ch != '\n')
    6986               0 :                         ereport(FATAL,
    6987                 :                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    6988                 :                                          errmsg("invalid data in file \"%s\"", histfilename)));
    6989               0 :                 *minRecoveryLoc = stoppoint;
    6990               0 :                 if (ferror(fp) || FreeFile(fp))
    6991               0 :                         ereport(FATAL,
    6992                 :                                         (errcode_for_file_access(),
    6993                 :                                          errmsg("could not read file \"%s\": %m",
    6994                 :                                                         histfilepath)));
    6995                 :         }
    6996                 : 
    6997               0 :         return true;
    6998                 : }
    6999                 : 
    7000                 : /*
    7001                 :  * Error context callback for errors occurring during rm_redo().
                         :  *
                         :  * arg is interpreted as a pointer to an XLogRecord.  The entry of
                         :  * RmgrTable selected by the record's xl_rmid supplies an rm_desc
                         :  * routine, which is handed a freshly initialized StringInfo together
                         :  * with the record's xl_info and the result of XLogRecGetData(record).
                         :  * If rm_desc left anything in the buffer, that text is attached to the
                         :  * error being reported via errcontext, prefixed with xlog redo.  The
                         :  * buffer's data is released with pfree before returning in either case.
                         :  *
                         :  * NOTE(review): xl_rmid is used as an array index without a range check
                         :  * in this function; presumably the record has been validated before
                         :  * this callback can run -- confirm against the caller.
    7002                 :  */
    7003                 : static void
    7004                 : rm_redo_error_callback(void *arg)
    7005               0 : {
    7006               0 :         XLogRecord *record = (XLogRecord *) arg;
    7007                 :         StringInfoData buf;
    7008                 : 
    7009               0 :         initStringInfo(&buf);
    7010               0 :         RmgrTable[record->xl_rmid].rm_desc(&buf,
    7011                 :                                                                            record->xl_info,
    7012                 :                                                                            XLogRecGetData(record));
    7013                 : 
    7014                 :         /* don't bother emitting empty description */
    7015               0 :         if (buf.len > 0)
    7016               0 :                 errcontext("xlog redo %s", buf.data);
    7017                 : 
    7018               0 :         pfree(buf.data);
    7019               0 : }

Generated by: LTP GCOV extension version 1.5