本节简单介绍了PostgreSQL缓存管理(Buffer Manager)中的一个实现函数ReadBufferExtended,该函数返回所请求关系数据块对应的buffer。
Relation
关系的内存结构.
/*
 * Here are the contents of a relation cache entry.
 *
 * RelationData is the in-memory descriptor of a relation; code normally
 * refers to it through the pointer typedef "Relation" declared right after
 * the struct.
 */
typedef struct RelationData
{
RelFileNode rd_node; /* relation physical identifier */
/* use "struct" here to avoid needing to include smgr.h: */
struct SMgrRelationData *rd_smgr; /* cached file handle, or NULL */
int rd_refcnt; /* reference count */
BackendId rd_backend; /* owning backend id, if temporary relation */
bool rd_islocaltemp; /* rel is a temp rel of this session */
bool rd_isnailed; /* rel is nailed in cache */
bool rd_isvalid; /* relcache entry is valid */
char rd_indexvalid; /* state of rd_indexlist: 0 = not valid, 1 =
 * valid, 2 = temporarily forced */
bool rd_statvalid; /* is rd_statlist valid? */
/*
 * rd_createSubid is the ID of the highest subtransaction the rel has
 * survived into; or zero if the rel was not created in the current top
 * transaction. This can now be relied on, whereas previously it could
 * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is
 * the ID of the highest subtransaction the relfilenode change has
 * survived into, or zero if not changed in the current transaction (or we
 * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten
 * when a relation has multiple new relfilenodes within a single
 * transaction, with one of them occurring in a subsequently aborted
 * subtransaction, e.g. BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t;
 * ROLLBACK TO save; -- rd_newRelfilenodeSubid is now forgotten
 */
SubTransactionId rd_createSubid; /* rel was created in current xact */
SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in
 * current xact */
Form_pg_class rd_rel; /* RELATION tuple */
TupleDesc rd_att; /* tuple descriptor */
Oid rd_id; /* relation's object id */
LockInfoData rd_lockInfo; /* lock mgr's info for locking relation */
RuleLock *rd_rules; /* rewrite rules */
MemoryContext rd_rulescxt; /* private memory cxt for rd_rules, if any */
TriggerDesc *trigdesc; /* Trigger info, or NULL if rel has none */
/* use "struct" here to avoid needing to include rowsecurity.h: */
struct RowSecurityDesc *rd_rsdesc; /* row security policies, or NULL */
/* data managed by RelationGetFKeyList: */
List *rd_fkeylist; /* list of ForeignKeyCacheInfo (see below) */
bool rd_fkeyvalid; /* true if list has been computed */
/* partitioning information, each piece kept in its own memory context: */
MemoryContext rd_partkeycxt; /* private memory cxt for the below */
struct PartitionKeyData *rd_partkey; /* partition key, or NULL */
MemoryContext rd_pdcxt; /* private context for partdesc */
struct PartitionDescData *rd_partdesc; /* partitions, or NULL */
List *rd_partcheck; /* partition CHECK quals */
/* data managed by RelationGetIndexList: */
List *rd_indexlist; /* list of OIDs of indexes on relation */
Oid rd_oidindex; /* OID of unique index on OID, if any */
Oid rd_pkindex; /* OID of primary key, if any */
Oid rd_replidindex; /* OID of replica identity index, if any */
/* data managed by RelationGetStatExtList: */
List *rd_statlist; /* list of OIDs of extended stats */
/* data managed by RelationGetIndexAttrBitmap: */
Bitmapset *rd_indexattr; /* columns used in non-projection indexes */
Bitmapset *rd_projindexattr; /* columns used in projection indexes */
Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */
Bitmapset *rd_pkattr; /* cols included in primary key */
Bitmapset *rd_idattr; /* included in replica identity index */
Bitmapset *rd_projidx; /* Oids of projection indexes */
PublicationActions *rd_pubactions; /* publication actions */
/*
 * rd_options is set whenever rd_rel is loaded into the relcache entry.
 * Note that you can NOT look into rd_rel for this data. NULL means "use
 * defaults".
 */
bytea *rd_options; /* parsed pg_class.reloptions */
/* These are non-NULL only for an index relation: */
Form_pg_index rd_index; /* pg_index tuple describing this index */
/* use "struct" here to avoid needing to include htup.h: */
struct HeapTupleData *rd_indextuple; /* all of pg_index tuple */
/*
 * index access support info (used only for an index relation)
 *
 * Note: only default support procs for each opclass are cached, namely
 * those with lefttype and righttype equal to the opclass's opcintype. The
 * arrays are indexed by support function number, which is a sufficient
 * identifier given that restriction.
 *
 * Note: rd_amcache is available for index AMs to cache private data about
 * an index. This must be just a cache since it may get reset at any time
 * (in particular, it will get reset by a relcache inval message for the
 * index). If used, it must point to a single memory chunk palloc'd in
 * rd_indexcxt. A relcache reset will include freeing that chunk and
 * setting rd_amcache = NULL.
 */
Oid rd_amhandler; /* OID of index AM's handler function */
MemoryContext rd_indexcxt; /* private memory cxt for this stuff */
/* use "struct" here to avoid needing to include amapi.h: */
struct IndexAmRoutine *rd_amroutine; /* index AM's API struct */
Oid *rd_opfamily; /* OIDs of op families for each index col */
Oid *rd_opcintype; /* OIDs of opclass declared input data types */
RegProcedure *rd_support; /* OIDs of support procedures */
FmgrInfo *rd_supportinfo; /* lookup info for support procedures */
int16 *rd_indoption; /* per-column AM-specific flags */
List *rd_indexprs; /* index expression trees, if any */
List *rd_indpred; /* index predicate tree, if any */
Oid *rd_exclops; /* OIDs of exclusion operators, if any */
Oid *rd_exclprocs; /* OIDs of exclusion ops' procs, if any */
uint16 *rd_exclstrats; /* exclusion ops' strategy numbers, if any */
void *rd_amcache; /* available for use by index AM */
Oid *rd_indcollation; /* OIDs of index collations */
/*
 * foreign-table support
 *
 * rd_fdwroutine must point to a single memory chunk palloc'd in
 * CacheMemoryContext. It will be freed and reset to NULL on a relcache
 * reset.
 */
/* use "struct" here to avoid needing to include fdwapi.h: */
struct FdwRoutine *rd_fdwroutine; /* cached function pointers, or NULL */
/*
 * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new
 * version of a table, we need to make any toast pointers inserted into it
 * have the existing toast table's OID, not the OID of the transient toast
 * table. If rd_toastoid isn't InvalidOid, it is the OID to place in
 * toast pointers inserted into this rel. (Note it's set on the new
 * version of the main heap, not the toast table itself.) This also
 * causes toast_save_datum() to try to preserve toast value OIDs.
 */
Oid rd_toastoid; /* Real TOAST table's OID, or InvalidOid */
/* use "struct" here to avoid needing to include pgstat.h: */
struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
} RelationData;
/* Relation: pointer to a RelationData relation cache entry. */
typedef struct RelationData *Relation;
BufferAccessStrategy
buffer访问策略
/*
 * Buffer identifiers.
 *
 * Zero is invalid, positive is the index of a shared buffer (1..NBuffers),
 * negative is the index of a local buffer (-1 .. -NLocBuffer).
 */
typedef int Buffer;
/* The invalid Buffer value (zero, per the numbering scheme above). */
#define InvalidBuffer 0
/*
 * Buffer access strategy objects.
 *
 * BufferAccessStrategyData is private to freelist.c; this typedef names a
 * pointer to that struct.
 */
typedef struct BufferAccessStrategyData *BufferAccessStrategy;
/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
/* Overall strategy type */
BufferAccessStrategyType btype;
/* Number of elements in buffers[] array */
int ring_size;
/*
 * Index of the "current" slot in the ring, ie, the one most recently
 * returned by GetBufferFromRing.
 */
int current;
/*
 * True if the buffer just returned by StrategyGetBuffer had been in the
 * ring already.
 */
bool current_was_in_ring;
/*
 * Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
 * have not yet selected a buffer for this ring slot. For allocation
 * simplicity this is palloc'd together with the fixed fields of the
 * struct.
 */
Buffer buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;
/*
 * Block is declared as an untyped (void *) pointer.
 * NOTE(review): presumably it addresses the in-memory contents of a buffer
 * block -- confirm against the header this excerpt was taken from.
 */
typedef void *Block;
/*
 * Possible arguments for GetAccessStrategy().
 *
 * The chosen value is what BufferAccessStrategyData.btype records as the
 * overall strategy type.
 */
typedef enum BufferAccessStrategyType
{
BAS_NORMAL, /* Normal random access */
BAS_BULKREAD, /* Large read-only scan (hint bit updates are
 * ok) */
BAS_BULKWRITE, /* Large multi-block write (e.g. COPY IN) */
BAS_VACUUM /* VACUUM */
} BufferAccessStrategyType;
ReadBufferMode
ReadBufferExtended函数所可能使用的读取模式.
/*
 * Read modes accepted by ReadBufferExtended().
 *
 * In RBM_NORMAL mode, the page is read from disk, and the page header is
 * validated. An error is thrown if the page header is not valid. (But
 * note that an all-zero page is considered "valid"; see PageIsVerified().)
 *
 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
 * valid, the page is zeroed instead of throwing an error. This is intended
 * for non-critical data, where the caller is prepared to repair errors.
 *
 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
 * filled with zeros instead of reading it from disk. Useful when the caller
 * is going to fill the page from scratch, since this saves I/O and avoids
 * unnecessary failure if the page-on-disk has corrupt page headers.
 * The page is returned locked to ensure that the caller has a chance to
 * initialize the page before it's made visible to others.
 * Caution: do not use this mode to read a page that is beyond the relation's
 * current physical EOF; that is likely to cause problems in md.c when
 * the page is modified and written out. P_NEW is OK, though.
 *
 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
 * a cleanup-strength lock on the page.
 *
 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
 */
/* Possible modes for ReadBufferExtended() */
typedef enum
{
RBM_NORMAL, /* Normal read */
RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will
 * initialize. Also locks the page. */
RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page
 * in "cleanup" mode */
RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
 * replay; otherwise same as RBM_NORMAL */
} ReadBufferMode;
ReadBufferExtended返回对应请求关系数据块的buffer,实现逻辑比较简单,详见代码.
主要的实现逻辑在ReadBuffer_common中,该函数后续再行介绍.
/*
 * ReadBufferExtended -- fetch the buffer holding a requested block of a
 *		relation.
 *
 * If blockNum is P_NEW, the relation file is extended and a new block is
 * allocated.  (The caller must make sure that only one backend tries to
 * extend a given relation at a time!)
 *
 * Returns the buffer number of the buffer containing the block read; the
 * returned buffer has been pinned.  On error this function does not
 * return: it reports the error through elog/ereport instead.
 *
 * The relation reln is assumed to have been opened already.
 *
 * Read modes:
 *
 * RBM_NORMAL: the page is read from disk and its page header is validated;
 * an invalid header raises an error.  (An all-zero page counts as "valid";
 * see PageIsVerified().)
 *
 * RBM_ZERO_ON_ERROR: like RBM_NORMAL, except that a page with an invalid
 * header is zeroed rather than raising an error.  Intended for
 * non-critical data where the caller is prepared to repair errors.
 *
 * RBM_ZERO_AND_LOCK: if the page is not already in the buffer cache it is
 * filled with zeros instead of being read from disk.  Useful when the
 * caller is going to fill the page from scratch: it saves I/O and avoids
 * an unnecessary failure when the on-disk page has a corrupt header.  The
 * page is returned locked so the caller can initialize it before it
 * becomes visible to others.  Caution: do not use this mode to read a page
 * beyond the relation's current physical EOF; that is likely to cause
 * problems in md.c when the page is modified and written out.  P_NEW is
 * OK, though.
 *
 * RBM_ZERO_AND_CLEANUP_LOCK: same as RBM_ZERO_AND_LOCK, but a
 * cleanup-strength lock is acquired on the page.
 *
 * RBM_NORMAL_NO_LOG: treated the same as RBM_NORMAL here.
 *
 * A non-NULL strategy selects a nondefault buffer access strategy; see
 * buffer/README for details.
 */
Buffer
ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
				   ReadBufferMode mode, BufferAccessStrategy strategy)
{
	Buffer		result;
	bool		found_in_cache;
	char		relpersistence;

	/* Make sure the relation is open at the smgr level. */
	RelationOpenSmgr(reln);

	/*
	 * Temporary relations owned by another session are off limits: we have
	 * no visibility into that session's local buffers, so we would likely
	 * read wrong data.
	 */
	if (RELATION_IS_OTHER_TEMP(reln))
	{
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));
	}

	/*
	 * Count the read, do the actual work in ReadBuffer_common, and then
	 * count a hit as well if the block was reported as a cache hit.
	 */
	pgstat_count_buffer_read(reln);

	relpersistence = reln->rd_rel->relpersistence;
	result = ReadBuffer_common(reln->rd_smgr, relpersistence,
							   forkNum, blockNum, mode, strategy,
							   &found_in_cache);

	if (found_in_cache)
	{
		pgstat_count_buffer_hit(reln);
	}

	return result;
}
使用bt查看调用栈
(gdb) bt
#0 ReadBufferExtended (reln=0x7f497fe72788, forkNum=MAIN_FORKNUM, blockNum=0, mode=RBM_NORMAL, strategy=0x0)
at bufmgr.c:647
#1 0x00000000004d974f in heapgetpage (scan=0x1d969d8, page=0) at heapam.c:379
#2 0x00000000004daeb2 in heapgettup_pagemode (scan=0x1d969d8, dir=ForwardScanDirection, nkeys=0, key=0x0) at heapam.c:837
#3 0x00000000004dcf2b in heap_getnext (scan=0x1d969d8, direction=ForwardScanDirection) at heapam.c:1842
#4 0x000000000070ec39 in SeqNext (node=0x1d95890) at nodeSeqscan.c:80
#5 0x00000000006e0ab0 in ExecScanFetch (node=0x1d95890, accessMtd=0x70eba9 <SeqNext>, recheckMtd=0x70ec74 <SeqRecheck>)
at execScan.c:95
#6 0x00000000006e0b22 in ExecScan (node=0x1d95890, accessMtd=0x70eba9 <SeqNext>, recheckMtd=0x70ec74 <SeqRecheck>)
at execScan.c:145
#7 0x000000000070ecbe in ExecSeqScan (pstate=0x1d95890) at nodeSeqscan.c:129
#8 0x00000000006dee2a in ExecProcNodeFirst (node=0x1d95890) at execProcnode.c:445
#9 0x00000000007021b8 in ExecProcNode (node=0x1d95890) at ../../../src/include/executor/executor.h:237
#10 0x00000000007022dd in ExecLimit (pstate=0x1d95680) at nodeLimit.c:95
#11 0x00000000006dee2a in ExecProcNodeFirst (node=0x1d95680) at execProcnode.c:445
#12 0x00000000006d3d8d in ExecProcNode (node=0x1d95680) at ../../../src/include/executor/executor.h:237
#13 0x00000000006d65c5 in ExecutePlan (estate=0x1d95468, planstate=0x1d95680, use_parallel_mode=false,
operation=CMD_SELECT, sendTuples=true, numberTuples=0, direction=ForwardScanDirection, dest=0x1d00ea8,
execute_once=true) at execMain.c:1723
#14 0x00000000006d4357 in standard_ExecutorRun (queryDesc=0x1cfdc28, direction=ForwardScanDirection, count=0,
execute_once=true) at execMain.c:364
#15 0x00000000006d417f in ExecutorRun (queryDesc=0x1cfdc28, direction=ForwardScanDirection, count=0, execute_once=true)
at execMain.c:307
#16 0x00000000008bffd4 in PortalRunSelect (portal=0x1d3ebf8, forward=true, count=0, dest=0x1d00ea8) at pquery.c:932
#17 0x00000000008bfc72 in PortalRun (portal=0x1d3ebf8, count=9223372036854775807, isTopLevel=true, run_once=true,
dest=0x1d00ea8, altdest=0x1d00ea8, completionTag=0x7ffc1fc513d0 "") at pquery.c:773
#18 0x00000000008b9cd4 in exec_simple_query (query_string=0x1cd8ec8 "select * from t1 limit 10;") at postgres.c:1145
---Type <return> to continue, or q <return> to quit---
#19 0x00000000008bdf5f in PostgresMain (argc=1, argv=0x1d05278, dbname=0x1d050e0 "testdb", username=0x1cd5ba8 "xdb")
at postgres.c:4182
#20 0x000000000081c16d in BackendRun (port=0x1cfae00) at postmaster.c:4361
#21 0x000000000081b8e0 in BackendStartup (port=0x1cfae00) at postmaster.c:4033
#22 0x0000000000817cda in ServerLoop () at postmaster.c:1706
#23 0x0000000000817590 in PostmasterMain (argc=1, argv=0x1cd3b60) at postmaster.c:1379
#24 0x0000000000741003 in main (argc=1, argv=0x1cd3b60) at main.c:228
(gdb)
逻辑较为简单,这里不再详细跟踪.
PG Source Code
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。