相关代码的路径如下所示:/src/backend/storage/smgr。外存管理负责处理数据库与外存介质(在PostgreSQL中只实现了磁盘的管理操作)的交互过程。在PostgreSQL中,外存管理由SMGR(主要代码在smgr.c中)提供对外存操作的统一接口。SMGR负责统管各种介质管理器,会根据上层的请求选择具体的介质管理器进行操作。这里介绍磁盘管理器,每个表文件在磁盘中都以一定的结构进行存储,针对磁盘,外存管理模块提供了磁盘管理器和VFD机制。在PG 8.4.1版本中,还为每个表文件创建了两个附属文件,即空闲空间映射表文件(FSM)和可见性映射表文件(VM)。另外,对于大数据存储,PG也提供了两种处理机制。
smgr.c public interface routines to storage manager switch 存储管理器切换公共接口
smgrtype.c storage manager type 存储管理器类型
md.c manages relations that reside on magnetic disk 管理驻留在磁盘上的关系
存储管理器类型
smgrtype.c定义了存储管理器类型storage manager type(src/backend/storage/smgr/smgrtype.c)
1 typedef struct smgrid 2 { 3 const char *smgr_name; 4 } smgrid; 5 6 /* 7 * StorageManager[] -- List of defined storage managers. 8 */ 9 static const smgrid StorageManager[] = { 10 {"magnetic disk"} 11 }; 12 static const int NStorageManagers = lengthof(StorageManager);
smgrid结构体包含一个const char指针smgr_name,StorageManager是smgrid的数组,NStorageManagers是StorageManager中smgrid结构体元素的个数。目前只实现了磁盘管理器magnetic disk。
smgr文件函数的参数Datum smgrin(PG_FUNCTION_ARGS) --> unsigned long smgrin(struct FunctionCallInfoData * fcinfo)。
PG_GETARG_CSTRING(n)预定义为DatumGetCString(PG_GETARG_DATUM(n))、DatumGetCString(x)预定义为((char *) DatumGetPointer(x))、DatumGetPointer(x)预定义为((Pointer)(x))、typedef char* Pointer。PG_GETARG_DATUM(n)预定义为(fcinfo->arg[n])。PG_GETARG_CSTRING(0)-->DatumGetCString(PG_GETARG_DATUM(0))-->((char *) DatumGetPointer(PG_GETARG_DATUM(0)))-->((char *) ((Pointer)(PG_GETARG_DATUM(0))))-->((char *) ((char*)(PG_GETARG_DATUM(0))))-->((char *) ((char*)((fcinfo->arg[0]))))。
PG_RETURN_INT16(x)预定义为return Int16GetDatum(x),Int16GetDatum(x)预定义为((Datum)SET_2_BYTES(x))、SET_2_BYTES(value)预定义为((Datum)(value)&0x0000ffff)。PG_RETURN_INT16(0)-->return Int16GetDatum(0)-->return ((Datum)SET_2_BYTES(0))-->return ((unsigned long)((unsigned long)(0)&0x0000ffff))。
smgrin函数用于查找输入参数中指定存储管理器在存储管理器数组中的序号
#define PG_FUNCTION_ARGS FunctionCallInfo fcinfo
typedef struct FunctionCallInfoData *FunctionCallInfo;
1 Datum smgrin(PG_FUNCTION_ARGS) 2 { 3 char *s = PG_GETARG_CSTRING(0); 4 int16 i; 5 6 for (i = 0; i < NStorageManagers; i++) 7 { 8 if (strcmp(s, StorageManager[i].smgr_name) == 0) 9 PG_RETURN_INT16(i); 10 } 11 elog(ERROR, "unrecognized storage manager name \"%s\"", s); 12 PG_RETURN_INT16(0); 13 }
PG_GETARG_INT16(n)预定义为DatumGetInt16(PG_GETARG_DATUM(n))、DatumGetInt16(x)预定义为((int16)GET_2_BYTES(x))、GET_2_BYTES(datum)预定义为(((Datum)(datum))&0x0000ffff)、PG_GETARG_DATUM(n)预定义为(fcinfo->arg[n])。
PG_GETARG_INT16(0)-->DatumGetInt16(PG_GETARG_DATUM(0))-->((int16)GET_2_BYTES(PG_GETARG_DATUM(0)))-->((int16)(((unsigned long)(PG_GETARG_DATUM(0)))&0x0000ffff))-->((int16)(((unsigned long)((fcinfo->arg[0])))&0x0000ffff))
PG_RETURN_CSTRING(x)预定义为return CStringGetDatum(x)、CStringGetDatum(x)预定义为PointerGetDatum(x)、PointerGetDatum(x)预定义为((Datum)(x))
PG_RETURN_CSTRING(x)-->return CStringGetDatum(x)-->return PointerGetDatum(x)-->return ((Datum)(x))-->return ((unsigned long)(x))
smgrout函数用于提取输入参数中指定存储管理器序号的管理器名并返回
1 Datum smgrout(PG_FUNCTION_ARGS) 2 { 3 int16 i = PG_GETARG_INT16(0); 4 char *s; 5 6 if (i >= NStorageManagers || i < 0) 7 elog(ERROR, "invalid storage manager id: %d", i); 8 9 s = pstrdup(StorageManager[i].smgr_name); 10 PG_RETURN_CSTRING(s); 11 }
PG_RETURN_BOOL(x)-->return BoolGetDatum(x)-->return ((Datum)((x)?1:0))-->return ((unsigned long)((x)?1:0))
smgreq和smgrne用于判别输入参数的arg[0]和arg[1]是否相同或不同
1 Datum smgreq(PG_FUNCTION_ARGS) 2 { 3 int16 a = PG_GETARG_INT16(0); 4 int16 b = PG_GETARG_INT16(1); 5 PG_RETURN_BOOL(a == b); 6 } 7 8 Datum smgrne(PG_FUNCTION_ARGS) 9 { 10 int16 a = PG_GETARG_INT16(0); 11 int16 b = PG_GETARG_INT16(1); 12 PG_RETURN_BOOL(a != b); 13 }
存储管理器切换公共接口
此函数指针结构体定义了smgr.c与任何单独的存储管理器模块之间的API。 请注意,通常期望smgr子功能通过elog(ERROR)报告问题。 一个例外是smgr_unlink应该使用elog(WARNING)而不是直接报错退出,因为我们通常是在事务提交/中止之后的清理阶段才unlink关系文件,此时再抛出错误为时已晚。 另外,在bootstrap和/或WAL恢复期间,应当允许某些通常被视为错误的情况出现——有关详细信息,请参见md.c中的注释。src/backend/storage/smgr/smgr.c
1 typedef struct f_smgr 2 { 3 void (*smgr_init) (void); /* may be NULL */ 4 void (*smgr_shutdown) (void); /* may be NULL */ 5 void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); 6 void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, 7 bool isRedo); 8 bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); 9 void (*smgr_unlink) (RelFileNode rnode, ForkNumber forknum, 10 bool isRedo); 11 void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, 12 BlockNumber blocknum, char *buffer, bool isTemp); 13 void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, 14 BlockNumber blocknum); 15 void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, 16 BlockNumber blocknum, char *buffer); 17 void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, 18 BlockNumber blocknum, char *buffer, bool isTemp); 19 BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); 20 void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, 21 BlockNumber nblocks, bool isTemp); 22 void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); 23 void (*smgr_pre_ckpt) (void); /* may be NULL */ 24 void (*smgr_sync) (void); /* may be NULL */ 25 void (*smgr_post_ckpt) (void); /* may be NULL */ 26 } f_smgr; 27 static const f_smgr smgrsw[] = { 28 /* magnetic disk */ //这里的函数定义在md.c中 29 {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend, 30 mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync, 31 mdpreckpt, mdsync, mdpostckpt 32 } 33 }; 34 // NSmgr是smgrsw数组中f_smgr结构体元素个数 35 static const int NSmgr = lengthof(smgrsw);
静态smgrshutdown和smgr_internal_unlink函数
1 /* local function prototypes */ 2 static void smgrshutdown(int code, Datum arg); 3 static void smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum, 4 int which, bool isTemp, bool isRedo);
smgrinit和smgrshutdown函数
smgrinit()、smgrshutdown()初始化或关闭存储管理器。smgrinit在后端backend启动(正常或独立情况)期间被调用,而不在postmaster启动期间调用。 因此,在此处创建或在smgrshutdown中销毁的任何资源都是后端本地的。smgrinit函数顺序调用smgrsw结构体数组中相应存储管理器的初始化函数。最后注册smgrshutdown函数。
1 void smgrinit(void) 2 { 3 int i; 4 5 for (i = 0; i < NSmgr; i++) 6 { 7 if (smgrsw[i].smgr_init) 8 (*(smgrsw[i].smgr_init)) (); 9 } 10 11 /* register the shutdown proc */ 12 on_proc_exit(smgrshutdown, 0); 13 }
smgrshutdown是在backend关闭时用于smgr清理工作的函数,需要在smgrinit函数执行结束前通过on_proc_exit挂载到进程退出钩子上。
1 static void smgrshutdown(int code, Datum arg) 2 { 3 int i; 4 for (i = 0; i < NSmgr; i++) 5 { 6 if (smgrsw[i].smgr_shutdown) 7 (*(smgrsw[i].smgr_shutdown)) (); 8 } 9 }
smgropen函数
smgropen()返回一个SMgrRelation对象,需要时创建(不尝试打开该对象)。参数rnode是包含表空间、数据库、表oid的结构体。当第一次调用smgropen时需要初始化SMgrRelationHash指向的哈希表,它包含了所有extant(现存的)SMgrRelation对象。从代码逻辑可以看出,该哈希表的key是RelFileNode,value是SMgrRelationData,哈希函数是tag_hash。接下来在该哈希表中查找或者创建形参指定的条目。如果没有找到,则需要创建该条目,并初始化smgr_owner、smgr_which和md_fd数组(下标从0到MAX_FORKNUM)。
1 SMgrRelation smgropen(RelFileNode rnode) 2 { 3 SMgrRelation reln; 4 bool found; 5 if (SMgrRelationHash == NULL) // 存储了所有extant SMgrRelation对象的哈希表 6 { 7 /* First time through: initialize the hash table */ 8 HASHCTL ctl; 9 MemSet(&ctl, 0, sizeof(ctl)); 10 ctl.keysize = sizeof(RelFileNode); 11 ctl.entrysize = sizeof(SMgrRelationData); 12 ctl.hash = tag_hash; 13 SMgrRelationHash = hash_create("smgr relation table", 400, 14 &ctl, HASH_ELEM | HASH_FUNCTION); 15 } 16 /* Look up or create an entry */ 17 reln = (SMgrRelation) hash_search(SMgrRelationHash, 18 (void *) &rnode, 19 HASH_ENTER, &found); 20 /* Initialize it if not present before */ 21 if (!found) 22 { 23 int forknum; 24 /* hash_search already filled in the lookup key */ 25 reln->smgr_owner = NULL; 26 reln->smgr_which = 0; /* we only have md.c at present */ 27 /* mark it not open */ 28 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) 29 reln->md_fd[forknum] = NULL; 30 } 31 return reln; 32 }
smgrsetowner()函数建立到SMgrRelation对象的long-lived引用。参数SMgrRelation *其实是SMgrRelationData **,而SMgrRelationData结构体中包含了同为SMgrRelationData **类型的smgr_owner成员。也就是需要先卸载老的owner,然后将新owner放入reln结构体,并让*owner指向reln。
1 void smgrsetowner(SMgrRelation *owner, SMgrRelation reln) 2 { 3 /* 4 * First, unhook any old owner. (Normally there shouldn't be any, but it 5 * seems possible that this can happen during swap_relation_files() 6 * depending on the order of processing. It's ok to close the old 7 * relcache entry early in that case.) 8 */ 9 if (reln->smgr_owner) 10 *(reln->smgr_owner) = NULL; 11 12 /* Now establish the ownership relationship. */ 13 reln->smgr_owner = owner; 14 *owner = reln; 15 }
smgrexists函数判别底层文件是否存在
1 bool smgrexists(SMgrRelation reln, ForkNumber forknum) 2 { 3 return (*(smgrsw[reln->smgr_which].smgr_exists)) (reln, forknum); 4 }
smgrclose()函数关闭和删除SMgrRelation对象
1 void smgrclose(SMgrRelation reln) 2 { 3 SMgrRelation *owner; 4 ForkNumber forknum; 5 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) 6 (*(smgrsw[reln->smgr_which].smgr_close)) (reln, forknum); 7 owner = reln->smgr_owner; 8 if (hash_search(SMgrRelationHash, 9 (void *) &(reln->smgr_rnode), 10 HASH_REMOVE, NULL) == NULL) 11 elog(ERROR, "SMgrRelation hashtable corrupted"); 12 /* 13 * Unhook the owner pointer, if any. We do this last since in the remote 14 * possibility of failure above, the SMgrRelation object will still exist. 15 */ 16 if (owner) 17 *owner = NULL; 18 }
关闭所有存在的SMgrRelation对象
1 void smgrcloseall(void) 2 { 3 HASH_SEQ_STATUS status; 4 SMgrRelation reln; 5 /* Nothing to do if hashtable not set up */ 6 if (SMgrRelationHash == NULL) 7 return; 8 hash_seq_init(&status, SMgrRelationHash); 9 while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL) 10 smgrclose(reln); 11 }
smgrclosenode()函数关闭给定RelFileNode关联的SMgrRelation对象(如果存在)
1 void smgrclosenode(RelFileNode rnode) 2 { 3 SMgrRelation reln; 4 /* Nothing to do if hashtable not set up */ 5 if (SMgrRelationHash == NULL) 6 return; 7 reln = (SMgrRelation) hash_search(SMgrRelationHash, 8 (void *) &rnode, 9 HASH_FIND, NULL); 10 if (reln != NULL) 11 smgrclose(reln); 12 }
smgrcreate函数创建新关系
* Given an already-created (but presumably unused) SMgrRelation,
* cause the underlying disk file or other storage for the fork
* to be created.
*
* If isRedo is true, it is okay for the underlying file to exist
* already because we are in a WAL replay sequence.
1 void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) 2 { 3 /* 4 * Exit quickly in WAL replay mode if we've already opened the file. If 5 * it's open, it surely must exist. 6 */ 7 if (isRedo && reln->md_fd[forknum] != NULL) 8 return; 9 10 /* 11 * We may be using the target table space for the first time in this 12 * database, so create a per-database subdirectory if needed. 13 * 14 * XXX this is a fairly ugly violation of module layering, but this seems 15 * to be the best place to put the check. Maybe TablespaceCreateDbspace 16 * should be here and not in commands/tablespace.c? But that would imply 17 * importing a lot of stuff that smgr.c oughtn't know, either. 18 */ 19 TablespaceCreateDbspace(reln->smgr_rnode.spcNode, 20 reln->smgr_rnode.dbNode, 21 isRedo); 22 23 (*(smgrsw[reln->smgr_which].smgr_create)) (reln, forknum, isRedo); 24 }
等等函数,后续博客继续分析
关于SMgrRelationData、SMgrRelationHash(HTAB类型的指针)之间的关系
static HTAB *SMgrRelationHash = NULL。