相關代碼的路徑如下所示:/src/backend/storage/smgr。外存管理負責處理數據庫與外存介質(在PostgreSQL中只實現了磁盤的管理操作)的交互過程。在PostgreSQL中,外存管理由SMGR(主要代碼在smgr.c中)提供對外存操作的統一接口。SMGR負責統管各種介質管理器,會根據上層的請求選擇具體的介質管理器進行操作。這里介紹磁盤管理器,每個表文件在磁盤中都以一定的結構進行存儲,針對磁盤,外存管理模塊提供了磁盤管理器和VFD機制。在PG 8.4.1版本中,還為每個表文件創建了兩個附屬文件,即空閑空間映射表文件(FSM)和可見性映射表文件(VM)。另外,對於大數據存儲,PG也提供了兩種處理機制。
smgr.c public interface routines to storage manager switch 存儲管理器切換公共接口
smgrtype.c storage manager type 存儲管理器類型
md.c manages relations that reside on magnetic disk 管理駐留在磁盤上的關系
存儲管理器類型
smgrtype.c定義了存儲管理器類型storage manager type(src/backend/storage/smgr/smgrtype.c)
1 typedef struct smgrid 2 { 3 const char *smgr_name; 4 } smgrid; 5 6 /* 7 * StorageManager[] -- List of defined storage managers. 8 */ 9 static const smgrid StorageManager[] = { 10 {"magnetic disk"} 11 }; 12 static const int NStorageManagers = lengthof(StorageManager);
smgrid結構體包含一個const char指針smgr_name,StorageManager是smgrid的數組,NStorageManagers是StorageManager中smgrid結構體元素的個數。目前只實現了磁盤管理器magnetic disk。
smgr文件函數的參數Datum smgrin(PG_FUNCTION_ARGS) --> unsigned long smgrin(struct FunctionCallInfoData * fcinfo)。
PG_GETARG_CSTRING(n)預定義為DatumGetCString(PG_GETARG_DATUM(n))、DatumGetCString(x)預定義為((char *) DatumGetPointer(x))、DatumGetPointer(x)預定義為((Pointer)(x))、typedef char* Pointer。PG_GETARG_DATUM(n)預定義為(fcinfo->arg[n])。PG_GETARG_CSTRING(0)-->DatumGetCString(PG_GETARG_DATUM(0))-->((char *) DatumGetPointer(PG_GETARG_DATUM(0)))-->((char *) ((Pointer)(PG_GETARG_DATUM(0))))-->((char *) ((char*)(PG_GETARG_DATUM(0))))-->((char *) ((char*)((fcinfo->arg[0]))))。
PG_RETURN_INT16(x)預定義為return Int16GetDatum(x),Int16GetDatum(x)預定義為((Datum)SET_2_BYTES(x))、SET_2_BYTES(value)預定義為((Datum)(value)&0x0000ffff)。PG_RETURN_INT16(0)-->return Int16GetDatum(0)-->return ((Datum)SET_2_BYTES(0))-->return ((unsigned long)((unsigned long)(0)&0x0000ffff))。
smgrin函數用於查找輸入參數中指定存儲管理器在存儲管理器數組中的序號
#define PG_FUNCTION_ARGS FunctionCallInfo fcinfo
typedef struct FunctionCallInfoData *FunctionCallInfo;
1 Datum smgrin(PG_FUNCTION_ARGS) 2 { 3 char *s = PG_GETARG_CSTRING(0); 4 int16 i; 5 6 for (i = 0; i < NStorageManagers; i++) 7 { 8 if (strcmp(s, StorageManager[i].smgr_name) == 0) 9 PG_RETURN_INT16(i); 10 } 11 elog(ERROR, "unrecognized storage manager name \"%s\"", s); 12 PG_RETURN_INT16(0); 13 }
PG_GETARG_INT16(n)預定義為DatumGetInt16(PG_GETARG_DATUM(n))、DatumGetInt16(x)預定義為((int16)GET_2_BYTES(x))、GET_2_BYTES(datum)預定義為(((Datum)(datum))&0x0000ffff)、PG_GETARG_DATUM(n)預定義為(fcinfo->arg[n])。
PG_GETARG_INT16(0)-->DatumGetInt16(PG_GETARG_DATUM(0))-->((int16)GET_2_BYTES(PG_GETARG_DATUM(0)))-->((int16)(((unsigned long)(PG_GETARG_DATUM(0)))&0x0000ffff))-->((int16)(((unsigned long)((fcinfo->arg[0])))&0x0000ffff))
PG_RETURN_CSTRING(x)預定義為return CStringGetDatum(x)、CStringGetDatum(x)預定義為PointerGetDatum(x)、PointerGetDatum(x)預定義為((Datum)(x))
PG_RETURN_CSTRING(x)-->return CStringGetDatum(x)-->return PointerGetDatum(x)-->return ((Datum)(x))-->return ((unsigned long)(x))
smgrout函數用於提取輸入參數中指定存儲管理器序號的管理器名並返回
1 Datum smgrout(PG_FUNCTION_ARGS) 2 { 3 int16 i = PG_GETARG_INT16(0); 4 char *s; 5 6 if (i >= NStorageManagers || i < 0) 7 elog(ERROR, "invalid storage manager id: %d", i); 8 9 s = pstrdup(StorageManager[i].smgr_name); 10 PG_RETURN_CSTRING(s); 11 }
PG_RETURN_BOOL(x)-->return BoolGetDatum(x)-->return ((Datum)((x)?1:0))-->return ((unsigned long)((x)?1:0))
smgreq和smgrne用於判別輸入參數的arg[0]和arg[1]是否相同或不同
1 Datum smgreq(PG_FUNCTION_ARGS) 2 { 3 int16 a = PG_GETARG_INT16(0); 4 int16 b = PG_GETARG_INT16(1); 5 PG_RETURN_BOOL(a == b); 6 } 7 8 Datum smgrne(PG_FUNCTION_ARGS) 9 { 10 int16 a = PG_GETARG_INT16(0); 11 int16 b = PG_GETARG_INT16(1); 12 PG_RETURN_BOOL(a != b); 13 }
存儲管理器切換公共接口
此函數指針結構體定義了smgr.c與任何單獨的存儲管理器模塊之間的API。 請注意,通常期望smgr子功能通過elog(ERROR)報告問題。 一個例外是smgr_unlink應該使用elog(WARNING)而不是erroring out,因為我們通常在提交后/中止清理期間取消鏈接的關系,因此raise error為時已晚。 另外,在bootstrap和/或WAL恢復期間應允許通常是錯誤的各種條件-有關詳細信息,請參見md.c中的注釋。src/backend/storage/smgr/smgr.c
1 typedef struct f_smgr 2 { 3 void (*smgr_init) (void); /* may be NULL */ 4 void (*smgr_shutdown) (void); /* may be NULL */ 5 void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); 6 void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, 7 bool isRedo); 8 bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); 9 void (*smgr_unlink) (RelFileNode rnode, ForkNumber forknum, 10 bool isRedo); 11 void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, 12 BlockNumber blocknum, char *buffer, bool isTemp); 13 void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, 14 BlockNumber blocknum); 15 void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, 16 BlockNumber blocknum, char *buffer); 17 void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, 18 BlockNumber blocknum, char *buffer, bool isTemp); 19 BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); 20 void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, 21 BlockNumber nblocks, bool isTemp); 22 void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); 23 void (*smgr_pre_ckpt) (void); /* may be NULL */ 24 void (*smgr_sync) (void); /* may be NULL */ 25 void (*smgr_post_ckpt) (void); /* may be NULL */ 26 } f_smgr; 27 static const f_smgr smgrsw[] = { 28 /* magnetic disk */ //這里的函數定義在md.c中 29 {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend, 30 mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync, 31 mdpreckpt, mdsync, mdpostckpt 32 } 33 }; 34 // NSmgr是smgrsw數組中f_smgr結構體元素個數 35 static const int NSmgr = lengthof(smgrsw);
靜態smgrshutdown和smgr_internal_unlink函數
1 /* local function prototypes */ 2 static void smgrshutdown(int code, Datum arg); 3 static void smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum, 4 int which, bool isTemp, bool isRedo);
smgrinit和smgrshutdown函數
smgrinit()、smgrshutdown()初始化或關閉存儲管理器。smgrinit在后端backend啟動(正常或獨立情況)期間被調用,而不在postmaster啟動期間調用。 因此,在此處創建或在smgrshutdown中銷毀的任何資源都是后端本地的。smgrinit函數順序調用smgrsw結構體數組中相應存儲管理器的初始化函數。最后注冊smgrshutdown函數。
1 void smgrinit(void) 2 { 3 int i; 4 5 for (i = 0; i < NSmgr; i++) 6 { 7 if (smgrsw[i].smgr_init) 8 (*(smgrsw[i].smgr_init)) (); 9 } 10 11 /* register the shutdown proc */ 12 on_proc_exit(smgrshutdown, 0); 13 }
smgrshutdown是在backend關閉時用於smgr清理工作的函數;它在smgrinit函數返回之前通過on_proc_exit注冊為進程退出時調用的鉤子。
1 static void smgrshutdown(int code, Datum arg) 2 { 3 int i; 4 for (i = 0; i < NSmgr; i++) 5 { 6 if (smgrsw[i].smgr_shutdown) 7 (*(smgrsw[i].smgr_shutdown)) (); 8 } 9 }
smgropen函數
smgropen()返回一個SMgrRelation對象,需要時創建(不嘗試打開該對象)。參數rnode是包含表空間、數據庫、表oid的結構體。當第一次調用smgropen時需要初始化SMgrRelationHash指向的哈希表,它包含了所有extant SMgrRelation對象。從代碼邏輯可以看出,該哈希表的key是RelFileNode,value是SMgrRelationData,哈希函數是tag_hash。接下來在該哈希表中查找或者創建形參指定的條目。如果沒有找到,則創建該條目,並初始化smgr_owner、smgr_which和md_fd數組(下標0到MAX_FORKNUM的元素全部置為NULL)。
1 SMgrRelation smgropen(RelFileNode rnode) 2 { 3 SMgrRelation reln; 4 bool found; 5 if (SMgrRelationHash == NULL) // 存儲了所有extant SMgrRelation對象的哈希表 6 { 7 /* First time through: initialize the hash table */ 8 HASHCTL ctl; 9 MemSet(&ctl, 0, sizeof(ctl)); 10 ctl.keysize = sizeof(RelFileNode); 11 ctl.entrysize = sizeof(SMgrRelationData); 12 ctl.hash = tag_hash; 13 SMgrRelationHash = hash_create("smgr relation table", 400, 14 &ctl, HASH_ELEM | HASH_FUNCTION); 15 } 16 /* Look up or create an entry */ 17 reln = (SMgrRelation) hash_search(SMgrRelationHash, 18 (void *) &rnode, 19 HASH_ENTER, &found); 20 /* Initialize it if not present before */ 21 if (!found) 22 { 23 int forknum; 24 /* hash_search already filled in the lookup key */ 25 reln->smgr_owner = NULL; 26 reln->smgr_which = 0; /* we only have md.c at present */ 27 /* mark it not open */ 28 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) 29 reln->md_fd[forknum] = NULL; 30 } 31 return reln; 32 }
smgrsetowner()函數建立到SMgrRelation對象的long-lived引用。參數SMgrRelation *其實是SMgrRelationData **,而SMgrRelationData結構體中的smgr_owner成員也是同樣的類型。也就是需要先卸載老的owner(將其指向的指針置為NULL),然后將新owner記錄到reln結構體中,並讓*owner指向reln。
1 void smgrsetowner(SMgrRelation *owner, SMgrRelation reln) 2 { 3 /* 4 * First, unhook any old owner. (Normally there shouldn't be any, but it 5 * seems possible that this can happen during swap_relation_files() 6 * depending on the order of processing. It's ok to close the old 7 * relcache entry early in that case.) 8 */ 9 if (reln->smgr_owner) 10 *(reln->smgr_owner) = NULL; 11 12 /* Now establish the ownership relationship. */ 13 reln->smgr_owner = owner; 14 *owner = reln; 15 }
smgrexists函數判別底層文件是否存在
1 bool smgrexists(SMgrRelation reln, ForkNumber forknum) 2 { 3 return (*(smgrsw[reln->smgr_which].smgr_exists)) (reln, forknum); 4 }
smgrclose()函數關閉和刪除SMgrRelation對象
1 void smgrclose(SMgrRelation reln) 2 { 3 SMgrRelation *owner; 4 ForkNumber forknum; 5 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) 6 (*(smgrsw[reln->smgr_which].smgr_close)) (reln, forknum); 7 owner = reln->smgr_owner; 8 if (hash_search(SMgrRelationHash, 9 (void *) &(reln->smgr_rnode), 10 HASH_REMOVE, NULL) == NULL) 11 elog(ERROR, "SMgrRelation hashtable corrupted"); 12 /* 13 * Unhook the owner pointer, if any. We do this last since in the remote 14 * possibility of failure above, the SMgrRelation object will still exist. 15 */ 16 if (owner) 17 *owner = NULL; 18 }
關閉所有存在的SMgrRelation對象
1 void smgrcloseall(void) 2 { 3 HASH_SEQ_STATUS status; 4 SMgrRelation reln; 5 /* Nothing to do if hashtable not set up */ 6 if (SMgrRelationHash == NULL) 7 return; 8 hash_seq_init(&status, SMgrRelationHash); 9 while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL) 10 smgrclose(reln); 11 }
smgrclosenode()函數關閉給定RelFileNode關聯的SMgrRelation對象(如果該對象存在)
1 void smgrclosenode(RelFileNode rnode) 2 { 3 SMgrRelation reln; 4 /* Nothing to do if hashtable not set up */ 5 if (SMgrRelationHash == NULL) 6 return; 7 reln = (SMgrRelation) hash_search(SMgrRelationHash, 8 (void *) &rnode, 9 HASH_FIND, NULL); 10 if (reln != NULL) 11 smgrclose(reln); 12 }
smgrcreate函數創建新關系
* Given an already-created (but presumably unused) SMgrRelation,
* cause the underlying disk file or other storage for the fork
* to be created.
*
* If isRedo is true, it is okay for the underlying file to exist
* already because we are in a WAL replay sequence.
1 void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) 2 { 3 /* 4 * Exit quickly in WAL replay mode if we've already opened the file. If 5 * it's open, it surely must exist. 6 */ 7 if (isRedo && reln->md_fd[forknum] != NULL) 8 return; 9 10 /* 11 * We may be using the target table space for the first time in this 12 * database, so create a per-database subdirectory if needed. 13 * 14 * XXX this is a fairly ugly violation of module layering, but this seems 15 * to be the best place to put the check. Maybe TablespaceCreateDbspace 16 * should be here and not in commands/tablespace.c? But that would imply 17 * importing a lot of stuff that smgr.c oughtn't know, either. 18 */ 19 TablespaceCreateDbspace(reln->smgr_rnode.spcNode, 20 reln->smgr_rnode.dbNode, 21 isRedo); 22 23 (*(smgrsw[reln->smgr_which].smgr_create)) (reln, forknum, isRedo); 24 }
等等函數,后續博客繼續分析
關於SMgrRelationData、SMgrRelationHash(HTAB類型的指針)之間的關系
static HTAB *SMgrRelationHash = NULL。