http://blog.csdn.net/acceptedxukai/article/details/18136903
http://blog.csdn.net/acceptedxukai/article/details/18181563
本文所引用的源碼全部來自Redis2.8.2版本。
Redis AOF數據持久化機制的實現相關代碼是redis.c, redis.h, aof.c, bio.c, rio.c, config.c
在閱讀本文之前請先閱讀Redis數據持久化機制AOF原理分析之配置詳解文章,了解AOF相關參數的解析,文章鏈接
http://blog.csdn.net/acceptedxukai/article/details/18135219
轉載請注明,文章出自http://blog.csdn.net/acceptedxukai/article/details/18136903
下面將介紹AOF數據持久化機制的實現
Server啟動加載AOF文件數據
Server啟動加載AOF文件數據的執行步驟為:main() -> initServerConfig() -> loadServerConfig() -> initServer() -> loadDataFromDisk()。initServerConfig()主要為初始化默認的AOF參數配置;loadServerConfig()加載配置文件redis.conf中AOF的參數配置,覆蓋Server的默認AOF參數配置,如果配置appendonly yes,那么AOF數據持久化功能將被激活,server.aof_state參數被設置為REDIS_AOF_ON;loadDataFromDisk()判斷server.aof_state == REDIS_AOF_ON,結果為True就調用loadAppendOnlyFile函數加載AOF文件中的數據,加載的方法就是讀取AOF文件中數據,由於AOF文件中存儲的數據與客戶端發送的請求格式相同,完全符合Redis的通信協議,因此Server創建偽客戶端fakeClient,將解析后的AOF文件數據像客戶端請求一樣調用各種指令,cmd->proc(fakeClient),將AOF文件中的數據重現到Redis Server數據庫中。
- /* Function called at startup to load RDB or AOF file in memory.
- *
- * When server.aof_state is REDIS_AOF_ON only the append only file is loaded
- * and the RDB file is not consulted; otherwise the RDB file is loaded.
- * The elapsed load time is logged on success. */
- void loadDataFromDisk(void) {
- long long start = ustime();
- if (server.aof_state == REDIS_AOF_ON) {
- /* A result other than REDIS_OK is not handled here: nothing is logged
- * and the function simply returns. */
- if (loadAppendOnlyFile(server.aof_filename) == REDIS_OK)
- redisLog(REDIS_NOTICE,"DB loaded from append only file: %.3f seconds",(float)(ustime()-start)/1000000);
- } else {
- if (rdbLoad(server.rdb_filename) == REDIS_OK) {
- redisLog(REDIS_NOTICE,"DB loaded from disk: %.3f seconds",
- (float)(ustime()-start)/1000000);
- } else if (errno != ENOENT) {
- /* A failed RDB load is fatal unless errno is ENOENT. */
- redisLog(REDIS_WARNING,"Fatal error loading the DB: %s. Exiting.",strerror(errno));
- exit(1);
- }
- }
- }
Server優先判斷並加載AOF文件,是因為AOF文件中的數據比RDB文件中的數據更新。
- /* Replay the append only file `filename` into the dataset.
- *
- * Every command stored in the file is parsed and executed through a fake
- * client. Returns REDIS_OK once the end of the file is reached, or REDIS_ERR
- * when the file exists but is empty. A file that cannot be opened or an
- * unknown command terminates the process with exit(1).
- *
- * NOTE: this listing is abridged. The readerr/fmterr labels targeted by the
- * goto statements are in the elided part marked near the end. */
- int loadAppendOnlyFile(char *filename) {
- struct redisClient *fakeClient;
- FILE *fp = fopen(filename,"r");
- struct redis_stat sb;
- int old_aof_state = server.aof_state;
- long loops = 0;
- //redis_fstat (described by the article as fstat64) is called on the descriptor
- //returned by fileno(fp) and fills sb; sb.st_size is the file size in bytes.
- //An existing but empty file is reported as REDIS_ERR.
- if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
- server.aof_current_size = 0;
- fclose(fp);
- return REDIS_ERR;
- }
- if (fp == NULL) {//failed to open the file
- redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
- exit(1);
- }
- /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
- * to the same file we're about to read. */
- server.aof_state = REDIS_AOF_OFF;
- fakeClient = createFakeClient(); //create the fake client used to run the commands
- startLoading(fp); //per the article: defined in rdb.c, updates the server loading state
- while(1) {
- int argc, j;
- unsigned long len;
- robj **argv;
- char buf[128];
- sds argsds;
- struct redisCommand *cmd;
- /* Serve the clients from time to time */
- // Once every 1000 iterations: report the loading progress and process events
- if (!(loops++ % 1000)) {
- loadingProgress(ftello(fp));//ftello(fp) gives the current position in the file
- aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT);//process events
- }
- //Read the next line of the AOF
- if (fgets(buf,sizeof(buf),fp) == NULL) {
- if (feof(fp))//end of file reached
- break;
- else
- goto readerr;
- }
- //Parse one command following the Redis protocol: it starts with "*<argc>"
- if (buf[0] != '*') goto fmterr;
- argc = atoi(buf+1);//number of arguments
- if (argc < 1) goto fmterr;
- argv = zmalloc(sizeof(robj*)*argc);//argument vector
- for (j = 0; j < argc; j++) {
- if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
- if (buf[0] != '$') goto fmterr;
- len = strtol(buf+1,NULL,10);//length of this bulk argument
- argsds = sdsnewlen(NULL,len);//new sds of length len that receives the payload
- //read exactly len bytes of payload
- if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
- argv[j] = createObject(REDIS_STRING,argsds);
- if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF (skip the trailing \r\n) */
- }
- /* Command lookup */
- cmd = lookupCommand(argv[0]->ptr);
- if (!cmd) {
- redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", (char*)argv[0]->ptr);
- exit(1);
- }
- /* Run the command in the context of a fake client */
- fakeClient->argc = argc;
- fakeClient->argv = argv;
- cmd->proc(fakeClient);//execute the command
- /* The fake client should not have a reply */
- redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0);
- /* The fake client should never get blocked */
- redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0);
- /* Clean up. Command code may have changed argv/argc so we use the
- * argv/argc of the client instead of the local variables. */
- for (j = 0; j < fakeClient->argc; j++)
- decrRefCount(fakeClient->argv[j]);
- zfree(fakeClient->argv);
- }
- /* This point can only be reached when EOF is reached without errors.
- * If the client is in the middle of a MULTI/EXEC, log error and quit. */
- if (fakeClient->flags & REDIS_MULTI) goto readerr;
- fclose(fp);
- freeFakeClient(fakeClient);
- server.aof_state = old_aof_state;
- stopLoading();
- aofUpdateCurrentSize(); //refresh server.aof_current_size (size of the AOF file)
- server.aof_rewrite_base_size = server.aof_current_size;
- return REDIS_OK;
- …………
- }
在前面一篇關於AOF參數配置的博客遺留了一個問題,server.aof_current_size參數的初始化,下面解決這個疑問。
- /* Refresh server.aof_current_size with the st_size reported by redis_fstat()
- * for server.aof_fd. When redis_fstat() fails a warning is logged and the
- * stored size is left unchanged. */
- void aofUpdateCurrentSize(void) {
- struct redis_stat sb;
- if (redis_fstat(server.aof_fd,&sb) == -1) {
- redisLog(REDIS_WARNING,"Unable to obtain the AOF file length. stat: %s",
- strerror(errno));
- } else {
- server.aof_current_size = sb.st_size;
- }
- }
redis_fstat是作者對Linux中fstat64函數的重命名,該函數就是獲取文件相關的參數信息,具體可以Google之,sb.st_size就是當前AOF文件的大小。這里需要知道server.aof_fd即AOF文件描述符,該參數的初始化在initServer()函數中:
- /* Open the AOF file if needed. */
- if (server.aof_state == REDIS_AOF_ON) {
- server.aof_fd = open(server.aof_filename,O_WRONLY|O_APPEND|O_CREAT,0644);
- if (server.aof_fd == -1) {
- redisLog(REDIS_WARNING, "Can't open the append-only file: %s",strerror(errno));
- exit(1);
- }
- }
至此,Redis Server啟動加載硬盤中AOF文件數據的操作就成功結束了。
Server數據庫產生新數據如何持久化到硬盤
當客戶端執行Set等修改數據庫中字段的指令時就會造成Server數據庫中數據被修改,這些修改的數據應該被實時更新到AOF文件中,並且也要按照一定的fsync機制刷新到硬盤中,保證數據不會丟失。
在上一篇博客中,提到了三種fsync方式:appendfsync always, appendfsync everysec, appendfsync no. 具體體現在server.aof_fsync參數中。
首先看當客戶端請求的指令造成數據被修改,Redis是如何將修改數據的指令添加到server.aof_buf中的。
call() -> propagate() -> feedAppendOnlyFile(),call()函數判斷執行指令后是否造成數據被修改。
feedAppendOnlyFile函數首先會判斷Server是否開啟了AOF,如果開啟AOF,那么根據Redis通訊協議將修改數據的指令重現成請求的字符串,注意在超時設置的處理方式,接着將字符串append到server.aof_buf中即可。該函數最后兩行代碼需要注意,這才是重點,如果server.aof_child_pid != -1那么表明此時Server正在重寫rewrite AOF文件,需要將被修改的數據追加到server.aof_rewrite_buf_blocks鏈表中,等待rewrite結束后,追加到AOF文件中。具體見下面代碼的注釋。
- /* Propagate the specified command (in the context of the specified database id)
- * to AOF and Slaves.
- *
- * flags are an xor between:
- * + REDIS_PROPAGATE_NONE (no propagation of command at all)
- * + REDIS_PROPAGATE_AOF (propagate into the AOF file if is enabled)
- * + REDIS_PROPAGATE_REPL (propagate into the replication link)
- */
- void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
- int flags)
- {
- //Feed the command to the AOF machinery. This happens for every AOF state
- //other than REDIS_AOF_OFF, provided REDIS_PROPAGATE_AOF is set in flags.
- if (server.aof_state != REDIS_AOF_OFF && flags & REDIS_PROPAGATE_AOF)
- feedAppendOnlyFile(cmd,dbid,argv,argc);
- //Feed the command to the slaves when REDIS_PROPAGATE_REPL is set in flags.
- if (flags & REDIS_PROPAGATE_REPL)
- replicationFeedSlaves(server.slaves,dbid,argv,argc);
- }
- //Serialize a command that modified the dataset and append the result to
- //server.aof_buf (when the AOF state is ON) and to the rewrite buffer (when a
- //rewrite child is running). dictid is the database the command targeted.
- void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
- sds buf = sdsempty();
- robj *tmpargv[3];
- /* The DB this command was targeting is not the same as the last command
- * we appendend. To issue a SELECT command is needed. */
- // dictid differs from the database last selected in the AOF: emit a SELECT first
- if (dictid != server.aof_selected_db) {
- char seldb[64];
- snprintf(seldb,sizeof(seldb),"%d",dictid);
- buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
- (unsigned long)strlen(seldb),seldb);
- server.aof_selected_db = dictid;
- }
- // EXPIRE / PEXPIRE / EXPIREAT are translated into PEXPIREAT
- if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
- cmd->proc == expireatCommand) {
- /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
- buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
- }// SETEX / PSETEX are translated into SET followed by PEXPIREAT
- else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
- /* Translate SETEX/PSETEX to SET and PEXPIREAT */
- tmpargv[0] = createStringObject("SET",3);
- tmpargv[1] = argv[1];
- tmpargv[2] = argv[3];
- buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
- decrRefCount(tmpargv[0]);
- buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
- } else {//every other command is appended without translation
- /* All the other commands don't need translation or need the
- * same translation already operated in the command vector
- * for the replication itself. */
- buf = catAppendOnlyGenericCommand(buf,argc,argv);
- }
- /* Append to the AOF buffer. This will be flushed on disk just before
- * of re-entering the event loop, so before the client will get a
- * positive reply about the operation performed. */
- // Append buf to the end of server.aof_buf; only done when the AOF state is ON
- if (server.aof_state == REDIS_AOF_ON)
- server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));
- /* If a background append only file rewriting is in progress we want to
- * accumulate the differences between the child DB and the current one
- * in a buffer, so that when the child process will do its work we
- * can append the differences to the new append only file. */
- //server.aof_child_pid != -1 means a rewrite child is running. Commands
- //received in the meantime are stored in the rewrite buffer as well, so that
- //they can be appended to the new AOF file once the child is done and no
- //data is lost.
- //Why the rewrite buffer is needed: the child works on the dataset as it was
- //when the rewrite started. A value it already wrote to the new file may be
- //changed afterwards by a client command, which creates a difference.
- if (server.aof_child_pid != -1)
- aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
- /* Annotator's note on server.aof_buf versus server.aof_rewrite_buf_blocks:
- aof_buf is the buffer whose content keeps being written to the AOF file
- during normal operation.
- aof_rewrite_buf_blocks is used while a rewrite is running, for example
- after "config set appendonly yes": a child process is forked and writes
- its view of the data to a temporary file. Commands received in the
- meantime are recorded. When the child exits, the parent is said to
- detect it from serverCron and call backgroundRewriteDoneHandler, which
- uses aofRewriteBufferWrite to write this diff to the temporary AOF file
- before renaming it over the regular AOF file.
- The annotator concludes that aof_buf normally holds less data than
- aof_rewrite_buf_blocks, although at the start aof_buf may contain
- earlier data that the rewrite buffer does not (unverified remark). */
- sdsfree(buf);
- }
Server在每次事件循環之前會調用一次beforeSleep函數,下面看看這個函數做了什么工作?
- /* This function gets called every time Redis is entering the
- * main loop of the event driven library, that is, before to sleep
- * for ready file descriptors.
- *
- * It runs a fast expire cycle, processes the pending input of clients that
- * were just unblocked, and finally flushes the AOF buffer. */
- void beforeSleep(struct aeEventLoop *eventLoop) {
- REDIS_NOTUSED(eventLoop);
- listNode *ln;
- redisClient *c;
- /* Run a fast expire cycle (the called function will return
- * ASAP if a fast cycle is not needed). */
- if (server.active_expire_enabled && server.masterhost == NULL)
- activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST);
- /* Try to process pending commands for clients that were just unblocked. */
- while (listLength(server.unblocked_clients)) {
- ln = listFirst(server.unblocked_clients);
- redisAssert(ln != NULL);
- c = ln->value;
- listDelNode(server.unblocked_clients,ln);
- c->flags &= ~REDIS_UNBLOCKED;
- /* Process remaining data in the input buffer. */
- //Handle the data still waiting in this client's query buffer, if any
- if (c->querybuf && sdslen(c->querybuf) > 0) {
- server.current_client = c;
- processInputBuffer(c);
- server.current_client = NULL;
- }
- }
- /* Write the AOF buffer on disk */
- //Called with force = 0, i.e. a non-forced flush of server.aof_buf
- flushAppendOnlyFile(0);
- }
通過上面的代碼及注釋可以發現,beforeSleep函數做了三件事:1、處理過期鍵,2、處理阻塞期間的客戶端請求,3、將server.aof_buf中的數據追加到AOF文件中並fsync刷新到硬盤上,flushAppendOnlyFile函數給定了一個參數force,表示是否強制寫入AOF文件,0表示非強制即支持延遲寫,1表示強制寫入。
- /* Write server.aof_buf to the AOF file and fsync it according to
- * server.aof_fsync. Does nothing when the buffer is empty.
- *
- * force: when 0 and the policy is AOF_FSYNC_EVERYSEC, the write is postponed
- * while a background fsync is still pending, as long as less than two seconds
- * have passed since the first postponement. A non-zero value skips that
- * postponement.
- *
- * A failed or short write terminates the process with exit(1). */
- void flushAppendOnlyFile(int force) {
- ssize_t nwritten;
- int sync_in_progress = 0;
- if (sdslen(server.aof_buf) == 0) return;
- // Is a background fsync job still pending?
- if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
- sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;
- // Everysec policy and force not set: postpone the write when possible
- if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
- /* With this append fsync policy we do background fsyncing.
- * If the fsync is still in progress we can try to delay
- * the write for a couple of seconds. */
- // A background fsync job is still pending
- if (sync_in_progress) {
- // Nothing was postponed before: record when the postponement starts and return
- if (server.aof_flush_postponed_start == 0) {
- /* No previous write postponinig, remember that we are
- * postponing the flush and return. */
- server.aof_flush_postponed_start = server.unixtime;
- return;
- } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
- // Postponing for less than two seconds is still allowed
- /* We were already waiting for fsync to finish, but for less
- * than two seconds this is still ok. Postpone again. */
- return;
- }
- /* Otherwise fall trough, and go write since we can't wait
- * over two seconds. */
- server.aof_delayed_fsync++;
- redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
- }
- }
- /* If you are following this code path, then we are going to write so
- * set reset the postponed flush sentinel to zero. */
- server.aof_flush_postponed_start = 0;
- /* We want to perform a single write. This should be guaranteed atomic
- * at least if the filesystem we are writing is a real physical one.
- * While this will save us against the server being killed I don't think
- * there is much to do about the whole server stopping for power problems
- * or alike */
- // Write the whole AOF buffer to the file with a single write() call
- nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
- if (nwritten != (signed)sdslen(server.aof_buf)) {//error or short write
- /* Ooops, we are in troubles. The best thing to do for now is
- * aborting instead of giving the illusion that everything is
- * working as expected. */
- if (nwritten == -1) {
- redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
- } else {
- /* NOTE(review): a short write does not set errno, so the
- * strerror(errno) text logged below may be unrelated - confirm. */
- redisLog(REDIS_WARNING,"Exiting on short write while writing to "
- "the append-only file: %s (nwritten=%ld, "
- "expected=%ld)",
- strerror(errno),
- (long)nwritten,
- (long)sdslen(server.aof_buf));
- /* Cut the file back to the size recorded before this write. */
- if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
- redisLog(REDIS_WARNING, "Could not remove short write "
- "from the append-only file. Redis may refuse "
- "to load the AOF the next time it starts. "
- "ftruncate: %s", strerror(errno));
- }
- }
- exit(1);
- }
- server.aof_current_size += nwritten;
- /* Re-use AOF buffer when it is small enough. The maximum comes from the
- * arena size of 4k minus some overhead (but is otherwise arbitrary). */
- // Reuse the buffer when it is small; otherwise free it and start from an empty one
- if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
- sdsclear(server.aof_buf);
- } else {
- sdsfree(server.aof_buf);
- server.aof_buf = sdsempty();
- }
- /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
- * children doing I/O in the background. */
- //When that option is set and an AOF or RDB child is running, return without
- //fsync: the data was written to the AOF file but is not synced to disk yet
- if (server.aof_no_fsync_on_rewrite &&
- (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
- return;
- /* Perform the fsync if needed. */
- if (server.aof_fsync == AOF_FSYNC_ALWAYS) {//always policy: fsync right here
- /* aof_fsync is defined as fdatasync() for Linux in order to avoid
- * flushing metadata. */
- aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
- server.aof_last_fsync = server.unixtime;
- } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
- server.unixtime > server.aof_last_fsync)) {
- if (!sync_in_progress) aof_background_fsync(server.aof_fd);//hand the fsync over to the background
- server.aof_last_fsync = server.unixtime;
- }
- }
上述代碼中請關注server.aof_fsync參數,即設置Redis fsync AOF文件到硬盤的策略,如果設置為AOF_FSYNC_ALWAYS,那么直接在主進程中fsync,如果設置為AOF_FSYNC_EVERYSEC,那么放入后台線程中fsync,后台線程的代碼在bio.c中。
小結
文章寫到這,已經解決了Redis Server啟動時如何加載AOF文件,以及如何將客戶端請求產生的新數據追加到AOF文件中;對於追加到AOF文件中的新數據,會根據fsync的配置策略刷新到硬盤:直接在主進程中fsync,或是在后台線程中fsync。
至此,AOF數據持久化還剩下如何rewrite AOF,接受客戶端發送的BGREWRITEAOF請求,此部分內容待下篇博客中解析。
感謝此篇博客給我在理解Redis AOF數據持久化方面的巨大幫助,http://chenzhenianqing.cn/articles/786.html
本人Redis-2.8.2的源碼注釋已經放到Github中,有需要的讀者可以下載,我也會在后續的時間中更新,https://github.com/xkeyideal/annotated-redis-2.8.2
本人不怎么會使用Git,望有人能教我一下。
--------------------------------------------------------------------------------------------------------------------------------------------------------------
本文所引用的源碼全部來自Redis2.8.2版本。
Redis AOF數據持久化機制的實現相關代碼是redis.c, redis.h, aof.c, bio.c, rio.c, config.c
在閱讀本文之前請先閱讀Redis數據持久化機制AOF原理分析之配置詳解文章,了解AOF相關參數的解析,文章鏈接
http://blog.csdn.net/acceptedxukai/article/details/18135219
接着上一篇文章,本文將介紹Redis是如何實現AOF rewrite的。
轉載請注明,文章出自http://blog.csdn.net/acceptedxukai/article/details/18181563
AOF rewrite的觸發機制
如果Redis只是將客戶端修改數據庫的指令重現存儲在AOF文件中,那么AOF文件的大小會不斷的增加,因為AOF文件只是簡單的重現存儲了客戶端的指令,而並沒有進行合並。對於該問題最簡單的處理方式,即當AOF文件滿足一定條件時就對AOF進行rewrite,rewrite是根據當前內存數據庫中的數據進行遍歷寫到一個臨時的AOF文件,待寫完后替換掉原來的AOF文件即可。
Redis觸發AOF rewrite機制有三種:
1、Redis Server接收到客戶端發送的BGREWRITEAOF指令請求:如果AOF rewrite已經在執行,則直接返回錯誤;如果RDB數據持久化正在執行,則將rewrite標記為scheduled,等RDB持久化結束后再執行AOF rewrite;否則立即執行AOF rewrite
2、在Redis配置文件redis.conf中,用戶設置了auto-aof-rewrite-percentage和auto-aof-rewrite-min-size參數,並且當前AOF文件大小server.aof_current_size大於auto-aof-rewrite-min-size(server.aof_rewrite_min_size),同時AOF文件大小的增長率大於auto-aof-rewrite-percentage(server.aof_rewrite_perc)時,會自動觸發AOF rewrite
3、用戶設置“config set appendonly yes”開啟AOF時,調用startAppendOnly函數會觸發rewrite
下面分別介紹上述三種機制的處理.
接收到BGREWRITEAOF指令
- /* BGREWRITEAOF command handler: starts, schedules or refuses a background
- * AOF rewrite depending on which child process is currently running, and
- * replies to the client `c` accordingly. */
- void bgrewriteaofCommand(redisClient *c) {
- //An AOF rewrite child is already running: reply with an error
- if (server.aof_child_pid != -1) {
- addReplyError(c,"Background append only file rewriting already in progress");
- } else if (server.rdb_child_pid != -1) {
- //No AOF rewrite is running but an RDB child is: flag the rewrite as
- //scheduled instead of starting it now
- server.aof_rewrite_scheduled = 1;
- addReplyStatus(c,"Background append only file rewriting scheduled");
- } else if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
- //The rewrite child was started right away
- addReplyStatus(c,"Background append only file rewriting started");
- } else {
- //The rewrite could not be started
- addReply(c,shared.err);
- }
- }
- /* Start a scheduled AOF rewrite if this was requested by the user while
- * a BGSAVE was in progress. */
- // 如果用戶執行 BGREWRITEAOF 命令的話,在后台開始 AOF 重寫
- //當用戶執行BGREWRITEAOF命令時,如果RDB文件正在寫,那么將server.aof_rewrite_scheduled標記為1
- //當RDB文件寫完后開啟AOF rewrite
- if (server.rdb_child_pid == -1 && server.aof_child_pid == -1 &&
- server.aof_rewrite_scheduled)
- {
- rewriteAppendOnlyFileBackground();
- }
Server自動對AOF進行rewrite
- /* Trigger an AOF rewrite if needed */
- //滿足一定條件rewrite AOF文件
- if (server.rdb_child_pid == -1 &&
- server.aof_child_pid == -1 &&
- server.aof_rewrite_perc &&
- server.aof_current_size > server.aof_rewrite_min_size)
- {
- long long base = server.aof_rewrite_base_size ?
- server.aof_rewrite_base_size : 1;
- long long growth = (server.aof_current_size*100/base) - 100;
- if (growth >= server.aof_rewrite_perc) {
- redisLog(REDIS_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
- rewriteAppendOnlyFileBackground();
- }
- }
config set appendonly yes
- if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
- int enable = yesnotoi(o->ptr);
- if (enable == -1) goto badfmt;
- if (enable == 0 && server.aof_state != REDIS_AOF_OFF) {//appendonly no 關閉AOF
- stopAppendOnly();
- } else if (enable && server.aof_state == REDIS_AOF_OFF) {//appendonly yes rewrite AOF
- if (startAppendOnly() == REDIS_ERR) {
- addReplyError(c,
- "Unable to turn on AOF. Check server logs.");
- return;
- }
- }
- }
- /* Turn AOF on at runtime: open the AOF file and start a background rewrite.
- *
- * Returns REDIS_OK when the rewrite was started; server.aof_state is then
- * REDIS_AOF_WAIT_REWRITE. Returns REDIS_ERR when the file cannot be opened or
- * the rewrite cannot be started. Must be called with the AOF state OFF
- * (enforced by the assertion). */
- int startAppendOnly(void) {
- server.aof_last_fsync = server.unixtime;
- server.aof_fd = open(server.aof_filename,O_WRONLY|O_APPEND|O_CREAT,0644);
- redisAssert(server.aof_state == REDIS_AOF_OFF);
- if (server.aof_fd == -1) {
- redisLog(REDIS_WARNING,"Redis needs to enable the AOF but can't open the append only file: %s",strerror(errno));
- return REDIS_ERR;
- }
- if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {//start the background rewrite
- /* NOTE(review): the descriptor is closed but server.aof_fd keeps its
- * old value on this path - confirm this is harmless for callers. */
- close(server.aof_fd);
- redisLog(REDIS_WARNING,"Redis needs to enable the AOF but can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.");
- return REDIS_ERR;
- }
- /* We correctly switched on AOF, now wait for the rerwite to be complete
- * in order to append data on disk. */
- server.aof_state = REDIS_AOF_WAIT_REWRITE;
- return REDIS_OK;
- }
Redis AOF rewrite機制的實現
從上述分析可以看出rewrite的實現全部依靠rewriteAppendOnlyFileBackground函數,下面分析該函數,通過下面的代碼可以看出,Redis是fork出一個子進程來操作AOF rewrite,然后子進程調用rewriteAppendOnlyFile函數,將數據寫到一個臨時文件temp-rewriteaof-bg-%d.aof中。如果子進程完成會通過exit(0)函數通知父進程rewrite結束,在serverCron函數中使用wait3函數接收子進程退出狀態,然后執行后續的AOF rewrite的收尾工作,后面將會分析。
- /* Start an AOF rewrite in a forked child process.
- *
- * The child writes the dataset to "temp-rewriteaof-bg-<pid>.aof" through
- * rewriteAppendOnlyFile() and leaves with exitFromChild(0) on success or
- * exitFromChild(1) on failure.
- *
- * In the parent the function returns REDIS_ERR when a rewrite child is
- * already running or when fork() fails. Otherwise it records the child pid
- * and the start time, clears the scheduled flag and returns REDIS_OK. */
- int rewriteAppendOnlyFileBackground(void) {
- pid_t childpid;
- long long start;
- // A background rewrite is already running
- if (server.aof_child_pid != -1) return REDIS_ERR;
- start = ustime();
- if ((childpid = fork()) == 0) {
- char tmpfile[256];
- /* Child */
- closeListeningSockets(0);// presumably closes the listening sockets inherited from the parent - confirm
- redisSetProcTitle("redis-aof-rewrite");
- snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
- if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
- size_t private_dirty = zmalloc_get_private_dirty();
- if (private_dirty) {
- redisLog(REDIS_NOTICE,
- "AOF rewrite: %zu MB of memory used by copy-on-write",
- private_dirty/(1024*1024));
- }
- exitFromChild(0);
- } else {
- exitFromChild(1);
- }
- } else {
- /* Parent */
- server.stat_fork_time = ustime()-start;
- if (childpid == -1) {
- redisLog(REDIS_WARNING,
- "Can't rewrite append only file in background: fork: %s",
- strerror(errno));
- return REDIS_ERR;
- }
- redisLog(REDIS_NOTICE,
- "Background append only file rewriting started by pid %d",childpid);
- server.aof_rewrite_scheduled = 0;
- server.aof_rewrite_time_start = time(NULL);
- server.aof_child_pid = childpid;
- updateDictResizePolicy();
- /* We set appendseldb to -1 in order to force the next call to the
- * feedAppendOnlyFile() to issue a SELECT command, so the differences
- * accumulated by the parent into server.aof_rewrite_buf will start
- * with a SELECT statement and it will be safe to merge. */
- server.aof_selected_db = -1;
- replicationScriptCacheFlush();
- return REDIS_OK;
- }
- return REDIS_OK; /* unreached */
- }
- /* Write a fresh AOF that rebuilds the current dataset and install it as
- * `filename`.
- *
- * For every non-empty database a SELECT is emitted, followed by the commands
- * that recreate each key that is not already expired, plus a PEXPIREAT for
- * keys that have an expire time. The data goes to "temp-rewriteaof-<pid>.aof"
- * first and is renamed to `filename` at the end.
- *
- * Returns REDIS_OK on success and REDIS_ERR on failure. */
- int rewriteAppendOnlyFile(char *filename) {
- dictIterator *di = NULL;
- dictEntry *de;
- rio aof;
- FILE *fp;
- char tmpfile[256];
- int j;
- long long now = mstime();
- /* Note that we have to use a different temp name here compared to the
- * one used by rewriteAppendOnlyFileBackground() function. */
- snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
- fp = fopen(tmpfile,"w");
- if (!fp) {
- redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
- return REDIS_ERR;
- }
- rioInitWithFile(&aof,fp); //set up the rio object on top of fp (rio.c per the article)
- //Enable automatic sync every REDIS_AOF_AUTOSYNC_BYTES bytes
- //(32MB according to the article - confirm)
- if (server.aof_rewrite_incremental_fsync)
- rioSetAutoSync(&aof,REDIS_AOF_AUTOSYNC_BYTES);
- for (j = 0; j < server.dbnum; j++) {//iterate over every database
- char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
- redisDb *db = server.db+j;
- dict *d = db->dict;
- if (dictSize(d) == 0) continue;
- di = dictGetSafeIterator(d);
- if (!di) {
- /* NOTE(review): unlike the werr path, this exit does not unlink
- * tmpfile - confirm whether the file is cleaned up elsewhere. */
- fclose(fp);
- return REDIS_ERR;
- }
- /* SELECT the new DB */
- if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
- if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;
- /* Iterate this DB writing every entry */
- while((de = dictNext(di)) != NULL) {
- sds keystr;
- robj key, *o;
- long long expiretime;
- keystr = dictGetKey(de);
- o = dictGetVal(de);
- initStaticStringObject(key,keystr);
- expiretime = getExpire(db,&key);
- /* If this key is already expired skip it */
- if (expiretime != -1 && expiretime < now) continue;
- /* Save the key and associated value */
- if (o->type == REDIS_STRING) {
- /* Emit a SET command */
- char cmd[]="*3\r\n$3\r\nSET\r\n";
- if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
- /* Key and value */
- if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
- if (rioWriteBulkObject(&aof,o) == 0) goto werr;
- } else if (o->type == REDIS_LIST) {
- if (rewriteListObject(&aof,&key,o) == 0) goto werr;
- } else if (o->type == REDIS_SET) {
- if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
- } else if (o->type == REDIS_ZSET) {
- if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
- } else if (o->type == REDIS_HASH) {
- if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
- } else {
- redisPanic("Unknown object type");
- }
- /* Save the expire time */
- if (expiretime != -1) {
- char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
- if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
- if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
- if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
- }
- }
- dictReleaseIterator(di);
- }
- /* Make sure data will not remain on the OS's output buffers */
- /* NOTE(review): the results of fflush, aof_fsync and fclose are not
- * checked here, so a failure at this stage would go unnoticed. */
- fflush(fp);
- aof_fsync(fileno(fp));//sync the temp file to disk
- fclose(fp);
- /* Use RENAME to make sure the DB file is changed atomically only
- * if the generate DB file is ok. */
- if (rename(tmpfile,filename) == -1) {//the article notes that the rename target is itself a temporary file
- redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
- unlink(tmpfile);
- return REDIS_ERR;
- }
- redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
- return REDIS_OK;
- werr:
- /* NOTE(review): fclose and unlink run before strerror(errno) is evaluated,
- * so the logged error may not be the one from the failed write. */
- fclose(fp);
- unlink(tmpfile);
- redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
- if (di) dictReleaseIterator(di);
- return REDIS_ERR;
- }
- if (server.aof_child_pid != -1)
- aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
- /* Check if a background saving or AOF rewrite in progress terminated. */
- //如果RDB bgsave或AOF rewrite子進程已經執行,通過獲取子進程的退出狀態,對后續的工作進行處理
- if (server.rdb_child_pid != -1 || server.aof_child_pid != -1) {//
- int statloc;
- pid_t pid;
- if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
- int exitcode = WEXITSTATUS(statloc);//獲取退出的狀態
- int bysignal = 0;
- if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);
- if (pid == server.rdb_child_pid) {
- backgroundSaveDoneHandler(exitcode,bysignal);
- } else if (pid == server.aof_child_pid) {
- backgroundRewriteDoneHandler(exitcode,bysignal);
- } else {
- redisLog(REDIS_WARNING,
- "Warning, detected child with unmatched pid: %ld",
- (long)pid);
- }
- // 如果 BGSAVE 和 BGREWRITEAOF 都已經完成,那么重新開始 REHASH
- updateDictResizePolicy();
- }
- }
- /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
- * Handle this.
- *
- * exitcode: exit status of the rewrite child.
- * bysignal: non-zero when the child was terminated by a signal (the value is
- *           used as the signal number in the log message).
- *
- * On success the diff accumulated by the parent is appended to the child's
- * temporary file, which is then renamed to server.aof_filename. When AOF is
- * enabled the server also switches to the new file descriptor. The cleanup
- * part at the end runs on every path. */
- void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
- if (!bysignal && exitcode == 0) {//the child exited normally with status 0
- int newfd, oldfd;
- char tmpfile[256];
- long long now = ustime();
- redisLog(REDIS_NOTICE,
- "Background AOF rewrite terminated with success");
- /* Flush the differences accumulated by the parent to the
- * rewritten AOF. */
- snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
- (int)server.aof_child_pid);
- newfd = open(tmpfile,O_WRONLY|O_APPEND);
- if (newfd == -1) {
- redisLog(REDIS_WARNING,
- "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
- goto cleanup;
- }
- //Write the diff data held in the rewrite buffer to the new file
- if (aofRewriteBufferWrite(newfd) == -1) {
- redisLog(REDIS_WARNING,
- "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
- close(newfd);
- goto cleanup;
- }
- redisLog(REDIS_NOTICE,
- "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", aofRewriteBufferSize());
- /* The only remaining thing to do is to rename the temporary file to
- * the configured file and switch the file descriptor used to do AOF
- * writes. We don't want close(2) or rename(2) calls to block the
- * server on old file deletion.
- *
- * There are two possible scenarios:
- *
- * 1) AOF is DISABLED and this was a one time rewrite. The temporary
- * file will be renamed to the configured file. When this file already
- * exists, it will be unlinked, which may block the server.
- *
- * 2) AOF is ENABLED and the rewritten AOF will immediately start
- * receiving writes. After the temporary file is renamed to the
- * configured file, the original AOF file descriptor will be closed.
- * Since this will be the last reference to that file, closing it
- * causes the underlying file to be unlinked, which may block the
- * server.
- *
- * To mitigate the blocking effect of the unlink operation (either
- * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we
- * use a background thread to take care of this. First, we
- * make scenario 1 identical to scenario 2 by opening the target file
- * when it exists. The unlink operation after the rename(2) will then
- * be executed upon calling close(2) for its descriptor. Everything to
- * guarantee atomicity for this switch has already happened by then, so
- * we don't care what the outcome or duration of that close operation
- * is, as long as the file descriptor is released again. */
- if (server.aof_fd == -1) {
- /* AOF disabled */
- /* Don't care if this fails: oldfd will be -1 and we handle that.
- * One notable case of -1 return is if the old file does
- * not exist. */
- oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
- } else {
- /* AOF enabled */
- oldfd = -1; /* We'll set this to the current AOF filedes later. */
- }
- /* Rename the temporary file. This will not unlink the target file if
- * it exists, because we reference it with "oldfd". */
- //Rename the temporary file to the regular AOF file name. An open descriptor
- //still refers to the previous file, so its removal is deferred until that
- //descriptor is closed.
- if (rename(tmpfile,server.aof_filename) == -1) {
- redisLog(REDIS_WARNING,
- "Error trying to rename the temporary AOF file: %s", strerror(errno));
- close(newfd);
- if (oldfd != -1) close(oldfd);
- goto cleanup;
- }
- //With AOF disabled only the new file needs handling: newfd is just closed.
- //Annotator's remark (unverified): this close should not stall the server,
- //because no data is written through newfd here, since stopAppendOnly
- //is said to empty the rewrite buffer.
- if (server.aof_fd == -1) {
- /* AOF disabled, we don't need to set the AOF file descriptor
- * to this new file, so we can close it. */
- close(newfd);
- } else {
- /* AOF enabled, replace the old fd with the new one. */
- oldfd = server.aof_fd;
- //Switch to the new descriptor; after the rename above it refers to the
- //file that now carries the regular AOF name
- server.aof_fd = newfd;
- //fsync the new file according to the configured policy
- if (server.aof_fsync == AOF_FSYNC_ALWAYS)
- aof_fsync(newfd);
- else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
- aof_background_fsync(newfd);
- server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
- aofUpdateCurrentSize();
- server.aof_rewrite_base_size = server.aof_current_size;
- /* Clear regular AOF buffer since its contents was just written to
- * the new AOF from the background rewrite buffer. */
- //The pending content of aof_buf is therefore discarded
- sdsfree(server.aof_buf);
- server.aof_buf = sdsempty();
- }
- server.aof_lastbgrewrite_status = REDIS_OK;
- redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully");
- /* Change state from WAIT_REWRITE to ON if needed */
- //Only a rewrite started to switch AOF on is in WAIT_REWRITE state; any
- //other state is left unchanged here
- if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
- server.aof_state = REDIS_AOF_ON;
- /* Asynchronously close the overwritten AOF. */
- //Hand the old descriptor to a background job that closes it; per the
- //comment above, that close is what finally removes the replaced file
- if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);
- redisLog(REDIS_VERBOSE,
- "Background AOF rewrite signal handler took %lldus", ustime()-now);
- } else if (!bysignal && exitcode != 0) {
- server.aof_lastbgrewrite_status = REDIS_ERR;
- redisLog(REDIS_WARNING,
- "Background AOF rewrite terminated with error");
- } else {
- server.aof_lastbgrewrite_status = REDIS_ERR;
- redisLog(REDIS_WARNING,
- "Background AOF rewrite terminated by signal %d", bysignal);
- }
- cleanup:
- /* Runs on every path: reset the rewrite buffer, remove the child's temp
- * file and clear the rewrite bookkeeping. */
- aofRewriteBufferReset();
- aofRemoveTempFile(server.aof_child_pid);
- server.aof_child_pid = -1;
- server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
- server.aof_rewrite_time_start = -1;
- /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
- if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
- server.aof_rewrite_scheduled = 1;
- }
小結