深入剖析SolrCloud（四）

本文轉載自查看原文 2012-02-29 00:07 9125 Java

在上一篇中介紹了連接Zookeeper集群的方法，這一篇將圍繞一個有趣的話題來展開，這就是Replication（索引復制）。關於Solr Replication的詳細介紹，可以參考 http://wiki.apache.org/solr/SolrReplication 。

在開始這個話題之前，先從我最近在應用中引入solr的master/slave架構時，遇到的一個讓我困擾的實際問題說起。

應用場景簡單描述如下：

1）首先master節點下載索引分片，然后創建配置文件，加入master節點的replication配置片段，再對索引分片進行合並(關於mergeIndex，可以參考http://wiki.apache.org/solr/MergingSolrIndexes)，然后利用上述配置文件和索引數據去創建一個solr核。

2）slave節點創建配置文件，加入slave節點的replication配置片段，創建一個空的solr核，等待從master節點進行索引數據同步

出現的問題：slave節點沒有從master節點同步到數據。

問題分析：

1）首先檢查master節點，獲取最新的可復制索引的版本號，

http://master_host:port/solr/replication?command=indexversion

發現返回的索引版本號是0，這說明master節點根本沒有觸發replication動作。

2）為了確認上述判斷，在slave節點上進一步查看replication的詳細信息

http://slave_host:port/solr/replication?command=details

發現確實如此，盡管master節點的索引版本號和slave節點的索引版本號不一致，但索引卻沒有同步過來，再分別查看master節點和slave節點的日志，發現索引復制動作確實沒有開始。

綜上所述，確實是master節點沒有觸發索引復制動作，那究竟是為何呢？先將原因擺出來，后面會通過源碼的分析來加以說明。

原因：solr合並索引時，不管你是通過mergeindexes的http命令，還是調用底層lucene的IndexWriter，記得最后一定要提交一個commit。否則，不僅索引不會對查詢可見，而且對於master/slave架構的solr集群來說，master節點的replication動作也不會觸發，因為indexversion沒有感知到變化。

好了，下面開始對Solr的Replication的分析。

Solr容器在加載solr核的時候，會對已經注冊的各個實現SolrCoreAware接口的Handler進行回調，調用其inform方法。

對於ReplicationHandler來說，就是在這里對自己是屬於master節點還是slave節點進行判斷，若是slave節點，則創建一個SnapPuller對象，定時負責從master節點主動拉索引數據下來；若是master節點，則只設置相應的參數。

   /**
    * Callback invoked with the owning core; decides whether this handler acts as
    * a slave, a master, or both, based on the "slave" and "master" sections of
    * {@code initArgs}.
    *
    * <p>Slave: a {@link SnapPuller} is created from the slave section and
    * {@code isSlave} is set.
    *
    * <p>Master (also the default when neither section is enabled): reads the list
    * of config files to replicate, the "backupAfter" / replicateAfter triggers,
    * registers the matching commit/optimize callbacks on the update handler,
    * optionally picks an initial {@code indexCommitPoint} when "startup" is listed
    * as a trigger, reads the commit reserve duration, and sets {@code isMaster}.
    *
    * @param core the core this handler belongs to
    */
   public void inform(SolrCore core) {
     this.core = core;
     registerFileStreamResponseWriter();
     registerCloseHook();

     NamedList slave = (NamedList) initArgs.get("slave");
     boolean enableSlave = isEnabled(slave);
     if (enableSlave) {
       tempSnapPuller = snapPuller = new SnapPuller(slave, this, core);
       isSlave = true;
     }

     NamedList master = (NamedList) initArgs.get("master");
     boolean enableMaster = isEnabled(master);

     // Neither role is enabled: fall back to a master with an empty configuration.
     if (!enableSlave && !enableMaster) {
       enableMaster = true;
       master = new NamedList<Object>();
     }

     if (enableMaster) {
       includeConfFiles = (String) master.get(CONF_FILES);
       if (includeConfFiles != null && includeConfFiles.trim().length() > 0) {
         for (String file : includeConfFiles.split(",")) {
           // Trim BEFORE splitting so that "a.xml, b.xml" registers "b.xml" and
           // not " b.xml" (the untrimmed entry was previously used as the name).
           String entry = file.trim();
           if (entry.length() == 0) continue;
           String[] strs = entry.split(":");
           // if there is an alias add it or it is null
           confFileNameAlias.add(strs[0], strs.length > 1 ? strs[1] : null);
         }
         LOG.info("Replication enabled for following config files: " + includeConfFiles);
       }

       List backup = master.getAll("backupAfter");
       boolean backupOnCommit = backup.contains("commit");
       boolean backupOnOptimize = !backupOnCommit && backup.contains("optimize");
       List replicateAfter = master.getAll(REPLICATE_AFTER);
       replicateOnCommit = replicateAfter.contains("commit");
       replicateOnOptimize = !replicateOnCommit && replicateAfter.contains("optimize");

       // No explicit commit/optimize trigger configured: replicate on commit.
       if (!replicateOnCommit && !replicateOnOptimize) {
         replicateOnCommit = true;
       }

       // if we only want to replicate on optimize, we need the deletion policy to
       // save the last optimized commit point.
       if (replicateOnOptimize) {
         IndexDeletionPolicyWrapper wrapper = core.getDeletionPolicy();
         IndexDeletionPolicy policy = wrapper == null ? null : wrapper.getWrappedDeletionPolicy();
         if (policy instanceof SolrDeletionPolicy) {
           SolrDeletionPolicy solrPolicy = (SolrDeletionPolicy) policy;
           if (solrPolicy.getMaxOptimizedCommitsToKeep() < 1) {
             solrPolicy.setMaxOptimizedCommitsToKeep(1);
           }
         } else {
           LOG.warn("Replication can't call setMaxOptimizedCommitsToKeep on " + policy);
         }
       }

       if (replicateOnOptimize || backupOnOptimize) {
         core.getUpdateHandler().registerOptimizeCallback(getEventListener(backupOnOptimize, replicateOnOptimize));
       }
       if (replicateOnCommit || backupOnCommit) {
         // NOTE(review): this forces replicateOnCommit to true even when only
         // backupOnCommit was requested (e.g. replicateAfter=optimize together
         // with backupAfter=commit). Kept as-is; confirm this is intended.
         replicateOnCommit = true;
         core.getUpdateHandler().registerCommitCallback(getEventListener(backupOnCommit, replicateOnCommit));
       }

       if (replicateAfter.contains("startup")) {
         replicateOnStart = true;
         RefCounted<SolrIndexSearcher> s = core.getNewestSearcher(false);
         try {
           DirectoryReader reader = s == null ? null : s.get().getIndexReader();
           // Only pick a commit point when the reader's commit generation is not 1.
           if (reader != null && reader.getIndexCommit() != null && reader.getIndexCommit().getGeneration() != 1L) {
             if (replicateOnOptimize) {
               // Choose the single-segment commit with the highest generation.
               Collection<IndexCommit> commits = DirectoryReader.listCommits(reader.directory());
               for (IndexCommit ic : commits) {
                 if (ic.getSegmentCount() == 1) {
                   if (indexCommitPoint == null || indexCommitPoint.getGeneration() < ic.getGeneration()) {
                     indexCommitPoint = ic;
                   }
                 }
               }
             } else {
               indexCommitPoint = reader.getIndexCommit();
             }
             // We don't need to save commit points for replication, the SolrDeletionPolicy
             // always saves the last commit point (and the last optimized commit point, if needed)
           }

           // reboot the writer on the new index
           core.getUpdateHandler().newIndexWriter();

         } catch (IOException e) {
           LOG.warn("Unable to get IndexCommit on startup", e);
         } finally {
           // Release the searcher reference taken above.
           if (s != null) s.decref();
         }
       }

       String reserve = (String) master.get(RESERVE);
       if (reserve != null && !reserve.trim().equals("")) {
         reserveCommitDuration = SnapPuller.readInterval(reserve);
       }
       LOG.info("Commits will be reserved for  " + reserveCommitDuration);
       isMaster = true;
     }
   }

ReplicationHandler可以響應多種命令：

1) indexversion。

這里需要了解的第一個概念是索引提交點(IndexCommit)，這是底層lucene的東西，可以自行查閱資料。首先獲取最新的索引提交點，然后從其中獲取索引版本號和索引所屬代。

// Excerpt of the "indexversion" command handling (the enclosing method and the
// closing brace of the if-block are not part of this listing).
IndexCommit commitPoint = indexCommitPoint;   // make a copy so it won't change
       // Only answer with real values when a commit point exists and replication is enabled.
       if (commitPoint != null && replicationEnabled.get()) {
        // Ask the deletion policy to reserve this commit's version for reserveCommitDuration.
        core.getDeletionPolicy().setReserveDuration(commitPoint.getVersion(), reserveCommitDuration);
        // Report the commit's version ...
        rsp.add(CMD_INDEX_VERSION, commitPoint.getVersion());

// ... and its generation in the response.
rsp.add(GENERATION, commitPoint.getGeneration());

2）backup。這個命令用來對索引做快照。首先獲取最新的索引提交點，然后創建一個SnapShooter，具體的快照動作由這個對象完成：

/**
 * Handles the "backup" command: resolves the commit to snapshot and hands it to
 * a {@link SnapShooter}, which performs the snapshot asynchronously.
 *
 * <p>The commit is the deletion policy's latest commit; if that is {@code null},
 * the commit of the request's current searcher is used instead. Any exception is
 * logged and added to the response under the key "exception".
 *
 * @param params request parameters; NUMBER_BACKUPS_TO_KEEP (default
 *               {@code Integer.MAX_VALUE}) and "location" are read
 * @param rsp    response that receives the exception, if one occurs
 * @param req    request used to reach the current searcher as a fallback
 */
private void doSnapShoot(SolrParams params, SolrQueryResponse rsp, SolrQueryRequest req) {
  try {
    final int keep = params.getInt(NUMBER_BACKUPS_TO_KEEP, Integer.MAX_VALUE);

    IndexCommit commitToSnapshot = core.getDeletionPolicy().getLatestCommit();
    if (commitToSnapshot == null) {
      // The deletion policy has no commit yet: use the searcher's commit.
      commitToSnapshot = req.getSearcher().getReader().getIndexCommit();
    }

    // small race here before the commit point is saved
    final SnapShooter shooter = new SnapShooter(core, params.get("location"));
    shooter.createSnapAsync(commitToSnapshot, keep, this);
  } catch (Exception e) {
    LOG.warn("Exception during creating a snapshot", e);
    rsp.add("exception", e);
  }
}

快照對象會啟動一個線程去異步地做一個索引備份。

/**
 * Saves the commit point with the deletion policy, then starts a new thread
 * that runs {@code createSnapshot} for it.
 *
 * @param indexCommit        the commit to snapshot
 * @param numberToKeep       passed through to {@code createSnapshot}
 * @param replicationHandler handler whose core's deletion policy is used and
 *                           which is passed through to {@code createSnapshot}
 */
void createSnapAsync(final IndexCommit indexCommit, final int numberToKeep, final ReplicationHandler replicationHandler) {
  // Done on the calling thread, before the background thread is started.
  replicationHandler.core.getDeletionPolicy().saveCommitPoint(indexCommit.getVersion());

  final Runnable snapshotTask = new Runnable() {
    @Override
    public void run() {
      createSnapshot(indexCommit, numberToKeep, replicationHandler);
    }
  };
  new Thread(snapshotTask).start();
}

/**
 * Copies the files of the given commit into a new "snapshot.&lt;timestamp&gt;"
 * directory under snapDir and publishes the outcome (start time, file count,
 * status / error message) as replicationHandler.snapShootDetails.
 *
 * The commit point is released again in the finally block, whatever the outcome.
 *
 * NOTE: this listing is an excerpt; it stops inside the finally block and the
 * remaining closing braces are not shown.
 *
 * @param indexCommit        commit whose files are copied
 * @param numberToKeep       when smaller than Integer.MAX_VALUE, old backups are
 *                           pruned via deleteOldBackups before the new snapshot
 * @param replicationHandler handler that receives the details and whose core's
 *                           deletion policy releases the commit point
 */
void createSnapshot(final IndexCommit indexCommit, int numberToKeep, ReplicationHandler replicationHandler) {

// Details reported back to the replication handler at the end.
NamedList details = new NamedList();

details.add("startTime", new Date().toString());

File snapShotDir = null;

String directoryName = null;

Lock lock = null;

try {

// Integer.MAX_VALUE is the "keep everything" default, so pruning is skipped for it.
if(numberToKeep<Integer.MAX_VALUE) {

deleteOldBackups(numberToKeep);

}

SimpleDateFormat fmt = new SimpleDateFormat(DATE_FMT, Locale.US);

directoryName = "snapshot." + fmt.format(new Date());

lock = lockFactory.makeLock(directoryName + ".lock");

// NOTE(review): the lock is only queried here and released in finally; no call
// that acquires it is visible in this excerpt -- confirm.
if (lock.isLocked()) return;

snapShotDir = new File(snapDir, directoryName);

if (!snapShotDir.mkdir()) {

LOG.warn("Unable to create snapshot directory: " + snapShotDir.getAbsolutePath());

return;

}

// Copy every file referenced by the commit into the snapshot directory.
Collection<String> files = indexCommit.getFileNames();

FileCopier fileCopier = new FileCopier(solrCore.getDeletionPolicy(), indexCommit);

fileCopier.copyFiles(files, snapShotDir);

details.add("fileCount", files.size());

details.add("status", "success");

details.add("snapshotCompletedAt", new Date().toString());

} catch (Exception e) {

// Remove the partially written snapshot directory.
// NOTE(review): snapShotDir is still null if the failure happened before it was
// assigned -- confirm delTree tolerates null.
SnapPuller.delTree(snapShotDir);

LOG.error("Exception while creating snapshot", e);

details.add("snapShootException", e.getMessage());

} finally {

// Counterpart of the saveCommitPoint call made in createSnapAsync.
replicationHandler.core.getDeletionPolicy().releaseCommitPoint(indexCommit.getVersion());

replicationHandler.snapShootDetails = details;

if (lock != null) {

try {

lock.release();

} catch (IOException e) {

LOG.error("Unable to release snapshoot lock: " + directoryName + ".lock");

}

3）fetchindex。響應來自slave節點的取索引文件的請求，會啟動一個線程來實現索引文件的獲取。

// Excerpt of the "fetchindex" command handling (the enclosing method is not
// part of this listing).
String masterUrl = solrParams.get(MASTER_URL);

// A fetch needs either a configured slave or an explicit master URL in the request.
if (!isSlave && masterUrl == null) {

rsp.add(STATUS,ERR_STATUS);

rsp.add("message","No slave configured or no 'masterUrl' Specified");

return;

}

// Copy the parameters for use by the background thread.
final SolrParams paramsCopy = new ModifiableSolrParams(solrParams);

// Run the fetch on its own thread; the response below is sent without waiting for it.
new Thread() {

@Override

public void run() {

doFetch(paramsCopy);

}

}.start();

// OK here only means the fetch was started, not that it succeeded.
rsp.add(STATUS, OK_STATUS);

具體的獲取動作是通過SnapPuller對象來實現的，首先嘗試獲取pull對象鎖，如果請求鎖失敗，則說明還有取索引數據動作未結束，如果請求鎖成功，就調用SnapPuller對象的fetchLatestIndex方法來取最新的索引數據。

/**
 * Runs one index fetch unless another one already holds snapPullLock.
 *
 * When the parameters carry a master URL, a temporary SnapPuller is built from
 * them (with the poll interval removed); otherwise the configured snapPuller is
 * used. Exceptions are logged, not propagated.
 *
 * NOTE: this listing is an excerpt; the method's closing brace is not shown.
 *
 * @param solrParams request parameters, may be null
 */
void doFetch(SolrParams solrParams) {

String masterUrl = solrParams == null ? null : solrParams.get(MASTER_URL);

// Non-blocking: if the lock is already held, give up immediately.
if (!snapPullLock.tryLock())

return;

try {

tempSnapPuller = snapPuller;

if (masterUrl != null) {

// Build a one-off puller from the request parameters, without the poll interval.
NamedList<Object> nl = solrParams.toNamedList();

nl.remove(SnapPuller.POLL_INTERVAL);

tempSnapPuller = new SnapPuller(nl, this, core);

}

tempSnapPuller.fetchLatestIndex(core);

} catch (Exception e) {

LOG.error("SnapPull failed ", e);

} finally {

// Point tempSnapPuller back at the configured puller and release the lock.
tempSnapPuller = snapPuller;

snapPullLock.unlock();

}

最后是真正的取索引數據過程。首先，若master節點的indexversion為0，則說明master節點根本沒有提供可供復制的索引數據；若master節點和slave節點的indexversion相同，則說明slave節點目前與master節點索引數據狀態保持一致，無需同步；若兩者的indexversion不同，則開始索引復制過程：首先從master節點上下載指定索引版本號的索引文件列表，然后創建一個索引文件同步服務線程來完成同步工作。

這里需要區分的是，如果master節點的索引代（generation）不比slave節點新（即slave節點的generation大於或等於master節點的），那就說明兩者已經不相容，此時slave節點需要新建一個索引目錄，再從master節點做一次全量索引復制。還需要注意的一點是，索引同步也是可以同步配置文件的，若配置文件發生變化，則需要對solr核進行一次reload操作。對了，還有，和文章開頭一樣，slave節點同步完數據后，別忘了做一次commit操作，以便刷新自己的索引提交點到最新的狀態。最后，關閉並等待同步服務線程結束。此外，具體的取索引文件是通過FileFetcher對象來完成。

/**
 * Fetches the latest replicable index from the master and installs it locally.
 *
 * Returns false without copying anything when the master cannot be reached,
 * reports version 0, is already in sync with this slave, or returns an empty
 * file list. Otherwise the index files are downloaded into a temporary
 * directory and then either switched to via modifyIndexProps (full copy) or
 * copied into the current index directory, followed by a core reload (config
 * files changed) or a commit (index only).
 *
 * NOTE: this listing is an excerpt; the method's closing brace is not shown and
 * the braces around the conf-files if/else appear unbalanced as listed.
 *
 * @param core the core whose index is to be updated
 * @return the value of successfulInstall, or false for the early exits above
 * @throws IOException declared by the signature
 */
boolean fetchLatestIndex(SolrCore core) throws IOException {

replicationStartTime = System.currentTimeMillis();

try {

// get the current 'replicable' index version in the master

NamedList response = null;

try {

response = getLatestVersion();

} catch (Exception e) {

LOG.error("Master at: " + masterUrl + " is not available. Index fetch failed. Exception: " + e.getMessage());

return false;

}

long latestVersion = (Long) response.get(CMD_INDEX_VERSION);

long latestGeneration = (Long) response.get(GENERATION);

if (latestVersion == 0L) {

//there is nothing to be replicated

return false;

}

// Read the local commit from the newest searcher; the reference is released right away.
IndexCommit commit;

RefCounted<SolrIndexSearcher> searcherRefCounted = null;

try {

searcherRefCounted = core.getNewestSearcher(false);

commit = searcherRefCounted.get().getReader().getIndexCommit();

} finally {

if (searcherRefCounted != null)

searcherRefCounted.decref();

}

if (commit.getVersion() == latestVersion && commit.getGeneration() == latestGeneration) {

// master and slave are already in sync, just return

LOG.info("Slave in sync with master.");

return false;

}

LOG.info("Master's version: " + latestVersion + ", generation: " + latestGeneration);

LOG.info("Slave's version: " + commit.getVersion() + ", generation: " + commit.getGeneration());

LOG.info("Starting replication process");

// get the list of files first

fetchFileList(latestVersion);

// this can happen if the commit point is deleted before we fetch the file list.

if(filesToDownload.isEmpty()) return false;

LOG.info("Number of files in latest index in master: " + filesToDownload.size());

// Create the sync service

fsyncService = Executors.newSingleThreadExecutor();

// use a synchronized list because the list is read by other threads (to show details)

filesDownloaded = Collections.synchronizedList(new ArrayList<Map<String, Object>>());

// if the generation of the master is not newer than that of the slave, they are not compatible to be copied;

// a new index directory is then created and all the files need to be copied

boolean isFullCopyNeeded = commit.getGeneration() >= latestGeneration;

File tmpIndexDir = createTempindexDir(core);

// A stale local index also forces a full copy.
if (isIndexStale())

isFullCopyNeeded = true;

successfulInstall = false;

// Cleared once the temp directory is registered as the index via modifyIndexProps (full copy).
boolean deleteTmpIdxDir = true;

File indexDir = null ;

try {

indexDir = new File(core.getIndexDir());

downloadIndexFiles(isFullCopyNeeded, tmpIndexDir, latestVersion);

LOG.info("Total time taken for download : " + ((System.currentTimeMillis() - replicationStartTime) / 1000) + " secs");

Collection<Map<String, Object>> modifiedConfFiles = getModifiedConfFiles(confFilesToDownload);

// Config files changed: install the index, then reload the core.
if (!modifiedConfFiles.isEmpty()) {

downloadConfFiles(confFilesToDownload, latestVersion);

if (isFullCopyNeeded) {

successfulInstall = modifyIndexProps(tmpIndexDir.getName());

deleteTmpIdxDir = false;

} else {

successfulInstall = copyIndexFiles(tmpIndexDir, indexDir);

}

if (successfulInstall) {

LOG.info("Configuration files are modified, core will be reloaded");

logReplicationTimeAndConfFiles(modifiedConfFiles, successfulInstall);//write to a file time of replication and conf files.

reloadCore();

}

} else {

// Only the index changed: finish the fsync service, install the index, then commit.
terminateAndWaitFsyncService();

if (isFullCopyNeeded) {

successfulInstall = modifyIndexProps(tmpIndexDir.getName());

deleteTmpIdxDir = false;

} else {

successfulInstall = copyIndexFiles(tmpIndexDir, indexDir);

}

if (successfulInstall) {

logReplicationTimeAndConfFiles(modifiedConfFiles, successfulInstall);

doCommit();

}

// NOTE(review): as listed, the else-branch opened above is not closed before
// this point; a closing brace seems to be missing from the excerpt.
replicationStartTime = 0;

return successfulInstall;

} catch (ReplicationHandlerException e) {

LOG.error("User aborted Replication");

} catch (SolrException e) {

throw e;

} catch (Exception e) {

throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Index fetch failed : ", e);

} finally {

// Deletes the temp directory, or the old index directory when deleteTmpIdxDir was cleared.
if (deleteTmpIdxDir) delTree(tmpIndexDir);

else delTree(indexDir);

}

return successfulInstall;

} finally {

// Record the failed attempt, then reset all per-fetch state.
if (!successfulInstall) {

logReplicationTimeAndConfFiles(null, successfulInstall);

}

filesToDownload = filesDownloaded = confFilesDownloaded = confFilesToDownload = null;

replicationStartTime = 0;

fileFetcher = null;

if (fsyncService != null && !fsyncService.isShutdown()) fsyncService.shutdownNow();

fsyncService = null;

stop = false;

fsyncException = null;

}

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 深入剖析SolrCloud（二） Solr4.8.0源碼分析(16)之SolrCloud索引深入(3) 深入剖析Linux IO原理深入剖析 Spring 框架的 BeanFactory 《深入剖析ngx》——配置解析《深入剖析ngx》—— 事件管理深入剖析Android音頻之AudioTrack 源碼剖析——深入Windows句柄本質 Android硬件抽象層(HAL)深入剖析(一) Android硬件抽象層(HAL)深入剖析(二)