在(上)中講了如何得到csv文件並調用noduleCADEvaluationLUNA16.py求取froc值,這里就講一講froc值是如何求取的。
annotations_filename = './annotations/annotations.csv' annotations_excluded_filename = './annotations/annotations_excluded.csv' seriesuids_filename = './annotations/seriesuids.csv' results_filename = './annotations/3DRes18FasterR-CNN.csv'#3D Faster R-CNN - Res18.csv' #top5.csv'# seriesuids_path = '/home/ustc/lclin/code/DeepLung/evaluationScript/series_uid9.csv' results_path = "/home/ustc/lclin/code/DeepLung/detector/results/wzy_res18/retrft969/val120/predanno-0.125pbb.csv" noduleCADEvaluation(annotations_filename,annotations_excluded_filename,seriesuids_path,results_path,'./')
如上面代碼所示,輸入標簽文件,結果文件,調用noduleCADEvaluation函數即可,任務完成,十分簡單。
接下來看一下這個函數。
def noduleCADEvaluation(annotations_filename,annotations_excluded_filename,seriesuids_filename,results_filename,outputDir): ''' function to load annotations and evaluate a CAD algorithm @param annotations_filename: list of annotations @param annotations_excluded_filename: list of annotations that are excluded from analysis @param seriesuids_filename: list of CT images in seriesuids @param results_filename: list of CAD marks with probabilities @param outputDir: output directory ''' print annotations_filename (allNodules, seriesUIDs) = collect(annotations_filename, annotations_excluded_filename, seriesuids_filename) #根據標簽和用戶id,求出所有結節,所有用戶 evaluateCAD(seriesUIDs, results_filename, outputDir, allNodules, #根據結節,用戶,結果文件,輸出froc值 os.path.splitext(os.path.basename(results_filename))[0], maxNumberOfCADMarks=100, performBootstrapping=bPerformBootstrapping, numberOfBootstrapSamples=bNumberOfBootstrapSamples, confidence=bConfidence)
這個函數調用了兩個函數collect和evaluateCAD。先從collect開始看。
def collect(annotations_filename,annotations_excluded_filename,seriesuids_filename): annotations = csvTools.readCSV(annotations_filename) annotations_excluded = csvTools.readCSV(annotations_excluded_filename) seriesUIDs_csv = csvTools.readCSV(seriesuids_filename) seriesUIDs = [] #建立一個用戶列表,將用戶id添加進去 for seriesUID in seriesUIDs_csv: seriesUIDs.append(seriesUID[0]) #seriesUID也是一個列表,只不過只有一個元素 allNodules = collectNoduleAnnotations(annotations, annotations_excluded, seriesUIDs) return (allNodules, seriesUIDs)
該函數又調用了另一個函數collectNoduleAnnotations。
def collectNoduleAnnotations(annotations, annotations_excluded, seriesUIDs): allNodules = {} #將所有結節存儲在字典中 noduleCount = 0 noduleCountTotal = 0 for seriesuid in seriesUIDs: # print 'adding nodule annotations: ' + seriesuid nodules = [] numberOfIncludedNodules = 0 #對真正用來檢測的結節計數 # add included findings header = annotations[0] for annotation in annotations[1:]: nodule_seriesuid = annotation[header.index(seriesuid_label)] if seriesuid == nodule_seriesuid: #將結節所屬的用戶id與要建立字典索引的id比較,若相同,就獲取它 nodule = getNodule(annotation, header, state = "Included") nodules.append(nodule) numberOfIncludedNodules += 1 # add excluded findings header = annotations_excluded[0] for annotation in annotations_excluded[1:]: nodule_seriesuid = annotation[header.index(seriesuid_label)] if seriesuid == nodule_seriesuid: #對無關結節也執行上面的操作,不同的是不對include計數 nodule = getNodule(annotation, header, state = "Excluded") nodules.append(nodule) allNodules[seriesuid] = nodules noduleCount += numberOfIncludedNodules noduleCountTotal += len(nodules) print 'Total number of included nodule annotations: ' + str(noduleCount) print 'Total number of nodule annotations: ' + str(noduleCountTotal) return allNodules
這段代碼比較簡單,其中有一個獲取結節的函數getNodule,需要看一下
def getNodule(annotation, header, state = ""): nodule = NoduleFinding() nodule.coordX = annotation[header.index(coordX_label)] #依次將x,y,z添加到nodule對象的屬性中 nodule.coordY = annotation[header.index(coordY_label)] nodule.coordZ = annotation[header.index(coordZ_label)] if diameter_mm_label in header: #檢查有無直徑標簽,有則添加 nodule.diameter_mm = annotation[header.index(diameter_mm_label)] if CADProbability_label in header: #檢查有無概率標簽,有則添加 nodule.CADprobability = annotation[header.index(CADProbability_label)] if not state == "": nodule.state = state return nodule
這里又又出現一個函數,准確地說是類,NoduleFinding(),這個類存在於nodulefinding.py模塊,也是在evaluationScript這個文件夾中。
一起來欣賞下這個類,一目了然,而且這個模塊也只有這段代碼。
class NoduleFinding(object): ''' Represents a nodule ''' def __init__(self, noduleid=None, coordX=None, coordY=None, coordZ=None, coordType="World", CADprobability=None, noduleType=None, diameter=None, state=None, seriesInstanceUID=None): # set the variables and convert them to the correct type self.id = noduleid self.coordX = coordX self.coordY = coordY self.coordZ = coordZ self.coordType = coordType self.CADprobability = CADprobability self.noduleType = noduleType self.diameter_mm = diameter self.state = state self.candidateID = None self.seriesuid = seriesInstanceUID
另外,大家可能會奇怪,xxx_label都是些什么,這些在文件noduleCADEvaluationLUNA16.py中都有定義,如下,看一下標簽文件就明白了。
seriesuid_label = 'seriesuid' coordX_label = 'coordX' coordY_label = 'coordY' coordZ_label = 'coordZ' diameter_mm_label = 'diameter_mm' CADProbability_label = 'probability'
最后,回到起點,看看froc究竟是如何計算的,有請evaluateCAD函數,這段代碼超級長。
def evaluateCAD(seriesUIDs, results_filename, outputDir, allNodules, CADSystemName, maxNumberOfCADMarks=-1,#輸入病人id,檢測結果,評估結果的輸出目錄,所有標簽結節, performBootstrapping=True,numberOfBootstrapSamples=1000,confidence = 0.95): #檢測系統的名字,其余參數與自助采樣有關,不太懂 ''' function to evaluate a CAD algorithm @param seriesUIDs: list of the seriesUIDs of the cases to be processed @param results_filename: file with results @param outputDir: output directory @param allNodules: dictionary with all nodule annotations of all cases, keys of the dictionary are the seriesuids @param CADSystemName: name of the CAD system, to be used in filenames and on FROC curve ''' nodOutputfile = open(os.path.join(outputDir,'CADAnalysis.txt'),'w') #寫入CADAnalysis文件 nodOutputfile.write("\n") nodOutputfile.write((60 * "*") + "\n") nodOutputfile.write("CAD Analysis: %s\n" % CADSystemName) nodOutputfile.write((60 * "*") + "\n") nodOutputfile.write("\n") results = csvTools.readCSV(results_filename)#讀取檢測結果csv格式,seriesuid,coordX,coordY,coordZ,probability allCandsCAD = {} for seriesuid in seriesUIDs: #對每個病例讀取相應的候選結節 # collect candidates from result file nodules = {} #從檢測結果文件中獲取與病例對應的候選結節,添加進字典 header = results[0] i = 0 for result in results[1:]: nodule_seriesuid = result[header.index(seriesuid_label)] if seriesuid == nodule_seriesuid: nodule = getNodule(result, header) nodule.candidateID = i nodules[nodule.candidateID] = nodule i += 1 if (maxNumberOfCADMarks > 0): # number of CAD marks, only keep must suspicous marks if len(nodules.keys()) > maxNumberOfCADMarks: # make a list of all probabilities probs = [] for keytemp, noduletemp in nodules.iteritems(): probs.append(float(noduletemp.CADprobability)) probs.sort(reverse=True) # sort from large to small probThreshold = probs[maxNumberOfCADMarks] nodules2 = {} nrNodules2 = 0 for keytemp, noduletemp in nodules.iteritems(): if nrNodules2 >= maxNumberOfCADMarks: break if float(noduletemp.CADprobability) > probThreshold: nodules2[keytemp] = noduletemp nrNodules2 += 1 nodules = nodules2 # print 'adding candidates: ' + seriesuid allCandsCAD[seriesuid] = nodules #將病例與對應候選結節存入字典 # open output files nodNoCandFile = open(os.path.join(outputDir, "nodulesWithoutCandidate_%s.txt" % CADSystemName), 'w') # --- iterate over all cases (seriesUIDs) and determine how # often a nodule annotation is not covered by a candidate # initialize some variables to be used in the loop candTPs = 0 #這里是一堆定義,讓人頭大 candFPs = 0 candFNs = 0 candTNs = 0 totalNumberOfCands = 0 #總候選結節數,也就是你的檢測結果文件中的所有結節數量 totalNumberOfNodules = 0 #總標簽結節數 doubleCandidatesIgnored = 0 irrelevantCandidates = 0 minProbValue = -1000000000.0 # minimum value of a float FROCGTList = [] FROCProbList = [] FPDivisorList = [] excludeList = [] FROCtoNoduleMap = [] ignoredCADMarksList = [] # -- loop over the cases for seriesuid in seriesUIDs: # get the candidates for this case try: candidates = allCandsCAD[seriesuid] except KeyError: candidates = {} # add to the total number of candidates totalNumberOfCands += len(candidates.keys()) # make a copy in which items will be deleted candidates2 = candidates.copy() #對候選結節復制一個副本 # get the nodule annotations on this case try: noduleAnnots = allNodules[seriesuid] #獲取所有結節中屬於該病例的結節(既包括真的結節,也包括無關的結節) except KeyError: noduleAnnots = [] # - loop over the nodule annotations for noduleAnnot in noduleAnnots: #對標簽結節循環處理 # increment the number of nodules if noduleAnnot.state == "Included": #如果是用來評測的真結節,則 totalNumberOfNodules += 1 x = float(noduleAnnot.coordX) #獲取結節坐標 y = float(noduleAnnot.coordY) z = float(noduleAnnot.coordZ) # 2. Check if the nodule annotation is covered by a candidate # A nodule is marked as detected when the center of mass of the candidate is within a distance R of # the center of the nodule. In order to ensure that the CAD mark is displayed within the nodule on the # CT scan, we set R to be the radius of the nodule size. diameter = float(noduleAnnot.diameter_mm) if diameter < 0.0: diameter = 10.0 radiusSquared = pow((diameter / 2.0), 2.0) found = False noduleMatches = [] for key, candidate in candidates.iteritems(): #對每一個候選結節,判斷是否與真實結節相交 x2 = float(candidate.coordX) y2 = float(candidate.coordY) z2 = float(candidate.coordZ) dist = math.pow(x - x2, 2.) + math.pow(y - y2, 2.) + math.pow(z - z2, 2.) if dist < radiusSquared: #判斷是否在半徑距離內 if (noduleAnnot.state == "Included"): #若是用來檢測的結節,添加進noduleMatches,並刪除該候選結節 found = True noduleMatches.append(candidate) if key not in candidates2.keys():#這里不復雜,就是說把每個與標簽結節相交的候選結節提取出來后,要刪除副本中的id,這樣是檢測會否有其它標簽結節也與該結節所屬的候選結節相交 print "This is strange: CAD mark %s detected two nodules! Check for overlapping nodule annotations, SeriesUID: %s, nodule Annot ID: %s" % (str(candidate.id), seriesuid, str(noduleAnnot.id)) else: del candidates2[key] elif (noduleAnnot.state == "Excluded"): # an excluded nodule #若是無關結節,則添加進ignoredCADMarksList if bOtherNodulesAsIrrelevant: # delete marks on excluded nodules so they don't count as false positives if key in candidates2.keys(): irrelevantCandidates += 1 ignoredCADMarksList.append("%s,%s,%s,%s,%s,%s,%.9f" % (seriesuid, -1, candidate.coordX, candidate.coordY, candidate.coordZ, str(candidate.id), float(candidate.CADprobability))) del candidates2[key] if len(noduleMatches) > 1: # double detection #如果一個標簽結節對應多個候選結節,記錄下數目 doubleCandidatesIgnored += (len(noduleMatches) - 1) if noduleAnnot.state == "Included": #若該結節是include結節 # only include it for FROC analysis if it is included # otherwise, the candidate will not be counted as FP, but ignored in the # analysis since it has been deleted from the nodules2 vector of candidates if found == True: #若找到與之匹配的候選結節 # append the sample with the highest probability for the FROC analysis maxProb = None for idx in range(len(noduleMatches)): candidate = noduleMatches[idx] if (maxProb is None) or (float(candidate.CADprobability) > maxProb): maxProb = float(candidate.CADprobability) #記錄匹配的候選結節的概率,將最大概率存入maxProb FROCGTList.append(1.0) #添加 1 FROCProbList.append(float(maxProb)) #添加剛剛的最大概率 FPDivisorList.append(seriesuid) #添加病例id excludeList.append(False) #添加False FROCtoNoduleMap.append("%s,%s,%s,%s,%s,%.9f,%s,%.9f" % (seriesuid, noduleAnnot.id, noduleAnnot.coordX, noduleAnnot.coordY, noduleAnnot.coordZ, float(noduleAnnot.diameter_mm), str(candidate.id), float(candidate.CADprobability))) candTPs += 1 #添加1 else: #若未找到與之匹配的結節 candFNs += 1 # append a positive sample with the lowest probability, such that this is added in the FROC analysis FROCGTList.append(1.0) FROCProbList.append(minProbValue) FPDivisorList.append(seriesuid) excludeList.append(True) FROCtoNoduleMap.append("%s,%s,%s,%s,%s,%.9f,%s,%s" % (seriesuid, noduleAnnot.id, noduleAnnot.coordX, noduleAnnot.coordY, noduleAnnot.coordZ, float(noduleAnnot.diameter_mm), int(-1), "NA")) nodNoCandFile.write("%s,%s,%s,%s,%s,%.9f,%s\n" % (seriesuid, noduleAnnot.id, noduleAnnot.coordX, noduleAnnot.coordY, noduleAnnot.coordZ, float(noduleAnnot.diameter_mm), str(-1))) # add all false positives to the vectors for key, candidate3 in candidates2.iteritems(): #此時candidata2中都是無人領取的結節,都是FP candFPs += 1 FROCGTList.append(0.0) FROCProbList.append(float(candidate3.CADprobability)) FPDivisorList.append(seriesuid) excludeList.append(False) FROCtoNoduleMap.append("%s,%s,%s,%s,%s,%s,%.9f" % (seriesuid, -1, candidate3.coordX, candidate3.coordY, candidate3.coordZ, str(candidate3.id), float(candidate3.CADprobability))) if not (len(FROCGTList) == len(FROCProbList) and len(FROCGTList) == len(FPDivisorList) and len(FROCGTList) == len(FROCtoNoduleMap) and len(FROCGTList) == len(excludeList)): nodOutputfile.write("Length of FROC vectors not the same, this should never happen! Aborting..\n") nodOutputfile.write("Candidate detection results:\n") nodOutputfile.write(" True positives: %d\n" % candTPs) nodOutputfile.write(" False positives: %d\n" % candFPs) nodOutputfile.write(" False negatives: %d\n" % candFNs) nodOutputfile.write(" True negatives: %d\n" % candTNs) nodOutputfile.write(" Total number of candidates: %d\n" % totalNumberOfCands) nodOutputfile.write(" Total number of nodules: %d\n" % totalNumberOfNodules) nodOutputfile.write(" Ignored candidates on excluded nodules: %d\n" % irrelevantCandidates) nodOutputfile.write(" Ignored candidates which were double detections on a nodule: %d\n" % doubleCandidatesIgnored) if int(totalNumberOfNodules) == 0: nodOutputfile.write(" Sensitivity: 0.0\n") else: nodOutputfile.write(" Sensitivity: %.9f\n" % (float(candTPs) / float(totalNumberOfNodules))) nodOutputfile.write(" Average number of candidates per scan: %.9f\n" % (float(totalNumberOfCands) / float(len(seriesUIDs)))) # compute FROC print FROCGTList print FROCProbList print len(FROCGTList) fps, sens, thresholds = computeFROC(FROCGTList,FROCProbList,len(seriesUIDs),excludeList) #這一步最為關鍵,計算FROC值,其實返回的是召回率與假陽性率的列表 if performBootstrapping: #是否使用自助采樣,不太懂 fps_bs_itp,sens_bs_mean,sens_bs_lb,sens_bs_up = computeFROC_bootstrap(FROCGTList,FROCProbList,FPDivisorList,seriesUIDs,excludeList, numberOfBootstrapSamples=numberOfBootstrapSamples, confidence = confidence) # Write FROC curve #計算到上面就結束了,后面是一些畫圖 with open(os.path.join(outputDir, "froc_%s.txt" % CADSystemName), 'w') as f: for i in range(len(sens)): f.write("%.9f,%.9f,%.9f\n" % (fps[i], sens[i], thresholds[i])) # Write FROC vectors to disk as well with open(os.path.join(outputDir, "froc_gt_prob_vectors_%s.csv" % CADSystemName), 'w') as f: for i in range(len(FROCGTList)): f.write("%d,%.9f\n" % (FROCGTList[i], FROCProbList[i])) fps_itp = np.linspace(FROC_minX, FROC_maxX, num=10001) sens_itp = np.interp(fps_itp, fps, sens) frvvlu = 0 nxth = 0.125 for fp, ss in zip(fps_itp, sens_itp): if abs(fp - nxth) < 3e-4: frvvlu += ss nxth *= 2 if abs(nxth - 16) < 1e-5: break print(frvvlu/7, nxth) print(sens_itp[fps_itp==0.125]+sens_itp[fps_itp==0.25]+sens_itp[fps_itp==0.5]+sens_itp[fps_itp==1]+sens_itp[fps_itp==2]\ +sens_itp[fps_itp==4]+sens_itp[fps_itp==8]) if performBootstrapping: # Write mean, lower, and upper bound curves to disk with open(os.path.join(outputDir, "froc_%s_bootstrapping.csv" % CADSystemName), 'w') as f: f.write("FPrate,Sensivity[Mean],Sensivity[Lower bound],Sensivity[Upper bound]\n") for i in range(len(fps_bs_itp)): f.write("%.9f,%.9f,%.9f,%.9f\n" % (fps_bs_itp[i], sens_bs_mean[i], sens_bs_lb[i], sens_bs_up[i])) else: fps_bs_itp = None sens_bs_mean = None sens_bs_lb = None sens_bs_up = None # create FROC graphs if int(totalNumberOfNodules) > 0: graphTitle = str("") fig1 = plt.figure() ax = plt.gca() clr = 'b' plt.plot(fps_itp, sens_itp, color=clr, label="%s" % CADSystemName, lw=2) if performBootstrapping: plt.plot(fps_bs_itp, sens_bs_mean, color=clr, ls='--') plt.plot(fps_bs_itp, sens_bs_lb, color=clr, ls=':') # , label = "lb") plt.plot(fps_bs_itp, sens_bs_up, color=clr, ls=':') # , label = "ub") ax.fill_between(fps_bs_itp, sens_bs_lb, sens_bs_up, facecolor=clr, alpha=0.05) xmin = FROC_minX xmax = FROC_maxX plt.xlim(xmin, xmax) plt.ylim(0.5, 1) plt.xlabel('Average number of false positives per scan') plt.ylabel('Sensitivity') plt.legend(loc='lower right') plt.title('FROC performance - %s' % (CADSystemName)) if bLogPlot: plt.xscale('log', basex=2) ax.xaxis.set_major_formatter(FixedFormatter([0.125,0.25,0.5,1,2,4,8])) # set your ticks manually ax.xaxis.set_ticks([0.125,0.25,0.5,1,2,4,8]) ax.yaxis.set_ticks(np.arange(0.5, 1, 0.1)) # ax.yaxis.set_ticks(np.arange(0, 1.1, 0.1)) plt.grid(b=True, which='both') plt.tight_layout() plt.savefig(os.path.join(outputDir, "froc_%s.png" % CADSystemName), bbox_inches=0, dpi=300) return (fps, sens, thresholds, fps_bs_itp, sens_bs_mean, sens_bs_lb, sens_bs_up)
這里會計算FROC,調用的是computeFROC函數。
def computeFROC(FROCGTList, FROCProbList, totalNumberOfImages, excludeList): # Remove excluded candidates FROCGTList_local = [] FROCProbList_local = [] for i in range(len(excludeList)): if excludeList[i] == False: FROCGTList_local.append(FROCGTList[i]) FROCProbList_local.append(FROCProbList[i]) numberOfDetectedLesions = sum(FROCGTList_local) totalNumberOfLesions = sum(FROCGTList) totalNumberOfCandidates = len(FROCProbList_local) fpr, tpr, thresholds = skl_metrics.roc_curve(FROCGTList_local, FROCProbList_local, pos_label=1) if sum(FROCGTList) == len(FROCGTList): # Handle border case when there are no false positives and ROC analysis give nan values. print "WARNING, this system has no false positives.." fps = np.zeros(len(fpr)) else: fps = fpr * (totalNumberOfCandidates - numberOfDetectedLesions) / totalNumberOfImages sens = (tpr * numberOfDetectedLesions) / totalNumberOfLesions return fps, sens, thresholds
到此,結束。