Having gone through Faster R-CNN this week, the next step is to walk through its source code so it is easier to work with later on.
Let's start directly from the main entry point, py-faster-rcnn/tools/train_faster_rcnn_alt_opt.py.
The command we run on the back end is:
python ./py-faster-rcnn/tools/train_faster_rcnn_alt_opt.py \
    --gpu 0 \
    --net_name ZF \
    --weights data/imagenet_models/ZF.v2.caffemodel \
    --imdb voc_2007_trainval \
    --cfg experiments/cfgs/faster_rcnn_alt_opt.yml
From this command we can see that we use the GPU with id 0, the ZF network, the pre-trained model ZF.v2.caffemodel, the voc_2007_trainval dataset, and faster_rcnn_alt_opt.yml as the cfg configuration file.
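These flags are mapped by parse_args() onto the argument names used inside the script (gpu_id, net_name, pretrained_model, imdb_name, cfg_file, set_cfgs), as the Namespace printed below shows. Roughly, the mapping looks like the following condensed sketch; it only illustrates how the flags are renamed and is not a verbatim copy of the script's parse_args():

import argparse

parser = argparse.ArgumentParser(description='Train Faster R-CNN (alternating optimization)')
parser.add_argument('--gpu', dest='gpu_id', type=int)       # GPU device id to use
parser.add_argument('--net_name', dest='net_name')          # network name, e.g. ZF
parser.add_argument('--weights', dest='pretrained_model')   # ImageNet pre-trained weights
parser.add_argument('--imdb', dest='imdb_name')             # dataset to train on
parser.add_argument('--cfg', dest='cfg_file')               # extra config file to merge in
parser.add_argument('--set', dest='set_cfgs', nargs='*')    # optional config key/value overrides
args = parser.parse_args()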
First, let's look at the main function:
if __name__ == '__main__':
    args = parse_args()  # parse the command-line arguments
    # Namespace(cfg_file='experiments/cfgs/faster_rcnn_alt_opt.yml', gpu_id=0, imdb_name='voc_2007_trainval',
    #           net_name='ZF', pretrained_model='data/imagenet_models/ZF.v2.caffemodel', set_cfgs=None)

    print('Called with args:')
    print(args)

    if args.cfg_file is not None:
        # a config file was given, so load it: cfg_from_file() (in config.py) loads the yaml,
        # converts it with edict, and merges it into the default config via _merge_a_into_b(a, b)
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    cfg.GPU_ID = args.gpu_id  # the GPU id to use, usually just 0

    # --------------------------------------------------------------------------
    # Pycaffe doesn't reliably free GPU memory when instantiated nets are
    # discarded (e.g. "del net" in Python code). To work around this issue, each
    # training stage is executed in a separate process using
    # multiprocessing.Process.
    # --------------------------------------------------------------------------

    # queue for communicated results between processes
    mp_queue = mp.Queue()  # a multiprocessing queue (inter-process communication, not multi-threading)
    # solvers, iters, etc. for each training stage
    solvers, max_iters, rpn_test_prototxt = get_solvers(args.net_name)  # get the solvers and related settings
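As the long comment explains, each training stage runs in its own process because pycaffe does not reliably release GPU memory when a net is discarded. The pattern itself is plain Python multiprocessing; here is a minimal, self-contained sketch (with a dummy stage function standing in for train_rpn) of how a stage result travels back through the queue:

import multiprocessing as mp

def dummy_stage(queue=None, max_iters=None):
    # stand-in for train_rpn / train_fast_rcnn: do the work, then put the
    # result (here just a fake model path) on the queue before the process exits
    model_path = 'output/stage1_rpn_iter_{}.caffemodel'.format(max_iters)
    queue.put({'model_path': model_path})

if __name__ == '__main__':
    queue = mp.Queue()
    p = mp.Process(target=dummy_stage, kwargs=dict(queue=queue, max_iters=80000))
    p.start()
    out = queue.get()   # blocks until the child process puts its result
    p.join()            # then wait for the child to exit, which frees its GPU memory
    print(out['model_path'])

Once the child process exits, all of its GPU memory is returned to the driver, which is exactly the workaround the comment describes.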
Step into the get_solvers() function:

def get_solvers(net_name):  # net_name is 'ZF' here
    # Faster R-CNN Alternating Optimization
    n = 'faster_rcnn_alt_opt'  # the alternating-optimization training scheme
    # Solver for each training stage
    solvers = [[net_name, n, 'stage1_rpn_solver60k80k.pt'],
               [net_name, n, 'stage1_fast_rcnn_solver30k40k.pt'],
               [net_name, n, 'stage2_rpn_solver60k80k.pt'],
               [net_name, n, 'stage2_fast_rcnn_solver30k40k.pt']]
    solvers = [os.path.join(cfg.MODELS_DIR, *s) for s in solvers]
    # the solver (training hyper-parameter) files for each stage of this scheme,
    # i.e. the RPN stages and the Fast R-CNN stages
    # Iterations for each training stage
    max_iters = [80000, 40000, 80000, 40000]
    # max_iters = [100, 100, 100, 100]
    # Test prototxt for the RPN
    rpn_test_prototxt = os.path.join(
        cfg.MODELS_DIR, net_name, n, 'rpn_test.pt')
    # the prototxt used when testing the RPN, i.e. the network structure at RPN test time
    return solvers, max_iters, rpn_test_prototxt
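For the ZF net this resolves to four concrete solver files under cfg.MODELS_DIR. A quick sketch of what the list comprehension with os.path.join produces (MODELS_DIR is hard-coded here to the value shown in the config dump further below, purely for illustration):

import os

MODELS_DIR = '/home/home/FRCN_ROOT/py-faster-rcnn/models/pascal_voc'  # cfg.MODELS_DIR on this machine
stages = [['ZF', 'faster_rcnn_alt_opt', 'stage1_rpn_solver60k80k.pt'],
          ['ZF', 'faster_rcnn_alt_opt', 'stage1_fast_rcnn_solver30k40k.pt'],
          ['ZF', 'faster_rcnn_alt_opt', 'stage2_rpn_solver60k80k.pt'],
          ['ZF', 'faster_rcnn_alt_opt', 'stage2_fast_rcnn_solver30k40k.pt']]
for s in stages:
    # *s unpacks the 3-element list into separate path components
    print(os.path.join(MODELS_DIR, *s))
# e.g. /home/home/FRCN_ROOT/py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_solver60k80k.pt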
Back in the main function, stage 1 training begins:

    print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
    print 'Stage 1 RPN, init from ImageNet model'
    print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
    cfg.TRAIN.SNAPSHOT_INFIX = 'stage1'
    mp_kwargs = dict(
            queue=mp_queue,
            imdb_name=args.imdb_name,           # 'voc_2007_trainval'
            init_model=args.pretrained_model,   # the pre-trained model 'data/imagenet_models/ZF.v2.caffemodel'
            solver=solvers[0],                  # 'py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_solver60k80k.pt'
            max_iters=max_iters[0],             # maximum number of iterations, 80000
            cfg=cfg)
    p = mp.Process(target=train_rpn, kwargs=mp_kwargs)  # create a process object that runs train_rpn with mp_kwargs
    p.start()
    rpn_stage1_out = mp_queue.get()  # fetch the result from the queue (inter-process communication)
    p.join()                         # wait for the child process to finish
Next, let's step into train_rpn():

def train_rpn(queue=None, imdb_name=None, init_model=None, solver=None,
              max_iters=None, cfg=None):
    """Train a Region Proposal Network in a separate training process.
    """
    # Note: the stage 1 RPN is not trained from any pre-computed proposals;
    # it is trained directly against the ground-truth boxes.
    cfg.TRAIN.HAS_RPN = True
    cfg.TRAIN.BBOX_REG = False  # applies only to Fast R-CNN bbox regression, so it is switched off here
    cfg.TRAIN.PROPOSAL_METHOD = 'gt'  # use the ground-truth boxes as the "proposals"
    cfg.TRAIN.IMS_PER_BATCH = 1
    print 'Init model: {}'.format(init_model)
    print('Using config:')
    pprint.pprint(cfg)  # pprint pretty-prints Python data structures

    import caffe
    _init_caffe(cfg)  # initialize caffe: set the random seed and the training mode (GPU/CPU)

    roidb, imdb = get_roidb(imdb_name)
    print 'roidb len: {}'.format(len(roidb))
    output_dir = get_output_dir(imdb)
    print 'Output will be saved to `{:s}`'.format(output_dir)

    model_paths = train_net(solver, roidb, output_dir,
                            pretrained_model=init_model,
                            max_iters=max_iters)
    # Cleanup all but the final model
    for i in model_paths[:-1]:
        os.remove(i)
    rpn_model_path = model_paths[-1]
    # Send final model path through the multiprocessing queue
    queue.put({'model_path': rpn_model_path})
The config entries printed by pprint.pprint(cfg):
Using config:
{'DATA_DIR': '/home/home/FRCN_ROOT/py-faster-rcnn/data',
 'DEDUP_BOXES': 0.0625,
 'EPS': 1e-14,
 'EXP_DIR': 'faster_rcnn_alt_opt',
 'GPU_ID': 0,
 'MATLAB': 'matlab',
 'MODELS_DIR': '/home/home/FRCN_ROOT/py-faster-rcnn/models/pascal_voc',
 'PIXEL_MEANS': array([[[ 102.9801,  115.9465,  122.7717]]]),
 'RNG_SEED': 3,
 'ROOT_DIR': '/home/home/FRCN_ROOT/py-faster-rcnn',
 'TEST': {'BBOX_REG': True,
          'HAS_RPN': True,
          'MAX_SIZE': 1000,
          'NMS': 0.3,
          'PROPOSAL_METHOD': 'selective_search',
          'RPN_MIN_SIZE': 16,
          'RPN_NMS_THRESH': 0.7,
          'RPN_POST_NMS_TOP_N': 300,
          'RPN_PRE_NMS_TOP_N': 6000,
          'SCALES': [600],
          'SVM': False},
 'TRAIN': {'ASPECT_GROUPING': True,
           'BATCH_SIZE': 128,
           'BBOX_INSIDE_WEIGHTS': [1.0, 1.0, 1.0, 1.0],
           'BBOX_NORMALIZE_MEANS': [0.0, 0.0, 0.0, 0.0],
           'BBOX_NORMALIZE_STDS': [0.1, 0.1, 0.2, 0.2],
           'BBOX_NORMALIZE_TARGETS': True,
           'BBOX_NORMALIZE_TARGETS_PRECOMPUTED': False,
           'BBOX_REG': False,
           'BBOX_THRESH': 0.5,
           'BG_THRESH_HI': 0.5,
           'BG_THRESH_LO': 0.0,
           'FG_FRACTION': 0.25,
           'FG_THRESH': 0.5,
           'HAS_RPN': True,
           'IMS_PER_BATCH': 1,
           'MAX_SIZE': 1000,
           'PROPOSAL_METHOD': 'gt',
           'RPN_BATCHSIZE': 256,
           'RPN_BBOX_INSIDE_WEIGHTS': [1.0, 1.0, 1.0, 1.0],
           'RPN_CLOBBER_POSITIVES': False,
           'RPN_FG_FRACTION': 0.5,
           'RPN_MIN_SIZE': 16,
           'RPN_NEGATIVE_OVERLAP': 0.3,
           'RPN_NMS_THRESH': 0.7,
           'RPN_POSITIVE_OVERLAP': 0.7,
           'RPN_POSITIVE_WEIGHT': -1.0,
           'RPN_POST_NMS_TOP_N': 2000,
           'RPN_PRE_NMS_TOP_N': 12000,
           'SCALES': [600],
           'SNAPSHOT_INFIX': 'stage1',
           'SNAPSHOT_ITERS': 10000,
           'USE_FLIPPED': True,
           'USE_PREFETCH': False},
 'USE_GPU_NMS': True}
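The cfg_from_file() call mentioned earlier is what produced this merged config: it loads the YAML file and recursively overwrites the matching entries of the default config. A simplified sketch of that merge logic (plain dicts instead of easydict, and toy default values, so this illustrates the idea rather than reproducing config.py):

import yaml  # PyYAML, already a dependency of py-faster-rcnn

def merge_a_into_b(a, b):
    """Recursively overwrite entries of the default config b with values from a."""
    for k, v in a.items():
        if k not in b:
            raise KeyError('{} is not a valid config key'.format(k))
        if isinstance(v, dict):
            merge_a_into_b(v, b[k])  # recurse into sub-configs such as TRAIN and TEST
        else:
            b[k] = v

default_cfg = {'EXP_DIR': 'default', 'TRAIN': {'IMS_PER_BATCH': 2, 'BATCH_SIZE': 128}}
yml_cfg = yaml.safe_load('EXP_DIR: faster_rcnn_alt_opt\nTRAIN: {IMS_PER_BATCH: 1}')
merge_a_into_b(yml_cfg, default_cfg)
print(default_cfg)  # EXP_DIR and TRAIN.IMS_PER_BATCH are overridden, BATCH_SIZE is untouched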
Moving on, let's step into roidb, imdb = get_roidb(imdb_name):

def get_roidb(imdb_name, rpn_file=None):
    imdb = get_imdb(imdb_name)
    print 'Loaded dataset `{:s}` for training'.format(imdb.name)  # the dataset has been loaded
    imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
    # set the proposal method to 'gt'; internally the method name string is resolved
    # with eval() and assigned to the roidb handler
    print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
    if rpn_file is not None:
        imdb.config['rpn_file'] = rpn_file
    roidb = get_training_roidb(imdb)
    return roidb, imdb
Step into imdb = get_imdb(imdb_name), which lives in py-faster-rcnn/lib/datasets/factory.py; it essentially uses the factory pattern to adapt to different datasets:
for year in ['2007', '2012']:
for split in ['train', 'val', 'trainval', 'test']:
name = 'voc_{}_{}'.format(year, split)
__sets[name] = (lambda split=split, year=year: pascal_voc(split, year))
def get_imdb(name):
    """Get an imdb (image database) by name."""
    if not __sets.has_key(name):
        raise KeyError('Unknown dataset: {}'.format(name))
    # call the registered factory function: this is the lambda above, which
    # constructs a pascal_voc instance to build the dataset
    return __sets[name]()
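The only subtle part of the factory registration is the lambda: split and year are bound as default arguments so that each registered lambda remembers its own values instead of the final values of the loop variables. A minimal standalone illustration of the same pattern, with a toy constructor standing in for pascal_voc:

__sets = {}

def make_dataset(split, year):  # toy stand-in for the pascal_voc constructor
    return 'pascal_voc({}, {})'.format(split, year)

for year in ['2007', '2012']:
    for split in ['train', 'val', 'trainval', 'test']:
        name = 'voc_{}_{}'.format(year, split)
        # the default arguments freeze the current split/year; without them every
        # lambda would see split='test', year='2012' once the loops finish
        __sets[name] = (lambda split=split, year=year: make_dataset(split, year))

print(__sets['voc_2007_trainval']())  # -> pascal_voc(trainval, 2007)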
This ends up calling pascal_voc() to create the imdb data; the pascal_voc class lives in py-faster-rcnn/lib/datasets/pascal_voc.py:

class pascal_voc(imdb):
    def __init__(self, image_set, year, devkit_path=None):
        imdb.__init__(self, 'voc_' + year + '_' + image_set)  # initialize via the imdb base class
        self._year = year
        self._image_set = image_set
        self._devkit_path = self._get_default_path() if devkit_path is None \
                            else devkit_path
        self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
        self._classes = ('__background__', # always index 0; 21 classes in total including the background
                         'aeroplane', 'bicycle', 'bird', 'boat',
                         'bottle', 'bus', 'car', 'cat', 'chair',
                         'cow', 'diningtable', 'dog', 'horse',
                         'motorbike', 'person', 'pottedplant',
                         'sheep', 'sofa', 'train', 'tvmonitor')
        self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
        # map each class name to an integer index, e.g. sheep -> 17
        self._image_ext = '.jpg'
        self._image_index = self._load_image_set_index()
        # read py-faster-rcnn/data/VOCdevkit2007/VOC2007/ImageSets/Main/trainval.txt,
        # which lists the image ids (e.g. 000005) that make up this image set
        # Default to roidb handler
        self._roidb_handler = self.selective_search_roidb
        self._salt = str(uuid.uuid4())
        self._comp_id = 'comp4'
        # PASCAL specific config options
        self.config = {'cleanup'     : True,
                       'use_salt'    : True,
                       'use_diff'    : False,
                       'matlab_eval' : False,
                       'rpn_file'    : None,
                       'min_size'    : 2}

        assert os.path.exists(self._devkit_path), \
                'VOCdevkit path does not exist: {}'.format(self._devkit_path)
        assert os.path.exists(self._data_path), \
                'Path does not exist: {}'.format(self._data_path)
Only part of the class is shown here. As you can see, pascal_voc is mainly responsible for organizing the input image data: it stores metadata about the images, not the images themselves. pascal_voc is in fact a subclass of imdb, so let's look at the imdb class:

class imdb(object):
    """Image database."""

    def __init__(self, name):
        self._name = name
        self._num_classes = 0
        self._classes = []
        self._image_index = []
        self._obj_proposer = 'selective_search'  # the earlier Fast R-CNN used selective search proposals by default
        self._roidb = None
        self._roidb_handler = self.default_roidb
        # Use this dict for storing dataset specific config options
        self.config = {}

    @property
    def name(self):  # these base-class properties reflect whatever the subclass (pascal_voc) assigned during construction
        return self._name

    @property
    def num_classes(self):
        return len(self._classes)

    @property
    def classes(self):
        return self._classes

    @property
    def image_index(self):
        return self._image_index

    @property  # expose the method as a read-only attribute of the class
    def roidb_handler(self):
        return self._roidb_handler

    @roidb_handler.setter  # the matching setter decorator, so roidb_handler can also be assigned to
    def roidb_handler(self, val):
        self._roidb_handler = val

    def set_proposal_method(self, method):  # use the setter above to choose the proposal method
        method = eval('self.' + method + '_roidb')
        self.roidb_handler = method

    @property
    def roidb(self):
        # A roidb is a list of dictionaries, each with the following keys:
        #   boxes
        #   gt_overlaps
        #   gt_classes
        #   flipped
        if self._roidb is not None:
            return self._roidb
        self._roidb = self.roidb_handler()
        return self._roidb

    @property
    def cache_path(self):
        cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache'))
        if not os.path.exists(cache_path):
            os.makedirs(cache_path)
        return cache_path

    @property
    def num_images(self):
        return len(self.image_index)
It is worth pausing here to inspect the current variable values. Now that the imdb data has been obtained, we return from imdb = get_imdb(imdb_name) to get_roidb(); the next call, set_proposal_method(), sets the method used to generate proposals, which in effect also determines how the roidb data is attached to the imdb:
def set_proposal_method(self, method):
    method = eval('self.' + method + '_roidb')
    self.roidb_handler = method
    # with method='gt' this resolves to self.gt_roidb, i.e. the gt_roidb() function
    # defined in pascal_voc.py
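The combination of the roidb_handler property, its setter, and the eval() lookup is easier to see in isolation. Below is a toy class (not the real imdb) that reproduces the same dispatch; getattr is used in place of eval(), performing the same string-to-method lookup:

class ToyImdb(object):
    def __init__(self):
        self._roidb_handler = None

    @property
    def roidb_handler(self):       # read access, as in imdb.roidb_handler
        return self._roidb_handler

    @roidb_handler.setter
    def roidb_handler(self, val):  # write access, used by set_proposal_method
        self._roidb_handler = val

    def gt_roidb(self):
        return 'ground-truth boxes'

    def selective_search_roidb(self):
        return 'selective-search boxes'

    def set_proposal_method(self, method):
        # the original uses eval('self.' + method + '_roidb');
        # getattr looks up the same bound method by name
        self.roidb_handler = getattr(self, method + '_roidb')

imdb = ToyImdb()
imdb.set_proposal_method('gt')
print(imdb.roidb_handler())  # -> ground-truth boxes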
eval() first resolves the method-name string into the actual bound method, which is then assigned to roidb_handler. Recall that train_rpn() set cfg.TRAIN.PROPOSAL_METHOD = 'gt' (the default is selective search, carried over from Fast R-CNN), so the handler becomes gt_roidb(); let's step into it:

def gt_roidb(self):
    """
    Return the database of ground-truth regions of interest.

    This function loads/saves from/to a cache file to speed up future calls.
    """
    cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
    # if a cache file with the gt box information exists, load and return it (the roidb)
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as fid:
            roidb = cPickle.load(fid)
        print '{} gt roidb loaded from {}'.format(self.name, cache_file)
        return roidb

    gt_roidb = [self._load_pascal_annotation(index)  # otherwise parse the annotation files directly
                for index in self.image_index]
    with open(cache_file, 'wb') as fid:
        cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
    print 'wrote gt roidb to {}'.format(cache_file)

    return gt_roidb
The _load_pascal_annotation() called in the list comprehension above is:

def _load_pascal_annotation(self, index):
    """
    Load image and bounding boxes info from XML file in the PASCAL VOC
    format.
    """
    filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
    tree = ET.parse(filename)        # parse the xml annotation file on disk
    objs = tree.findall('object')    # find all 'object' tags
    if not self.config['use_diff']:  # drop the objects marked as difficult
        # Exclude the samples labeled as difficult
        non_diff_objs = [
            obj for obj in objs if int(obj.find('difficult').text) == 0]
        # if len(non_diff_objs) != len(objs):
        #     print 'Removed {} difficult objects'.format(
        #         len(objs) - len(non_diff_objs))
        objs = non_diff_objs
    num_objs = len(objs)

    boxes = np.zeros((num_objs, 4), dtype=np.uint16)  # box coordinates: 4 values per box, hence 4 columns
    gt_classes = np.zeros((num_objs), dtype=np.int32)  # the class of each gt box
    overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)  # the overlap matrix
    # "Seg" area for pascal is just the box area
    seg_areas = np.zeros((num_objs), dtype=np.float32)  # box areas

    # Load object bounding boxes into a data frame.
    for ix, obj in enumerate(objs):
        bbox = obj.find('bndbox')
        # Make pixel indexes 0-based
        x1 = float(bbox.find('xmin').text) - 1
        y1 = float(bbox.find('ymin').text) - 1
        x2 = float(bbox.find('xmax').text) - 1
        y2 = float(bbox.find('ymax').text) - 1
        cls = self._class_to_ind[obj.find('name').text.lower().strip()]
        boxes[ix, :] = [x1, y1, x2, y2]
        gt_classes[ix] = cls
        overlaps[ix, cls] = 1.0
        seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)

    overlaps = scipy.sparse.csr_matrix(overlaps)

    return {'boxes' : boxes,
            'gt_classes': gt_classes,
            'gt_overlaps' : overlaps,
            'flipped' : False,
            'seg_areas' : seg_areas}
From the above we can see that each roidb entry is a dictionary with five keys: boxes, gt_classes, gt_overlaps, flipped, and seg_areas.
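To make the returned dictionary more concrete, here is a toy annotation in the PASCAL VOC XML layout and the box/class arrays it would produce. This is a simplified sketch of the parsing above (a single object and only a subset of the class mapping), not the library code:

import numpy as np
import xml.etree.ElementTree as ET

# a minimal annotation in the PASCAL VOC layout, with a single 'dog' object
xml_string = """
<annotation>
  <object>
    <name>dog</name>
    <difficult>0</difficult>
    <bndbox><xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax></bndbox>
  </object>
</annotation>
"""

class_to_ind = {'__background__': 0, 'dog': 12}  # subset of the 21-class mapping

objs = ET.fromstring(xml_string).findall('object')
boxes = np.zeros((len(objs), 4), dtype=np.uint16)
gt_classes = np.zeros((len(objs)), dtype=np.int32)
for ix, obj in enumerate(objs):
    bbox = obj.find('bndbox')
    # pixel indexes are made 0-based, exactly as in _load_pascal_annotation
    boxes[ix, :] = [float(bbox.find(t).text) - 1
                    for t in ('xmin', 'ymin', 'xmax', 'ymax')]
    gt_classes[ix] = class_to_ind[obj.find('name').text]

print(boxes)       # [[ 47 239 194 370]]
print(gt_classes)  # [12]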
We now have the initial roidb data derived from the imdb, but this is not yet the roidb used during training. Back in get_roidb(), get_training_roidb(imdb) produces the final training roidb; step into that function:

def get_training_roidb(imdb):
    """Returns a roidb (Region of Interest database) for use in training."""
    if cfg.TRAIN.USE_FLIPPED:
        print 'Appending horizontally-flipped training examples...'
        imdb.append_flipped_images()
        # if flipping is enabled, a horizontally flipped copy of every image is appended;
        # roughly 5000 images become about 10000, which can be seen as data augmentation
        print 'done'

    print 'Preparing training data...'
    rdl_roidb.prepare_roidb(imdb)  # enrich the roidb with extra information needed for training
    print 'done'

    return imdb.roidb
Step into the flipping function append_flipped_images():

def append_flipped_images(self):
    num_images = self.num_images
    widths = self._get_widths()  # the widths are obtained internally with the PIL library
    for i in xrange(num_images):
        boxes = self.roidb[i]['boxes'].copy()
        oldx1 = boxes[:, 0].copy()
        oldx2 = boxes[:, 2].copy()
        boxes[:, 0] = widths[i] - oldx2 - 1
        boxes[:, 2] = widths[i] - oldx1 - 1
        assert (boxes[:, 2] >= boxes[:, 0]).all()
        entry = {'boxes' : boxes,
                 'gt_overlaps' : self.roidb[i]['gt_overlaps'],
                 'gt_classes' : self.roidb[i]['gt_classes'],
                 'flipped' : True}
        self.roidb.append(entry)
    self._image_index = self._image_index * 2
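For example, in an image 500 pixels wide, a box with x1 = 100 and x2 = 200 becomes x1 = 500 - 200 - 1 = 299 and x2 = 500 - 100 - 1 = 399 after flipping, while the y coordinates are untouched. A short check of that arithmetic:

import numpy as np

width = 500
boxes = np.array([[100, 50, 200, 150]])   # x1, y1, x2, y2
flipped = boxes.copy()
flipped[:, 0] = width - boxes[:, 2] - 1   # new x1 comes from the old x2
flipped[:, 2] = width - boxes[:, 0] - 1   # new x2 comes from the old x1
print(flipped)                            # [[299  50 399 150]]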
Step into rdl_roidb.prepare_roidb(imdb):

def prepare_roidb(imdb):
    """Enrich the imdb's roidb by adding some derived quantities that
    are useful for training. This function precomputes the maximum
    overlap, taken over ground-truth boxes, between each ROI and
    each ground-truth box. The class with maximum overlap is also
    recorded.
    """
    sizes = [PIL.Image.open(imdb.image_path_at(i)).size
             for i in xrange(imdb.num_images)]
    roidb = imdb.roidb
    for i in xrange(len(imdb.image_index)):
        # add the image path, width and height
        roidb[i]['image'] = imdb.image_path_at(i)
        roidb[i]['width'] = sizes[i][0]
        roidb[i]['height'] = sizes[i][1]
        # need gt_overlaps as a dense array for argmax
        gt_overlaps = roidb[i]['gt_overlaps'].toarray()
        # max overlap with gt over classes (columns)
        max_overlaps = gt_overlaps.max(axis=1)
        # gt class that had the max overlap
        max_classes = gt_overlaps.argmax(axis=1)
        roidb[i]['max_classes'] = max_classes    # the class with the highest overlap
        roidb[i]['max_overlaps'] = max_overlaps  # the highest overlap itself
        # sanity checks
        # max overlap of 0 => class should be zero (background)
        zero_inds = np.where(max_overlaps == 0)[0]
        assert all(max_classes[zero_inds] == 0)
        # max overlap > 0 => class should not be zero (must be a fg class)
        nonzero_inds = np.where(max_overlaps > 0)[0]
        assert all(max_classes[nonzero_inds] != 0)
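Because every entry came from gt_roidb(), each row of gt_overlaps contains exactly one 1.0, at the ground-truth class of that box, so max_overlaps ends up all ones and max_classes is simply the gt class of each box. A toy check of that computation (class indices chosen for illustration):

import numpy as np
import scipy.sparse

# toy gt_overlaps for an image with two gt objects and 21 classes:
# object 0 is class 12, object 1 is class 8
dense = np.zeros((2, 21), dtype=np.float32)
dense[0, 12] = 1.0
dense[1, 8] = 1.0
gt_overlaps = scipy.sparse.csr_matrix(dense)  # stored sparse in the roidb

dense_again = gt_overlaps.toarray()           # back to dense for max/argmax
print(dense_again.max(axis=1))                # [1. 1.]  -> max_overlaps
print(dense_again.argmax(axis=1))             # [12  8]  -> max_classes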
Let's look at the structure of the roidb at this point.
Taking image 000005.jpg (index 000005) as an example, its roidb entry now contains: boxes, flipped (whether the image has been flipped), gt_classes, gt_overlaps, height, image, max_classes, max_overlaps, seg_areas (the areas of the boxes), and width.
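Putting everything together, a single (unflipped) roidb entry now looks like the sketch below; the string values simply describe each field, and the image path and size are illustrative rather than real data:

roidb_entry = {
    'image'        : 'data/VOCdevkit2007/VOC2007/JPEGImages/000005.jpg',  # path added by prepare_roidb
    'width'        : 500,    # image size added by prepare_roidb (illustrative numbers)
    'height'       : 375,
    'boxes'        : 'uint16 array of shape (num_objs, 4), one (x1, y1, x2, y2) row per gt object',
    'gt_classes'   : 'int32 array of shape (num_objs,), the class index of each gt box',
    'gt_overlaps'  : 'sparse (num_objs, 21) matrix with 1.0 at the gt class of each box',
    'seg_areas'    : 'float32 array of the gt box areas',
    'flipped'      : False,  # True for the appended horizontally-flipped copies
    'max_classes'  : 'per-box argmax over gt_overlaps, added by prepare_roidb',
    'max_overlaps' : 'per-box max over gt_overlaps, added by prepare_roidb',
}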
Up to this point we have used the factory pattern to adapt the pascal_voc dataset and parsed the XML annotation files to obtain the ground-truth boxes (the roidb). That concludes part one.