『TensorFlow』SSD Source Code Study, Part 5: TFR Data Reading & Data Preprocessing


Forked project repository: SSD

I. TFR Data Reading

Creating the slim.dataset.Dataset object

Data fetching in train_ssd_network.py looks like the following; the first thing it needs is a slim.dataset.Dataset object:

# Select the dataset.
# args: dataset name, split name ('train'), and the directory holding the TFR files
# TFR file naming pattern: 'voc_2012_%s_*.tfrecord', where %s is 'train' or 'test'
dataset = dataset_factory.get_dataset(
    FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

The fetch goes through a chain of rather bloated calls; the intermediate functions are listed below, invoked from top to bottom:

def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None):
    """
    Returns:
        A `Dataset` class.
    Raises:
        ValueError: If the dataset `name` is unknown.
    """
    if name not in datasets_map:
        raise ValueError('Name of dataset unknown %s' % name)
    # pascalvoc_2012.get_split
    return datasets_map[name].get_split(split_name,
                                        dataset_dir,
                                        file_pattern,
                                        reader)


def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
    """
    Returns:
      A `Dataset` namedtuple.
    Raises:
        ValueError: if `split_name` is not a valid train/test split.
    """
    if not file_pattern:
        file_pattern = FILE_PATTERN  # file names must match 'voc_2012_%s_*.tfrecord'
    return pascalvoc_common.get_split(split_name, dataset_dir,
                                      file_pattern, reader,
                                      SPLITS_TO_SIZES,  # {'train': 17125,}
                                      ITEMS_TO_DESCRIPTIONS,
                                      NUM_CLASSES  # 20
                                      )

''' ITEMS_TO_DESCRIPTIONS:
    {'image': 'A color image of varying height and width.',
     'shape': 'Shape of the image',
     'object/bbox': 'A list of bounding boxes, one per each object.',
     'object/label': 'A list of labels, one per each object.',}
'''

The final call constructs the slim.dataset.Dataset (for an analysis, see 『TensorFlow』Reading Data from Disk); in fact, anything that satisfies slim.dataset.Dataset's constructor arguments would do:

def get_split(split_name, dataset_dir, file_pattern, reader,
              split_to_sizes, items_to_descriptions, num_classes):
    """Gets a dataset tuple with instructions for reading Pascal VOC dataset.

    Args:
      split_name: A train/test split name.
      dataset_dir: The base directory of the dataset sources.
      file_pattern: The file pattern to use when matching the dataset sources.
        It is assumed that the pattern contains a '%s' string so that the split
        name can be inserted.
      reader: The TensorFlow reader type.

    Returns:
      A `Dataset` namedtuple.

    Raises:
        ValueError: if `split_name` is not a valid train/test split.
    """
    # 'train'
    if split_name not in split_to_sizes:
        raise ValueError('split name %s was not recognized.' % split_name)
    file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

    # Allowing None in the signature so that dataset_factory can use the default.
    if reader is None:
        reader = tf.TFRecordReader
    # Features in Pascal VOC TFRecords.
    keys_to_features = {  # how the TFR file is decoded
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {  # items assembled from the decoded binary data
        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'shape': slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox': slim.tfexample_decoder.BoundingBox(
                ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    # run the decoding
    decoder = slim.tfexample_decoder.TFExampleDecoder(
        keys_to_features, items_to_handlers)

    labels_to_names = None
    # tf.gfile.Exists(os.path.join(dataset_dir, 'labels.txt'))
    if dataset_utils.has_labels(dataset_dir):
        labels_to_names = dataset_utils.read_label_file(dataset_dir)
    # else:
    #     labels_to_names = create_readable_names_for_imagenet_labels()
    #     dataset_utils.write_label_file(labels_to_names, dataset_dir)

    return slim.dataset.Dataset(
            data_sources=file_pattern,                    # TFR file name pattern
            reader=reader,                                # reader class
            decoder=decoder,                              # Tensor decoder
            num_samples=split_to_sizes[split_name],       # sample count
            items_to_descriptions=items_to_descriptions,  # descriptions of the decoder items
            num_classes=num_classes,                      # number of classes
            labels_to_names=labels_to_names               # dict {label id: class name, ...}
    )

''' items_to_descriptions:
    {'image': 'A color image of varying height and width.',
     'shape': 'Shape of the image',
     'object/bbox': 'A list of bounding boxes, one per each object.',
     'object/label': 'A list of labels, one per each object.',}
'''
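
To make the point above concrete — slim.dataset.Dataset really is just a bundle of named arguments — here is a minimal, hypothetical sketch of building one by hand. The file pattern, sample count, and single-item decoder below are all made up for illustration:

import tensorflow as tf
slim = tf.contrib.slim

# Minimal decoder: just the encoded image bytes.
keys_to_features = {
    'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
    'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
}
items_to_handlers = {
    'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
}
decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                  items_to_handlers)

toy_dataset = slim.dataset.Dataset(
    data_sources='my_toy_*.tfrecord',   # hypothetical file pattern
    reader=tf.TFRecordReader,
    decoder=decoder,
    num_samples=100,                    # hypothetical sample count
    items_to_descriptions={'image': 'A color image.'})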

One extra note: in the stored data, ymin, xmin, ymax and xmax are each stored with shape (n,) (n being the number of objects in the image), but after items_to_handlers is applied, the new handler item object/bbox has shape (n, 4). Since this bears on a whole series of later multi-object detection steps, it is worth keeping in mind.
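For intuition, here is a minimal sketch (not the handler's actual source) of the stacking that the BoundingBox handler effectively performs; the coordinate values are hypothetical:

import tensorflow as tf

# Hypothetical coordinates for an image with two objects, each stored as (n,).
ymin = tf.constant([0.1, 0.4])
xmin = tf.constant([0.2, 0.3])
ymax = tf.constant([0.5, 0.9])
xmax = tf.constant([0.6, 0.8])

# The handler assembles them column-wise into a single (n, 4) tensor,
# ordered [ymin, xmin, ymax, xmax] as requested above.
bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=1)  # shape (2, 4)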

Fetching data from the TFR

            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,  # DatasetDataProvider takes a slim.dataset.Dataset as argument
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            # provider.get returns the tensors for each listed TFR item key
            [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                             'object/label',
                                                             'object/bbox'])

At this point the data has been fetched; after preprocessing it can enter the computation.

Note that up to now we have only decoded the image data without adding a batch dimension; that is, the image tensor is still 3-D.
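
For completeness, a hedged illustration of where the batch dimension eventually comes from (not the project's verbatim code; in train_ssd_network.py batching actually happens after the ground-truth encoding step): once preprocessing has produced a fixed-size image, tf.train.batch prepends the batch dimension.

# Assuming `image` now has a static shape such as (300, 300, 3):
r = tf.train.batch([image], batch_size=FLAGS.batch_size,
                   num_threads=FLAGS.num_preprocessing_threads,
                   capacity=5 * FLAGS.batch_size)
# r: (batch_size, 300, 300, 3) -- the batch dimension is added here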

II. Data Preprocessing

Fetch the preprocessing function for the dataset in question and use it to process the data obtained in the subsection above:

image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

# Pre-processing image, labels and bboxes.
image, glabels, gbboxes = \
    image_preprocessing_fn(image, glabels, gbboxes,
                           out_shape=ssd_shape,  # (300,300)
                           data_format=DATA_FORMAT)  # 'NCHW'

Sometimes this layer-upon-layer of calls feels absurd... the next two steps are, once again, a call chain:

def get_preprocessing(name, is_training=False):
    preprocessing_fn_map = {
        'ssd_300_vgg': ssd_vgg_preprocessing,
        'ssd_512_vgg': ssd_vgg_preprocessing,
    }

    if name not in preprocessing_fn_map:
        raise ValueError('Preprocessing name [%s] was not recognized' % name)

    def preprocessing_fn(image, labels, bboxes,
                         out_shape, data_format='NHWC', **kwargs):
        return preprocessing_fn_map[name].preprocess_image(
            image, labels, bboxes, out_shape, data_format=data_format,
            is_training=is_training, **kwargs)
    return preprocessing_fn


def preprocess_image(image,
                     labels,
                     bboxes,
                     out_shape,
                     data_format,
                     is_training=False,
                     **kwargs):
    if is_training:
        return preprocess_for_train(image, labels, bboxes,
                                    out_shape=out_shape,
                                    data_format=data_format)
    else:
        return preprocess_for_eval(image, labels, bboxes,
                                   out_shape=out_shape,
                                   data_format=data_format,
                                   **kwargs)

After that comes the concrete data-preprocessing function; in this post we focus only on training preprocessing.

Overview of training-data preprocessing

The rough flow is:

Conditionally crop a region out of the original image

Compute the overlap between the cropped region and each annotated box, keeping bboxes and labels according to a threshold

Scale the cropped image up to the network input size (the bboxes are normalized, so they need no rescaling)

Randomly flip (the bboxes must be flipped in sync)

Other preprocessing (does not involve bboxes)

Return image, labels, bboxes

def preprocess_for_train(image, labels, bboxes,
                         out_shape, data_format='NHWC',
                         scope='ssd_preprocessing_train'):
    """Preprocesses the given image for training.
    """
    fast_mode = False
    with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]):
        if image.get_shape().ndims != 3:
            raise ValueError('Input must be of size [height, width, C>0]')
        # Convert to float scaled [0, 1].
        if image.dtype != tf.float32:
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        tf_summary_image(image, bboxes, 'image_with_bboxes')
        # the checks above guarantee a 3-D tf.float32 image

        # (conditional) random crop; filtered/adjusted labels (n,), bboxes (n, 4), and the crop's coords within the original image (4,)
        dst_image, labels, bboxes, distort_bbox = \
            distorted_bounding_box_crop(image, labels, bboxes,
                                        min_object_covered=MIN_OBJECT_COVERED,  # 0.25
                                        aspect_ratio_range=CROP_RATIO_RANGE)  # (0.6, 1.67)

        # Resize image to output size.
        dst_image = tf_image.resize_image(dst_image, out_shape,
                                          method=tf.image.ResizeMethod.BILINEAR,
                                          align_corners=False)
        tf_summary_image(dst_image, bboxes, 'image_shape_distorted')

        # Randomly flip the image horizontally.
        dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)

        # Randomly distort the colors. There are 4 ways to do it.
        dst_image = apply_with_random_selector(
                dst_image,
                lambda x, ordering: distort_color(x, ordering, fast_mode),
                num_cases=4)
        tf_summary_image(dst_image, bboxes, 'image_color_distorted')

        # Rescale to VGG input scale.
        image = dst_image * 255.
        image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
        # mean = tf.constant(means, dtype=image.dtype)
        # image = image - mean

        # Image data format.
        if data_format == 'NCHW':
            image = tf.transpose(image, perm=(2, 0, 1))
        # returns: image (transposed to CHW when data_format is 'NCHW'), labels (n,), bboxes (n, 4)
        return image, labels, bboxes
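
The color-distortion step above calls apply_with_random_selector, which this excerpt does not show. For reference, the version in the TF-Slim Inception preprocessing (which this code follows in spirit) looks essentially like this:

from tensorflow.python.ops import control_flow_ops

def apply_with_random_selector(x, func, num_cases):
    """Computes func(x, sel), with sel sampled uniformly from [0, num_cases)."""
    sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
    # Only the branch whose case matches `sel` receives the real tensor;
    # merge() then picks that branch's output.
    return control_flow_ops.merge([
        func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
        for case in range(num_cases)])[0]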

Cropping the image and adjusting labels and bboxes

The overall flow:

Call the built-in op, which constrains the crop's size range, guarantees it covers some of the objects of interest, and returns the crop parameters

Crop the image (keeping the crop-position parameters)

Compute the overlap between the crop box and each ground-truth box, discard boxes below a threshold, and adjust the coordinates of the kept boxes

def distorted_bounding_box_crop(image,
                                labels,
                                bboxes,
                                min_object_covered=0.3,
                                aspect_ratio_range=(0.9, 1.1),
                                area_range=(0.1, 1.0),
                                max_attempts=200,
                                clip_bboxes=True,
                                scope=None):
    """Generates cropped_image using a one of the bboxes randomly distorted.

    See `tf.image.sample_distorted_bounding_box` for more documentation.

    Args:
        image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
        bboxes: 2-D float Tensor of bounding boxes, shape [num_boxes, 4],
            where each coordinate is in [0, 1] and arranged as
            [ymin, xmin, ymax, xmax] (expanded to 3-D before being passed to
            tf.image.sample_distorted_bounding_box). If num_boxes is 0 the
            whole image is used.
        min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
            area of the image must contain at least this fraction of any bounding box
            supplied.
        aspect_ratio_range: An optional list of `floats`. The cropped area of the
            image must have an aspect ratio = width / height within this range.
        area_range: An optional list of `floats`. The cropped area of the image
            must contain a fraction of the supplied image within in this range.
        max_attempts: An optional `int`. Number of attempts at generating a cropped
            region of the image of the specified constraints. After `max_attempts`
            failures, return the entire image.
        scope: Optional scope for name_scope.
    Returns:
        A tuple, a 3-D Tensor cropped_image and the distorted bbox
    """
    with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):
        # the "smart" random crop
        # The bounding box coordinates are floats in `[0.0, 1.0]` relative to the width
        # and height of the underlying image.
        # 1-D, 1-D, [1, 1, 4]
        bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(
                tf.shape(image),
                bounding_boxes=tf.expand_dims(bboxes, 0),  # [1, n, 4]
                min_object_covered=min_object_covered,
                aspect_ratio_range=aspect_ratio_range,
                area_range=area_range,
                max_attempts=max_attempts,
                use_image_if_no_bounding_boxes=True)
        '''
        Returns:
            A tuple of `Tensor` objects (begin, size, bboxes).

        begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[offset_height, offset_width, 0]`. 
            Provide as input to `tf.slice`.
        size: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[target_height, target_width, -1]`. 
            Provide as input to `tf.slice`.
        bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
            Provide as input to `tf.image.draw_bounding_boxes`.
        '''
        # [4]
        distort_bbox = distort_bbox[0, 0]

        # Crop the image to the specified bounding box.
        cropped_image = tf.slice(image, bbox_begin, bbox_size)
        # Restore the shape since the dynamic slice loses 3rd dimension.
        cropped_image.set_shape([None, None, 3])  # <----- static shape set here

        # Update bounding boxes: resize and filter out.
        bboxes = tfe.bboxes_resize(distort_bbox, bboxes)  # [4], [n, 4]
        labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,
                                                   threshold=BBOX_CROP_OVERLAP,  # 0.5
                                                   assign_negative=False)
        # return the randomly cropped image, the filtered/adjusted labels (n,) and bboxes (n, 4), and the crop's coords in the original image (4,)
        return cropped_image, labels, bboxes, distort_bbox

Three key functions:

tf.image.sample_distorted_bounding_box: the crop itself; see the docs for usage. It crops out a sub-image, and the last return value is that sub-image's coordinates

bboxes_resize: moves the box-coordinate origin to the crop box's top-left corner and rescales the x/y unit lengths to the crop box's w/h (renormalization)

bboxes_filter_overlap: computes intersection-area / original-box-area and discards labels and bboxes that fall below a threshold

We did not highlight the second function earlier, but since every computation involving box coordinates works on coordinates normalized to the image (all the TF built-ins do this), this step is necessary: it converts the coordinate system from the original image (note: the full image, which is why the unit lengths of the two frames differ so much) to the crop box and resets the unit length accordingly.

def bboxes_resize(bbox_ref, bboxes, name=None):
    # Tensors inputs.
    with tf.name_scope(name, 'bboxes_resize'):
        # Translate.
        # bbox_ref:['ymin', 'xmin', 'ymax', 'xmax']
        v = tf.stack([bbox_ref[0], bbox_ref[1], bbox_ref[0], bbox_ref[1]])
        bboxes = bboxes - v
        # Scale.
        s = tf.stack([bbox_ref[2] - bbox_ref[0],  # h
                      bbox_ref[3] - bbox_ref[1],  # w
                      bbox_ref[2] - bbox_ref[0],
                      bbox_ref[3] - bbox_ref[1]])
        bboxes = bboxes / s
        return bboxes
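
A quick numeric sanity check of the renormalization (a standalone sketch, not project code): take a crop box covering [0.2, 0.7] in both axes and one ground-truth box inside it.

import tensorflow as tf

bbox_ref = tf.constant([0.2, 0.2, 0.7, 0.7])      # crop box, image coordinates
bboxes = tf.constant([[0.3, 0.3, 0.5, 0.5]])      # one GT box, image coordinates

v = tf.stack([bbox_ref[0], bbox_ref[1], bbox_ref[0], bbox_ref[1]])
s = tf.stack([bbox_ref[2] - bbox_ref[0], bbox_ref[3] - bbox_ref[1],
              bbox_ref[2] - bbox_ref[0], bbox_ref[3] - bbox_ref[1]])
resized = (bboxes - v) / s                        # same math as bboxes_resize

with tf.Session() as sess:
    print(sess.run(resized))  # [[0.2 0.2 0.6 0.6]] -- box in crop coordinates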

def bboxes_filter_overlap(labels, bboxes,
                          threshold=0.5, assign_negative=False,
                          scope=None):
    """Filter out bounding boxes based on (relative )overlap with reference
    box [0, 0, 1, 1].  Remove completely bounding boxes, or assign negative
    labels to the one outside (useful for latter processing...).

    Return:
      labels, bboxes: Filtered (or newly assigned) elements.
    """
    with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
        # (N,) Tensor: ratio of each box's intersection with [0,0,1,1] to its own area (0 where the box area is 0)
        scores = bboxes_intersection(tf.constant([0, 0, 1, 1], bboxes.dtype),
                                     bboxes)
        mask = scores > threshold
        if assign_negative:  # keep every label and box; negate labels with insufficient overlap
            labels = tf.where(mask, labels, -labels)  # enough overlap -> positive, otherwise negative
        else:  # drop labels and boxes with insufficient overlap
            labels = tf.boolean_mask(labels, mask)  # boolean mask, like boolean indexing of an array
            bboxes = tf.boolean_mask(bboxes, mask)
        return labels, bboxes


# Called by the function above; computes intersection area (with the crop frame) over original box area.
def bboxes_intersection(bbox_ref, bboxes, name=None):
    """Compute relative intersection between a reference box and a
    collection of bounding boxes. Namely, compute the quotient between
    intersection area and box area.

    Args:
      bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es).
      bboxes: (N, 4) Tensor, collection of bounding boxes.
    Return:
      (N,) Tensor with relative intersection.
    """
    with tf.name_scope(name, 'bboxes_intersection'):
        # Should be more efficient to first transpose.
        bboxes = tf.transpose(bboxes)
        bbox_ref = tf.transpose(bbox_ref)
        # Intersection bbox and volume.
        int_ymin = tf.maximum(bboxes[0], bbox_ref[0])
        int_xmin = tf.maximum(bboxes[1], bbox_ref[1])
        int_ymax = tf.minimum(bboxes[2], bbox_ref[2])
        int_xmax = tf.minimum(bboxes[3], bbox_ref[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        # Volumes.
        inter_vol = h * w  # each box's area inside [0,0,1,1]
        bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1])  # each box's own area
        scores = tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection')
        # from tensorflow.python.ops import math_ops
        # where the box area is > 0, return the ratio; otherwise return 0
        # tf.where(math_ops.greater(bboxes_vol, 0),  # bool: is the area > 0
        #          math_ops.divide(inter_vol, bboxes_vol),
        #          tf.zeros_like(inter_vol), name=name)
        return scores
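
And a worked example of that ratio (again a standalone sketch): a unit-area box that straddles the crop frame's corner.

import tensorflow as tf

ref = tf.constant([0., 0., 1., 1.])
box = tf.constant([0.5, 0.5, 1.5, 1.5])   # area 1.0; only a quarter lies inside ref

h = tf.maximum(tf.minimum(box[2], ref[2]) - tf.maximum(box[0], ref[0]), 0.)
w = tf.maximum(tf.minimum(box[3], ref[3]) - tf.maximum(box[1], ref[1]), 0.)
score = (h * w) / ((box[2] - box[0]) * (box[3] - box[1]))

with tf.Session() as sess:
    print(sess.run(score))  # 0.25 -> below BBOX_CROP_OVERLAP=0.5, so it is dropped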

The remaining preprocessing functions call for no special attention; consult the source yourself if interested.

With that, preprocessing is complete. Below is the full local snippet, from fetching the data out of the TFR through the end of preprocessing:

        with tf.device(deploy_config.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,  # DatasetDataProvider takes a slim.dataset.Dataset as argument
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            # provider.get returns the tensors for each listed TFR item key
            [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                             'object/label',
                                                             'object/bbox'])
            # Pre-processing image, labels and bboxes.
            # 'CHW' (n,) (n, 4)
            image, glabels, gbboxes = \
                image_preprocessing_fn(image, glabels, gbboxes,
                                       out_shape=ssd_shape,  # (300,300)
                                       data_format=DATA_FORMAT)  # 'NCHW'

 

