This code is my modified version of Mask R-CNN. It keeps essentially everything the original does, but is more concise, which makes it easier to read. There are many detailed comments throughout that you can work through at your own pace.
If you find any problems, corrections and discussion are welcome.
The following is the training file (.py).
"""
Mask R-CNN algorithm for object detection and instance segmentation
Written and modified by tang jun in January 2019
If you have questions, please contact me by email: tangjunjunfighter@163.com
"""
import scipy
# import os
# import random
# import datetime
# import re
# import math
# import logging
# from collections import OrderedDict
# import multiprocessing
# import numpy as np
import tensorflow as tf
import keras
# import keras.backend as K # Keras backend and related functions
# import keras.layers as KL
# import keras.engine as KE
# import keras.models as KM
import math
import os
import sys
import numpy as np
import cv2
import matplotlib.pyplot as plt
import yaml
from PIL import Image
import random
# from mrcnn1 import utils, model as modellib, visualize
import model as modellib
# from mrcnn1 import visualize
from distutils.version import LooseVersion
assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
assert LooseVersion(keras.__version__) >= LooseVersion('2.0.8')
ROOT_DIR = os.getcwd() # current working directory
sys.path.append(ROOT_DIR) # To find local version of the library
# Directory to save logs and trained models
MODEL_DIR = os.path.join(ROOT_DIR, "logs") # logs directory under the current path
iter_num = 0
# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5") # path to the pre-trained weights file
class Config_config(object):
"""Base configuration class. For custom configurations, create a
sub-class that inherits from this one and override properties
that need to be changed.
"""
IMAGE_MIN_DIM = 128
IMAGE_MAX_DIM = 256
NUM_CLASSES = 1 + 4 # background + 4 defect classes. Override in sub-classes
PRE_NMS_LIMIT = 6000 # top-scoring ROIs kept before NMS, capped at the total anchor count
IMAGE_CHANNEL_COUNT = 3
# Name the configurations. For example, 'COCO', 'Experiment 3', ...etc.
# Useful if your code needs to do things differently depending on which
# experiment is running.
# NAME = "shapes" # Override in sub-classes
# GPU_COUNT = 1
# IMAGES_PER_GPU = 1
# Number of training steps per epoch
# This doesn't need to match the size of the training set. Tensorboard
# updates are saved at the end of each epoch, so setting this to a
# smaller number means getting more frequent TensorBoard updates.
# Validation stats are also calculated at each epoch end and they
# might take a while, so don't set this too small to avoid spending
# a lot of time on validation stats.
STEPS_PER_EPOCH = 5
# Number of validation steps to run at the end of every training epoch.
# A bigger number improves accuracy of validation stats, but slows
# down the training.
VALIDATION_STEPS = 50
# Backbone network architecture
# Supported values are: resnet50, resnet101.
# You can also provide a callable that should have the signature
# of model.resnet_graph. If you do so, you need to supply a callable
# to COMPUTE_BACKBONE_SHAPE as well
BACKBONE = "resnet101"
# Only useful if you supply a callable to BACKBONE. Should compute
# the shape of each layer of the FPN Pyramid.
# See model.compute_backbone_shapes
# COMPUTE_BACKBONE_SHAPE = None
# The strides of each layer of the FPN Pyramid. These values
# are based on a Resnet101 backbone.
BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# Size of the fully-connected layers in the classification graph
FPN_CLASSIF_FC_LAYERS_SIZE = 1024
# Size of the top-down layers used to build the feature pyramid
TOP_DOWN_PYRAMID_SIZE = 256 # number of channels in each FPN top-down layer
# Number of classification classes (including background)
# Length of square anchor side in pixels
RPN_ANCHOR_SCALES = (8, 16, 32, 64, 128)
# Ratios of anchors at each cell (width/height)
# A value of 1 represents a square anchor, and 0.5 a tall, narrow anchor (width/height = 0.5)
RPN_ANCHOR_RATIOS = [0.5, 1, 2]
# Anchor stride
# If 1 then anchors are created for each cell in the backbone feature map.
# If 2, then anchors are created for every other cell, and so on.
RPN_ANCHOR_STRIDE = 1
# Non-max suppression threshold to filter RPN proposals.
# You can increase this during training to generate more proposals.
RPN_NMS_THRESHOLD = 0.7 # boxes whose IoU with a higher-scoring box is below this are kept
# How many anchors per image to use for RPN training
RPN_TRAIN_ANCHORS_PER_IMAGE = 256 # used both when building RPN targets and by the RPN loss
# ROIs kept after non-maximum suppression (training and inference)
POST_NMS_ROIS_TRAINING = 2000 # proposals kept by the proposal layer during training
POST_NMS_ROIS_INFERENCE = 1000 # proposals kept by the proposal layer during inference
# If enabled, resizes instance masks to a smaller size to reduce
# memory load. Recommended when using high-resolution images.
USE_MINI_MASK = False
MINI_MASK_SHAPE = (56, 56) # (height, width) of the mini-mask
# Input image resizing
# Generally, use the "square" resizing mode for training and inferencing
# and it should work well in most cases. In this mode, images are scaled
# up such that the small side is = IMAGE_MIN_DIM, but ensuring that the
# scaling doesn't make the long side > IMAGE_MAX_DIM. Then the image is
# padded with zeros to make it a square so multiple images can be put
# in one batch.
# Available resizing modes:
# none: No resizing or padding. Return the image unchanged.
# square: Resize and pad with zeros to get a square image
# of size [max_dim, max_dim].
# pad64: Pads width and height with zeros to make them multiples of 64.
# If IMAGE_MIN_DIM or IMAGE_MIN_SCALE are not None, then it scales
# up before padding. IMAGE_MAX_DIM is ignored in this mode.
# The multiple of 64 is needed to ensure smooth scaling of feature
# maps up and down the 6 levels of the FPN pyramid (2**6=64).
# crop: Picks random crops from the image. First, scales the image based
# on IMAGE_MIN_DIM and IMAGE_MIN_SCALE, then picks a random crop of
# size IMAGE_MIN_DIM x IMAGE_MIN_DIM. Can be used in training only.
# IMAGE_MAX_DIM is not used in this mode.
IMAGE_RESIZE_MODE = "square"
# Minimum scaling ratio. Checked after IMAGE_MIN_DIM and can force further
# up scaling. For example, if set to 2 then images are scaled up to double
# the width and height, or more, even if IMAGE_MIN_DIM doesn't require it.
# However, in 'square' mode, it can be overruled by IMAGE_MAX_DIM.
IMAGE_MIN_SCALE = 0
# Image mean (RGB)
MEAN_PIXEL = np.array([123.7, 116.8, 103.9])
# Number of ROIs per image to feed to classifier/mask heads
# The Mask RCNN paper uses 512 but often the RPN doesn't generate
# enough positive proposals to fill this and keep a positive:negative
# ratio of 1:3. You can increase the number of proposals by adjusting
# the RPN NMS threshold.
TRAIN_ROIS_PER_IMAGE = 100 # used by the detection target layer
# Percent of positive ROIs used to train classifier/mask heads
ROI_POSITIVE_RATIO = 0.33
# Pooled ROIs
POOL_SIZE = 7
MASK_POOL_SIZE = 14
# Shape of output mask
# To change this you also need to change the neural network mask branch
MASK_SHAPE = [28, 28]
# Maximum number of ground truth instances to use in one image
MAX_GT_INSTANCES = 100
# Bounding box refinement standard deviation for RPN and final detections.
RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# Max number of final detections
DETECTION_MAX_INSTANCES = 100
# Minimum probability value to accept a detected instance
# ROIs below this threshold are skipped
DETECTION_MIN_CONFIDENCE = 0.7
# Non-maximum suppression threshold for detection
DETECTION_NMS_THRESHOLD = 0.3
# Learning rate and momentum
# The Mask RCNN paper uses lr=0.02, but on TensorFlow it causes
# weights to explode. Likely due to differences in optimizer
# implementation.
LEARNING_RATE = 0.001
LEARNING_MOMENTUM = 0.9
# Weight decay regularization
WEIGHT_DECAY = 0.0001
# Loss weights for more precise optimization.
# Can be used for R-CNN training setup.
LOSS_WEIGHTS = {
"rpn_class_loss": 1.,
"rpn_bbox_loss": 1.,
"mrcnn_class_loss": 1.,
"mrcnn_bbox_loss": 1.,
"mrcnn_mask_loss": 1.
}
# Use RPN ROIs or externally generated ROIs for training
# Keep this True for most situations. Set to False if you want to train
# the head branches on ROI generated by code rather than the ROIs from
# the RPN. For example, to debug the classifier head without having to
# train the RPN.
USE_RPN_ROIS = True
# Train or freeze batch normalization layers
# None: Train BN layers. This is the normal mode
# False: Freeze BN layers. Good when using a small batch size
# True: (don't use). Set layer in training mode even when inferencing
TRAIN_BN = True # set to True here; the upstream default is False since batch size is often small
# Gradient norm clipping
GRADIENT_CLIP_NORM = 5.0
batch_size = 1
def __init__(self):
"""Set values of computed attributes."""
# Effective batch size
# self.BATCH_SIZE = self.IMAGES_PER_GPU * self.GPU_COUNT
# Input image size
if self.IMAGE_RESIZE_MODE == "crop":
self.IMAGE_SHAPE = np.array([self.IMAGE_MIN_DIM, self.IMAGE_MIN_DIM, 3])
else:
self.IMAGE_SHAPE = np.array([self.IMAGE_MAX_DIM, self.IMAGE_MAX_DIM, 3])
# Image meta data length
# See compose_image_meta() for details
self.IMAGE_META_SIZE = 1 + 3 + 3 + 4 + 1 + self.NUM_CLASSES
def display(self):
"""Display Configuration values."""
print("\nConfigurations:")
for a in dir(self):
if not a.startswith("__") and not callable(getattr(self, a)):
print("{:30} {}".format(a, getattr(self, a)))
print("\n")
# Configuration overrides used for prediction/inference
class Predict_Config(Config_config):
GPU_COUNT = 1
IMAGES_PER_GPU = 1
IMAGE_MIN_DIM = 128
IMAGE_MAX_DIM = 256
batch_size = 1
config = Config_config() # instantiate the base configuration
config.display() # print the configuration
import skimage.color
import skimage.io
import skimage.transform
class Dataset_data(object):
"""The base class for dataset classes.
To use it, create a new class that adds functions specific to the dataset
you want to use. For example:
class CatsAndDogsDataset(Dataset):
def load_cats_and_dogs(self):
...
def load_mask(self, image_id):
...
def image_reference(self, image_id):
...
See COCODataset and ShapesDataset as examples.
"""
def __init__(self, class_map=None):
self.image_ids = []
self.image_info = []
# Background is always the first class
self.class_info = [{ "id": 0, "name": "BG"}]
# self.source_class_ids = {"":[0],"shapes": [0,1,2,3,4]}
self.class_names = [] # class names, including the background class at index 0
def add_class(self, class_id, class_name):
# assert "." not in source, "Source name cannot contain a dot"
# Does the class exist already?
for info in self.class_info:
if info["id"] == class_id:
# source.class_id combination already available, skip
return
# Add the class
self.class_info.append({
# "source": source,
"id": class_id,
"name": class_name,
})
def add_image(self, image_id, path, **kwargs):
image_info = {
"id": image_id,
# "source": source,
"path": path,
}
image_info.update(kwargs)
self.image_info.append(image_info)
def data_load_information(self, img_floder): # img_floder is the dataset folder with one labelme-exported sub-folder per image
"""
The classes registered via add_class below must be edited by hand
for your own dataset. This function stores the class information
and the per-image information (original image path, width and
height, mask image path, and so on). Given a folder name, it walks
every sub-folder and records the information for each image.
"""
# Add classes
self.add_class( 1, "line_bulge") # register the defect classes; edit these for your own dataset
self.add_class( 2, "dot_concave")
self.add_class( 3, "dot_bulge")
self.add_class( 4, "Irregular_concave")
img_file_list = os.listdir(img_floder) # list the sample sub-folders in the dataset directory
count = len(img_file_list) # number of samples
id = 0
for sorce_path in img_file_list: # iterate over every sample folder
yaml_path = os.path.join(img_floder + '\\' + sorce_path, 'info.yaml') # label_names: - _background_ - NG
mask_path = os.path.join(img_floder + '\\' + sorce_path, 'label.png')
img_path = os.path.join(img_floder + '\\' + sorce_path, 'img.png')
cv_img = cv2.imdecode(np.fromfile(mask_path, dtype=np.uint8),
cv2.IMREAD_UNCHANGED) # np.fromfile reads the raw bytes as np.uint8; cv2.imdecode decodes them into an image (handles non-ASCII paths)
self.add_image( image_id=id, path=img_path, width=cv_img.shape[1], height=cv_img.shape[0],
mask_path=mask_path, yaml_path=yaml_path)
id += 1
self.num_classes = len(self.class_info)
self.class_ids = np.arange(self.num_classes)
self.class_names = [c["name"] for c in self.class_info] # class names, including background
self.num_images = len(self.image_info) # number of images
self.image_ids = np.arange(self.num_images) # image ids derived from the image count
def load_image(self, image_id):
"""
Used by the data generator.
Load the specified image and return a [H,W,3] Numpy array.
"""
# Load image
image = skimage.io.imread(self.image_info[image_id]['path'])
# If grayscale. Convert to RGB for consistency.
if image.ndim != 3:
image = skimage.color.gray2rgb(image)
# If has an alpha channel, remove it for consistency
if image.shape[-1] == 4:
image = image[..., :3]
return image
def load_mask(self, image_id):
"""
Also used by the data generator. Given an image id, build that image's
instance masks: pixels belonging to instance i are set to 1 in channel i
and everything else is 0. Returns mask [H, W, num_instances] and
class_ids [num_instances], e.g. mask [H, W, 4] with class_ids [1, 3, 1, 2].
"""
# global iter_num
info = self.image_info[image_id] # pick this image's info dict by its integer id
img = Image.open(info['mask_path']) # the label image from labelme: each instance is drawn with pixel value = instance index
num_obj = np.max(img) # the max pixel value equals the instance count (e.g. 3 masks -> max value 3)
mask = np.zeros([info['height'], info['width'], num_obj], dtype=np.uint8)
for index in range(num_obj):
for i in range(info['width']):
for j in range(info['height']):
# info['width'] and info['height'] are the width and height of label.png
at_pixel = img.getpixel((i, j))
if at_pixel == index + 1:
mask[j, i, index] = 1 # mark pixels belonging to this instance
mask = mask.astype(np.uint8)
# occlusion = np.logical_not(mask[:, :, -1]).astype(np.uint8) #
# for i in range(count - 2, -1, -1):
# mask[:, :, i] = mask[:, :, i] * occlusion
# occlusion = np.logical_and(occlusion, np.logical_not(mask[:, :, i]))
#
labels = self.from_yaml_get_class(image_id)
labels_form = []
num_classes = len(self.class_info) # includes the background class
for i in range(len(labels)): # match each yaml label to a registered class name
for j in range(1, num_classes):
if labels[i].find(self.class_info[j]["name"]) != -1: # find() returns the start index if the class
# name occurs in the label, and -1 otherwise
labels_form.append(self.class_info[j]["name"])
class_ids = np.array([self.class_names.index(s) for s in labels_form])
# map each matched label name back to its integer class id
return mask, class_ids.astype(np.int32)
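# The per-pixel loop above is easy to follow but costs O(W * H * num_obj)
# getpixel calls. A vectorized sketch of the same mask-building step
# (assuming, as above, that label.png stores instance indices 1..num_obj
# as pixel values; this helper is not part of the original file):
def load_mask_fast(self, image_id):
    """Vectorized equivalent of the pixel loop in load_mask (a sketch)."""
    info = self.image_info[image_id]
    label = np.array(Image.open(info['mask_path']))  # [H, W], values 0..num_obj
    num_obj = int(label.max())
    # one broadcast compare: mask[j, i, k] = (label[j, i] == k + 1)
    return (label[..., None] == np.arange(1, num_obj + 1)).astype(np.uint8)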
# Parse the yaml file written by labelme to get the instance label of each mask channel
def from_yaml_get_class(self, image_id):
'''
temp = {'label_names': ['_background_', '11111', '22222', '3333']}
labels = ['_background_', '11111', '22222', '3333']
after del labels[0]: labels == ['11111', '22222', '3333']
:param image_id:
:return:
'''
info = self.image_info[image_id]
with open(info['yaml_path']) as f:
temp = yaml.load(f.read(), Loader=yaml.FullLoader)
labels = temp['label_names']
del labels[0]
return labels
def generate_pyramid_anchors(self, scales, ratios, feature_shapes, feature_strides, anchor_stride):
"""Generate anchors at different levels of a feature pyramid. Each scale
is associated with a level of the pyramid, but each ratio is used in
all levels of the pyramid.
Returns:
anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
with the same order of the given scales. So, anchors of scale[0] come
first, then anchors of scale[1], and so on.
"""
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
anchors = []
for i in range(len(scales)):
# anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i], feature_strides[i], anchor_stride))
"""
scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
shape: [height, width] spatial shape of the feature map over which to generate anchors.
feature_stride: Stride of the feature map relative to the image in pixels.
anchor_stride: Stride of anchors on the feature map. For example, if the value is 2 then generate anchors for every other feature map pixel.
"""
# Get all combinations of scales and ratios
scale, ratios = np.meshgrid(np.array(scales[i]), np.array(ratios))
scale = scale.flatten()
ratios = ratios.flatten()
shape = feature_shapes[i]
feature_stride = feature_strides[i]
# Enumerate heights and widths from scales and ratios
# the actual box widths and heights
heights = scale / np.sqrt(ratios)
widths = scale * np.sqrt(ratios)
# Enumerate shifts in feature space
# the actual box centers
shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
# sample the feature map every anchor_stride cells, then multiply by
# feature_stride to map the anchor centers back into image coordinates
shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
# Enumerate combinations of shifts, widths, and heights
box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
# the code above produces box centers and box sizes
# Convert to corner coordinates (y1, x1, y2, x2)
boxes = np.concatenate([box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1)
# convert (center, size) to corner coordinates (y1, x1, y2, x2)
anchors.append(boxes)
return np.concatenate(anchors, axis=0)
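# Sanity check on the anchor count: each pyramid level contributes
# len(ratios) * ceil(H / stride) * ceil(W / stride) anchors. With this file's
# settings (256 x 256 input, 3 ratios, strides [4, 8, 16, 32, 64]) that is
# 3*64**2 + 3*32**2 + 3*16**2 + 3*8**2 + 3*4**2 = 16368 anchors in total.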
def resize(self, image, output_shape, order=1, mode='constant', cval=0, clip=True,
preserve_range=False, anti_aliasing=False, anti_aliasing_sigma=None):
"""A wrapper for Scikit-Image resize().
Scikit-Image generates warnings on every call to resize() if it doesn't
receive the right parameters. The right parameters depend on the version
of skimage. This solves the problem by using different parameters per
version. And it provides a central place to control resizing defaults.
"""
if LooseVersion(skimage.__version__) >= LooseVersion("0.14"):
# New in 0.14: anti_aliasing. Default it to False for backward
# compatibility with skimage 0.13.
return skimage.transform.resize(
image, output_shape,
order=order, mode=mode, cval=cval, clip=clip,
preserve_range=preserve_range, anti_aliasing=anti_aliasing,
anti_aliasing_sigma=anti_aliasing_sigma)
else:
return skimage.transform.resize(
image, output_shape,
order=order, mode=mode, cval=cval, clip=clip,
preserve_range=preserve_range)
def resize_image(self,image, min_dim=None, max_dim=None, min_scale=None, mode="square"):
"""Resizes an image keeping the aspect ratio unchanged.
min_dim: if provided, resizes the image such that its smaller dimension == min_dim
max_dim: if provided, ensures that the image longest side doesn't exceed this value.
min_scale: if provided, ensure that the image is scaled up by at least
this percent even if min_dim doesn't require it.
mode: Resizing mode.
none: No resizing. Return the image unchanged.
square: Resize and pad with zeros to get a square image of size [max_dim, max_dim].
pad64: Pads width and height with zeros to make them multiples of 64.
If min_dim or min_scale are provided, it scales the image up
before padding. max_dim is ignored in this mode.
The multiple of 64 is needed to ensure smooth scaling of feature
maps up and down the 6 levels of the FPN pyramid (2**6=64).
crop: Picks random crops from the image. First, scales the image based
on min_dim and min_scale, then picks a random crop of
size min_dim x min_dim. Can be used in training only.
max_dim is not used in this mode.
Returns:
image: the resized image
window: (y1, x1, y2, x2). If max_dim is provided, padding might
be inserted in the returned image. If so, this window is the
coordinates of the image part of the full image (excluding
the padding). The x2, y2 pixels are not included.
scale: The scale factor used to resize the image
padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
"""
# Keep track of image dtype and return results in the same dtype
image_dtype = image.dtype
# Default window (y1, x1, y2, x2) and default scale == 1.
h, w = image.shape[:2]
window = (0, 0, h, w)
scale = 1
padding = [(0, 0), (0, 0), (0, 0)]
if mode == "none":
return image, window, scale, padding
# Scale?
if min_dim:
# Scale up but not down
scale = max(1, min_dim / min(h, w)) # h, w are the original image height and width
if min_scale and scale < min_scale: # min_scale is a lower bound on the scale factor
scale = min_scale
# Does it exceed max dim?
if max_dim and mode == "square":
image_max = max(h, w)
if round(image_max * scale) > max_dim: # the scaled long side must not exceed max_dim; otherwise recompute scale
scale = max_dim / image_max
# Resize image using bilinear interpolation
if scale != 1:
image = self.resize(image, (round(h * scale), round(w * scale)), preserve_range=True)
# The resize above changes the image size; I would rather avoid that, since it
# can lose or distort defect features, so watch this step carefully.
# Need padding or cropping?
if mode == "square":
# Get new height and width
h, w = image.shape[:2] # the image has already been rescaled by scale at this point
top_pad = (max_dim - h) // 2
bottom_pad = max_dim - h - top_pad
left_pad = (max_dim - w) // 2
right_pad = max_dim - w - left_pad
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode='constant', constant_values=0) # zero-pad the rescaled image up to max_dim x max_dim
window = (top_pad, left_pad, h + top_pad, w + left_pad) # where the real image sits inside the padded square
elif mode == "pad64":
h, w = image.shape[:2]
# Both sides must be divisible by 64
assert min_dim % 64 == 0, "Minimum dimension must be a multiple of 64"
# Height
if h % 64 > 0:
max_h = h - (h % 64) + 64
top_pad = (max_h - h) // 2
bottom_pad = max_h - h - top_pad
else:
top_pad = bottom_pad = 0
# Width
if w % 64 > 0:
max_w = w - (w % 64) + 64
left_pad = (max_w - w) // 2
right_pad = max_w - w - left_pad
else:
left_pad = right_pad = 0
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode='constant', constant_values=0)
window = (top_pad, left_pad, h + top_pad, w + left_pad)
else:
raise Exception("Mode {} not supported".format(mode))
return image.astype(image_dtype), window, scale, padding
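# A worked example of resize_image under this file's settings
# (min_dim=128, max_dim=256, mode="square") for a 100 x 200 input:
#   scale   = max(1, 128 / 100) = 1.28       -> image resized to 128 x 256
#   padding = [(64, 64), (0, 0), (0, 0)]     -> padded up to 256 x 256
#   window  = (64, 0, 192, 256)              -> the real image inside the padding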
def resize_mask(self,mask, scale, padding):
# scale is the resize factor applied to the input image; padding is the zero padding added, keeping mask pixels aligned with the resized image
"""Resizes a mask using the given scale and padding.
Typically, you get the scale and padding from resize_image() to
ensure both, the image and the mask, are resized consistently.
scale: mask scaling factor
padding: Padding to add to the mask in the form
[(top, bottom), (left, right), (0, 0)]
"""
# Suppress warning from scipy 0.13.0, the output shape of zoom() is
# calculated with round() instead of int()
# with warnings.catch_warnings():
# warnings.simplefilter("ignore")
mask = scipy.ndimage.zoom(mask, zoom=[scale, scale, 1], order=0)
# if crop is not None:
# y, x, h, w = crop
# mask = mask[y:y + h, x:x + w]
# else:
mask = np.pad(mask, padding, mode='constant', constant_values=0)
return mask
def extract_bboxes(self,mask): # [[num_instances, (y1, x1, y2, x2)]]
# In short: each bbox computed from a mask is the tightest box containing all of that mask's 1-valued pixels.
"""Compute bounding boxes from masks.
mask: [height, width, num_instances]. Mask pixels are either 1 or 0.
Returns: bbox array [num_instances, (y1, x1, y2, x2)].
"""
boxes = np.zeros([mask.shape[-1], 4], dtype=np.int32)
# one box per instance, i.e. per channel of the last mask dimension
for i in range(mask.shape[-1]):
m = mask[:, :, i]
# Bounding box.
horizontal_indicies = np.where(np.any(m, axis=0))[0]
vertical_indicies = np.where(np.any(m, axis=1))[0]
if horizontal_indicies.shape[0]:
x1, x2 = horizontal_indicies[[0, -1]]
y1, y2 = vertical_indicies[[0, -1]]
# x2 and y2 should not be part of the box. Increment by 1.
x2 += 1
y2 += 1
else:
# No mask for this instance. Might happen due to
# resizing or cropping. Set bbox to zeros
x1, x2, y1, y2 = 0, 0, 0, 0
boxes[i] = np.array([y1, x1, y2, x2])
return boxes.astype(np.int32)
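# A tiny check of extract_bboxes (a sketch): a 2x2 blob at rows 1-2, cols 2-3
# yields the exclusive-corner box [1, 2, 3, 4]:
#   m = np.zeros((5, 5, 1), dtype=np.uint8)
#   m[1:3, 2:4, 0] = 1
#   self.extract_bboxes(m)  # -> [[1 2 3 4]] as (y1, x1, y2, x2)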
def load_image_gt(self, config, image_id, augment=False, augmentation=None):
# Load image and mask
print("image_id : ", image_id) # 打印載入圖片的序號
image = self.load_image(image_id)
mask, class_ids = self.load_mask(image_id)
original_shape = image.shape
image, window, scale, padding = self.resize_image(
image,
min_dim=config.IMAGE_MIN_DIM,
min_scale=config.IMAGE_MIN_SCALE,
max_dim=config.IMAGE_MAX_DIM,
mode=config.IMAGE_RESIZE_MODE)
mask = self.resize_mask(mask, scale, padding)
print('data_resize_image and resize_mask')
# Random horizontal flips.
# TODO: will be removed in a future update in favor of augmentation
if augment and random.randint(0, 1):
image = np.fliplr(image)
mask = np.fliplr(mask)
# Augmentation
# This requires the imgaug lib (https://github.com/aleju/imgaug)
if augmentation:
import imgaug
# Augmenters that are safe to apply to masks
# Some, such as Affine, have settings that make them unsafe, so always
# test your augmentation on masks
MASK_AUGMENTERS = ["Sequential", "SomeOf", "OneOf", "Sometimes",
"Fliplr", "Flipud", "CropAndPad",
"Affine", "PiecewiseAffine"]
def hook(images, augmenter, parents, default):
"""Determines which augmenters to apply to masks."""
return augmenter.__class__.__name__ in MASK_AUGMENTERS
# Store shapes before augmentation to compare
image_shape = image.shape
mask_shape = mask.shape
# Make augmenters deterministic to apply similarly to images and masks
det = augmentation.to_deterministic()
image = det.augment_image(image)
# Change mask to np.uint8 because imgaug doesn't support np.bool
mask = det.augment_image(mask.astype(np.uint8), hooks=imgaug.HooksImages(activator=hook))
# Verify that shapes didn't change
assert image.shape == image_shape, "Augmentation shouldn't change image size"
assert mask.shape == mask_shape, "Augmentation shouldn't change mask size"
# Change mask back to bool
mask = mask.astype(np.bool)
# Note that some boxes might be all zeros if the corresponding mask got cropped out.
# and here is to filter them out
_idx = np.sum(mask, axis=(0, 1)) > 0
mask = mask[:, :, _idx]
class_ids = class_ids[_idx]
# Bounding boxes. Note that some boxes might be all zeros
# if the corresponding mask got cropped out.
# bbox: [num_instances, (y1, x1, y2, x2)]
bbox = self.extract_bboxes(mask)
# Active classes
# Different datasets have different classes, so track the
# classes supported in the dataset of this image.
active_class_ids = np.ones([self.num_classes], dtype=np.int32)
image_meta = np.array(
[image_id] + # size=1
list(original_shape) + # size=3
list(image.shape) + # size=3
list(window) + # size=4 (y1, x1, y2, x2) in image coordinates
[scale] + # size=1
list(active_class_ids) # size=num_classes
)
print('using model data')
return image, image_meta, class_ids, bbox, mask
def compute_overlaps(self,boxes1, boxes2):
# Each box in boxes2 is compared against every box in boxes1 (the same math
# as compute_iou, inlined); results fill a [num_boxes1, num_boxes2] matrix.
"""Computes IoU overlaps between two sets of boxes.
boxes1, boxes2: [N, (y1, x1, y2, x2)].
For better performance, pass the largest set first and the smaller second.
"""
# Areas of anchors and GT boxes
area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
# Compute overlaps to generate matrix [boxes1 count, boxes2 count]
# Each cell contains the IoU value.
overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0])) # allocate the output matrix
for i in range(overlaps.shape[1]):
box2 = boxes2[i]
y1 = np.maximum(box2[0], boxes1[:, 0])
y2 = np.minimum(box2[2], boxes1[:, 2])
x1 = np.maximum(box2[1], boxes1[:, 1])
x2 = np.minimum(box2[3], boxes1[:, 3])
intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
union = area2[i] + area1[:] - intersection[:]
overlaps[:, i] = intersection / union
return overlaps
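# Quick IoU check (a sketch): two 2x2 boxes overlapping in a single cell have
# intersection 1 and union 4 + 4 - 1 = 7:
#   a = np.array([[0, 0, 2, 2]])
#   b = np.array([[1, 1, 3, 3]])
#   self.compute_overlaps(a, b)  # -> [[0.142857...]]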
def build_rpn_targets(self, anchors, gt_class_ids, gt_boxes, config):
"""Given the anchors and GT boxes, compute overlaps and identify positive
anchors and deltas to refine them to match their corresponding GT boxes.
anchors: [num_anchors, (y1, x1, y2, x2)]
gt_class_ids: [num_gt_boxes] Integer class IDs.
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
Returns:
rpn_match: [N] (int32) matches between anchors and GT boxes.
1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
"""
# RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
# RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))
# Handle COCO crowds
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
crowd_ix = np.where(gt_class_ids < 0)[0]
if crowd_ix.shape[0] > 0:
# Filter out crowds from ground truth class IDs and boxes
non_crowd_ix = np.where(gt_class_ids > 0)[0]
crowd_boxes = gt_boxes[crowd_ix]
gt_class_ids = gt_class_ids[non_crowd_ix]
gt_boxes = gt_boxes[non_crowd_ix]
# Compute overlaps with crowd boxes [anchors, crowds]
crowd_overlaps = self.compute_overlaps(anchors, crowd_boxes)
crowd_iou_max = np.amax(crowd_overlaps, axis=1)
no_crowd_bool = (crowd_iou_max < 0.001)
else:
# All anchors don't intersect a crowd
no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)
# Compute overlaps [num_anchors, num_gt_boxes]
overlaps = self.compute_overlaps(anchors, gt_boxes)
# Match anchors to GT Boxes
# If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
# If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
# Neutral anchors are those that don't match the conditions above,
# and they don't influence the loss function.
# However, don't keep any GT box unmatched (rare, but happens). Instead,
# match it to the closest anchor (even if its max IoU is < 0.3).
#
# 1. Set negative anchors first. They get overwritten below if a GT box is
# matched to them. Skip boxes in crowd areas.
anchor_iou_argmax = np.argmax(overlaps, axis=1)
anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
# 2. Set an anchor for each GT box (regardless of IoU value).
# If multiple anchors have the same IoU match all of them
gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:, 0]
rpn_match[gt_iou_argmax] = 1
# 3. Set anchors with high overlap as positive.
rpn_match[anchor_iou_max >= 0.7] = 1
# Subsample to balance positive and negative anchors
# Don't let positives be more than half the anchors
ids = np.where(rpn_match == 1)[0]
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
if extra > 0:
# Reset the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# Same for negative proposals
ids = np.where(rpn_match == -1)[0]
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
np.sum(rpn_match == 1))
if extra > 0:
# Reset the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# For positive anchors, compute shift and scale needed to transform them
# to match the corresponding GT boxes.
ids = np.where(rpn_match == 1)[0]
ix = 0 # index into rpn_bbox
# TODO: use box_refinement() rather than duplicating the code here
for i, a in zip(ids, anchors[ids]):
# Closest gt box (it might have IoU < 0.7)
gt = gt_boxes[anchor_iou_argmax[i]]
# Convert coordinates to center plus width/height.
# GT Box
gt_h = gt[2] - gt[0]
gt_w = gt[3] - gt[1]
gt_center_y = gt[0] + 0.5 * gt_h
gt_center_x = gt[1] + 0.5 * gt_w
# Anchor
a_h = a[2] - a[0]
a_w = a[3] - a[1]
a_center_y = a[0] + 0.5 * a_h
a_center_x = a[1] + 0.5 * a_w
# Compute the bbox refinement that the RPN should predict.
rpn_bbox[ix] = [
(gt_center_y - a_center_y) / a_h,
(gt_center_x - a_center_x) / a_w,
np.log(gt_h / a_h),
np.log(gt_w / a_w),
]
# Normalize
rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
ix += 1
return rpn_match, rpn_bbox
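# The deltas stored in rpn_bbox above are the regression targets. For
# reference, the inverse transform (applying a predicted delta back to an
# anchor) looks like this; a NumPy sketch mirroring the formulas above,
# not part of the original file:
def apply_box_deltas(self, anchors, deltas, bbox_std_dev):
    """Inverse of the refinement computed in build_rpn_targets (a sketch).
    anchors: [N, (y1, x1, y2, x2)], deltas: [N, (dy, dx, log(dh), log(dw))].
    """
    deltas = deltas * bbox_std_dev                     # undo the normalization
    h = anchors[:, 2] - anchors[:, 0]
    w = anchors[:, 3] - anchors[:, 1]
    cy = anchors[:, 0] + 0.5 * h + deltas[:, 0] * h    # shift the centers
    cx = anchors[:, 1] + 0.5 * w + deltas[:, 1] * w
    h = h * np.exp(deltas[:, 2])                       # rescale height and width
    w = w * np.exp(deltas[:, 3])
    return np.stack([cy - 0.5 * h, cx - 0.5 * w,
                     cy + 0.5 * h, cx + 0.5 * w], axis=1)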
def generate_random_rois(self, image_shape, count, gt_boxes):
"""Generates ROI proposals similar to what a region proposal network
would generate.
image_shape: [Height, Width, Depth]
count: Number of ROIs to generate
gt_class_ids: [N] Integer ground truth class IDs
gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels.
"""
# placeholder
rois = np.zeros((count, 4), dtype=np.int32)
# Generate random ROIs around GT boxes (90% of count)
rois_per_box = int(0.9 * count / gt_boxes.shape[0])
for i in range(gt_boxes.shape[0]):
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
h = gt_y2 - gt_y1
w = gt_x2 - gt_x1
# random boundaries
r_y1 = max(gt_y1 - h, 0)
r_y2 = min(gt_y2 + h, image_shape[0])
r_x1 = max(gt_x1 - w, 0)
r_x2 = min(gt_x2 + w, image_shape[1])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
threshold][:rois_per_box]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
threshold][:rois_per_box]
if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
box_rois = np.hstack([y1, x1, y2, x2])
rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois
# Generate random ROIs anywhere in the image (10% of count)
remaining_count = count - (rois_per_box * gt_boxes.shape[0])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
threshold][:remaining_count]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
threshold][:remaining_count]
if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
global_rois = np.hstack([y1, x1, y2, x2])
rois[-remaining_count:] = global_rois
return rois
def box_refinement(self,box, gt_box):
"""Compute refinement needed to transform box to gt_box.
box and gt_box are [N, (y1, x1, y2, x2)]. (y2, x2) is
assumed to be outside the box.
"""
box = box.astype(np.float32)
gt_box = gt_box.astype(np.float32)
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
gt_height = gt_box[:, 2] - gt_box[:, 0]
gt_width = gt_box[:, 3] - gt_box[:, 1]
gt_center_y = gt_box[:, 0] + 0.5 * gt_height
gt_center_x = gt_box[:, 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = np.log(gt_height / height)
dw = np.log(gt_width / width)
return np.stack([dy, dx, dh, dw], axis=1)
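# box_refinement is the same transform as in build_rpn_targets but without
# the RPN normalization; composed with the apply_box_deltas sketch above it
# round-trips exactly:
#   box = np.array([[10., 10., 20., 20.]])
#   gt = np.array([[12., 11., 22., 23.]])
#   d = self.box_refinement(box, gt)
#   self.apply_box_deltas(box, d, bbox_std_dev=1.0)  # -> [[12. 11. 22. 23.]]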
def build_detection_targets(self, rpn_rois, gt_class_ids, gt_boxes, gt_masks, config):
"""Generate targets for training Stage 2 classifier and mask heads.
This is not used in normal training. It's useful for debugging or to train
the Mask RCNN heads without using the RPN head.
Inputs:
rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
gt_class_ids: [instance count] Integer class IDs
gt_boxes: [instance count, (y1, x1, y2, x2)]
gt_masks: [height, width, instance count] Ground truth masks. Can be full
size or mini-masks.
Returns:
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific
bbox refinements.
masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES). Class specific masks cropped
to bbox boundaries and resized to neural network output size.
"""
assert rpn_rois.shape[0] > 0
assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
gt_class_ids.dtype)
assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
gt_boxes.dtype)
assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
gt_masks.dtype)
# It's common to add GT Boxes to ROIs but we don't do that here because
# according to XinLei Chen's paper, it doesn't help.
# Trim empty padding in gt_boxes and gt_masks parts
instance_ids = np.where(gt_class_ids > 0)[0]
assert instance_ids.shape[0] > 0, "Image must contain instances."
gt_class_ids = gt_class_ids[instance_ids]
gt_boxes = gt_boxes[instance_ids]
gt_masks = gt_masks[:, :, instance_ids]
# Compute overlaps [rpn_rois, gt_boxes]. compute_iou is a module-level
# helper in the model file, not a method of this class, so reuse
# compute_overlaps (which inlines the same area/IoU math).
overlaps = self.compute_overlaps(rpn_rois, gt_boxes)
# Assign ROIs to GT boxes
rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
rpn_roi_iou_max = overlaps[np.arange(
overlaps.shape[0]), rpn_roi_iou_argmax]
# GT box assigned to each ROI
rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]
# Positive ROIs are those with >= 0.5 IoU with a GT box.
fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]
# Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
# TODO: To hard example mine or not to hard example mine, that's the question
# bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
# Subsample ROIs. Aim for 33% foreground.
# FG
fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)
if fg_ids.shape[0] > fg_roi_count:
keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
else:
keep_fg_ids = fg_ids
# BG
remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0]
if bg_ids.shape[0] > remaining:
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
else:
keep_bg_ids = bg_ids
# Combine indices of ROIs to keep
keep = np.concatenate([keep_fg_ids, keep_bg_ids])
# Need more?
remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0]
if remaining > 0:
# Looks like we don't have enough samples to maintain the desired
# balance. Reduce requirements and fill in the rest. This is
# likely different from the Mask RCNN paper.
# There is a small chance we have neither fg nor bg samples.
if keep.shape[0] == 0:
# Pick bg regions with easier IoU threshold
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
assert bg_ids.shape[0] >= remaining
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
assert keep_bg_ids.shape[0] == remaining
keep = np.concatenate([keep, keep_bg_ids])
else:
# Fill the rest with repeated bg rois.
keep_extra_ids = np.random.choice(
keep_bg_ids, remaining, replace=True)
keep = np.concatenate([keep, keep_extra_ids])
assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE, \
"keep doesn't match ROI batch size {}, {}".format(
keep.shape[0], config.TRAIN_ROIS_PER_IMAGE)
# Reset the gt boxes assigned to BG ROIs.
rpn_roi_gt_boxes[keep_bg_ids, :] = 0
rpn_roi_gt_class_ids[keep_bg_ids] = 0
# For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
rois = rpn_rois[keep]
roi_gt_boxes = rpn_roi_gt_boxes[keep]
roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
roi_gt_assignment = rpn_roi_iou_argmax[keep]
# Class-aware bbox deltas. [y, x, log(h), log(w)]
bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE,
config.NUM_CLASSES, 4), dtype=np.float32)
pos_ids = np.where(roi_gt_class_ids > 0)[0]
bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = self.box_refinement(
rois[pos_ids], roi_gt_boxes[pos_ids, :4])
# Normalize bbox refinements
bboxes /= config.BBOX_STD_DEV
# Generate class-specific target masks
masks = np.zeros((config.TRAIN_ROIS_PER_IMAGE, config.MASK_SHAPE[0], config.MASK_SHAPE[1], config.NUM_CLASSES),
dtype=np.float32)
for i in pos_ids:
class_id = roi_gt_class_ids[i]
assert class_id > 0, "class id must be greater than 0"
gt_id = roi_gt_assignment[i]
class_mask = gt_masks[:, :, gt_id]
# if config.USE_MINI_MASK:
# # Create a mask placeholder, the size of the image
# placeholder = np.zeros(config.IMAGE_SHAPE[:2], dtype=bool)
# # GT box
# gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
# gt_w = gt_x2 - gt_x1
# gt_h = gt_y2 - gt_y1
# # Resize mini mask to size of GT box
# placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \
# np.round(utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
# # Place the mini batch in the placeholder
# class_mask = placeholder
# Pick part of the mask and resize it
y1, x1, y2, x2 = rois[i].astype(np.int32)
m = class_mask[y1:y2, x1:x2]
mask = self.resize(m, config.MASK_SHAPE)
masks[i, :, :, class_id] = mask
return rois, roi_gt_class_ids, bboxes, masks
def data_generator(self, config, shuffle=True, augment=False, augmentation=None,
random_rois=0, batch_size=1, detection_targets=False):
b = 0 # batch item index
image_index = -1
image_ids = np.copy(self.image_ids) # copy so shuffling does not touch the dataset's own id array
error_count = 0
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
backbone_shapes = \
np.array([[int(math.ceil(config.IMAGE_SHAPE[0] / stride)),
int(math.ceil(config.IMAGE_SHAPE[1] / stride))] for stride in
config.BACKBONE_STRIDES]) # BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# compute_backbone_shapes(config, config.IMAGE_SHAPE) # (5,2) # [4, 8, 16, 32, 64]
anchors = self.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, # (8, 16, 32, 64, 128)
config.RPN_ANCHOR_RATIOS, # [0.5, 1, 2]
backbone_shapes, # image_shape / [4, 8, 16, 32, 64] is five rows 2 cols
config.BACKBONE_STRIDES, # [4, 8, 16, 32, 64]
config.RPN_ANCHOR_STRIDE) # =1
print('data_class_data_anchors')
# anchors: [N, 4]
# per level the anchor count is 3*(image_shape/4)**2, 3*(image_shape/8)**2,
# 3*(image_shape/16)**2, 3*(image_shape/32)**2 and 3*(image_shape/64)**2
# Keras requires a generator to run indefinitely.
while True:
try:
# Increment index to pick next image. Shuffle if at the start of an epoch.
image_index = (image_index + 1) % len(image_ids)
if shuffle and image_index == 0:
np.random.shuffle(image_ids)
# Get GT bounding boxes and masks for image.
image_id = image_ids[image_index]
image, image_meta, gt_class_ids, gt_boxes, gt_masks = \
self.load_image_gt(config, image_id, augment=augment,
augmentation=augmentation)
# Skip images that have no instances. This can happen in cases
# where we train on a subset of classes and the image doesn't
# have any of the classes we care about.
if not np.any(gt_class_ids > 0):
continue
# RPN Targets
rpn_match, rpn_bbox = self.build_rpn_targets(anchors, gt_class_ids, gt_boxes, config)
# Mask R-CNN Targets
if random_rois:
rpn_rois = self.generate_random_rois(image.shape, random_rois, gt_boxes)
if detection_targets:
rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask = \
self.build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config)
# Init batch arrays
if b == 0:
batch_image_meta = np.zeros((batch_size,) + image_meta.shape, dtype=image_meta.dtype)
batch_rpn_match = np.zeros([batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype)
batch_rpn_bbox = np.zeros([batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype)
batch_images = np.zeros((batch_size,) + image.shape, dtype=np.float32)
batch_gt_class_ids = np.zeros((batch_size, config.MAX_GT_INSTANCES), dtype=np.int32)
batch_gt_boxes = np.zeros((batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32)
batch_gt_masks = np.zeros(
(batch_size, gt_masks.shape[0], gt_masks.shape[1], config.MAX_GT_INSTANCES),
dtype=gt_masks.dtype)
if random_rois:
batch_rpn_rois = np.zeros((batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype)
if detection_targets:
batch_rois = np.zeros((batch_size,) + rois.shape, dtype=rois.dtype)
batch_mrcnn_class_ids = np.zeros((batch_size,) + mrcnn_class_ids.shape,
dtype=mrcnn_class_ids.dtype)
batch_mrcnn_bbox = np.zeros((batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype)
batch_mrcnn_mask = np.zeros((batch_size,) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype)
# If more instances than fits in the array, sub-sample from them.
if gt_boxes.shape[0] > config.MAX_GT_INSTANCES:
ids = np.random.choice(np.arange(gt_boxes.shape[0]), config.MAX_GT_INSTANCES, replace=False)
gt_class_ids = gt_class_ids[ids]
gt_boxes = gt_boxes[ids]
gt_masks = gt_masks[:, :, ids]
# Add to batch
batch_image_meta[b] = image_meta
batch_rpn_match[b] = rpn_match[:, np.newaxis]
batch_rpn_bbox[b] = rpn_bbox
batch_images[b] = image.astype(np.float32) - config.MEAN_PIXEL
batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids
batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes
batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks
if random_rois:
batch_rpn_rois[b] = rpn_rois
if detection_targets:
batch_rois[b] = rois
batch_mrcnn_class_ids[b] = mrcnn_class_ids
batch_mrcnn_bbox[b] = mrcnn_bbox
batch_mrcnn_mask[b] = mrcnn_mask
b += 1
# Batch full?
if b >= batch_size:
inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox,
batch_gt_class_ids, batch_gt_boxes, batch_gt_masks]
outputs = []
if random_rois:
inputs.extend([batch_rpn_rois])
if detection_targets:
inputs.extend([batch_rois])
# Keras requires that output and targets have the same number of dimensions
batch_mrcnn_class_ids = np.expand_dims(
batch_mrcnn_class_ids, -1)
outputs.extend(
[batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask])
print('data_load_finish')
yield inputs , outputs
# start a new batch
b = 0
except:
raise Exception("not pass")
'''
An exception may surface here when the generator object is destroyed; that is normal:
Exception ignored in: <generator object Dataset_data.data_generator at 0x000002002D40BB48>
Traceback (most recent call last):
File "C:/Users/51102/Desktop/MASKRCNN_tangjun/Mask_RCNN-master/train_demo.py", line 1249, in data_generator
raise Exception("not pass")
Exception: not pass
'''
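# To inspect one batch from the generator by hand (a sketch; img_floder must
# point at a dataset folder laid out as data_load_information expects):
#   dataset = Dataset_data()
#   dataset.data_load_information(img_floder)
#   gen = dataset.data_generator(config, shuffle=True, batch_size=config.batch_size)
#   inputs, outputs = next(gen)
#   images, image_meta, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks = inputs
#   images.shape     # (1, 256, 256, 3)
#   rpn_match.shape  # (1, 16368, 1) with this file's anchor settings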
def train_model():
img_floder ='C:\\Users\\51102\\Desktop\\maskrcnn(tangjun)\\1021' # point this at your dataset folder
dataset_train = Dataset_data()
dataset_train.data_load_information(img_floder)
model = modellib.MaskRCNN(mode="training", config=config)
COCO_MODEL_PATH='C:\\Users\\51102\\Desktop\\maskrcnn(tangjun)\\mask_rcnn_shapes_0002.h5'
model.load_weights(COCO_MODEL_PATH, by_name=True,
exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",
"mrcnn_bbox", "mrcnn_mask"])
# build the data generator
train_generator = dataset_train.data_generator(config, shuffle=True,
augmentation=None,
batch_size=config.batch_size)
model.train(train_generator,
learning_rate=config.LEARNING_RATE,
epochs=4,
layers='heads')
# Fine tune all layers
# Passing layers="all" trains all layers. You can also
# pass a regular expression to select which layers to
# train by name pattern.
# model.train(dataset_train, dataset_train,
# learning_rate=config.LEARNING_RATE / 10,
# epochs=3,
# layers="all")
from skimage.measure import find_contours
import matplotlib.pyplot as plt
from matplotlib import patches
from matplotlib.patches import Polygon
import colorsys
def random_colors(N, bright=True):
"""
Generate random colors.
To get visually distinct colors, generate them in HSV space then
convert to RGB.
"""
brightness = 1.0 if bright else 0.7
hsv = [(i / N, 1, brightness) for i in range(N)]
colors = list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv))
random.shuffle(colors)
return colors
def apply_mask(image, mask, color, alpha=0.5):
"""Apply the given mask to the image.
"""
for c in range(3):
image[:, :, c] = np.where(mask == 1,
image[:, :, c] *
(1 - alpha) + alpha * color[c] * 255,
image[:, :, c])
return image
def display_instances(image, boxes, masks, class_ids, class_names,
scores=None, title="",
figsize=(16, 16), ax=None,
show_mask=True, show_bbox=True,
colors=None, captions=None):
"""
boxes: [num_instance, (y1, x1, y2, x2, class_id)] in image coordinates.
masks: [height, width, num_instances]
class_ids: [num_instances]
class_names: list of class names of the dataset
scores: (optional) confidence scores for each box
title: (optional) Figure title
show_mask, show_bbox: To show masks and bounding boxes or not
figsize: (optional) the size of the image
colors: (optional) An array of colors to use with each object
captions: (optional) A list of strings to use as captions for each object
"""
# Number of instances
N = boxes.shape[0]
if not N:
print("\n*** No instances to display *** \n")
else:
assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]
# If no axis is passed, create one and automatically call show()
auto_show = False
if not ax:
_, ax = plt.subplots(1, figsize=figsize)
auto_show = True
# Generate random colors
colors = colors or random_colors(N)
# Show area outside image boundaries.
height, width = image.shape[:2]
ax.set_ylim(height + 10, -10)
ax.set_xlim(-10, width + 10)
ax.axis('off')
ax.set_title(title)
masked_image = image.astype(np.uint32).copy()
for i in range(N):
color = colors[i]
# Bounding box
if not np.any(boxes[i]):
# Skip this instance. Has no bbox. Likely lost in image cropping.
continue
y1, x1, y2, x2 = boxes[i]
# cv.rectangle(masked_image, (y1[0],x1[0]), (y2[0],x2[0]), (0, 250, 0), 2) # my own addition
if show_bbox:
p = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
alpha=0.7, linestyle="dashed",
edgecolor=color, facecolor='none')
ax.add_patch(p)
# Label
if not captions:
class_id = class_ids[i]
score = scores[i] if scores is not None else None
label = class_names[class_id]
caption = "{} {:.3f}".format(label, score) if score else label
else:
caption = captions[i]
ax.text(x1, y1 + 8, caption,
color='w', size=11, backgroundcolor="none")
# Mask
mask = masks[:, :, i]
if show_mask:
masked_image = apply_mask(masked_image, mask, color)
# Mask Polygon
# Pad to ensure proper polygons for masks that touch image edges.
padded_mask = np.zeros(
(mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)
padded_mask[1:-1, 1:-1] = mask
contours = find_contours(padded_mask, 0.5)
for verts in contours:
# Subtract the padding and flip (y, x) to (x, y)
verts = np.fliplr(verts) - 1
p = Polygon(verts, facecolor="none", edgecolor=color)
ax.add_patch(p)
ax.imshow(masked_image.astype(np.uint8))
if auto_show:
plt.show()
return masked_image
def predict():
import skimage.io
# Create models in training mode
config = Predict_Config()
config.display()
model = modellib.MaskRCNN(mode="inference", config=config)
# model_path = 'C:\\Users\\51102\\Desktop\mask-rcnn-me\\MASKRCNN_myself\Mask_RCNN-master\\logs\\shapes20200216T1602\\mask_rcnn_shapes_0002.h5'
model_path = 'C:\\Users\\51102\\Desktop\\maskrcnn(tangjun)\\log\\04.h5'
# Load trained weights (fill in path to trained weights here)
assert model_path != "", "Provide path to trained weights"
print("Loading weights from ", model_path)
model.load_weights(model_path, by_name=True)
class_names = ['BG', 'line_bulge','dot_concave','dot_bulge','Irregular_concave']
# file_names ='C:\\Users\\51102\\Desktop\\maskrcnn(tangjun)\\1.jpg'
file_names='C:\\Users\\51102\\Desktop\\maskrcnn(tangjun)\\3.bmp'
# image = skimage.io.imread(os.path.join(IMAGE_DIR, random.choice(file_names)))
image = skimage.io.imread(file_names)
image=image[:, :, 0:3]
print('image=', image.shape)
# Run detection
results = model.detect([image], log_print=1)
'''
results.append({
"rois": final_rois,
"class_ids": final_class_ids,
"scores": final_scores,
"masks": final_masks})
'''
# Visualize results
r = results[0]
print('r=',r)
display_instances(image, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'])
if __name__ == "__main__":
train_model()
# predict()
The following is the model file (.py).
"""
Mask R-CNN algorithm for object detection and instance segmentation
Written and modified by tang jun in January 2019
If you have questions, please contact me by email: tangjunjunfighter@163.com
"""
import skimage.color
import skimage.io
import skimage.transform
# import urllib.request
# import shutil
# import warnings
# from distutils.version import LooseVersion
import scipy
# import os
# import random
# import datetime
import re
import math
# import logging
# from collections import OrderedDict
# import multiprocessing
import numpy as np
import tensorflow as tf
import keras
import keras.backend as K # Keras backend and related functions
import keras.layers as KL
import keras.engine as KE
import keras.models as KM
# from mrcnn1 import utils
# Requires TensorFlow 1.3+ and Keras 2.0.8+.
from distutils.version import LooseVersion
assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
assert LooseVersion(keras.__version__) >= LooseVersion('2.0.8')
############################################################
# Utility Functions
############################################################
def batch_slice(inputs, graph_fn, batch_size, names=None):
"""Splits inputs into slices and feeds each slice to a copy of the given
computation graph and then combines the results. It allows you to run a
graph on a batch of inputs even if the graph is written to support one
instance only.
inputs: list of tensors. All must have the same first dimension length
graph_fn: A function that returns a TF tensor that's part of a graph.
batch_size: number of slices to divide the data into.
names: If provided, assigns names to the resulting tensors.
"""
if not isinstance(inputs, list):
inputs = [inputs]
outputs = []
for i in range(batch_size):
inputs_slice = [x[i] for x in inputs] # e.g. [scores[i], ix[i]]
output_slice = graph_fn(*inputs_slice)
if not isinstance(output_slice, (tuple, list)):
output_slice = [output_slice] # wrap a single tensor in a list
outputs.append(output_slice)
# Change outputs from a list of slices where each is
# a list of outputs to a list of outputs and each has
# a list of slices
outputs = list(zip(*outputs))
if names is None:
names = [None] * len(outputs) # one (optional) name per output
result = [tf.stack(o, axis=0, name=n) for o, n in zip(outputs, names)]
if len(result) == 1:
result = result[0]
return result
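# A minimal use of batch_slice (a demo function, not part of the original
# file): each batch element goes through graph_fn independently and the
# results are restacked along the batch axis.
def _demo_batch_slice():
    boxes = tf.constant([[[0., 0., 1., 1.]],
                         [[0., 0., 2., 2.]]])  # [batch=2, 1, 4]
    # Each slice boxes[i] of shape [1, 4] is doubled on its own.
    return batch_slice(boxes, lambda b: b * 2.0, batch_size=2, names=["doubled"])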
def norm_boxes(boxes, shape):
"""Converts boxes from pixel coordinates to normalized coordinates.
boxes: [N, (y1, x1, y2, x2)] in pixel coordinates
shape: [..., (height, width)] in pixels
Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
coordinates it's inside the box.
Returns:
[N, (y1, x1, y2, x2)] in normalized coordinates
"""
h, w = shape
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
return np.divide((boxes - shift), scale).astype(np.float32)
def denorm_boxes(boxes, shape):
h, w = shape
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
return np.around(np.multiply(boxes, scale) + shift).astype(np.int32)
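# norm_boxes and denorm_boxes are inverses over a given image shape; the shift
# accounts for (y2, x2) being exclusive in pixel space but inclusive once
# normalized. A quick round trip (a demo, not part of the original file):
def _demo_norm_boxes():
    px = np.array([[10, 20, 110, 220]])        # pixel box, (y2, x2) exclusive
    nb = norm_boxes(px, shape=(256, 256))      # ~[[0.039, 0.078, 0.427, 0.859]]
    return denorm_boxes(nb, shape=(256, 256))  # -> [[ 10  20 110 220]]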
def compute_iou(box, boxes, box_area, boxes_area):
# Compare one box against many; returns one IoU value per box in boxes.
"""Calculates IoU of the given box with the array of the given boxes.
box: 1D vector [y1, x1, y2, x2]
boxes: [boxes_count, (y1, x1, y2, x2)]
box_area: float. the area of 'box'
boxes_area: array of length boxes_count.
Note: the areas are passed in rather than calculated here for
efficiency. Calculate once in the caller to avoid duplicate work.
"""
# Calculate intersection areas
y1 = np.maximum(box[0], boxes[:, 0])
y2 = np.minimum(box[2], boxes[:, 2])
x1 = np.maximum(box[1], boxes[:, 1])
x2 = np.minimum(box[3], boxes[:, 3])
intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
union = box_area + boxes_area[:] - intersection[:]
iou = intersection / union
return iou
def log(text, array=None):
"""Prints a text message. And, optionally, if a Numpy array is provided it
prints it's shape, min, and max values.
"""
if array is not None:
text = text.ljust(25)
text += ("shape: {:20} ".format(str(array.shape)))
if array.size:
text += ("min: {:10.5f} max: {:10.5f}".format(array.min(),array.max()))
else:
text += ("min: {:10} max: {:10}".format("",""))
text += " {}".format(array.dtype)
print(text)
class BatchNorm(KL.BatchNormalization):
"""Extends the Keras BatchNormalization class to allow a central place
to make changes if needed.
Batch normalization has a negative effect on training if batches are small
so this layer is often frozen (via setting in Config class) and functions
as linear layer.
"""
def call(self, inputs, training=None):
"""
Note about training values:
None: Train BN layers. This is the normal mode
False: Freeze BN layers. Good when batch size is small
True: (don't use). Set layer in training mode even when making inferences
"""
return super(self.__class__, self).call(inputs, training=training)
def compute_backbone_shapes(config, image_shape):
"""Computes the width and height of each stage of the backbone network.
Returns:
[N, (height, width)]. Where N is the number of stages
"""
if callable(config.BACKBONE): # BACKBONE may be a callable instead of a name; here BACKBONE = "resnet101"
return config.COMPUTE_BACKBONE_SHAPE(image_shape)
# Currently supports ResNet only
assert config.BACKBONE in ["resnet50", "resnet101"]
return np.array([[int(math.ceil(image_shape[0] / stride)), int(math.ceil(image_shape[1] / stride))] for stride in config.BACKBONE_STRIDES]) # BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# one (height, width) pair per stride -> result shape [5, 2]
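# For the 256 x 256 square input used by this file's config, this returns
# [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4]]: one (height, width) pair
# per FPN level P2..P6.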
############################################################
# Resnet Graph
############################################################
# Code adopted from:
def identity_block(input_tensor, kernel_size, filters, stage, block, use_bias=True, train_bn=True):
"""The identity_block is the block that has no conv layer at shortcut
# Arguments
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the nb_filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: 'a','b'..., current block label, used for generating layer names
use_bias: Boolean. To use or not use a bias in conv layers.
train_bn: Boolean. Train or freeze Batch Norm layers
"""
nb_filter1, nb_filter2, nb_filter3 = filters
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=use_bias)(input_tensor)
x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',name=conv_name_base + '2b', use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + '2b')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + '2c')(x, training=train_bn)
x = KL.Add()([x, input_tensor])
x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
return x
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), use_bias=True, train_bn=True):
"""conv_block is the block that has a conv layer at shortcut
# Arguments
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the nb_filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: 'a','b'..., current block label, used for generating layer names
use_bias: Boolean. To use or not use a bias in conv layers.
train_bn: Boolean. Train or freeze Batch Norm layers
Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
And the shortcut should have subsample=(2,2) as well
"""
nb_filter1, nb_filter2, nb_filter3 = filters
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = KL.Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=use_bias)(input_tensor)
x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',name=conv_name_base + '2b', use_bias=use_bias)(x)
    # strides defaults to (1, 1) here
x = BatchNorm(name=bn_name_base + '2b')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + '2c')(x, training=train_bn)
shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides, name=conv_name_base + '1', use_bias=use_bias)(input_tensor)
shortcut = BatchNorm(name=bn_name_base + '1')(shortcut, training=train_bn)
    x = KL.Add()([x, shortcut])  # element-wise sum; x and shortcut have identical shapes
x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
return x
def resnet_graph(input_image, architecture, stage5=False, train_bn=True):
"""Build a ResNet graph.
architecture: Can be resnet50 or resnet101
stage5: Boolean. If False, stage5 of the network is not created
train_bn: Boolean. Train or freeze Batch Norm layers
"""
assert architecture in ["resnet50", "resnet101"]
# Stage 1
    x = KL.ZeroPadding2D((3, 3))(input_image)  # pad height and width with 3 rows/cols of zeros on each side
x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x)
    x = BatchNorm(name='bn_conv1')(x, training=train_bn)  # BatchNorm is the subclass defined above
x = KL.Activation('relu')(x)
C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
    # After stage 1, C1 (and x) are downsampled 4x relative to the input image
# Stage 2
    x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), train_bn=train_bn)  # spatial size is unchanged here because strides=(1, 1) overrides the default
x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', train_bn=train_bn)
C2 = x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', train_bn=train_bn)
# Stage 3
x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', train_bn=train_bn)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', train_bn=train_bn)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', train_bn=train_bn)
C3 = x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', train_bn=train_bn)
# Stage 4
x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', train_bn=train_bn)
    block_count = {"resnet50": 5, "resnet101": 22}[architecture]  # neat trick: pick the block count by architecture
for i in range(block_count):
x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), train_bn=train_bn)
C4 = x
# Stage 5
if stage5:
x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', train_bn=train_bn)
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', train_bn=train_bn)
C5 = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', train_bn=train_bn)
else:
C5 = None
return [C1, C2, C3, C4, C5]
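# Hedged usage sketch (my addition, not part of this file's training pipeline): build
# the backbone on a fixed-size input and inspect the stage outputs. For a 256x256
# input, C2..C5 are downsampled 4x, 8x, 16x and 32x, matching BACKBONE_STRIDES:
#   inp = KL.Input(shape=[256, 256, 3])
#   C1, C2, C3, C4, C5 = resnet_graph(inp, "resnet101", stage5=True, train_bn=False)
#   # C2: (None, 64, 64, 256), C3: (None, 32, 32, 512),
#   # C4: (None, 16, 16, 1024), C5: (None, 8, 8, 2048)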
############################################################
# Proposal Layer
############################################################
def apply_box_deltas_graph(boxes, deltas):
"""Applies the given deltas to the given boxes.
boxes: [N, (y1, x1, y2, x2)] boxes to update
deltas: [N, (dy, dx, log(dh), log(dw))] refinements to apply
"""
# Convert to y, x, h, w
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
center_y = boxes[:, 0] + 0.5 * height
center_x = boxes[:, 1] + 0.5 * width
# Apply deltas
center_y += deltas[:, 0] * height
center_x += deltas[:, 1] * width
height *= tf.exp(deltas[:, 2])
width *= tf.exp(deltas[:, 3])
# Convert back to y1, x1, y2, x2
y1 = center_y - 0.5 * height
x1 = center_x - 0.5 * width
y2 = y1 + height
x2 = x1 + width
result = tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out")
return result
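# Tiny numeric check (my sketch; evaluate inside a tf.Session in TF1): shifting a unit
# box by dy = 0.1 and scaling its height by exp(log(2)) = 2 around the new center:
#   b = tf.constant([[0., 0., 1., 1.]])
#   d = tf.constant([[0.1, 0., np.log(2.), 0.]])
#   apply_box_deltas_graph(b, d)  # -> [[-0.4, 0., 1.6, 1.]]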
def clip_boxes_graph(boxes, window):
"""
boxes: [N, (y1, x1, y2, x2)]
window: [4] in the form y1, x1, y2, x2
"""
# Split
wy1, wx1, wy2, wx2 = tf.split(window, 4)
y1, x1, y2, x2 = tf.split(boxes, 4, axis=1)
# Clip
y1 = tf.maximum(tf.minimum(y1, wy2), wy1)
x1 = tf.maximum(tf.minimum(x1, wx2), wx1)
y2 = tf.maximum(tf.minimum(y2, wy2), wy1)
x2 = tf.maximum(tf.minimum(x2, wx2), wx1)
clipped = tf.concat([y1, x1, y2, x2], axis=1, name="clipped_boxes")
clipped.set_shape((clipped.shape[0], 4))
return clipped
class ProposalLayer(KE.Layer):
"""Receives anchor scores and selects a subset to pass as proposals
to the second stage. Filtering is done based on anchor scores and
non-max suppression to remove overlaps. It also applies bounding
box refinement deltas to anchors.
Inputs:
rpn_probs: [batch, num_anchors, (bg prob, fg prob)]
rpn_bbox: [batch, num_anchors, (dy, dx, log(dh), log(dw))]
anchors: [batch, num_anchors, (y1, x1, y2, x2)] anchors in normalized coordinates
Returns:
Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
"""
def __init__(self, proposal_count, nms_threshold, config=None, **kwargs):
super(ProposalLayer, self).__init__(**kwargs) # adopt super function to call parent class original function
self.config = config
self.proposal_count = proposal_count
self.nms_threshold = nms_threshold
def call(self, inputs):
# Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
        scores = inputs[0][:, :, 1]  # becomes [batch, num_anchors]; keep the foreground probability
# Box deltas [batch, num_rois, 4]
deltas = inputs[1]
        deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4])  # RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
        # the reshape above adds leading dimensions so the constant broadcasts over [batch, num_anchors, 4]
# Anchors
anchors = inputs[2]
# Improve performance by trimming to top anchors by score
# and doing the rest on the smaller subset.
        pre_nms_limit = tf.minimum(self.config.PRE_NMS_LIMIT, tf.shape(anchors)[1])  # take the smaller of PRE_NMS_LIMIT (6000) and the actual anchor count
        ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True, name="top_anchors").indices  # top_k returns the k largest values per row (sorted descending) plus their indices; keep only the indices
        # inputs[0] was [batch, num_anchors, 2]; scores is now [batch, num_anchors]
scores = batch_slice([scores, ix], lambda x, y: tf.gather(x, y), self.config.batch_size) # IMAGES_PER_GPU=2 [?,?]
deltas = batch_slice([deltas, ix], lambda x, y: tf.gather(x, y), self.config.batch_size) # [?,?,?]
pre_nms_anchors = batch_slice([anchors, ix], lambda a, x: tf.gather(a, x), self.config.batch_size, names=["pre_nms_anchors"]) #[?,?,?]
# Apply deltas to anchors to get refined anchors.
# [batch, N, (y1, x1, y2, x2)]
boxes = batch_slice([pre_nms_anchors, deltas], lambda x, y: apply_box_deltas_graph(x, y), self.config.batch_size, names=["refined_anchors"])
        # boxes holds corner coordinates; the predicted deltas refine the center point plus height and width
# Clip to image boundaries. Since we're in normalized coordinates,
# clip to 0..1 range. [batch, N, (y1, x1, y2, x2)]
window = np.array([0, 0, 1, 1], dtype=np.float32)
boxes = batch_slice(boxes,
lambda x: clip_boxes_graph(x, window),
self.config.batch_size,
names=["refined_anchors_clipped"])
# Filter out small boxes
# According to Xinlei Chen's paper, this reduces detection accuracy
# for small objects, so we're skipping it.
# Non-max suppression
def nms(boxes, scores):
indices = tf.image.non_max_suppression(
boxes, scores, self.proposal_count,
self.nms_threshold, name="rpn_non_max_suppression")
proposals = tf.gather(boxes, indices)
# Pad if needed
padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
proposals = tf.pad(proposals, [(0, padding), (0, 0)])
return proposals
proposals = batch_slice([boxes, scores], nms, self.config.batch_size)
return proposals
def compute_output_shape(self, input_shape):
return (None, self.proposal_count, 4)
############################################################
# ROIAlign Layer
############################################################
# def log2_graph(x):
# """Implementation of Log2. TF doesn't have a native implementation."""
# return tf.log(x) / tf.log(2.0)
class PyramidROIAlign(KE.Layer):
"""Implements ROI Pooling on multiple levels of the feature pyramid.
Params:
- pool_shape: [pool_height, pool_width] of the output pooled regions. Usually [7, 7]
Inputs:
- boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
coordinates. Possibly padded with zeros if not enough
boxes to fill the array.
- image_meta: [batch, (meta data)] Image details. See compose_image_meta()
- feature_maps: List of feature maps from different levels of the pyramid.
Each is [batch, height, width, channels]
Output:
Pooled regions in the shape: [batch, num_boxes, pool_height, pool_width, channels].
    The width and height are those specified in the pool_shape in the layer
constructor.
"""
def __init__(self, pool_shape, **kwargs):
super(PyramidROIAlign, self).__init__(**kwargs)
self.pool_shape = tuple(pool_shape)
def call(self, inputs):
# Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
boxes = inputs[0]
# Image meta
# Holds details about the image. See compose_image_meta()
image_meta = inputs[1]
# Feature Maps. List of feature maps from different level of the
# feature pyramid. Each is [batch, height, width, channels]
feature_maps = inputs[2:]
# Assign each ROI to a level in the pyramid based on the ROI area.
y1, x1, y2, x2 = tf.split(boxes, 4, axis=2) # [p2,p3,p4,p5]
h = y2 - y1
w = x2 - x1
# Use shape of first image. Images in a batch must have the same size.
        image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]  # reads the "image_shape" entry of the meta dict
# return {"image_id": image_id,"original_image_shape": original_image_shape,
# "image_shape": image_shape,"window": window,"scale": scale, "active_class_ids": active_class_ids, }
# Equation 1 in the Feature Pyramid Networks paper. Account for
# the fact that our coordinates are normalized here.
# e.g. a 224x224 ROI (in pixels) maps to P4
image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
        roi_level = tf.log(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area))) / tf.log(2.0)  # equation 1 of the FPN paper, adapted to normalized coordinates
roi_level = tf.minimum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
roi_level = tf.squeeze(roi_level, 2)
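        # Worked example of equation 1 above (my numbers, assuming a hypothetical
        # 1024x1024 input): a normalized ROI of 0.21875 x 0.21875 spans 224x224 pixels,
        # so sqrt(h*w) / (224 / sqrt(image_area)) = 1 and
        # roi_level = 4 + round(log2(1)) = 4, i.e. the ROI pools from P4,
        # matching the FPN paper's rule that a 224x224 ROI maps to P4.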
# Loop through levels and apply ROI pooling to each. P2 to P5.
pooled = []
        box_to_level = []  # records which pyramid level each box came from
for i, level in enumerate(range(2, 6)):
            ix = tf.where(tf.equal(roi_level, level))  # indices of the boxes assigned to this level
            level_boxes = tf.gather_nd(boxes, ix)  # pull out this level's boxes
            # Box indices for crop_and_resize.
            box_indices = tf.cast(ix[:, 0], tf.int32)  # e.g. [6 7 8 9 17 18] (the batch index of each box)
# Keep track of which box is mapped to which level
box_to_level.append(ix)
            # Stop gradient propagation to ROI proposals
level_boxes = tf.stop_gradient(level_boxes)
box_indices = tf.stop_gradient(box_indices)
# Crop and Resize
# From Mask R-CNN paper: "We sample four regular locations, so
# that we can evaluate either max or average pooling. In fact,
# interpolating only a single value at each bin center (without
# pooling) is nearly as effective."
#
# Here we use the simplified approach of a single value per bin,
# which is how it's done in tf.crop_and_resize()
# Result: [batch * num_boxes, pool_height, pool_width, channels]
            pooled.append(tf.image.crop_and_resize(feature_maps[i], level_boxes, box_indices, self.pool_shape, method="bilinear"))  # bilinear interpolation
            # feature_maps[i] is indexed per batch item; box_indices selects the batch entry that matches each box.
            # The call above picks out the feature map of the right level and batch, crops it with the level's boxes, then pools the crop to pool_shape.
# Pack pooled features into one tensor
        pooled = tf.concat(pooled, axis=0)  # concatenate along rows: column count unchanged, row count grows
        # Pack box_to_level mapping into one array and add another
        # column representing the order of pooled boxes
        box_to_level = tf.concat(box_to_level, axis=0)  # concatenate along rows; keeps each box's pyramid level
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)  # shape [num_boxes, 1], i.e. [[0], [1], [2], ...]
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], axis=1)
        '''
        box_to_level rows are now [batch index, box index, running order], e.g.
        [1, 12, 0]
        [2, 24, 1]
        [0, 31, 2]
        ...
        '''
# Rearrange pooled features to match the order of the original boxes
# Sort box_to_level by batch then box index
# TF doesn't have a way to sort by two columns, so merge them and sort.
sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape(box_to_level)[0]).indices[::-1]  # ascending sort order of the boxes (top_k sorts descending, so reverse it)
        '''
        Example of tf.nn.top_k with k=2 on a [3, 4] matrix:
        [[ 0.98925872 0.15743092 0.76471106 0.5949957 ]
         [ 0.95766488 0.67846336 0.21058844 0.2644312 ]
         [ 0.65531991 0.61445187 0.65372938 0.88111084]]
        TopKV2(values=array([[ 0.98925872, 0.76471106],
                             [ 0.95766488, 0.67846336],
                             [ 0.88111084, 0.65531991]]), indices=array([[0, 2],
                             [0, 1],
                             [3, 0]]))
        '''
        ix = tf.gather(box_to_level[:, 2], ix)  # recover the running order of the pooled boxes from the sorted rows
        pooled = tf.gather(pooled, ix)
        # Re-add the batch dimension
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)  # e.g. [batch, num_boxes, 7, 7, 256]
pooled = tf.reshape(pooled, shape)
return pooled
def compute_output_shape(self, input_shape):
return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )
############################################################
# Detection Target Layer
############################################################
def overlaps_graph(boxes1, boxes2):
"""Computes IoU overlaps between two sets of boxes.
boxes1, boxes2: [N, (y1, x1, y2, x2)].
"""
# 1. Tile boxes2 and repeat boxes1. This allows us to compare
# every boxes1 against every boxes2 without loops.
# TF doesn't have an equivalent to np.repeat() so simulate it
# using tf.tile() and tf.reshape.
b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1), [1, 1, tf.shape(boxes2)[0]]), [-1, 4])
b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
# 2. Compute intersections
b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
y1 = tf.maximum(b1_y1, b2_y1)
x1 = tf.maximum(b1_x1, b2_x1)
y2 = tf.minimum(b1_y2, b2_y2)
x2 = tf.minimum(b1_x2, b2_x2)
intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
# 3. Compute unions
b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
union = b1_area + b2_area - intersection
# 4. Compute IoU and reshape to [boxes1, boxes2]
iou = intersection / union
overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
return overlaps
def box_refinement_graph(box, gt_box):  # box and gt_box must have identical shapes
"""Compute refinement needed to transform box to gt_box.
box and gt_box are [N, (y1, x1, y2, x2)]
"""
box = tf.cast(box, tf.float32)
gt_box = tf.cast(gt_box, tf.float32)
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
gt_height = gt_box[:, 2] - gt_box[:, 0]
gt_width = gt_box[:, 3] - gt_box[:, 1]
gt_center_y = gt_box[:, 0] + 0.5 * gt_height
gt_center_x = gt_box[:, 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = tf.log(gt_height / height)
dw = tf.log(gt_width / width)
    result = tf.stack([dy, dx, dh, dw], axis=1)  # the deltas that transform box into gt_box
return result
def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config):
    '''
    The inputs of DetectionTargetLayer are target_rois, input_gt_class_ids, gt_boxes and
    input_gt_masks, where target_rois is the output of ProposalLayer. First, the IoU
    between every roi in target_rois and every ground-truth box gt_boxes is computed;
    a roi whose best IoU is >= 0.5 is a positive sample, while negatives are rois with
    IoU < 0.5 that also barely overlap any crowd box. After picking positives and
    negatives, the sample balance is maintained (the ratio is configurable). Finally,
    each positive roi is matched to its closest ground-truth box, the offsets between
    the two are computed, and the matching mask is resized to 28*28.
    :param proposals:
    :param gt_class_ids:
    :param gt_boxes:
    :param gt_masks:
    :param config:
    :return:
    '''
"""Generates detection targets for one image. Subsamples proposals and
generates target class IDs, bounding box deltas, and masks for each.
Inputs:
proposals: [POST_NMS_ROIS_TRAINING, (y1, x1, y2, x2)] in normalized coordinates. Might
be zero padded if there are not enough proposals.
gt_class_ids: [MAX_GT_INSTANCES] int class IDs
gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type.
Returns: Target ROIs and corresponding class IDs, bounding box shifts,
and masks.
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
boundaries and resized to neural network output size.
Note: Returned arrays might be zero padded if not enough target ROIs.
"""
# Assertions
asserts = [
tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals],
name="roi_assertion"),
]
with tf.control_dependencies(asserts):
proposals = tf.identity(proposals)
# Remove zero padding
proposals, _ = trim_zeros_graph(proposals, name="trim_proposals")
gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes")
gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros, name="trim_gt_class_ids")
gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2, name="trim_gt_masks")
# Handle COCO crowds
    # In the COCO dataset some boxes enclose many objects; such boxes are removed during training
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
    crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
crowd_boxes = tf.gather(gt_boxes, crowd_ix)
    # From here on only the real per-instance objects in the image are used for training; crowd boxes have been excluded
gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)
# Compute overlaps matrix [proposals, gt_boxes]
overlaps = overlaps_graph(proposals, gt_boxes)
# Compute overlaps with crowd boxes [proposals, crowd_boxes]
crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
    no_crowd_bool = (crowd_iou_max < 0.001)  # True for rois that barely overlap any crowd box
# Determine positive and negative ROIs
roi_iou_max = tf.reduce_max(overlaps, axis=1)
    '''
    Explanation of the line above: given
    overlaps = [[0.76174609 0.80333894 0.68258544 0.57697359 0.85310562]
                [0.43019702 0.52369922 0.97526372 0.73503863 0.57165666]
                [0.35172219 0.23619196 0.50828622 0.60014882 0.67331094]
                [0.15814392 0.68016351 0.08231241 0.47771463 0.69517046]]
    tf.reduce_max(overlaps, axis=1) returns the row-wise maxima:
    [0.85310562 0.97526372 0.67331094 0.69517046]
    '''
# 1. Positive ROIs are those with >= 0.5 IoU with a GT box
    positive_roi_bool = (roi_iou_max >= 0.5)  # e.g. [True False False False False True]; the order follows the proposals
    positive_indices = tf.where(positive_roi_bool)[:, 0]  # e.g. [0 5]; row indices into overlaps where positive_roi_bool holds
    # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
    negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]  # NOTE (original author's warning): this fails if roi_iou_max and no_crowd_bool have different shapes
# Subsample ROIs. Aim for 33% positive
# Positive ROIs
    positive_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)  # e.g. TRAIN_ROIS_PER_IMAGE = 32, ROI_POSITIVE_RATIO = 0.33
    positive_indices = tf.random_shuffle(positive_indices)[:positive_count]  # if positive_count exceeds len(positive_indices), all of them are kept after the shuffle
    # from the proposals with IoU >= 0.5, shuffle and take the first positive_count
positive_count = tf.shape(positive_indices)[0]
# Negative ROIs. Add enough to maintain positive:negative ratio.
r = 1.0 / config.ROI_POSITIVE_RATIO # 1/0.33
    negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count  # i.e. positive_count * 3 - positive_count
    negative_indices = tf.random_shuffle(negative_indices)[:negative_count]  # from the proposals with IoU < 0.5, shuffle and take the first negative_count
# Gather selected ROIs
    positive_rois = tf.gather(proposals, positive_indices)  # pick the len(positive_indices) positive proposals
    negative_rois = tf.gather(proposals, negative_indices)  # pick the len(negative_indices) negative proposals
    # Assign positive ROIs to GT boxes.
    positive_overlaps = tf.gather(overlaps, positive_indices)  # rows of the overlaps matrix for the sampled positives; the code above only decided which proposals are positives and negatives
    # the lines below match each sampled positive to its ground-truth label
    roi_gt_box_assignment = tf.cond(  # tf.cond(pred, a, b): returns a when pred is True, else b
        tf.greater(tf.shape(positive_overlaps)[1], 0),  # tf.greater(a, b) is True only when a > b
        true_fn = lambda: tf.argmax(positive_overlaps, axis=1),  # index of the best-matching gt box per row
        false_fn = lambda: tf.cast(tf.constant([]),tf.int64)
    )
    roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)  # the ground-truth box matched to each sampled positive
    roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)  # the ground-truth class id matched to each sampled positive
# Compute bbox refinement for positive ROIs
    deltas = box_refinement_graph(positive_rois, roi_gt_boxes)  # positive_rois and roi_gt_boxes correspond one-to-one; output is [dy, dx, dh, dw]
deltas /= config.BBOX_STD_DEV # BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# Assign positive ROIs to GT masks
# Permute masks to [N, height, width, 1]
    transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1)  # reorder the dimensions and append a trailing channel dimension
# Pick the right mask for each ROI
    # pick the positive-sample masks according to roi_gt_box_assignment
    roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment)  # the ground-truth mask matched to each sampled positive
    # Compute mask targets
    boxes = positive_rois  # boxes are the positive proposal boxes
    # if config.USE_MINI_MASK:
    #     '''
    #     If mini masks are used, positive_rois must be converted into the coordinate
    #     frame of roi_gt_boxes here, because a mini mask only stores the mask inside
    #     the gt box, as the author's comment explains: "We store mask pixels that are inside the object bounding box,
    #     '''
# # Transform ROI coordinates from normalized image space
# # to normalized mini-mask space.
# y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
# gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
# gt_h = gt_y2 - gt_y1
# gt_w = gt_x2 - gt_x1
# y1 = (y1 - gt_y1) / gt_h
# x1 = (x1 - gt_x1) / gt_w
# y2 = (y2 - gt_y1) / gt_h
# x2 = (x2 - gt_x1) / gt_w
# boxes = tf.concat([y1, x1, y2, x2], 1)
#
box_ids = tf.range(0, tf.shape(roi_masks)[0])
    masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes, box_ids, config.MASK_SHAPE)  # MASK_SHAPE = [28, 28]; roi_masks must be a 4-D tensor of shape [batch, image_height, image_width, depth]
    # roi_masks and boxes correspond one-to-one, where boxes come from the sampled network proposals.
    # In effect, roi_masks[i] is cropped by boxes[i] (paired via box_ids) and resized to MASK_SHAPE (28, 28), producing roi_masks.shape[0] mask images.
    '''
    This call behaves like an RoiPooling operation. Its prototype is:
    def crop_and_resize(image, boxes, box_ind, crop_size, method=None, extrapolation_value=None, name=None):
    Parameters:
    image: the feature map from which each proposal's features are extracted
    boxes: the coordinates of each proposal, shape (N, 4)
    box_ind: which image of the mini-batch each proposal comes from
    crop_size: the output size after RoiPooling
    '''
# Remove the extra dimension from masks.
'''
# 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
shape(squeeze(t)) ==> [2, 3]
Or, to remove specific size 1 dimensions:
# 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
'''
    masks = tf.squeeze(masks, axis=3)  # input is [num_positive_rois, height, width, 1]; the axis=3 dimension must be 1 or this raises an error
# Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with binary cross entropy loss.
    masks = tf.round(masks)  # round to 0 or 1; values at exactly 0.5 round down to 0 (tf.round rounds half to even)
# Append negative ROIs and pad bbox deltas and masks that are not used for negative ROIs with zeros.
rois = tf.concat([positive_rois, negative_rois], axis=0)
N = tf.shape(negative_rois)[0]
P = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0) # TRAIN_ROIS_PER_IMAGE = 32
rois = tf.pad(rois, [(0, P), (0, 0)])
roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
masks = tf.pad(masks, [[0, N + P], (0, 0), (0, 0)])
    '''
    The anchors produced by the RPN have now been split into positive and negative
    samples; the offsets between the positives and their ground-truth boxes, plus
    the target mask values, are computed here. These are the ground truths the
    later loss functions need.
    '''
    # Along the first dimension the returned tensors hold the positives first, then the negatives; the entries tied to negatives (and padding) are zero-filled
return rois, roi_gt_class_ids, deltas, masks
class DetectionTargetLayer(KE.Layer):
"""Subsamples proposals and generates target box refinement, class_ids,
and masks for each.
Inputs:
proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might be zero padded if there are not enough proposals.
gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type
Returns: Target ROIs and corresponding class IDs, bounding box shifts, and masks.
rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw)]
target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
Masks cropped to bbox boundaries and resized to neural
network output size.
Note: Returned arrays might be zero padded if not enough target ROIs.
"""
def __init__(self, config, **kwargs):
super(DetectionTargetLayer, self).__init__(**kwargs)
self.config = config
def call(self, inputs):
proposals = inputs[0]
gt_class_ids = inputs[1]
gt_boxes = inputs[2]
gt_masks = inputs[3]
# Slice the batch and run a graph for each slice
# TODO: Rename target_bbox to target_deltas for clarity
names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
outputs = batch_slice([proposals, gt_class_ids, gt_boxes, gt_masks],
lambda w, x, y, z: detection_targets_graph(w, x, y, z, self.config), self.config.batch_size, names=names)
return outputs
def compute_output_shape(self, input_shape):
return [
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), # rois
(None, self.config.TRAIN_ROIS_PER_IMAGE), # class_ids
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), # deltas
(None, self.config.TRAIN_ROIS_PER_IMAGE, self.config.MASK_SHAPE[0], self.config.MASK_SHAPE[1]) # masks
]
def compute_mask(self, inputs, mask=None):
return [None, None, None, None]
############################################################
# Detection Layer
############################################################
def refine_detections_graph(rois, probs, deltas, window, config):
"""Refine classified proposals and filter overlaps and return final
detections.
Inputs:
rois: [N, (y1, x1, y2, x2)] in normalized coordinates
probs: [N, num_classes]. Class probabilities.
deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific
bounding box deltas.
window: (y1, x1, y2, x2) in normalized coordinates. The part of the image
that contains the image excluding the padding.
Returns detections shaped: [num_detections, (y1, x1, y2, x2, class_id, score)] where
coordinates are normalized.
"""
# Class IDs per ROI
class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
# Class probability of the top class of each ROI
indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1)
class_scores = tf.gather_nd(probs, indices)
# Class-specific bounding box deltas
deltas_specific = tf.gather_nd(deltas, indices)
# Apply bounding box deltas
# Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
refined_rois = apply_box_deltas_graph(
rois, deltas_specific * config.BBOX_STD_DEV)
# Clip boxes to image window
refined_rois = clip_boxes_graph(refined_rois, window)
# TODO: Filter out boxes with zero area
# Filter out background boxes
keep = tf.where(class_ids > 0)[:, 0]
# Filter out low confidence boxes
if config.DETECTION_MIN_CONFIDENCE:
conf_keep = tf.where(class_scores >= config.DETECTION_MIN_CONFIDENCE)[:, 0]
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
tf.expand_dims(conf_keep, 0))
keep = tf.sparse_tensor_to_dense(keep)[0]
# Apply per-class NMS
# 1. Prepare variables
pre_nms_class_ids = tf.gather(class_ids, keep)
pre_nms_scores = tf.gather(class_scores, keep)
pre_nms_rois = tf.gather(refined_rois, keep)
unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]
def nms_keep_map(class_id):
"""Apply Non-Maximum Suppression on ROIs of the given class."""
# Indices of ROIs of the given class
ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
# Apply NMS
class_keep = tf.image.non_max_suppression(
tf.gather(pre_nms_rois, ixs),
tf.gather(pre_nms_scores, ixs),
max_output_size=config.DETECTION_MAX_INSTANCES,
iou_threshold=config.DETECTION_NMS_THRESHOLD)
# Map indices
class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
# Pad with -1 so returned tensors have the same shape
gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0]
class_keep = tf.pad(class_keep, [(0, gap)],
                            mode='CONSTANT', constant_values=-1)  # pad the remaining slots with -1
# Set shape so map_fn() can infer result shape
class_keep.set_shape([config.DETECTION_MAX_INSTANCES])
return class_keep
    # the indices in class_keep are indices into the original class_ids; each value points at the corresponding position in class_ids
# 2. Map over class IDs
nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids,
dtype=tf.int64)
# 3. Merge results into one list, and remove -1 padding
nms_keep = tf.reshape(nms_keep, [-1])
nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
# 4. Compute intersection between keep and nms_keep
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
tf.expand_dims(nms_keep, 0))
keep = tf.sparse_tensor_to_dense(keep)[0]
# Keep top detections
roi_count = config.DETECTION_MAX_INSTANCES
class_scores_keep = tf.gather(class_scores, keep)
num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
keep = tf.gather(keep, top_ids)
# Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
# Coordinates are normalized.
detections = tf.concat([
tf.gather(refined_rois, keep),
tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis],
tf.gather(class_scores, keep)[..., tf.newaxis]
], axis=1)
# Pad with zeros if detections < DETECTION_MAX_INSTANCES
gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
return detections
class DetectionLayer(KE.Layer):
"""Takes classified proposal boxes and their bounding box deltas and
returns the final detection boxes.
Returns:
[batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] where
coordinates are normalized.
"""
def __init__(self, config=None, **kwargs):
super(DetectionLayer, self).__init__(**kwargs)
self.config = config
def call(self, inputs):
rois = inputs[0]
mrcnn_class = inputs[1]
mrcnn_bbox = inputs[2]
image_meta = inputs[3]
# Get windows of images in normalized coordinates. Windows are the area
# in the image that excludes the padding.
# Use the shape of the first image in the batch to normalize the window
# because we know that all images get resized to the same size.
m = parse_image_meta_graph(image_meta)
image_shape = m['image_shape'][0]
window = norm_boxes_graph(m['window'], image_shape[:2])
# Run detection refinement graph on each item in the batch
detections_batch = batch_slice(
[rois, mrcnn_class, mrcnn_bbox, window],
lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config),
self.config.batch_size)
# Reshape output
# [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
# normalized coordinates
return tf.reshape(
detections_batch,
[self.config.batch_size, self.config.DETECTION_MAX_INSTANCES, 6])
def compute_output_shape(self, input_shape):
return (None, self.config.DETECTION_MAX_INSTANCES, 6)
############################################################
# Region Proposal Network (RPN)
############################################################
def build_rpn_model(anchor_stride, anchors_per_location, depth):
"""Builds a Keras model of the Region Proposal Network.
It wraps the RPN graph so it can be used multiple times with shared
weights.
anchors_per_location: number of anchors per pixel in the feature map
anchor_stride: Controls the density of anchors. Typically 1 (anchors for
every pixel in the feature map), or 2 (every other pixel).
depth: Depth of the backbone feature map.
Returns a Keras Model object. The model outputs, when called, are:
rpn_class_logits: [batch, H * W * anchors_per_location, 2] Anchor classifier logits (before softmax)
rpn_probs: [batch, H * W * anchors_per_location, 2] Anchor classifier probabilities.
rpn_bbox: [batch, H * W * anchors_per_location, (dy, dx, log(dh), log(dw))] Deltas to be
applied to anchors.
"""
input_feature_map = KL.Input(shape=[None, None, depth], name="input_rpn_feature_map")
# TODO: check if stride of 2 causes alignment issues if the feature map
# is not even.
# Shared convolutional base of the RPN
shared = KL.Conv2D(512, (3, 3), padding='same', activation='relu', strides=anchor_stride,
name='rpn_conv_shared')(input_feature_map)
# Anchor Score. [batch, height, width, anchors per location * 2].
    # The 1x1 convolution below keeps the spatial size unchanged and outputs
    # 2 * anchors_per_location channels (one bg/fg pair per anchor), each of size
    # height x width, so after the reshape the output is [batch, H * W * anchors_per_location, 2], as expected
    x = KL.Conv2D(2 * anchors_per_location, (1, 1), padding='valid', activation='linear', name='rpn_class_raw')(shared)
    # Reshape to [batch, anchors, 2] where anchors = H * W * anchors_per_location
rpn_class_logits = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x)
# Softmax on last dimension of BG/FG.
rpn_probs = KL.Activation("softmax", name="rpn_class_xxx")(rpn_class_logits)
# Bounding box refinement. [batch, H, W, anchors per location * depth]
# where depth is [x, y, log(w), log(h)]
x = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid", activation='linear', name='rpn_bbox_pred')(shared)
# Reshape to [batch, anchors, 4]
rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x)
outputs = [rpn_class_logits, rpn_probs, rpn_bbox]
return KM.Model([input_feature_map], outputs, name="rpn_model")
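# Hedged usage sketch (my addition): the same RPN model is applied to every FPN level.
# The shapes below assume TOP_DOWN_PYRAMID_SIZE = 256 and 3 anchors per location:
#   rpn = build_rpn_model(anchor_stride=1, anchors_per_location=3, depth=256)
#   logits, probs, bbox = rpn([KL.Input(shape=[32, 32, 256])])
#   # logits, probs: (None, 32*32*3, 2); bbox: (None, 32*32*3, 4)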
############################################################
# Feature Pyramid Network Heads
############################################################
def fpn_classifier_graph(rois, feature_maps, image_meta, pool_size, num_classes, train_bn=True, fc_layers_size=1024):
"""Builds the computation graph of the feature pyramid network classifier and regressor heads.
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized coordinates.
feature_maps: List of feature maps from different layers of the pyramid,
[P2, P3, P4, P5]. Each has a different resolution. # 256 for every layer
image_meta: [batch, (meta data)] Image details. See compose_image_meta() [batch,1+3+3+4+1+num_class]
pool_size: The width of the square feature map generated from ROI Pooling.
meta = np.array(
[image_id] + # size=1
list(original_image_shape) + # size=3
list(image_shape) + # size=3
    list(window) +          # size=4 (y1, x1, y2, x2) in image coordinates
[scale] + # size=1
list(active_class_ids) # size=num_classes
)
num_classes: number of classes, which determines the depth of the results
train_bn: Boolean. Train or freeze Batch Norm layers
fc_layers_size: Size of the 2 FC layers
Returns:
logits: [batch, num_rois, NUM_CLASSES] classifier logits (before softmax)
probs: [batch, num_rois, NUM_CLASSES] classifier probabilities
bbox_deltas: [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))] Deltas to apply to proposal boxes
"""
# ROI Pooling
# Shape: [batch, num_rois, POOL_SIZE, POOL_SIZE, channels]
x = PyramidROIAlign([pool_size, pool_size], name="roi_align_classifier")([rois, image_meta] + feature_maps)
# Two 1024 FC layers (implemented with Conv2D for consistency)
x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid"),
name="mrcnn_class_conv1")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn1')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (1, 1)),
name="mrcnn_class_conv2")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn2')(x, training=train_bn)
x = KL.Activation('relu')(x)
shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2),
name="pool_squeeze")(x)
# Classifier head
mrcnn_class_logits = KL.TimeDistributed(KL.Dense(num_classes),
name='mrcnn_class_logits')(shared)
mrcnn_probs = KL.TimeDistributed(KL.Activation("softmax"),
name="mrcnn_class")(mrcnn_class_logits)
# BBox head
# [batch, num_rois, NUM_CLASSES * (dy, dx, log(dh), log(dw))]
x = KL.TimeDistributed(KL.Dense(num_classes * 4, activation='linear'),
name='mrcnn_bbox_fc')(shared)
# Reshape to [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
s = K.int_shape(x)
mrcnn_bbox = KL.Reshape((s[1], num_classes, 4), name="mrcnn_bbox")(x)
return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox
def build_fpn_mask_graph(rois, feature_maps, image_meta, pool_size, num_classes, train_bn=True):
"""Builds the computation graph of the mask head of Feature Pyramid Network.
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized coordinates.
feature_maps: List of feature maps from different layers of the pyramid,
[P2, P3, P4, P5]. Each has a different resolution.
image_meta: [batch, (meta data)] Image details. See compose_image_meta()
pool_size: The width of the square feature map generated from ROI Pooling.
num_classes: number of classes, which determines the depth of the results
train_bn: Boolean. Train or freeze Batch Norm layers
Returns: Masks [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, NUM_CLASSES]
"""
# ROI Pooling
# Shape: [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, channels]
x = PyramidROIAlign([pool_size, pool_size], name="roi_align_mask")([rois, image_meta] + feature_maps)
# Conv layers
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv1")(x)
x = KL.TimeDistributed(BatchNorm(),
name='mrcnn_mask_bn1')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv2")(x)
x = KL.TimeDistributed(BatchNorm(),
name='mrcnn_mask_bn2')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv3")(x)
x = KL.TimeDistributed(BatchNorm(),
name='mrcnn_mask_bn3')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv4")(x)
x = KL.TimeDistributed(BatchNorm(),
name='mrcnn_mask_bn4')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2DTranspose(256, (2, 2), strides=2, activation="relu"),
name="mrcnn_mask_deconv")(x)
x = KL.TimeDistributed(KL.Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid"),
name="mrcnn_mask")(x)
return x
############################################################
# Loss Functions
############################################################
def smooth_l1_loss(y_true, y_pred):
"""
Implements Smooth-L1 loss. y_true and y_pred are typically: [N, 4], but could be any shape.
    In the original author's words: because the regression targets are unbounded, a
    single large erroneous offset could dominate the loss and cause exploding
    gradients; smooth L1 avoids this. Smooth L1 combines the advantages of L2 near
    zero (faster convergence, differentiable at 0) with the advantages of L1 in the
    outer region, making the network more robust to outliers and able to recover
    from large offsets.
    Compared with L2, smooth L1 is less sensitive to outliers and bounds the gradient
    magnitude, so training is less likely to diverge.
    (From Fast R-CNN: an L1 loss that is less sensitive to outliers than the L2 loss used in R-CNN and SPPnet)
    Source: https://blog.csdn.net/ytusdc/article/details/86659696
"""
diff = K.abs(y_true - y_pred)
    less_than_one = K.cast(K.less(diff, 1.0), "float32")  # element-wise truth value of (diff < 1.0)
    loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)  # when |diff| < 1, less_than_one is 1 and the quadratic term applies; otherwise (1 - less_than_one) switches on the linear term
return loss
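# Quick numeric check (my addition): the loss is quadratic inside |diff| < 1 and
# linear outside, i.e. L(d) = 0.5 * d**2 if |d| < 1 else |d| - 0.5. For example:
#   diff = 0.5 -> 0.5 * 0.5**2 = 0.125
#   diff = 3.0 -> 3.0 - 0.5 = 2.5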
def rpn_class_loss_graph(rpn_match, rpn_class_logits):
"""RPN anchor classifier loss.
rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for BG/FG.
"""
# Squeeze last dim to simplify
    rpn_match = tf.squeeze(rpn_match, -1)  # drop the trailing singleton dimension
# Get anchor classes. Convert the -1/+1 match to 0/1 values.
    anchor_class = K.cast(K.equal(rpn_match, 1), tf.int32)  # entries equal to 1 become True, everything else False, then cast to 1/0
# Positive and Negative anchors contribute to the loss,
# but neutral anchors (match value = 0) don't.
    indices = tf.where(K.not_equal(rpn_match, 0))  # [n, 2]; positions of the non-zero entries. Neutral anchors don't contribute to the loss, so this keeps only the indices that do
# Pick rows that contribute to the loss and filter out the rest.
    rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)  # pick the 2-way logits at the selected (batch, anchor) positions
    anchor_class = tf.gather_nd(anchor_class, indices)  # anchor_class is now 0/1: 0 for background, 1 for foreground
# Cross entropy loss
loss = K.sparse_categorical_crossentropy(target=anchor_class,
output=rpn_class_logits,
from_logits=True)
    loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))  # switch between two ops on a scalar predicate; returns 0 when no anchors contribute
return loss
def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox):
"""Return the RPN bounding box loss graph.
config: the model config object.
target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
    Uses 0 padding to fill in unused bbox deltas.
rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
"""
# Positive anchors contribute to the loss, but negative and
# neutral anchors (match value of 0 or -1) don't.
    rpn_match = K.squeeze(rpn_match, -1)  # drop the last dimension: [batch, anchors, 1] -> [batch, anchors]
    indices = tf.where(K.equal(rpn_match, 1))  # positions of the positive samples
    # Pick bbox deltas that contribute to the loss
    rpn_bbox = tf.gather_nd(rpn_bbox, indices)  # the bbox deltas of the positive samples
    # Trim target bounding box deltas to the same length as rpn_bbox.
    batch_counts = K.sum(K.cast(K.equal(rpn_match, 1), tf.int32), axis=1)  # number of positives per batch item, shape [batch]
# target_bbox = batch_pack_graph(target_bbox, batch_counts, config.batch_size)
"""Picks different number of values from each row in x depending on the values in counts.
"""
    target_bbox_temp = target_bbox
outputs = []
for i in range(config.batch_size):
outputs.append(target_bbox_temp[i, :batch_counts[i]])
target_bbox = tf.concat(outputs, axis=0)
loss = smooth_l1_loss(target_bbox, rpn_bbox)
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
return loss
def mrcnn_class_loss_graph(target_class_ids, pred_class_logits,active_class_ids):
"""Loss for the classifier head of Mask RCNN.
target_class_ids: [batch, num_rois]. Integer class IDs. Uses zero
padding to fill in the array.
pred_class_logits: [batch, num_rois, num_classes]
active_class_ids: [batch, num_classes]. Has a value of 1 for
classes that are in the dataset of the image, and 0
for classes that are not in the dataset.
"""
# During model building, Keras calls this function with
# target_class_ids of type float32. Unclear why. Cast it
# to int to get around it.
target_class_ids = tf.cast(target_class_ids, 'int64')
# Find predictions of classes that are not in the dataset.
pred_class_ids = tf.argmax(pred_class_logits, axis=2)
# TODO: Update this line to work with batch > 1. Right now it assumes all images in a batch have the same active_class_ids
pred_active = tf.gather(active_class_ids[0], pred_class_ids)
# Loss
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_class_ids, logits=pred_class_logits)
# Erase losses of predictions of classes that are not in the active
# classes of the image.
loss = loss * pred_active
    # Compute loss mean. Use only predictions that contribute
# to the loss to get a correct mean.
loss = tf.reduce_sum(loss) / tf.reduce_sum(pred_active)
return loss
def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox):
"""Loss for Mask R-CNN bounding box refinement.
target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
target_class_ids: [batch, num_rois]. Integer class IDs.
pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]
"""
# Reshape to merge batch and roi dimensions for simplicity.
    target_class_ids = K.reshape(target_class_ids, (-1,))  # flatten to one dimension: batch * num_rois
    target_bbox = K.reshape(target_bbox, (-1, 4))  # [batch * num_rois, 4]
    pred_bbox = K.reshape(pred_bbox, (-1, K.int_shape(pred_bbox)[2], 4))  # [batch * num_rois, num_classes, 4]
# Only positive ROIs contribute to the loss. And only
# the right class_id of each ROI. Get their indices.
    positive_roi_ix = tf.where(target_class_ids > 0)[:, 0]  # e.g. [[0], [1], [2], [3], [4]] flattened to [0 1 2 3 4]
    positive_roi_class_ids = tf.cast(tf.gather(target_class_ids, positive_roi_ix), tf.int64)  # the class ids of the positive samples
    indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1)  # pair each positive roi index with its class id
    # Gather the deltas (predicted and true) that contribute to loss
    target_bbox = tf.gather(target_bbox, positive_roi_ix)  # the target deltas of the positive samples
pred_bbox = tf.gather_nd(pred_bbox, indices)
# Smooth-L1 Loss
loss = K.switch(tf.size(target_bbox) > 0,smooth_l1_loss(y_true=target_bbox, y_pred=pred_bbox), tf.constant(0.0))
loss = K.mean(loss)
return loss
def mrcnn_mask_loss_graph(target_masks, target_class_ids, pred_masks):
"""Mask binary cross-entropy loss for the masks head.
target_masks: [batch, num_rois, height, width].
A float32 tensor of values 0 or 1. Uses zero padding to fill array.
target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
pred_masks: [batch, proposals, height, width, num_classes] float32 tensor
with values from 0 to 1.
"""
# Reshape for simplicity. Merge first two dimensions into one.
target_class_ids = K.reshape(target_class_ids, (-1,))
mask_shape = tf.shape(target_masks)
target_masks = K.reshape(target_masks, (-1, mask_shape[2], mask_shape[3]))
pred_shape = tf.shape(pred_masks)
pred_masks = K.reshape(pred_masks,
(-1, pred_shape[2], pred_shape[3], pred_shape[4]))
# Permute predicted masks to [N, num_classes, height, width]
pred_masks = tf.transpose(pred_masks, [0, 3, 1, 2])
# Only positive ROIs contribute to the loss. And only
# the class specific mask of each ROI.
positive_ix = tf.where(target_class_ids > 0)[:, 0]
positive_class_ids = tf.cast(
tf.gather(target_class_ids, positive_ix), tf.int64)
indices = tf.stack([positive_ix, positive_class_ids], axis=1)
# Gather the masks (predicted and true) that contribute to loss
y_true = tf.gather(target_masks, positive_ix)
y_pred = tf.gather_nd(pred_masks, indices)
# Compute binary cross entropy. If no positive ROIs, then return 0.
# shape: [batch, roi, num_classes]
loss = K.switch(tf.size(y_true) > 0,
K.binary_crossentropy(target=y_true, output=y_pred),
tf.constant(0.0))
loss = K.mean(loss)
return loss
############################################################
# Data Generator
############################################################
import skimage
import skimage.transform  # skimage is needed below but is not imported at the top of this file
def resize(image, output_shape, order=1, mode='constant', cval=0, clip=True,
preserve_range=False, anti_aliasing=False, anti_aliasing_sigma=None):
"""A wrapper for Scikit-Image resize().
Scikit-Image generates warnings on every call to resize() if it doesn't
receive the right parameters. The right parameters depend on the version
of skimage. This solves the problem by using different parameters per
version. And it provides a central place to control resizing defaults.
"""
if LooseVersion(skimage.__version__) >= LooseVersion("0.14"):
# New in 0.14: anti_aliasing. Default it to False for backward
# compatibility with skimage 0.13.
return skimage.transform.resize(
image, output_shape,
order=order, mode=mode, cval=cval, clip=clip,
preserve_range=preserve_range, anti_aliasing=anti_aliasing,
anti_aliasing_sigma=anti_aliasing_sigma)
else:
return skimage.transform.resize(
image, output_shape,
order=order, mode=mode, cval=cval, clip=clip,
preserve_range=preserve_range)
def resize_image(image, min_dim=None, max_dim=None, min_scale=None, mode="square"):
"""Resizes an image keeping the aspect ratio unchanged.
min_dim: if provided, resizes the image such that it's smaller dimension == min_dim
max_dim: if provided, ensures that the image longest side doesn't exceed this value.
min_scale: if provided, ensure that the image is scaled up by at least
this percent even if min_dim doesn't require it.
mode: Resizing mode.
none: No resizing. Return the image unchanged.
square: Resize and pad with zeros to get a square image of size [max_dim, max_dim].
pad64: Pads width and height with zeros to make them multiples of 64.
If min_dim or min_scale are provided, it scales the image up
before padding. max_dim is ignored in this mode.
The multiple of 64 is needed to ensure smooth scaling of feature
maps up and down the 6 levels of the FPN pyramid (2**6=64).
crop: Picks random crops from the image. First, scales the image based
on min_dim and min_scale, then picks a random crop of
size min_dim x min_dim. Can be used in training only.
max_dim is not used in this mode.
Returns:
image: the resized image
window: (y1, x1, y2, x2). If max_dim is provided, padding might
be inserted in the returned image. If so, this window is the
coordinates of the image part of the full image (excluding
the padding). The x2, y2 pixels are not included.
scale: The scale factor used to resize the image
padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
"""
# Keep track of image dtype and return results in the same dtype
image_dtype = image.dtype
# Default window (y1, x1, y2, x2) and default scale == 1.
h, w = image.shape[:2]
window = (0, 0, h, w)
scale = 1
padding = [(0, 0), (0, 0), (0, 0)]
if mode == "none":
return image, window, scale, padding
# Scale?
if min_dim:
# Scale up but not down
        scale = max(1, min_dim / min(h, w))  # h, w are the original image's height and width
        if min_scale and scale < min_scale:  # min_scale, if given, is a lower bound on the scaling factor
            scale = min_scale
# Does it exceed max dim?
if max_dim and mode == "square":
image_max = max(h, w)
        if round(image_max * scale) > max_dim:  # the longest side must not exceed max_dim after scaling; otherwise recompute scale
scale = max_dim / image_max
# Resize image using bilinear interpolation
if scale != 1:
image = resize(image, (round(h * scale), round(w * scale)), preserve_range=True)
    # The resize above changes the image dimensions, which I would rather avoid: it may
    # distort or lose defect features, so be careful with this transformation
# Need padding or cropping?
if mode == "square":
# Get new height and width
        h, w = image.shape[:2]  # the image has already been rescaled by scale at this point
top_pad = (max_dim - h) // 2
bottom_pad = max_dim - h - top_pad
left_pad = (max_dim - w) // 2
right_pad = max_dim - w - left_pad
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
        image = np.pad(image, padding, mode='constant', constant_values=0)  # zero-pad the rescaled image up to max_dim
        window = (top_pad, left_pad, h + top_pad, w + left_pad)  # the region of the padded image that holds the actual resized picture
elif mode == "pad64":
h, w = image.shape[:2]
# Both sides must be divisible by 64
assert min_dim % 64 == 0, "Minimum dimension must be a multiple of 64"
# Height
if h % 64 > 0:
max_h = h - (h % 64) + 64
top_pad = (max_h - h) // 2
bottom_pad = max_h - h - top_pad
else:
top_pad = bottom_pad = 0
# Width
if w % 64 > 0:
max_w = w - (w % 64) + 64
left_pad = (max_w - w) // 2
right_pad = max_w - w - left_pad
else:
left_pad = right_pad = 0
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode='constant', constant_values=0)
window = (top_pad, left_pad, h + top_pad, w + left_pad)
else:
raise Exception("Mode {} not supported".format(mode))
return image.astype(image_dtype), window, scale, padding
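# Worked example (my numbers) of the "square" mode with this file's config
# (IMAGE_MIN_DIM = 128, IMAGE_MAX_DIM = 256) on a hypothetical 300x400 image:
# scale starts at max(1, 128/300) = 1, but 400 * 1 > 256, so scale = 256/400 = 0.64;
# the image is resized to 192x256, then zero-padded to 256x256 with top_pad = 32.
#   resize_image(img, min_dim=128, max_dim=256, mode="square")
#   # -> window = (32, 0, 224, 256), scale = 0.64, padding = [(32, 32), (0, 0), (0, 0)]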
def resize_mask(mask, scale, padding):
    # scale is the image resize factor and padding is the zero padding up to the max dimension; the valid mask coordinates match the resized input image
"""Resizes a mask using the given scale and padding.
Typically, you get the scale and padding from resize_image() to
ensure both, the image and the mask, are resized consistently.
scale: mask scaling factor
padding: Padding to add to the mask in the form
[(top, bottom), (left, right), (0, 0)]
"""
# Suppress warning from scipy 0.13.0, the output shape of zoom() is
# calculated with round() instead of int()
# with warnings.catch_warnings():
# warnings.simplefilter("ignore")
mask = scipy.ndimage.zoom(mask, zoom=[scale, scale, 1], order=0)
# if crop is not None:
# y, x, h, w = crop
# mask = mask[y:y + h, x:x + w]
# else:
mask = np.pad(mask, padding, mode='constant', constant_values=0)
return mask
def extract_bboxes(mask):  # returns [num_instances, (y1, x1, y2, x2)]
    # In short: the bbox derived from a mask is the tightest box containing every pixel equal to 1.
"""Compute bounding boxes from masks.
mask: [height, width, num_instances]. Mask pixels are either 1 or 0.
Returns: bbox array [num_instances, (y1, x1, y2, x2)].
"""
boxes = np.zeros([mask.shape[-1], 4], dtype=np.int32)
    # the last mask dimension indexes instances: one bbox per instance in the picture
for i in range(mask.shape[-1]):
m = mask[:, :, i]
# Bounding box.
horizontal_indicies = np.where(np.any(m, axis=0))[0]
vertical_indicies = np.where(np.any(m, axis=1))[0]
if horizontal_indicies.shape[0]:
x1, x2 = horizontal_indicies[[0, -1]]
y1, y2 = vertical_indicies[[0, -1]]
# x2 and y2 should not be part of the box. Increment by 1.
x2 += 1
y2 += 1
else:
# No mask for this instance. Might happen due to
# resizing or cropping. Set bbox to zeros
x1, x2, y1, y2 = 0, 0, 0, 0
boxes[i] = np.array([y1, x1, y2, x2])
return boxes.astype(np.int32)
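# Minimal example (my addition): a single-instance mask with ones in rows 1..2,
# cols 2..3 yields the half-open box [1, 2, 3, 4] (y2/x2 sit one past the last
# mask pixel):
#   m = np.zeros([5, 5, 1], dtype=np.int32)
#   m[1:3, 2:4, 0] = 1
#   extract_bboxes(m)  # -> array([[1, 2, 3, 4]], dtype=int32)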
def box_refinement(box, gt_box):
"""Compute refinement needed to transform box to gt_box.
box and gt_box are [N, (y1, x1, y2, x2)]. (y2, x2) is
assumed to be outside the box.
"""
box = box.astype(np.float32)
gt_box = gt_box.astype(np.float32)
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
gt_height = gt_box[:, 2] - gt_box[:, 0]
gt_width = gt_box[:, 3] - gt_box[:, 1]
gt_center_y = gt_box[:, 0] + 0.5 * gt_height
gt_center_x = gt_box[:, 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = np.log(gt_height / height)
dw = np.log(gt_width / width)
return np.stack([dy, dx, dh, dw], axis=1)
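# Numeric check (my addition), mirroring box_refinement_graph above: refining a 2x2
# box at the origin toward a 4x4 gt box gives dy = (2 - 1) / 2 = 0.5 and
# dh = log(4 / 2), exactly the (dy, dx, dh, dw) encoding that apply_box_deltas_graph
# inverts:
#   b = np.array([[0., 0., 2., 2.]]); g = np.array([[0., 0., 4., 4.]])
#   box_refinement(b, g)  # -> [[0.5, 0.5, 0.6931, 0.6931]]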
def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config):
"""Generate targets for training Stage 2 classifier and mask heads.
This is not used in normal training. It's useful for debugging or to train
the Mask RCNN heads without using the RPN head.
Inputs:
rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
gt_class_ids: [instance count] Integer class IDs
gt_boxes: [instance count, (y1, x1, y2, x2)]
gt_masks: [height, width, instance count] Ground truth masks. Can be full
size or mini-masks.
Returns:
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific
bbox refinements.
masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES). Class specific masks cropped
to bbox boundaries and resized to neural network output size.
"""
assert rpn_rois.shape[0] > 0
assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
gt_class_ids.dtype)
assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
gt_boxes.dtype)
assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
gt_masks.dtype)
# It's common to add GT Boxes to ROIs but we don't do that here because
# according to XinLei Chen's paper, it doesn't help.
# Trim empty padding in gt_boxes and gt_masks parts
instance_ids = np.where(gt_class_ids > 0)[0]
assert instance_ids.shape[0] > 0, "Image must contain instances."
gt_class_ids = gt_class_ids[instance_ids]
gt_boxes = gt_boxes[instance_ids]
gt_masks = gt_masks[:, :, instance_ids]
# Compute areas of ROIs and ground truth boxes.
rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * \
(rpn_rois[:, 3] - rpn_rois[:, 1])
gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * \
(gt_boxes[:, 3] - gt_boxes[:, 1])
# Compute overlaps [rpn_rois, gt_boxes]
overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
for i in range(overlaps.shape[1]):
gt = gt_boxes[i]
overlaps[:, i] = compute_iou(
gt, rpn_rois, gt_box_area[i], rpn_roi_area)
# Assign ROIs to GT boxes
rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
rpn_roi_iou_max = overlaps[np.arange(
overlaps.shape[0]), rpn_roi_iou_argmax]
# GT box assigned to each ROI
rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]
# Positive ROIs are those with >= 0.5 IoU with a GT box.
fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]
# Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
# TODO: To hard example mine or not to hard example mine, that's the question
# bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
# Subsample ROIs. Aim for 33% foreground.
# FG
fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)
if fg_ids.shape[0] > fg_roi_count:
keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
else:
keep_fg_ids = fg_ids
# BG
remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0]
if bg_ids.shape[0] > remaining:
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
else:
keep_bg_ids = bg_ids
# Combine indices of ROIs to keep
keep = np.concatenate([keep_fg_ids, keep_bg_ids])
# Need more?
remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0]
if remaining > 0:
# Looks like we don't have enough samples to maintain the desired
# balance. Reduce requirements and fill in the rest. This is
# likely different from the Mask RCNN paper.
# There is a small chance we have neither fg nor bg samples.
if keep.shape[0] == 0:
# Pick bg regions with easier IoU threshold
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
assert bg_ids.shape[0] >= remaining
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
assert keep_bg_ids.shape[0] == remaining
keep = np.concatenate([keep, keep_bg_ids])
else:
# Fill the rest with repeated bg rois.
keep_extra_ids = np.random.choice(
keep_bg_ids, remaining, replace=True)
keep = np.concatenate([keep, keep_extra_ids])
assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE, \
"keep doesn't match ROI batch size {}, {}".format(
keep.shape[0], config.TRAIN_ROIS_PER_IMAGE)
# Reset the gt boxes assigned to BG ROIs.
rpn_roi_gt_boxes[keep_bg_ids, :] = 0
rpn_roi_gt_class_ids[keep_bg_ids] = 0
# For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
rois = rpn_rois[keep]
roi_gt_boxes = rpn_roi_gt_boxes[keep]
roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
roi_gt_assignment = rpn_roi_iou_argmax[keep]
# Class-aware bbox deltas. [y, x, log(h), log(w)]
bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE,
config.NUM_CLASSES, 4), dtype=np.float32)
pos_ids = np.where(roi_gt_class_ids > 0)[0]
bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = box_refinement(
rois[pos_ids], roi_gt_boxes[pos_ids, :4])
# Normalize bbox refinements
bboxes /= config.BBOX_STD_DEV
# Generate class-specific target masks
masks = np.zeros((config.TRAIN_ROIS_PER_IMAGE, config.MASK_SHAPE[0], config.MASK_SHAPE[1], config.NUM_CLASSES),
dtype=np.float32)
for i in pos_ids:
class_id = roi_gt_class_ids[i]
assert class_id > 0, "class id must be greater than 0"
gt_id = roi_gt_assignment[i]
class_mask = gt_masks[:, :, gt_id]
# Pick part of the mask and resize it
y1, x1, y2, x2 = rois[i].astype(np.int32)
m = class_mask[y1:y2, x1:x2]
mask = resize(m, config.MASK_SHAPE)
masks[i, :, :, class_id] = mask
return rois, roi_gt_class_ids, bboxes, masks
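# --- Arithmetic note added for illustration (not part of the original file) ---
# With the values used by the config later in this document
# (TRAIN_ROIS_PER_IMAGE = 100, ROI_POSITIVE_RATIO = 0.33), the subsampling
# above aims for int(100 * 0.33) = 33 foreground ROIs and fills the remaining
# 67 slots with background ROIs, i.e. roughly a 1:2 positive:negative split.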
def compute_overlaps(boxes1, boxes2):
# Each box in boxes2 is compared against all boxes in boxes1 (the IoU math is inlined below);
# the results are stored in a [num_boxes1, num_boxes2] matrix.
"""Computes IoU overlaps between two sets of boxes.
boxes1, boxes2: [N, (y1, x1, y2, x2)].
For better performance, pass the largest set first and the smaller second.
"""
# Areas of anchors and GT boxes
area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
# Compute overlaps to generate matrix [boxes1 count, boxes2 count]
# Each cell contains the IoU value.
overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0])) # building variables for overlaps to save
for i in range(overlaps.shape[1]):
box2 = boxes2[i]
y1 = np.maximum(box2[0], boxes1[:, 0])
y2 = np.minimum(box2[2], boxes1[:, 2])
x1 = np.maximum(box2[1], boxes1[:, 1])
x2 = np.minimum(box2[3], boxes1[:, 3])
intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
union = area2[i] + area1[:] - intersection[:]
overlaps[:, i] = intersection / union
return overlaps
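# --- Usage sketch added for illustration (not part of the original file) ------
# compute_overlaps with two boxes against one reference box; the second box
# covers a quarter of the reference, so IoU = 25 / (100 + 25 - 25) = 0.25.
# >>> b1 = np.array([[0, 0, 10, 10], [0, 0, 5, 5]])
# >>> b2 = np.array([[0, 0, 10, 10]])
# >>> compute_overlaps(b1, b2)
# array([[1.  ],
#        [0.25]])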
def build_rpn_targets(anchors, gt_class_ids, gt_boxes, config):
"""Given the anchors and GT boxes, compute overlaps and identify positive
anchors and deltas to refine them to match their corresponding GT boxes.
anchors: [num_anchors, (y1, x1, y2, x2)]
gt_class_ids: [num_gt_boxes] Integer class IDs.
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
Returns:
rpn_match: [N] (int32) matches between anchors and GT boxes.
1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
"""
# RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
# RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))
# Handle COCO crowds
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
crowd_ix = np.where(gt_class_ids < 0)[0]
if crowd_ix.shape[0] > 0:
# Filter out crowds from ground truth class IDs and boxes
non_crowd_ix = np.where(gt_class_ids > 0)[0]
crowd_boxes = gt_boxes[crowd_ix]
gt_class_ids = gt_class_ids[non_crowd_ix]
gt_boxes = gt_boxes[non_crowd_ix]
# Compute overlaps with crowd boxes [anchors, crowds]
crowd_overlaps = compute_overlaps(anchors, crowd_boxes)
crowd_iou_max = np.amax(crowd_overlaps, axis=1)
no_crowd_bool = (crowd_iou_max < 0.001)
else:
# All anchors don't intersect a crowd
no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)
# Compute overlaps [num_anchors, num_gt_boxes]
overlaps = compute_overlaps(anchors, gt_boxes)
# Match anchors to GT Boxes
# If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
# If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
# Neutral anchors are those that don't match the conditions above,
# and they don't influence the loss function.
# However, don't keep any GT box unmatched (rare, but happens). Instead,
# match it to the closest anchor (even if its max IoU is < 0.3).
#
# 1. Set negative anchors first. They get overwritten below if a GT box is
# matched to them. Skip boxes in crowd areas.
anchor_iou_argmax = np.argmax(overlaps, axis=1)
anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
# 2. Set an anchor for each GT box (regardless of IoU value).
# If multiple anchors have the same IoU match all of them
gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:,0]
rpn_match[gt_iou_argmax] = 1
# 3. Set anchors with high overlap as positive.
rpn_match[anchor_iou_max >= 0.7] = 1
# Subsample to balance positive and negative anchors
# Don't let positives be more than half the anchors
ids = np.where(rpn_match == 1)[0]
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
if extra > 0:
# Reset the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# Same for negative proposals
ids = np.where(rpn_match == -1)[0]
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
np.sum(rpn_match == 1))
if extra > 0:
# Reset the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# For positive anchors, compute shift and scale needed to transform them
# to match the corresponding GT boxes.
ids = np.where(rpn_match == 1)[0]
ix = 0 # index into rpn_bbox
# TODO: use box_refinement() rather than duplicating the code here
for i, a in zip(ids, anchors[ids]):
# Closest gt box (it might have IoU < 0.7)
gt = gt_boxes[anchor_iou_argmax[i]]
# Convert coordinates to center plus width/height.
# GT Box
gt_h = gt[2] - gt[0]
gt_w = gt[3] - gt[1]
gt_center_y = gt[0] + 0.5 * gt_h
gt_center_x = gt[1] + 0.5 * gt_w
# Anchor
a_h = a[2] - a[0]
a_w = a[3] - a[1]
a_center_y = a[0] + 0.5 * a_h
a_center_x = a[1] + 0.5 * a_w
# Compute the bbox refinement that the RPN should predict.
rpn_bbox[ix] = [
(gt_center_y - a_center_y) / a_h,
(gt_center_x - a_center_x) / a_w,
np.log(gt_h / a_h),
np.log(gt_w / a_w),
]
# Normalize
rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
ix += 1
return rpn_match, rpn_bbox
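# --- Usage sketch added for illustration (not part of the original file) ------
# build_rpn_targets with two anchors and one GT box; _Cfg is a minimal
# stand-in holding only the two attributes the function reads.
# >>> class _Cfg: RPN_TRAIN_ANCHORS_PER_IMAGE = 256; RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# >>> anchors = np.array([[0, 0, 10, 10], [20, 20, 30, 30]])
# >>> match, bbox = build_rpn_targets(anchors, np.array([1]), np.array([[0, 0, 10, 10]]), _Cfg)
# >>> match                   # anchor 0 matches perfectly, anchor 1 has IoU 0
# array([ 1, -1])
# >>> bbox[0]                 # a perfect match gives all-zero deltas
# array([0., 0., 0., 0.])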
def generate_random_rois(image_shape, count, gt_boxes):
"""Generates ROI proposals similar to what a region proposal network
would generate.
image_shape: [Height, Width, Depth]
count: Number of ROIs to generate
gt_class_ids: [N] Integer ground truth class IDs
gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels.
"""
# placeholder
rois = np.zeros((count, 4), dtype=np.int32)
# Generate random ROIs around GT boxes (90% of count)
rois_per_box = int(0.9 * count / gt_boxes.shape[0])
for i in range(gt_boxes.shape[0]):
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
h = gt_y2 - gt_y1
w = gt_x2 - gt_x1
# random boundaries
r_y1 = max(gt_y1 - h, 0)
r_y2 = min(gt_y2 + h, image_shape[0])
r_x1 = max(gt_x1 - w, 0)
r_x2 = min(gt_x2 + w, image_shape[1])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
threshold][:rois_per_box]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
threshold][:rois_per_box]
if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
box_rois = np.hstack([y1, x1, y2, x2])
rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois
# Generate random ROIs anywhere in the image (10% of count)
remaining_count = count - (rois_per_box * gt_boxes.shape[0])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
threshold][:remaining_count]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
threshold][:remaining_count]
if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
global_rois = np.hstack([y1, x1, y2, x2])
rois[-remaining_count:] = global_rois
return rois
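# --- Arithmetic note added for illustration (not part of the original file) ---
# For count = 200 and a single GT box, rois_per_box = int(0.9 * 200 / 1) = 180
# ROIs are sampled around the GT box and the remaining 20 anywhere in the
# image, matching the 90% / 10% split described in the comments above.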
def generate_pyramid_anchors(scales, ratios, feature_shapes, feature_strides,anchor_stride):
"""Generate anchors at different levels of a feature pyramid. Each scale
is associated with a level of the pyramid, but each ratio is used in
all levels of the pyramid.
Returns:
anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
with the same order of the given scales. So, anchors of scale[0] come
first, then anchors of scale[1], and so on.
"""
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
anchors = []
for i in range(len(scales)):
# anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i], feature_strides[i], anchor_stride))
"""
scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
shape: [height, width] spatial shape of the feature map over which to generate anchors.
feature_stride: Stride of the feature map relative to the image in pixels.
anchor_stride: Stride of anchors on the feature map. For example, if the value is 2 then generate anchors for every other feature map pixel.
"""
# Get all combinations of scales and ratios
scale, ratios = np.meshgrid(np.array(scales[i]), np.array(ratios))
scale = scale.flatten()
ratios = ratios.flatten()
shape=feature_shapes[i]
feature_stride=feature_strides[i]
# Enumerate heights and widths from scales and ratios
# Compute the box heights and widths.
heights = scale / np.sqrt(ratios)
widths = scale * np.sqrt(ratios)
# Enumerate shifts in feature space
# Compute the box center coordinates.
shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
# anchor_stride picks every anchor_stride-th cell of the feature map as an anchor center;
# multiplying by feature_stride maps those centers back to input-image coordinates.
shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
# Enumerate combinations of shifts, widths, and heights
box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
# The code above produces the bbox centers and their heights/widths.
# Convert to corner coordinates (y1, x1, y2, x2)
boxes = np.concatenate([box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1)
# Convert (center, height, width) to corner coordinates: top-left and bottom-right.
anchors.append(boxes)
return np.concatenate(anchors, axis=0)
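# --- Anchor-count check added for illustration (not part of the original) -----
# For a 256x256 input with BACKBONE_STRIDES = [4, 8, 16, 32, 64], the feature
# maps measure 64, 32, 16, 8 and 4 cells per side; with 3 ratios and
# anchor_stride = 1 the function returns
# 3 * (64**2 + 32**2 + 16**2 + 8**2 + 4**2) = 3 * 5456 = 16368 anchors.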
############################################################
# MaskRCNN Class
############################################################
class MaskRCNN():
"""Encapsulates the Mask RCNN model functionality.
The actual Keras model is in the keras_model property.
"""
def __init__(self, mode, config):
"""
mode: Either "training" or "inference"
config: A Sub-class of the Config class
"""
assert mode in ['training', 'inference']
self.mode = mode
self.config = config
self.keras_model = self.build(mode=mode, config=config)
def build(self, mode, config):
"""Build Mask R-CNN architecture.
input_shape: The shape of the input image.
mode: Either "training" or "inference". The inputs and
outputs of the model differ accordingly.
"""
assert mode in ['training', 'inference']
# Image size must be dividable by 2 multiple times
h, w = config.IMAGE_SHAPE[:2] # a preset value such as 256, 512, or 1024
if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6): # this guarantees downsampling introduces no fractional coordinates
raise Exception("Image size must be dividable by 2 at least 6 times "
"to avoid fractions when downscaling and upscaling."
"For example, use 256, 320, 384, 448, 512, ... etc. ")
# Inputs
input_image = KL.Input(shape=[None, None, config.IMAGE_SHAPE[2]], name="input_image")
input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE], name="input_image_meta")
# The actual input tensors carry an extra leading batch dimension.
if mode == "training":
# RPN GT
input_rpn_match = KL.Input(shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
input_rpn_bbox = KL.Input(shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)
# RPN_TRAIN_ANCHORS_PER_IMAGE = 256
# Detection GT (class IDs, bounding boxes, and masks)
# 1. GT Class IDs (zero padded)
input_gt_class_ids = KL.Input(shape=[None], name="input_gt_class_ids", dtype=tf.int32)
# 2. GT Boxes in pixels (zero padded)
# [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
input_gt_boxes = KL.Input(shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
# Normalize coordinates
gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(x, K.shape(input_image)[1:3]))(input_gt_boxes)
# 3. GT Masks (zero padded)
# [batch, height, width, MAX_GT_INSTANCES] MAX_GT_INSTANCES=100
# if config.USE_MINI_MASK: # USE_MINI_MASK=true
# input_gt_masks = KL.Input(shape=[config.MINI_MASK_SHAPE[0],config.MINI_MASK_SHAPE[1], None], name="input_gt_masks", dtype=bool) # MINI_MASK_SHAPE = (56, 56)
# else:
input_gt_masks = KL.Input(shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None], name= "input_gt_masks", dtype=bool) # 1024 or 512
elif mode == "inference":
# Anchors in normalized coordinates
input_anchors = KL.Input(shape=[None, 4], name="input_anchors")
# Build the shared convolutional layers.
# Bottom-up Layers
# Returns a list of the last layers of each stage, 5 in total.
# Here stage5=True, so C5 is created as well and feeds the FPN below.
# if callable(config.BACKBONE): # check whether BACKBONE is callable; here BACKBONE = "resnet101"
# _, C2, C3, C4, C5 = config.BACKBONE(input_image, stage5=True, train_bn=config.TRAIN_BN)
# # the line above would invoke a user-supplied backbone graph instead
# # callable() tests whether an object can be invoked with () parentheses
# else:
_, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE, stage5=True, train_bn=config.TRAIN_BN)
# Top-down Layers
# TODO: add assert to verify feature map sizes match what's in config
P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5) # config.TOP_DOWN_PYRAMID_SIZE=256
P4 = KL.Add(name="fpn_p4add")([KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)]) # channel of the end bring into correspondence with other channel
P3 = KL.Add(name="fpn_p3add")([KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4), KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)])
P2 = KL.Add(name="fpn_p2add")([KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3), KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)])
# Attach 3x3 conv to all P layers to get the final feature maps.
P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)
P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)
P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)
P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5) # TOP_DOWN_PYRAMID_SIZE = 256
# P6 is used for the 5th anchor scale in RPN. Generated by
# subsampling from P5 with stride of 2.
P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)
# Note that P6 is used in RPN, but not in the classifier heads.
rpn_feature_maps = [P2, P3, P4, P5, P6]
mrcnn_feature_maps = [P2, P3, P4, P5]
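# For a 256x256 input the spatial sizes are P2: 64x64, P3: 32x32, P4: 16x16,
# P5: 8x8 and P6: 4x4, matching BACKBONE_STRIDES = [4, 8, 16, 32, 64].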
# Anchors
if mode == "training":
anchors = self.get_anchors(config.IMAGE_SHAPE)
# Duplicate across the batch dimension because Keras requires it
# TODO: can this be optimized to avoid duplicating the anchors?
anchors = np.broadcast_to(anchors, (config.batch_size,) + anchors.shape) # replicate the per-image anchors across the batch
# A hack to get around Keras's bad support for constants
anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image) # wrap the TF variable so Keras accepts it
else:
anchors = input_anchors
# RPN Model: build_rpn_model returns a Keras Model object, which is itself callable
rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, len(config.RPN_ANCHOR_RATIOS), config.TOP_DOWN_PYRAMID_SIZE)
# Each level yields H * W * anchors_per_location anchors, where H = image_height / stride (stride = 4 for P2)
# config.TOP_DOWN_PYRAMID_SIZE=256 RPN_ANCHOR_RATIOS = [0.5, 1, 2] RPN_ANCHOR_STRIDE=1
# Loop through pyramid layers
layer_outputs = [] # list of lists; stores the RPN outputs of every pyramid level
for p in rpn_feature_maps: # rpn_feature_maps = [P2, P3, P4, P5, P6]
layer_outputs.append(rpn([p]))
# Concatenate layer outputs
# Convert from list of lists of level outputs to list of lists
# of outputs across levels.
# e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"] # "rpn_class_logits"為分類 "rpn_class"為置信度 "rpn_bbox"為box
outputs = list(zip(*layer_outputs)) # [[logits2,……6], [class2,……6], [bbox2,……6]]
outputs = [KL.Concatenate(axis=1, name=n)(list(o)) for o, n in zip(outputs, output_names)] #
# [batch, num_anchors, 2/4]
# num_anchors is the total anchor count across all feature levels
rpn_class_logits, rpn_class, rpn_bbox = outputs # class logits, foreground confidence, box deltas
# Generate proposals
# Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
# and zero padded.
# POST_NMS_ROIS_INFERENCE = 1000
# POST_NMS_ROIS_TRAINING = 2000
proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\
else config.POST_NMS_ROIS_INFERENCE
'''
The previous step produced information for every anchor; here the goal is to select
a fixed number of anchors that are most likely to contain objects as proposal
regions, i.e. the boxes with the highest foreground scores from the binary
classification above. Because the anchor-generation scheme yields a huge number of
heavily overlapping boxes, we also de-duplicate them with non-maximum suppression
on top of the score ranking. That is the purpose of proposal generation.
proposal_count is an integer giving the number of proposals to output; when there
are too few, the output is padded with [0, 0, 0, 0] entries.
Here scores = inputs[0][:, :, 1], i.e. only the foreground score of each candidate box is used.
'''
rpn_rois = ProposalLayer(
proposal_count=proposal_count, # proposal_count=2000 for train proposal_count=1000 for inference
nms_threshold=config.RPN_NMS_THRESHOLD, # 0.7; boxes overlapping less than this with a higher-scoring box are kept
name="ROI",
config=config)([rpn_class, rpn_bbox, anchors])
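# --- Sketch added for illustration (not part of the original file) ------------
# In NumPy terms the layer is roughly the following, where scores [N] are the
# foreground probabilities, boxes [N, 4] the delta-refined anchors, and nms
# stands in for tf.image.non_max_suppression:
#   order = np.argsort(scores)[::-1][:config.PRE_NMS_LIMIT]  # top-k by score
#   keep = nms(boxes[order], scores[order], config.RPN_NMS_THRESHOLD)
#   keep = keep[:proposal_count]                              # trim to count
#   pad = proposal_count - len(keep)
#   proposals = np.pad(boxes[order][keep], [(0, pad), (0, 0)], mode='constant')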
if mode == "training":
# Class ID mask to mark class IDs supported by the dataset the image
# came from.
active_class_ids = KL.Lambda(lambda x: parse_image_meta_graph(x)["active_class_ids"])(input_image_meta)
if not config.USE_RPN_ROIS:
# Ignore predicted ROIs and use ROIs provided as an input.
input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],name="input_roi", dtype=np.int32)
# Normalize coordinates
target_rois = KL.Lambda(lambda x: norm_boxes_graph(x, K.shape(input_image)[1:3]))(input_rois)
else:
target_rois = rpn_rois # use the RPN proposals
# Generate detection targets
# Subsamples proposals and generates target outputs for training
# Note that proposal class IDs, gt_boxes, and gt_masks are zero
# padded. Equally, returned rois and targets are zero padded.
rois, target_class_ids, target_bbox, target_mask = DetectionTargetLayer(config, name="proposal_targets")([target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])
# rois=[batch_size,none,4], target_class_ids=[batch_size,none], target_bbox=deltas=[batch_size,none,4], target_mask=[batch_size,height,width]
# rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
# target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE].Integer class IDs.
# target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw)]
# target_mask:[batch, TRAIN_ROIS_PER_IMAGE, height, width]
# Masks cropped to bbox boundaries and resized to neural network output size.
# Network Heads
# TODO: verify that this handles zero padded ROIs
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
config.POOL_SIZE, config.NUM_CLASSES, # config.POOL_SIZE = 7
train_bn=config.TRAIN_BN,
fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE) # FPN_CLASSIF_FC_LAYERS_SIZE = 1024
mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps,
input_image_meta,
config.MASK_POOL_SIZE,
config.NUM_CLASSES,
train_bn=config.TRAIN_BN)
# TODO: clean up (use tf.identity if necessary)
output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)
# Losses
rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")([input_rpn_match, rpn_class_logits])
rpn_bbox_loss = KL.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")([input_rpn_bbox, input_rpn_match, rpn_bbox])
class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")([target_class_ids, mrcnn_class_logits, active_class_ids])
bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")([target_bbox, target_class_ids, mrcnn_bbox])
mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")([target_mask, target_class_ids, mrcnn_mask])
# Model
inputs = [input_image, input_image_meta,
input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
if not config.USE_RPN_ROIS:
inputs.append(input_rois)
outputs = [rpn_class_logits, rpn_class, rpn_bbox,
mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
rpn_rois, output_rois,
rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
model = KM.Model(inputs, outputs, name='mask_rcnn')
else:
# Network Heads
# Proposal classifier and BBox regressor heads
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
config.POOL_SIZE, config.NUM_CLASSES,
train_bn=config.TRAIN_BN,
fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)
# Detections
# output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
# normalized coordinates
detections = DetectionLayer(config, name="mrcnn_detection")([rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])
# Create masks for detections
detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps, input_image_meta, config.MASK_POOL_SIZE, config.NUM_CLASSES, train_bn=config.TRAIN_BN)
model = KM.Model([input_image, input_image_meta, input_anchors],[detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox],name='mask_rcnn')
return model
def load_weights(self, filepath, by_name=False, exclude=None):
"""Modified version of the corresponding Keras function with
the addition of multi-GPU support and the ability to exclude
some layers from loading.
exclude: list of layer names to exclude
"""
import h5py
# Conditional import to support versions of Keras before 2.2
# TODO: remove in about 6 months (end of 2018)
try:
from keras.engine import saving
except ImportError:
# Keras before 2.2 used the 'topology' namespace.
from keras.engine import topology as saving
if exclude:
by_name = True
if h5py is None:
raise ImportError('`load_weights` requires h5py.')
f = h5py.File(filepath, mode='r')
if 'layer_names' not in f.attrs and 'model_weights' in f:
f = f['model_weights']
# In multi-GPU training, we wrap the model. Get layers
# of the inner model because they have the weights.
keras_model = self.keras_model
layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\
else keras_model.layers
# Exclude some layers
if exclude:
layers = filter(lambda l: l.name not in exclude, layers)
if by_name:
saving.load_weights_from_hdf5_group_by_name(f, layers)
else:
saving.load_weights_from_hdf5_group(f, layers)
if hasattr(f, 'close'):
f.close()
def compile(self, learning_rate, momentum):
"""Gets the model ready for training. Adds losses, regularization, and
metrics. Then calls the Keras compile() function.
"""
# Optimizer object
optimizer = keras.optimizers.SGD(
lr=learning_rate, momentum=momentum,
clipnorm=self.config.GRADIENT_CLIP_NORM)
# Add Losses
# First, clear previously set losses to avoid duplication
self.keras_model._losses = []
self.keras_model._per_input_losses = {}
loss_names = [
"rpn_class_loss", "rpn_bbox_loss",
"mrcnn_class_loss", "mrcnn_bbox_loss", "mrcnn_mask_loss"]
for name in loss_names:
layer = self.keras_model.get_layer(name)
if layer.output in self.keras_model.losses:
continue
loss = (
tf.reduce_mean(layer.output, keepdims=True)
* self.config.LOSS_WEIGHTS.get(name, 1.))
self.keras_model.add_loss(loss)
# Add L2 Regularization
# Skip gamma and beta weights of batch normalization layers.
reg_losses = [
keras.regularizers.l2(self.config.WEIGHT_DECAY)(w) / tf.cast(tf.size(w), tf.float32)
for w in self.keras_model.trainable_weights
if 'gamma' not in w.name and 'beta' not in w.name]
self.keras_model.add_loss(tf.add_n(reg_losses))
# Compile
self.keras_model.compile(optimizer=optimizer)
# Add metrics for losses
for name in loss_names:
if name in self.keras_model.metrics_names:
continue
layer = self.keras_model.get_layer(name)
self.keras_model.metrics_names.append(name)
loss = (
tf.reduce_mean(layer.output, keepdims=True)
* self.config.LOSS_WEIGHTS.get(name, 1.))
self.keras_model.metrics_tensors.append(loss)
def set_trainable(self, layer_regex, keras_model=None, indent=0):
"""Sets model layers as trainable if their names match
the given regular expression.
hasattr() checks whether an object has the given attribute.
"""
keras_model = keras_model or self.keras_model
layers = keras_model.layers
for layer in layers:
# Is the layer a model?
if layer.__class__.__name__ == 'Model':
print("In model: ", layer.name)
self.set_trainable(
layer_regex, keras_model=layer, indent=indent + 4)
continue
if not layer.weights:
continue
# Is it trainable?
trainable = bool(re.fullmatch(layer_regex, layer.name))
# trainable is a bool deciding whether the layer is frozen or not
# Update layer. If layer is a container, update inner layer.
if layer.__class__.__name__ == 'TimeDistributed':
layer.layer.trainable = trainable
else:
layer.trainable = trainable
def train(self, train_dataset, learning_rate, epochs, layers, custom_callbacks=None):
"""Train the model.
train_dataset: Training data generator, passed to fit_generator.
learning_rate: The learning rate to train with
epochs: Number of training epochs. Note that previous training epochs
are considered to be done already, so this actually determines
the epochs to train in total rather than in this particular
call.
layers: Allows selecting which layers to train. It can be:
- A regular expression to match layer names to train
- One of these predefined values:
heads: The RPN, classifier and mask heads of the network
all: All the layers
3+: Train Resnet stage 3 and up
4+: Train Resnet stage 4 and up
5+: Train Resnet stage 5 and up
custom_callbacks: Optional. Add custom callbacks to be called
with the keras fit_generator method. Must be list of type keras.callbacks.
"""
assert self.mode == "training", "Create model in training mode."
# Pre-defined layer regular expressions
layer_regex = {
# all layers but the backbone
"heads": r"(mrcnn1\_.*)|(rpn\_.*)|(fpn\_.*)",
# From a specific Resnet stage and up
"3+": r"(res3.*)|(bn3.*)|(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn1\_.*)|(rpn\_.*)|(fpn\_.*)",
"4+": r"(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn1\_.*)|(rpn\_.*)|(fpn\_.*)",
"5+": r"(res5.*)|(bn5.*)|(mrcnn1\_.*)|(rpn\_.*)|(fpn\_.*)",
# All layers
"all": ".*",
}
if layers in layer_regex.keys():
layers = layer_regex[layers]
# Callbacks
callbacks = [
# keras.callbacks.TensorBoard(log_dir=self.log_dir, histogram_freq=0, write_graph=True, write_images=False),
keras.callbacks.ModelCheckpoint("C:\\Users\\51102\\Desktop\\maskrcnn(tangjun)\\log\\{epoch:02d}.h5", verbose=0, save_weights_only=True),
]
self.set_trainable(layers)
self.compile(learning_rate, self.config.LEARNING_MOMENTUM)
self.keras_model.fit_generator(
train_dataset,
initial_epoch=0,#self.epoch,
epochs=epochs,
steps_per_epoch=self.config.STEPS_PER_EPOCH,
callbacks=callbacks,
# validation_data=val_generator,
# validation_steps=self.config.VALIDATION_STEPS,
# max_queue_size=100,
# workers=workers,
# use_multiprocessing=True,
)
def mold_inputs(self, images):
"""Takes a list of images and modifies them to the format expected
as an input to the neural network.
images: List of image matrices [height,width,depth]. Images can have
different sizes.
Returns 3 Numpy matrices:
molded_images: [N, h, w, 3]. Images resized and normalized.
image_metas: [N, length of meta data]. Details about each image.
windows: [N, (y1, x1, y2, x2)]. The portion of the image that has the
original image (padding excluded).
"""
molded_images = []
image_metas = []
windows = []
for image in images:
# Resize image
# TODO: move resizing to mold_image()
molded_image, window, scale, padding = resize_image(
image,
min_dim=self.config.IMAGE_MIN_DIM,
min_scale=self.config.IMAGE_MIN_SCALE,
max_dim=self.config.IMAGE_MAX_DIM,
mode=self.config.IMAGE_RESIZE_MODE)
molded_image = molded_image.astype(np.float32) - self.config.MEAN_PIXEL # subtract the mean pixel
# Build image_meta as a NumPy array
image_meta = np.array(
[0] + # size=1
list(image.shape) + # size=3
list(molded_image.shape) + # size=3
list(window) + # size=4 (y1, x1, y2, x2) in image coordinates
[scale] + # size=1
list(np.zeros([self.config.NUM_CLASSES], dtype=np.int32)) # size=num_classes
)
# Append
molded_images.append(molded_image)
windows.append(window)
image_metas.append(image_meta)
# Pack into arrays
molded_images = np.stack(molded_images)
image_metas = np.stack(image_metas)
windows = np.stack(windows)
return molded_images, image_metas, windows
def unmold_detections(self, detections, mrcnn_mask, original_image_shape, image_shape, window):
"""Reformats the detections of one image from the format of the neural
network output to a format suitable for use in the rest of the
application.
detections: [N, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
mrcnn_mask: [N, height, width, num_classes]
original_image_shape: [H, W, C] Original image shape before resizing
image_shape: [H, W, C] Shape of the image after resizing and padding
window: [y1, x1, y2, x2] Pixel coordinates of box in the image where the real
image is excluding the padding.
Returns:
boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
class_ids: [N] Integer class IDs for each bounding box
scores: [N] Float probability scores of the class_id
masks: [height, width, num_instances] Instance masks
"""
# How many detections do we have?
# Detections array is padded with zeros. Find the first class_id == 0.
zero_ix = np.where(detections[:, 4] == 0)[0] # locate padded (all-zero) detections
N = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0] # number of valid detections
# Valid detections come first and padding entries have class_id == 0, so the index
# of the first zero in detections[:, 4] equals the number of valid boxes.
# Extract boxes, class_ids, scores, and class-specific masks
boxes = detections[:N, :4] # [N, (y1, x1, y2, x2)] valid boxes
class_ids = detections[:N, 4].astype(np.int32) # [N] class IDs
scores = detections[:N, 5] # [N] confidence scores
masks = mrcnn_mask[np.arange(N), :, :, class_ids] # [N, height, width] class-specific masks
# Translate normalized coordinates in the resized image to pixel
# coordinates in the original image before resizing
# Below, the window is normalized with respect to the resized canvas.
h, w = image_shape[:2] # image_shape is the padded canvas after resizing; it is also the network input size
scale_norm = np.array([h - 1, w - 1, h - 1, w - 1])
shift_norm = np.array([0, 0, 1, 1])
# window is the region of the canvas occupied by the actual image
window = np.divide((window - shift_norm), scale_norm).astype(np.float32)
# window = norm_boxes(window, image_shape[:2]) # normalize the window relative to the input canvas
wy1, wx1, wy2, wx2 = window
shift = np.array([wy1, wx1, wy1, wx1])
wh = wy2 - wy1 # window height
ww = wx2 - wx1 # window width
scale = np.array([wh, ww, wh, ww])
# Convert boxes to normalized coordinates on the window
boxes = np.divide(boxes - shift, scale) # normalize the boxes to window coordinates
# Convert boxes to pixel coordinates on the original image
# boxes = denorm_boxes(boxes, original_image_shape[:2]) # denormalize the boxes to the original image
h, w = original_image_shape[:2]
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
boxes = np.around(np.multiply(boxes, scale) + shift).astype(np.int32)
# Filter out detections with zero area. Happens in early training when
# network weights are still random
exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
# The line above finds boxes whose area (h * w) is non-positive and records their indices.
if exclude_ix.shape[0] > 0: # delete them if any exist
boxes = np.delete(boxes, exclude_ix, axis=0)
class_ids = np.delete(class_ids, exclude_ix, axis=0)
scores = np.delete(scores, exclude_ix, axis=0)
masks = np.delete(masks, exclude_ix, axis=0)
N = class_ids.shape[0]
# Resize masks to original image size and set boundary threshold.
full_masks = []
for i in range(N): # process one box at a time
# Convert neural network mask to full size mask
# The code below resizes the predicted mask and pastes it into the matching region of the original image.
"""Converts a mask generated by the neural network to a format similar
to its original shape.
mask: [height, width] of type float. A small, typically 28x28 mask.
bbox: [y1, x1, y2, x2]. The box to fit the mask in.
Returns a binary mask with the same size as the original image.
"""
threshold = 0.5 # threshold deciding which pixels count as mask
y1, x1, y2, x2 = boxes[i]
mask_temp = resize(masks[i], (y2 - y1, x2 - x1)) # resize the predicted mask to the box size
mask_temp = np.where(mask_temp >= threshold, 1, 0).astype(np.bool)
# Put the mask in the right location.
full_mask = np.zeros(original_image_shape[:2], dtype=np.bool)
full_mask[y1:y2, x1:x2] = mask_temp # full_mask is 2-D
full_masks.append(full_mask) # collect the per-instance masks
# np.stack adds a new axis; stacking the 2-D masks along axis -1 yields [h, w, n]
full_masks = np.stack(full_masks, axis=-1)\
if full_masks else np.empty(original_image_shape[:2] + (0,))
# boxes: [n, (y1, x1, y2, x2)]
# class_ids: [n]
# scores: [n]
# full_masks: [h, w, n]
return boxes, class_ids, scores, full_masks
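# --- Worked example added for illustration (not part of the original file) ----
# Suppose a 100x200 original image is molded into a 256x256 canvas: the resize
# scale is 1.28, the image becomes 128x256, and with symmetric padding
# window = (64, 0, 192, 256). A detection spanning the whole window,
# roughly [0.251, 0.0, 0.749, 1.0] in normalized canvas coordinates, is
# shifted and scaled to [0, 0, 1, 1] in window coordinates and finally to
# [0, 0, 100, 200] pixels, i.e. the full original image.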
def detect(self, images, log_print=0):
"""Runs the detection pipeline.
images: List of images, potentially of different sizes.
Returns a list of dicts, one dict per image. The dict contains:
rois: [N, (y1, x1, y2, x2)] detection bounding boxes
class_ids: [N] int class IDs
scores: [N] float probability scores for the class IDs
masks: [H, W, N] instance binary masks
"""
assert self.mode == "inference", "Create model in inference mode."
assert len(images) == self.config.batch_size, "len(images) must be equal to BATCH_SIZE"
# Mold inputs to format expected by the neural network
molded_images, image_metas, windows = self.mold_inputs(images)
# Validate image sizes
# All images in a batch MUST be of the same size
image_shape = molded_images[0].shape
for g in molded_images[1:]:
assert g.shape == image_shape,\
"After resizing, all images must have the same size. Check IMAGE_RESIZE_MODE and image sizes."
# Anchors
anchors = self.get_anchors(image_shape)
# Duplicate across the batch dimension because Keras requires it
# TODO: can this be optimized to avoid duplicating the anchors?
anchors = np.broadcast_to(anchors, (self.config.batch_size,) + anchors.shape)
# 日志記錄
if log_print:
log("molded_images", molded_images)
log("image_metas", image_metas)
log("anchors", anchors)
detections, _, _, mrcnn_mask, _, _, _ =\
self.keras_model.predict([molded_images, image_metas, anchors], verbose=0)
# run the model to obtain the inference outputs
# Process detections
results = [] # collects the final per-image results
for i, image in enumerate(images):
# process images one at a time because the originals may differ in size
final_rois, final_class_ids, final_scores, final_masks =\
self.unmold_detections(detections[i], mrcnn_mask[i],
image.shape, molded_images[i].shape, windows[i])
# windows holds, per resized canvas, the top-left and bottom-right coordinates of the real image within it
results.append({
"rois": final_rois,
"class_ids": final_class_ids,
"scores": final_scores,
"masks": final_masks})
print('detection finished')
return results
def generate_pyramid_anchors(self,scales, ratios, feature_shapes, feature_strides, anchor_stride):
"""Generate anchors at different levels of a feature pyramid. Each scale
is associated with a level of the pyramid, but each ratio is used in
all levels of the pyramid.
Returns:
anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
with the same order of the given scales. So, anchors of scale[0] come
first, then anchors of scale[1], and so on.
"""
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
anchors = []
for i in range(len(scales)):
# anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i], feature_strides[i], anchor_stride))
"""
scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
shape: [height, width] spatial shape of the feature map over which to generate anchors.
feature_stride: Stride of the feature map relative to the image in pixels.
anchor_stride: Stride of anchors on the feature map. For example, if the value is 2 then generate anchors for every other feature map pixel.
"""
# Get all combinations of scales and ratios
scale, ratios = np.meshgrid(np.array(scales[i]), np.array(ratios))
scale = scale.flatten()
ratios = ratios.flatten()
shape = feature_shapes[i]
feature_stride = feature_strides[i]
# Enumerate heights and widths from scales and ratios
# Compute the box heights and widths.
heights = scale / np.sqrt(ratios)
widths = scale * np.sqrt(ratios)
# Enumerate shifts in feature space
# Compute the box center coordinates.
shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
# anchor_stride picks every anchor_stride-th cell of the feature map as an anchor center;
# multiplying by feature_stride maps those centers back to input-image coordinates.
shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
# Enumerate combinations of shifts, widths, and heights
box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
# The code above produces the bbox centers and their heights/widths.
# Convert to corner coordinates (y1, x1, y2, x2)
boxes = np.concatenate([box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1)
# Convert (center, height, width) to corner coordinates: top-left and bottom-right.
anchors.append(boxes)
return np.concatenate(anchors, axis=0)
def get_anchors(self, image_shape):
"""Returns anchor pyramid for the given image size."""
# [N, (height, width)]
backbone_shapes = compute_backbone_shapes(self.config, image_shape)
# Cache anchors and reuse if image shape is the same
if not hasattr(self, "_anchor_cache"):
self._anchor_cache = {}
if not tuple(image_shape) in self._anchor_cache:
# Generate Anchors: [anchor_count, (y1, x1, y2, x2)]
# Generate Anchors
a = self.generate_pyramid_anchors(
self.config.RPN_ANCHOR_SCALES, # (8, 16, 32, 64, 128) in this configuration
self.config.RPN_ANCHOR_RATIOS, # [0.5, 1, 2]
backbone_shapes, # with shape [N, (height, width)]
self.config.BACKBONE_STRIDES,# [4, 8, 16, 32, 64]
self.config.RPN_ANCHOR_STRIDE)# 1
# Keep a copy of the latest anchors in pixel coordinates because
# it's used in inspect_model notebooks.
# TODO: Remove this after the notebook are refactored to not use it
self.anchors = a #[n,4]
# Normalize coordinates
# self._anchor_cache[tuple(image_shape)] = utils.norm_boxes(a, image_shape[:2])
# normalize the coordinates
h, w = image_shape[:2]
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
self._anchor_cache[tuple(image_shape)]= np.divide((a - shift), scale).astype(np.float32)
return self._anchor_cache[tuple(image_shape)]
def find_trainable_layer(self, layer):
"""If a layer is encapsulated by another layer, this function
digs through the encapsulation and returns the layer that holds
the weights.
"""
if layer.__class__.__name__ == 'TimeDistributed':
return self.find_trainable_layer(layer.layer)
return layer
def get_trainable_layers(self):
"""Returns a list of layers that have weights."""
layers = []
# Loop through all layers
for l in self.keras_model.layers:
# If layer is a wrapper, find inner trainable layer
l = self.find_trainable_layer(l)
# Include layer if it has weights
if l.get_weights():
layers.append(l)
return layers
def parse_image_meta_graph(meta):
"""Parses a [batch, meta length] image metadata tensor into its named components."""
image_id = meta[:, 0]
original_image_shape = meta[:, 1:4]
image_shape = meta[:, 4:7]
window = meta[:, 7:11] # (y1, x1, y2, x2) window of image in pixels
scale = meta[:, 11]
active_class_ids = meta[:, 12:]
return {
"image_id": image_id,
"original_image_shape": original_image_shape,
"image_shape": image_shape,
"window": window,
"scale": scale,
"active_class_ids": active_class_ids,
}
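# --- Layout check added for illustration (not part of the original file) ------
# With NUM_CLASSES = 5, the meta vector built in mold_inputs has length
# 1 + 3 + 3 + 4 + 1 + 5 = 17 and the slices above line up as:
#   [0]      image_id             (1 value)
#   [1:4]    original_image_shape (3 values)
#   [4:7]    image_shape          (3 values)
#   [7:11]   window               (4 values)
#   [11]     scale                (1 value)
#   [12:17]  active_class_ids     (5 values)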
############################################################
# Miscellenous Graph Functions
############################################################
def trim_zeros_graph(boxes, name='trim_zeros'):
"""Often boxes are represented with matrices of shape [N, 4] and
are padded with zeros. This removes zero boxes.
boxes: [N, 4] matrix of boxes.
non_zeros: [N] a 1D boolean mask identifying the rows to keep
"""
non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
boxes = tf.boolean_mask(boxes, non_zeros, name=name) # tf.boolean_mask keeps the rows where the mask is True
return boxes, non_zeros
# def batch_pack_graph(x, counts, num_rows):
# """Picks different number of values from each row in x depending on the values in counts.
# """
# outputs = []
# for i in range(num_rows):
# outputs.append(x[i, :counts[i]])
# return tf.concat(outputs, axis=0)
def norm_boxes_graph(boxes, shape):
"""Converts boxes from pixel coordinates to normalized coordinates.
boxes: [..., (y1, x1, y2, x2)] in pixel coordinates
shape: [..., (height, width)] in pixels
Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
coordinates it's inside the box.
Returns:
[..., (y1, x1, y2, x2)] in normalized coordinates
"""
h, w = tf.split(tf.cast(shape, tf.float32), 2) # split shape into h and w
scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
shift = tf.constant([0., 0., 1., 1.])
return tf.divide(boxes - shift, scale)
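# --- Numeric check added for illustration (not part of the original file) -----
# On a 128x128 image the full-image box maps exactly to the unit box:
# ([0, 0, 128, 128] - [0, 0, 1, 1]) / 127 = [0, 0, 1, 1].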
# def denorm_boxes_graph(boxes, shape):
# """Converts boxes from normalized coordinates to pixel coordinates.
# boxes: [..., (y1, x1, y2, x2)] in normalized coordinates
# shape: [..., (height, width)] in pixels
#
# Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
# coordinates it's inside the box.
#
# Returns:
# [..., (y1, x1, y2, x2)] in pixel coordinates
# """
# h, w = tf.split(tf.cast(shape, tf.float32), 2)
# scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
# shift = tf.constant([0., 0., 1., 1.])
# return tf.cast(tf.round(tf.multiply(boxes, scale) + shift), tf.int32)
The following code is the inference file (.py)
"""
MASKRCNN algorithm for object detection and instance segmentation
Written and modified by tang jun in JAN, 2019
if you have questions, please contact me by email: tangjunjunfighter@163.com
"""
import scipy
import os
import random
import datetime
import re
import math
import logging
from collections import OrderedDict
import multiprocessing
import numpy as np
import tensorflow as tf
import keras
import keras.backend as K # keras中的后端backend及其相關函數
import keras.layers as KL
import keras.engine as KE
import keras.models as KM
import math
import os
import sys
import numpy as np
import cv2
import matplotlib.pyplot as plt
import yaml
from PIL import Image
import random
# from mrcnn1 import utils, model as modellib, visualize
# from mrcnn1 import utils, model as modellib, visualize
import model as modellib
import visualize
from distutils.version import LooseVersion
assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
assert LooseVersion(keras.__version__) >= LooseVersion('2.0.8')
ROOT_DIR = os.getcwd() # 得到當前路徑
sys.path.append(ROOT_DIR) # To find local version of the library
# Directory to save logs and trained models
MODEL_DIR = os.path.join(ROOT_DIR, "logs") # 在當前路徑的logs文件路徑
iter_num = 0
# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5") # 載入訓練模型權重路徑
class Config_config(object):
"""Base configuration class. For custom configurations, create a
sub-class that inherits from this one and override properties
that need to be changed.
"""
IMAGE_RESIZE_MODE = "square"
IMAGE_MIN_DIM = 128
IMAGE_MAX_DIM = 256
NUM_CLASSES = 1 + 4 # Override in sub-classes
PRE_NMS_LIMIT = 6000
IMAGE_CHANNEL_COUNT = 3
# Name the configurations. For example, 'COCO', 'Experiment 3', ...etc.
# Useful if your code needs to do things differently depending on which
# experiment is running.
NAME = "shapes" # Override in sub-classes
GPU_COUNT = 1
IMAGES_PER_GPU = 1
# Number of training steps per epoch
# This doesn't need to match the size of the training set. Tensorboard
# updates are saved at the end of each epoch, so setting this to a
# smaller number means getting more frequent TensorBoard updates.
# Validation stats are also calculated at each epoch end and they
# might take a while, so don't set this too small to avoid spending
# a lot of time on validation stats.
STEPS_PER_EPOCH = 3
# Number of validation steps to run at the end of every training epoch.
# A bigger number improves accuracy of validation stats, but slows
# down the training.
VALIDATION_STEPS = 50
# Backbone network architecture
# Supported values are: resnet50, resnet101.
# You can also provide a callable that should have the signature
# of model.resnet_graph. If you do so, you need to supply a callable
# to COMPUTE_BACKBONE_SHAPE as well
BACKBONE = "resnet101"
# Only useful if you supply a callable to BACKBONE. Should compute
# the shape of each layer of the FPN Pyramid.
# See model.compute_backbone_shapes
# COMPUTE_BACKBONE_SHAPE = None
# The strides of each layer of the FPN Pyramid. These values
# are based on a Resnet101 backbone.
BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# Size of the fully-connected layers in the classification graph
FPN_CLASSIF_FC_LAYERS_SIZE = 1024
# Size of the top-down layers used to build the feature pyramid
TOP_DOWN_PYRAMID_SIZE = 256
# Number of classification classes (including background)
# Length of square anchor side in pixels
RPN_ANCHOR_SCALES = (8, 16, 32, 64, 128)
# Ratios of anchors at each cell (width/height)
# A value of 1 represents a square anchor, and 0.5 is a wide anchor
RPN_ANCHOR_RATIOS = [0.5, 1, 2]
# Anchor stride
# If 1 then anchors are created for each cell in the backbone feature map.
# If 2, then anchors are created for every other cell, and so on.
RPN_ANCHOR_STRIDE = 1
# Non-max suppression threshold to filter RPN proposals.
# You can increase this during training to generate more proposals.
RPN_NMS_THRESHOLD = 0.7
# How many anchors per image to use for RPN training
RPN_TRAIN_ANCHORS_PER_IMAGE = 256 # needed both by the RPN data generator and by the RPN network
# ROIs kept after non-maximum suppression (training and inference)
POST_NMS_ROIS_TRAINING = 2000
POST_NMS_ROIS_INFERENCE = 1000
# If enabled, resizes instance masks to a smaller size to reduce
# memory load. Recommended when using high-resolution images.
USE_MINI_MASK = False
MINI_MASK_SHAPE = (56, 56) # (height, width) of the mini-mask
# Input image resizing
# Generally, use the "square" resizing mode for training and inferencing
# and it should work well in most cases. In this mode, images are scaled
# up such that the small side is = IMAGE_MIN_DIM, but ensuring that the
# scaling doesn't make the long side > IMAGE_MAX_DIM. Then the image is
# padded with zeros to make it a square so multiple images can be put
# in one batch.
# Available resizing modes:
# none: No resizing or padding. Return the image unchanged.
# square: Resize and pad with zeros to get a square image
# of size [max_dim, max_dim].
# pad64: Pads width and height with zeros to make them multiples of 64.
# If IMAGE_MIN_DIM or IMAGE_MIN_SCALE are not None, then it scales
# up before padding. IMAGE_MAX_DIM is ignored in this mode.
# The multiple of 64 is needed to ensure smooth scaling of feature
# maps up and down the 6 levels of the FPN pyramid (2**6=64).
# crop: Picks random crops from the image. First, scales the image based
# on IMAGE_MIN_DIM and IMAGE_MIN_SCALE, then picks a random crop of
# size IMAGE_MIN_DIM x IMAGE_MIN_DIM. Can be used in training only.
# IMAGE_MAX_DIM is not used in this mode.
IMAGE_RESIZE_MODE = "square"
# Minimum scaling ratio. Checked after IMAGE_MIN_DIM and can force further
# up scaling. For example, if set to 2 then images are scaled up to double
# the width and height, or more, even if MIN_IMAGE_DIM doesn't require it.
# However, in 'square' mode, it can be overruled by IMAGE_MAX_DIM.
IMAGE_MIN_SCALE = 0
# Image mean (RGB)
MEAN_PIXEL = np.array([123.7, 116.8, 103.9])
# Number of ROIs per image to feed to classifier/mask heads
# The Mask RCNN paper uses 512 but often the RPN doesn't generate
# enough positive proposals to fill this and keep a positive:negative
# ratio of 1:3. You can increase the number of proposals by adjusting
# the RPN NMS threshold.
TRAIN_ROIS_PER_IMAGE = 100
# Percent of positive ROIs used to train classifier/mask heads
ROI_POSITIVE_RATIO = 0.33
# Pooled ROIs
POOL_SIZE = 7
MASK_POOL_SIZE = 14
# Shape of output mask
# To change this you also need to change the neural network mask branch
MASK_SHAPE = [28, 28]
# Maximum number of ground truth instances to use in one image
MAX_GT_INSTANCES = 100
# Bounding box refinement standard deviation for RPN and final detections.
RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# Max number of final detections
DETECTION_MAX_INSTANCES = 100
# Minimum probability value to accept a detected instance
# ROIs below this threshold are skipped
DETECTION_MIN_CONFIDENCE = 0.9 # detections scoring above this are kept
# Non-maximum suppression threshold for detection
DETECTION_NMS_THRESHOLD = 0.15 # boxes overlapping less than this are kept
# Learning rate and momentum
# The Mask RCNN paper uses lr=0.02, but on TensorFlow it causes
# weights to explode. Likely due to differences in optimizer
# implementation.
LEARNING_RATE = 0.001
LEARNING_MOMENTUM = 0.9
# Weight decay regularization
WEIGHT_DECAY = 0.0001
# Loss weights for more precise optimization.
# Can be used for R-CNN training setup.
LOSS_WEIGHTS = {
"rpn_class_loss": 1.,
"rpn_bbox_loss": 1.,
"mrcnn_class_loss": 1.,
"mrcnn_bbox_loss": 1.,
"mrcnn_mask_loss": 1.
}
# Use RPN ROIs or externally generated ROIs for training
# Keep this True for most situations. Set to False if you want to train
# the head branches on ROI generated by code rather than the ROIs from
# the RPN. For example, to debug the classifier head without having to
# train the RPN.
USE_RPN_ROIS = True
# Train or freeze batch normalization layers
# None: Train BN layers. This is the normal mode
# False: Freeze BN layers. Good when using a small batch size
# True: (don't use). Set layer in training mode even when inferencing
TRAIN_BN = True # set to True here, though the guidance above recommends False when the batch size is small
# Gradient norm clipping
GRADIENT_CLIP_NORM = 5.0
batch_size=1
def __init__(self):
"""Set values of computed attributes."""
# Effective batch size
# self.BATCH_SIZE = self.IMAGES_PER_GPU * self.GPU_COUNT
# Input image size
if self.IMAGE_RESIZE_MODE == "crop":
self.IMAGE_SHAPE = np.array([self.IMAGE_MIN_DIM, self.IMAGE_MIN_DIM, 3])
else:
self.IMAGE_SHAPE = np.array([self.IMAGE_MAX_DIM, self.IMAGE_MAX_DIM, 3])
# Image meta data length
# See compose_image_meta() for details
self.IMAGE_META_SIZE = 1 + 3 + 3 + 4 + 1 + self.NUM_CLASSES
def display(self):
"""Display Configuration values."""
print("\nConfigurations:")
for a in dir(self):
if not a.startswith("__") and not callable(getattr(self, a)):
print("{:30} {}".format(a, getattr(self, a)))
print("\n")
# Configuration overrides for prediction
class Predict_Config(Config_config):
GPU_COUNT = 1
IMAGES_PER_GPU = 1
IMAGE_MIN_DIM = 128
IMAGE_MAX_DIM = 256
batch_size = 1
def predict():
import skimage.io
config = Predict_Config()
config.display()
model = modellib.MaskRCNN(mode="inference", config=config)
model_path = 'C:\\Users\\51102\\Desktop\\maskrcnn(tangjun)\\log\\04.h5'
# Load trained weights (fill in path to trained weights here)
assert model_path != "", "Provide path to trained weights"
print("Loading weights from ", model_path)
model.load_weights(model_path, by_name=True)
class_names = ['BG', 'line_bulge','dot_concave','dot_bulge','Irregular_concave']
file_names ='D:\\MASKRCNN\\mask-rcnn-me\\MASKRCNN_myself\\0.bmp'
# image = skimage.io.imread(os.path.join(IMAGE_DIR, random.choice(file_names)))
image = skimage.io.imread(file_names)
image=image[:, :, 0:3]
print('image=', image.shape)
# Run detection
results = model.detect([image], log_print=1)
'''
results.append({
"rois": final_rois,
"class_ids": final_class_ids,
"scores": final_scores,
"masks": final_masks})
'''
# Visualize results
r = results[0]
print('r=',r)
visualize.display_instances(image, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'])
if __name__ == "__main__":
predict()
The following code is the visualization helper file (.py)
"""
MASKRCNN algorithm for object detection and instance segmentation
Written and modified by tang jun in JAN, 2019
if you have questions, please contact me by email: tangjunjunfighter@163.com
"""
import cv2 as cv # module added by the author
import os
import sys
import random
import itertools
import colorsys
import numpy as np
from skimage.measure import find_contours
import matplotlib.pyplot as plt
from matplotlib import patches, lines
from matplotlib.patches import Polygon
import IPython.display
# # Root directory of the project
# ROOT_DIR = os.path.abspath("../")
#
# # Import Mask RCNN
# sys.path.append(ROOT_DIR) # To find local version of the library
def random_colors(N, bright=True):
"""
Generate random colors.
To get visually distinct colors, generate them in HSV space then
convert to RGB.
"""
brightness = 1.0 if bright else 0.7
hsv = [(i / N, 1, brightness) for i in range(N)]
colors = list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv))
random.shuffle(colors)
return colors
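# --- Usage note added for illustration (not part of the original file) --------
# random_colors(3) places hues at 0, 1/3 and 2/3 of the HSV circle, which
# convert to pure red (1, 0, 0), green (0, 1, 0) and blue (0, 0, 1), returned
# in random order because of the shuffle.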
def apply_mask(image, mask, color, alpha=0.5):
"""Apply the given mask to the image.
"""
for c in range(3):
image[:, :, c] = np.where(mask == 1,
image[:, :, c] *
(1 - alpha) + alpha * color[c] * 255,
image[:, :, c])
return image
def display_instances(image, boxes, masks, class_ids, class_names,
scores=None, title="",
figsize=(16, 16), ax=None,
show_mask=True, show_bbox=True,
colors=None, captions=None):
"""
boxes: [num_instance, (y1, x1, y2, x2, class_id)] in image coordinates.
masks: [height, width, num_instances]
class_ids: [num_instances]
class_names: list of class names of the dataset
scores: (optional) confidence scores for each box
title: (optional) Figure title
show_mask, show_bbox: To show masks and bounding boxes or not
figsize: (optional) the size of the image
colors: (optional) An array of colors to use with each object
captions: (optional) A list of strings to use as captions for each object
"""
# Number of instances
N = boxes.shape[0]
if not N:
print("\n*** No instances to display *** \n")
else:
assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]
# If no axis is passed, create one and automatically call show()
auto_show = False
if not ax:
_, ax = plt.subplots(1, figsize=figsize)
auto_show = True
# Generate random colors
colors = colors or random_colors(N)
# Show area outside image boundaries.
height, width = image.shape[:2]
ax.set_ylim(height + 10, -10)
ax.set_xlim(-10, width + 10)
ax.axis('off')
ax.set_title(title)
masked_image = image.astype(np.uint32).copy()
for i in range(N):
color = colors[i]
# Bounding box
if not np.any(boxes[i]):
# Skip this instance. Has no bbox. Likely lost in image cropping.
continue
y1, x1, y2, x2 = boxes[i]
# cv.rectangle(masked_image, (y1[0],x1[0]), (y2[0],x2[0]), (0, 250, 0), 2) # code added by the author
if show_bbox:
p = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
alpha=0.7, linestyle="dashed",
edgecolor=color, facecolor='none')
ax.add_patch(p)
# Label
if not captions:
class_id = class_ids[i]
score = scores[i] if scores is not None else None
label = class_names[class_id]
caption = "{} {:.3f}".format(label, score) if score else label
else:
caption = captions[i]
ax.text(x1, y1 + 8, caption,
color='w', size=11, backgroundcolor="none")
# Mask
mask = masks[:, :, i]
if show_mask:
masked_image = apply_mask(masked_image, mask, color)
# Mask Polygon
# Pad to ensure proper polygons for masks that touch image edges.
padded_mask = np.zeros(
(mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)
padded_mask[1:-1, 1:-1] = mask
contours = find_contours(padded_mask, 0.5)
for verts in contours:
# Subtract the padding and flip (y, x) to (x, y)
verts = np.fliplr(verts) - 1
p = Polygon(verts, facecolor="none", edgecolor=color)
ax.add_patch(p)
ax.imshow(masked_image.astype(np.uint8))
if auto_show:
plt.show()
return masked_image