分享數據集的幾種常用處理代碼,使用的時候記得改一下自己的路徑,ann_dir是coco的json文件路徑,img_dir是圖片路徑。如果用pycharm控制台輸出的中文為亂碼,將pycharm中的編碼全改成utf-8(設置->編輯器->文件編碼),把能改成utf-8的選項都改了。
1️⃣ 有些數據集中含有unicode編碼,也就是對應的中文,我們記錄好每個unicode編碼對應的id。然后將文件中的unicode編碼轉成id。
# -*- coding: utf-8 -*-
import json
import os
import random
import time
import shutil
import glob
# Defect class names; a defect's integer id is its index in this list
# (index 0 = "no defect"). NOTE: the Chinese strings are runtime data keys
# used to look up ids via category.index(...) — do not translate them.
category=['無瑕疵','花板跳', '水漬', '星跳', '漿斑', '油漬', '燒毛痕', '死皺', '筘路', '浪紋檔', '三絲', '跳紗', '雙經', '修痕',
          '污漬', '百腳', '松經', '跳花', '吊經', '緯紗不良', '斷氨綸', '雙緯', '粗維', '磨痕', '雲織', '整經結', '稀密檔', '斷經',
          '粗經', '緯縮', '色差檔', '毛粒', '破洞', '結頭', '軋痕']
# All paths are resolved relative to the current working directory —
# adjust these to your own dataset layout before running.
root_path=os.getcwd()
# Directory containing the COCO-style annotation json (anno_train.json).
ann_dir=os.path.join(root_path,"smartdiagnosisofclothflaw_round1train1_datasets",
                     "guangdong1_round1_train1_20190818","Annotations")
# Directory containing the defect images.
img_dir=os.path.join(root_path,"smartdiagnosisofclothflaw_round1train1_datasets",
                     "guangdong1_round1_train1_20190818","defect_Images")
# Fraction of the images assigned to the training split (rest goes to val).
train_percent = 0.8
#####################################################################################
##### 數據集中文改英文
#####################################################################################
def unicode2id():
    """Replace Chinese defect names in the annotations with integer ids.

    Reads ``anno_train.json`` from the module-level ``ann_dir`` and writes
    ``data.json`` next to it; each record keeps its ``name`` and ``bbox``
    but ``defect_name`` becomes the name's index in the module-level
    ``category`` list.

    Raises:
        ValueError: if a defect name is not present in ``category``.
    """
    ann_file = os.path.join(ann_dir, "anno_train.json")
    print(ann_file)
    # 'unicode_escape' decodes the \uXXXX sequences the file stores for
    # the Chinese defect names.
    with open(ann_file, 'r', encoding='unicode_escape') as f:
        json_data = json.load(f)
    data1 = [
        {'name': rec['name'],
         'defect_name': category.index(rec['defect_name']),
         'bbox': rec['bbox']}
        for rec in json_data
    ]
    with open(os.path.join(ann_dir, 'data.json'), 'w') as f:
        json.dump(data1, f)
2️⃣ COCO數據集划分為train和val
#####################################################################################
##### COCO數據集划分為train,val
#####################################################################################
def coco_dataset_split():
    """Split the raw dataset into COCO2017-style train/val folders.

    Randomly assigns ``train_percent`` of the images in ``img_dir`` to
    ``COCO2017/train2017`` and the rest to ``COCO2017/val2017`` (copies,
    originals untouched), then partitions ``anno_train.json`` into
    ``instances_train2017.json`` / ``instances_val2017.json`` accordingly.
    Call ``random.seed`` beforehand for a reproducible split.
    """
    time_start = time.time()
    # Build the output directory tree; exist_ok makes this idempotent.
    os.makedirs(os.path.join(root_path, "COCO2017"), exist_ok=True)
    os.makedirs(os.path.join(root_path, "COCO2017", "annotations"), exist_ok=True)
    os.makedirs(os.path.join(root_path, "COCO2017", "train2017"), exist_ok=True)
    os.makedirs(os.path.join(root_path, "COCO2017", "val2017"), exist_ok=True)
    # Output paths.
    save_img_train_dir = os.path.join(root_path, "COCO2017", "train2017")
    save_img_val_dir = os.path.join(root_path, "COCO2017", "val2017")
    save_ann_train_file = os.path.join(root_path, "COCO2017", "annotations", "instances_train2017.json")
    save_ann_val_file = os.path.join(root_path, "COCO2017", "annotations", "instances_val2017.json")
    # Pick the train subset at random; everything else is validation.
    images_list = os.listdir(img_dir)
    images_num = len(images_list)
    train_num = int(images_num * train_percent)
    val_num = images_num - train_num
    train_list = random.sample(images_list, train_num)
    # A set gives O(1) membership tests in the annotation loop below
    # (the original tested against a list, O(n) per annotation).
    val_set = set(images_list) - set(train_list)
    val_list = list(val_set)
    print("| Images num: ", images_num)
    print("| Train num: ", train_num)
    print("| Val num: ", val_num)
    # Copy the images into their split folders.
    for image_name in train_list:
        shutil.copy(os.path.join(img_dir, image_name), os.path.join(save_img_train_dir, image_name))
    for image_name in val_list:
        shutil.copy(os.path.join(img_dir, image_name), os.path.join(save_img_val_dir, image_name))
    ann_path = os.path.join(ann_dir, "anno_train.json")
    # Partition the annotations by split membership.
    train2017 = []
    val2017 = []
    with open(ann_path, 'r', encoding='unicode_escape') as fp:
        json_data = json.load(fp)
    for rec in json_data:
        entry = {'name': rec['name'],
                 'defect_name': category.index(rec['defect_name']),
                 'bbox': rec['bbox']}
        if rec['name'] in val_set:
            val2017.append(entry)
        else:
            train2017.append(entry)
    # Write the split annotation files.
    with open(save_ann_train_file, 'w') as fp:
        json.dump(train2017, fp)
    with open(save_ann_val_file, 'w') as fp:
        json.dump(val2017, fp)
    cost_time = time.time() - time_start
    print("| Cost time: ", cost_time // 60 // 60, "hour", cost_time // 60 % 60, "min", cost_time % 60, "s")
3️⃣ COCO數據集轉換成VOC數據集,復制圖片比較耗時,耐心等待就行了。為了節省時間,沒有可視化復制圖片的進度。如果想加,可以百度一下tqdm,加到復制圖片的for循環中就可以了。
#####################################################################################
##### coco數據集轉換成voc數據集
#####################################################################################
def coco2voc():
    """Convert the COCO-style ``data.json`` annotations into Pascal-VOC XML.

    Copies every annotated image into ``VOCdevkit/VOC2012/JPEGImages`` and
    writes one ``Annotations/<image>.xml`` per image containing all of its
    boxes. The json bbox is written as xmin/ymin/xmax/ymax in that order —
    assumes the source bboxes are corner coordinates; confirm for your data.
    Image copying can take a while (wrap the copy loop in tqdm if you want
    progress output).
    """
    from lxml.etree import Element, SubElement, tostring
    from xml.dom.minidom import parseString
    # Hoisted out of the per-image loop, where the original re-imported it
    # on every iteration.
    from PIL import Image
    # Create the VOC directory tree; makedirs creates parents as needed.
    voc_root = os.path.join(root_path, "VOCdevkit", "VOC2012")
    os.makedirs(os.path.join(voc_root, "Annotations"), exist_ok=True)
    os.makedirs(os.path.join(voc_root, "ImageSets", "Main"), exist_ok=True)
    os.makedirs(os.path.join(voc_root, "JPEGImages"), exist_ok=True)
    # Load the id-converted annotation json (produced by unicode2id()).
    ann_path = os.path.join(ann_dir, "data.json")
    with open(ann_path, "r", encoding='utf-8') as ann_file:
        ann_json_list = json.load(ann_file)
    save_xml_path = os.path.join(voc_root, 'Annotations')
    # Group the per-bbox records by image name.
    # NOTE: local renamed to defect_id — the original bound `category`,
    # shadowing the module-level class-name list.
    img_names = []
    img_bbox_category = {}
    for ann in ann_json_list:
        img_name = ann['name']
        defect_id = ann['defect_name']
        bbox = ann['bbox']
        if img_name not in img_bbox_category:
            img_names.append(img_name)
            img_bbox_category[img_name] = []
        img_bbox_category[img_name].append({"category": defect_id, "bbox": bbox})
    print('| Images start copy.')
    # Copy every annotated image into the VOC layout.
    for img_name in img_names:
        shutil.copy(os.path.join(img_dir, img_name),
                    os.path.join(voc_root, 'JPEGImages', img_name))
    print('| Images copy finish.')
    print('| Jsons start transform')
    # One XML file per image, holding every box of that image.
    for img_name in img_bbox_category:
        root_node = Element('annotation')
        node_filename = SubElement(root_node, 'filename')
        node_filename.text = img_name
        node_size = SubElement(root_node, 'size')
        node_width = SubElement(node_size, 'width')
        node_height = SubElement(node_size, 'height')
        # `with` closes the image file handle (the original leaked one
        # handle per image).
        with Image.open(os.path.join(img_dir, img_name)) as img_m:
            node_width.text = str(img_m.width)
            node_height.text = str(img_m.height)
        # One <object> element per bounding box.
        for bbox_and_category in img_bbox_category[img_name]:
            bbox_temp = bbox_and_category["bbox"]
            node_object = SubElement(root_node, 'object')
            node_name = SubElement(node_object, 'name')
            node_name.text = str(bbox_and_category["category"])
            node_bndbox = SubElement(node_object, 'bndbox')
            for tag, value in zip(('xmin', 'ymin', 'xmax', 'ymax'), bbox_temp):
                SubElement(node_bndbox, tag).text = str(value)
        dom = parseString(tostring(root_node))
        xml_name = os.path.join(save_xml_path, img_name.replace(".jpg", "") + '.xml')
        with open(xml_name, 'wb') as f:
            f.write(dom.toprettyxml(indent='\t', encoding='utf-8'))
    print('| Jsons transform finish.')
4️⃣ voc數據集轉換成coco數據集
#####################################################################################
##### voc數據集轉換成coco數據集
#####################################################################################
def voc2coco():
    """Convert a VOC dataset back into COCO-format train/val json files.

    Reads ``train.txt``/``val.txt`` from ``VOCdevkit/VOC2012/ImageSets/Main``
    (produce them with voc_dataset_split()), writes
    ``coco2017/annotations/instances_{train,val}2017.json`` and copies the
    split images into ``coco2017/train2017`` / ``coco2017/val2017``.
    Annotation ids are globally unique across both splits, matching the
    original behaviour.

    NOTE: ``class_name_to_id`` is hard-coded — adapt it to the class names
    actually present in your VOC xml files, or parsing raises KeyError.
    """
    import datetime
    # Hoisted: the original re-imported these inside the per-file loops.
    import xml.etree.ElementTree as ET
    from PIL import Image

    # {class name: coco category id}
    class_name_to_id = {'class1': 1, 'class2': 2, 'class3': 3, 'class4': 4,
                        'class5': 5, 'class6': 6, 'class7': 7, 'class8': 8}

    voc_root = os.path.join(root_path, 'VOCdevkit', 'VOC2012')
    images_dir = os.path.join(voc_root, 'JPEGImages')

    # Create the coco output tree; exist_ok makes this idempotent.
    os.makedirs(os.path.join(root_path, "coco2017", "annotations"), exist_ok=True)
    os.makedirs(os.path.join(root_path, "coco2017", "train2017"), exist_ok=True)
    os.makedirs(os.path.join(root_path, "coco2017", "val2017"), exist_ok=True)

    # Skeleton of the coco json; images/annotations are filled per split.
    now = datetime.datetime.now()
    data = dict(
        info=dict(
            description=None,
            url=None,
            version=None,
            year=now.year,
            contributor=None,
            date_created=now.strftime("%Y-%m-%d %H:%M:%S.%f"),
        ),
        licenses=[dict(url=None, id=0, name=None, )],
        images=[],        # license, file_name, url, height, width, date_captured, id
        type="instances",
        annotations=[],   # segmentation, area, iscrowd, image_id, bbox, category_id, id
        categories=[dict(supercategory=None, id=cid, name=cname)
                    for cname, cid in class_name_to_id.items()],
    )

    # Stable image_id per image stem (file name without extension).
    images_id = {image_name[:-4]: idx
                 for idx, image_name in enumerate(os.listdir(images_dir))}

    def _read_split(split_name):
        # Return the .jpg file names listed in ImageSets/Main/<split_name>.txt.
        split_path = os.path.join(voc_root, 'ImageSets', 'Main', split_name + '.txt')
        with open(split_path) as fp:
            return [line[:-1] + ".jpg" for line in fp.readlines()]

    def _image_entry(image):
        # Build one coco `images` record; `with` closes the image handle.
        with Image.open(os.path.join(images_dir, image)) as img:
            height, width = img.height, img.width
        return dict(
            license=0,
            url=None,
            file_name=image,  # file name including extension
            height=height,
            width=width,
            date_captured=None,
            id=images_id[image[:-4]],
        )

    def _annotation_entries(xml_name, start_id):
        # Parse one VOC xml and return its coco `annotations` records,
        # ids numbered consecutively from start_id.
        tree = ET.parse(os.path.join(voc_root, 'Annotations', xml_name))
        entries = []
        ann_id = start_id
        # `obj` instead of the original `object`, which shadowed a builtin.
        for obj in tree.getroot().findall('object'):
            category_id = class_name_to_id[obj.findall('name')[0].text]
            box = obj.findall('bndbox')[0]
            xmin = float(box.findall('xmin')[0].text)
            ymin = float(box.findall('ymin')[0].text)
            xmax = float(box.findall('xmax')[0].text)
            ymax = float(box.findall('ymax')[0].text)
            entries.append(dict(
                id=ann_id,
                image_id=images_id[xml_name[:-4]],
                category_id=category_id,
                area=(xmax - xmin) * (ymax - ymin),
                # coco bbox convention: [x, y, width, height]
                bbox=[xmin, ymin, xmax - xmin, ymax - ymin],
                iscrowd=0,
            ))
            ann_id += 1
        return entries

    train_img = _read_split('train')
    val_img = _read_split('val')

    # Fill and dump the json for each split; bbox_id carries over so
    # annotation ids never collide between train and val.
    bbox_id = 0
    for img_list, out_name in ((train_img, 'instances_train2017.json'),
                               (val_img, 'instances_val2017.json')):
        data['images'] = [_image_entry(image) for image in img_list]
        annotations = []
        for image in img_list:
            entries = _annotation_entries(image[:-4] + '.xml', bbox_id)
            annotations.extend(entries)
            bbox_id += len(entries)
        data['annotations'] = annotations
        out_path = os.path.join(root_path, 'coco2017', 'annotations', out_name)
        with open(out_path, 'w') as out:
            json.dump(data, out)
    print('| VOC -> COCO annotations transform finish.')

    print('Start copy images...')
    for img_name in train_img:
        shutil.copy(os.path.join(images_dir, img_name),
                    os.path.join(root_path, "coco2017", 'train2017', img_name))
    print('| Train images copy finish.')
    for img_name in val_img:
        shutil.copy(os.path.join(images_dir, img_name),
                    os.path.join(root_path, "coco2017", 'val2017', img_name))
    print('| Val images copy finish.')
5️⃣ VOC數據集划分為train和val
#####################################################################################
##### voc數據集划分為train,val
#####################################################################################
def voc_dataset_split():
    """Randomly split the VOC annotations into train/val image-id lists.

    Writes the xml stems (file names without extension) of a random
    ``train_percent`` subset into ``ImageSets/Main/train.txt`` and the rest
    into ``val.txt``. Call ``random.seed`` beforehand for reproducibility.
    """
    main_dir = os.path.join(root_path, "VOCdevkit", "VOC2012", "ImageSets", "Main")
    xml_paths = glob.glob(os.path.join(root_path, "VOCdevkit", 'VOC2012', 'Annotations', "*.xml"))
    # os.path.basename is portable; the original split on '\\' and only
    # worked on Windows paths.
    xml_names = [os.path.basename(p) for p in xml_paths]
    num_train = int(len(xml_names) * train_percent)
    # Set for O(1) membership tests in the loop below.
    train_sample = set(random.sample(xml_names, num_train))
    # `with` guarantees both files are flushed and closed.
    with open(os.path.join(main_dir, "train.txt"), 'w') as file_train, \
            open(os.path.join(main_dir, "val.txt"), 'w') as file_val:
        for name in xml_names:
            target = file_train if name in train_sample else file_val
            target.write(name[:-4] + '\n')
6️⃣ 檢查數據集中圖片是否有損壞
#####################################################################################
##### OSError: image file is truncated (9 bytes not processed)
##### 檢查數據集中圖片是否有損壞。找到有問題圖片,刪掉它,並修改數據集。
#####################################################################################
def check_images():
    """Print the name of every missing or corrupt image in JPEGImages.

    ``Image.load()`` forces a full decode, so truncated files raise
    ``OSError: image file is truncated``; missing files raise
    ``FileNotFoundError``. Delete any printed file and fix the dataset.
    """
    from PIL import Image
    images_dir = os.path.join(root_path, 'VOCdevkit', 'VOC2012', 'JPEGImages')
    for name in os.listdir(images_dir):
        try:
            # `with` closes each handle (the original leaked one per image).
            with Image.open(os.path.join(images_dir, name)) as img:
                img.load()  # full decode; truncated data raises OSError here
        except (FileNotFoundError, OSError):
            print(name)
7️⃣ coco數據集將gt可視化,查看
#####################################################################################
# ##### coco數據集將gt可視化,查看
# #####################################################################################
def visiual_gt():
    """Visualize ground-truth boxes on a few random training images.

    Reads ``COCO2017/annotations/instances_train2017.json`` (the
    list-of-records format produced by coco_dataset_split), picks up to 5
    distinct random images and draws their boxes with OpenCV. Press any key
    to advance to the next image. Boxes are drawn as corner coordinates
    [xmin, ymin, xmax, ymax]; adjust if your dataset stores [x, y, w, h].
    """
    import cv2
    json_file = os.path.join(root_path, 'COCO2017', 'annotations', 'instances_train2017.json')
    # `with` closes the handle (the original passed an unclosed open()).
    with open(json_file, 'r') as f:
        data = json.load(f)
    # Deduplicate: the json holds one record per bbox, not per image, so
    # sampling the raw list could show the same image more than once.
    images = list({record['name'] for record in data})
    # min() guards against datasets with fewer than 5 images, where the
    # original random.sample raised ValueError.
    for idx in random.sample(range(len(images)), min(5, len(images))):
        img = cv2.imread(os.path.join(root_path, 'COCO2017', 'train2017', images[idx]))
        # All boxes belonging to the sampled image.
        bboxes = [record['bbox'] for record in data if record['name'] == images[idx]]
        for bbox in bboxes:
            left_top = (int(bbox[0]), int(bbox[1]))      # top-left corner
            right_bottom = (int(bbox[2]), int(bbox[3]))  # bottom-right corner
            cv2.rectangle(img, left_top, right_bottom, (0, 255, 0), 2)
        cv2.imshow('image', img)
        cv2.waitKey(0)
    cv2.destroyAllWindows()
博客所有的代碼都統一放到同一個python文件中,要用哪個功能就調用哪個函數(在 __main__ 中取消對應行的注釋即可)。
if __name__ == '__main__':
    # Fixed seed so the random train/val splits are reproducible.
    random.seed(777)
    print("—" * 50)
    # unicode2id()          # convert unicode defect names in the dataset to ids
    # coco_dataset_split()  # split the coco dataset into train/val
    # coco2voc()            # convert the coco dataset to a voc dataset
    # voc_dataset_split()   # split the voc dataset into train/val
    # check_images()        # check images for corruption
    # visiual_gt()          # visualize coco ground-truth boxes
    voc2coco()  # convert the voc dataset to a coco dataset
    print("—" * 50)
⭐ 完結撒花,如果有需要幫助的評論或者私聊都可以,看到就回答了。