caffe prepare genvoc


原文链接: caffe prepare genvoc

数据集划分
这里我们将 1465 张图片按照 7:2:1 的比例随机划分为训练集、测试集和验证集。为了方便运行,我们先建立一个名为 images 的文件夹,其下的 JPEGImages 和 Annotations 两个子文件夹分别存放所有的图片和对应的标注文件。最后附上划分数据集的主要函数:

split_data.py

"""
split data to train, test and valid data.
author: Meringue
date: 2018/2/27
"""

import os
import shutil
import random

def _copy_file(src_file, dst_file):
	"""Copy ``src_file`` to ``dst_file``, creating the destination directory.

	If ``src_file`` is not an existing regular file, a message is printed and
	nothing is copied or created.

	Args:
		src_file -- path of the file to copy.
		dst_file -- path to write; its parent directory is created if missing.
	"""
	if not os.path.isfile(src_file):
		# Call form with a single argument prints the same on Python 2 and 3.
		print("%s not exist!" % (src_file,))
		return

	dst_dir = os.path.dirname(dst_file)
	# A bare file name has an empty dirname, and os.makedirs("") would raise.
	if dst_dir and not os.path.exists(dst_dir):
		os.makedirs(dst_dir)
	shutil.copyfile(src_file, dst_file)


def _copy_subset(img_names, src_img_dir, src_xml_dir, dst_img_dir, dst_xml_dir):
	"""Copy each listed image and its same-stem ``.xml`` file to the destination dirs."""
	for img_name in img_names:
		# splitext strips only the last extension, so "a.b.jpg" pairs with "a.b.xml".
		xml_name = os.path.splitext(img_name)[0] + ".xml"
		_copy_file(os.path.join(src_img_dir, img_name), os.path.join(dst_img_dir, img_name))
		_copy_file(os.path.join(src_xml_dir, xml_name), os.path.join(dst_xml_dir, xml_name))


def split_data(data_dir, train_dir, test_dir, valid_dir, ratio=(0.7, 0.2, 0.1), shuffle=True):
	"""Split a dataset into train, test and valid subsets by copying files.

	Every entry listed in ``<data_dir>/JPEGImages/`` is assigned to exactly one
	subset. The entry and the ``.xml`` file with the same stem from
	``<data_dir>/Annotations/`` are copied into the ``JPEGImages/`` and
	``Annotations/`` sub-directories of the matching output directory.

	Args:
		data_dir -- directory holding ``JPEGImages/`` and ``Annotations/``.
		train_dir, test_dir, valid_dir -- output directories for each subset.
		ratio -- [train_ratio, test_ratio, valid_ratio]; values are normalised
			by their sum. Train and test sizes are truncated to integers and
			the valid subset receives whatever remains.
		shuffle -- shuffle the file list with ``random.shuffle`` before
			splitting; when False the sorted listing order is used.

	Raises:
		ZeroDivisionError -- if the ratios sum to zero.
	"""
	all_img_dir = os.path.join(data_dir, "JPEGImages/")
	all_xml_dir = os.path.join(data_dir, "Annotations/")

	# os.listdir returns entries in arbitrary order; sorting makes the
	# shuffle=False split (and a seeded shuffle) reproducible.
	all_imgs_name = sorted(os.listdir(all_img_dir))
	img_num = len(all_imgs_name)

	# float() keeps true division on Python 2 even when ratio holds integers.
	ratio_sum = float(sum(ratio))
	train_num = int(img_num * ratio[0] / ratio_sum)
	test_num = int(img_num * ratio[1] / ratio_sum)
	train_end = train_num
	test_end = train_num + test_num

	if shuffle:
		random.shuffle(all_imgs_name)

	# Slice valid from the front boundary: a negative-count slice such as
	# names[-0:] would return the whole list when the valid subset is empty.
	subsets = (
		(all_imgs_name[:train_end], train_dir),
		(all_imgs_name[train_end:test_end], test_dir),
		(all_imgs_name[test_end:], valid_dir),
	)
	for subset_names, out_dir in subsets:
		_copy_subset(
			subset_names,
			all_img_dir,
			all_xml_dir,
			os.path.join(out_dir, "JPEGImages/"),
			os.path.join(out_dir, "Annotations/"),
		)

if __name__ == "__main__":
	# Input dataset root and the three output roots written by split_data.
	data_dir = "/home/meringue/Documents/traffic_sign_detection/data/images"
	train_dir = "/home/meringue/Documents/traffic_sign_detection/data/train_images"
	test_dir = "/home/meringue/Documents/traffic_sign_detection/data/test_images"
	valid_dir = "/home/meringue/Documents/traffic_sign_detection/data/valid_images"

	# Call form with a single argument prints the same on Python 2 and 3.
	print("start splitting...")
	split_data(data_dir, train_dir, test_dir, valid_dir)
`