2021-09-29 19:27:46 +08:00
|
|
|
|
# coding:utf8
|
|
|
|
|
import os
|
|
|
|
|
import shutil
|
|
|
|
|
import random
|
|
|
|
|
import argparse
|
|
|
|
|
|
2021-10-08 15:31:48 +08:00
|
|
|
|
|
2021-10-21 18:39:14 +08:00
|
|
|
|
# Delete the existing train/val/test split folder and recreate it as an empty folder
|
2021-09-29 19:27:46 +08:00
|
|
|
|
def isCreateOrDeleteFolder(path, flag):
    """Recreate the sub-folder ``flag`` of ``path`` as an empty directory.

    If the sub-folder already exists it is removed together with everything
    inside it, then a fresh empty folder is created in its place.

    Args:
        path: Parent directory of the folder to reset.
        flag: Name of the sub-folder, e.g. ``"train"``.

    Returns:
        The absolute path of the newly created folder.
    """
    target = os.path.join(path, flag)

    # Wipe any previous contents so the folder always starts out empty.
    if os.path.exists(target):
        shutil.rmtree(target)
    os.makedirs(target)

    return os.path.abspath(target)
|
|
|
|
|
|
|
|
|
|
|
2022-02-10 22:40:19 +08:00
|
|
|
|
def splitTrainVal(root, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag):
    """Shuffle the labelled records under ``root`` and split them into train/val/test.

    Reads the annotation file for the requested task, shuffles its records
    and, for every record, copies the referenced image into the train, val or
    test folder and writes one tab-separated "copied image path, label" line
    to the matching text file.

    The function reads ``detLabelFileName``, ``recLabelFileName``,
    ``recImageDirName`` and ``trainValTestRatio`` from the module-level
    ``args`` namespace.

    Args:
        root: Dataset directory that contains the annotation file.
        absTrainRootPath: Folder that receives the training images.
        absValRootPath: Folder that receives the validation images.
        absTestRootPath: Folder that receives the test images.
        trainTxt: Writable text file object for the training records.
        valTxt: Writable text file object for the validation records.
        testTxt: Writable text file object for the test records.
        flag: ``"det"`` for detection data or ``"rec"`` for recognition data.

    Raises:
        ValueError: If ``flag`` is neither ``"det"`` nor ``"rec"``.
    """
    if flag == "det":
        labelFileName = args.detLabelFileName
    elif flag == "rec":
        labelFileName = args.recLabelFileName
    else:
        raise ValueError("flag must be 'det' or 'rec', got {!r}".format(flag))

    dataAbsPath = os.path.abspath(root)
    labelFilePath = os.path.join(dataAbsPath, labelFileName)

    # Close the label file as soon as it is read; blank lines carry no record
    # and would otherwise fail when split into fields.
    with open(labelFilePath, "r", encoding="UTF-8") as labelFileRead:
        labelFileContent = [line for line in labelFileRead if line.strip()]
    random.shuffle(labelFileContent)
    labelRecordLen = len(labelFileContent)

    # The ratio is loop-invariant, so parse it once. Each part is expressed in
    # tenths (e.g. "6:2:2"); the test share is whatever remains after train
    # and val. float() replaces eval(): the parts are plain numbers and there
    # is no reason to execute command-line text as code.
    trainValTestRatio = args.trainValTestRatio.split(":")
    trainRatio = float(trainValTestRatio[0]) / 10
    valRatio = trainRatio + float(trainValTestRatio[1]) / 10

    for index, labelRecordInfo in enumerate(labelFileContent):
        fields = labelRecordInfo.split('\t')
        imageRelativePath = fields[0]
        # Drop the line ending here and add it back when writing, so a record
        # that had none (the last line of the file) cannot be glued to the
        # next record after shuffling.
        imageLabel = fields[1].rstrip("\n")
        imageName = os.path.basename(imageRelativePath)

        if flag == "det":
            imagePath = os.path.join(dataAbsPath, imageName)
        else:
            # os.path.join picks the platform separator; a hard-coded
            # backslash only resolves on Windows.
            imagePath = os.path.join(dataAbsPath, args.recImageDirName, imageName)

        # Position of this record within the shuffled list, in [0, 1).
        curRatio = index / labelRecordLen
        if curRatio < trainRatio:
            targetRoot, targetTxt = absTrainRootPath, trainTxt
        elif curRatio < valRatio:
            targetRoot, targetTxt = absValRootPath, valTxt
        else:
            targetRoot, targetTxt = absTestRootPath, testTxt

        imageCopyPath = os.path.join(targetRoot, imageName)
        shutil.copy(imagePath, imageCopyPath)
        targetTxt.write("{}\t{}\n".format(imageCopyPath, imageLabel))
|
2021-09-29 19:27:46 +08:00
|
|
|
|
|
|
|
|
|
|
2021-10-08 15:31:48 +08:00
|
|
|
|
# Delete the file if it exists
|
|
|
|
|
def removeFile(path):
    """Delete the file at ``path``; do nothing if it does not exist.

    Args:
        path: Path of the file to delete.
    """
    # Attempt the removal and tolerate a missing file instead of checking
    # os.path.exists() first: the check-then-remove sequence can fail if the
    # file disappears between the two calls.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
|
|
|
|
|
|
|
|
|
|
|
2021-09-29 19:27:46 +08:00
|
|
|
|
def genDetRecTrainVal(args):
    """Generate the detection and recognition train/val/test splits.

    Recreates empty ``train``/``val``/``test`` folders under
    ``args.detRootPath`` and ``args.recRootPath``, replaces the
    ``train.txt``/``val.txt``/``test.txt`` files in both locations, then
    splits the detection data of ``args.datasetRootPath`` and, when that
    directory directly contains the cropped-image folder, its recognition
    data as well.

    Args:
        args: Parsed command-line namespace providing ``detRootPath``,
            ``recRootPath``, ``datasetRootPath`` and ``recImageDirName``.
    """
    splitNames = ("train", "val", "test")

    detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath = [
        isCreateOrDeleteFolder(args.detRootPath, name) for name in splitNames
    ]
    recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath = [
        isCreateOrDeleteFolder(args.recRootPath, name) for name in splitNames
    ]

    # Detection files first, then recognition files, each in train/val/test order.
    txtPaths = [
        os.path.join(rootPath, name + ".txt")
        for rootPath in (args.detRootPath, args.recRootPath)
        for name in splitNames
    ]

    # Remove every stale record file before any of them is reopened.
    for txtPath in txtPaths:
        removeFile(txtPath)

    txtFiles = []
    try:
        for txtPath in txtPaths:
            txtFiles.append(open(txtPath, "a", encoding="UTF-8"))
        detTrainTxt, detValTxt, detTestTxt, recTrainTxt, recValTxt, recTestTxt = txtFiles

        splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath,
                      detTrainTxt, detValTxt, detTestTxt, "det")

        for root, dirs, _files in os.walk(args.datasetRootPath):
            # Match the configured crop folder name rather than a hard-coded
            # 'crop_img', so the check agrees with --recImageDirName.
            if args.recImageDirName in dirs:
                splitTrainVal(root, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath,
                              recTrainTxt, recValTxt, recTestTxt, "rec")
            # Only the top level of the dataset directory is inspected.
            break
    finally:
        # Always release the record files so buffered lines reach the disk,
        # even when a split fails part-way through.
        for txtFile in txtFiles:
            txtFile.close()
|
|
|
|
|
|
|
|
|
|
|
2022-02-10 22:40:19 +08:00
|
|
|
|
|
2021-09-29 19:27:46 +08:00
|
|
|
|
if __name__ == "__main__":
    # Purpose: split the detection data and the recognition data into
    # training, validation and test sets.
    # Note: adjust the parameters to your own paths and needs. Image data is
    # often annotated in batches by several people, each batch kept in its own
    # folder and labelled with PPOCRLabel, which creates the need to gather
    # several annotated image folders and split them into train/val/test sets.

    # (option name, default value, help text) for every command-line option;
    # all of them are plain strings.
    optionSpecs = (
        ("--trainValTestRatio", "6:2:2", "ratio of trainset:valset:testset"),
        ("--datasetRootPath", "../train_data/",
         "path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..."),
        ("--detRootPath", "../train_data/det",
         "the path where the divided detection dataset is placed"),
        ("--recRootPath", "../train_data/rec",
         "the path where the divided recognition dataset is placed"),
        ("--detLabelFileName", "Label.txt",
         "the name of the detection annotation file"),
        ("--recLabelFileName", "rec_gt.txt",
         "the name of the recognition annotation file"),
        ("--recImageDirName", "crop_img",
         "the name of the folder where the cropped recognition dataset is located"),
    )

    parser = argparse.ArgumentParser()
    for optionName, optionDefault, optionHelp in optionSpecs:
        parser.add_argument(optionName, type=str, default=optionDefault, help=optionHelp)

    # Keep the name `args` at module level: splitTrainVal reads it as a global.
    args = parser.parse_args()
    genDetRecTrainVal(args)
|