PaddleOCR/gen_ocr_train_val.py
–MrCuiHao 02bd0f0ddb 1.增加gen_ocr_train_val.py使用说明文档和文档中用到的图片
2.增加windows环境下可以直接执行gen_ocr_train_val.py的bat文件gen_ocr_train_val.bat
3.执行gen_ocr_train_val.py脚本时,如果没有划分好的train.txt和val.txt标注文件,会报没有这些文件的错误
2021-10-08 16:17:18 +08:00

121 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding:utf8
import os
import shutil
import random
import argparse
# 删除划分的训练集和验证集文件夹,重新创建一个空的文件夹
def isCreateOrDeleteFolder(path, flag):
flagPath = os.path.join(path, flag)
if os.path.exists(flagPath):
shutil.rmtree(flagPath)
os.makedirs(flagPath)
flagAbsPath = os.path.abspath(flagPath)
return flagAbsPath
def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, trainTxt, valTxt, flag):
# 按照指定的比例划分训练集和验证集
labelPath = os.path.join(root, dir)
labelAbsPath = os.path.abspath(labelPath)
if flag == "det":
labelFilePath = os.path.join(labelAbsPath, args.detLabelFileName)
elif flag == "rec":
labelFilePath = os.path.join(labelAbsPath, args.recLabelFileName)
labelFileRead = open(labelFilePath, "r", encoding="UTF-8")
labelFileContent = labelFileRead.readlines()
random.shuffle(labelFileContent)
labelRecordLen = len(labelFileContent)
for index, labelRecordInfo in enumerate(labelFileContent):
imageRelativePath = labelRecordInfo.split('\t')[0]
imageLabel = labelRecordInfo.split('\t')[1]
imageName = os.path.basename(imageRelativePath)
if flag == "det":
imagePath = os.path.join(labelAbsPath, imageName)
elif flag == "rec":
imagePath = os.path.join(labelAbsPath, "{}\\{}".format(args.recImageDirName, imageName))
# 小于划分比例trainValRatio时数据集划分到训练集否则测试集
if index / labelRecordLen < args.trainValRatio:
imageCopyPath = os.path.join(absTrainRootPath, imageName)
shutil.copy(imagePath, imageCopyPath)
trainTxt.write("{}\t{}".format(imageCopyPath, imageLabel))
else:
imageCopyPath = os.path.join(absValRootPath, imageName)
shutil.copy(imagePath, imageCopyPath)
valTxt.write("{}\t{}".format(imageCopyPath, imageLabel))
# 删掉存在的文件
def removeFile(path):
if os.path.exists(path):
os.remove(path)
def genDetRecTrainVal(args):
detAbsTrainRootPath = isCreateOrDeleteFolder(args.detRootPath, "train")
detAbsValRootPath = isCreateOrDeleteFolder(args.detRootPath, "val")
recAbsTrainRootPath = isCreateOrDeleteFolder(args.recRootPath, "train")
recAbsValRootPath = isCreateOrDeleteFolder(args.recRootPath, "val")
removeFile(os.path.join(args.detRootPath, "train.txt"))
removeFile(os.path.join(args.detRootPath, "val.txt"))
removeFile(os.path.join(args.recRootPath, "train.txt"))
removeFile(os.path.join(args.recRootPath, "val.txt"))
detTrainTxt = open(os.path.join(args.detRootPath, "train.txt"), "a", encoding="UTF-8")
detValTxt = open(os.path.join(args.detRootPath, "val.txt"), "a", encoding="UTF-8")
recTrainTxt = open(os.path.join(args.recRootPath, "train.txt"), "a", encoding="UTF-8")
recValTxt = open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8")
for root, dirs, files in os.walk(args.labelRootPath):
for dir in dirs:
splitTrainVal(root, dir, detAbsTrainRootPath, detAbsValRootPath, detTrainTxt, detValTxt, "det")
splitTrainVal(root, dir, recAbsTrainRootPath, recAbsValRootPath, recTrainTxt, recValTxt, "rec")
break
if __name__ == "__main__":
# 功能描述:分别划分检测和识别的训练集和验证集
# 说明可以根据自己的路径和需求调整参数图像数据往往多人合作分批标注每一批图像数据放在一个文件夹内用PPOCRLabel进行标注
# 如此会有多个标注好的图像文件夹汇总并划分训练集和验证集的需求
parser = argparse.ArgumentParser()
parser.add_argument(
"--trainValRatio",
type=float,
default=0.8,
help="ratio of training set to validation set")
parser.add_argument(
"--labelRootPath",
type=str,
default="./train_data/label",
help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..."
)
parser.add_argument(
"--detRootPath",
type=str,
default="./train_data/det",
help="the path where the divided detection dataset is placed")
parser.add_argument(
"--recRootPath",
type=str,
default="./train_data/rec",
help="the path where the divided recognition dataset is placed"
)
parser.add_argument(
"--detLabelFileName",
type=str,
default="Label.txt",
help="the name of the detection annotation file")
parser.add_argument(
"--recLabelFileName",
type=str,
default="rec_gt.txt",
help="the name of the recognition annotation file"
)
parser.add_argument(
"--recImageDirName",
type=str,
default="crop_img",
help="the name of the folder where the cropped recognition dataset is located"
)
args = parser.parse_args()
genDetRecTrainVal(args)