修改完成:划分det与rec数据集脚本,以及对应的中英文文档指令及其数据集存放树状结构。已提交过release2.4分支
parent
efc0908277
commit
f58d8d0372
|
@ -200,18 +200,28 @@ For some data that are difficult to recognize, the recognition results will not
|
|||
|
||||
```
|
||||
cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder
|
||||
python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec
|
||||
python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data
|
||||
```
|
||||
|
||||
Parameter Description:
|
||||
|
||||
- `trainValTestRatio` is the ratio in which the images are divided among the training set, validation set, and test set. Set it according to your actual situation; the default is `6:2:2`.
|
||||
|
||||
- `labelRootPath` is the storage path of the dataset labeled by PPOCRLabel, the default is `../train_data/label`
|
||||
|
||||
- `detRootPath` is the path where the text detection dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/det`
|
||||
|
||||
- `recRootPath` is the path where the character recognition dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/rec`
|
||||
- `datasetRootPath` is the storage path of the complete dataset labeled by PPOCRLabel. The default path is `PaddleOCR/train_data`. Before splitting, the dataset should have the following structure:
|
||||
```
|
||||
|-train_data
|
||||
|-crop_img
|
||||
|- word_001_crop_0.png
|
||||
|- word_002_crop_0.jpg
|
||||
|- word_003_crop_0.jpg
|
||||
| ...
|
||||
| Label.txt
|
||||
| rec_gt.txt
|
||||
|- word_001.png
|
||||
|- word_002.jpg
|
||||
|- word_003.jpg
|
||||
| ...
|
||||
```
|
||||
|
||||
### 3.6 Error message
|
||||
|
||||
|
|
|
@ -185,18 +185,28 @@ PPOCRLabel支持三种导出方式:
|
|||
|
||||
```
|
||||
cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下
|
||||
python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec
|
||||
python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data
|
||||
```
|
||||
|
||||
参数说明:
|
||||
|
||||
- `trainValTestRatio` 是训练集、验证集、测试集的图像数量划分比例,根据实际情况设定,默认是`6:2:2`
|
||||
|
||||
- `labelRootPath` 是PPOCRLabel标注的数据集存放路径,默认是`../train_data/label`
|
||||
|
||||
- `detRootPath` 是根据PPOCRLabel标注的数据集划分后的文本检测数据集存放的路径,默认是`../train_data/det`
|
||||
|
||||
- `recRootPath` 是根据PPOCRLabel标注的数据集划分后的字符识别数据集存放的路径,默认是`../train_data/rec`
|
||||
- `datasetRootPath` 是PPOCRLabel标注的完整数据集存放路径。默认路径是 `PaddleOCR/train_data`。分割数据集前应有如下结构:
|
||||
```
|
||||
|-train_data
|
||||
|-crop_img
|
||||
|- word_001_crop_0.png
|
||||
|- word_002_crop_0.jpg
|
||||
|- word_003_crop_0.jpg
|
||||
| ...
|
||||
| Label.txt
|
||||
| rec_gt.txt
|
||||
|- word_001.png
|
||||
|- word_002.jpg
|
||||
|- word_003.jpg
|
||||
| ...
|
||||
```
|
||||
|
||||
### 3.6 错误提示
|
||||
|
||||
|
|
|
@ -17,15 +17,14 @@ def isCreateOrDeleteFolder(path, flag):
|
|||
return flagAbsPath
|
||||
|
||||
|
||||
def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag):
|
||||
def splitTrainVal(root, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag):
|
||||
# 按照指定的比例划分训练集、验证集、测试集
|
||||
labelPath = os.path.join(root, dir)
|
||||
labelAbsPath = os.path.abspath(labelPath)
|
||||
dataAbsPath = os.path.abspath(root)
|
||||
|
||||
if flag == "det":
|
||||
labelFilePath = os.path.join(labelAbsPath, args.detLabelFileName)
|
||||
labelFilePath = os.path.join(dataAbsPath, args.detLabelFileName)
|
||||
elif flag == "rec":
|
||||
labelFilePath = os.path.join(labelAbsPath, args.recLabelFileName)
|
||||
labelFilePath = os.path.join(dataAbsPath, args.recLabelFileName)
|
||||
|
||||
labelFileRead = open(labelFilePath, "r", encoding="UTF-8")
|
||||
labelFileContent = labelFileRead.readlines()
|
||||
|
@ -38,9 +37,9 @@ def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath,
|
|||
imageName = os.path.basename(imageRelativePath)
|
||||
|
||||
if flag == "det":
|
||||
imagePath = os.path.join(labelAbsPath, imageName)
|
||||
imagePath = os.path.join(dataAbsPath, imageName)
|
||||
elif flag == "rec":
|
||||
imagePath = os.path.join(labelAbsPath, "{}\\{}".format(args.recImageDirName, imageName))
|
||||
imagePath = os.path.join(dataAbsPath, "{}\\{}".format(args.recImageDirName, imageName))
|
||||
|
||||
# 按预设的比例划分训练集、验证集、测试集
|
||||
trainValTestRatio = args.trainValTestRatio.split(":")
|
||||
|
@ -90,15 +89,20 @@ def genDetRecTrainVal(args):
|
|||
recValTxt = open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8")
|
||||
recTestTxt = open(os.path.join(args.recRootPath, "test.txt"), "a", encoding="UTF-8")
|
||||
|
||||
for root, dirs, files in os.walk(args.labelRootPath):
|
||||
for dir in dirs:
|
||||
splitTrainVal(root, dir, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt,
|
||||
splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt,
|
||||
detTestTxt, "det")
|
||||
splitTrainVal(root, dir, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt,
|
||||
|
||||
for root, dirs, files in os.walk(args.datasetRootPath):
|
||||
for dir in dirs:
|
||||
if dir == 'crop_img':
|
||||
splitTrainVal(root, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt,
|
||||
recTestTxt, "rec")
|
||||
else:
|
||||
continue
|
||||
break
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 功能描述:分别划分检测和识别的训练集、验证集、测试集
|
||||
# 说明:可以根据自己的路径和需求调整参数,图像数据往往多人合作分批标注,每一批图像数据放在一个文件夹内用PPOCRLabel进行标注,
|
||||
|
@ -110,9 +114,9 @@ if __name__ == "__main__":
|
|||
default="6:2:2",
|
||||
help="ratio of trainset:valset:testset")
|
||||
parser.add_argument(
|
||||
"--labelRootPath",
|
||||
"--datasetRootPath",
|
||||
type=str,
|
||||
default="../train_data/label",
|
||||
default="../train_data/",
|
||||
help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..."
|
||||
)
|
||||
parser.add_argument(
|
||||
|
|
Loading…
Reference in New Issue