修改完成:划分det与rec数据集脚本,以及对应的中英文文档指令及其数据集存放树状结构。已提交过release2.4分支
parent
efc0908277
commit
f58d8d0372
|
@ -198,21 +198,31 @@ For some data that are difficult to recognize, the recognition results will not
|
||||||
|
|
||||||
- Enter the following command in the terminal to execute the dataset division script:
|
- Enter the following command in the terminal to execute the dataset division script:
|
||||||
|
|
||||||
```
|
```
|
||||||
cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder
|
cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder
|
||||||
python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec
|
python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data
|
||||||
```
|
```
|
||||||
|
|
||||||
Parameter Description:
|
Parameter Description:
|
||||||
|
|
||||||
- `trainValTestRatio` is the ratio in which the images are divided among the training set, validation set, and test set. Set it according to your actual situation; the default is `6:2:2`.
|
- `trainValTestRatio` is the ratio in which the images are divided among the training set, validation set, and test set. Set it according to your actual situation; the default is `6:2:2`.
|
||||||
|
|
||||||
- `labelRootPath` is the storage path of the dataset labeled by PPOCRLabel, the default is `../train_data/label`
|
- `datasetRootPath` is the storage path of the complete dataset labeled by PPOCRLabel. The default path is `PaddleOCR/train_data`. Before splitting, the dataset should have the following structure:
|
||||||
|
```
|
||||||
- `detRootPath` is the path where the text detection dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/det`
|
|-train_data
|
||||||
|
|-crop_img
|
||||||
- `recRootPath` is the path where the character recognition dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/rec`
|
|- word_001_crop_0.png
|
||||||
|
|- word_002_crop_0.jpg
|
||||||
|
|- word_003_crop_0.jpg
|
||||||
|
| ...
|
||||||
|
| Label.txt
|
||||||
|
| rec_gt.txt
|
||||||
|
|- word_001.png
|
||||||
|
|- word_002.jpg
|
||||||
|
|- word_003.jpg
|
||||||
|
| ...
|
||||||
|
```
|
||||||
|
|
||||||
### 3.6 Error message
|
### 3.6 Error message
|
||||||
|
|
||||||
- If paddleocr is installed with whl, it has a higher priority than calling PaddleOCR class with paddleocr.py, which may cause an exception if whl package is not updated.
|
- If paddleocr is installed with whl, it has a higher priority than calling PaddleOCR class with paddleocr.py, which may cause an exception if whl package is not updated.
|
||||||
|
|
|
@ -185,19 +185,29 @@ PPOCRLabel支持三种导出方式:
|
||||||
|
|
||||||
```
|
```
|
||||||
cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下
|
cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下
|
||||||
python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec
|
python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data
|
||||||
```
|
```
|
||||||
|
|
||||||
参数说明:
|
参数说明:
|
||||||
|
|
||||||
- `trainValTestRatio` 是训练集、验证集、测试集的图像数量划分比例,根据实际情况设定,默认是`6:2:2`
|
- `trainValTestRatio` 是训练集、验证集、测试集的图像数量划分比例,根据实际情况设定,默认是`6:2:2`
|
||||||
|
|
||||||
- `labelRootPath` 是PPOCRLabel标注的数据集存放路径,默认是`../train_data/label`
|
- `datasetRootPath` 是PPOCRLabel标注的完整数据集存放路径。默认路径是 `PaddleOCR/train_data`。分割数据集前应有如下结构:
|
||||||
|
```
|
||||||
- `detRootPath` 是根据PPOCRLabel标注的数据集划分后的文本检测数据集存放的路径,默认是`../train_data/det `
|
|-train_data
|
||||||
|
|-crop_img
|
||||||
- `recRootPath` 是根据PPOCRLabel标注的数据集划分后的字符识别数据集存放的路径,默认是`../train_data/rec`
|
|- word_001_crop_0.png
|
||||||
|
|- word_002_crop_0.jpg
|
||||||
|
|- word_003_crop_0.jpg
|
||||||
|
| ...
|
||||||
|
| Label.txt
|
||||||
|
| rec_gt.txt
|
||||||
|
|- word_001.png
|
||||||
|
|- word_002.jpg
|
||||||
|
|- word_003.jpg
|
||||||
|
| ...
|
||||||
|
```
|
||||||
|
|
||||||
### 3.6 错误提示
|
### 3.6 错误提示
|
||||||
|
|
||||||
- 如果同时使用whl包安装了paddleocr,其优先级大于通过paddleocr.py调用PaddleOCR类,whl包未更新时会导致程序异常。
|
- 如果同时使用whl包安装了paddleocr,其优先级大于通过paddleocr.py调用PaddleOCR类,whl包未更新时会导致程序异常。
|
||||||
|
|
|
@ -17,15 +17,14 @@ def isCreateOrDeleteFolder(path, flag):
|
||||||
return flagAbsPath
|
return flagAbsPath
|
||||||
|
|
||||||
|
|
||||||
def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag):
|
def splitTrainVal(root, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag):
|
||||||
# 按照指定的比例划分训练集、验证集、测试集
|
# 按照指定的比例划分训练集、验证集、测试集
|
||||||
labelPath = os.path.join(root, dir)
|
dataAbsPath = os.path.abspath(root)
|
||||||
labelAbsPath = os.path.abspath(labelPath)
|
|
||||||
|
|
||||||
if flag == "det":
|
if flag == "det":
|
||||||
labelFilePath = os.path.join(labelAbsPath, args.detLabelFileName)
|
labelFilePath = os.path.join(dataAbsPath, args.detLabelFileName)
|
||||||
elif flag == "rec":
|
elif flag == "rec":
|
||||||
labelFilePath = os.path.join(labelAbsPath, args.recLabelFileName)
|
labelFilePath = os.path.join(dataAbsPath, args.recLabelFileName)
|
||||||
|
|
||||||
labelFileRead = open(labelFilePath, "r", encoding="UTF-8")
|
labelFileRead = open(labelFilePath, "r", encoding="UTF-8")
|
||||||
labelFileContent = labelFileRead.readlines()
|
labelFileContent = labelFileRead.readlines()
|
||||||
|
@ -38,9 +37,9 @@ def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath,
|
||||||
imageName = os.path.basename(imageRelativePath)
|
imageName = os.path.basename(imageRelativePath)
|
||||||
|
|
||||||
if flag == "det":
|
if flag == "det":
|
||||||
imagePath = os.path.join(labelAbsPath, imageName)
|
imagePath = os.path.join(dataAbsPath, imageName)
|
||||||
elif flag == "rec":
|
elif flag == "rec":
|
||||||
imagePath = os.path.join(labelAbsPath, "{}\\{}".format(args.recImageDirName, imageName))
|
imagePath = os.path.join(dataAbsPath, "{}\\{}".format(args.recImageDirName, imageName))
|
||||||
|
|
||||||
# 按预设的比例划分训练集、验证集、测试集
|
# 按预设的比例划分训练集、验证集、测试集
|
||||||
trainValTestRatio = args.trainValTestRatio.split(":")
|
trainValTestRatio = args.trainValTestRatio.split(":")
|
||||||
|
@ -90,15 +89,20 @@ def genDetRecTrainVal(args):
|
||||||
recValTxt = open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8")
|
recValTxt = open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8")
|
||||||
recTestTxt = open(os.path.join(args.recRootPath, "test.txt"), "a", encoding="UTF-8")
|
recTestTxt = open(os.path.join(args.recRootPath, "test.txt"), "a", encoding="UTF-8")
|
||||||
|
|
||||||
for root, dirs, files in os.walk(args.labelRootPath):
|
splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt,
|
||||||
|
detTestTxt, "det")
|
||||||
|
|
||||||
|
for root, dirs, files in os.walk(args.datasetRootPath):
|
||||||
for dir in dirs:
|
for dir in dirs:
|
||||||
splitTrainVal(root, dir, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt,
|
if dir == 'crop_img':
|
||||||
detTestTxt, "det")
|
splitTrainVal(root, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt,
|
||||||
splitTrainVal(root, dir, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt,
|
recTestTxt, "rec")
|
||||||
recTestTxt, "rec")
|
else:
|
||||||
|
continue
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# 功能描述:分别划分检测和识别的训练集、验证集、测试集
|
# 功能描述:分别划分检测和识别的训练集、验证集、测试集
|
||||||
# 说明:可以根据自己的路径和需求调整参数,图像数据往往多人合作分批标注,每一批图像数据放在一个文件夹内用PPOCRLabel进行标注,
|
# 说明:可以根据自己的路径和需求调整参数,图像数据往往多人合作分批标注,每一批图像数据放在一个文件夹内用PPOCRLabel进行标注,
|
||||||
|
@ -110,9 +114,9 @@ if __name__ == "__main__":
|
||||||
default="6:2:2",
|
default="6:2:2",
|
||||||
help="ratio of trainset:valset:testset")
|
help="ratio of trainset:valset:testset")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--labelRootPath",
|
"--datasetRootPath",
|
||||||
type=str,
|
type=str,
|
||||||
default="../train_data/label",
|
default="../train_data/",
|
||||||
help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..."
|
help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..."
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|
Loading…
Reference in New Issue