# Mirror of https://github.com/open-mmlab/mmocr.git (Python, 76 lines, 2.6 KiB)
# Copyright (c) OpenMMLab. All rights reserved.
|
|
import argparse
|
|
import os
|
|
import os.path as osp
|
|
import shutil
|
|
import xml.etree.ElementTree as ET
|
|
import zipfile
|
|
from xml.etree.ElementTree import ParseError
|
|
|
|
|
|
def _find_image(directory, stem):
    """Return the path of the image that shares ``stem``, or None.

    Args:
        directory (str): Directory that is searched for the image.
        stem (str): File name without its extension.

    Returns:
        str | None: Path of the matching ``.JPG``/``.jpg`` file, or None
        when neither exists.
    """
    # '.JPG' is probed first so that it wins when both spellings exist.
    for suffix in ('JPG', 'jpg'):
        candidate = osp.join(directory, stem + '.' + suffix)
        if osp.exists(candidate):
            return candidate
    return None


def extract(root_path):
    """Unpack the KAIST archives and gather image/annotation pairs.

    Every entry found under ``<root_path>/KAIST/<language>/<camera>`` is
    extracted into ``<root_path>/tmp/<entry name>``. Each well-formed
    ``.xml`` annotation at the top level of an extracted archive that has
    a matching ``.jpg``/``.JPG`` image is moved to
    ``<root_path>/annotations/<idx>.xml`` and the image to
    ``<root_path>/imgs/<idx>.jpg``, where ``<idx>`` is a zero-padded,
    five-digit running counter.

    Args:
        root_path (str): Directory that contains the ``KAIST`` folder.
    """
    ann_dir = osp.join(root_path, 'annotations')
    img_dir = osp.join(root_path, 'imgs')
    # shutil.move does not create missing parent directories.
    os.makedirs(ann_dir, exist_ok=True)
    os.makedirs(img_dir, exist_ok=True)

    idx = 0
    for language in ['English', 'Korean', 'Mixed']:
        for camera in ['Digital_Camera', 'Mobile_Phone']:
            crt_path = osp.join(root_path, 'KAIST', language, camera)
            # os.listdir order is arbitrary; sorting keeps the output
            # numbering reproducible between runs.
            for zip_name in sorted(os.listdir(crt_path)):
                extracted_path = osp.join(root_path, 'tmp', zip_name)
                extract_zipfile(osp.join(crt_path, zip_name), extracted_path)
                for file_name in sorted(os.listdir(extracted_path)):
                    if not file_name.endswith('.xml'):
                        continue
                    src_ann = osp.join(extracted_path, file_name)
                    # Filtering broken annotations
                    try:
                        ET.parse(src_ann)
                    except ParseError:
                        continue
                    # Swap only the extension: str.replace('xml', ...) would
                    # also rewrite an 'xml' inside the base name.
                    stem = osp.splitext(file_name)[0]
                    src_img = _find_image(extracted_path, stem)
                    if src_img is None:
                        continue
                    new_name = str(idx).zfill(5)
                    shutil.move(src_ann, osp.join(ann_dir, new_name + '.xml'))
                    shutil.move(src_img, osp.join(img_dir, new_name + '.jpg'))
                    idx += 1
|
|
|
|
|
|
def extract_zipfile(zip_path, dst_dir, delete=True):
    """Extract every member of a zip archive into ``dst_dir``.

    Args:
        zip_path (str): Path of the zip archive to unpack.
        dst_dir (str): Directory that receives the extracted members. It is
            created when missing, even if the archive is empty.
        delete (bool): Whether to remove the archive once it has been
            extracted. Defaults to True.
    """
    os.makedirs(dst_dir, exist_ok=True)
    # Close the archive before removing it: an open handle leaks a file
    # descriptor and can block the removal on platforms that lock open files.
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(dst_dir)
    if delete:
        os.remove(zip_path)
|
|
|
|
|
|
def parse_args():
    """Parse the command-line arguments of the script.

    Returns:
        argparse.Namespace: Parsed arguments; ``root_path`` holds the single
        positional argument.
    """
    arg_parser = argparse.ArgumentParser(description='Extract KAIST zips')
    arg_parser.add_argument('root_path', help='Root path of KAIST')
    return arg_parser.parse_args()
|
|
|
|
|
|
def main():
    """Run the extraction, then remove the intermediate directories.

    After ``extract`` has finished, both ``<root_path>/tmp`` and
    ``<root_path>/KAIST`` are deleted.

    Raises:
        FileNotFoundError: If the given root path does not exist.
    """
    args = parse_args()
    root_path = args.root_path
    # Raise explicitly instead of asserting: assertions are stripped when
    # Python runs with -O, which would skip this check entirely.
    if not osp.exists(root_path):
        raise FileNotFoundError(
            'Root path does not exist: {}'.format(root_path))
    extract(root_path)
    # 'tmp' only exists if at least one archive was extracted.
    tmp_dir = osp.join(root_path, 'tmp')
    if osp.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    shutil.rmtree(osp.join(root_path, 'KAIST'))
|
|
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|