Improved `dataset_stats()` YAML checks (#8125)

* Update dataloaders.py

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
pull/8186/head
Glenn Jocher 2022-06-11 19:30:54 +02:00 committed by GitHub
parent 6adc53ba5f
commit c23a441c9d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 23 additions and 12 deletions

View File

@ -859,7 +859,7 @@ def flatten_recursive(path=DATASETS_DIR / 'coco128'):
shutil.copyfile(file, new_path / Path(file).name) shutil.copyfile(file, new_path / Path(file).name)
def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.datasets import *; extract_boxes() def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.dataloaders import *; extract_boxes()
# Convert detection dataset into classification dataset, with one directory per class # Convert detection dataset into classification dataset, with one directory per class
path = Path(path) # images dir path = Path(path) # images dir
shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing
@ -895,7 +895,7 @@ def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.datasets import
def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False): def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
""" Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files
Usage: from utils.datasets import *; autosplit() Usage: from utils.dataloaders import *; autosplit()
Arguments Arguments
path: Path to images directory path: Path to images directory
weights: Train, val, test weights (list, tuple) weights: Train, val, test weights (list, tuple)
@ -972,29 +972,40 @@ def verify_image_label(args):
def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False): def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
""" Return dataset statistics dictionary with images and instances counts per split per class """ Return dataset statistics dictionary with images and instances counts per split per class
To run in parent directory: export PYTHONPATH="$PWD/yolov5" To run in parent directory: export PYTHONPATH="$PWD/yolov5"
Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', autodownload=True) Usage1: from utils.dataloaders import *; dataset_stats('coco128.yaml', autodownload=True)
Usage2: from utils.datasets import *; dataset_stats('path/to/coco128_with_yaml.zip') Usage2: from utils.dataloaders import *; dataset_stats('path/to/coco128_with_yaml.zip')
Arguments Arguments
path: Path to data.yaml or data.zip (with data.yaml inside data.zip) path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
autodownload: Attempt to download dataset if not found locally autodownload: Attempt to download dataset if not found locally
verbose: Print stats dictionary verbose: Print stats dictionary
""" """
def round_labels(labels): def _round_labels(labels):
# Update labels to integer class and 4 decimal place floats # Update labels to integer class and 4 decimal place floats
return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels] return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
def unzip(path): def _find_yaml(dir):
# Unzip data.zip TODO: CONSTRAINT: path/to/abc.zip MUST unzip to 'path/to/abc/' # Return data.yaml file
files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
assert files, f'No *.yaml file found in {dir}'
if len(files) > 1:
files = [f for f in files if f.stem == dir.stem] # prefer *.yaml files that match dir name
assert files, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed'
assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
return files[0]
def _unzip(path):
# Unzip data.zip
if str(path).endswith('.zip'): # path is data.zip if str(path).endswith('.zip'): # path is data.zip
assert Path(path).is_file(), f'Error unzipping {path}, file not found' assert Path(path).is_file(), f'Error unzipping {path}, file not found'
ZipFile(path).extractall(path=path.parent) # unzip ZipFile(path).extractall(path=path.parent) # unzip
dir = path.with_suffix('') # dataset directory == zip name dir = path.with_suffix('') # dataset directory == zip name
return True, str(dir), next(dir.rglob('*.yaml')) # zipped, data_dir, yaml_path assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
return True, str(dir), _find_yaml(dir) # zipped, data_dir, yaml_path
else: # path is data.yaml else: # path is data.yaml
return False, None, path return False, None, path
def hub_ops(f, max_dim=1920): def _hub_ops(f, max_dim=1920):
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing # HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
f_new = im_dir / Path(f).name # dataset-hub image filename f_new = im_dir / Path(f).name # dataset-hub image filename
try: # use PIL try: # use PIL
@ -1012,7 +1023,7 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA) im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
cv2.imwrite(str(f_new), im) cv2.imwrite(str(f_new), im)
zipped, data_dir, yaml_path = unzip(Path(path)) zipped, data_dir, yaml_path = _unzip(Path(path))
with open(check_yaml(yaml_path), errors='ignore') as f: with open(check_yaml(yaml_path), errors='ignore') as f:
data = yaml.safe_load(f) # data dict data = yaml.safe_load(f) # data dict
if zipped: if zipped:
@ -1038,12 +1049,12 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
'unlabelled': int(np.all(x == 0, 1).sum()), 'unlabelled': int(np.all(x == 0, 1).sum()),
'per_class': (x > 0).sum(0).tolist()}, 'per_class': (x > 0).sum(0).tolist()},
'labels': [{ 'labels': [{
str(Path(k).name): round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]} str(Path(k).name): _round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
if hub: if hub:
im_dir = hub_dir / 'images' im_dir = hub_dir / 'images'
im_dir.mkdir(parents=True, exist_ok=True) im_dir.mkdir(parents=True, exist_ok=True)
for _ in tqdm(ThreadPool(NUM_THREADS).imap(hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'): for _ in tqdm(ThreadPool(NUM_THREADS).imap(_hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
pass pass
# Profile # Profile