from __future__ import print_function, absolute_import
import os
import glob
import re
import sys
import urllib
import tarfile
import zipfile
import os.path as osp

from scipy.io import loadmat
import numpy as np
import h5py
from scipy.misc import imsave  # note: removed in SciPy >= 1.2; imageio.imwrite is a drop-in replacement

from utils.iotools import mkdir_if_missing, write_json, read_json
from .base import BaseImgDataset


class CUHK03(BaseImgDataset):
    """
    CUHK03

    Reference:
    Li et al. DeepReID: Deep Filter Pairing Neural Network for Person Re-identification. CVPR 2014.

    URL: http://www.ee.cuhk.edu.hk/~xgwang/CUHK_identification.html#!

    Dataset statistics:
    # identities: 1360
    # images: 13164
    # cameras: 6
    # splits: 20 (classic)

    Args:
        split_id (int): split index (default: 0)
        cuhk03_labeled (bool): whether to load labeled images; if False, detected images are loaded (default: False)
        cuhk03_classic_split (bool): whether to use the classic 20 splits of Li et al. CVPR'14; if False, the new 767/700 split is used (default: False)
    """
    dataset_dir = 'cuhk03'

    def __init__(self, root='data', split_id=0, cuhk03_labeled=False, cuhk03_classic_split=False,
                 verbose=True, use_lmdb=False, **kwargs):
        super(CUHK03, self).__init__()
        self.dataset_dir = osp.join(root, self.dataset_dir)
        self.data_dir = osp.join(self.dataset_dir, 'cuhk03_release')
        self.raw_mat_path = osp.join(self.data_dir, 'cuhk-03.mat')

        self.imgs_detected_dir = osp.join(self.dataset_dir, 'images_detected')
        self.imgs_labeled_dir = osp.join(self.dataset_dir, 'images_labeled')

        self.split_classic_det_json_path = osp.join(self.dataset_dir, 'splits_classic_detected.json')
        self.split_classic_lab_json_path = osp.join(self.dataset_dir, 'splits_classic_labeled.json')

        self.split_new_det_json_path = osp.join(self.dataset_dir, 'splits_new_detected.json')
        self.split_new_lab_json_path = osp.join(self.dataset_dir, 'splits_new_labeled.json')

        self.split_new_det_mat_path = osp.join(self.dataset_dir, 'cuhk03_new_protocol_config_detected.mat')
        self.split_new_lab_mat_path = osp.join(self.dataset_dir, 'cuhk03_new_protocol_config_labeled.mat')

        self._check_before_run()
        self._preprocess()

        if cuhk03_labeled:
            image_type = 'labeled'
            split_path = self.split_classic_lab_json_path if cuhk03_classic_split else self.split_new_lab_json_path
        else:
            image_type = 'detected'
            split_path = self.split_classic_det_json_path if cuhk03_classic_split else self.split_new_det_json_path

        splits = read_json(split_path)
        assert split_id < len(splits), "Condition split_id ({}) < len(splits) ({}) is false".format(split_id, len(splits))
        split = splits[split_id]
        print("Split index = {}".format(split_id))

        train = split['train']
        query = split['query']
        gallery = split['gallery']

        num_train_pids = split['num_train_pids']
        num_query_pids = split['num_query_pids']
        num_gallery_pids = split['num_gallery_pids']
        num_total_pids = num_train_pids + num_query_pids

        num_train_imgs = split['num_train_imgs']
        num_query_imgs = split['num_query_imgs']
        num_gallery_imgs = split['num_gallery_imgs']
        num_total_imgs = num_train_imgs + num_query_imgs

        if verbose:
            print("=> CUHK03 ({}) loaded".format(image_type))
            print("Dataset statistics:")
            print("  ------------------------------")
            print("  subset   | # ids | # images")
            print("  ------------------------------")
            print("  train    | {:5d} | {:8d}".format(num_train_pids, num_train_imgs))
            print("  query    | {:5d} | {:8d}".format(num_query_pids, num_query_imgs))
            print("  gallery  | {:5d} | {:8d}".format(num_gallery_pids, num_gallery_imgs))
            print("  ------------------------------")
            print("  total    | {:5d} | {:8d}".format(num_total_pids, num_total_imgs))
            print("  ------------------------------")

        self.train = train
        self.query = query
        self.gallery = gallery

        self.num_train_pids = num_train_pids
        self.num_query_pids = num_query_pids
        self.num_gallery_pids = num_gallery_pids

        if use_lmdb:
            self.generate_lmdb()

    def _check_before_run(self):
        """Check if all files are available before going deeper"""
        if not osp.exists(self.dataset_dir):
            raise RuntimeError("'{}' is not available".format(self.dataset_dir))
        if not osp.exists(self.data_dir):
            raise RuntimeError("'{}' is not available".format(self.data_dir))
        if not osp.exists(self.raw_mat_path):
            raise RuntimeError("'{}' is not available".format(self.raw_mat_path))
        if not osp.exists(self.split_new_det_mat_path):
            raise RuntimeError("'{}' is not available".format(self.split_new_det_mat_path))
        if not osp.exists(self.split_new_lab_mat_path):
            raise RuntimeError("'{}' is not available".format(self.split_new_lab_mat_path))

    def _preprocess(self):
        """
        This function is a bit complex and ugly; what it does is:
        1. Extract data from cuhk-03.mat and save as png images.
        2. Create 20 classic splits. (Li et al. CVPR'14)
        3. Create new split. (Zhong et al. CVPR'17)
        """
        print("Note: if root path is changed, the previously generated json files need to be re-generated (delete them first)")
        if osp.exists(self.imgs_labeled_dir) and \
           osp.exists(self.imgs_detected_dir) and \
           osp.exists(self.split_classic_det_json_path) and \
           osp.exists(self.split_classic_lab_json_path) and \
           osp.exists(self.split_new_det_json_path) and \
           osp.exists(self.split_new_lab_json_path):
            return

        mkdir_if_missing(self.imgs_detected_dir)
        mkdir_if_missing(self.imgs_labeled_dir)

        print("Extract image data from {} and save as png".format(self.raw_mat_path))
        mat = h5py.File(self.raw_mat_path, 'r')

        def _deref(ref):
            return mat[ref][:].T

        def _process_images(img_refs, campid, pid, save_dir):
            img_paths = []  # Note: some persons only have images for one view
            for imgid, img_ref in enumerate(img_refs):
                img = _deref(img_ref)
                # skip empty cell
                if img.size == 0 or img.ndim < 3:
                    continue
                # images are saved with the following name format (indices are 1-based to ensure uniqueness):
                # campid: index of camera pair (1-5)
                # pid: index of person in 'campid'-th camera pair
                # viewid: index of view, {1, 2}
                # imgid: index of image, (1-10)
                viewid = 1 if imgid < 5 else 2
                img_name = '{:01d}_{:03d}_{:01d}_{:02d}.png'.format(campid+1, pid+1, viewid, imgid+1)
                img_path = osp.join(save_dir, img_name)
                imsave(img_path, img)
                img_paths.append(img_path)
            return img_paths

        def _extract_img(name):
            print("Processing {} images (extract and save) ...".format(name))
            meta_data = []
            imgs_dir = self.imgs_detected_dir if name == 'detected' else self.imgs_labeled_dir
            for campid, camp_ref in enumerate(mat[name][0]):
                camp = _deref(camp_ref)
                num_pids = camp.shape[0]
                for pid in range(num_pids):
                    img_paths = _process_images(camp[pid, :], campid, pid, imgs_dir)
                    assert len(img_paths) > 0, "campid{}-pid{} has no images".format(campid, pid)
                    meta_data.append((campid+1, pid+1, img_paths))
                print("done camera pair {} with {} identities".format(campid+1, num_pids))
            return meta_data

        meta_detected = _extract_img('detected')
        meta_labeled = _extract_img('labeled')

        def _extract_classic_split(meta_data, test_split):
            train, test = [], []
            num_train_pids, num_test_pids = 0, 0
            num_train_imgs, num_test_imgs = 0, 0
            for i, (campid, pid, img_paths) in enumerate(meta_data):
                if [campid, pid] in test_split:
                    for img_path in img_paths:
                        camid = int(osp.basename(img_path).split('_')[2])
                        test.append((img_path, num_test_pids, camid))
                    num_test_pids += 1
                    num_test_imgs += len(img_paths)
                else:
                    for img_path in img_paths:
                        camid = int(osp.basename(img_path).split('_')[2])
                        train.append((img_path, num_train_pids, camid))
                    num_train_pids += 1
                    num_train_imgs += len(img_paths)
            return train, num_train_pids, num_train_imgs, test, num_test_pids, num_test_imgs

        print("Creating classic splits (# = 20) ...")
        splits_classic_det, splits_classic_lab = [], []
        for split_ref in mat['testsets'][0]:
            test_split = _deref(split_ref).tolist()

            # create split for detected images
            train, num_train_pids, num_train_imgs, test, num_test_pids, num_test_imgs = \
                _extract_classic_split(meta_detected, test_split)
            splits_classic_det.append({
                'train': train, 'query': test, 'gallery': test,
                'num_train_pids': num_train_pids, 'num_train_imgs': num_train_imgs,
                'num_query_pids': num_test_pids, 'num_query_imgs': num_test_imgs,
                'num_gallery_pids': num_test_pids, 'num_gallery_imgs': num_test_imgs,
            })

            # create split for labeled images
            train, num_train_pids, num_train_imgs, test, num_test_pids, num_test_imgs = \
                _extract_classic_split(meta_labeled, test_split)
            splits_classic_lab.append({
                'train': train, 'query': test, 'gallery': test,
                'num_train_pids': num_train_pids, 'num_train_imgs': num_train_imgs,
                'num_query_pids': num_test_pids, 'num_query_imgs': num_test_imgs,
                'num_gallery_pids': num_test_pids, 'num_gallery_imgs': num_test_imgs,
            })

        write_json(splits_classic_det, self.split_classic_det_json_path)
        write_json(splits_classic_lab, self.split_classic_lab_json_path)

        def _extract_set(filelist, pids, pid2label, idxs, img_dir, relabel):
            tmp_set = []
            unique_pids = set()
            for idx in idxs:
                img_name = filelist[idx][0]
                camid = int(img_name.split('_')[2])
                pid = pids[idx]
                if relabel:
                    pid = pid2label[pid]
                img_path = osp.join(img_dir, img_name)
                tmp_set.append((img_path, int(pid), camid))
                unique_pids.add(pid)
            return tmp_set, len(unique_pids), len(idxs)

        def _extract_new_split(split_dict, img_dir):
            train_idxs = split_dict['train_idx'].flatten() - 1  # index-0
            pids = split_dict['labels'].flatten()
            train_pids = set(pids[train_idxs])
            pid2label = {pid: label for label, pid in enumerate(train_pids)}
            query_idxs = split_dict['query_idx'].flatten() - 1
            gallery_idxs = split_dict['gallery_idx'].flatten() - 1
            filelist = split_dict['filelist'].flatten()
            train_info = _extract_set(filelist, pids, pid2label, train_idxs, img_dir, relabel=True)
            query_info = _extract_set(filelist, pids, pid2label, query_idxs, img_dir, relabel=False)
            gallery_info = _extract_set(filelist, pids, pid2label, gallery_idxs, img_dir, relabel=False)
            return train_info, query_info, gallery_info

        print("Creating new splits for detected images (767/700) ...")
        train_info, query_info, gallery_info = _extract_new_split(
            loadmat(self.split_new_det_mat_path),
            self.imgs_detected_dir,
        )
        splits = [{
            'train': train_info[0], 'query': query_info[0], 'gallery': gallery_info[0],
            'num_train_pids': train_info[1], 'num_train_imgs': train_info[2],
            'num_query_pids': query_info[1], 'num_query_imgs': query_info[2],
            'num_gallery_pids': gallery_info[1], 'num_gallery_imgs': gallery_info[2],
        }]
        write_json(splits, self.split_new_det_json_path)

        print("Creating new splits for labeled images (767/700) ...")
        train_info, query_info, gallery_info = _extract_new_split(
            loadmat(self.split_new_lab_mat_path),
            self.imgs_labeled_dir,
        )
        splits = [{
            'train': train_info[0], 'query': query_info[0], 'gallery': gallery_info[0],
            'num_train_pids': train_info[1], 'num_train_imgs': train_info[2],
            'num_query_pids': query_info[1], 'num_query_imgs': query_info[2],
            'num_gallery_pids': gallery_info[1], 'num_gallery_imgs': gallery_info[2],
        }]
        write_json(splits, self.split_new_lab_json_path)
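

# -----------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal example of how this
# dataset class might be instantiated. It assumes the raw files have already
# been placed manually under <root>/cuhk03/cuhk03_release/ (cuhk-03.mat plus the
# two new-protocol config .mat files), since _check_before_run() only verifies
# their presence and _preprocess() does not download anything. Because the
# module uses a relative import of BaseImgDataset, it would need to be run as
# part of its package (e.g. `python -m <package>.cuhk03`); the 'data' root path
# below is only a placeholder.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    # new 767/700 protocol on detected images (the default configuration)
    dataset = CUHK03(root='data', split_id=0, cuhk03_labeled=False,
                     cuhk03_classic_split=False, verbose=True)
    # each entry of train/query/gallery is an (img_path, pid, camid) tuple
    print(dataset.train[0])
    print("train identities:", dataset.num_train_pids)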