deep-person-reid/torchreid/datasets/ilids.py


from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import glob
import random
import tarfile
import urllib.request
import os.path as osp
from collections import defaultdict

from torchreid.utils.iotools import mkdir_if_missing, write_json, read_json

from .bases import BaseImageDataset
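
# Directory layout implied by the paths used below (a sketch; only the
# 'Persons' folder of the extracted archive is actually read):
#
#   <root>/ilids/
#       i-LIDS_Pedestrian.tgz           # downloaded archive
#       i-LIDS_Pedestrian/Persons/      # *.jpg images; first four digits of
#                                       # each file name encode the person ID
#       splits.json                     # generated by prepare_split()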


class iLIDS(BaseImageDataset):
    """QMUL-iLIDS

    Reference:
        Zheng et al. Associating Groups of People. BMVC 2009.

    Dataset statistics:
        # identities: 119
        # images: 476
        # cameras: 8 (not explicitly provided)
    """
    dataset_dir = 'ilids'

    def __init__(self, root='data', split_id=0, verbose=True, **kwargs):
        super(iLIDS, self).__init__(root)
        self.dataset_dir = osp.join(self.root, self.dataset_dir)
        self.dataset_url = 'http://www.eecs.qmul.ac.uk/~jason/data/i-LIDS_Pedestrian.tgz'
        self.data_dir = osp.join(self.dataset_dir, 'i-LIDS_Pedestrian/Persons')
        self.split_path = osp.join(self.dataset_dir, 'splits.json')

        self.download_data()

        required_files = [
            self.dataset_dir,
            self.data_dir
        ]
        self.check_before_run(required_files)

        self.prepare_split()
        splits = read_json(self.split_path)
        if split_id >= len(splits):
            raise ValueError('split_id exceeds range, received {}, '
                             'but expected between 0 and {}'.format(split_id, len(splits) - 1))
        split = splits[split_id]

        train, query, gallery = self.process_split(split)

        if verbose:
            self.print_dataset_statistics(train, query, gallery)

        self.train = train
        self.query = query
        self.gallery = gallery

        self.num_train_pids, self.num_train_imgs, self.num_train_cams = self.get_imagedata_info(self.train)
        self.num_query_pids, self.num_query_imgs, self.num_query_cams = self.get_imagedata_info(self.query)
        self.num_gallery_pids, self.num_gallery_imgs, self.num_gallery_cams = self.get_imagedata_info(self.gallery)
    def download_data(self):
        if osp.exists(self.dataset_dir):
            return

        mkdir_if_missing(self.dataset_dir)
        fpath = osp.join(self.dataset_dir, osp.basename(self.dataset_url))

        print('Downloading QMUL-iLIDS dataset')
        urllib.request.urlretrieve(self.dataset_url, fpath)

        print('Extracting files')
        tar = tarfile.open(fpath)
        tar.extractall(path=self.dataset_dir)
        tar.close()
    def prepare_split(self):
        if not osp.exists(self.split_path):
            print('Creating splits ...')

            paths = glob.glob(osp.join(self.data_dir, '*.jpg'))
            img_names = [osp.basename(path) for path in paths]
            num_imgs = len(img_names)
            assert num_imgs == 476, 'There should be 476 images, but got {}, please check the data'.format(num_imgs)

            # group image names by person ID
            # image naming format:
            #   the first four digits denote the person ID
            #   the last four digits denote the sequence index
            pid_dict = defaultdict(list)
            for img_name in img_names:
                pid = int(img_name[:4])
                pid_dict[pid].append(img_name)

            pids = list(pid_dict.keys())
            num_pids = len(pids)
            assert num_pids == 119, 'There should be 119 identities, but got {}, please check the data'.format(num_pids)

            num_train_pids = int(num_pids * 0.5)
            num_test_pids = num_pids - num_train_pids  # supposed to be 60

            splits = []
            for _ in range(10):
                # randomly choose num_train_pids train IDs and num_test_pids test IDs
                pids_copy = copy.deepcopy(pids)
                random.shuffle(pids_copy)
                train_pids = pids_copy[:num_train_pids]
                test_pids = pids_copy[num_train_pids:]

                train = []
                query = []
                gallery = []

                # for train IDs, all images are used in the train set
                for pid in train_pids:
                    img_names = pid_dict[pid]
                    train.extend(img_names)

                # for each test ID, randomly choose two images: one for
                # query and the other one for gallery
                for pid in test_pids:
                    img_names = pid_dict[pid]
                    samples = random.sample(img_names, 2)
                    query.append(samples[0])
                    gallery.append(samples[1])

                split = {'train': train, 'query': query, 'gallery': gallery}
                splits.append(split)

            print('Created {} splits in total'.format(len(splits)))
            write_json(splits, self.split_path)
            print('Split file is saved to {}'.format(self.split_path))
    def get_pid2label(self, img_names):
        pid_container = set()
        for img_name in img_names:
            pid = int(img_name[:4])
            pid_container.add(pid)
        pid2label = {pid: label for label, pid in enumerate(pid_container)}
        return pid2label

    def parse_img_names(self, img_names, pid2label=None):
        output = []
        for img_name in img_names:
            pid = int(img_name[:4])
            if pid2label is not None:
                pid = pid2label[pid]
            camid = int(img_name[4:7]) - 1  # 0-based
            img_path = osp.join(self.data_dir, img_name)
            output.append((img_path, pid, camid))
        return output
    def process_split(self, split):
        train_pid2label = self.get_pid2label(split['train'])
        train = self.parse_img_names(split['train'], train_pid2label)
        query = self.parse_img_names(split['query'])
        gallery = self.parse_img_names(split['gallery'])
        return train, query, gallery
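

if __name__ == '__main__':
    # Illustrative usage sketch; run as a module so the relative import above
    # resolves, e.g. `python -m torchreid.datasets.ilids`. Assumes the archive
    # can be downloaded and extracted under './data'.
    dataset = iLIDS(root='data', split_id=0, verbose=True)
    # train/query/gallery are lists of (img_path, pid, camid) tuples
    img_path, pid, camid = dataset.train[0]
    print(img_path, pid, camid)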