2018-07-04 17:32:43 +08:00
|
|
|
from __future__ import absolute_import
|
|
|
|
from __future__ import division
|
|
|
|
from __future__ import print_function
|
2018-07-02 17:17:14 +08:00
|
|
|
|
|
|
|
import os
|
|
|
|
import glob
|
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
import urllib
|
|
|
|
import tarfile
|
|
|
|
import zipfile
|
|
|
|
import os.path as osp
|
|
|
|
from scipy.io import loadmat
|
|
|
|
import numpy as np
|
|
|
|
import h5py
|
|
|
|
from scipy.misc import imsave
|
2019-02-27 17:31:12 +08:00
|
|
|
from collections import defaultdict
|
|
|
|
import copy
|
|
|
|
import random
|
2018-07-02 17:17:14 +08:00
|
|
|
|
2018-08-15 16:48:17 +08:00
|
|
|
from torchreid.utils.iotools import mkdir_if_missing, write_json, read_json
|
2018-11-05 06:59:46 +08:00
|
|
|
from .bases import BaseImageDataset
|
2018-07-02 17:17:14 +08:00
|
|
|
|
|
|
|
|
2018-11-05 06:59:46 +08:00
|
|
|
class iLIDS(BaseImageDataset):
    """QMUL-iLIDS

    Reference:
        Zheng et al. Associating Groups of People. BMVC 2009.

    Dataset statistics:
        # identities: 119
        # images: 476
        # cameras: 8 (not explicitly provided)
    """
    dataset_dir = 'ilids'

    def __init__(self, root='data', split_id=0, verbose=True, **kwargs):
        """Download (if needed), split and load the QMUL-iLIDS dataset.

        Args:
            root (str): root directory under which all datasets are stored.
            split_id (int): index of the random split to use (0-9).
            verbose (bool): if True, print dataset statistics after loading.

        Raises:
            ValueError: if ``split_id`` is out of range for the split file.
        """
        super(iLIDS, self).__init__(root)
        self.dataset_dir = osp.join(self.root, self.dataset_dir)
        self.dataset_url = 'http://www.eecs.qmul.ac.uk/~jason/data/i-LIDS_Pedestrian.tgz'
        self.data_dir = osp.join(self.dataset_dir, 'i-LIDS_Pedestrian/Persons')
        self.split_path = osp.join(self.dataset_dir, 'splits.json')

        self.download_data()

        required_files = [
            self.dataset_dir,
            self.data_dir
        ]
        self.check_before_run(required_files)

        # Splits are generated once and cached on disk for reproducibility.
        self.prepare_split()
        splits = read_json(self.split_path)
        if split_id >= len(splits):
            raise ValueError('split_id exceeds range, received {}, but expected between 0 and {}'.format(split_id, len(splits)-1))
        split = splits[split_id]

        train, query, gallery = self.process_split(split)

        if verbose:
            self.print_dataset_statistics(train, query, gallery)

        self.train = train
        self.query = query
        self.gallery = gallery

        self.num_train_pids, self.num_train_imgs, self.num_train_cams = self.get_imagedata_info(self.train)
        self.num_query_pids, self.num_query_imgs, self.num_query_cams = self.get_imagedata_info(self.query)
        self.num_gallery_pids, self.num_gallery_imgs, self.num_gallery_cams = self.get_imagedata_info(self.gallery)

    def download_data(self):
        """Download and extract the dataset archive if it is not present.

        No-op when ``self.dataset_dir`` already exists.
        """
        if osp.exists(self.dataset_dir):
            return

        mkdir_if_missing(self.dataset_dir)
        fpath = osp.join(self.dataset_dir, osp.basename(self.dataset_url))

        print('Downloading QMUL-iLIDS dataset')
        # BUGFIX: ``urllib.urlretrieve`` exists only in Python 2; on
        # Python 3 it lives in ``urllib.request``. Import locally so
        # both interpreters work.
        try:
            from urllib.request import urlretrieve  # Python 3
        except ImportError:
            from urllib import urlretrieve  # Python 2
        urlretrieve(self.dataset_url, fpath)

        print('Extracting files')
        # ``with`` guarantees the archive handle is closed even if
        # extraction raises (the original leaked it on error).
        with tarfile.open(fpath) as tar:
            tar.extractall(path=self.dataset_dir)

    def prepare_split(self):
        """Create 10 random train/test splits and cache them as JSON.

        Each split assigns half of the 119 identities to training (all of
        their images go to the train set). For every remaining test
        identity, two images are sampled: one for query, one for gallery.
        Does nothing if ``self.split_path`` already exists.
        """
        if not osp.exists(self.split_path):
            print('Creating splits ...')

            paths = glob.glob(osp.join(self.data_dir, '*.jpg'))
            img_names = [osp.basename(path) for path in paths]
            num_imgs = len(img_names)
            assert num_imgs == 476, 'There should be 476 images, but got {}, please check the data'.format(num_imgs)

            # store image names
            # image naming format:
            # the first four digits denote the person ID
            # the last four digits denote the sequence index
            pid_dict = defaultdict(list)
            for img_name in img_names:
                pid = int(img_name[:4])
                pid_dict[pid].append(img_name)
            pids = list(pid_dict.keys())
            num_pids = len(pids)
            assert num_pids == 119, 'There should be 119 identities, but got {}, please check the data'.format(num_pids)

            num_train_pids = int(num_pids * 0.5)
            num_test_pids = num_pids - num_train_pids  # supposed to be 60

            splits = []
            for _ in range(10):
                # randomly choose num_train_pids train IDs and num_test_pids test IDs
                # (a shallow copy suffices: pids is a flat list of ints)
                pids_copy = list(pids)
                random.shuffle(pids_copy)
                train_pids = pids_copy[:num_train_pids]
                test_pids = pids_copy[num_train_pids:]

                train = []
                query = []
                gallery = []

                # for train IDs, all images are used in the train set.
                for pid in train_pids:
                    img_names = pid_dict[pid]
                    train.extend(img_names)

                # for each test ID, randomly choose two images, one for
                # query and the other one for gallery.
                for pid in test_pids:
                    img_names = pid_dict[pid]
                    samples = random.sample(img_names, 2)
                    query.append(samples[0])
                    gallery.append(samples[1])

                split = {'train': train, 'query': query, 'gallery': gallery}
                splits.append(split)

            print('Totally {} splits are created'.format(len(splits)))
            write_json(splits, self.split_path)
            print('Split file is saved to {}'.format(self.split_path))

    def get_pid2label(self, img_names):
        """Map raw person IDs (first 4 digits of each name) to 0-based labels.

        Args:
            img_names (list): image file names, e.g. ``'0001004.jpg'``.

        Returns:
            dict: raw pid -> contiguous training label.
        """
        pid_container = set()
        for img_name in img_names:
            pid = int(img_name[:4])
            pid_container.add(pid)
        # BUGFIX: sort before enumerating; iterating a raw set makes the
        # pid->label mapping depend on hash order and thus non-deterministic.
        pid2label = {pid: label for label, pid in enumerate(sorted(pid_container))}
        return pid2label

    def parse_img_names(self, img_names, pid2label=None):
        """Convert image names to ``(img_path, pid, camid)`` tuples.

        Args:
            img_names (list): image file names.
            pid2label (dict, optional): if given, raw pids are remapped
                through it (used for the train set only).

        Returns:
            list: of ``(img_path, pid, camid)`` tuples.
        """
        output = []
        for img_name in img_names:
            pid = int(img_name[:4])
            if pid2label is not None:
                pid = pid2label[pid]
            camid = int(img_name[4:7]) - 1  # 0-based
            img_path = osp.join(self.data_dir, img_name)
            output.append((img_path, pid, camid))
        return output

    def process_split(self, split):
        """Build train/query/gallery lists from one cached split dict.

        Train pids are relabeled to contiguous 0-based labels; query and
        gallery keep their raw pids so evaluation matching works.
        """
        train_pid2label = self.get_pid2label(split['train'])
        train = self.parse_img_names(split['train'], train_pid2label)
        query = self.parse_img_names(split['query'])
        gallery = self.parse_img_names(split['gallery'])
        return train, query, gallery
|