From 9abf5e04c987ec56f83427b917c53b74dfee26a6 Mon Sep 17 00:00:00 2001
From: yancong <32220263+ice-tong@users.noreply.github.com>
Date: Mon, 28 Nov 2022 11:13:51 +0800
Subject: [PATCH] [Docs] Translate fileio doc (#753)

* translate the fileio doc

* Update docs/en/advanced_tutorials/fileio.md

Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>

* Update docs/en/advanced_tutorials/fileio.md

Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
---
 docs/en/advanced_tutorials/fileio.md    | 212 +++++++++++++++++++++++-
 docs/zh_cn/advanced_tutorials/fileio.md |   2 +-
 2 files changed, 212 insertions(+), 2 deletions(-)

diff --git a/docs/en/advanced_tutorials/fileio.md b/docs/en/advanced_tutorials/fileio.md
index f45fd03f..9de13553 100644
--- a/docs/en/advanced_tutorials/fileio.md
+++ b/docs/en/advanced_tutorials/fileio.md
@@ -1,3 +1,213 @@
# File IO

-Coming soon. Please refer to [chinese documentation](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/fileio.html).
+`MMEngine` implements a unified set of file reading and writing interfaces in the `fileio` module. With the `fileio` module, we can use the same function to handle different file formats, such as `json`, `yaml` and `pickle`. Support for other file formats can also be added easily.

The `fileio` module also supports reading files from and writing files to a variety of file storage backends, including disk, Petrel (for internal use), Memcached, LMDB, and HTTP.

## Load and dump data

`MMEngine` provides a universal API for loading and dumping data; the currently supported formats are `json`, `yaml` and `pickle`.

### Load from disk or dump to disk

```python
from mmengine import load, dump

# load data from a file
data = load('test.json')
data = load('test.yaml')
data = load('test.pkl')
# load data from a file-like object
with open('test.json', 'r') as f:
    data = load(f, file_format='json')

# dump data to a string
json_str = dump(data, file_format='json')

# dump data to a file with a filename (infer format from file extension)
dump(data, 'out.pkl')

# dump data to a file with a file-like object
with open('test.yaml', 'w') as f:
    dump(data, f, file_format='yaml')
```

### Load from other backends or dump to other backends

```python
from mmengine import load, dump

# load data from a file
data = load('s3://bucket-name/test.json')
data = load('s3://bucket-name/test.yaml')
data = load('s3://bucket-name/test.pkl')

# dump data to a file with a filename (infer format from file extension)
dump(data, 's3://bucket-name/out.pkl')
```

It is also very convenient to extend the API to support more file formats: all you need to do is write a file handler that inherits from `BaseFileHandler` and register it with one or several file formats.

```python
from mmengine import register_handler, BaseFileHandler

# To register multiple file formats, a list can be used as the argument.
# @register_handler(['txt', 'log'])
@register_handler('txt')
class TxtHandler1(BaseFileHandler):

    def load_from_fileobj(self, file):
        return file.read()

    def dump_to_fileobj(self, obj, file):
        file.write(str(obj))

    def dump_to_str(self, obj, **kwargs):
        return str(obj)
```
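Once a handler is registered, `load` and `dump` pick it up through the file extension, so the new format is used exactly like the built-in ones. The snippet below is a minimal usage sketch for the `txt` handler registered above; `test.txt` and `out.txt` are placeholder filenames.

```python
from mmengine import load, dump

# the '.txt' extension is resolved to the TxtHandler1 registered above,
# so this returns the raw text of the file
data = load('test.txt')

# dumping to a .txt path goes through the same handler and writes str(data)
dump(data, 'out.txt')
```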
For reference, here is how the built-in `PickleHandler` is implemented:

```python
from mmengine import BaseFileHandler
import pickle

class PickleHandler(BaseFileHandler):

    def load_from_fileobj(self, file, **kwargs):
        return pickle.load(file, **kwargs)

    def load_from_path(self, filepath, **kwargs):
        return super(PickleHandler, self).load_from_path(
            filepath, mode='rb', **kwargs)

    def dump_to_str(self, obj, **kwargs):
        kwargs.setdefault('protocol', 2)
        return pickle.dumps(obj, **kwargs)

    def dump_to_fileobj(self, obj, file, **kwargs):
        kwargs.setdefault('protocol', 2)
        pickle.dump(obj, file, **kwargs)

    def dump_to_path(self, obj, filepath, **kwargs):
        super(PickleHandler, self).dump_to_path(
            obj, filepath, mode='wb', **kwargs)
```

## Load a text file as a list or dict

For example, `a.txt` is a text file with 5 lines:

```
a
b
c
d
e
```

### Load from disk

Use `list_from_file` to load a list from `a.txt`.

```python
from mmengine import list_from_file

print(list_from_file('a.txt'))
# ['a', 'b', 'c', 'd', 'e']
print(list_from_file('a.txt', offset=2))
# ['c', 'd', 'e']
print(list_from_file('a.txt', max_num=2))
# ['a', 'b']
print(list_from_file('a.txt', prefix='/mnt/'))
# ['/mnt/a', '/mnt/b', '/mnt/c', '/mnt/d', '/mnt/e']
```

For example, `b.txt` is a text file with 3 lines:

```
1 cat
2 dog cow
3 panda
```

Then use `dict_from_file` to load a dict from `b.txt`.

```python
from mmengine import dict_from_file

print(dict_from_file('b.txt'))
# {'1': 'cat', '2': ['dog', 'cow'], '3': 'panda'}
print(dict_from_file('b.txt', key_type=int))
# {1: 'cat', 2: ['dog', 'cow'], 3: 'panda'}
```

### Load from other backends

Use `list_from_file` to load a list from `s3://bucket-name/a.txt`.

```python
from mmengine import list_from_file

print(list_from_file('s3://bucket-name/a.txt'))
# ['a', 'b', 'c', 'd', 'e']
print(list_from_file('s3://bucket-name/a.txt', offset=2))
# ['c', 'd', 'e']
print(list_from_file('s3://bucket-name/a.txt', max_num=2))
# ['a', 'b']
print(list_from_file('s3://bucket-name/a.txt', prefix='/mnt/'))
# ['/mnt/a', '/mnt/b', '/mnt/c', '/mnt/d', '/mnt/e']
```

Use `dict_from_file` to load a dict from `s3://bucket-name/b.txt`.

```python
from mmengine import dict_from_file

print(dict_from_file('s3://bucket-name/b.txt'))
# {'1': 'cat', '2': ['dog', 'cow'], '3': 'panda'}
print(dict_from_file('s3://bucket-name/b.txt', key_type=int))
# {1: 'cat', 2: ['dog', 'cow'], 3: 'panda'}
```

## Load and dump checkpoints

We can read checkpoints from disk or from the internet in the following way:

```python
import torch

filepath1 = '/path/of/your/checkpoint1.pth'
filepath2 = 'http://path/of/your/checkpoint3.pth'

# read filepath1 from disk
checkpoint = torch.load(filepath1)
# save checkpoints to disk
torch.save(checkpoint, filepath1)

# read filepath2 from internet
checkpoint = torch.utils.model_zoo.load_url(filepath2)
```
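For comparison, getting a checkpoint onto a storage service that `torch.save` does not understand (an s3 bucket, for example) would otherwise mean serializing it into a buffer and uploading the raw bytes with the storage provider's own client. The sketch below only illustrates that manual route; the dummy `checkpoint` dict stands in for a real checkpoint.

```python
import io

import torch

# a stand-in for a real checkpoint; in practice this would come from torch.load
checkpoint = {'state_dict': {'weight': torch.zeros(2, 2)}}

# torch.save accepts a file-like object, so the checkpoint can be serialized
# into an in-memory buffer ...
buffer = io.BytesIO()
torch.save(checkpoint, buffer)

# ... whose raw bytes would then have to be uploaded manually with the client
# of whatever storage backend is in use
raw_bytes = buffer.getvalue()
```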
In `MMEngine`, thanks to the support for multiple file storage backends, reading and writing checkpoints stored on different backends can be handled uniformly with `load_checkpoint` and `save_checkpoint`.

```python
from mmengine import load_checkpoint, save_checkpoint

filepath1 = '/path/of/your/checkpoint1.pth'
filepath2 = 's3://bucket-name/path/of/your/checkpoint1.pth'
filepath3 = 'http://path/of/your/checkpoint3.pth'

# read checkpoints from disk
checkpoint = load_checkpoint(filepath1)
# save checkpoints to disk
save_checkpoint(checkpoint, filepath1)

# read checkpoints from s3
checkpoint = load_checkpoint(filepath2)
# save checkpoints to s3
save_checkpoint(checkpoint, filepath2)

# read checkpoints from internet
checkpoint = load_checkpoint(filepath3)
```
diff --git a/docs/zh_cn/advanced_tutorials/fileio.md b/docs/zh_cn/advanced_tutorials/fileio.md
index 3cce4320..5bd3f7d0 100644
--- a/docs/zh_cn/advanced_tutorials/fileio.md
+++ b/docs/zh_cn/advanced_tutorials/fileio.md
@@ -189,7 +189,7 @@ torch.save(checkpoint, filepath1)
 checkpoint = torch.utils.model_zoo.load_url(filepath2)
 ```

-在 `mmengine` 中,得益于多文件存储后端的支持,不同存储形式的权重文件读写可以通过
+在 `MMEngine` 中,得益于多文件存储后端的支持,不同存储形式的权重文件读写可以通过
 `load_checkpoint` 和 `save_checkpoint` 来统一实现。

 ```python
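Following the same call pattern as the example above, `load_checkpoint` and `save_checkpoint` can also be combined, for instance to cache a checkpoint served over HTTP onto the local disk so that later runs do not need to download it again. This is only a sketch reusing the calls shown above; both paths are placeholders.

```python
from mmengine import load_checkpoint, save_checkpoint

# placeholder locations: a checkpoint served over HTTP and a local cache path
remote_path = 'http://path/of/your/checkpoint3.pth'
cached_path = '/path/of/your/cached_checkpoint.pth'

# fetch the remote checkpoint once, then keep a local copy for later runs
checkpoint = load_checkpoint(remote_path)
save_checkpoint(checkpoint, cached_path)
```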