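"""Unit tests for NerDataset and NerConvertor."""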
import json
import os.path as osp
import tempfile

import torch

from mmocr.datasets.ner_dataset import NerDataset
from mmocr.models.ner.convertors.ner_convertor import NerConvertor


def _create_dummy_ann_file(ann_file):
    data = {
        'text': '彭小军认为,国内银行现在走的是台湾的发卡模式',
        'label': {
            'address': {
                '台湾': [[15, 16]]
            },
            'name': {
                '彭小军': [[0, 2]]
            }
        }
    }

    with open(ann_file, 'w') as fw:
        fw.write(json.dumps(data, ensure_ascii=False) + '\n')


def _create_dummy_vocab_file(vocab_file):
    with open(vocab_file, 'w') as fw:
        for char in list(map(chr, range(ord('a'), ord('z') + 1))):
            fw.write(char + '\n')


def _create_dummy_loader():
    loader = dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(type='LineJsonParser', keys=['text', 'label']))
    return loader


def test_ner_dataset():
    # test initialization
    loader = _create_dummy_loader()
    categories = [
        'address', 'book', 'company', 'game', 'government', 'movie', 'name',
        'organization', 'position', 'scene'
    ]

    # create dummy data
    tmp_dir = tempfile.TemporaryDirectory()
    ann_file = osp.join(tmp_dir.name, 'fake_data.txt')
    vocab_file = osp.join(tmp_dir.name, 'fake_vocab.txt')
    _create_dummy_ann_file(ann_file)
    _create_dummy_vocab_file(vocab_file)
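
    # Label convertor config consumed by NerTransform in the pipeline below;
    # it maps characters through the dummy vocab file and the category names
    # to BIO tag ids.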
    max_len = 128
    ner_convertor = dict(
        type='NerConvertor',
        annotation_type='bio',
        vocab_file=vocab_file,
        categories=categories,
        max_len=max_len)

    test_pipeline = [
        dict(
            type='NerTransform',
            label_convertor=ner_convertor,
            max_len=max_len),
        dict(type='ToTensorNER')
    ]
    dataset = NerDataset(ann_file, loader, pipeline=test_pipeline)

    # test pre_pipeline
    img_info = dataset.data_infos[0]
    results = dict(img_info=img_info)
    dataset.pre_pipeline(results)

    # test prepare_train_img
    dataset.prepare_train_img(0)

    # test evaluation
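    # The dummy result reproduces the two ground-truth spans written by
    # _create_dummy_ann_file ('name' at [0, 2] and 'address' at [15, 16]).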
    result = [[['address', 15, 16], ['name', 0, 2]]]

    dataset.evaluate(result)

    # test pred convert2entity function
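    # `pred` is a flat list of per-token label ids as a model head would emit
    # them; only the first 128 entries are kept to match max_len, and `mask`
    # marks the first 10 positions as valid tokens. The concrete id values are
    # assumed to follow this convertor's BIO label mapping.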
    pred = [
        21, 7, 17, 17, 21, 21, 21, 21, 21, 21, 13, 21, 21, 21, 21, 21, 1, 11,
        21, 21, 7, 17, 17, 21, 21, 21, 21, 21, 21, 13, 21, 21, 21, 21, 21, 1,
        11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 21, 21, 21, 21,
        21, 21
    ]
    preds = [pred[:128]]
    mask = [0] * 128
    for i in range(10):
        mask[i] = 1
    assert len(preds[0]) == len(mask)
    masks = torch.tensor([mask])
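    # convert_pred2entities should decode the masked predictions back into
    # entities; each recovered entity is expected to be a [label, start, end]
    # triple, hence the length-3 assertion below.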
    convertor = NerConvertor(
        annotation_type='bio',
        vocab_file=vocab_file,
        categories=categories,
        max_len=128)
    all_entities = convertor.convert_pred2entities(preds=preds, masks=masks)
    assert len(all_entities[0][0]) == 3

    tmp_dir.cleanup()