mmsegmentation/projects/CAT-Seg/cat_seg/utils/tokenizer.py

# Copyright (c) OpenMMLab. All rights reserved.
import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re


@lru_cache()
def default_bpe():
    """Locate the bundled BPE vocabulary archive.

    Returns:
        str: Absolute path of ``bpe_vocab/bpe_simple_vocab_16e6.txt.gz``
        resolved relative to the directory containing this module.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, 'bpe_vocab/bpe_simple_vocab_16e6.txt.gz')


@lru_cache()
def bytes_to_unicode():
    """Build the reversible byte -> unicode-character lookup table.

    BPE operates on unicode strings, so every one of the 256 possible byte
    values needs a dedicated, printable stand-in character. Bytes that are
    already printable (``!``-``~``, ``¡``-``¬``, ``®``-``ÿ``) map to
    themselves; every remaining byte (whitespace, control characters, ...)
    is assigned the next unused code point starting at 256, which keeps
    characters the BPE code cannot handle out of the vocabulary.

    Returns:
        dict[int, str]: Mapping from byte value to a single-character
        string. Printable bytes come first in insertion order, followed by
        the remapped bytes in ascending byte order.
    """
    printable = [
        *range(ord('!'), ord('~') + 1),
        *range(ord('¡'), ord('¬') + 1),
        *range(ord('®'), ord('ÿ') + 1),
    ]
    # Printable bytes are represented by the character with the same code.
    table = {byte: chr(byte) for byte in printable}
    shift = 0
    for byte in range(2**8):
        if byte in table:
            continue
        # Non-printable bytes get consecutive code points above 255.
        table[byte] = chr(2**8 + shift)
        shift += 1
    return table


def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word (Sequence[str]): The word as a sliceable sequence of symbols
            (typically a tuple of variable-length strings).

    Returns:
        set[tuple[str, str]]: Every ``(word[i], word[i + 1])`` pair. The set
        is empty when ``word`` has fewer than two symbols, including when it
        is empty.
    """
    # zip() stops at the shorter input, so empty and single-symbol words
    # yield no pairs instead of failing on an out-of-range index.
    return set(zip(word, word[1:]))


def basic_clean(text):
    """Repair and normalise raw text.

    The text is passed through ``ftfy.fix_text``, HTML-unescaped twice (so
    doubly-escaped entities are resolved as well) and stripped of leading
    and trailing whitespace.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text.
    """
    cleaned = ftfy.fix_text(text)
    # Two passes so that e.g. ``&amp;lt;`` ends up as ``<``.
    for _ in range(2):
        cleaned = html.unescape(cleaned)
    return cleaned.strip()


def whitespace_clean(text):
    """Collapse whitespace runs and trim the ends of a string.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with every run of whitespace replaced by a single
        space and without leading or trailing whitespace.
    """
    return re.sub(r'\s+', ' ', text).strip()


class SimpleTokenizer:
    """Byte-level BPE tokenizer.

    The vocabulary is laid out as: the 256 byte stand-in characters, the same
    256 characters with an end-of-word marker ``</w>`` appended, one entry per
    BPE merge, and finally the two special tokens ``<|startoftext|>`` and
    ``<|endoftext|>``.

    Args:
        bpe_path (str): Path to the gzip-compressed, UTF-8 encoded BPE merges
            file. Defaults to the value returned by ``default_bpe()``, which
            is evaluated once when the class is defined.
    """

    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Use a context manager so the archive handle is closed right away
        # instead of being left for the garbage collector.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode('utf-8').split('\n')
        # Line 0 is skipped (presumably a header line - confirm against the
        # vocabulary file). 49152 - 256 - 2 = 48894 merge rules are kept.
        merges = merges[1:49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + '</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        # Token string <-> integer id lookup tables.
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        # Lower rank == higher merge priority.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # Special tokens are pre-seeded so ``bpe`` returns them unchanged.
        self.cache = {
            '<|startoftext|>': '<|startoftext|>',
            '<|endoftext|>': '<|endoftext|>'
        }
        # The pattern is assembled from adjacent raw literals. Wrapping it
        # inside a single raw string with a trailing backslash would embed
        # the backslash, the newline and the indentation in the regex, so
        # the ``'ll`` alternative could never match cleaned text.
        self.pat = re.compile(
            r"<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|"
            r"[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+", re.IGNORECASE)

    def bpe(self, token):
        """Apply the BPE merge rules to a single byte-encoded token.

        Args:
            token (str): A non-empty token whose characters are byte
                stand-ins produced with ``self.byte_encoder``.

        Returns:
            str: The resulting BPE symbols joined by single spaces; the last
            symbol carries the ``</w>`` end-of-word marker. Results for
            tokens of two or more characters are memoised in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        # Start from single characters; mark the final one as end-of-word.
        word = tuple(token[:-1]) + (token[-1] + '</w>', )
        pairs = get_pairs(word)

        # A single-character token has nothing to merge.
        if not pairs:
            return token + '</w>'

        while True:
            # Pick the adjacent pair with the best (lowest) merge rank.
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                # No remaining pair is a known merge.
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    # Copy everything up to the next occurrence of ``first``.
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # ``first`` does not occur again; copy the tail.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    # Merge the pair into one symbol.
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                # Fully merged into a single symbol.
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Convert text into a list of BPE token ids.

        The text is cleaned with ``basic_clean`` and ``whitespace_clean``,
        lower-cased, split with ``self.pat``, mapped to byte stand-in
        characters, BPE-merged and finally looked up in ``self.encoder``.

        Args:
            text (str): Input text.

        Returns:
            list[int]: Token ids in order of appearance.

        Raises:
            KeyError: If a BPE symbol is missing from ``self.encoder``.
        """
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Re-express the token's UTF-8 bytes as stand-in characters.
            token = ''.join(self.byte_encoder[b]
                            for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token]
                              for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Decode token ids back into a string.

        Args:
            tokens (Iterable[int]): Token ids present in ``self.decoder``.

        Returns:
            str: The decoded text. Every ``</w>`` end-of-word marker becomes
            a space, and byte sequences that are not valid UTF-8 are decoded
            with the replacement character.
        """
        text = ''.join([self.decoder[token] for token in tokens])
        # Map stand-in characters back to raw bytes before UTF-8 decoding.
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors='replace').replace('</w>', ' ')
        return text
[Project] Support CAT-Seg from CVPR2023 (#3098) Thanks for your contribution and we appreciate it a lot. The following instructions would make your pull request more healthy and more easily get feedback. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers. ## Motivation Support CAT-Seg open-vocabulary semantic segmentation (CVPR2023). ## Modification Support CAT-Seg open-vocabulary semantic segmentation (CVPR2023). - [x] Support CAT-Seg model training. - [x] CLIP model based `backbone` (R101 & Swin-B), aggregation layers based `neck`, and `decoder` head. - [x] Provide customized coco-stuff164k_384x384 training configs. - [x] Language model supports for `open vocabulary` (OV) tasks. - [x] Support CLIP-based pretrained language model (LM) inference. - [x] Add commonly used prompts templates. - [x] Add README tutorials. - [x] Add zero-shot testing scripts. Working on the following tasks. - [x] Add unit test. ## BC-breaking (Optional) Does the modification introduce changes that break the backward-compatibility of the downstream repos? If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR. ## Use cases (Optional) If this PR introduces a new feature, it is better to list some use cases here, and update the documentation. ## Checklist 1. Pre-commit or other linting tools are used to fix the potential lint issues. 2. The modification is covered by complete unit tests. If not, please add more unit test to ensure the correctness. 3. If the modification has potential influence on downstream projects, this PR should be tested with downstream projects, like MMDet or MMDet3D. 4. The documentation has been modified accordingly, like docstring or example tutorials. --------- Co-authored-by: xiexinch <xiexinch@outlook.com> 2023-08-09 23:57:30 +08:00			`# Copyright (c) OpenMMLab. All rights reserved.`
			`import gzip`
			`import html`
			`import os`
			`from functools import lru_cache`

			`import ftfy`
			`import regex as re`


			`@lru_cache()`
			`def default_bpe():`
			`"""Return default BPE vocabulary path."""`
			`return os.path.join(`
			`os.path.dirname(os.path.abspath(__file__)),`
			`'bpe_vocab/bpe_simple_vocab_16e6.txt.gz')`


			`@lru_cache()`
			`def bytes_to_unicode():`
			`"""Returns list of utf-8 byte and a corresponding list of unicode strings.`

			`The reversible bpe codes work on unicode strings. This means you need a`
			`large # of unicode characters in your vocab if you want to avoid UNKs. When`
			`you're at something like a 10B token dataset you end up needing around 5K`
			`for decent coverage. This is a significant percentage of your normal, say,`
			`32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and`
			`unicode strings. And avoids mapping to whitespace/control characters the`
			`bpe code barfs on.`
			`"""`
			`bs = list(range(ord('!'),`
			`ord('~') + 1)) + list(range(`
			`ord('¡'),`
			`ord('¬') + 1)) + list(range(ord('®'),`
			`ord('ÿ') + 1))`
			`cs = bs[:]`
			`n = 0`
			`for b in range(2**8):`
			`if b not in bs:`
			`bs.append(b)`
			`cs.append(2**8 + n)`
			`n += 1`
			`cs = [chr(n) for n in cs]`
			`return dict(zip(bs, cs))`


			`def get_pairs(word):`
			`"""Return set of symbol pairs in a word.`

			`Word is represented as tuple of symbols (symbols being variable-length`
			`strings).`
			`"""`
			`pairs = set()`
			`prev_char = word[0]`
			`for char in word[1:]:`
			`pairs.add((prev_char, char))`
			`prev_char = char`
			`return pairs`


			`def basic_clean(text):`
			`"""Clean string."""`
			`text = ftfy.fix_text(text)`
			`text = html.unescape(html.unescape(text))`
			`return text.strip()`


			`def whitespace_clean(text):`
			`"""Clean whitespace in string."""`
			`text = re.sub(r'\s+', ' ', text)`
			`text = text.strip()`
			`return text`


			`class SimpleTokenizer:`
			`"""Customized Tokenizer implementation."""`

			`def __init__(self, bpe_path: str = default_bpe()):`
			`self.byte_encoder = bytes_to_unicode()`
			`self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}`
			`merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')`
			`merges = merges[1:49152 - 256 - 2 + 1]`
			`merges = [tuple(merge.split()) for merge in merges]`
			`vocab = list(bytes_to_unicode().values())`
			`vocab = vocab + [v + '</w>' for v in vocab]`
			`for merge in merges:`
			`vocab.append(''.join(merge))`
			`vocab.extend(['<\|startoftext\|>', '<\|endoftext\|>'])`
			`self.encoder = dict(zip(vocab, range(len(vocab))))`
			`self.decoder = {v: k for k, v in self.encoder.items()}`
			`self.bpe_ranks = dict(zip(merges, range(len(merges))))`
			`self.cache = {`
			`'<\|startoftext\|>': '<\|startoftext\|>',`
			`'<\|endoftext\|>': '<\|endoftext\|>'`
			`}`
			`self.pat = re.compile(`
			`r"""<\\|startoftext\\|>\|<\\|endoftext\\|>\|'s\|'t\|'re\|'ve\|'m\|\`
			`'ll\|'d\|[\p{L}]+\|[\p{N}]\|[^\s\p{L}\p{N}]+""", re.IGNORECASE)`

			`def bpe(self, token):`
			`"""Refer to bpe vocabulary dictionary."""`
			`if token in self.cache:`
			`return self.cache[token]`
			`word = tuple(token[:-1]) + (token[-1] + '</w>', )`
			`pairs = get_pairs(word)`

			`if not pairs:`
			`return token + '</w>'`

			`while True:`
			`bigram = min(`
			`pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))`
			`if bigram not in self.bpe_ranks:`
			`break`
			`first, second = bigram`
			`new_word = []`
			`i = 0`
			`while i < len(word):`
			`try:`
			`j = word.index(first, i)`
			`new_word.extend(word[i:j])`
			`i = j`
			`except ValueError:`
			`new_word.extend(word[i:])`
			`break`

			`if word[i] == first and i < len(word) - 1 and word[`
			`i + 1] == second:`
			`new_word.append(first + second)`
			`i += 2`
			`else:`
			`new_word.append(word[i])`
			`i += 1`
			`new_word = tuple(new_word)`
			`word = new_word`
			`if len(word) == 1:`
			`break`
			`else:`
			`pairs = get_pairs(word)`
			`word = ' '.join(word)`
			`self.cache[token] = word`
			`return word`

			`def encode(self, text):`
			`"""Encode text strings."""`
			`bpe_tokens = []`
			`text = whitespace_clean(basic_clean(text)).lower()`
			`for token in re.findall(self.pat, text):`
			`token = ''.join(self.byte_encoder[b]`
			`for b in token.encode('utf-8'))`
			`bpe_tokens.extend(self.encoder[bpe_token]`
			`for bpe_token in self.bpe(token).split(' '))`
			`return bpe_tokens`

			`def decode(self, tokens):`
			`"""Decoder tokens to strings."""`
			`text = ''.join([self.decoder[token] for token in tokens])`
			`text = bytearray([self.byte_decoder[c] for c in text]).decode(`
			`'utf-8', errors='replace').replace('</w>', ' ')`
			`return text`