From cff7b0bf07cb46e1ad4fd199f6bdeb538925c8c4 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sat, 2 Sep 2023 23:52:13 -0600 Subject: [PATCH] convert.py : BPE fixes (#2938) * convert.py: BPE fixes? * Remove unnecessary conditional in addl token error handling --- convert.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index 6c89b5ecc..5a7483b43 100755 --- a/convert.py +++ b/convert.py @@ -323,15 +323,27 @@ class BpeVocab: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) added_tokens: dict[str, int] if fname_added_tokens is not None: + # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) else: - added_tokens = {} + # Fall back to trying to find the added tokens in tokenizer.json + tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' + if not tokenizer_json_file.is_file(): + added_tokens = {} + else: + tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) + added_tokens = dict( + (item['content'], item['id']) + for item in tokenizer_json.get('added_tokens', []) + # Added tokens here can be duplicates of the main vocabulary. 
+ if item['content'] not in self.bpe_tokenizer ) vocab_size: int = len(self.bpe_tokenizer) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: - raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") + expected_end_id = vocab_size + len(actual_ids) - 1 + raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) self.added_tokens_list = [text for (text, idx) in items] @@ -345,10 +357,22 @@ class BpeVocab: from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] byte_encoder = tokenization_gpt2.bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} + score = 0.0 for i, item in enumerate(tokenizer): text: bytes = item.encode("utf-8") - score: float = -i - yield text, score, gguf.TokenType.USER_DEFINED + # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior? + if i <= 258 and text.startswith(b'<') and text.endswith(b'>'): + if i == 0 and text == b'<unk>': + toktype = gguf.TokenType.UNKNOWN + elif i == 1 or i == 2: + toktype = gguf.TokenType.CONTROL + elif i >= 3 and text.startswith(b'<0x'): + toktype = gguf.TokenType.BYTE + else: + toktype = gguf.TokenType.NORMAL + else: + toktype = gguf.TokenType.NORMAL + yield text, score, toktype def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: