From 1c84003c08027f5d3a4cb876f51d6b6224a34d0e Mon Sep 17 00:00:00 2001
From: cebtenzzre
Date: Mon, 2 Oct 2023 18:07:24 -0400
Subject: [PATCH] convert : fix vocab size when not defined in hparams (#3421)

---
 convert-falcon-hf-to-gguf.py    | 17 +++++------------
 convert-gptneox-hf-to-gguf.py   | 15 +++++----------
 convert-starcoder-hf-to-gguf.py | 17 +++++------------
 3 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
index 958358563..3a9300c37 100755
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -134,26 +134,19 @@ print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
 
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py
index 782410e44..60679a2f4 100755
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -131,24 +131,19 @@ print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
 
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-vocab_size = len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py
index 48e88a777..f469beb81 100755
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -118,26 +118,19 @@ print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
 
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
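
Note (not part of the patch): the sketch below illustrates the behaviour the added lines aim for in all three scripts -- prefer the vocab size declared in config.json, fall back to the tokenizer's own vocabulary only when it is missing, and guard against token ids that would not fit in the model's embedding matrix. The `hparams` and `vocab` dicts and the `<pad_{i}>` placeholder name are made-up stand-ins for illustration; the real scripts read config.json and use AutoTokenizer as shown in the diff above.

# Standalone Python sketch, assuming a toy vocab in place of the HF tokenizer.
hparams = {"vocab_size": 65024}                         # hypothetical config.json contents
vocab = {"<|endoftext|>": 0, "hello": 1, "world": 2}    # hypothetical tokenizer vocab

# Prefer the size the model was trained with; fall back to the tokenizer's
# vocab length only when config.json does not define "vocab_size".
vocab_size = hparams.get("vocab_size", len(vocab))

# Every token id must be a valid row of the embedding matrix; otherwise the
# converted model and the token list would disagree on tensor sizes.
assert max(vocab.values()) < vocab_size

reverse_vocab = {token_id: text for text, token_id in vocab.items()}

# Ids the tokenizer does not define are filled with a placeholder so the
# emitted token list still has exactly vocab_size entries (the real scripts
# likewise append an arbitrary padding token for such ids).
tokens = [reverse_vocab.get(i, f"<pad_{i}>") for i in range(vocab_size)]
assert len(tokens) == vocab_size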