From 5c872dbca2c7979b1f6dafc97db0774b8bbf9372 Mon Sep 17 00:00:00 2001 From: akawrykow <142945436+akawrykow@users.noreply.github.com> Date: Thu, 14 Sep 2023 10:19:42 -0700 Subject: [PATCH] falcon : use stated vocab size (#2914) --- convert-falcon-hf-to-gguf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 6ed2b88c6..5d4ad04a4 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -137,7 +137,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f: print("gguf: get gpt2 tokenizer vocab") -vocab_size = len(tokenizer_json["model"]["vocab"]) +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running inference. +vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"]) # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py tokenizer = AutoTokenizer.from_pretrained(dir_model)