From ecf90b1a5114034bc0939b3968f549fe4d63cf6d Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Thu, 28 Sep 2023 14:30:15 -0400
Subject: [PATCH] gguf : make token scores and types optional (#3347)

---
 convert-falcon-hf-to-gguf.py    |  6 ------
 convert-starcoder-hf-to-gguf.py |  6 ------
 llama.cpp                       | 18 ++++++++----------
 3 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
index 88338d823..958358563 100755
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -133,8 +133,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
 
 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():
@@ -177,12 +175,8 @@ for i in range(vocab_size):
         text = bytearray(pad_token)
 
     tokens.append(text)
-    scores.append(0.0)  # dymmy
-    toktypes.append(gguf.TokenType.NORMAL)  # dummy
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py
index 331e84e98..48e88a777 100755
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -117,8 +117,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
 
 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():
@@ -161,12 +159,8 @@ for i in range(vocab_size):
         text = bytearray(pad_token)
 
     tokens.append(text)
-    scores.append(0.0)  # dymmy
-    toktypes.append(gguf.TokenType.NORMAL)  # dummy
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
diff --git a/llama.cpp b/llama.cpp
index 140533553..15de7600c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1931,20 +1931,18 @@ static void llm_load_vocab(
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
     }
 
+    const float * scores = nullptr;
     const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer scores in model file\n");
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
     }
 
-    const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
+    const int * toktypes = nullptr;
     const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx == -1) {
-        throw std::runtime_error("cannot find token type list in GGUF file\n");
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
     }
 
-    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
     // determine vocab type
     {
         std::string tokenizer_name;
@@ -2012,8 +2010,8 @@ static void llm_load_vocab(
         auto & token_data = vocab.id_to_token[i];
 
         token_data.text  = std::move(word);
-        token_data.score = scores[i];
-        token_data.type  = (llama_token_type) toktypes[i];
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
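
The llama.cpp hunks above all follow one pattern: look the key up, keep a null
pointer when it is absent, and substitute a per-token default (score 0.0f,
normal token type) at load time instead of throwing. The standalone sketch
below makes that pattern concrete; it is illustrative only, with Metadata and
find_array as hypothetical stand-ins for the GGUF context and
gguf_find_key()/gguf_get_arr_data().

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for GGUF key/value metadata: each key maps to an
// array of floats. Real code goes through gguf_find_key()/gguf_get_arr_data().
using Metadata = std::map<std::string, std::vector<float>>;

// Return the array for `key`, or nullptr when the key is absent -- the same
// contract the patch gives the now-optional tokenizer.ggml.scores array.
static const float * find_array(const Metadata & md, const std::string & key) {
    const auto it = md.find(key);
    return it == md.end() ? nullptr : it->second.data();
}

int main() {
    const Metadata md = { { "tokenizer.ggml.scores", { 0.5f, -1.0f } } };

    const float * scores  = find_array(md, "tokenizer.ggml.scores");
    const float * missing = find_array(md, "tokenizer.ggml.absent");

    // Per-token fallback, as in llm_load_vocab: 0.0f when no scores exist.
    for (int i = 0; i < 2; i++) {
        printf("scores[%d]  = %.2f\n", i, scores  ? scores[i]  : 0.0f);
        printf("missing[%d] = %.2f\n", i, missing ? missing[i] : 0.0f);
    }
    return 0;
}

A converter written against the patched loader can therefore skip
add_token_scores()/add_token_types() entirely, as the two converter diffs do,
and the loaded vocabulary still gets well-defined per-token values.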