diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py index fa4a044ca..5b038fc0a 100644 --- a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggmlv3-to-gguf.py @@ -1,10 +1,12 @@ -import sys, struct, math, argparse +import sys, struct, math, argparse, warnings from pathlib import Path import numpy as np import gguf +warnings.filterwarnings('error') + # Note: Does not support GGML_QKK_64 QK_K = 256 # Items here are (block size, type size) @@ -215,15 +217,10 @@ class GGMLToGGUF: if self.vocab_override is not None: vo = self.vocab_override print('* Adding vocab item(s)') - for (idx, vitem) in enumerate(vo.all_tokens()): - if len(vitem) == 3: - tokens.append(vitem[0]) - scores.append(vitem[1]) - toktypes.append(vitem[2]) - else: - # Maybe try to guess the token type here? - tokens.append(vitem[0]) - scores.append(vitem[1]) + for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): + tokens.append(vbytes) + scores.append(score) + toktypes.append(ttype) assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}' gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) @@ -231,9 +228,21 @@ class GGMLToGGUF: gguf_writer.add_token_types(toktypes) return print(f'* Adding {hp.n_vocab} vocab item(s)') + assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): tt = 1 # Normal - if len(vbytes) == 0: + # Special handling for UNK, BOS, EOS tokens. + if tokid <= 2: + if tokid == 0: + vbytes = b'' + tt = 2 + elif tokid == 1: + vbytes = b'' + tt = 3 + else: + vbytes = b'' + tt = 3 + elif len(vbytes) == 0: tt = 3 # Control elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1: vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8') @@ -246,6 +255,9 @@ class GGMLToGGUF: gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) + gguf_writer.add_unk_token_id(0) + gguf_writer.add_bos_token_id(1) + gguf_writer.add_eos_token_id(2) def add_tensors(self, gguf_writer): nm = self.name_map @@ -315,7 +327,11 @@ def main(): data = np.memmap(cfg.input, mode = 'r') model = GGMLV3Model() print('* Scanning GGML input file') - offset = model.load(data, 0) + try: + offset = model.load(data, 0) + except OverflowError: + print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr) + raise print(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None @@ -330,4 +346,5 @@ def main(): converter.save() print(f'* Successful completion. Output saved to: {cfg.output}') -main() +if __name__ == '__main__': + main() diff --git a/llama.cpp b/llama.cpp index 6c5da1309..fd8eaa180 100644 --- a/llama.cpp +++ b/llama.cpp @@ -703,7 +703,7 @@ struct llama_vocab { // default LLaMA special tokens id special_bos_id = 1; id special_eos_id = 2; - id special_unk_id = -1; + id special_unk_id = 0; id special_sep_id = -1; id special_pad_id = -1;