Improve handling of special tokens in GGML to GGUF converter (#2725)

* Improve UNK, BOS, EOS token handling when converting without metadata.

* Allow importing as a module.

* Remove some obsolete code and minor cleanups.

* Change the default UNK token mapping from -1 to 0 in llama.cpp.

* Catch the overflow caused by buggy Windows Python and report it with a clearer error message.
This commit is contained in:
Kerfuffle 2023-08-22 17:39:39 -06:00 committed by GitHub
parent 46ef5b5fcf
commit 777f42ba18
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 14 deletions

View file

@ -1,10 +1,12 @@
import sys, struct, math, argparse import sys, struct, math, argparse, warnings
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import gguf import gguf
warnings.filterwarnings('error')
# Note: Does not support GGML_QKK_64 # Note: Does not support GGML_QKK_64
QK_K = 256 QK_K = 256
# Items here are (block size, type size) # Items here are (block size, type size)
@ -215,15 +217,10 @@ class GGMLToGGUF:
if self.vocab_override is not None: if self.vocab_override is not None:
vo = self.vocab_override vo = self.vocab_override
print('* Adding vocab item(s)') print('* Adding vocab item(s)')
for (idx, vitem) in enumerate(vo.all_tokens()): for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
if len(vitem) == 3: tokens.append(vbytes)
tokens.append(vitem[0]) scores.append(score)
scores.append(vitem[1]) toktypes.append(ttype)
toktypes.append(vitem[2])
else:
# Maybe try to guess the token type here?
tokens.append(vitem[0])
scores.append(vitem[1])
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}' assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
gguf_writer.add_token_list(tokens) gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores) gguf_writer.add_token_scores(scores)
@ -231,9 +228,21 @@ class GGMLToGGUF:
gguf_writer.add_token_types(toktypes) gguf_writer.add_token_types(toktypes)
return return
print(f'* Adding {hp.n_vocab} vocab item(s)') print(f'* Adding {hp.n_vocab} vocab item(s)')
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal tt = 1 # Normal
if len(vbytes) == 0: # Special handling for UNK, BOS, EOS tokens.
if tokid <= 2:
if tokid == 0:
vbytes = b'<unk>'
tt = 2
elif tokid == 1:
vbytes = b'<s>'
tt = 3
else:
vbytes = b'</s>'
tt = 3
elif len(vbytes) == 0:
tt = 3 # Control tt = 3 # Control
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1: elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8') vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
@ -246,6 +255,9 @@ class GGMLToGGUF:
gguf_writer.add_token_list(tokens) gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores) gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes) gguf_writer.add_token_types(toktypes)
gguf_writer.add_unk_token_id(0)
gguf_writer.add_bos_token_id(1)
gguf_writer.add_eos_token_id(2)
def add_tensors(self, gguf_writer): def add_tensors(self, gguf_writer):
nm = self.name_map nm = self.name_map
@ -315,7 +327,11 @@ def main():
data = np.memmap(cfg.input, mode = 'r') data = np.memmap(cfg.input, mode = 'r')
model = GGMLV3Model() model = GGMLV3Model()
print('* Scanning GGML input file') print('* Scanning GGML input file')
offset = model.load(data, 0) try:
offset = model.load(data, 0)
except OverflowError:
print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr)
raise
print(f'* GGML model hyperparameters: {model.hyperparameters}') print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None vocab_override = None
params_override = None params_override = None
@ -330,4 +346,5 @@ def main():
converter.save() converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}') print(f'* Successful completion. Output saved to: {cfg.output}')
main() if __name__ == '__main__':
main()

View file

@ -703,7 +703,7 @@ struct llama_vocab {
// default LLaMA special tokens // default LLaMA special tokens
id special_bos_id = 1; id special_bos_id = 1;
id special_eos_id = 2; id special_eos_id = 2;
id special_unk_id = -1; id special_unk_id = 0;
id special_sep_id = -1; id special_sep_id = -1;
id special_pad_id = -1; id special_pad_id = -1;