From 880e352277fc017df4d5794f0c21c44e1eae2b84 Mon Sep 17 00:00:00 2001 From: howlger Date: Thu, 21 Dec 2023 18:07:34 +0100 Subject: [PATCH] py : open merges file as 'utf-8' (#4566) Otherwise, on Windows converting bling-phi-2-v0 () via convert-hf-to-gguf.py will fail with the following error: ``` Traceback (most recent call last): File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 1061, in model_instance.set_vocab() File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 52, in set_vocab self._set_vocab_gpt2() File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 264, in _set_vocab_gpt2 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) File "C:\Users\User\git\gguf\gguf\vocab.py", line 33, in __init__ self._load(Path(path)) File "C:\Users\User\git\gguf\gguf\vocab.py", line 81, in _load self._try_load_merges_txt(path) File "C:\Users\User\git\gguf\gguf\vocab.py", line 95, in _try_load_merges_txt for line in fp: File "C:\Users\User\miniconda3\envs\gguf\lib\encodings\cp1252.py", line 23, in decode return codecs.charmap_decode(input,self.errors,decoding_table)[0] UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1415: character maps to ``` --- gguf-py/gguf/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 76924d8f2..cd1942975 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -84,7 +84,7 @@ class SpecialVocab: merges_file = path / 'merges.txt' if not merges_file.is_file(): return False - with open(merges_file, 'r') as fp: + with open(merges_file, 'r', encoding = 'utf-8') as fp: first_line = next(fp, '').strip() if not first_line.startswith('#'): fp.seek(0)