py : open merges file as 'utf-8' (#4566)

Otherwise, on Windows converting bling-phi-2-v0 (<https://huggingface.co/llmware/bling-phi-2-v0>) via convert-hf-to-gguf.py will fail with the following error:

```
Traceback (most recent call last):
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 1061, in <module>
    model_instance.set_vocab()
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 52, in set_vocab
    self._set_vocab_gpt2()
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 264, in _set_vocab_gpt2
    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 33, in __init__
    self._load(Path(path))
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 81, in _load
    self._try_load_merges_txt(path)
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 95, in _try_load_merges_txt
    for line in fp:
  File "C:\Users\User\miniconda3\envs\gguf\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1415: character maps to <undefined>
```
This commit is contained in:
howlger 2023-12-21 18:07:34 +01:00 committed by GitHub
parent 66f35a2f48
commit 880e352277
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -84,7 +84,7 @@ class SpecialVocab:
merges_file = path / 'merges.txt'
if not merges_file.is_file():
return False
with open(merges_file, 'r') as fp:
with open(merges_file, 'r', encoding = 'utf-8') as fp:
first_line = next(fp, '').strip()
if not first_line.startswith('#'):
fp.seek(0)