diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml new file mode 100644 index 000000000..56d17b66c --- /dev/null +++ b/.github/workflows/python-lint.yml @@ -0,0 +1,20 @@ +name: flake8 Lint + +on: [push, pull_request] + +jobs: + flake8-lint: + runs-on: ubuntu-latest + name: Lint + steps: + - name: Check out source repository + uses: actions/checkout@v3 + - name: Set up Python environment + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: flake8 Lint + uses: py-actions/flake8@v2 + with: + ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704" + exclude: "examples/*,examples/*/**,*/**/__init__.py" diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 3a618fd4d..1105670c1 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -827,13 +827,14 @@ class StableLMModel(Model): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) self.gguf_writer.add_layer_norm_eps(1e-5) ###### CONVERSION LOGIC ###### + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file") parser.add_argument( diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index 0c1235670..e359330af 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -14,11 +14,13 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf + class GGMLFormat(IntEnum): GGML = 0 GGMF = 1 GGJT = 2 + class GGMLFType(IntEnum): ALL_F32 = 0 MOSTLY_F16 = 1 @@ -38,6 +40,7 @@ class GGMLFType(IntEnum): MOSTLY_Q5_K_M = 17 MOSTLY_Q6_K = 18 + class Hyperparameters: def __init__(self): self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0 @@ -69,6 +72,7 @@ class Hyperparameters: def __str__(self): return f'' + class Vocab: def __init__(self, load_scores = True): self.items = [] @@ -90,6 +94,7 @@ class Vocab: self.items.append((item_text, item_score)) return offset - orig_offset + class Tensor: def __init__(self, use_padding = True): self.name = None @@ -123,6 +128,7 @@ class Tensor: # print(n_dims, name_len, dtype, self.dims, self.name, pad) return offset - orig_offset + class GGMLModel: def __init__(self): self.hyperparameters = None @@ -159,8 +165,8 @@ class GGMLModel: if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16): err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.' elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2): - if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, - GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): + if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, + GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): err = 'Q4 and Q8 quantizations changed in GGJTv3.' if len(err) > 0: raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.') @@ -187,6 +193,7 @@ class GGMLModel: hp.set_n_ff(self) return offset + class GGMLToGGUF: def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None): hp = ggml_model.hyperparameters @@ -217,7 +224,7 @@ class GGMLToGGUF: gguf_writer = gguf.GGUFWriter( self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], - use_temp_file = False ) + use_temp_file = False) self.add_params(gguf_writer) self.add_vocab(gguf_writer) if self.special_vocab is not None: @@ -341,7 +348,8 @@ class GGMLToGGUF: mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, - raw_dtype = tensor.dtype ) + raw_dtype = tensor.dtype) + def handle_metadata(cfg, hp): import convert @@ -365,38 +373,40 @@ def handle_metadata(cfg, hp): raise ValueError('Unable to load metadata') vocab = convert.load_vocab( cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, - cfg.vocabtype ) + cfg.vocabtype) # FIXME: Respect cfg.vocab_dir? svocab = gguf.SpecialVocab(cfg.model_metadata_dir, - load_merges = cfg.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = cfg.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) convert.check_vocab_size(params, vocab) return (params, vocab, svocab) + def handle_args(): parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF') parser.add_argument('--input', '-i', type = Path, required = True, - help = 'Input GGMLv3 filename') + help = 'Input GGMLv3 filename') parser.add_argument('--output', '-o', type = Path, required = True, - help ='Output GGUF filename') + help ='Output GGUF filename') parser.add_argument('--name', - help = 'Set model name') + help = 'Set model name') parser.add_argument('--desc', - help = 'Set model description') + help = 'Set model description') parser.add_argument('--gqa', type = int, default = 1, - help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') + help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') parser.add_argument('--eps', default = '5.0e-06', - help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') + help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') parser.add_argument('--context-length', '-c', type=int, default = 2048, - help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') + help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') parser.add_argument('--model-metadata-dir', '-m', type = Path, - help ='Load HuggingFace/.pth vocab and metadata from the specified directory') + help ='Load HuggingFace/.pth vocab and metadata from the specified directory') parser.add_argument("--vocab-dir", type=Path, - help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") + help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", - help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") + help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") return parser.parse_args() + def main(): cfg = handle_args() print(f'* Using config: {cfg}') @@ -406,7 +416,7 @@ def main(): data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() print('* Scanning GGML input file') - offset = model.load(data, 0) + offset = model.load(data, 0) # noqa print(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None @@ -421,12 +431,15 @@ def main(): print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') if model.file_format == GGMLFormat.GGML: print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') - converter = GGMLToGGUF(model, data, cfg, + converter = GGMLToGGUF( + model, data, cfg, params_override = params_override, vocab_override = vocab_override, - special_vocab = special_vocab ) + special_vocab = special_vocab + ) converter.save() print(f'* Successful completion. Output saved to: {cfg.output}') + if __name__ == '__main__': main() diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index 240f87306..206b7d5ff 100644 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -9,6 +9,7 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf + def _flatten_dict(dct, tensors, prefix=None): assert isinstance(dct, dict) for key in dct.keys(): @@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None): raise ValueError(type(dct[key])) return None + def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' print('gguf: getting sentencepiece tokenizer from', tokenizer_path) @@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path): pass return tokens, scores, toktypes + def main(): parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") @@ -125,6 +128,5 @@ def main(): print("") - if __name__ == '__main__': main() diff --git a/convert.py b/convert.py old mode 100755 new mode 100644 index 5b6344aa8..3ad836ce0 --- a/convert.py +++ b/convert.py @@ -46,6 +46,7 @@ DEFAULT_CONCURRENCY = 8 # data types # + @dataclass(frozen=True) class DataType: name: str @@ -55,15 +56,18 @@ class DataType: def elements_to_bytes(self, n_elements: int) -> int: return n_elements * self.dtype.itemsize + @dataclass(frozen=True) class UnquantizedDataType(DataType): pass + DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) + @dataclass(frozen=True) class QuantizedDataType(DataType): block_size: int @@ -77,6 +81,7 @@ class QuantizedDataType(DataType): assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}' return self.quantized_dtype.itemsize * (n_elements // self.block_size) + @dataclass(frozen=True) class Q8_0QuantizedDataType(QuantizedDataType): # Mini Q8_0 quantization in Python! @@ -86,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType): n_blocks = arr.size // self.block_size blocks = arr.reshape((n_blocks, self.block_size)) # Much faster implementation of block quantization contributed by @Cebtenzzre + def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: d = abs(blocks).max(axis = 1) / np.float32(127) with np.errstate(divide = 'ignore'): @@ -94,10 +100,11 @@ class Q8_0QuantizedDataType(QuantizedDataType): yield from zip(d, qs) return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) + DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', - dtype = np.dtype(np.float32), valid_conversions = [], - ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, - quantized_dtype = np.dtype([('d', ' 1 else DT_F32 + GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { GGMLFileType.AllF32 : DT_F32, GGMLFileType.MostlyF16 : DT_F16, @@ -138,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { # hparams loading # + @dataclass class Params: n_vocab: int @@ -167,11 +178,11 @@ class Params: # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming - n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: - n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n" @@ -308,7 +319,7 @@ class BpeVocab: (item['content'], item['id']) for item in tokenizer_json.get('added_tokens', []) # Added tokens here can be duplicates of the main vocabulary. - if item['content'] not in self.bpe_tokenizer ) + if item['content'] not in self.bpe_tokenizer) vocab_size: int = len(self.bpe_tokenizer) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) @@ -326,7 +337,6 @@ class BpeVocab: def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.bpe_tokenizer - from transformers.models.gpt2 import tokenization_gpt2 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} for i, _ in enumerate(tokenizer): @@ -406,6 +416,7 @@ class SentencePieceVocab: def __repr__(self) -> str: return f"" + Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' # @@ -413,13 +424,14 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' # TODO: reuse (probably move to gguf.py?) # + def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) + # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) + .swapaxes(1, 2) + .reshape(weights.shape)) class Tensor(metaclass=ABCMeta): @@ -500,7 +512,7 @@ class LazyTensor: ret = self._load() # Should be okay if it maps to the same numpy type? assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ - (self.data_type, ret.data_type, self.description) + (self.data_type, ret.data_type, self.description) return ret def astype(self, data_type: DataType) -> LazyTensor: @@ -588,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe return lazy_tensor.load().permute(n_head, n_head_kv) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) @@ -595,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_ s[0] = s[0] // 3 return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().part(n_part) @@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus: In = TypeVar('In') Out = TypeVar('Out') + def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than @@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc break yield result + def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) @@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: class OutputFile: - def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: @@ -876,7 +892,7 @@ class OutputFile: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out, endianess=endianess) @@ -938,8 +954,9 @@ class OutputFile: of.close() + def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: - wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type + wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): return GGMLFileType.AllF32 @@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise Exception(f"Unexpected combination of types: {name_to_type}") + def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()} + def convert_model_names(model: LazyModel, params: Params) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) @@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: print(f"Permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) - #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: print(f"Unpacking and permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) @@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: return out + def nth_multifile_path(path: Path, n: int) -> Path | None: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the nth path in the model. @@ -1174,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None: # FIXME: Try to respect vocab_dir somehow? vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = args.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = args.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") @@ -1188,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None: vocab = load_vocab(vocab_dir, args.vocabtype) # FIXME: Try to respect vocab_dir somehow? special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = args.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = args.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) model = model_plus.model model = convert_model_names(model, params) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ab7382c44..b8ec977c8 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -221,7 +221,7 @@ class GGUFWriter: if self.endianess == GGUFEndian.BIG: tensor.byteswap(inplace=True) if self.use_temp_file and self.temp_file is None: - fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024) + fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024) fp.seek(0) self.temp_file = fp diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py index 65e1c0dbf..4f06ec9bb 100644 --- a/tests/test-tokenizer-0-falcon.py +++ b/tests/test-tokenizer-0-falcon.py @@ -14,34 +14,34 @@ dir_tokenizer = args.dir_tokenizer tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is πŸ¦™.cpp", - "w048 7tuijk dsdfhu", - "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", - "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", - "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - "\n =", - "' era", - ] + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + "\n =", + "' era", +] for text in tests: print('text: ', text) diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py index 21df8e6e4..f3d4d7e3d 100644 --- a/tests/test-tokenizer-0-llama.py +++ b/tests/test-tokenizer-0-llama.py @@ -14,32 +14,32 @@ dir_tokenizer = args.dir_tokenizer tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is πŸ¦™.cpp", - "w048 7tuijk dsdfhu", - "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", - "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", - "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - ] + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", +] for text in tests: