From deb7dfca4b9725cd295d1426db75fe8e0a6d5312 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 22 Aug 2023 20:05:59 +0300
Subject: [PATCH] gguf : add ftype meta info to the model (#2710)

* llama : add ftype meta info to the model

ggml-ci

* convert.py : add ftype when converting (does not work)

* convert.py : fix Enum to IntEnum

ggml-ci
---
 convert.py | 29 +++++++++++++++++++++++------
 gguf.py    |  4 ++++
 llama.cpp  | 21 ++++++++++++++++++---
 llama.h    |  2 ++
 4 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/convert.py b/convert.py
index c29c032cd..71978d671 100644
--- a/convert.py
+++ b/convert.py
@@ -69,7 +69,10 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
     'I32': DT_I32,
 }
 
-class GGMLFileType(enum.Enum):
+# TODO: match this with `llama_ftype`
+# TODO: rename to LLAMAFileType
+# TODO: move to `gguf.py`
+class GGMLFileType(enum.IntEnum):
     AllF32    = 0
     MostlyF16 = 1  # except 1d tensors
 
@@ -101,6 +104,8 @@ class Params:
     n_head_kv:  int
     f_norm_eps: float
 
+    ftype: Optional[GGMLFileType] = None
+
     @staticmethod
     def find_n_mult(n_ff: int, n_embd: int) -> int:
         # hardcoded magic range
@@ -738,6 +743,9 @@ class OutputFile:
         self.gguf.add_head_count_kv (params.n_head_kv)
         self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
 
+        if params.ftype:
+            self.gguf.add_file_type(params.ftype)
+
     def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []
         scores = []
@@ -1020,6 +1028,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
                   " - LLaMA v2: --ctx 4096\n")
         params.n_ctx = args.ctx
 
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+        }[args.outtype]
+
     print(f"params = {params}")
 
     vocab: Vocab
@@ -1040,11 +1054,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
 
-    model = model_plus.model
-    model = convert_model_names(model, params)
-    output_type = pick_output_type(model, args.outtype)
-    model = convert_to_output_type(model, output_type)
-    outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+    model = model_plus.model
+    model = convert_model_names(model, params)
+    ftype = pick_output_type(model, args.outtype)
+    model = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+
+    params.ftype = ftype
+    print(f"Writing {outfile}, format {ftype}")
 
     OutputFile.write_all(outfile, params, model, vocab)
     print(f"Wrote {outfile}")
diff --git a/gguf.py b/gguf.py
index 9776649c7..465746718 100644
--- a/gguf.py
+++ b/gguf.py
@@ -26,6 +26,7 @@ KEY_GENERAL_DESCRIPTION = "general.description"
 KEY_GENERAL_LICENSE        = "general.license"
 KEY_GENERAL_SOURCE_URL     = "general.source.url"
 KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE      = "general.file_type"
 
 # LLM
 KEY_LLM_CONTEXT_LENGTH     = "{arch}.context_length"
@@ -595,6 +596,9 @@ class GGUFWriter:
     def add_source_hf_repo(self, repo: str):
         self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
 
+    def add_file_type(self, ftype: int):
+        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
+
     def add_name(self, name: str):
         self.add_string(KEY_GENERAL_NAME, name)
 
diff --git a/llama.cpp b/llama.cpp
index 0584749c5..6abdc44f2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -995,6 +995,16 @@ struct llama_model_loader {
                     } break;
             }
 
+        // this is a way to mark that we have "guessed" the file type
+        ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+        {
+            const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+            if (kid >= 0) {
+                ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+            }
+        }
+
         for (int i = 0; i < n_kv; i++) {
             const char * name         = gguf_get_key(ctx_gguf, i);
             const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@@ -1197,7 +1207,11 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-const char * llama_model_ftype_name(enum llama_ftype ftype) {
+std::string llama_model_ftype_name(enum llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:    return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1426,7 +1440,7 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n",   __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype = %s\n",  __func__, llama_model_ftype_name(model.ftype));
+    LLAMA_LOG_INFO("%s: model ftype = %s\n",  __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_elements*1e-9);
 
     // general kv
@@ -3450,6 +3464,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -4310,7 +4325,7 @@ int llama_model_n_embd(const struct llama_model * model) {
 }
 
 int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str());
 }
 
 int llama_model_quantize(
diff --git a/llama.h b/llama.h
index aa5b7d69c..7ce478d54 100644
--- a/llama.h
+++ b/llama.h
@@ -103,6 +103,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K   = 18, // except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
     typedef struct llama_token_data {