From 0d3094f0c742ce61f84feb6e4f0b59beee6194d7 Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 24 Aug 2023 20:04:05 +0200
Subject: [PATCH] gguf : add rope_freq_base parameter for CodeLlama (#2769)

---
 convert.py | 43 +++++++++++++++++++++++++------------------
 gguf.py    |  6 +++++-
 llama.cpp  | 20 ++++++++++++++++++--
 3 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/convert.py b/convert.py
index b7c626d84..f335d0085 100755
--- a/convert.py
+++ b/convert.py
@@ -104,6 +104,8 @@ class Params:
     n_head_kv:  int
     f_norm_eps: float
 
+    f_rope_freq_base: Optional[float] = None
+
     ftype: Optional[GGMLFileType] = None
 
     # path to the directory containing the model files
@@ -194,15 +196,16 @@ class Params:
     def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         config = json.load(open(config_path))
 
-        n_vocab    = config["vocab_size"] if "vocab_size" in config else -1
-        n_embd     = config["dim"]
-        n_layer    = config["n_layers"]
-        n_mult     = config["multiple_of"]
-        n_ctx      = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
-        n_ff       = -1
-        n_head     = config["n_heads"]
-        n_head_kv  = config["n_kv_heads"] if "n_kv_heads" in config else n_head
-        f_norm_eps = config["norm_eps"]
+        n_vocab          = config["vocab_size"] if "vocab_size" in config else -1
+        n_embd           = config["dim"]
+        n_layer          = config["n_layers"]
+        n_mult           = config["multiple_of"]
+        n_ctx            = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
+        n_ff             = -1
+        n_head           = config["n_heads"]
+        n_head_kv        = config["n_kv_heads"] if "n_kv_heads" in config else n_head
+        f_norm_eps       = config["norm_eps"]
+        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
 
         if n_vocab == -1:
             n_vocab = model["tok_embeddings.weight"].shape[0]
@@ -211,15 +214,16 @@ class Params:
             n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
 
         return Params(
-            n_vocab    = n_vocab,
-            n_embd     = n_embd,
-            n_mult     = n_mult,
-            n_layer    = n_layer,
-            n_ctx      = n_ctx,
-            n_ff       = n_ff,
-            n_head     = n_head,
-            n_head_kv  = n_head_kv,
-            f_norm_eps = f_norm_eps,
+            n_vocab          = n_vocab,
+            n_embd           = n_embd,
+            n_mult           = n_mult,
+            n_layer          = n_layer,
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = n_head,
+            n_head_kv        = n_head_kv,
+            f_norm_eps       = f_norm_eps,
+            f_rope_freq_base = f_rope_freq_base,
         )
 
     @staticmethod
@@ -754,6 +758,9 @@ class OutputFile:
         self.gguf.add_head_count_kv     (params.n_head_kv)
         self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
 
+        if params.f_rope_freq_base:
+            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
+
         if params.ftype:
             self.gguf.add_file_type(params.ftype)
 
diff --git a/gguf.py b/gguf.py
index 5c37f0f0b..f4db7001b 100755
--- a/gguf.py
+++ b/gguf.py
@@ -47,6 +47,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
 
 # RoPE
 KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
 KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
 
 # tokenization
@@ -663,7 +664,10 @@ class GGUFWriter:
         self.add_uint32(
             KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
 
-    def add_rope_scale_linear(self, value: float):
+    def add_rope_freq_base(self, value: float):
+        self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
+
+    def add_rope_scale_linear(self, value: float):
         self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
 
     def add_tokenizer_model(self, model: str):
diff --git a/llama.cpp b/llama.cpp
index b5266c1e1..67319396e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -195,6 +195,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
 
     LLM_KV_TOKENIZER_MODEL,
@@ -238,6 +239,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,   "%s.attention.layer_norm_rms_epsilon" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_FREQ_BASE,       "%s.rope.freq_base"       },
     { LLM_KV_ROPE_SCALE_LINEAR,    "%s.rope.scale_linear"    },
 
     { LLM_KV_TOKENIZER_MODEL,  "tokenizer.ggml.model"  },
@@ -1561,12 +1563,26 @@ static void llm_load_hparams(
         hparams.n_head_kv = hparams.n_head;
         GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-        // TODO: manually setting rope scale should override this
+        // TODO: manually setting rope freq base and scale should override this
+        // FIXME: partial fix when the param specified is not the default value, but
+        //        will not work for overriding the model value to the params default
+
+        llama_context_params defaults = llama_context_default_params();
+
+        // rope_freq_base
+        {
+            float ropebase = 10000.0f;
+            GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
+            if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
+                rope_freq_base = ropebase;
+            }
+        }
 
         // rope_freq_scale (inverse of the kv) is optional
         {
             float ropescale = 1.0f;
             GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-            if (ropescale != 1.0f) {
+            if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
                 rope_freq_scale = 1.0f/ropescale;
             }
         }
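
Usage sketch (not part of the patch): the snippet below shows the new
conversion path end-to-end. Only add_rope_freq_base() is introduced by
this commit; the GGUFWriter(path, arch) constructor signature and the
file names here are assumptions for illustration.

    import json
    from typing import Optional

    import gguf  # the gguf.py module patched above

    def read_rope_freq_base(config_path: str) -> Optional[float]:
        # Mirrors loadOriginalParamsJson() above: CodeLlama's params.json
        # carries "rope_theta" (1e6); older LLaMA checkpoints omit the key,
        # in which case None is returned and no GGUF key is written.
        config = json.load(open(config_path))
        return config["rope_theta"] if "rope_theta" in config else None

    writer = gguf.GGUFWriter("codellama-7b.gguf", "llama")  # assumed ctor signature
    freq_base = read_rope_freq_base("params.json")
    if freq_base:  # same guard as the OutputFile hunk above: skip None
        writer.add_rope_freq_base(freq_base)  # writes "llama.rope.freq_base" as float32

On the llama.cpp side, llm_load_hparams() only adopts the stored value
when the user left rope_freq_base at its default (10000.0f), which is
what the FIXME above describes: the stored value wins over the default,
but an explicitly passed default cannot override a model that stores a
different base.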