diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index fd561fcbc..0f37d295b 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -12,18 +12,14 @@ usage: ./convert-llama2c-to-ggml [options] options: -h, --help show this help message and exit - --copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin') + --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model --llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin') ``` An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: -`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin` - -For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually: - -`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin` +`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` Now you can use the model with a command like: diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index f8a58dc7a..51d90ea6a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -10,9 +10,48 @@ #include #include #include +#include #include #include +// GGUF keys & tensor names. 
+ +#define KV_GENERAL_ARCHITECTURE "general.architecture" +#define KV_GENERAL_NAME "general.name" + +#define KV_TOKENIZER_MODEL "tokenizer.ggml.model" +#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens" +#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type" +#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores" +#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id" +#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id" +#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id" +#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id" +#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id" +#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json" + +#define KV_CONTEXT_LENGTH "llama.context_length" +#define KV_EMBEDDING_LENGTH "llama.embedding_length" +#define KV_BLOCK_COUNT "llama.block_count" +#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length" +#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count" +#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv" +#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon" +#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count" + +#define TN_TOKEN_EMBD "token_embd.weight" +#define TN_OUTPUT_NORM "output_norm.weight" +#define TN_OUTPUT "output.weight" +#define TN_ATTN_NORM "blk.%d.attn_norm.weight" +#define TN_ATTN_Q "blk.%d.attn_q.weight" +#define TN_ATTN_K "blk.%d.attn_k.weight" +#define TN_ATTN_V "blk.%d.attn_v.weight" +#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight" +#define TN_FFN_NORM "blk.%d.ffn_norm.weight" +#define TN_FFN_GATE "blk.%d.ffn_gate.weight" +#define TN_FFN_DOWN "blk.%d.ffn_down.weight" +#define TN_FFN_UP "blk.%d.ffn_up.weight" + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -20,6 +59,11 @@ #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' #define LLAMA_FILE_VERSION_GGJT_V3 3 +#define TOKENIZER_NAME "llama" +#define UNKNOWN_TOKEN_ID 0 +#define BOS_TOKEN_ID 1 +#define 
EOS_TOKEN_ID 2 + //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc. typedef struct { int dim; // transformer dimension @@ -183,6 +227,7 @@ struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? uint32_t n_embd = 4096; + uint32_t n_ff = 11008; uint32_t n_mult = 4; uint32_t n_head = 32; uint32_t n_layer = 32; @@ -214,6 +259,8 @@ struct my_llama_layer { struct my_llama_model { struct ggml_context * ctx = NULL; + std::string name; + my_llama_hparams hparams; struct ggml_tensor * tok_embeddings; @@ -276,18 +323,13 @@ struct train_params { int mem_compute1_gb; }; -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} - void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_mult: %d\n", __func__, params->n_mult); printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_ff: %d\n", __func__, params->n_ff); printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_rot: %d\n", __func__, params->n_rot); } @@ -299,7 +341,7 @@ void init_model(struct my_llama_model * model) { const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - const uint32_t n_ff = get_n_ff(&hparams); + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; model->train_its = 0; @@ -481,21 +523,6 @@ struct llama_file { return std::string(chars.data(), len); } - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: 
%s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - ~llama_file() { if (fp) { std::fclose(fp); @@ -503,30 +530,6 @@ struct llama_file { } }; -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - bool is_ggml_file(const char *filename) { llama_file file(filename, "rb"); if (file.size < 4) { @@ -536,48 +539,96 @@ bool is_ggml_file(const char *filename) { return magic == GGUF_MAGIC; } +static std::string llama_escape_whitespaces(const std::string& text) { + std::ostringstream out; + for (char c : text) { + if (c == ' ') out << "\xe2\x96\x81"; + else out << c; + } + return out.str(); +} + void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { -#pragma message("TODO: implement reading vocabulary using gguf") -// // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary -// if (is_ggml_file(filename)) { -// -// struct llama_context_params llama_params = llama_context_default_params(); -// llama_params.vocab_only = true; -// -// struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); -// struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); -// -// const int n_vocab = llama_n_vocab(lctx); -// vocab->id_to_token.resize(n_vocab); -// 
for (int i=0; i<n_vocab; ++i) { -// vocab->id_to_token[i].text = llama_token_get_text(lctx, i); -// vocab->id_to_token[i].score = llama_token_get_score(lctx, i); -// vocab->id_to_token[i].type = llama_token_get_type(lctx, i); -// vocab->token_to_id.emplace(vocab->id_to_token[i].text, i); -// } -// llama_free(lctx); -// llama_free_model(lmodel); -// } else - { // assume llama2.c vocabulary - printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename); + if (is_ggml_file(filename)) { + struct ggml_context * ctx_data = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, + }; + + struct gguf_context * ctx = gguf_init_from_file(filename, params); + GGML_ASSERT(ctx != NULL); + + const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL); + GGML_ASSERT(model_idx >= 0); + std::string tokenizer_name = gguf_get_val_str(ctx, model_idx); + GGML_ASSERT(tokenizer_name == TOKENIZER_NAME); + + const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST); + GGML_ASSERT(token_idx >= 0); + + const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES); + GGML_ASSERT(score_idx >= 0); + const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + + const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE); + GGML_ASSERT(toktype_idx >= 0); + const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + + vocab->id_to_token.resize(n_vocab); + + for (uint32_t i = 0; i < n_vocab; i++) { + std::string word = gguf_get_arr_str(ctx, token_idx, i); + + vocab->token_to_id[word] = i; + + auto & token_data = vocab->id_to_token[i]; + token_data.text = std::move(word); + token_data.score = scores[i]; + token_data.type = (llama_token_type) toktypes[i]; + } + ggml_free(ctx_data); + gguf_free(ctx); + } else { + // assume llama2.c vocabulary + printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); llama_file file(filename, "rb"); const int 
n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); - for (int i=0; i single byte tokens. - char byte_val; - if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) { - char cstr[2] = { byte_val, 0 }; - text = cstr; + + unsigned char byte_val; + llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL; + if (id == UNKNOWN_TOKEN_ID) { + text = ""; + type = LLAMA_TOKEN_TYPE_UNKNOWN; + } else if (id == BOS_TOKEN_ID) { + text = ""; + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (id == EOS_TOKEN_ID) { + text = ""; + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (text.empty()) { + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) { + // Text of byte tokens is already in the expected format. + type = LLAMA_TOKEN_TYPE_BYTE; + } else { + type = LLAMA_TOKEN_TYPE_NORMAL; } - vocab->id_to_token[i].text = text; - vocab->id_to_token[i].score = score; - vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED; - vocab->token_to_id.emplace(text, i); + text = llama_escape_whitespaces(text); + + vocab->id_to_token[id].text = text; + vocab->id_to_token[id].score = score; + vocab->id_to_token[id].type = type; + vocab->token_to_id.emplace(text, id); } } } @@ -619,33 +670,6 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar } void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - -#pragma message("TODO: implement file saving using gguf") - // write_magic - file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic - file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version - // write_hparams - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - 
file.write_u32(model->hparams.n_rot); - file.write_u32(LLAMA_FTYPE_ALL_F32); - - // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk. - uint32_t n_vocab = model->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_data = vocab->id_to_token.at(i); - file.write_u32((uint32_t) token_data.text.size()); - file.write_raw(token_data.text.data(), token_data.text.size()); - file.write_raw(&token_data.score, sizeof(token_data.score)); - } - // stuff AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor @@ -658,8 +682,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod // for rms-att-weight int row_length = model->hparams.n_embd; const auto & hparams = model->hparams; - //int n_ff = model->hparams.n_embd; - int n_ff = get_n_ff(&hparams); + int n_ff = model->hparams.n_ff; for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; @@ -677,28 +700,91 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); } + + struct gguf_context * ctx = gguf_init_empty(); + + std::vector<const char*> tokens; + std::vector<float> scores; + std::vector<llama_token_type> token_types; + for (const llama_vocab::token_data & token_data : vocab->id_to_token) { + tokens.push_back(token_data.text.c_str()); + scores.push_back(token_data.score); + token_types.push_back(token_data.type); + } + gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size()); + gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size()); + gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size()); + + gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME); + + gguf_set_val_str(ctx, 
KV_GENERAL_ARCHITECTURE, "llama"); + gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama"); + + // special tokens + gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); + gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1); + + gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); + gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); + gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); + // n_head_kv is optional, default to n_head + // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...); + gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer); + gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot); + gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f); + // write tensors - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); // ? 
+ ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD); + gguf_add_tensor(ctx, model->tok_embeddings); + + ggml_set_name(model->norm, TN_OUTPUT_NORM); + gguf_add_tensor(ctx, model->norm); + + ggml_set_name(model->output, TN_OUTPUT); + gguf_add_tensor(ctx, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); + ggml_format_name(layer.wq, TN_ATTN_Q, i); + gguf_add_tensor(ctx, layer.wq); + + ggml_format_name(layer.wk, TN_ATTN_K, i); + gguf_add_tensor(ctx, layer.wk); + + ggml_format_name(layer.wv, TN_ATTN_V, i); + gguf_add_tensor(ctx, layer.wv); + + ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i); + gguf_add_tensor(ctx, layer.wo); + + ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i); + gguf_add_tensor(ctx, layer.attention_norm); + + ggml_format_name(layer.w1, TN_FFN_GATE, i); + gguf_add_tensor(ctx, layer.w1); + + ggml_format_name(layer.w2, TN_FFN_DOWN, i); + gguf_add_tensor(ctx, layer.w2); + + ggml_format_name(layer.w3, TN_FFN_UP, i); + gguf_add_tensor(ctx, layer.w3); + + ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i); + gguf_add_tensor(ctx, layer.ffn_norm); } + + gguf_write_to_file(ctx, filename, false); + gguf_free(ctx); } struct train_params get_default_train_params() { struct train_params params; - params.fn_vocab_model = "tokenizer.bin"; + params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_train_data = "shakespeare.txt"; params.fn_checkpoint_in = "checkpoint.bin"; @@ -751,7 +837,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params) fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); 
fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model); fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); fprintf(stderr, "\n"); @@ -812,6 +898,14 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { return true; } +std::string basename(const std::string &path) { + size_t pos = path.find_last_of("/"); + if (pos == std::string::npos) { + return path; + } + return path.substr(pos + 1); +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, &params)) { @@ -840,6 +934,7 @@ int main(int argc, char ** argv) { model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = config.dim; //params.n_embd; + model.hparams.n_ff = config.hidden_dim; model.hparams.n_mult = 32;//params.n_mult; model.hparams.n_head = config.n_heads; //params.n_head; model.hparams.n_layer = config.n_layers; //params.n_layer; @@ -853,6 +948,7 @@ int main(int argc, char ** argv) { model.ctx = ggml_init(lcparams); init_model(&model); + model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);