From d1031cf49c3b958b915fd558e23453471c29ac33 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 20 Oct 2023 21:07:23 +0300 Subject: [PATCH] sampling : refactor init to use llama_sampling_params (#3696) * sampling : refactor init to use llama_sampling_params * llama : combine repetition, frequency and presence penalties in 1 call * examples : remove embd-input and gptneox-wip * sampling : rename penalty params + reduce size of "prev" vector * sampling : add llama_sampling_print helper * sampling : hide prev behind API and apply #3661 ggml-ci --- Makefile | 9 +- README.md | 1 - common/common.cpp | 69 +- common/common.h | 3 +- common/sampling.cpp | 73 +- common/sampling.h | 32 +- examples/CMakeLists.txt | 32 +- examples/embd-input/.gitignore | 4 - examples/embd-input/CMakeLists.txt | 17 - examples/embd-input/README.md | 63 -- examples/embd-input/embd-input-lib.cpp | 221 ----- examples/embd-input/embd-input-test.cpp | 35 - examples/embd-input/embd-input.h | 27 - examples/embd-input/embd_input.py | 72 -- examples/embd-input/llava.py | 71 -- examples/embd-input/minigpt4.py | 129 --- examples/embd-input/panda_gpt.py | 99 -- examples/gptneox-wip/cmpnct_gpt2bpe.hpp | 1133 ----------------------- examples/gptneox-wip/falcon-main.cpp | 1111 ---------------------- examples/gptneox-wip/gptneox-main.cpp | 1083 ---------------------- examples/infill/CMakeLists.txt | 2 +- examples/infill/infill.cpp | 67 +- examples/llava/llava-utils.h | 58 +- examples/main/main.cpp | 28 +- examples/parallel/parallel.cpp | 4 +- examples/server/server.cpp | 227 +++-- examples/speculative/speculative.cpp | 12 +- llama.cpp | 94 +- llama.h | 16 +- tests/test-sampling.cpp | 75 +- 30 files changed, 365 insertions(+), 4502 deletions(-) delete mode 100644 examples/embd-input/.gitignore delete mode 100644 examples/embd-input/CMakeLists.txt delete mode 100644 examples/embd-input/README.md delete mode 100644 examples/embd-input/embd-input-lib.cpp delete mode 100644 
examples/embd-input/embd-input-test.cpp delete mode 100644 examples/embd-input/embd-input.h delete mode 100755 examples/embd-input/embd_input.py delete mode 100755 examples/embd-input/llava.py delete mode 100755 examples/embd-input/minigpt4.py delete mode 100755 examples/embd-input/panda_gpt.py delete mode 100644 examples/gptneox-wip/cmpnct_gpt2bpe.hpp delete mode 100644 examples/gptneox-wip/falcon-main.cpp delete mode 100644 examples/gptneox-wip/gptneox-main.cpp diff --git a/Makefile b/Makefile index 04104bee8..325ae747b 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search \ speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o # Binaries only useful for tests @@ -608,13 +608,6 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. 
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) - - -embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput - gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index ce63c6f0e..49bb556a8 100644 --- a/README.md +++ b/README.md @@ -962,7 +962,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m / - [main](./examples/main/README.md) - [server](./examples/server/README.md) -- [embd-input](./examples/embd-input/README.md) - [jeopardy](./examples/jeopardy/README.md) - [BLIS](./docs/BLIS.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md) diff --git a/common/common.cpp b/common/common.cpp index ce14d66b8..2ef902bd5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::string arg; gpt_params default_params; const std::string arg_prefix = "--"; - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -241,25 +241,26 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) 
{ invalid_param = true; break; } - sparams.repeat_last_n = std::stoi(argv[i]); + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); } else if (arg == "--repeat-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.repeat_penalty = std::stof(argv[i]); + sparams.penalty_repeat = std::stof(argv[i]); } else if (arg == "--frequency-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.frequency_penalty = std::stof(argv[i]); + sparams.penalty_freq = std::stof(argv[i]); } else if (arg == "--presence-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.presence_penalty = std::stof(argv[i]); + sparams.penalty_present = std::stof(argv[i]); } else if (arg == "--mirostat") { if (++i >= argc) { invalid_param = true; @@ -572,7 +573,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.grammar = argv[i]; + sparams.grammar = argv[i]; } else if (arg == "--grammar-file") { if (++i >= argc) { invalid_param = true; @@ -587,7 +588,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.grammar) + std::back_inserter(sparams.grammar) ); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters @@ -640,7 +641,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - const llama_sampling_params & sparams = params.sampling_params; + const llama_sampling_params & sparams = params.sparams; printf("usage: %s [options]\n", argv[0]); printf("\n"); @@ -678,10 +679,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 
= disabled)\n", (double)sparams.tfs_z); printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n); - printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty); - printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty); - printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty); + printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); + printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); + printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); + printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); @@ -878,7 +879,7 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY; } { @@ -1123,28 +1124,28 @@ std::string get_sortable_timestamp() { void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { - const llama_sampling_params & sparams = 
params.sampling_params; + const llama_sampling_params & sparams = params.sparams; fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); fprintf(stream, "build_number: %d\n", BUILD_NUMBER); - fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); - fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); - fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); - fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); - fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); - fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); - fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); - fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? 
"true" : "false"); - fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); #ifdef NDEBUG fprintf(stream, "debug: false\n"); @@ -1178,8 +1179,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty); - dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); + dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? 
"true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); @@ -1238,14 +1239,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? 
"true" : "false"); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); fprintf(stream, "reverse_prompt:\n"); for (std::string ap : params.antiprompt) { diff --git a/common/common.h b/common/common.h index 65d3d20cd..84523a4fb 100644 --- a/common/common.h +++ b/common/common.h @@ -56,7 +56,7 @@ struct gpt_params { float rope_freq_scale = 0.0f; // RoPE frequency scaling factor // // sampling parameters - struct llama_sampling_params sampling_params; + struct llama_sampling_params sparams; std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model_draft = ""; // draft model for speculative decoding @@ -66,7 +66,6 @@ struct gpt_params { std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string input_prefix = ""; // string to prefix user inputs with std::string input_suffix = ""; // string to suffix user inputs with - std::string grammar = ""; // optional BNF-like grammar to constrain sampling std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files diff --git a/common/sampling.cpp b/common/sampling.cpp index 0b2466581..6f0af3c4a 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,9 +1,9 @@ #include "sampling.h" -struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) { +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { struct llama_sampling_context * result = new llama_sampling_context(); - result->params = params.sampling_params; + result->params = params; result->grammar = nullptr; // if there is a grammar, parse it @@ -23,7 +23,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); } - 
result->prev.resize(params.n_ctx); + result->prev.resize(params.n_prev); return result; } @@ -66,25 +66,56 @@ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * ds dst->prev = src->prev; } +llama_token llama_sampling_last(llama_sampling_context * ctx) { + return ctx->prev.back(); +} + +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) { + const int size = ctx_sampling->prev.size(); + + n = std::min(n, size); + + std::string result; + + for (int i = size - n; i < size; i++) { + result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]); + } + + return result; +} + +std::string llama_sampling_print(const llama_sampling_params & params) { + char result[1024]; + + snprintf(result, sizeof(result), + "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", + params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, + params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, + params.mirostat, params.mirostat_eta, params.mirostat_tau); + + return std::string(result); +} + llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, const int idx) { - const int n_ctx = llama_n_ctx(ctx_main); - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const llama_sampling_params & params = ctx_sampling->params; + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const float top_p = params.top_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; const int mirostat = params.mirostat; const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; @@ -97,7 +128,7 @@ llama_token llama_sampling_sample( float * logits = llama_get_logits_ith(ctx_main, idx); - // Apply params.logit_bias map + // apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { logits[it->first] += it->second; } @@ -117,14 +148,10 @@ llama_token llama_sampling_sample( // apply penalties if (!prev.empty()) { const float nl_logit = logits[llama_token_nl(ctx_main)]; - const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx); - llama_sample_repetition_penalty(ctx_main, &cur_p, - prev.data() + prev.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p, - prev.data() + prev.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); + llama_sample_repetition_penalties(ctx_main, &cur_p, + prev.data() + prev.size() - penalty_last_n, + penalty_last_n, penalty_repeat, penalty_freq, penalty_present); if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { @@ -141,7 +168,7 @@ llama_token llama_sampling_sample( } if (temp <= 0) { - // Greedy sampling + // greedy sampling id = llama_sample_token_greedy(ctx_main, &cur_p); } else { if (mirostat == 1) { @@ -152,8 +179,9 @@ llama_token llama_sampling_sample( llama_sample_temp(ctx_main, &cur_p, temp); id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, 
mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { - // Temperature sampling + // temperature sampling size_t min_keep = std::max(1, params.n_probs); + llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); @@ -183,11 +211,12 @@ llama_token llama_sampling_sample( void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, - llama_token id) { + llama_token id, + bool apply_grammar) { ctx_sampling->prev.erase(ctx_sampling->prev.begin()); ctx_sampling->prev.push_back(id); - if (ctx_sampling->grammar != NULL) { + if (ctx_sampling->grammar != NULL && apply_grammar) { llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); } } diff --git a/common/sampling.h b/common/sampling.h index 50afcbc12..62ea6d4cf 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -10,30 +10,30 @@ // sampling parameters typedef struct llama_sampling_params { + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // 1.0 = disabled - float repeat_penalty = 1.10f; // 1.0 = disabled - int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float frequency_penalty = 0.00f; // 0.0 = disabled - float presence_penalty = 0.00f; // 0.0 = disabled + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + std::string grammar; // optional BNF-like grammar to constrain sampling // Classifier-Free Guidance // https://arxiv.org/abs/2306.17806 - std::string cfg_negative_prompt; // string to help guidance - float cfg_scale = 1.f; // How strong is guidance + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // how strong is guidance std::unordered_map logit_bias; // logit bias for specific tokens - } llama_sampling_params; // general sampler context @@ -58,7 +58,7 @@ struct llama_sampling_context { #include "common.h" // Create a new sampling context instance. 
-struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params); +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params); void llama_sampling_free(struct llama_sampling_context * ctx); @@ -70,6 +70,15 @@ void llama_sampling_reset(llama_sampling_context * ctx); // Copy the sampler context void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); +// Get the last sampled token +llama_token llama_sampling_last(llama_sampling_context * ctx); + +// Get a string representation of the last sampled tokens +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n); + +// Print sampling parameters into a string +std::string llama_sampling_print(const llama_sampling_params & params); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call @@ -96,4 +105,5 @@ llama_token llama_sampling_sample( void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, - llama_token id); + llama_token id, + bool apply_grammar); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e16c65f7c..75b8df676 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,26 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(main) - add_subdirectory(quantize) - add_subdirectory(quantize-stats) - add_subdirectory(perplexity) - add_subdirectory(embedding) - add_subdirectory(save-load-state) - add_subdirectory(benchmark) add_subdirectory(baby-llama) - add_subdirectory(train-text-from-scratch) - add_subdirectory(finetune) - add_subdirectory(convert-llama2c-to-ggml) - add_subdirectory(simple) add_subdirectory(batched) add_subdirectory(batched-bench) - 
add_subdirectory(speculative) - add_subdirectory(parallel) - add_subdirectory(embd-input) - add_subdirectory(llava) - add_subdirectory(llama-bench) add_subdirectory(beam-search) + add_subdirectory(benchmark) + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(embedding) + add_subdirectory(finetune) + add_subdirectory(infill) + add_subdirectory(llama-bench) + add_subdirectory(llava) + add_subdirectory(main) + add_subdirectory(parallel) + add_subdirectory(perplexity) + add_subdirectory(quantize) + add_subdirectory(quantize-stats) + add_subdirectory(save-load-state) + add_subdirectory(simple) + add_subdirectory(speculative) + add_subdirectory(train-text-from-scratch) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore deleted file mode 100644 index 87ef68771..000000000 --- a/examples/embd-input/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -PandaGPT -MiniGPT-4 -*.pth - diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt deleted file mode 100644 index 5bbb1ea02..000000000 --- a/examples/embd-input/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -set(TARGET embdinput) -add_library(${TARGET} embd-input-lib.cpp embd-input.h) -install(TARGETS ${TARGET} LIBRARY) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() - -set(TARGET embd-input-test) -add_executable(${TARGET} embd-input-test.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md deleted file mode 100644 index 5c4c75ea7..000000000 --- a/examples/embd-input/README.md +++ 
/dev/null @@ -1,63 +0,0 @@ -### Examples for input embedding directly - -## Requirement -build `libembdinput.so` -run the following comman in main dir (../../). -``` -make -``` - -## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py) - -1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/). -2. Convert it to ggml format. -3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin). - -``` -import torch - -bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin" -pth_path = "./examples/embd-input/llava_projection.pth" - -dic = torch.load(bin_path) -used_key = ["model.mm_projector.weight","model.mm_projector.bias"] -torch.save({k: dic[k] for k in used_key}, pth_path) -``` -4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`. - - -## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py) - -1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format. -The `adapter_config.json` is -``` -{ - "peft_type": "LORA", - "fan_in_fan_out": false, - "bias": null, - "modules_to_save": null, - "r": 32, - "lora_alpha": 32, - "lora_dropout": 0.1, - "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"] -} -``` -2. Papare the `vicuna` v0 model. -3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model. -4. Clone the PandaGPT source. -``` -git clone https://github.com/yxuansu/PandaGPT -``` -5. Install the requirement of PandaGPT. -6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py. - -## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py) - -1. 
Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`. -2. Clone the MiniGPT-4 source. -``` -git clone https://github.com/Vision-CAIR/MiniGPT-4/ -``` -3. Install the requirement of PandaGPT. -4. Papare the `vicuna` v0 model. -5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`. diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp deleted file mode 100644 index 3ce33842c..000000000 --- a/examples/embd-input/embd-input-lib.cpp +++ /dev/null @@ -1,221 +0,0 @@ -#include "build-info.h" -#include "common.h" -#include "embd-input.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static llama_context ** g_ctx; - -extern "C" { - -struct MyModel* create_mymodel(int argc, char ** argv) { - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return nullptr; - } - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = uint32_t(time(NULL)); - } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - llama_backend_init(params.numa); - - llama_model * model; - llama_context * ctx; - - g_ctx = &ctx; - - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return nullptr; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); - } - struct MyModel * ret = new MyModel(); - ret->ctx = ctx; - ret->params = params; - ret->n_past = 0; - // printf("ctx: %d\n", ret->ctx); - return ret; -} - -void free_mymodel(struct MyModel * mymodel) { - llama_context * ctx = mymodel->ctx; - llama_print_timings(ctx); - llama_free(ctx); - delete mymodel; -} - - -bool eval_float(void * model, float * input, int N){ - MyModel * mymodel = (MyModel*)model; - 
llama_context * ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_emb = llama_n_embd(llama_get_model(ctx)); - int n_past = mymodel->n_past; - int n_batch = N; // params.n_batch; - - for (int i = 0; i < (int) N; i += n_batch) { - int n_eval = (int) N - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_tokens(void * model, std::vector tokens) { - MyModel * mymodel = (MyModel* )model; - llama_context * ctx; - ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_past = mymodel->n_past; - for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_id(struct MyModel* mymodel, int id) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(mymodel, tokens); -} - -bool eval_string(struct MyModel * mymodel,const char* str){ - llama_context * ctx = mymodel->ctx; - std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx, str2, true); - eval_tokens(mymodel, embd_inp); - return true; -} - -llama_token sampling_id(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - gpt_params params = mymodel->params; - llama_sampling_params & sparams = params.sampling_params; - // int n_ctx = llama_n_ctx(ctx); - - // out of user input, sample next token - const float temp = sparams.temp; - const int32_t top_k = sparams.top_k <= 0 ? 
llama_n_vocab(llama_get_model(ctx)) : sparams.top_k; - const float top_p = sparams.top_p; - const float tfs_z = sparams.tfs_z; - const float typical_p = sparams.typical_p; - // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - // const float repeat_penalty = params.repeat_penalty; - // const float alpha_presence = params.presence_penalty; - // const float alpha_frequency = params.frequency_penalty; - const int mirostat = sparams.mirostat; - const float mirostat_tau = sparams.mirostat_tau; - const float mirostat_eta = sparams.mirostat_eta; - // const bool penalize_nl = params.penalize_nl; - - llama_token id = 0; - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(llama_get_model(ctx)); - - // Apply params.logit_bias map - for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // TODO: Apply penalties - // float nl_logit = logits[llama_token_nl(ctx)]; - // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - // llama_sample_repetition_penalty(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, repeat_penalty); - // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, alpha_frequency, alpha_presence); - // if (!penalize_nl) { - // logits[llama_token_nl(ctx)] = nl_logit; - // } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - 
const int mirostat_m = 100; - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - } - - return id; -} - -const char * sampling(struct MyModel * mymodel) { - llama_context * ctx = mymodel->ctx; - int id = sampling_id(mymodel); - static std::string ret; - if (id == llama_token_eos(ctx)) { - ret = ""; - } else { - ret = llama_token_to_piece(ctx, id); - } - eval_id(mymodel, id); - return ret.c_str(); -} - -} diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp deleted file mode 100644 index dc4a0e488..000000000 --- a/examples/embd-input/embd-input-test.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "embd-input.h" -#include -#include -#include - -int main(int argc, char** argv) { - - auto mymodel = create_mymodel(argc, argv); - int N = 10; - int max_tgt_len = 500; - int n_embd = llama_n_embd(llama_get_model(mymodel->ctx)); - - // add random float embd to test evaluation - float * data = new float[N*n_embd]; - std::default_random_engine e; - std::uniform_real_distribution u(0,1); - for (int i=0;iparams.prompt.c_str()); - const char* tmp; - for (int i=0; i")==0) break; - printf("%s", tmp); - fflush(stdout); - } - printf("\n"); - free_mymodel(mymodel); - return 0; -} diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h deleted 
file mode 100644 index eff5e3b84..000000000 --- a/examples/embd-input/embd-input.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _EMBD_INPUT_H_ -#define _EMBD_INPUT_H_ 1 - -#include "common.h" -#include "llama.h" - -extern "C" { - -typedef struct MyModel { - llama_context* ctx; - gpt_params params; - int n_past = 0; -} MyModel; - -struct MyModel* create_mymodel(int argc, char ** argv); - -bool eval_float(void* model, float* input, int N); -bool eval_tokens(void* model, std::vector tokens); -bool eval_id(struct MyModel* mymodel, int id); -bool eval_string(struct MyModel* mymodel, const char* str); -const char * sampling(struct MyModel* mymodel); -llama_token sampling_id(struct MyModel* mymodel); -void free_mymodel(struct MyModel* mymodel); - -} - -#endif diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py deleted file mode 100755 index f146acdc1..000000000 --- a/examples/embd-input/embd_input.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -import ctypes -from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int -import numpy as np -import os - -libc = cdll.LoadLibrary("./libembdinput.so") -libc.sampling.restype=c_char_p -libc.create_mymodel.restype=c_void_p -libc.eval_string.argtypes=[c_void_p, c_char_p] -libc.sampling.argtypes=[c_void_p] -libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int] - - -class MyModel: - def __init__(self, args): - argc = len(args) - c_str = [c_char_p(i.encode()) for i in args] - args_c = (c_char_p * argc)(*c_str) - self.model = c_void_p(libc.create_mymodel(argc, args_c)) - self.max_tgt_len = 512 - self.print_string_eval = True - - def __del__(self): - libc.free_mymodel(self.model) - - def eval_float(self, x): - libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1]) - - def eval_string(self, x): - libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) - if self.print_string_eval: - print(x) - - def eval_token(self, x): - 
libc.eval_id(self.model, x) - - def sampling(self): - s = libc.sampling(self.model) - return s - - def stream_generate(self, end=""): - ret = b"" - end = end.encode() - for _ in range(self.max_tgt_len): - tmp = self.sampling() - ret += tmp - yield tmp - if ret.endswith(end): - break - - def generate_with_print(self, end=""): - ret = b"" - for i in self.stream_generate(end=end): - ret += i - print(i.decode(errors="replace"), end="", flush=True) - print("") - return ret.decode(errors="replace") - - - def generate(self, end=""): - text = b"".join(self.stream_generate(end=end)) - return text.decode(errors="replace") - -if __name__ == "__main__": - model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) - model.eval_string("""user: what is the color of the flag of UN?""") - x = np.random.random((5120,10))# , dtype=np.float32) - model.eval_float(x) - model.eval_string("""assistant:""") - for i in model.generate(): - print(i.decode(errors="replace"), end="", flush=True) diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py deleted file mode 100755 index 06fad55f4..000000000 --- a/examples/embd-input/llava.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch -from transformers import CLIPVisionModel, CLIPImageProcessor -from PIL import Image - -# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1' -vision_tower = "openai/clip-vit-large-patch14" -select_hidden_state_layer = -2 -# (vision_config.image_size // vision_config.patch_size) ** 2 -image_token_len = (224//14)**2 - -class Llava: - def __init__(self, args): - self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) - self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) - self.mm_projector = nn.Linear(1024, 5120) - self.model = MyModel(["main", *args]) - - def 
load_projection(self, path): - state = torch.load(path) - self.mm_projector.load_state_dict({ - "weight": state["model.mm_projector.weight"], - "bias": state["model.mm_projector.bias"]}) - - def chat(self, question): - self.model.eval_string("user: ") - self.model.eval_string(question) - self.model.eval_string("\nassistant: ") - return self.model.generate_with_print() - - def chat_with_image(self, image, question): - with torch.no_grad(): - embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] - image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) - select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] - image_feature = select_hidden_state[:, 1:] - embd_image = self.mm_projector(image_feature) - embd_image = embd_image.cpu().numpy()[0] - self.model.eval_string("user: ") - self.model.eval_token(32003-2) # im_start - self.model.eval_float(embd_image.T) - for i in range(image_token_len-embd_image.shape[0]): - self.model.eval_token(32003-3) # im_patch - self.model.eval_token(32003-1) # im_end - self.model.eval_string(question) - self.model.eval_string("\nassistant: ") - return self.model.generate_with_print() - - -if __name__=="__main__": - # model form liuhaotian/LLaVA-13b-delta-v1-1 - a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"]) - # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. - # Also here can use pytorch_model-00003-of-00003.bin directly. 
- a.load_projection(os.path.join( - os.path.dirname(__file__) , - "llava_projection.pth")) - respose = a.chat_with_image( - Image.open("./media/llama1-logo.png").convert('RGB'), - "what is the text in the picture?") - respose - a.chat("what is the color of it?") - - - diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py deleted file mode 100755 index 7b13e4a5c..000000000 --- a/examples/embd-input/minigpt4.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch -from PIL import Image - -minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4") -sys.path.insert(0, minigpt4_path) -from minigpt4.models.blip2 import Blip2Base -from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor - - -class MiniGPT4(Blip2Base): - """ - MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4 - """ - def __init__(self, - args, - vit_model="eva_clip_g", - q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth", - img_size=224, - drop_path_rate=0, - use_grad_checkpoint=False, - vit_precision="fp32", - freeze_vit=True, - freeze_qformer=True, - num_query_token=32, - llama_model="", - prompt_path="", - prompt_template="", - max_txt_len=32, - end_sym='\n', - low_resource=False, # use 8 bit and put vit in cpu - device_8bit=0 - ): - super().__init__() - self.img_size = img_size - self.low_resource = low_resource - self.preprocessor = Blip2ImageEvalProcessor(img_size) - - print('Loading VIT') - self.visual_encoder, self.ln_vision = self.init_vision_encoder( - vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision - ) - print('Loading VIT Done') - print('Loading Q-Former') - self.Qformer, self.query_tokens = self.init_Qformer( - num_query_token, self.visual_encoder.num_features - ) - 
self.Qformer.cls = None - self.Qformer.bert.embeddings.word_embeddings = None - self.Qformer.bert.embeddings.position_embeddings = None - for layer in self.Qformer.bert.encoder.layer: - layer.output = None - layer.intermediate = None - self.load_from_pretrained(url_or_filename=q_former_model) - print('Loading Q-Former Done') - self.llama_proj = nn.Linear( - self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size - ) - self.max_txt_len = max_txt_len - self.end_sym = end_sym - self.model = MyModel(["main", *args]) - # system prompt - self.model.eval_string("Give the following image: ImageContent. " - "You will be able to see the image once I provide it to you. Please answer my questions." - "###") - - def encode_img(self, image): - image = self.preprocessor(image) - image = image.unsqueeze(0) - device = image.device - if self.low_resource: - self.vit_to_cpu() - image = image.to("cpu") - - with self.maybe_autocast(): - image_embeds = self.ln_vision(self.visual_encoder(image)).to(device) - image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device) - - query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) - query_output = self.Qformer.bert( - query_embeds=query_tokens, - encoder_hidden_states=image_embeds, - encoder_attention_mask=image_atts, - return_dict=True, - ) - - inputs_llama = self.llama_proj(query_output.last_hidden_state) - # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device) - return inputs_llama - - def load_projection(self, path): - state = torch.load(path)["model"] - self.llama_proj.load_state_dict({ - "weight": state["llama_proj.weight"], - "bias": state["llama_proj.bias"]}) - - def chat(self, question): - self.model.eval_string("Human: ") - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - return self.model.generate_with_print(end="###") - - def chat_with_image(self, image, question): - with torch.no_grad(): - embd_image = 
self.encode_img(image) - embd_image = embd_image.cpu().numpy()[0] - self.model.eval_string("Human: ") - self.model.eval_float(embd_image.T) - self.model.eval_string(" ") - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - return self.model.generate_with_print(end="###") - - -if __name__=="__main__": - a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"]) - a.load_projection(os.path.join( - os.path.dirname(__file__) , - "pretrained_minigpt4.pth")) - respose = a.chat_with_image( - Image.open("./media/llama1-logo.png").convert('RGB'), - "what is the text in the picture?") - a.chat("what is the color of it?") diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py deleted file mode 100755 index 891ad7cc9..000000000 --- a/examples/embd-input/panda_gpt.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch - -# use PandaGPT path -panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT") -imagebind_ckpt_path = "./models/panda_gpt/" - -sys.path.insert(0, os.path.join(panda_gpt_path,"code","model")) -from ImageBind.models import imagebind_model -from ImageBind import data - -ModalityType = imagebind_model.ModalityType -max_tgt_len = 400 - -class PandaGPT: - def __init__(self, args): - self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path) - self.visual_encoder.eval() - self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120) - self.max_tgt_len = max_tgt_len - self.model = MyModel(["main", *args]) - self.generated_text = "" - self.device = "cpu" - - def load_projection(self, path): - state = torch.load(path, map_location="cpu") - self.llama_proj.load_state_dict({ - "weight": state["llama_proj.weight"], - "bias": state["llama_proj.bias"]}) - - def 
eval_inputs(self, inputs): - self.model.eval_string("") - embds = self.extract_multimoal_feature(inputs) - for i in embds: - self.model.eval_float(i.T) - self.model.eval_string(" ") - - def chat(self, question): - return self.chat_with_image(None, question) - - def chat_with_image(self, inputs, question): - if self.generated_text == "": - self.model.eval_string("###") - self.model.eval_string(" Human: ") - if inputs: - self.eval_inputs(inputs) - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - ret = self.model.generate_with_print(end="###") - self.generated_text += ret - return ret - - def extract_multimoal_feature(self, inputs): - features = [] - for key in ["image", "audio", "video", "thermal"]: - if key + "_paths" in inputs: - embeds = self.encode_data(key, inputs[key+"_paths"]) - features.append(embeds) - return features - - def encode_data(self, data_type, data_paths): - - type_map = { - "image": ModalityType.VISION, - "audio": ModalityType.AUDIO, - "video": ModalityType.VISION, - "thermal": ModalityType.THERMAL, - } - load_map = { - "image": data.load_and_transform_vision_data, - "audio": data.load_and_transform_audio_data, - "video": data.load_and_transform_video_data, - "thermal": data.load_and_transform_thermal_data - } - - load_function = load_map[data_type] - key = type_map[data_type] - - inputs = {key: load_function(data_paths, self.device)} - with torch.no_grad(): - embeddings = self.visual_encoder(inputs) - embeds = embeddings[key] - embeds = self.llama_proj(embeds).cpu().numpy() - return embeds - - -if __name__=="__main__": - a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) - a.load_projection("./models/panda_gpt/adapter_model.bin") - a.chat_with_image( - {"image_paths": ["./media/llama1-logo.png"]}, - "what is the text in the picture? 
'llama' or 'lambda'?") - a.chat("what is the color of it?") diff --git a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp deleted file mode 100644 index 9d433f4b1..000000000 --- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp +++ /dev/null @@ -1,1133 +0,0 @@ -#ifndef CMPNCT_GPT2BPE -#define CMPNCT_GPT2BPE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// Unicode GPT2 Byte Pair Encoding Tokenizer -// Adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] -// Removed loading of merges from HF json and parts made for a specific vocab - - -//----------------- -// Unicode library (from cmpnct_unicode.cpp) -//----------------- - -// Minimal library for high performance handling and categorization of UTF8 strings and characters -// Using std::string - -enum CNCTCharType { - DIGIT, // a numerical char in any language - LETTER, // a letter in any language - WHITESPACE, // any form of whitespace - ACCENT_MARK, // letter modifiers like ´ in é - PUNCTUATION, // punctuation including brackets - SYMBOL, // math, currency, other symbols - CONTROL, // control characters - MIXED, // a mix of the above - UNIDENTIFIED // something more exotic like emoji or separators -}; - -struct CNCTUnicode; - -struct CNCTString { - std::string str; - size_t utf8_chars; - - CNCTCharType char_type=UNIDENTIFIED; - bool is_sequential=false; - - size_t seq_offset_bytes=0; - size_t seq_offset_utf8_chars=0; - - bool operator==(const std::string &other) const; - bool operator==(const char other) const; - bool operator==(const CNCTString &other) const; - CNCTString &operator+=(const std::string &other); - CNCTString &operator+=(const char other); - friend CNCTString operator+(CNCTString lhs, const std::string &rhs); - friend CNCTString operator+(CNCTString lhs, const char rhs); - CNCTString& operator+=(const CNCTString& other); - friend CNCTString operator+(CNCTString lhs, const CNCTString& rhs); -}; - -struct CNCTUnicode 
{ - static bool check_code_range(int c, const std::vector>& ranges); - static CNCTCharType get_code_type(int c); - static CNCTCharType get_code_type(const std::string &utf8_char); - static int utf8_len(const char c); - static int strlen_utf8(std::string src); - static std::vector split_utf8(const std::string &src); - static std::vector split_utf8_enhanced(const std::string &src); - static CNCTCharType string_identify(const std::string& str); - static bool string_test(const std::string& str, CNCTCharType chartype); -}; - -static const std::vector> digit_ranges = { -{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, -{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F}, -{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468}, -{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909}, -{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A}, -{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739}, -{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9}, -{0x1E950, 0x1E959}, {0x1F100, 
0x1F10A}, {0x1FBF0, 0x1FBF9}, -}; - -static const std::vector> letter_ranges = { -{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, -{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559}, -{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, -{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A}, -{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2}, -{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33}, -{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD}, -{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61}, -{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0}, -{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, 
{0xCAA, 0xCB3}, -{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61}, -{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A}, -{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C}, -{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7}, -{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5}, -{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C}, -{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3}, -{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB}, -{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, -{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, -{0x1F20, 
0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4}, -{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, -{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, -{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F}, -{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006}, -{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF}, -{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788}, -{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, -{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, -{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4}, -{0xAB01, 0xAB06}, 
{0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, -{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, -{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7}, -{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA}, -{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D}, -{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835}, -{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7}, -{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35}, -{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C}, -{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, -{0x11150, 0x11172}, 
{0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288}, -{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339}, -{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE}, -{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909}, -{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, -{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F}, -{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0}, -{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, -{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, -{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, -{0x1D49E, 
0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, -{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA}, -{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D}, -{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, -{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52}, -{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, -{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734}, -{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, -}; - -static const std::vector> whitespace_ranges = { -{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}, -}; - -static const std::vector> accent_mark_ranges = { -{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4}, -{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, 
{0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B}, -{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7}, -{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC}, -{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63}, -{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83}, -{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57}, -{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC}, -{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E}, -{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734}, -{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E}, -{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 
0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2}, -{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F}, -{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881}, -{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D}, -{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E}, -{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, -{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134}, -{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303}, -{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E}, -{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938}, -{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, 
{0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47}, -{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45}, -{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92}, -{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36}, -{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A}, -{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF}, -}; - -static const std::vector> punctuation_ranges = { -{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB}, -{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D}, -{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76}, -{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA}, -{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, 
{0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A}, -{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027}, -{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998}, -{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F}, -{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF}, -{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F}, -{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20}, -{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857}, -{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D}, -{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9}, -{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 
0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946}, -{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F}, -{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F}, -}; - -static const std::vector> symbol_ranges = { -{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7}, -{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608}, -{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA}, -{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5}, -{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C}, -{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF}, -{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B}, -{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 
0x2767}, {0x2794, 0x27C4}, -{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, -{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3}, -{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A}, -{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69}, -{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F}, -{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F}, -{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241}, -{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789}, -{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC}, -{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 
0x1F0F5}, {0x1F10D, 0x1F1AD}, -{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8}, -{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53}, -{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA}, -}; - -static const std::vector> control_ranges = { -{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C}, -{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F}, -{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5}, -{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29}, -{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80}, -{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5}, -{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 
0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54}, -{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7}, -{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C}, -{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB}, -{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49}, -{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7}, -{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5}, -{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC}, -{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF}, -{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F}, -{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF}, 
-{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F}, -{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F}, -{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F}, -{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC}, -{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF}, -{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F}, -{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF}, -{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF}, -{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F}, -{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA}, -{0xAAF7, 
0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA}, -{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2}, -{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1}, -{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B}, -{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F}, -{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F}, -{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807}, -{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E}, -{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E}, -{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8}, -{0x10BB0, 
0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF}, -{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF}, -{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E}, -{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334}, -{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C}, -{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF}, -{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936}, -{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09}, -{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B}, -{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF}, 
-{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F}, -{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF}, -{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163}, -{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A}, -{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8}, -{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F}, -{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A}, -{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6}, -{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26}, -{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 
0x1EE50}, -{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B}, -{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF}, -{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F}, -{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F}, -{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F}, -{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F}, -{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF}, -}; - -//String -bool CNCTString::operator==(const std::string& other) const { - return str.compare(other) == 0; -} -bool CNCTString::operator==(const char other) const { - return str.compare(std::string(1, other)) == 0; -} -bool CNCTString::operator==(const CNCTString& other) const { - return str.compare(other.str) == 0; -} -// + operators -CNCTString& CNCTString::operator+=(const std::string& other) { - str += other; - int new_len = CNCTUnicode::strlen_utf8(other); - utf8_chars += new_len; - char_type = CNCTUnicode::string_identify(str); - seq_offset_bytes += other.size(); - seq_offset_utf8_chars += new_len; - return *this; -} - -CNCTString& 
CNCTString::operator+=(const char other) { - std::string str = std::string(1, other); - *this += str; - return *this; -} - -CNCTString& CNCTString::operator+=(const CNCTString& other) { - str += other.str; - utf8_chars += other.utf8_chars; - char_type = CNCTUnicode::string_identify(str); - seq_offset_bytes += other.str.size(); - seq_offset_utf8_chars += other.utf8_chars; - return *this; -} - -struct CRCompare { - bool operator()(const std::pair& p, int i) { - return p.second < i; - } - bool operator()(int i, const std::pair& p) { - return i < p.first; - } -}; - -// binary search for code range -bool CNCTUnicode::check_code_range(int c, const std::vector> &ranges) { - auto it = std::upper_bound(ranges.begin(), ranges.end(), c, CRCompare()); - if (it != ranges.begin()) { - --it; - } - return c >= it->first && c <= it->second; -} - -// these are binary searches, it takes only a few operations -CNCTCharType CNCTUnicode::get_code_type(int c) { - if (check_code_range(c, letter_ranges)) { - return LETTER; - } - if (check_code_range(c, digit_ranges)) { - return DIGIT; - } - if (check_code_range(c, whitespace_ranges)) { - return WHITESPACE; - } - if (check_code_range(c, punctuation_ranges)) { - return PUNCTUATION; - } - if (check_code_range(c, symbol_ranges)) { - return SYMBOL; - } - if (check_code_range(c, accent_mark_ranges)) { - return ACCENT_MARK; - } - if (check_code_range(c, control_ranges)) { - return CONTROL; - } - return UNIDENTIFIED; -} - -static int utf8_to_unicode(const std::string& utf8_char) { - int c = 0; - int len = (int)utf8_char.size(); - if (len == 1) { - c = utf8_char[0]; - } else if (len == 2) { - c = ((utf8_char[0] & 0x1F) << 6) | (utf8_char[1] & 0x3F); - } else if (len == 3) { - c = ((utf8_char[0] & 0x0F) << 12) | ((utf8_char[1] & 0x3F) << 6) | (utf8_char[2] & 0x3F); - } else if (len == 4) { - c = ((utf8_char[0] & 0x07) << 18) | ((utf8_char[1] & 0x3F) << 12) | ((utf8_char[2] & 0x3F) << 6) | (utf8_char[3] & 0x3F); - } - return c; -} - -CNCTCharType 
CNCTUnicode::get_code_type(const std::string &utf8_char) { - return get_code_type(utf8_to_unicode(utf8_char)); -} - -int CNCTUnicode::utf8_len(const char c) -{ - if ((c & 0x80) == 0) { - return 1; // ASCII character - } - if ((c & 0xE0) == 0xC0) { - return 2; // 2-byte character - } - if ((c & 0xF0) == 0xE0) { - return 3; // 3-byte character - } - if ((c & 0xF0) == 0xF0) { - return 4; // 4-byte character - } - return 1; // not valid utf8 - // static const uint8_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - // return lookup[static_cast(c) >> 4]; -} - -int CNCTUnicode::strlen_utf8(const std::string src) { - int len = 0; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - if (char_len > 1) { - it += char_len - 1; - } - len += 1; - } - return len; -} - -// split a string into unicode strings -std::vector CNCTUnicode::split_utf8(const std::string &src) { - std::vector result; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - std::string str(it, it + char_len); - result.push_back(str); - if (char_len > 1) { - it += char_len - 1; - } - } - return result; -} - -// split a string into unicode strings (CNCTString) with sequence information -std::vector CNCTUnicode::split_utf8_enhanced(const std::string &src) { - std::vector result; - int seq_offset_bytes=0; - int seq_offset_utf8_chars=0; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - std::string str(it, it + char_len); - CNCTString cnct_str; - cnct_str.seq_offset_bytes = seq_offset_bytes; - cnct_str.seq_offset_utf8_chars = seq_offset_utf8_chars; - cnct_str.str = str; - cnct_str.utf8_chars = 1; - cnct_str.char_type = get_code_type(str); - #if 0 - switch (cnct_str.char_type) - { - case DIGIT: - printf("%s = DIGIT\n", str.c_str()); - break; - case LETTER: - printf("%s = LETTER\n", str.c_str()); - break; - case WHITESPACE: - 
printf("%s = WHITESPACE\n", str.c_str()); - break; - case PUNCTUATION: - printf("%s = PUNCTUATION\n", str.c_str()); - break; - case UNIDENTIFIED: - printf("%s = UNIDENTIFIED\n", str.c_str()); - break; - case SYMBOL: - printf("%s = SYMBOL\n", str.c_str()); - break; - case CONTROL: - printf("%s = CONTROL\n", str.c_str()); - break; - } - #endif - - result.push_back(cnct_str); - seq_offset_bytes += char_len; - seq_offset_utf8_chars += 1; - if (char_len > 1) { - it += char_len - 1; - } - - } - return result; -} - -// return the type of the string -CNCTCharType CNCTUnicode::string_identify(const std::string &str) { - CNCTCharType result = UNIDENTIFIED; - std::string::const_iterator it = str.begin(); - while (it != str.end()) { - int len = utf8_len(*it); - int c = 0; - for (int i = 0; i < len && it != str.end(); ++i, ++it) { - c = (c << 8) | static_cast(*it); - } - switch (get_code_type(c)) { - case DIGIT: - if (result == UNIDENTIFIED) { - result = DIGIT; - } else if (result != DIGIT) { - return MIXED; - } - break; - case LETTER: - if (result == UNIDENTIFIED) { - result = LETTER; - } else if (result != LETTER) { - return MIXED; - } - break; - case WHITESPACE: - if (result == UNIDENTIFIED) { - result = WHITESPACE; - } else if (result != WHITESPACE) { - return MIXED; - } - break; - case PUNCTUATION: - if (result == UNIDENTIFIED) { - result = PUNCTUATION; - } else if (result != PUNCTUATION) { - return MIXED; - } - break; - default: - return MIXED; - break; - } - } - return result; -} - -// verify the content of a string -bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype) -{ - std::string::const_iterator it = str.begin(); - while (it != str.end()) { - int len = utf8_len(*it); - int c = 0; - for (int i = 0; i < len && it != str.end(); ++i, ++it) { - c = (c << 8) | static_cast(*it); - } - if (get_code_type(c) != chartype) { - return false; - } - } - return true; -} - -//----------------- -// llama.cpp GPT2 vocab (from libfalcon.cpp) 
-//----------------- - -std::string replaceAll(std::string str, const std::string& from, const std::string& to) { - size_t start_pos = 0; - while((start_pos = str.find(from, start_pos)) != std::string::npos) { - str.replace(start_pos, from.length(), to); - start_pos += to.length(); // Handles case where 'to' is a substring of 'from' - } - return str; -} - -struct TrieNode { - std::map map; - int32_t Id = -1; -}; - -struct Trie { - TrieNode *root; - - Trie() : root(new TrieNode()) {} - - ~Trie() { - if(root) - deleteTrie(root); - } - - // Move constructor - Trie(Trie&& other) noexcept : root(other.root) { - other.root = nullptr; - } - - // Move assignment operator - Trie& operator=(Trie&& other) noexcept { - if (this != &other) { - if(root) - deleteTrie(root); - root = other.root; - other.root = nullptr; - } - return *this; - } - - void insert(const std::string &token, int32_t Id) { - TrieNode* current = root; - for(auto ch : token) { - if(current->map.find(ch) == current->map.end()) { - current->map[ch] = new TrieNode(); - } - current = current->map[ch]; - } - current->Id = Id; - } - - void reset() { - deleteTrie(root); - root = new TrieNode(); - } - -private: - void deleteTrie(TrieNode* node) { - for(auto &it: node->map) { - deleteTrie(it.second); - } - delete node; - } - -}; - -struct gpt2bpe_vocab { - using id = int32_t; - using token = std::string; - - std::map max_token_length; // max length, for each 2byte prefix - std::map, int> bpe_ranks; - std::vector> bpe_merges; - - id special_bos_id = -1; - id special_eos_id = -1; - id special_unk_id = -1; - id special_sep_id = -1; - id special_pad_id = -1; - - id linefeed_id = -1; - - std::unordered_map token_to_id; - std::unordered_map id_to_token; - - Trie trie; // highspeed access to tokens by prefix tree - - // populate trie from map - void populate_trie_from_map() { - trie.reset(); - for (const auto& pair : token_to_id) { - trie.insert(pair.first, pair.second); - if (pair.first.size() >= 2) { - std::string prefix 
= pair.first.substr(0, 2); - max_token_length[prefix] = std::max(max_token_length[prefix], (uint32_t)pair.first.size()); - } - } - } - // populate token ranks map - int populate_bpe_ranks(std::vector> bpe_merges_) { - for (int i = 0; i < (int)bpe_merges_.size(); i++) { - bpe_ranks.emplace(bpe_merges_[i], i); - } - bpe_merges = bpe_merges_; - return bpe_merges_.size(); - } - - // Trim whitespace characters from the beginning and end of the string - void trim(std::string& str) { - // Remove whitespace characters from the beginning of the string - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](int ch) { - return !std::isspace(ch); - })); - - // Remove whitespace characters from the end of the string - str.erase(std::find_if(str.rbegin(), str.rend(), [](int ch) { - return !std::isspace(ch); - }).base(), str.end()); - } - - // get max token length available for a prefix of 2 bytes (string at least 2 bytes long) - int get_max_token_length(const std::string& string) const { - if (string.size() < 2) { - return -1; - } - std::string prefix = string.substr(0, 2); - if (max_token_length.find(prefix) == max_token_length.end()) { - return 0; - } - return max_token_length.at(prefix); - } - - // function to find if two tokens match in bpe_rank, return rank or -1 - int find_bpe_rank(const std::string& token1, const std::string& token2) const { - std::string left_token = token1; - std::string right_token = token2; - left_token = replaceAll(left_token, " ", "Ġ"); - left_token = replaceAll(left_token, "\n", "Ċ"); - right_token = replaceAll(right_token, " ", "Ġ"); - right_token = replaceAll(right_token, "\n", "Ċ"); - - auto it = bpe_ranks.find(std::make_pair(left_token, right_token)); - if (it == bpe_ranks.end()) { - return -1; - } - return it->second; - } - - std::pair find_longest_match(const std::string& snippet) const { - TrieNode* current = trie.root; - gpt2bpe_vocab::id last_matched_id = -1; - std::string last_matched_token = ""; - std::string current_token = ""; 
- for (auto ch : snippet) { - if (current->map.find(ch) == current->map.end()) { - break; - } - current = current->map[ch]; - current_token += ch; - if (current->Id != -1) { - last_matched_id = current->Id; - last_matched_token = current_token; - } - } - return {last_matched_id, last_matched_token}; - } - -}; - - -// -// tokenizer - bpe type, gpt2 tokenization compatible -// - -struct ggllm_bpe_symbol { - using index = int; - index prev; - index next; - const char * text; - size_t n; -}; - -static_assert(std::is_trivially_copyable::value, "ggllm_bpe_symbol is not trivially copyable"); - -struct ggllm_bpe_bigram { - struct comparator { - bool operator()(ggllm_bpe_bigram & l, ggllm_bpe_bigram & r) { - return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); - } - }; - - using queue_storage = std::vector; - using queue = std::priority_queue; - ggllm_bpe_symbol::index left; - ggllm_bpe_symbol::index right; - std::string text; - int rank; - size_t size; -}; - -struct gpt2bpe_tokenizer { - gpt2bpe_tokenizer(const gpt2bpe_vocab & vocab, bool g2ws_): vocab_(vocab) { flag_g2ws = g2ws_; } - - void tokenize(const std::string & text, std::vector & output) { - int final_prev_index = -1; - // auto start = ggml_time_us(); - auto word_collection = bpe_gpt2_preprocess(text); - // auto end = ggml_time_us(); - // fprintf(stderr, "%s: preprocessing took %0.3f ms\n", __func__, (end - start) / 1000.0); - - symbols_final.clear(); - - for (auto & word : word_collection) { - work_queue_ = ggllm_bpe_bigram::queue(); - symbols_.clear(); - - int index = 0; - size_t offset = 0; - - while (offset < word.size()) { - ggllm_bpe_symbol sym; - size_t char_len = std::min(word.size() - offset, (size_t) CNCTUnicode::utf8_len(word[offset])); - sym.text = word.c_str() + offset; - sym.n = 1; - sym.n = char_len; - offset += sym.n; - sym.prev = index - 1; - sym.next = offset == word.size() ? 
-1 : index + 1; - index++; - symbols_.emplace_back(sym); - } - for (size_t i = 1; i < symbols_.size(); ++i) { - add_new_bigram(i - 1, i); - } - - // build token(s) - while (!work_queue_.empty()) { - auto bigram = work_queue_.top(); - work_queue_.pop(); - - auto & left_symbol = symbols_[bigram.left]; - auto & right_symbol = symbols_[bigram.right]; - - if (left_symbol.n == 0 || right_symbol.n == 0) { - continue; - } - std::string left_token = std::string(left_symbol.text, left_symbol.n); - std::string right_token = std::string(right_symbol.text, right_symbol.n); - if (left_token + right_token != bigram.text) { - continue; // Skip this bigram if it's outdated - } - - // merge the right sym into the left one - left_symbol.n += right_symbol.n; - right_symbol.n = 0; - - // remove the right sym from the chain - left_symbol.next = right_symbol.next; - if (right_symbol.next >= 0) { - symbols_[right_symbol.next].prev = bigram.left; - } - - add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol - add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol - } - - // add the fnished tokens to the final list keeping correct order for next and prev - for (auto & sym : symbols_) { - if (sym.n > 0) { - sym.prev = final_prev_index; - sym.next = -1; - if (final_prev_index != -1) { - symbols_final[final_prev_index].next = symbols_final.size(); - } - symbols_final.emplace_back(sym); - final_prev_index = symbols_final.size() - 1; - } - } - } - - symbols_ = symbols_final; - if (symbols_.size()) - for (int i = 0; i != -1; i = symbols_[i].next) { - auto & symbol = symbols_[i]; - if (symbol.n == 0) { - continue; - } - std::string str = std::string(symbol.text, symbol.n); - std::string str_decoded = decode_token(str); - auto token = vocab_.token_to_id.find(str_decoded); - - if (token == vocab_.token_to_id.end()) { - for (auto j = str_decoded.begin(); j != str_decoded.end(); ++j) { - std::string byte_str(1, *j); - auto token_multibyte = 
vocab_.token_to_id.find(byte_str); - if (token_multibyte == vocab_.token_to_id.end()) { - fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); - } - output.push_back((*token_multibyte).second); - } - } else { - output.push_back((*token).second); - } - } - } - -private: - void add_new_bigram(int left, int right) { - if (left == -1 || right == -1) return; - - std::string left_token = std::string(symbols_[left].text, symbols_[left].n); - std::string right_token = std::string(symbols_[right].text, symbols_[right].n); - - int rank_found = -1; - rank_found = vocab_.find_bpe_rank(left_token, right_token); - - if (rank_found < 0) { - return; - } - - ggllm_bpe_bigram bigram; - bigram.left = left; - bigram.right = right; - bigram.rank = rank_found; - bigram.size = left_token.size() + right_token.size(); - bigram.text = left_token + right_token; - work_queue_.push(bigram); - } - - std::unordered_map bytes_to_unicode() { - static std::unordered_map hex_map = { - { 0x21, "\x21" }, { 0x22, "\x22" }, { 0x23, "\x23" }, { 0x24, "\x24" }, { 0x25, "\x25" }, { 0x26, "\x26" }, { 0x27, "\x27" }, { 0x28, "\x28" }, { 0x29, "\x29" }, { 0x2A, "\x2A" }, - { 0x2B, "\x2B" }, { 0x2C, "\x2C" }, { 0x2D, "\x2D" }, { 0x2E, "\x2E" }, { 0x2F, "\x2F" }, { 0x30, "\x30" }, { 0x31, "\x31" }, { 0x32, "\x32" }, { 0x33, "\x33" }, { 0x34, "\x34" }, - { 0x35, "\x35" }, { 0x36, "\x36" }, { 0x37, "\x37" }, { 0x38, "\x38" }, { 0x39, "\x39" }, { 0x3A, "\x3A" }, { 0x3B, "\x3B" }, { 0x3C, "\x3C" }, { 0x3D, "\x3D" }, { 0x3E, "\x3E" }, - { 0x3F, "\x3F" }, { 0x40, "\x40" }, { 0x41, "\x41" }, { 0x42, "\x42" }, { 0x43, "\x43" }, { 0x44, "\x44" }, { 0x45, "\x45" }, { 0x46, "\x46" }, { 0x47, "\x47" }, { 0x48, "\x48" }, - { 0x49, "\x49" }, { 0x4A, "\x4A" }, { 0x4B, "\x4B" }, { 0x4C, "\x4C" }, { 0x4D, "\x4D" }, { 0x4E, "\x4E" }, { 0x4F, "\x4F" }, { 0x50, "\x50" }, { 0x51, "\x51" }, { 0x52, "\x52" }, - { 0x53, "\x53" }, { 0x54, "\x54" }, { 0x55, "\x55" }, { 0x56, "\x56" }, { 0x57, "\x57" }, { 0x58, 
"\x58" }, { 0x59, "\x59" }, { 0x5A, "\x5A" }, { 0x5B, "\x5B" }, { 0x5C, "\x5C" }, - { 0x5D, "\x5D" }, { 0x5E, "\x5E" }, { 0x5F, "\x5F" }, { 0x60, "\x60" }, { 0x61, "\x61" }, { 0x62, "\x62" }, { 0x63, "\x63" }, { 0x64, "\x64" }, { 0x65, "\x65" }, { 0x66, "\x66" }, - { 0x67, "\x67" }, { 0x68, "\x68" }, { 0x69, "\x69" }, { 0x6A, "\x6A" }, { 0x6B, "\x6B" }, { 0x6C, "\x6C" }, { 0x6D, "\x6D" }, { 0x6E, "\x6E" }, { 0x6F, "\x6F" }, { 0x70, "\x70" }, - { 0x71, "\x71" }, { 0x72, "\x72" }, { 0x73, "\x73" }, { 0x74, "\x74" }, { 0x75, "\x75" }, { 0x76, "\x76" }, { 0x77, "\x77" }, { 0x78, "\x78" }, { 0x79, "\x79" }, { 0x7A, "\x7A" }, - { 0x7B, "\x7B" }, { 0x7C, "\x7C" }, { 0x7D, "\x7D" }, { 0x7E, "\x7E" }, { 0xA1, "\xC2\xA1" }, { 0xA2, "\xC2\xA2" }, { 0xA3, "\xC2\xA3" }, { 0xA4, "\xC2\xA4" }, { 0xA5, "\xC2\xA5" }, - { 0xA6, "\xC2\xA6" }, { 0xA7, "\xC2\xA7" }, { 0xA8, "\xC2\xA8" }, { 0xA9, "\xC2\xA9" }, { 0xAA, "\xC2\xAA" }, { 0xAB, "\xC2\xAB" }, { 0xAC, "\xC2\xAC" }, { 0xAE, "\xC2\xAE" }, - { 0xAF, "\xC2\xAF" }, { 0xB0, "\xC2\xB0" }, { 0xB1, "\xC2\xB1" }, { 0xB2, "\xC2\xB2" }, { 0xB3, "\xC2\xB3" }, { 0xB4, "\xC2\xB4" }, { 0xB5, "\xC2\xB5" }, { 0xB6, "\xC2\xB6" }, - { 0xB7, "\xC2\xB7" }, { 0xB8, "\xC2\xB8" }, { 0xB9, "\xC2\xB9" }, { 0xBA, "\xC2\xBA" }, { 0xBB, "\xC2\xBB" }, { 0xBC, "\xC2\xBC" }, { 0xBD, "\xC2\xBD" }, { 0xBE, "\xC2\xBE" }, - { 0xBF, "\xC2\xBF" }, { 0xC0, "\xC3\x80" }, { 0xC1, "\xC3\x81" }, { 0xC2, "\xC3\x82" }, { 0xC3, "\xC3\x83" }, { 0xC4, "\xC3\x84" }, { 0xC5, "\xC3\x85" }, { 0xC6, "\xC3\x86" }, - { 0xC7, "\xC3\x87" }, { 0xC8, "\xC3\x88" }, { 0xC9, "\xC3\x89" }, { 0xCA, "\xC3\x8A" }, { 0xCB, "\xC3\x8B" }, { 0xCC, "\xC3\x8C" }, { 0xCD, "\xC3\x8D" }, { 0xCE, "\xC3\x8E" }, - { 0xCF, "\xC3\x8F" }, { 0xD0, "\xC3\x90" }, { 0xD1, "\xC3\x91" }, { 0xD2, "\xC3\x92" }, { 0xD3, "\xC3\x93" }, { 0xD4, "\xC3\x94" }, { 0xD5, "\xC3\x95" }, { 0xD6, "\xC3\x96" }, - { 0xD7, "\xC3\x97" }, { 0xD8, "\xC3\x98" }, { 0xD9, "\xC3\x99" }, { 0xDA, "\xC3\x9A" }, { 0xDB, "\xC3\x9B" }, { 0xDC, 
"\xC3\x9C" }, { 0xDD, "\xC3\x9D" }, { 0xDE, "\xC3\x9E" }, - { 0xDF, "\xC3\x9F" }, { 0xE0, "\xC3\xA0" }, { 0xE1, "\xC3\xA1" }, { 0xE2, "\xC3\xA2" }, { 0xE3, "\xC3\xA3" }, { 0xE4, "\xC3\xA4" }, { 0xE5, "\xC3\xA5" }, { 0xE6, "\xC3\xA6" }, - { 0xE7, "\xC3\xA7" }, { 0xE8, "\xC3\xA8" }, { 0xE9, "\xC3\xA9" }, { 0xEA, "\xC3\xAA" }, { 0xEB, "\xC3\xAB" }, { 0xEC, "\xC3\xAC" }, { 0xED, "\xC3\xAD" }, { 0xEE, "\xC3\xAE" }, - { 0xEF, "\xC3\xAF" }, { 0xF0, "\xC3\xB0" }, { 0xF1, "\xC3\xB1" }, { 0xF2, "\xC3\xB2" }, { 0xF3, "\xC3\xB3" }, { 0xF4, "\xC3\xB4" }, { 0xF5, "\xC3\xB5" }, { 0xF6, "\xC3\xB6" }, - { 0xF7, "\xC3\xB7" }, { 0xF8, "\xC3\xB8" }, { 0xF9, "\xC3\xB9" }, { 0xFA, "\xC3\xBA" }, { 0xFB, "\xC3\xBB" }, { 0xFC, "\xC3\xBC" }, { 0xFD, "\xC3\xBD" }, { 0xFE, "\xC3\xBE" }, - { 0xFF, "\xC3\xBF" }, { 0x00, "\xC4\x80" }, { 0x01, "\xC4\x81" }, { 0x02, "\xC4\x82" }, { 0x03, "\xC4\x83" }, { 0x04, "\xC4\x84" }, { 0x05, "\xC4\x85" }, { 0x06, "\xC4\x86" }, - { 0x07, "\xC4\x87" }, { 0x08, "\xC4\x88" }, { 0x09, "\xC4\x89" }, { 0x0A, "\xC4\x8A" }, { 0x0B, "\xC4\x8B" }, { 0x0C, "\xC4\x8C" }, { 0x0D, "\xC4\x8D" }, { 0x0E, "\xC4\x8E" }, - { 0x0F, "\xC4\x8F" }, { 0x10, "\xC4\x90" }, { 0x11, "\xC4\x91" }, { 0x12, "\xC4\x92" }, { 0x13, "\xC4\x93" }, { 0x14, "\xC4\x94" }, { 0x15, "\xC4\x95" }, { 0x16, "\xC4\x96" }, - { 0x17, "\xC4\x97" }, { 0x18, "\xC4\x98" }, { 0x19, "\xC4\x99" }, { 0x1A, "\xC4\x9A" }, { 0x1B, "\xC4\x9B" }, { 0x1C, "\xC4\x9C" }, { 0x1D, "\xC4\x9D" }, { 0x1E, "\xC4\x9E" }, - { 0x1F, "\xC4\x9F" }, { 0x20, "\xC4\xA0" }, { 0x7F, "\xC4\xA1" }, { 0x80, "\xC4\xA2" }, { 0x81, "\xC4\xA3" }, { 0x82, "\xC4\xA4" }, { 0x83, "\xC4\xA5" }, { 0x84, "\xC4\xA6" }, - { 0x85, "\xC4\xA7" }, { 0x86, "\xC4\xA8" }, { 0x87, "\xC4\xA9" }, { 0x88, "\xC4\xAA" }, { 0x89, "\xC4\xAB" }, { 0x8A, "\xC4\xAC" }, { 0x8B, "\xC4\xAD" }, { 0x8C, "\xC4\xAE" }, - { 0x8D, "\xC4\xAF" }, { 0x8E, "\xC4\xB0" }, { 0x8F, "\xC4\xB1" }, { 0x90, "\xC4\xB2" }, { 0x91, "\xC4\xB3" }, { 0x92, "\xC4\xB4" }, { 0x93, "\xC4\xB5" }, { 
0x94, "\xC4\xB6" }, - { 0x95, "\xC4\xB7" }, { 0x96, "\xC4\xB8" }, { 0x97, "\xC4\xB9" }, { 0x98, "\xC4\xBA" }, { 0x99, "\xC4\xBB" }, { 0x9A, "\xC4\xBC" }, { 0x9B, "\xC4\xBD" }, { 0x9C, "\xC4\xBE" }, - { 0x9D, "\xC4\xBF" }, { 0x9E, "\xC5\x80" }, { 0x9F, "\xC5\x81" }, { 0xA0, "\xC5\x82" }, { 0xAD, "\xC5\x83" } - }; - return hex_map; - } - - std::unordered_map unicode_to_bytes() { - static std::unordered_map hex_map = { - { "\x21", 0x21 }, { "\x22", 0x22 }, { "\x23", 0x23 }, { "\x24", 0x24 }, { "\x25", 0x25 }, { "\x26", 0x26 }, { "\x27", 0x27 }, { "\x28", 0x28 }, { "\x29", 0x29 }, { "\x2A", 0x2A }, - { "\x2B", 0x2B }, { "\x2C", 0x2C }, { "\x2D", 0x2D }, { "\x2E", 0x2E }, { "\x2F", 0x2F }, { "\x30", 0x30 }, { "\x31", 0x31 }, { "\x32", 0x32 }, { "\x33", 0x33 }, { "\x34", 0x34 }, - { "\x35", 0x35 }, { "\x36", 0x36 }, { "\x37", 0x37 }, { "\x38", 0x38 }, { "\x39", 0x39 }, { "\x3A", 0x3A }, { "\x3B", 0x3B }, { "\x3C", 0x3C }, { "\x3D", 0x3D }, { "\x3E", 0x3E }, - { "\x3F", 0x3F }, { "\x40", 0x40 }, { "\x41", 0x41 }, { "\x42", 0x42 }, { "\x43", 0x43 }, { "\x44", 0x44 }, { "\x45", 0x45 }, { "\x46", 0x46 }, { "\x47", 0x47 }, { "\x48", 0x48 }, - { "\x49", 0x49 }, { "\x4A", 0x4A }, { "\x4B", 0x4B }, { "\x4C", 0x4C }, { "\x4D", 0x4D }, { "\x4E", 0x4E }, { "\x4F", 0x4F }, { "\x50", 0x50 }, { "\x51", 0x51 }, { "\x52", 0x52 }, - { "\x53", 0x53 }, { "\x54", 0x54 }, { "\x55", 0x55 }, { "\x56", 0x56 }, { "\x57", 0x57 }, { "\x58", 0x58 }, { "\x59", 0x59 }, { "\x5A", 0x5A }, { "\x5B", 0x5B }, { "\x5C", 0x5C }, - { "\x5D", 0x5D }, { "\x5E", 0x5E }, { "\x5F", 0x5F }, { "\x60", 0x60 }, { "\x61", 0x61 }, { "\x62", 0x62 }, { "\x63", 0x63 }, { "\x64", 0x64 }, { "\x65", 0x65 }, { "\x66", 0x66 }, - { "\x67", 0x67 }, { "\x68", 0x68 }, { "\x69", 0x69 }, { "\x6A", 0x6A }, { "\x6B", 0x6B }, { "\x6C", 0x6C }, { "\x6D", 0x6D }, { "\x6E", 0x6E }, { "\x6F", 0x6F }, { "\x70", 0x70 }, - { "\x71", 0x71 }, { "\x72", 0x72 }, { "\x73", 0x73 }, { "\x74", 0x74 }, { "\x75", 0x75 }, { "\x76", 0x76 }, { "\x77", 
0x77 }, { "\x78", 0x78 }, { "\x79", 0x79 }, { "\x7A", 0x7A }, - { "\x7B", 0x7B }, { "\x7C", 0x7C }, { "\x7D", 0x7D }, { "\x7E", 0x7E }, { "\xC2\xA1", 0xA1 }, { "\xC2\xA2", 0xA2 }, { "\xC2\xA3", 0xA3 }, { "\xC2\xA4", 0xA4 }, { "\xC2\xA5", 0xA5 }, - { "\xC2\xA6", 0xA6 }, { "\xC2\xA7", 0xA7 }, { "\xC2\xA8", 0xA8 }, { "\xC2\xA9", 0xA9 }, { "\xC2\xAA", 0xAA }, { "\xC2\xAB", 0xAB }, { "\xC2\xAC", 0xAC }, { "\xC2\xAE", 0xAE }, - { "\xC2\xAF", 0xAF }, { "\xC2\xB0", 0xB0 }, { "\xC2\xB1", 0xB1 }, { "\xC2\xB2", 0xB2 }, { "\xC2\xB3", 0xB3 }, { "\xC2\xB4", 0xB4 }, { "\xC2\xB5", 0xB5 }, { "\xC2\xB6", 0xB6 }, - { "\xC2\xB7", 0xB7 }, { "\xC2\xB8", 0xB8 }, { "\xC2\xB9", 0xB9 }, { "\xC2\xBA", 0xBA }, { "\xC2\xBB", 0xBB }, { "\xC2\xBC", 0xBC }, { "\xC2\xBD", 0xBD }, { "\xC2\xBE", 0xBE }, - { "\xC2\xBF", 0xBF }, { "\xC3\x80", 0xC0 }, { "\xC3\x81", 0xC1 }, { "\xC3\x82", 0xC2 }, { "\xC3\x83", 0xC3 }, { "\xC3\x84", 0xC4 }, { "\xC3\x85", 0xC5 }, { "\xC3\x86", 0xC6 }, - { "\xC3\x87", 0xC7 }, { "\xC3\x88", 0xC8 }, { "\xC3\x89", 0xC9 }, { "\xC3\x8A", 0xCA }, { "\xC3\x8B", 0xCB }, { "\xC3\x8C", 0xCC }, { "\xC3\x8D", 0xCD }, { "\xC3\x8E", 0xCE }, - { "\xC3\x8F", 0xCF }, { "\xC3\x90", 0xD0 }, { "\xC3\x91", 0xD1 }, { "\xC3\x92", 0xD2 }, { "\xC3\x93", 0xD3 }, { "\xC3\x94", 0xD4 }, { "\xC3\x95", 0xD5 }, { "\xC3\x96", 0xD6 }, - { "\xC3\x97", 0xD7 }, { "\xC3\x98", 0xD8 }, { "\xC3\x99", 0xD9 }, { "\xC3\x9A", 0xDA }, { "\xC3\x9B", 0xDB }, { "\xC3\x9C", 0xDC }, { "\xC3\x9D", 0xDD }, { "\xC3\x9E", 0xDE }, - { "\xC3\x9F", 0xDF }, { "\xC3\xA0", 0xE0 }, { "\xC3\xA1", 0xE1 }, { "\xC3\xA2", 0xE2 }, { "\xC3\xA3", 0xE3 }, { "\xC3\xA4", 0xE4 }, { "\xC3\xA5", 0xE5 }, { "\xC3\xA6", 0xE6 }, - { "\xC3\xA7", 0xE7 }, { "\xC3\xA8", 0xE8 }, { "\xC3\xA9", 0xE9 }, { "\xC3\xAA", 0xEA }, { "\xC3\xAB", 0xEB }, { "\xC3\xAC", 0xEC }, { "\xC3\xAD", 0xED }, { "\xC3\xAE", 0xEE }, - { "\xC3\xAF", 0xEF }, { "\xC3\xB0", 0xF0 }, { "\xC3\xB1", 0xF1 }, { "\xC3\xB2", 0xF2 }, { "\xC3\xB3", 0xF3 }, { "\xC3\xB4", 0xF4 }, { "\xC3\xB5", 
0xF5 }, { "\xC3\xB6", 0xF6 }, - { "\xC3\xB7", 0xF7 }, { "\xC3\xB8", 0xF8 }, { "\xC3\xB9", 0xF9 }, { "\xC3\xBA", 0xFA }, { "\xC3\xBB", 0xFB }, { "\xC3\xBC", 0xFC }, { "\xC3\xBD", 0xFD }, { "\xC3\xBE", 0xFE }, - { "\xC3\xBF", 0xFF }, { "\xC4\x80", 0x00 }, { "\xC4\x81", 0x01 }, { "\xC4\x82", 0x02 }, { "\xC4\x83", 0x03 }, { "\xC4\x84", 0x04 }, { "\xC4\x85", 0x05 }, { "\xC4\x86", 0x06 }, - { "\xC4\x87", 0x07 }, { "\xC4\x88", 0x08 }, { "\xC4\x89", 0x09 }, { "\xC4\x8A", 0x0A }, { "\xC4\x8B", 0x0B }, { "\xC4\x8C", 0x0C }, { "\xC4\x8D", 0x0D }, { "\xC4\x8E", 0x0E }, - { "\xC4\x8F", 0x0F }, { "\xC4\x90", 0x10 }, { "\xC4\x91", 0x11 }, { "\xC4\x92", 0x12 }, { "\xC4\x93", 0x13 }, { "\xC4\x94", 0x14 }, { "\xC4\x95", 0x15 }, { "\xC4\x96", 0x16 }, - { "\xC4\x97", 0x17 }, { "\xC4\x98", 0x18 }, { "\xC4\x99", 0x19 }, { "\xC4\x9A", 0x1A }, { "\xC4\x9B", 0x1B }, { "\xC4\x9C", 0x1C }, { "\xC4\x9D", 0x1D }, { "\xC4\x9E", 0x1E }, - { "\xC4\x9F", 0x1F }, { "\xC4\xA0", 0x20 }, { "\xC4\xA1", 0x7F }, { "\xC4\xA2", 0x80 }, { "\xC4\xA3", 0x81 }, { "\xC4\xA4", 0x82 }, { "\xC4\xA5", 0x83 }, { "\xC4\xA6", 0x84 }, - { "\xC4\xA7", 0x85 }, { "\xC4\xA8", 0x86 }, { "\xC4\xA9", 0x87 }, { "\xC4\xAA", 0x88 }, { "\xC4\xAB", 0x89 }, { "\xC4\xAC", 0x8A }, { "\xC4\xAD", 0x8B }, { "\xC4\xAE", 0x8C }, - { "\xC4\xAF", 0x8D }, { "\xC4\xB0", 0x8E }, { "\xC4\xB1", 0x8F }, { "\xC4\xB2", 0x90 }, { "\xC4\xB3", 0x91 }, { "\xC4\xB4", 0x92 }, { "\xC4\xB5", 0x93 }, { "\xC4\xB6", 0x94 }, - { "\xC4\xB7", 0x95 }, { "\xC4\xB8", 0x96 }, { "\xC4\xB9", 0x97 }, { "\xC4\xBA", 0x98 }, { "\xC4\xBB", 0x99 }, { "\xC4\xBC", 0x9A }, { "\xC4\xBD", 0x9B }, { "\xC4\xBE", 0x9C }, - { "\xC4\xBF", 0x9D }, { "\xC5\x80", 0x9E }, { "\xC5\x81", 0x9F }, { "\xC5\x82", 0xA0 }, { "\xC5\x83", 0xAD } - }; - return hex_map; - } - - // len must be available - bool inline str_is_equal(const char* str1, const char* str2, size_t len) { - for (size_t i = 0; i < len; ++i) { - if (str1[i] != str2[i]) { - return false; - } - } - return true; - } - - std::vector 
bpe_gpt2_preprocess(const std::string& text) { - static std::unordered_map< unsigned char, std::string> byte_encoder = bytes_to_unicode(); - std::vector bpe_words; - std::vector bpe_encoded_words; - - std::string token=""; - const char *raw_text_p = text.c_str(); - // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting=false; - - std::vector text_utf; - text_utf.reserve(text.size()); - bpe_words.reserve(text.size()); - bpe_encoded_words.reserve(text.size()); - - text_utf = CNCTUnicode::split_utf8_enhanced(text); - - for (int i = 0; i < (int)text_utf.size(); i++) { - const CNCTString &utf_char = text_utf[i]; - bool split_condition = false; - const char *text_pos = raw_text_p + utf_char.seq_offset_bytes; - int bytes_remain = strlen(text_pos); - // forward backward lookups - const CNCTString &utf_char_next = (i+1 < (int)text_utf.size()) ? text_utf[i+1] : CNCTString(); - const CNCTString &utf_char_next_next = (i+2 < (int)text_utf.size()) ? text_utf[i+2] : CNCTString(); - // const CNCTString &utf_char_prev = (i > 0) ? 
text_utf[i-1] : CNCTString(); - - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == '\'' && (utf_char_next == 's' || utf_char_next == 't' || utf_char_next == 'm' || utf_char_next == 'd')) { - split_condition = true; - } - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char.str + utf_char_next.str; - bpe_words.emplace_back(token); - token=""; - i++; - continue; - } - } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == '\'' && ( - (utf_char_next == 'r' || utf_char_next_next == 'e') || - (utf_char_next == 'v' || utf_char_next_next == 'e') || - (utf_char_next == 'l' || utf_char_next_next == 'l')) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char.str + utf_char_next.str + utf_char_next_next.str; - bpe_words.emplace_back(token); // the contraction - token=""; - i+=2; - continue; - } - } - - if (!split_condition && !collecting) { - if (utf_char.char_type == CNCTCharType::LETTER || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::LETTER)) { - collecting_letter = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::DIGIT || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::DIGIT)) { - collecting_numeric = true; - collecting = true; - } else if ( - ((utf_char.char_type != CNCTCharType::LETTER && utf_char.char_type != CNCTCharType::DIGIT) && (utf_char.char_type != CNCTCharType::WHITESPACE)) || - (!token.size() && utf_char==" " && utf_char_next.char_type != CNCTCharType::LETTER && utf_char_next.char_type != CNCTCharType::DIGIT && utf_char_next.char_type != CNCTCharType::WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } else if 
(utf_char.char_type == CNCTCharType::WHITESPACE && utf_char_next.char_type == CNCTCharType::WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::WHITESPACE) { - split_condition = true; - } - } else if (!split_condition && collecting) { - if (collecting_letter && utf_char.char_type != CNCTCharType::LETTER) { - split_condition = true; - } else if (collecting_numeric && utf_char.char_type != CNCTCharType::DIGIT) { - split_condition = true; - } else if (collecting_special && (utf_char.char_type == CNCTCharType::LETTER || utf_char.char_type == CNCTCharType::DIGIT || utf_char.char_type == CNCTCharType::WHITESPACE)) { - split_condition = true; - } else if (collecting_whitespace_lookahead && utf_char_next.char_type != CNCTCharType::WHITESPACE) { - split_condition = true; - } - } - - if(utf_char_next.str.size() == 0) { - split_condition = true; // final - token += utf_char.str; - } - - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); - } - token = utf_char.str; - collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; - } else { - token += utf_char.str; - } - } - - for (std::string& word : bpe_words) { - std::string encoded_token=""; - for (char& c : word) { - encoded_token += byte_encoder[c]; - } - bpe_encoded_words.emplace_back(encoded_token); - } - - return bpe_encoded_words; - } - - // decoder (for one token) - std::string decode_token(const std::string& token) { - static std::unordered_map< std::string, unsigned char> byte_decoder = unicode_to_bytes(); - std::string decoded_token=""; - auto unicode_seqeunces = CNCTUnicode::split_utf8(token); - for (auto& unicode_sequence : unicode_seqeunces) { - decoded_token += byte_decoder[unicode_sequence]; - } - - return decoded_token; - } - - const gpt2bpe_vocab & vocab_; - std::vector symbols_; - std::vector symbols_final; - 
ggllm_bpe_bigram::queue work_queue_; - bool flag_g2ws=false; -}; - -static std::vector gpt2bpe_tokenize(const gpt2bpe_vocab & vocab, const std::string & text, bool bos, bool g2ws ) { - gpt2bpe_tokenizer tokenizer(vocab, g2ws); - std::vector output; - - if (text.empty()) { - return output; - } - - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } - - tokenizer.tokenize(text, output); - return output; -} - -#endif // CMPNCT_GPT2BPE diff --git a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp deleted file mode 100644 index e9197f6b5..000000000 --- a/examples/gptneox-wip/falcon-main.cpp +++ /dev/null @@ -1,1111 +0,0 @@ -#include "ggml.h" -#include "cmpnct_gpt2bpe.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams -struct falcon_hparams { - size_t n_merges = 0; - size_t n_vocab = 0; - uint32_t n_ctx = 0; - uint32_t n_embd = 0; - uint32_t n_head = 0; - uint32_t n_head_kv = 1; // Needs to be 1 for 7B model - uint32_t n_ff = 0; - uint32_t n_block = 0; - float norm_eps = 1e-5; -}; -struct falcon_block { - // normalization - struct ggml_tensor* input_layernorm; - struct ggml_tensor* input_layernorm_b; - struct ggml_tensor* attention_norm; // Falcon-40B only - struct ggml_tensor* attention_norm_b; // Falcon-40B only - - // attention - struct ggml_tensor* query_key_value; - struct ggml_tensor* wo; - - // ff - struct ggml_tensor* ffn_up; - struct ggml_tensor* ffn_down; -}; - -struct falcon_model { - falcon_hparams hparams; - - struct ggml_tensor* tok_embeddings; - struct ggml_tensor* output_norm; - struct ggml_tensor* output_norm_b; - struct ggml_tensor* lm_head; - - std::vector blocks; - - // key + value memory - struct ggml_tensor* memory_k; - struct ggml_tensor* memory_v; - - struct gguf_context * ggufctx; - struct ggml_context * 
ctx; - struct ggml_context * kvctx; - - std::map tensors; -}; - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - uint32_t n_predict = 200; // new tokens to predict - uint32_t n_batch = 512; // batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - - std::string model = ""; // model path - std::string prompt = ""; - - std::string token_test = ""; - bool interactive = false; - int32_t interactive_port = -1; - int32_t n_gpu_layers = 0; -}; - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider 
for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -// Function to check if the next argument exists -std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { - if (i + 1 < argc && argv[i + 1][0] != '-') { - return argv[++i]; - } else { - fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_k") { - params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_p") { - params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--temp") { - params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stoi(get_next_arg(i, argc, 
argv, arg, params)); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-m" || arg == "--model") { - params.model = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "-ip" || arg == "--interactive-port") { - params.interactive = true; - params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - get_next_arg(i, argc, argv, arg, params); - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = get_next_arg(i, argc, argv, arg, params); - } - else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -gpt2bpe_vocab::id sample_top_k_top_p_repeat( - const gpt2bpe_vocab & vocab, - const float * logits, - const int32_t * last_n_tokens_data, - size_t last_n_tokens_data_size, - int top_k, - double top_p, - double temp, - int repeat_last_n, - float repeat_penalty, - std::mt19937 & rng) { - - int n_logits = vocab.id_to_token.size(); - - const auto * plogits = logits; - - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - gpt2bpe_vocab::id max_id = 
0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } - } - return max_id; - } - - - std::vector> logits_id; - logits_id.reserve(n_logits); - - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { - // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } - } - } - - // find the top K tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - logits_id.resize(top_k); - - double maxl = -INFINITY; - for (const auto & kv : logits_id) { - maxl = std::max(maxl, kv.first); - } - - // compute probs for the top K tokens - std::vector probs; - probs.reserve(logits_id.size()); - - double sum = 0.0; - for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); - probs.push_back(p); - sum += p; - } - - // normalize the probs - for (auto & p : probs) { - p /= sum; - } - - if (top_p < 1.0f) { - double cumsum = 0.0f; - for (int i = 0; i < top_k; i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - top_k = i + 1; - probs.resize(top_k); - logits_id.resize(top_k); - break; - } - } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } - } - -// printf("\n"); -// for (int i = 0; i < (int) probs.size(); i++) { -// for (int i = 0; i < 10; 
i++) { -// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); -// } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - return logits_id[idx].second; - -} - -struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ - - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if( cur == NULL ) { - printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); - } else { -// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); - } - - return cur; -} - -// load the model's weights from a file -bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_vocab & vocab) { - printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); - - model.ctx = NULL; - - struct gguf_init_params ggufparams = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - - auto & ggufctx = model.ggufctx; - - ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); - - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return false; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print all kv - #if 0 - { - const int n_kv = gguf_get_n_kv(ggufctx); - - printf("%s: n_kv: %d\n", __func__, n_kv); - - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ggufctx, i); - - printf("%s: kv[%d]: key = %s\n", __func__, i, key); - } - } - #endif - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, 
gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - int keyidx; - - // check model architecture kv - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) { - printf("%s: model architecture not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model architecture not found!\n", __func__); - return false; - } - - // check model tensor data layout kv - keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) { - printf("%s: model tensor data layout not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model tensor data layout not found!\n", __func__); - return false; - } - - } - - // load hparams - { - auto & hparams = model.hparams; - - bool ok = true; - int keyidx; - - if (ok) { keyidx = gguf_find_key(ggufctx, 
"falcon.context_length"); - if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.embedding_length"); - if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count"); - if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.feed_forward_length"); - if (keyidx != -1) { hparams.n_ff = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.block_count"); - if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.layer_norm_epsilon"); - if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return false; - } - - keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count_kv"); - if (keyidx != -1) { hparams.n_head_kv = gguf_get_val_u32(ggufctx, keyidx); } - - - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_head_kv = %d\n", __func__, hparams.n_head_kv); - printf("%s: n_block = %d\n", __func__, hparams.n_block); - printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); - - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - printf("%s: tokenizer model not supported!\n", __func__); - return false; - } - } else { - printf("%s: tokenizer model not found!\n", __func__); - return false; - } - - - int 
tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - - if (tokens_keyidx == -1) { - printf("%s: gpt2 tokenizer vocab not found!\n", __func__); - return false; - } - - int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); - - if (merges_keyidx == -1) { - printf("%s: gpt2 tokenizer merges not found!\n", __func__); - return false; - } - - hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); - hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - - printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); - - for (size_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - -// printf("token %d = '%s'\n",i,word.c_str() ); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - if( vocab.id_to_token[i] == "\n" ) { - vocab.linefeed_id = i; - } - } - - std::vector> bpe_merges; - - for (size_t i = 0; i < hparams.n_merges; i++) { - - std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); - - // Split the merges - std::string first, second; - size_t pos = word.find(' ', 1); // Start the search from the second character - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - bpe_merges.push_back(std::make_pair(first, second)); - } - - vocab.populate_bpe_ranks(bpe_merges); - - - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { 
vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - - if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } - - } - - - auto & ctx = model.ctx; - size_t ctx_size = ggml_get_mem_size(ctx); - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - - // print tensor info - #if 0 - { - const int n_tensors = gguf_get_n_tensors(ggufctx); - - printf("%s: n_tensors: %d\n", __func__, n_tensors); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name (ggufctx, i); - const size_t offset = gguf_get_tensor_offset(ggufctx, i); - - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); - } - } - #endif - - // prepare memory for the weights - { - - auto & hparams = model.hparams; - - const int n_block = hparams.n_block; - - model.blocks.resize(n_block); - - model.tok_embeddings = ggml_get_tensor(ctx, "token_embd.weight"); - - model.output_norm = ggml_get_tensor(ctx, "output_norm.weight"); - model.output_norm_b 
= ggml_get_tensor(ctx, "output_norm.bias"); - model.lm_head = ggml_get_tensor(ctx, "output.weight"); - - // map by name - model.tensors["token_embd.weight"] = model.tok_embeddings; - model.tensors["output_norm.weight"] = model.output_norm; - model.tensors["output_norm.bias"] = model.output_norm_b; - model.tensors["output.weight"] = model.lm_head; - - for (int i = 0; i < n_block; ++i) { - - auto& block = model.blocks[i]; - std::string blocknamestart = "blk." + std::to_string(i) + "."; - - block.input_layernorm = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); - block.input_layernorm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); - - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - block.attention_norm = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.weight" ); - block.attention_norm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.bias" ); - } - - // query_key_value shape for config.multi_query == True: - block.query_key_value = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); - block.wo = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); - - block.ffn_up = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); - block.ffn_down = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); - - // map by name - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - // Falcon-40B: - model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; - model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; - model.tensors[blocknamestart + "attn_norm_2.weight"] = block.attention_norm; - model.tensors[blocknamestart + "attn_norm_2.bias"] = block.attention_norm_b; - } else { - // Falcon-7B: - model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; - model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; - } - - model.tensors[blocknamestart + "attn_qkv.weight"] = block.query_key_value; - model.tensors[blocknamestart + "attn_output.weight"] = block.wo; - - 
model.tensors[blocknamestart + "ffn_up.weight"] = block.ffn_up; - model.tensors[blocknamestart + "ffn_down.weight"] = block.ffn_down; - } - } - - // key + value memory - { - const auto & kvctx = model.kvctx; - const auto & hparams = model.hparams; - - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - - const int64_t n_mem = n_block*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.kvctx = ggml_init(params); - if (!model.kvctx) { - fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); - return false; - } - - } - - - model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - return true; -} - - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool falcon_eval( - const falcon_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - - - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_head_kv = hparams.n_head_kv; - const int n_vocab = hparams.n_vocab; - const size_t head_dim = n_embd / n_head; - - static size_t buf_size = 256u*1024*1024; - static 
void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; -// gf.n_threads = n_threads; - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); -// struct ggml_tensor* repeat_dummy = ggml_new_tensor_3d(ctx0, inpL->type, head_dim, N + n_past, n_head); - - ggml_type wtype = GGML_TYPE_F32; - const int sizeof_wtype = ggml_type_sizef(wtype); - - for (int il = 0; il < n_block; ++il) { - struct ggml_tensor * cur; - struct ggml_tensor * layernorm_output; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - layernorm_output = ggml_norm(ctx0, inpL); - - layernorm_output = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.blocks[il].input_layernorm, layernorm_output), - layernorm_output), - ggml_repeat(ctx0, model.blocks[il].input_layernorm_b, layernorm_output)); - - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - cur = ggml_norm(ctx0, inpL); - - cur = ggml_add(ctx0, - 
ggml_mul(ctx0, - ggml_repeat(ctx0, model.blocks[il].attention_norm, cur), - cur), - ggml_repeat(ctx0, model.blocks[il].attention_norm_b, cur)); - } - else { // Falcon 7B - cur = layernorm_output; - } - - // compute QKV - - cur = ggml_mul_mat(ctx0, model.blocks[il].query_key_value, cur); - - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. - - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, head_dim, n_head, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - 0); - - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, head_dim, n_head_kv, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - head_dim * n_head * sizeof_wtype); - - struct ggml_tensor * Vcur = ggml_view_3d( - ctx0, cur, head_dim, n_head_kv, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - head_dim * (n_head + n_head_kv) * sizeof_wtype); - - // using mode = 2 for neox mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, 0); - - // store key and value to memory - { - struct ggml_tensor* k = ggml_view_1d( - ctx0, model.memory_k, N * n_head_kv * head_dim, - (ggml_element_size(model.memory_k) * n_head_kv * head_dim) * - (il * n_ctx + n_past)); - struct ggml_tensor* v = ggml_view_1d( - ctx0, model.memory_v, N * n_head_kv * head_dim, - (ggml_element_size(model.memory_v) * n_head_kv * head_dim) * - (il * n_ctx + n_past)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, 
ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * K = ggml_permute( - ctx0, - ggml_reshape_3d( - ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim, - il * n_ctx * - ggml_element_size(model.memory_k) * - n_head_kv * - head_dim), - head_dim, n_head_kv, n_past + N), - 0, 2, 1, 3); - - // K * Q - -// K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy)); - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(head_dim))) - ); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor* V = ggml_permute( - ctx0, - ggml_reshape_3d( - ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim, - il * n_ctx * - ggml_element_size(model.memory_v) * - n_head_kv * - head_dim), - head_dim, n_head_kv, n_past + N), - 0, 2, 1, 3); - -// V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy))); - V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, - model.blocks[il].wo, - cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - struct ggml_tensor* inpFF = layernorm_output; - struct 
ggml_tensor* attn_out = ggml_cpy( - ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - { - cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_up, inpFF); - cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_down, cur); - } - - cur = ggml_add(ctx0, cur, attn_out); - cur = ggml_add(ctx0, cur, inpL); - // input for next layer - inpL = cur; - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.output_norm, inpL), - inpL), - ggml_repeat(ctx0, model.output_norm_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); -// ggml_graph_compute (ctx0, &gf); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - int64_t t_load_us = 0; - - gpt2bpe_vocab vocab; - falcon_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!falcon_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from 
'%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - std::mt19937 rng(params.seed); - - if (params.prompt.empty()) { - params.prompt = "Once upon"; - } - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); -// for (size_t i = 0; i < embd_inp.size(); i++) { -// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); -// } - - if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { - params.n_predict = model.hparams.n_ctx-embd_inp.size(); - } - - printf("%s: n_predict = %d\n", __func__, params.n_predict); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - falcon_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!falcon_eval(model, params.n_threads, n_past, embd, logits, 
mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt2bpe_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str() ); - } - fflush(stdout); - - // end of text token - if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git 
a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp deleted file mode 100644 index b76bafaa8..000000000 --- a/examples/gptneox-wip/gptneox-main.cpp +++ /dev/null @@ -1,1083 +0,0 @@ -#include "ggml.h" -#include "cmpnct_gpt2bpe.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams -struct gpt_neox_hparams { - size_t n_merges = 0; - size_t n_vocab = 0; - uint32_t n_ctx = 0; - uint32_t n_embd = 0; - uint32_t n_head = 0; - uint32_t n_block = 0; - uint32_t n_rot = 0; // rotary_pct * (n_embd / n_head) - bool par_res = true; - float norm_eps = 1e-5; -}; - -struct gpt_neox_block { - // pre normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // post normalization - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt_neox_model { - gpt_neox_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - - struct ggml_tensor * lmh_g; // language model head - - std::vector blocks; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct gguf_context * ggufctx; - struct ggml_context * ctx; - struct ggml_context * kvctx; - - std::map tensors; -}; - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - uint32_t n_predict = 200; // new tokens to predict - uint32_t n_batch = 512; // 
batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - - std::string model = ""; // model path - std::string prompt = ""; - - std::string token_test = ""; - bool interactive = false; - int32_t interactive_port = -1; - int32_t n_gpu_layers = 0; -}; - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing 
(default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -// Function to check if the next argument exists -std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { - if (i + 1 < argc && argv[i + 1][0] != '-') { - return argv[++i]; - } else { - fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_k") { - params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_p") { - params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--temp") { - params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg 
== "-m" || arg == "--model") { - params.model = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "-ip" || arg == "--interactive-port") { - params.interactive = true; - params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - get_next_arg(i, argc, argv, arg, params); - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = get_next_arg(i, argc, argv, arg, params); - } - else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -gpt2bpe_vocab::id sample_top_k_top_p_repeat( - const gpt2bpe_vocab & vocab, - const float * logits, - const int32_t * last_n_tokens_data, - size_t last_n_tokens_data_size, - int top_k, - double top_p, - double temp, - int repeat_last_n, - float repeat_penalty, - std::mt19937 & rng) { - - int n_logits = vocab.id_to_token.size(); - - const auto * plogits = logits; - - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - gpt2bpe_vocab::id max_id = 0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } - } - return max_id; - } - - - std::vector> logits_id; - logits_id.reserve(n_logits); - - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { 
- // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } - } - } - - // find the top K tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - logits_id.resize(top_k); - - double maxl = -INFINITY; - for (const auto & kv : logits_id) { - maxl = std::max(maxl, kv.first); - } - - // compute probs for the top K tokens - std::vector probs; - probs.reserve(logits_id.size()); - - double sum = 0.0; - for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); - probs.push_back(p); - sum += p; - } - - // normalize the probs - for (auto & p : probs) { - p /= sum; - } - - if (top_p < 1.0f) { - double cumsum = 0.0f; - for (int i = 0; i < top_k; i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - top_k = i + 1; - probs.resize(top_k); - logits_id.resize(top_k); - break; - } - } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } - } - -// printf("\n"); -// for (int i = 0; i < (int) probs.size(); i++) { -// for (int i = 0; i < 10; i++) { -// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); -// } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - return logits_id[idx].second; - -} - -struct ggml_tensor * get_tensor_ex( struct 
ggml_context * ctx, std::string name){ - - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if( cur == NULL ) { - printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); - } else { -// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); - } - - return cur; -} - -// load the model's weights from a file -bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2bpe_vocab & vocab) { - printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); - - model.ctx = NULL; - - struct gguf_init_params ggufparams = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - - auto & ggufctx = model.ggufctx; - - ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); - - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return false; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print all kv - #if 0 - { - const int n_kv = gguf_get_n_kv(ggufctx); - - printf("%s: n_kv: %d\n", __func__, n_kv); - - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ggufctx, i); - - printf("%s: kv[%d]: key = %s\n", __func__, i, key); - } - } - #endif - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model 
license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - int keyidx; - - // check model architecture kv - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) { - printf("%s: model architecture not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model architecture not found!\n", __func__); - return false; - } - - } - - // load hparams - { - auto & hparams = model.hparams; - - bool ok = true; - int keyidx; - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.context_length"); - if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.embedding_length"); - if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.head_count"); - if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.block_count"); - if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = 
gguf_find_key(ggufctx, "gptneox.rope.dimension_count"); - if (keyidx != -1) { hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.use_parallel_residual"); - if (keyidx != -1) { hparams.par_res = gguf_get_val_bool(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.layer_norm_epsilon"); - if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return false; - } - - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_block = %d\n", __func__, hparams.n_block); - printf("%s: n_rot = %d\n", __func__, hparams.n_rot); - printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); - - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - printf("%s: tokenizer model not supported!\n", __func__); - return false; - } - } else { - printf("%s: tokenizer model not found!\n", __func__); - return false; - } - - - int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - - if (tokens_keyidx == -1) { - printf("%s: gpt2 tokenizer vocab not found!\n", __func__); - return false; - } - - int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); - - if (merges_keyidx == -1) { - printf("%s: gpt2 tokenizer merges not found!\n", __func__); - return false; - } - - hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); - hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - - printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - printf("%s: gpt2 
tokenizer merges = %zu\n", __func__, hparams.n_merges); - - for (size_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - -// printf("token %d = '%s'\n",i,word.c_str() ); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - if( vocab.id_to_token[i] == "\n" ) { - vocab.linefeed_id = i; - } - } - - std::vector> bpe_merges; - - for (size_t i = 0; i < hparams.n_merges; i++) { - - std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); - - // Split the merges - std::string first, second; - size_t pos = word.find(' ', 1); // Start the search from the second character - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - bpe_merges.push_back(std::make_pair(first, second)); - } - - vocab.populate_bpe_ranks(bpe_merges); - - - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - - if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) 
{ printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } - } - - - auto & ctx = model.ctx; - size_t ctx_size = ggml_get_mem_size(ctx); - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - - // print tensor info - #if 0 - { - const int n_tensors = gguf_get_n_tensors(ggufctx); - - printf("%s: n_tensors: %d\n", __func__, n_tensors); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name (ggufctx, i); - const size_t offset = gguf_get_tensor_offset(ggufctx, i); - - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); - } - } - #endif - - // prepare memory for the weights - { - const int n_block = model.hparams.n_block; - - model.blocks.resize(n_block); - - model.wte = ggml_get_tensor(ctx, "token_embd.weight"); - model.ln_f_g = ggml_get_tensor(ctx, "output_norm.weight"); - model.ln_f_b = ggml_get_tensor(ctx, "output_norm.bias"); - model.lmh_g = ggml_get_tensor(ctx, "output.weight"); - - // map by name - model.tensors["token_embd.weight"] = model.wte; - model.tensors["output_norm.weight"] = model.ln_f_g; - model.tensors["output_norm.bias"] = model.ln_f_b; - model.tensors["output.weight"] = model.lmh_g; - - for (int i = 0; i < n_block; ++i) { - auto & block = model.blocks[i]; - - std::string blocknamestart = "blk." 
+ std::to_string(i) + "."; - - block.ln_1_g = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); - block.ln_1_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); - - block.c_attn_attn_w = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); - block.c_attn_attn_b = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" ); - - block.c_attn_proj_w = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); - block.c_attn_proj_b = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" ); - - block.ln_2_g = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" ); - block.ln_2_b = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias"); - - block.c_mlp_fc_w = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); - block.c_mlp_fc_b = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" ); - - block.c_mlp_proj_w = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); - block.c_mlp_proj_b = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" ); - - // map by name - model.tensors[blocknamestart + "attn_norm.weight"] = block.ln_1_g; - model.tensors[blocknamestart + "attn_norm.bias"] = block.ln_1_b; - - model.tensors[blocknamestart + "attn_qkv.weight"] = block.c_attn_attn_w; - model.tensors[blocknamestart + "attn_qkv.bias"] = block.c_attn_attn_b; - - model.tensors[blocknamestart + "attn_output.weight"] = block.c_attn_proj_w; - model.tensors[blocknamestart + "attn_output.bias"] = block.c_attn_proj_b; - - model.tensors[blocknamestart + "ffn_norm.weight"] = block.ln_2_g; - model.tensors[blocknamestart + "ffn_norm.bias"] = block.ln_2_b; - - model.tensors[blocknamestart + "ffn_up.weight"] = block.c_mlp_fc_w; - model.tensors[blocknamestart + "ffn_up.bias"] = block.c_mlp_fc_b; - - model.tensors[blocknamestart + "ffn_down.weight"] = block.c_mlp_proj_w; - model.tensors[blocknamestart + "ffn_down.bias"] = block.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & kvctx = model.kvctx; - const auto & hparams = model.hparams; - - const int n_embd = 
hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - - const int64_t n_mem = n_block*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.kvctx = ggml_init(params); - if (!model.kvctx) { - fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); - return false; - } - - } - - - model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - return true; -} - - -// feed-forward network -ggml_tensor * gpt_neox_ff( - const gpt_neox_block &block, - ggml_context * ctx0, - ggml_tensor * inp, - const gpt_neox_hparams &hparams) { - - ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur)); - cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_fc_b, cur), cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // projection - // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, block.c_mlp_proj_w, cur); - - cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_proj_b, cur), cur); - return cur; -} - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt_neox_eval( - const gpt_neox_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - 
std::vector & embd_w, - size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_rot; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); - - for (int il = 0; il < n_block; ++il) { - struct ggml_tensor * cur; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - { - cur = ggml_norm(ctx0, inpL, hparams.norm_eps); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur)); - } - - // compute QKV - { - - cur = ggml_mul_mat(ctx0, 
model.blocks[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_attn_b, cur), cur); - } - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); - - // using mode = 2 for GPT-NeoX mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0); - - // store key and value to memory - { - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, 
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_proj_b, cur), cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - if (hparams.par_res == 0) { - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); - - cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } else { - struct ggml_tensor * inpFF = cur; - - // this is independent of the self-attention result, so it could be done in parallel to the self-attention - // note here we pass inpL instead of cur - cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams); - - // layer input + FF - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpL); - } - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL, hparams.norm_eps); - - // inpL = ln_f_g*inpL + 
ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - int64_t t_load_us = 0; - - gpt2bpe_vocab vocab; - gpt_neox_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, 
params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - std::mt19937 rng(params.seed); - - if (params.prompt.empty()) { - params.prompt = "Once upon"; - } - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); -// for (size_t i = 0; i < embd_inp.size(); i++) { -// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); -// } - - if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { - params.n_predict = model.hparams.n_ctx-embd_inp.size(); - } - - printf("%s: n_predict = %d\n", __func__, params.n_predict); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - const int 
n_vocab = model.hparams.n_vocab; - - gpt2bpe_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str() ); - } - fflush(stdout); - - // end of text token - if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 046f9b1e7..57d01cb0b 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -4,5 +4,5 @@ install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) + add_dependencies(${TARGET} 
BUILD_INFO) endif() diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 128d67080..6331335e3 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -39,8 +39,8 @@ static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; -static bool is_interacting = false; +static bool is_interacting = false; static void write_logfile( const llama_context * ctx, const gpt_params & params, const llama_model * model, @@ -104,7 +104,7 @@ static void sigint_handler(int signo) { int main(int argc, char ** argv) { gpt_params params; - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; g_params = &params; if (!gpt_params_parse(argc, argv, params)) { @@ -358,36 +358,10 @@ int main(int argc, char ** argv) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); - struct llama_grammar * grammar = NULL; - grammar_parser::parse_state parsed_grammar; - - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } - LOG_TEE("%s: grammar:\n", __func__); -
grammar_parser::print_grammar(stderr, parsed_grammar); - LOG_TEE("\n"); - - { - auto it = sparams.logit_bias.find(llama_token_eos(ctx)); - if (it != sparams.logit_bias.end() && it->second == -INFINITY) { - LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); - } - } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - } - LOG_TEE("\n##### Infill mode #####\n\n"); if (params.infill) { printf("\n************\n"); @@ -430,7 +404,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while (n_remain != 0 || params.interactive) { // predict @@ -549,7 +523,7 @@ int main(int argc, char ** argv) { const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - llama_sampling_accept(ctx_sampling, ctx, id); + llama_sampling_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); @@ -567,8 +541,11 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - ctx_sampling->prev.erase(ctx_sampling->prev.begin()); - ctx_sampling->prev.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -600,7 +577,7 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((ctx_sampling->prev.back() == 
llama_token_eot(ctx) || is_interacting) && params.interactive){ + if ((llama_sampling_last(ctx_sampling) == llama_token_eot(ctx) || is_interacting) && params.interactive){ if(is_interacting && !params.interactive_first) { // print an eot token printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); @@ -617,7 +594,7 @@ int main(int argc, char ** argv) { buffer += line; } while (another_line); // check if we got an empty line, if so we use the old input - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { params.input_prefix = buffer; } buffer.clear(); @@ -627,7 +604,7 @@ int main(int argc, char ** argv) { buffer += line; } while (another_line); // check if we got an empty line - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { params.input_suffix = buffer; } buffer.clear(); @@ -640,7 +617,7 @@ int main(int argc, char ** argv) { process_escapes(params.input_suffix); } suff_rm_leading_spc = params.escape; - if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -667,7 +644,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of text token in interactive mode - else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) { + else if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { @@ -740,15 +717,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - // reset grammar state if we're restarting generation - if (grammar != NULL) { - llama_grammar_free(grammar); - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = 
llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); - } + llama_sampling_reset(ctx_sampling); } is_interacting = false; } @@ -778,9 +747,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - if (grammar != NULL) { - llama_grammar_free(grammar); - } + llama_sampling_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h index e050b59be..45b2b1ad3 100644 --- a/examples/llava/llava-utils.h +++ b/examples/llava/llava-utils.h @@ -58,28 +58,30 @@ inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n // TODO: use common/sampling.h inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) { - // out of user input, sample next token - const float temp = params.sampling_params.temp; - const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k; - const float top_p = params.sampling_params.top_p; - const float tfs_z = params.sampling_params.tfs_z; - const float typical_p = params.sampling_params.typical_p; - // const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n; - // const float repeat_penalty = params.sampling_params.repeat_penalty; - // const float alpha_presence = params.sampling_params.presence_penalty; - // const float alpha_frequency = params.sampling_params.frequency_penalty; - const int mirostat = params.sampling_params.mirostat; - const float mirostat_tau = params.sampling_params.mirostat_tau; - const float mirostat_eta = params.sampling_params.mirostat_eta; - // const bool penalize_nl = params.sampling_params.penalize_nl; + auto & sparams = params.sparams; + + // out of user input, sample next token + const float temp = sparams.temp; + const int32_t top_k = sparams.top_k <= 0 ? 
llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k; + const float top_p = sparams.top_p; + const float tfs_z = sparams.tfs_z; + const float typical_p = sparams.typical_p; + // const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n; + // const float repeat_penalty = sparams.repeat_penalty; + // const float alpha_presence = sparams.presence_penalty; + // const float alpha_frequency = sparams.frequency_penalty; + const int mirostat = sparams.mirostat; + const float mirostat_tau = sparams.mirostat_tau; + const float mirostat_eta = sparams.mirostat_eta; + // const bool penalize_nl = sparams.penalize_nl; llama_token id = 0; { auto logits = llama_get_logits(ctx_llama); auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama)); - // Apply params.logit_bias map - for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) { + // Apply params.logit_bias map + for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { logits[it->first] += it->second; } @@ -91,18 +93,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - // TODO: Apply penalties - // float nl_logit = logits[llama_token_nl(ctx)]; - // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - // llama_sample_repetition_penalty(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, repeat_penalty); - // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, alpha_frequency, alpha_presence); - // if (!penalize_nl) { - // logits[llama_token_nl(ctx)] = nl_logit; - // } + // TODO: Apply penalties + // float nl_logit = logits[llama_token_nl(ctx)]; + // auto last_n_repeat = 
std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + // llama_sample_repetition_penalty(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, repeat_penalty); + // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, alpha_frequency, alpha_presence); + // if (!penalize_nl) { + // logits[llama_token_nl(ctx)] = nl_logit; + // } if (temp <= 0) { // Greedy sampling diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 1a5911c56..db5309afe 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -108,7 +108,7 @@ int main(int argc, char ** argv) { if (!gpt_params_parse(argc, argv, params)) { return 1; } - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("main", "log")); @@ -415,8 +415,7 @@ int main(int argc, char ** argv) { } } } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); @@ -459,7 +458,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); 
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -612,7 +611,7 @@ int main(int argc, char ** argv) { const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - llama_sampling_accept(ctx_sampling, ctx, id); + llama_sampling_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); @@ -631,12 +630,9 @@ int main(int argc, char ** argv) { while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - // GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context - // Most likely will remove this in the future to avoid exposing "prev" - // Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition - // penalty will be applied only based on the tokens generated by the model. - ctx_sampling->prev.erase(ctx_sampling->prev.begin()); - ctx_sampling->prev.push_back(embd_inp[n_consumed]); + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -667,12 +663,10 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt + // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { - std::string last_output; - for (auto id : ctx_sampling->prev) { - last_output += llama_token_to_piece(ctx, id); - } + const int n_prev = 32; + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. 
@@ -699,7 +693,7 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (ctx_sampling->prev.back() == llama_token_eos(ctx)) { + if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 69f9526a4..eb64adef8 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -157,7 +157,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.ctx_sampling = llama_sampling_init(params); + client.ctx_sampling = llama_sampling_init(params.sparams); } std::vector tokens_system; @@ -330,7 +330,7 @@ int main(int argc, char ** argv) { const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); - llama_sampling_accept(client.ctx_sampling, ctx, id); + llama_sampling_accept(client.ctx_sampling, ctx, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0471528a3..b5ad3cc99 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -195,10 +195,12 @@ struct llama_server_context json prompt; std::vector embd; + gpt_params params; + llama_model *model = nullptr; llama_context *ctx = nullptr; - gpt_params params; llama_sampling_context *ctx_sampling = nullptr; + int n_ctx; bool truncated = false; @@ -232,7 +234,7 @@ struct llama_server_context void rewind() { params.antiprompt.clear(); - params.grammar.clear(); + params.sparams.grammar.clear(); num_prompt_tokens = 0; num_tokens_predicted = 0; generated_text = ""; @@ -246,11 +248,14 @@ struct llama_server_context multibyte_pending = 0; n_remain = 0; n_past = 0; + params.sparams.n_prev = n_ctx; + } + void initSampling() { if (ctx_sampling != nullptr) { 
llama_sampling_free(ctx_sampling); } - ctx_sampling = llama_sampling_init(params); + ctx_sampling = llama_sampling_init(params.sparams); } bool loadModel(const gpt_params &params_) @@ -311,16 +316,32 @@ struct llama_server_context return prompt_tokens; } - bool loadGrammar() - { - ctx_sampling = llama_sampling_init(params); - return true; + void truncatePrompt(std::vector &prompt_tokens) { + const int n_left = n_ctx - params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size; + + // Keep n_keep tokens at start of prompt (at most n_ctx - 4) + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + + LOG_VERBOSE("input truncated", { + {"n_ctx", n_ctx}, + {"n_keep", params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + {"num_prompt_tokens", new_tokens.size()} + }); + + truncated = true; + prompt_tokens = new_tokens; } void loadInfill() { bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -336,6 +357,7 @@ struct llama_server_context prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx)); prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); prefix_tokens.push_back(llama_token_middle(ctx)); + auto prompt_tokens = prefix_tokens; num_prompt_tokens = prompt_tokens.size(); @@ -347,31 +369,18 @@ struct llama_server_context params.n_keep = std::min(params.n_ctx - 4, params.n_keep); // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)params.n_ctx) + if
(num_prompt_tokens >= (size_t) n_ctx) { - printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens); - // todo we probably want to cut from both sides - const int n_left = (params.n_ctx - params.n_keep) / 2; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin()); + truncatePrompt(prompt_tokens); + num_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("input truncated", { - {"n_ctx", params.n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, - }); - - truncated = true; - prompt_tokens = new_tokens; + GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx); } - else + + // push the prompt into the sampling context (do not apply grammar) + for (auto & token : prompt_tokens) { - const size_t ps = num_prompt_tokens; - std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps); + llama_sampling_accept(ctx_sampling, ctx, token, false); } // compare the evaluated prompt with the new prompt @@ -409,29 +418,18 @@ struct llama_server_context params.n_keep = std::min(n_ctx - 4, params.n_keep); // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)n_ctx) + if (num_prompt_tokens >= (size_t) n_ctx) { - const int n_left = (n_ctx - params.n_keep) / 2; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), 
prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin()); + truncatePrompt(prompt_tokens); + num_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("input truncated", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, - }); - - truncated = true; - prompt_tokens = new_tokens; + GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx); } - else + + // push the prompt into the sampling context (do not apply grammar) + for (auto & token : prompt_tokens) { - const size_t ps = num_prompt_tokens; - std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps); + llama_sampling_accept(ctx_sampling, ctx, token, false); } // compare the evaluated prompt with the new prompt @@ -530,8 +528,8 @@ struct llama_server_context llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false }; - const int32_t n_probs = params.sampling_params.n_probs; - if (params.sampling_params.temp <= 0 && n_probs > 0) + const int32_t n_probs = params.sparams.n_probs; + if (params.sparams.temp <= 0 && n_probs > 0) { // For llama_sample_token_greedy we need to sort candidates llama_sample_softmax(ctx, &cur_p); @@ -542,7 +540,7 @@ struct llama_server_context result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); } - llama_sampling_accept(ctx_sampling, ctx, result.tok); + llama_sampling_accept(ctx_sampling, ctx, result.tok, true); if (tg) { num_tokens_predicted++; @@ -606,7 +604,7 @@ struct llama_server_context const std::string token_text = token_with_probs.tok == -1 ? 
"" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; - if (params.sampling_params.n_probs > 0) + if (params.sparams.n_probs > 0) { generated_token_probs.push_back(token_with_probs); } @@ -1004,36 +1002,36 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, static json format_generation_settings(llama_server_context &llama) { - const auto & sparams = llama.params.sampling_params; + const auto & sparams = llama.params.sparams; const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx)); const bool ignore_eos = eos_bias != sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); return json{ - {"n_ctx", llama.n_ctx}, - {"model", llama.params.model_alias}, - {"seed", llama.params.seed}, - {"temp", sparams.temp}, - {"top_k", sparams.top_k}, - {"top_p", sparams.top_p}, - {"tfs_z", sparams.tfs_z}, - {"typical_p", sparams.typical_p}, - {"repeat_last_n", sparams.repeat_last_n}, - {"repeat_penalty", sparams.repeat_penalty}, - {"presence_penalty", sparams.presence_penalty}, - {"frequency_penalty", sparams.frequency_penalty}, - {"mirostat", sparams.mirostat}, - {"mirostat_tau", sparams.mirostat_tau}, - {"mirostat_eta", sparams.mirostat_eta}, - {"penalize_nl", sparams.penalize_nl}, - {"stop", llama.params.antiprompt}, - {"n_predict", llama.params.n_predict}, - {"n_keep", llama.params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", llama.stream}, - {"logit_bias", sparams.logit_bias}, - {"n_probs", sparams.n_probs}, - {"grammar", llama.params.grammar}, + {"n_ctx", llama.n_ctx}, + {"model", llama.params.model_alias}, + {"seed", llama.params.seed}, + {"temp", sparams.temp}, + {"top_k", sparams.top_k}, + {"top_p", sparams.top_p}, + {"tfs_z", sparams.tfs_z}, + {"typical_p", sparams.typical_p}, + {"repeat_last_n", sparams.penalty_last_n}, + {"repeat_penalty", sparams.penalty_repeat}, + {"frequency_penalty", sparams.penalty_freq}, + {"presence_penalty", sparams.penalty_present}, + 
{"mirostat", sparams.mirostat}, + {"mirostat_tau", sparams.mirostat_tau}, + {"mirostat_eta", sparams.mirostat_eta}, + {"penalize_nl", sparams.penalize_nl}, + {"stop", llama.params.antiprompt}, + {"n_predict", llama.params.n_predict}, + {"n_keep", llama.params.n_keep}, + {"ignore_eos", ignore_eos}, + {"stream", llama.stream}, + {"logit_bias", sparams.logit_bias}, + {"n_probs", sparams.n_probs}, + {"grammar", llama.params.sparams.grammar}, }; } @@ -1081,7 +1079,7 @@ static json format_final_response(llama_server_context &llama, const std::string {"timings", format_timings(llama)}, }; - if (llama.params.sampling_params.n_probs > 0) + if (llama.params.sparams.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1097,7 +1095,7 @@ static json format_partial_response( {"stop", false}, }; - if (llama.params.sampling_params.n_probs > 0) + if (llama.params.sparams.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1129,28 +1127,30 @@ static T json_value(const json &body, const std::string &key, const T &default_v static void parse_options_completion(const json &body, llama_server_context &llama) { gpt_params default_params; - const auto & default_sparams = default_params.sampling_params; - auto & sparams = llama.params.sampling_params; + const auto & default_sparams = default_params.sparams; - llama.stream = json_value(body, "stream", false); - llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict); - sparams.top_k = json_value(body, "top_k", default_sparams.top_k); - sparams.top_p = json_value(body, "top_p", default_sparams.top_p); - sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z); - sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p); - sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n); - sparams.temp = json_value(body, "temperature", default_sparams.temp); - sparams.repeat_penalty = 
json_value(body, "repeat_penalty", default_sparams.repeat_penalty); - sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty); - sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty); - sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat); - sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); - sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); - sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl); - llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep); - llama.params.seed = json_value(body, "seed", default_params.seed); - llama.params.grammar = json_value(body, "grammar", default_params.grammar); - sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs); + auto & params = llama.params; + auto & sparams = llama.params.sparams; + + llama.stream = json_value(body, "stream", false); + params.n_predict = json_value(body, "n_predict", default_params.n_predict); + sparams.top_k = json_value(body, "top_k", default_sparams.top_k); + sparams.top_p = json_value(body, "top_p", default_sparams.top_p); + sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z); + sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p); + sparams.temp = json_value(body, "temperature", default_sparams.temp); + sparams.penalty_last_n = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); + sparams.penalty_repeat = json_value(body, "repeat_penalty", default_sparams.penalty_repeat); + sparams.penalty_freq = json_value(body, "frequency_penalty", default_sparams.penalty_freq); + sparams.penalty_present = json_value(body, "presence_penalty", default_sparams.penalty_present); + sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat); + sparams.mirostat_tau = json_value(body, "mirostat_tau", 
default_sparams.mirostat_tau); + sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); + sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl); + params.n_keep = json_value(body, "n_keep", default_params.n_keep); + params.seed = json_value(body, "seed", default_params.seed); + sparams.grammar = json_value(body, "grammar", default_sparams.grammar); + sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs); if (body.count("prompt") != 0) { @@ -1204,8 +1204,6 @@ static void parse_options_completion(const json &body, llama_server_context &lla } } - llama.ctx_sampling = llama_sampling_init(llama.params); - LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); } @@ -1374,15 +1372,9 @@ int main(int argc, char **argv) llama.rewind(); llama_reset_timings(llama.ctx); - parse_options_completion(json::parse(req.body), llama); - if (!llama.loadGrammar()) - { - res.status = 400; - return; - } - + llama.initSampling(); llama.loadPrompt(); llama.beginCompletion(); @@ -1414,7 +1406,7 @@ int main(int argc, char **argv) } auto probs = llama.generated_token_probs; - if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) { + if (llama.params.sparams.n_probs > 0 && llama.stopped_word) { const std::vector stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false); probs = std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size()); } @@ -1466,7 +1458,7 @@ int main(int argc, char **argv) std::vector probs_output = {}; - if (llama.params.sampling_params.n_probs > 0) { + if (llama.params.sparams.n_probs > 0) { const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); @@ -1537,14 +1529,9 @@ int 
main(int argc, char **argv) llama.rewind(); llama_reset_timings(llama.ctx); - parse_options_infill(json::parse(req.body), llama); - if (!llama.loadGrammar()) - { - res.status = 400; - return; - } + llama.initSampling(); llama.loadInfill(); llama.beginCompletion(); const auto chunked_content_provider = [&](size_t, DataSink & sink) { @@ -1587,7 +1574,7 @@ int main(int argc, char **argv) std::vector probs_output = {}; - if (llama.params.sampling_params.n_probs > 0) { + if (llama.params.sparams.n_probs > 0) { const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); @@ -1694,7 +1681,9 @@ int main(int argc, char **argv) const json body = json::parse(req.body); llama.rewind(); + llama_reset_timings(llama.ctx); + if (body.count("content") != 0) { llama.prompt = body["content"]; @@ -1704,6 +1693,8 @@ int main(int argc, char **argv) llama.prompt = ""; } llama.params.n_predict = 0; + + llama.initSampling(); llama.loadPrompt(); llama.beginCompletion(); llama.doCompletion(); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 24f49012a..894321ce9 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -112,16 +112,16 @@ int main(int argc, char ** argv) { bool has_eos = false; // target model sampling context - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); // draft sequence data std::vector drafts(n_seq_dft); - params.grammar.clear(); // the draft samplers will copy the target sampler's grammar - params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp); + params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar 
+ params.sparams.temp = std::max(0.01f, params.sparams.temp); for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].ctx_sampling = llama_sampling_init(params); + drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); @@ -154,7 +154,7 @@ int main(int argc, char ** argv) { // sample from the target model llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); - llama_sampling_accept(ctx_sampling, ctx_tgt, id); + llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); @@ -328,7 +328,7 @@ int main(int argc, char ** argv) { const int s = sa[is]; - llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id); + llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); drafts[s].tokens.push_back(id); diff --git a/llama.cpp b/llama.cpp index ec8ffad33..365349335 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1018,8 +1018,8 @@ enum e_model { }; static const size_t kB = 1024; -static const size_t MB = kB*kB; -static const size_t GB = kB*kB*kB; +static const size_t MB = 1024*kB; +static const size_t GB = 1024*MB; struct llama_hparams { bool vocab_only; @@ -1042,21 +1042,21 @@ struct llama_hparams { float f_max_alibi_bias; bool operator!=(const llama_hparams & other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_head != other.n_head) return true; - if (this->n_head_kv != other.n_head_kv) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_ff != other.n_ff) return true; + if (this->n_embd != other.n_embd) return true; 
+ if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; const float EPSILON = 1e-9; - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; return false; @@ -1195,11 +1195,11 @@ struct llama_vocab { id special_sep_id = -1; id special_pad_id = -1; - id linefeed_id = 13; + id linefeed_id = 13; id special_prefix_id = 32007; id special_middle_id = 32009; id special_suffix_id = 32008; - id special_eot_id = 32010; + id special_eot_id = 32010; int find_bpe_rank(std::string token_left, std::string token_right) const { replace_all(token_left, " ", "\u0120"); @@ -1359,10 +1359,7 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - // TODO: this should be: - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); - // change it and test that it works - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; @@ -7417,37 +7414,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array llama_sample_temp(ctx, candidates_p, temp); } -void 
llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { - if (last_tokens_size == 0 || penalty == 1.0f) { - return; - } - - const int64_t t_start_sample_us = ggml_time_us(); - - for (size_t i = 0; i < candidates->size; ++i) { - const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); - if (token_iter == last_tokens + last_tokens_size) { - continue; - } - - // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. - // This is common fix for this problem, which is to multiply by the penalty instead of dividing. - if (candidates->data[i].logit <= 0) { - candidates->data[i].logit *= penalty; - } else { - candidates->data[i].logit /= penalty; - } - } - - candidates->sorted = false; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { - if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { +void llama_sample_repetition_penalties( + struct llama_context * ctx, + llama_token_data_array * candidates, + const llama_token * last_tokens, + size_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present) { + if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) { return; } @@ -7455,19 +7430,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l // Create a frequency map to count occurrences of each token in last_tokens std::unordered_map token_count; - for (size_t i = 0; i < last_tokens_size; ++i) { - 
token_count[last_tokens_p[i]]++; + for (size_t i = 0; i < penalty_last_n; ++i) { + token_count[last_tokens[i]]++; } // Apply frequency and presence penalties to the candidates for (size_t i = 0; i < candidates->size; ++i) { - auto token_iter = token_count.find(candidates->data[i].id); + const auto token_iter = token_count.find(candidates->data[i].id); if (token_iter == token_count.end()) { continue; } - int count = token_iter->second; - candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence; + const int count = token_iter->second; + + // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. + // This is common fix for this problem, which is to multiply by the penalty instead of dividing. + if (candidates->data[i].logit <= 0) { + candidates->data[i].logit *= penalty_repeat; + } else { + candidates->data[i].logit /= penalty_repeat; + } + + candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present; } candidates->sorted = false; diff --git a/llama.h b/llama.h index 51010e037..306f5b383 100644 --- a/llama.h +++ b/llama.h @@ -560,21 +560,15 @@ extern "C" { LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - LLAMA_API void llama_sample_repetition_penalty( - struct llama_context * ctx, - llama_token_data_array * candidates, - const llama_token * last_tokens, - size_t last_tokens_size, - float penalty); - /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
- LLAMA_API void llama_sample_frequency_and_presence_penalties( + LLAMA_API void llama_sample_repetition_penalties( struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, - size_t last_tokens_size, - float alpha_frequency, - float alpha_presence); + size_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present); /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 019c0d462..32e58941c 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -8,11 +8,9 @@ #include #include #include -#include #include #include - static void dump(const llama_token_data_array * candidates) { for (size_t i = 0; i < candidates->size; i++) { printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); @@ -21,7 +19,6 @@ static void dump(const llama_token_data_array * candidates) { #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) - static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { size_t n_vocab = probs.size(); std::vector candidates; @@ -37,13 +34,12 @@ static void test_top_k(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p) { size_t n_vocab = probs.size(); std::vector candidates; @@ -59,13 +55,12 @@ static void test_top_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float z) { size_t n_vocab = probs.size(); std::vector candidates; @@ -80,13 +75,12 @@ static void test_tfs(const 
std::vector & probs, const std::vector llama_sample_tail_free(nullptr, &candidates_p, z, 1); DUMP(&candidates_p); - assert(candidates_p.size == expected_probs.size()); + GGML_ASSERT(candidates_p.size == expected_probs.size()); for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); } } - static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { size_t n_vocab = probs.size(); std::vector candidates; @@ -101,18 +95,17 @@ static void test_typical(const std::vector & probs, const std::vector & probs, const std::vector & last_tokens, - const std::vector & expected_probs, float penalty + const std::vector & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence ) { - assert(probs.size() == expected_probs.size()); + GGML_ASSERT(probs.size() == expected_probs.size()); size_t n_vocab = probs.size(); std::vector candidates; @@ -125,41 +118,13 @@ static void test_repetition_penalty( llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_sample_softmax(nullptr, &candidates_p); DUMP(&candidates_p); - llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty); + llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence); llama_sample_softmax(nullptr, &candidates_p); DUMP(&candidates_p); - assert(candidates_p.size == expected_probs.size()); + GGML_ASSERT(candidates_p.size == expected_probs.size()); for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6); - } -} - - -static void test_frequency_presence_penalty( - const std::vector & probs, const std::vector & last_tokens, - const std::vector & expected_probs, float 
alpha_frequency, float alpha_presence -) { - assert(probs.size() == expected_probs.size()); - - size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - float logit = log(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - // DUMP(&candidates_p); - llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence); - llama_sample_softmax(nullptr, &candidates_p); - // DUMP(&candidates_p); - - assert(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); } } @@ -181,13 +146,13 @@ int main(void) { test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f); test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f); - test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f); - 
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f); - test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f); printf("OK\n");