From 1481a9cf25ea2e4abef6b13a57660a35f3e66af1 Mon Sep 17 00:00:00 2001
From: Evan Jones
Date: Fri, 28 Apr 2023 11:59:37 -0400
Subject: [PATCH] llama : add session file format and saved sessions in main
 (#1169)

---
 examples/chat-13B.sh   |  4 +-
 examples/common.cpp    |  7 ++++
 examples/common.h      |  1 +
 examples/main/main.cpp | 89 ++++++++++++++++++++++++++++++++++++++++++
 llama.cpp              | 53 +++++++++++++++++++++++++
 llama.h                |  4 ++
 6 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh
index 4265d7b66..2fac37784 100755
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -31,8 +31,6 @@ The transcript only includes text, it does not include markup like HTML and Mark
 $USER_NAME: Hello, $AI_NAME!
 $AI_NAME: Hello $USER_NAME! How may I help you today?
-$USER_NAME: What time is it?
-$AI_NAME: It is $(date +%H:%M).
 $USER_NAME: What year is it?
 $AI_NAME: We are in $(date +%Y).
 $USER_NAME: Please tell me the largest city in Europe.
@@ -50,4 +48,6 @@ $AI_NAME: The arguments are stored in process.argv.
     argv[3] is the second argument passed to the script and so on.
 $USER_NAME: Name a color.
 $AI_NAME: Blue
+$USER_NAME: What time is it?
+$AI_NAME: It is $(date +%H:%M).
 $USER_NAME:" "$@"
diff --git a/examples/common.cpp b/examples/common.cpp
index c0e87eb9f..9f10dc268 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -61,6 +61,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.prompt = argv[i];
+        } else if (arg == "--session") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.path_session = argv[i];
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -228,6 +234,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
diff --git a/examples/common.h b/examples/common.h
index 6f26b514d..9d3697d79 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -31,6 +31,7 @@ struct gpt_params {
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
+    std::string path_session = "";       // path to file for saving/loading model eval state
     std::string input_prefix = "";       // string to prefix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index f9c9e9d98..fda65574f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -157,6 +157,32 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
 
+    std::string path_session = params.path_session;
+    std::vector<llama_token> session_tokens;
+
+    if (!path_session.empty()) {
+        fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
+
+        // REVIEW - fopen to check for existing session
+        FILE * fp = std::fopen(path_session.c_str(), "rb");
+        if (fp != NULL) {
+            std::fclose(fp);
+
+            session_tokens.resize(params.n_ctx);
+            size_t n_token_count_out = 0;
+            const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
+            session_tokens.resize(n_token_count_out);
+
+            if (n_session_bytes > 0) {
+                fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
+            } else {
+                fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
+            }
+        } else {
+            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+        }
+    }
+
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
@@ -167,6 +193,26 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    // debug message about similarity of saved session, if applicable
+    size_t n_matching_session_tokens = 0;
+    if (session_tokens.size()) {
+        for (llama_token id : session_tokens) {
+            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                break;
+            }
+            n_matching_session_tokens++;
+        }
+        if (n_matching_session_tokens >= embd_inp.size()) {
+            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
+            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        } else {
+            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        }
+    }
+
     // number of tokens to keep when resetting context
     if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
         params.n_keep = (int)embd_inp.size();
@@ -252,9 +298,16 @@ int main(int argc, char ** argv) {
     bool is_antiprompt = false;
     bool input_noecho  = false;
 
+    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
+    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
+    // initial prompt so it doesn't need to be an exact match.
+    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
+
+
     int n_past     = 0;
     int n_remain   = params.n_predict;
     int n_consumed = 0;
+    int n_session_consumed = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
     set_console_color(con_st, CONSOLE_COLOR_PROMPT);
@@ -276,6 +329,9 @@ int main(int argc, char ** argv) {
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
 
+                // REVIEW - stop saving session if we run out of context
+                path_session = "";
+
                 //printf("\n---\n");
                 //printf("resetting: '");
                 //for (int i = 0; i < (int) embd.size(); i++) {
@@ -285,6 +341,28 @@ int main(int argc, char ** argv) {
                 //printf("\n---\n");
             }
 
+            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+            // REVIEW
+            if (n_session_consumed < (int) session_tokens.size()) {
+                size_t i = 0;
+                for ( ; i < embd.size(); i++) {
+                    if (embd[i] != session_tokens[n_session_consumed]) {
+                        session_tokens.resize(n_session_consumed);
+                        break;
+                    }
+
+                    n_past++;
+                    n_session_consumed++;
+
+                    if (n_session_consumed >= (int) session_tokens.size()) {
+                        break;
+                    }
+                }
+                if (i > 0) {
+                    embd.erase(embd.begin(), embd.begin() + i);
+                }
+            }
+
             // evaluate tokens in batches
             // embd is typically prepared beforehand to fit within a batch, but not always
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
@@ -298,6 +376,11 @@ int main(int argc, char ** argv) {
                 }
                 n_past += n_eval;
             }
+
+            if (embd.size() > 0 && !path_session.empty()) {
+                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
+                n_session_consumed = session_tokens.size();
+            }
         }
 
         embd.clear();
@@ -309,6 +392,12 @@ int main(int argc, char ** argv) {
             const float temp            = params.temp;
            const float repeat_penalty  = params.repeat_penalty;
 
+            // optionally save the session on first sample (for faster prompt loading next time)
+            if (!path_session.empty() && need_to_save_session) {
+                need_to_save_session = false;
+                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+            }
+
             llama_token id = 0;
 
             {
diff --git a/llama.cpp b/llama.cpp
index bfebf14bf..dca017db6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2431,3 +2431,56 @@ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(
     return ctx->model.tensors_by_name;
 }
 
+size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    // TODO leverage mmap
+    llama_file file(path_session, "rb");
+    const uint32_t magic = file.read_u32();
+    const uint32_t version = file.read_u32();
+
+    if (!(magic == 'ggsn' && version == 0)) {
+        fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+        return 0;
+    }
+
+    llama_hparams session_hparams;
+    file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+    // REVIEW
+    if (session_hparams != ctx->model.hparams) {
+        fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+        return 0;
+    }
+
+    const uint32_t n_token_count = file.read_u32();
+    LLAMA_ASSERT(n_token_capacity >= n_token_count);
+    file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+    *n_token_count_out = n_token_count;
+
+    const size_t n_state_size = file.size - file.tell();
+    const size_t n_orig_state_size = llama_get_state_size(ctx);
+    if (n_state_size != n_orig_state_size) {
+        fprintf(stderr, "%s : failed to validate state size\n", __func__);
+    }
+    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
+    file.read_raw(state_data.get(), n_state_size);
+    return llama_set_state_data(ctx, state_data.get());
+}
+
+size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    // TODO save temp & swap
+    llama_file file(path_session, "wb");
+
+    const size_t n_state_size = llama_get_state_size(ctx);
+    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
+    llama_copy_state_data(ctx, state_data.get());
+
+    file.write_u32('ggsn'); // magic
+    file.write_u32(0);      // version
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    file.write_u32((uint32_t) n_token_count); // REVIEW
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    file.write_raw(state_data.get(), n_state_size);
+    return n_state_size; // REVIEW
+}
diff --git a/llama.h b/llama.h
index 17dac0689..86a7d279a 100644
--- a/llama.h
+++ b/llama.h
@@ -133,6 +133,10 @@ extern "C" {
     // Returns the number of bytes read
     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
+    // Save/load session file
+    LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
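
Note (not part of the patch): as written by llama_save_session_file above, a session file is laid out as a u32 magic 'ggsn', a u32 version (0), the raw llama_hparams of the model, a u32 token count followed by that many llama_token values, and finally the opaque state blob produced by llama_copy_state_data. The sketch below shows how the new API might be called from a standalone program; it assumes the llama_init_from_file / llama_free entry points already exposed by llama.h at this point, and the model path, session path, and control flow are illustrative placeholders, not part of this change.

// Sketch of a standalone caller of the session API added in this patch.
// Paths and parameters are illustrative; prompt evaluation is elided.
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    llama_context_params lparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", lparams);
    if (ctx == NULL) {
        return 1;
    }

    const char * path_session = "prompt.session";          // hypothetical cache path
    std::vector<llama_token> tokens(lparams.n_ctx);
    size_t n_token_count = 0;

    // llama_load_session_file opens the file unconditionally, so check for
    // existence first, as main.cpp does in this patch.
    FILE * fp = std::fopen(path_session, "rb");
    if (fp != NULL) {
        std::fclose(fp);
        if (llama_load_session_file(ctx, path_session, tokens.data(), tokens.size(), &n_token_count) > 0) {
            fprintf(stderr, "restored %zu cached prompt tokens\n", n_token_count);
        }
    } else {
        // ... tokenize and evaluate the prompt here, filling `tokens` and n_token_count ...
        llama_save_session_file(ctx, path_session, tokens.data(), n_token_count);
    }

    llama_free(ctx);
    return 0;
}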