From 319cdb3e1ffe263cf5b08249c9559e011396c1de Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 10 Mar 2023 21:50:46 +0200
Subject: [PATCH] Final touches

---
 README.md         |  3 +--
 main.cpp          |  1 +
 models/.gitignore |  0
 utils.cpp         | 54 +++++++++++++++++++++++------------------------
 utils.h           |  6 +++---
 5 files changed, 32 insertions(+), 32 deletions(-)
 create mode 100644 models/.gitignore

diff --git a/README.md b/README.md
index 87808fd96..d2b9a70e5 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,5 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
-- No Windows support
 - x86 quantization support [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon
-
+
diff --git a/main.cpp b/main.cpp
index fb9eb17b1..982adf165 100644
--- a/main.cpp
+++ b/main.cpp
@@ -728,6 +728,7 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == 2) {
+            printf(" [end of text]\n");
             break;
         }
     }
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 000000000..e69de29bb
diff --git a/utils.cpp b/utils.cpp
index 70a2ac2db..cd9c00157 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -231,39 +231,39 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 }
 
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    auto res = gpt_tokenize(vocab, text);
-
-    if (bos) {
-        res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    }
-
-    //std::vector<gpt_vocab::id> res;
+    //auto res = gpt_tokenize(vocab, text);
 
     //if (bos) {
-    //    res.push_back(1); // TODO: replace with vocab.bos
+    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
     //}
 
-    // find the longest token that matches the text
-    //int pos = 0;
-    //while (true) {
-    //    int l = 0;
-    //    int t = 0;
-    //    for (const auto & kv : vocab.id_to_token) {
-    //        if (kv.second.size() < l) continue;
-    //        if (kv.second.size() > text.size() - pos) continue;
-    //        if (text.substr(pos, kv.second.size()) == kv.second) {
-    //            l = kv.second.size();
-    //            t = kv.first;
-    //        }
-    //    }
+    std::vector<gpt_vocab::id> res;
 
-    //    if (l == 0 && t != 13) {
-    //        break;
-    //    }
+    if (bos) {
+        res.push_back(1); // TODO: replace with vocab.bos
+    }
 
-    //    res.push_back(t);
-    //    pos += l;
-    //}
+    //find the longest token that matches the text
+    int pos = 0;
+    while (true) {
+        int l = 0;
+        int t = 0;
+        for (const auto & kv : vocab.id_to_token) {
+            if (kv.second.size() < l) continue;
+            if (kv.second.size() > text.size() - pos) continue;
+            if (text.substr(pos, kv.second.size()) == kv.second) {
+                l = kv.second.size();
+                t = kv.first;
+            }
+        }
+
+        if (l == 0 && t != 13) {
+            break;
+        }
+
+        res.push_back(t);
+        pos += l;
+    }
 
     return res;
 }
diff --git a/utils.h b/utils.h
index d291964a5..20c42ba9c 100644
--- a/utils.h
+++ b/utils.h
@@ -15,12 +15,12 @@ struct gpt_params {
     int32_t seed      = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
+    int32_t n_predict = 128; // new tokens to predict
 
     // sampling parameters
-    int32_t top_k = 100;
+    int32_t top_k = 40;
     float   top_p = 0.95f;
-    float   temp  = 0.8f;
+    float   temp  = 0.80f;
 
     int32_t n_batch = 8; // batch size for prompt processing
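
Note: the utils.cpp hunk above switches llama_tokenize from delegating to
gpt_tokenize over to a greedy longest-match scan: at each position it tries
every vocabulary entry and consumes the longest one that matches. Below is a
minimal self-contained sketch of that idea; the toy vocabulary and the
free-standing tokenize() helper are illustrative only (the real code loads
vocab.id_to_token from the model file, prepends token 1 as bos, and keeps a
special case for the newline token 13 that is omitted here).

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Greedy longest-match tokenization: repeatedly take the longest vocabulary
// entry that matches the text at the current position.
static std::vector<int> tokenize(const std::map<int, std::string> & id_to_token,
                                 const std::string & text) {
    std::vector<int> res;
    size_t pos = 0;
    while (pos < text.size()) {
        size_t l = 0; // length of the best match so far
        int    t = 0; // token id of the best match so far
        for (const auto & kv : id_to_token) {
            if (kv.second.size() < l) continue;                 // shorter than best
            if (kv.second.size() > text.size() - pos) continue; // runs past the end
            if (text.compare(pos, kv.second.size(), kv.second) == 0) {
                l = kv.second.size();
                t = kv.first;
            }
        }
        if (l == 0) {
            break; // nothing in the vocabulary matches here
        }
        res.push_back(t);
        pos += l;
    }
    return res;
}

int main() {
    // hypothetical vocabulary; token ids are arbitrary
    const std::map<int, std::string> vocab = {
        {3, "he"}, {4, "hell"}, {5, "hello"}, {6, " world"},
    };
    for (const int id : tokenize(vocab, "hello world")) {
        printf("%d ", id); // prints: 5 6
    }
    printf("\n");
    return 0;
}

One consequence worth noting: the scan costs O(positions x vocabulary size)
per call and, unlike a true BPE/sentencepiece tokenizer, it can produce a
different token sequence than the one the model was trained on, which may be
related to the unicode issues mentioned in the README.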
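
Note: the utils.h hunk tightens the default sampling parameters (top_k 100 ->
40, n_predict 200 -> 128). top_k sampling keeps only the k most probable
candidate tokens before drawing one. Here is a sketch of that filtering step,
assuming a vector of (probability, token id) pairs as input; keep_top_k is a
hypothetical name for illustration, not the repo's API:

#include <algorithm>
#include <utility>
#include <vector>

// Keep only the top_k highest-probability (probability, token id) candidates.
static void keep_top_k(std::vector<std::pair<double, int>> & probs_id, size_t top_k) {
    if (probs_id.size() <= top_k) {
        return; // already small enough
    }
    // move the top_k largest probabilities to the front, then drop the rest
    std::partial_sort(probs_id.begin(), probs_id.begin() + top_k, probs_id.end(),
                      [](const std::pair<double, int> & a,
                         const std::pair<double, int> & b) {
                          return a.first > b.first;
                      });
    probs_id.resize(top_k);
}

Lowering top_k from 100 to 40 narrows sampling to more probable tokens,
generally trading some diversity for more coherent output; temp (0.80f) and
top_p (0.95f) then reshape and cap the remaining distribution.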