From a316a425d04027453dc0fd45f003b647c12f66f9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 20:26:40 +0200 Subject: [PATCH] Overhaul the examples structure - main -> examples - utils -> examples (renamed to "common") - quantize -> examples - separate tools for "perplexity" and "embedding" Hope I didn't break something ! --- .gitignore | 1 + CMakeLists.txt | 29 +--- Makefile | 19 ++- examples/CMakeLists.txt | 36 +++++ utils.cpp => examples/common.cpp | 4 +- utils.h => examples/common.h | 0 examples/embedding/CMakeLists.txt | 4 + examples/embedding/README.md | 3 + examples/embedding/embedding.cpp | 106 +++++++++++++ examples/main/CMakeLists.txt | 4 + examples/main/README.md | 3 + main.cpp => examples/main/main.cpp | 119 ++------------ examples/perplexity/CMakeLists.txt | 4 + examples/perplexity/README.md | 3 + examples/perplexity/perplexity.cpp | 146 ++++++++++++++++++ examples/quantize/CMakeLists.txt | 4 + examples/quantize/README.md | 3 + .../quantize/quantize.cpp | 0 ggml.c | 26 ++-- tests/CMakeLists.txt | 2 +- tests/test-tokenizer-0.cpp | 6 +- 21 files changed, 361 insertions(+), 161 deletions(-) create mode 100644 examples/CMakeLists.txt rename utils.cpp => examples/common.cpp (99%) rename utils.h => examples/common.h (100%) create mode 100644 examples/embedding/CMakeLists.txt create mode 100644 examples/embedding/README.md create mode 100644 examples/embedding/embedding.cpp create mode 100644 examples/main/CMakeLists.txt create mode 100644 examples/main/README.md rename main.cpp => examples/main/main.cpp (76%) create mode 100644 examples/perplexity/CMakeLists.txt create mode 100644 examples/perplexity/README.md create mode 100644 examples/perplexity/perplexity.cpp create mode 100644 examples/quantize/CMakeLists.txt create mode 100644 examples/quantize/README.md rename quantize.cpp => examples/quantize/quantize.cpp (100%) diff --git a/.gitignore b/.gitignore index 3087b0ea5..ce01fd541 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ models/* /main /quantize /result +/perplexity arm_neon.h compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 51af97c4d..a1ff5a44e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,17 +211,6 @@ endif() # Build libraries # -add_library(utils OBJECT - utils.cpp - utils.h) - -target_include_directories(utils PUBLIC .) -target_compile_features(utils PUBLIC cxx_std_11) # don't bump -target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS}) -if (BUILD_SHARED_LIBS) - set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON) -endif() - add_library(ggml OBJECT ggml.c ggml.h) @@ -239,22 +228,12 @@ add_library(llama target_include_directories(llama PUBLIC .) target_compile_features(llama PUBLIC cxx_std_11) # don't bump -target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS}) +target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS}) if (BUILD_SHARED_LIBS) set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) endif() -# -# Executables -# - -add_executable(main main.cpp) -target_link_libraries(main PRIVATE llama ggml utils) - -add_executable(quantize quantize.cpp) -target_link_libraries(quantize PRIVATE llama ggml utils) - # # programs, examples and tests # @@ -264,6 +243,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) add_subdirectory(tests) endif () -#if (LLAMA_BUILD_EXAMPLES) -# add_subdirectory(examples) -#endif() +if (LLAMA_BUILD_EXAMPLES) + add_subdirectory(examples) +endif() diff --git a/Makefile b/Makefile index e8b128cb8..98a2d85f3 100644 --- a/Makefile +++ b/Makefile @@ -212,7 +212,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize +default: main quantize perplexity # # Build library @@ -224,20 +224,23 @@ ggml.o: ggml.c ggml.h llama.o: llama.cpp llama.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o -utils.o: utils.cpp utils.h - $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o +common.o: examples/common.cpp examples/common.h + $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -f *.o main quantize + rm -vf *.o main quantize perplexity -main: main.cpp ggml.o llama.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS) +main: examples/main/main.cpp ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: quantize.cpp ggml.o llama.o utils.o - $(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o + $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) + +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) # # Tests diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..ce3a34710 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,36 @@ +# dependencies + +find_package(Threads REQUIRED) + +# third-party + +# ... + +# common + +set(TARGET common) + +add_library(${TARGET} OBJECT + common.h + common.cpp + ) + +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +target_include_directories(${TARGET} PUBLIC .) +target_compile_features(${TARGET} PUBLIC cxx_std_11) +target_link_libraries(${TARGET} PRIVATE llama) + +# examples + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +if (EMSCRIPTEN) +else() + add_subdirectory(main) + add_subdirectory(quantize) + add_subdirectory(perplexity) + add_subdirectory(embedding) +endif() diff --git a/utils.cpp b/examples/common.cpp similarity index 99% rename from utils.cpp rename to examples/common.cpp index cea309628..afa7d4026 100644 --- a/utils.cpp +++ b/examples/common.cpp @@ -1,6 +1,6 @@ -#include "ggml.h" +#include "common.h" -#include "utils.h" +#include "ggml.h" #include #include diff --git a/utils.h b/examples/common.h similarity index 100% rename from utils.h rename to examples/common.h diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt new file mode 100644 index 000000000..88c425d4a --- /dev/null +++ b/examples/embedding/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET embedding) +add_executable(${TARGET} embedding.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md new file mode 100644 index 000000000..21d8be65f --- /dev/null +++ b/examples/embedding/README.md @@ -0,0 +1,3 @@ +# embedding + +TODO diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp new file mode 100644 index 000000000..3015293f7 --- /dev/null +++ b/examples/embedding/embedding.cpp @@ -0,0 +1,106 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + params.embedding = true; + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + int n_past = 0; + + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + // tokenize the prompt + auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "\n"); + } + + if (params.embedding){ + if (embd_inp.size() > 0) { + if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + const auto embeddings = llama_get_embeddings(ctx); + + // TODO: print / use the embeddings + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt new file mode 100644 index 000000000..b2dcc2910 --- /dev/null +++ b/examples/main/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET main) +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/README.md b/examples/main/README.md new file mode 100644 index 000000000..4701aa558 --- /dev/null +++ b/examples/main/README.md @@ -0,0 +1,3 @@ +# main + +TODO diff --git a/main.cpp b/examples/main/main.cpp similarity index 76% rename from main.cpp rename to examples/main/main.cpp index 77260bb71..b5f1a7b5c 100644 --- a/main.cpp +++ b/examples/main/main.cpp @@ -1,5 +1,4 @@ -#include "utils.h" -#include "ggml.h" +#include "common.h" #include "llama.h" #include @@ -65,79 +64,6 @@ void set_console_state(console_state new_st) } } -std::vector softmax(const std::vector& logits) { - std::vector probs(logits.size()); - float max_logit = logits[0]; - for (float v : logits) max_logit = std::max(max_logit, v); - double sum_exp = 0.0; - for (size_t i = 0; i < logits.size(); i++) { - // Subtract the maximum logit value from the current logit value for numerical stability - float logit = logits[i] - max_logit; - double exp_logit = std::exp(logit); - sum_exp += exp_logit; - probs[i] = exp_logit; - } - for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; - return probs; -} - -void perplexity(llama_context * ctx, const gpt_params & params) { - // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research - // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` - // Output: `perplexity: 13.5106 [114/114]` - auto tokens = ::llama_tokenize(ctx, params.prompt, true); - - int count = 0; - double nll = 0.0; - int seq_count = tokens.size() / params.n_ctx; - - fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); - - for (int i = 0; i < seq_count; ++i) { - int start = i * params.n_ctx; - int end = start + params.n_ctx - 1; - std::vector embd(tokens.begin() + start, tokens.begin() + end); - auto start_t = std::chrono::high_resolution_clock::now(); - if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return; - } - auto end_t = std::chrono::high_resolution_clock::now(); - if (i == 0) { - double seconds = std::chrono::duration(end_t - start_t).count(); - printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); - } - // We get the logits for all the tokens in the context window (params.n_ctx) - // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, - // calculate the perplexity over the last half the window (so the model always has - // some context to predict the token). - // - // We rely on the fact that attention in the forward pass only looks at previous - // tokens here, so the logits returned for each token are an accurate representation - // of what the model would have predicted at that point. - // - // Example, we have a context window of 512, we will compute perplexity for each of the - // last 256 tokens. Then, we split the input up into context window size chunks to - // process the entire prompt. - - auto logits = llama_get_logits(ctx); - for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. - int n_vocab = llama_n_vocab(ctx); - std::vector tok_logits( - logits + j * n_vocab, - logits + (j + 1) * n_vocab); - double prob = softmax(tok_logits)[tokens[start + j + 1]]; - nll += -std::log(prob); - ++count; - } - // perplexity is e^(average negative log-likelihood) - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - fflush(stdout); - } - printf("\n"); -} - static bool is_interacting = false; #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) @@ -155,9 +81,6 @@ void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - // has to be called once at the start of the program to init ggml stuff - ggml_time_init(); - gpt_params params; params.model = "models/llama-7B/ggml-model.bin"; @@ -165,6 +88,14 @@ int main(int argc, char ** argv) { return 1; } + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); @@ -198,9 +129,7 @@ int main(int argc, char ** argv) { lparams.n_parts = params.n_parts; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; - lparams.logits_all = params.perplexity; lparams.use_mlock = params.use_mlock; - lparams.embedding = params.embedding; ctx = llama_init_from_file(params.model.c_str(), lparams); @@ -236,11 +165,6 @@ int main(int argc, char ** argv) { return 0; } - if (params.perplexity) { - perplexity(ctx, params); - exit(0); - } - int n_past = 0; // Add a space in front of the first character to match OG llama tokenizer behavior @@ -346,27 +270,6 @@ int main(int argc, char ** argv) { // the first thing we will do is to output the prompt, so set color accordingly set_console_state(CONSOLE_STATE_PROMPT); - if (params.embedding){ - embd = embd_inp; - - if (embd.size() > 0) { - if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - } - - const auto embeddings = llama_get_embeddings(ctx); - - // TODO: print / use the embeddings - - if (params.use_color) { - printf(ANSI_COLOR_RESET); - } - - return 0; - } - while (remaining_tokens > 0 || params.interactive) { // predict if (embd.size() > 0) { @@ -392,10 +295,6 @@ int main(int argc, char ** argv) { auto logits = llama_get_logits(ctx); if (params.ignore_eos) { - // set the logit of the eos token to zero to avoid sampling it - //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0; - // TODO: this does not work of params.logits_all == true - assert(params.perplexity == false); logits[llama_token_eos()] = 0; } diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt new file mode 100644 index 000000000..5836df8b2 --- /dev/null +++ b/examples/perplexity/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET perplexity) +add_executable(${TARGET} perplexity.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md new file mode 100644 index 000000000..a932275c2 --- /dev/null +++ b/examples/perplexity/README.md @@ -0,0 +1,3 @@ +# perplexity + +TODO diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp new file mode 100644 index 000000000..f0266a01f --- /dev/null +++ b/examples/perplexity/perplexity.cpp @@ -0,0 +1,146 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include + +std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) max_logit = std::max(max_logit, v); + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + float logit = logits[i] - max_logit; + double exp_logit = std::exp(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; + return probs; +} + +void perplexity(llama_context * ctx, const gpt_params & params) { + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Output: `perplexity: 13.5106 [114/114]` + auto tokens = ::llama_tokenize(ctx, params.prompt, true); + + int count = 0; + double nll = 0.0; + int seq_count = tokens.size() / params.n_ctx; + + fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); + + for (int i = 0; i < seq_count; ++i) { + int start = i * params.n_ctx; + int end = start + params.n_ctx - 1; + std::vector embd(tokens.begin() + start, tokens.begin() + end); + auto start_t = std::chrono::high_resolution_clock::now(); + if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + auto end_t = std::chrono::high_resolution_clock::now(); + if (i == 0) { + double seconds = std::chrono::duration(end_t - start_t).count(); + printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); + } + // We get the logits for all the tokens in the context window (params.n_ctx) + // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, + // calculate the perplexity over the last half the window (so the model always has + // some context to predict the token). + // + // We rely on the fact that attention in the forward pass only looks at previous + // tokens here, so the logits returned for each token are an accurate representation + // of what the model would have predicted at that point. + // + // Example, we have a context window of 512, we will compute perplexity for each of the + // last 256 tokens. Then, we split the input up into context window size chunks to + // process the entire prompt. + + auto logits = llama_get_logits(ctx); + for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) { + // Calculate probability of next token, given the previous ones. + int n_vocab = llama_n_vocab(ctx); + std::vector tok_logits( + logits + j * n_vocab, + logits + (j + 1) * n_vocab); + double prob = softmax(tok_logits)[tokens[start + j + 1]]; + nll += -std::log(prob); + ++count; + } + // perplexity is e^(average negative log-likelihood) + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + } + printf("\n"); +} + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + params.perplexity = true; + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + perplexity(ctx, params); + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt new file mode 100644 index 000000000..fb27d4517 --- /dev/null +++ b/examples/quantize/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET quantize) +add_executable(${TARGET} quantize.cpp) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/README.md b/examples/quantize/README.md new file mode 100644 index 000000000..f349e913e --- /dev/null +++ b/examples/quantize/README.md @@ -0,0 +1,3 @@ +# quantize + +TODO diff --git a/quantize.cpp b/examples/quantize/quantize.cpp similarity index 100% rename from quantize.cpp rename to examples/quantize/quantize.cpp diff --git a/ggml.c b/ggml.c index 291e12a0a..b566b5684 100644 --- a/ggml.c +++ b/ggml.c @@ -5741,8 +5741,8 @@ static bool ggml_compute_forward_mul_mat_use_blas( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; + //const int ne00 = src0->ne[0]; + //const int ne01 = src0->ne[1]; const int ne10 = src1->ne[0]; @@ -5776,16 +5776,16 @@ static void ggml_compute_forward_mul_mat_f32( const int ne10 = src1->ne[0]; const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; + //const int ne12 = src1->ne[2]; + //const int ne13 = src1->ne[3]; - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; - const int ne = ne0*ne1*ne2*ne3; + //const int ne0 = dst->ne[0]; + //const int ne1 = dst->ne[1]; + //const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + //const int ne = ne0*ne1*ne2*ne3; - const int nb00 = src0->nb[0]; + //const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; const int nb02 = src0->nb[2]; const int nb03 = src0->nb[3]; @@ -5947,7 +5947,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - const int ne = ne0*ne1*ne2*ne3; + //const int ne = ne0*ne1*ne2*ne3; const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; @@ -6137,7 +6137,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - const int ne = ne0*ne1*ne2*ne3; + //const int ne = ne0*ne1*ne2*ne3; const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; @@ -6322,7 +6322,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - const int ne = ne0*ne1*ne2*ne3; + //const int ne = ne0*ne1*ne2*ne3; const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6a4170f80..b44d7fe7e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,7 +1,7 @@ function(llama_add_test source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_executable(${TEST_TARGET} ${source}) - target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) + target_link_libraries(${TEST_TARGET} PRIVATE llama) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) endfunction() diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 49bc232b6..382055324 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -1,9 +1,9 @@ -#include "utils.h" #include "llama.h" #include #include #include +#include static const std::map> k_tests = { { "Hello World", { 1, 10994, 2787, }, }, @@ -48,7 +48,9 @@ int main(int argc, char **argv) { } for (const auto & test_kv : k_tests) { - const auto res = ::llama_tokenize(ctx, test_kv.first, true); + std::vector res(test_kv.first.size()); + const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true); + res.resize(n); bool correct = res.size() == test_kv.second.size();