From 38de86a7114c97ecf3644e3a60159f1ed893e1b0 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 20 Apr 2023 19:42:27 +0200 Subject: [PATCH] llama : multi-threaded quantization (#1075) * Multi-threading quantization. Not much gain for simple quantizations, but it will be important for quantizations that require more CPU cycles. * Multi-threading for quantize-stats It now does the job in ~14 seconds on my Mac for Q4_0, Q4_1 and Q4_2. Single-threaded it was taking more than 2 minutes after adding the more elaborate version of Q4_2. * Reviewer comments * Avoiding compiler confusion After changing chunk_size to const int as suggested by @ggerganov, clang and GCC started to warn me that I don't need to capture it in the lambda. So, I removed it from the capture list. But that makes the MSVC build fail. So, making it a constexpr to make every compiler happy. * Still fighting with lambda captures in MSVC --------- Co-authored-by: Iwan Kawrakow Co-authored-by: Georgi Gerganov --- examples/quantize-stats/quantize-stats.cpp | 137 +++++++++++++++------ examples/quantize/quantize.cpp | 7 +- ggml.c | 27 ++++ ggml.h | 2 + llama.cpp | 67 ++++++---- llama.h | 4 +- 6 files changed, 183 insertions(+), 61 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index cd973e8ac..4e6c2c831 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include struct quantize_stats_params { std::string model = "models/7B/ggml-model-f16.bin"; @@ -27,7 +29,6 @@ struct quantize_stats_params { std::vector include_types; }; -const int64_t SCRATCH_ELEMENTS = 32*32; const size_t HISTOGRAM_BUCKETS = 150; const double HISTOGRAM_RANGE = 0.03; @@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou stats.num_samples += nelements; } +void 
combine_error_stats(error_stats & into, const error_stats & from) { + into.num_samples += from.num_samples; + into.total_error += from.total_error; + if (from.max_error > into.max_error) into.max_error = from.max_error; + for (size_t i=0; inb[3] == tensor->nb[2]*tensor->ne[2]; } +void test_roundtrip_on_chunk( + const ggml_tensor * layer, + int64_t offset, + int64_t chunk_size, + const quantize_fns_t & qfns, + bool use_reference, + float * input_scratch, + char * quantized_scratch, + float * output_scratch, + error_stats & stats) { + + if (layer->type == GGML_TYPE_F16) { + for (int i = 0; i < chunk_size; i++) { + input_scratch[i] = ggml_get_f32_1d(layer, i + offset); + } + } else { + input_scratch = ggml_get_data_f32(layer) + offset; + } + + if (use_reference) { + qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); + } else { + qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); + } + qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); + + update_error_stats(chunk_size, input_scratch, output_scratch, stats); +} + + // Run quantization function for a single layer and update error stats void test_roundtrip_on_layer( std::string & name, @@ -137,40 +175,61 @@ void test_roundtrip_on_layer( const quantize_fns_t & qfns, bool use_reference, const ggml_tensor * layer, - float * input_scratch, - char *quantized_scratch, - float * output_scratch, - error_stats & total_error) { + std::vector & input_scratch, + std::vector & quantized_scratch, + std::vector & output_scratch, + error_stats & total_error, + int max_thread = 0) { assert(tensor_is_contiguous(layer)); error_stats layer_error {}; - int64_t nelements = ggml_nelements(layer); + uint64_t nelements = ggml_nelements(layer); - for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) { - int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset); - - if (layer->type == GGML_TYPE_F16) { - for (int i = 0; i < chunk_size; i++) { - 
input_scratch[i] = ggml_get_f32_1d(layer, i + offset); - } - } else { - input_scratch = ggml_get_data_f32(layer) + offset; - } - - if (use_reference) { - qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); - } else { - qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); - } - qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); - - update_error_stats(chunk_size, input_scratch, output_scratch, total_error); - if (print_layer_stats) { - update_error_stats(chunk_size, input_scratch, output_scratch, layer_error); - } + float* input_scratch_ptr = nullptr; + if (layer->type == GGML_TYPE_F16) { + if (input_scratch.size() < nelements) input_scratch.resize(nelements); + input_scratch_ptr = input_scratch.data(); } + if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements); + if (output_scratch.size() < nelements) output_scratch.resize(nelements); + + if (max_thread < 1) max_thread = std::thread::hardware_concurrency(); + int chunk_size = 32*512; + int num_chunks = (nelements + chunk_size - 1)/chunk_size; + + if (num_chunks < 2 || max_thread < 2) { + test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(), + output_scratch.data(), print_layer_stats ? layer_error : total_error); + } else { + auto & stats = print_layer_stats ? layer_error : total_error; + std::mutex mutex; + uint64_t counter = 0; + auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr, + &quantized_scratch, &output_scratch, chunk_size] () { + error_stats local_stats {}; + while (true) { + std::unique_lock lock(mutex); + uint64_t offset = counter; counter += chunk_size; + if (offset >= nelements) { + combine_error_stats(stats, local_stats); + break; + } + lock.unlock(); + uint64_t chunk = offset + chunk_size < nelements ? 
chunk_size : nelements - offset; + test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset, + quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats); + } + }; + int nthread = std::min(num_chunks, max_thread); + std::vector workers(nthread-1); + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); + } + if (print_layer_stats) { print_error_stats(name, layer_error, false); + combine_error_stats(total_error, layer_error); } } @@ -181,6 +240,7 @@ int main(int argc, char ** argv) { // read command line + int max_thread = 0; bool invalid_param = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -230,6 +290,12 @@ int main(int argc, char ** argv) { fprintf(stderr, "error: %s not in list of types\n", argv[i]); invalid_param = true; } + } else if (arg == "-n" || arg == "--num-threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + max_thread = atoi(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); quantize_stats_print_usage(argc, argv); @@ -295,9 +361,9 @@ int main(int argc, char ** argv) { } printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements); // allocate scratch space - std::vector input_scratch(SCRATCH_ELEMENTS); - std::vector quantized_scratch(SCRATCH_ELEMENTS*4); - std::vector output_scratch(SCRATCH_ELEMENTS); + std::vector input_scratch; + std::vector quantized_scratch; + std::vector output_scratch; // loop throught quantization types for (int i = 0; i < GGML_TYPE_COUNT; i++) { @@ -328,10 +394,11 @@ int main(int argc, char ** argv) { qfns, params.reference, kv_tensor.second, - input_scratch.data(), - quantized_scratch.data(), - output_scratch.data(), - global_stats + input_scratch, + quantized_scratch, + output_scratch, + global_stats, + max_thread ); } diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 49a33a86f..5b4812c62 100644 --- 
a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -10,8 +10,8 @@ int main(int argc, char ** argv) { ggml_time_init(); - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + if (argc < 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]); fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2); @@ -30,6 +30,7 @@ int main(int argc, char ** argv) { const std::string fname_out = argv[2]; const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]); + int nthread = argc > 4 ? atoi(argv[4]) : 0; const int64_t t_main_start_us = ggml_time_us(); @@ -39,7 +40,7 @@ int main(int argc, char ** argv) { { const int64_t t_start_us = ggml_time_us(); - if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) { + if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) { fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); return 1; } diff --git a/ggml.c b/ggml.c index 733ddc0de..1aa8ee303 100644 --- a/ggml.c +++ b/ggml.c @@ -12189,6 +12189,33 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_3*sizeof(block_q4_3)); } +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { + size_t result = 0; + switch (type) { + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(start % QK4_0 == 0); + block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; + result = ggml_quantize_q4_0(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_1: + { + GGML_ASSERT(start % QK4_1 == 0); + block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; + result = ggml_quantize_q4_1(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_2: + { + GGML_ASSERT(start % QK4_2 == 0); + block_q4_2 * 
block = (block_q4_2*)dst + start / QK4_2; + result = ggml_quantize_q4_2(src + start, block, n, n, hist); + } break; + default: + assert(false); + } + return result; +} + //////////////////////////////////////////////////////////////////////////////// int ggml_cpu_has_avx(void) { diff --git a/ggml.h b/ggml.h index 6e81d8125..a8a7b6b4f 100644 --- a/ggml.h +++ b/ggml.h @@ -813,6 +813,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // // system info // diff --git a/llama.cpp b/llama.cpp index 99d29a1ef..e4c414c2d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -24,6 +24,9 @@ #include #include #include +#include +#include +#include #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 @@ -1572,7 +1575,7 @@ static llama_vocab::id llama_sample_top_p_top_k( // quantization // -static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) { +static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) { ggml_type quantized_type; switch (ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; @@ -1582,6 +1585,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s default: throw format("invalid output file type %d\n", ftype); }; + if (nthread <= 0) { + nthread = std::thread::hardware_concurrency(); + } + std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, /*vocab_only*/ false)); llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); @@ -1590,6 +1597,9 @@ static void 
llama_model_quantize_internal(const std::string & fname_inp, const s size_t total_size_new = 0; std::vector hist_all(1 << 4, 0); + std::vector workers; + std::mutex mutex; + size_t idx = 0; for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { llama_buffer read_data; @@ -1643,25 +1653,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_data = work.addr; std::vector hist_cur(1 << 4, 0); - switch (new_type) { - case GGML_TYPE_Q4_0: - { - new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); - } break; - case GGML_TYPE_Q4_2: - { - new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); - } break; - case GGML_TYPE_Q4_3: - { - new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); - } break; - default: - LLAMA_ASSERT(false); + int chunk_size = 32 * 512; + const int nchunk = (nelements + chunk_size - 1)/chunk_size; + const int nthread_use = nthread > 1 ? 
std::max(1, std::min(nthread, nchunk)) : 1; + if (nthread_use < 2) { + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + } else { + size_t counter = 0; + new_size = 0; + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () { + std::vector local_hist; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + size_t first = counter; counter += chunk_size; + if (first >= nelements) { + if (!local_hist.empty()) { + for (int j=0; j %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); @@ -1783,9 +1805,10 @@ void llama_free(struct llama_context * ctx) { int llama_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype) { + enum llama_ftype ftype, + int nthread) { try { - llama_model_quantize_internal(fname_inp, fname_out, ftype); + llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread); return 0; } catch (const std::string & err) { fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); diff --git a/llama.h b/llama.h index 011e34c00..e95ff73b8 100644 --- a/llama.h +++ b/llama.h @@ -93,10 +93,12 @@ extern "C" { // TODO: not great API - very likely to change // Returns 0 on success + // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given LLAMA_API int llama_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype); + enum llama_ftype ftype, + int nthread); // Apply a LoRA adapter to a loaded model // path_base_model is the path to a higher quality model to use as a base for