diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 947b40202..c6bf1b723 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -3,6 +3,7 @@
 #include "llama.h"
 
 #include <cstdio>
+#include <cstring>
 #include <map>
 #include <string>
 
@@ -53,27 +54,49 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 // usage:
 //  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
+void usage(const char * executable) {
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n", executable);
+    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    fprintf(stderr, "Allowed quantization types:\n");
+    for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+        fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+    }
+    exit(1);
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
-        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
-        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
-            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+        usage(argv[0]);
+    }
+
+    llama_model_quantize_params params = llama_model_quantize_default_params();
+
+    int arg_idx = 1;
+
+    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
+        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
+            params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
+            params.allow_requantize = true;
+        } else {
+            usage(argv[0]);
         }
-        return 1;
+    }
+
+    if (argc - arg_idx < 3) {
+        usage(argv[0]);
     }
 
     llama_init_backend();
 
     // parse command line arguments
-    const std::string fname_inp = argv[1];
+    const std::string fname_inp = argv[arg_idx];
+    arg_idx++;
     std::string fname_out;
-    int nthread;
-    llama_ftype ftype;
 
-    int arg_idx = 2;
     std::string ftype_str;
-    if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
-        // argv[2] is the ftype
+    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of('/');
         if (pos != std::string::npos) {
@@ -84,7 +107,6 @@ int main(int argc, char ** argv) {
         arg_idx++;
     }
     else {
-        // argv[2] is the output path
         fname_out = argv[arg_idx];
         arg_idx++;
 
@@ -92,8 +114,7 @@
             fprintf(stderr, "%s: missing ftype\n", __func__);
             return 1;
         }
-        // argv[3] is the ftype
-        if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
+        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
             fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
             return 1;
         }
@@ -103,21 +124,19 @@
     // parse nthreads
     if (argc > arg_idx) {
         try {
-            nthread = std::stoi(argv[arg_idx]);
+            params.nthread = std::stoi(argv[arg_idx]);
         }
         catch (const std::exception & e) {
             fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
             return 1;
         }
-    } else {
-        nthread = 0;
     }
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
-    if (nthread > 0) {
-        fprintf(stderr, " using %d threads", nthread);
+    if (params.nthread > 0) {
+        fprintf(stderr, " using %d threads", params.nthread);
     }
     fprintf(stderr, "\n");
 
@@ -129,7 +148,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = llama_time_us();
 
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
diff --git a/llama.cpp b/llama.cpp
index f40c5afa2..e100e2bc9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -886,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread                 =*/ 0,
+        /*.ftype                   =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.allow_requantize        =*/ false,
+        /*.quantize_output_tensor  =*/ true,
+    };
+
+    return result;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -2231,9 +2242,70 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+    if (output.size < nelements * sizeof(float)) {
+        output.resize(nelements * sizeof(float));
+    }
+    float * f32_output = (float *) output.addr;
+
+    quantize_fns_t qtype;
+    if (ggml_is_quantized(tensor.type)) {
+        qtype = ggml_internal_get_quantize_fn(tensor.type);
+        if (qtype.dequantize_row_q == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+        }
+    } else if (tensor.type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor.type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor.type)) {
+            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        } else {
+            LLAMA_ASSERT(false); // unreachable
+        }
+        return;
+    }
+
+    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+    auto block_size_bytes = ggml_type_size(tensor.type);
+
+    LLAMA_ASSERT(nelements % block_size == 0);
+    auto nblocks = nelements / block_size;
+    auto blocks_per_thread = nblocks / nthread;
+    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    std::vector<std::thread> workers;
+    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype.dequantize_row_q(inbuf, outbuf, nels);
+            }
+        };
+        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & worker : workers) {
+        worker.join();
+    }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
-    switch (ftype) {
+    llama_ftype ftype = params->ftype;
+    int nthread = params->nthread;
+
+    switch (params->ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
@@ -2259,7 +2331,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
@@ -2301,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (tensor.ne.size() == 2);
 
         // uncomment this to keep the output layer in FP16
-        //if (tensor.name == "output.weight") {
-        //    quantize = false;
-        //}
+        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+            quantize = false;
+        }
+        quantize = quantize && quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
@@ -2346,17 +2419,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
+
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
-            } else if (tensor.type == GGML_TYPE_F16) {
-                f32_conv_buf.resize(nelements * sizeof(float));
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
-                throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
             }
 
             printf("quantizing .. ");
"); @@ -2566,10 +2636,9 @@ void llama_free(struct llama_context * ctx) { int llama_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype, - int nthread) { + const llama_model_quantize_params *params) { try { - llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread); + llama_model_quantize_internal(fname_inp, fname_out, params); return 0; } catch (const std::exception & err) { fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what()); diff --git a/llama.h b/llama.h index dc033b71d..7c7fd481c 100644 --- a/llama.h +++ b/llama.h @@ -115,7 +115,16 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors }; + // model quantization parameters + typedef struct llama_model_quantize_params { + int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum llama_ftype ftype; // quantize to this llama_ftype + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + } llama_model_quantize_params; + LLAMA_API struct llama_context_params llama_context_default_params(); + LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); LLAMA_API bool llama_mmap_supported(); LLAMA_API bool llama_mlock_supported(); @@ -137,14 +146,11 @@ extern "C" { // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); - // TODO: not great API - very likely to change // Returns 0 on success - // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given LLAMA_API int llama_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype, - int nthread); + const llama_model_quantize_params * params); // Apply a LoRA adapter to a loaded model // path_base_model is the path to a higher quality model to use as a base for