diff --git a/llama.cpp b/llama.cpp
index d0e7151f4..54545f01d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -289,15 +289,15 @@ template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
-        throw format("overflow multiplying %llu * %llu",
-            (unsigned long long) a, (unsigned long long) b);
+        throw std::runtime_error(format("overflow multiplying %llu * %llu",
+            (unsigned long long) a, (unsigned long long) b));
     }
     return ret;
 }
 
 static size_t checked_div(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
-        throw format("error dividing %zu / %zu", a, b);
+        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
     }
     return a / b;
 }
@@ -361,7 +361,7 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.type != first_shard.type) {
-                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
             }
         }
         type = first_shard.type;
@@ -384,8 +384,8 @@ struct llama_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.ne != first_shard.ne) {
-                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
             }
         }
         ne = first_shard.ne;
@@ -463,8 +463,8 @@ struct llama_file_loader {
             }
         }
 
-        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-            magic, version);
+        throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+            magic, version));
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -504,7 +504,7 @@ struct llama_file_loader {
             file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
-                throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
             switch (shard.type) {
                 case GGML_TYPE_F32:
@@ -521,7 +521,7 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw format("unrecognized tensor type %u\n", shard.type);
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
                 }
             }
 
@@ -630,7 +630,7 @@ struct llama_model_loader {
             auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
-                throw format("llama.cpp: hparams inconsistent between files");
+                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
             }
         }
         if (!llama_mmap::SUPPORTED) {
@@ -660,7 +660,7 @@ struct llama_model_loader {
     uint32_t guess_n_parts() const {
         auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
         if (it == tensors_map.name_to_idx.end()) {
-            throw std::string("missing tok_embeddings.weight");
+            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -677,12 +677,12 @@ struct llama_model_loader {
     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
-            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+            throw std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str()));
         }
         llama_load_tensor & lt = tensors_map.tensors.at(it->second);
         if (lt.ne != ne) {
-            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
         }
 
         return get_tensor_for(lt, backend);
@@ -706,7 +706,7 @@ struct llama_model_loader {
 
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::string("llama.cpp: file contained more tensors than expected");
+            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
@@ -994,7 +994,7 @@ static void llama_model_load_internal(
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
         }
     }
 
@@ -1002,7 +1002,7 @@ static void llama_model_load_internal(
         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
         }
     }
 
@@ -1033,7 +1033,7 @@ static void llama_model_load_internal(
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            throw format("ggml_init() failed");
+            throw std::runtime_error(format("ggml_init() failed"));
        }
    }
 
@@ -1214,8 +1214,8 @@ static bool llama_model_load(
         llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
-    } catch (const std::string & err) {
-        fprintf(stderr, "error loading model: %s\n", err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading model: %s\n", err.what());
         return false;
     }
 }
@@ -2120,8 +2120,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+        // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -2129,8 +2130,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
-        default: throw format("invalid output file type %d\n", ftype);
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
+        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
     if (nthread <= 0) {
@@ -2231,7 +2232,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                 }
             } else {
-                throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
+                throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
             }
 
             printf("quantizing .. ");
@@ -2433,8 +2434,8 @@ int llama_model_quantize(
     try {
         llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2687,8 +2688,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
         return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
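For reference, the error-handling pattern this patch standardizes on (build the message with the printf-style format() helper, wrap it in std::runtime_error at the throw site, and catch const std::exception & at the C API boundary so the failure is reported through what()) can be exercised in isolation. The sketch below is a minimal, self-contained illustration; its format() stand-in is a simplified assumption, not llama.cpp's actual helper.

#include <cstdarg>
#include <cstdio>
#include <stdexcept>
#include <string>

// Simplified stand-in for llama.cpp's printf-style format() helper (assumption,
// not the project's exact implementation): renders fmt into a std::string.
static std::string format(const char * fmt, ...) {
    char buf[512];
    va_list ap;
    va_start(ap, fmt);
    vsnprintf(buf, sizeof(buf), fmt, ap);
    va_end(ap);
    return std::string(buf);
}

// Throw site: failures become std::runtime_error carrying a formatted message,
// mirroring the converted throw statements in the loader and quantizer.
static size_t checked_div(size_t a, size_t b) {
    if (b == 0 || a % b != 0) {
        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
    }
    return a / b;
}

// Boundary: catch const std::exception & so std::runtime_error (and any other
// standard exception) is reported via what(), as the patched C API wrappers do.
int main() {
    try {
        checked_div(7, 2); // not evenly divisible, so this throws
        return 0;
    } catch (const std::exception & err) {
        fprintf(stderr, "error: %s\n", err.what());
        return 1;
    }
}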