From 2818de21ffc16770e3501bfd33c70ac5c3ab0f6d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 7 Sep 2023 12:33:12 +0300
Subject: [PATCH] examples : fix build + compile warnings (close #1256)

---
 examples/common.cpp                |  2 +-
 examples/lsp/lsp.cpp               |  6 +++---
 examples/main/main.cpp             |  4 ++--
 examples/talk-llama/llama.cpp      | 25 ++++++++++++-------------
 examples/talk-llama/talk-llama.cpp |  5 ++++-
 examples/talk/gpt-2.cpp            |  6 +++---
 examples/talk/talk.cpp             |  5 ++++-
 7 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 2b8da8f..11064b8 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
     return true;
 }
 
-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
+void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
diff --git a/examples/lsp/lsp.cpp b/examples/lsp/lsp.cpp
index d866b66..b8001b9 100644
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@@ -324,12 +324,12 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
     commandset_list.push_back(cs);
     return json{{"index",index}};
 }
-json seek(struct whisper_context * ctx, audio_async &audio, json params) {
+json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
     // whisper_state has the pertinent offsets, but there also seem to be a large
     // number of scratch buffers that would prevent rewinding context in a manner similar to llama
     // I'll give this a another pass once everything else is implemented,
     // but for now, it's unsupported
-    throw json{
+    throw json {
         {"code", -32601},
         {"message", "Seeking is not yet supported."}
     };
@@ -412,7 +412,7 @@ void process_loop(struct whisper_context * ctx, audio_async &audio, const whispe
             jobqueue.pop_front();
             // send response
             std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
-            fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
+            fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
             std::cout.flush();
         }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index fa399c6..60c1cca 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
     return speaker;
 }
 
-void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
     int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
     int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
     if (progress >= *progress_prev + progress_step) {
@@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
     return true;
 }
 
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
     std::ofstream fout(fname);
     fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
 
diff --git a/examples/talk-llama/llama.cpp b/examples/talk-llama/llama.cpp
index 77550fa..aecae00 100644
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
@@ -1164,7 +1164,7 @@ static bool llama_eval_internal(
         const llama_token * tokens,
         const int   n_tokens,
         const int   n_past,
-        const int   n_threads) {
+              int   n_threads) {
 
     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1190,6 +1190,8 @@ static bool llama_eval_internal(
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_embd/hparams.n_head;
 
+    const float eps = 5e-6f; // TODO: take from hparams
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
@@ -1204,7 +1206,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
@@ -1221,7 +1223,7 @@ static bool llama_eval_internal(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, eps);
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
@@ -1329,7 +1331,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, eps);
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
@@ -1367,7 +1369,7 @@ static bool llama_eval_internal(
 
     // norm
     {
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, eps);
 
         // inpL = inpL*norm(broadcasted)
         inpL = ggml_mul(ctx0, inpL, model.norm);
@@ -1384,8 +1386,8 @@ static bool llama_eval_internal(
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    ggml_build_forward_expand  (&gf, inpL);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -2488,8 +2490,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
 
         struct ggml_cgraph gf = ggml_build_forward(r);
-        gf.n_threads = n_threads;
-        ggml_graph_compute(lora_ctx, &gf);
+        ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -2635,7 +2636,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -2655,7 +2655,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
 
         ggml_free(cpy_ctx);
     }
@@ -2743,7 +2743,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -2763,7 +2762,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
 
         ggml_free(cpy_ctx);
     }
diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp
index 57a02ea..61d8583 100644
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@@ -649,7 +649,10 @@ int main(int argc, char ** argv) {
                 }
 
                 text_to_speak = ::replace(text_to_speak, "\"", "");
-                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                if (ret != 0) {
+                    fprintf(stderr, "%s: failed to speak\n", __func__);
+                }
 
                 audio.clear();
 
diff --git a/examples/talk/gpt-2.cpp b/examples/talk/gpt-2.cpp
index 1ee4e72..a2319db 100644
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@@ -191,9 +191,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     // create the ggml context
     {
         struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
         };
 
         model.ctx = ggml_init(params);
diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp
index 651ca20..85c103d 100644
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@@ -349,7 +349,10 @@ int main(int argc, char ** argv) {
             gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
 
             text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
-            system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            if (ret != 0) {
+                fprintf(stderr, "%s: system() failed!\n", __func__);
+            }
 
             audio.clear();