From 2ca8cc77b24cd8c7c88aa758c582305ac5ddb260 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 8 Oct 2022 17:28:06 +0300 Subject: [PATCH] ref #17 : print whisper logs to stderr Only the transcribed/translated text is printed to stdout. This way, one can redirect the result to a file. --- main.cpp | 22 +++++++++++----------- whisper.cpp | 54 ++++++++++++++++++++++++++--------------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/main.cpp b/main.cpp index 728ab6f..acaf302 100644 --- a/main.cpp +++ b/main.cpp @@ -192,21 +192,21 @@ int main(int argc, char ** argv) { // print some info about the processing { - printf("\n"); + fprintf(stderr, "\n"); if (!whisper_is_multilingual(ctx)) { if (params.language != "en" || params.translate) { params.language = "en"; params.translate = false; - printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__); + fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__); } } - printf("%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n", + fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n", __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads, params.language.c_str(), params.translate ? "translate" : "transcribe", params.no_timestamps ? 
0 : 1); - printf("\n"); + fprintf(stderr, "\n"); } @@ -230,25 +230,25 @@ int main(int argc, char ** argv) { // print result if (!wparams.print_realtime) { - printf("\n"); + fprintf(stderr, "\n"); const int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) { const char * text = whisper_full_get_segment_text(ctx, i); if (params.no_timestamps) { - printf ("%s", text); + fprintf(stderr, "%s", text); fflush(stdout); } else { const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text); + fprintf(stderr, "[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text); } } } - printf("\n"); + fprintf(stderr, "\n"); // output to text file if (params.output_txt) { @@ -260,7 +260,7 @@ int main(int argc, char ** argv) { return 8; } - printf("%s: saving output to '%s.txt'\n", __func__, fname_inp.c_str()); + fprintf(stderr, "%s: saving output to '%s.txt'\n", __func__, fname_inp.c_str()); const int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) { @@ -279,7 +279,7 @@ int main(int argc, char ** argv) { return 9; } - printf("%s: saving output to '%s.vtt'\n", __func__, fname_inp.c_str()); + fprintf(stderr, "%s: saving output to '%s.vtt'\n", __func__, fname_inp.c_str()); fout_vtt << "WEBVTT\n\n"; @@ -304,7 +304,7 @@ int main(int argc, char ** argv) { return 10; } - printf("%s: saving output to '%s.srt'\n", __func__, fname_inp.c_str()); + fprintf(stderr, "%s: saving output to '%s.srt'\n", __func__, fname_inp.c_str()); const int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) { diff --git a/whisper.cpp b/whisper.cpp index b59cfd7..81da469 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -421,7 +421,7 @@ struct whisper_context { // see the convert-pt-to-ggml.py script for details // bool whisper_model_load(const std::string & fname, 
whisper_context & wctx) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str()); auto & model = wctx.model; auto & vocab = wctx.vocab; @@ -480,18 +480,18 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) { model.type = e_model::MODEL_LARGE; } - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx); - printf("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state); - printf("%s: n_audio_head = %d\n", __func__, hparams.n_audio_head); - printf("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer); - printf("%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx); - printf("%s: n_text_state = %d\n", __func__, hparams.n_text_state); - printf("%s: n_text_head = %d\n", __func__, hparams.n_text_head); - printf("%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); - printf("%s: n_mels = %d\n", __func__, hparams.n_mels); - printf("%s: f16 = %d\n", __func__, hparams.f16); - printf("%s: type = %d\n", __func__, model.type); + fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx); + fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state); + fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head); + fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer); + fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx); + fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state); + fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head); + fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); + fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels); + fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); + fprintf(stderr, "%s: type = %d\n", __func__, 
model.type); wctx.buf_model.resize(MEM_REQ_MODEL.at(model.type)); wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type))); @@ -503,7 +503,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) { wctx.buf_compute.size() + wctx.buf_compute_layer.size(); - printf("%s: mem_required = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); + fprintf(stderr, "%s: mem_required = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); } // load mel filters @@ -553,7 +553,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) { } if (n_vocab < model.hparams.n_vocab) { - printf("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab); + fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab); for (int i = n_vocab; i < model.hparams.n_vocab; i++) { if (i > vocab.token_beg) { word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]"; @@ -698,7 +698,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) { ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context @@ -945,7 +945,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) { ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) + ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v); - printf("%s: memory size = %8.2f MB \n", __func__, memory_size/1024.0/1024.0); + fprintf(stderr, "%s: memory size = %8.2f MB \n", __func__, memory_size/1024.0/1024.0); } // load weights @@ -1008,10 +1008,10 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) { n_loaded++; } - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + fprintf(stderr, "%s: model size = %8.2f 
MB\n", __func__, total_size/1024.0/1024.0); if (n_loaded == 0) { - printf("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); + fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); } else if (n_loaded != (int) model.tensors.size()) { fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), n_loaded); return false; @@ -2242,13 +2242,13 @@ whisper_token whisper_token_transcribe() { void whisper_print_timings(struct whisper_context * ctx) { const int64_t t_end_us = ggml_time_us(); - printf("\n"); - printf("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f); - printf("%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f); - printf("%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer); - printf("%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer); - printf("%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f); + fprintf(stderr, "\n"); + fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f); + fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f); + fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f); + fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer); + fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer); + fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f); } 
//////////////////////////////////////////////////////////////////////////// @@ -2349,7 +2349,7 @@ int whisper_full( while (progress_cur >= progress_prev + progress_step) { progress_prev += progress_step; if (params.print_progress) { - printf("%s: progress = %3d%%\n", __func__, progress_prev); + fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev); } }