From 047ae5b51ab71af8fc5fec0e9243c8393676847b Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sat, 10 Feb 2024 23:02:01 +0000 Subject: [PATCH] reduce error rate --- whisper.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index e1d8c74..eb78a91 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -3079,7 +3079,7 @@ static std::vector bpe_gpt2_preprocess(const std::string & text) { else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { split_condition = true; } - else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) { split_condition = true; } } @@ -3101,7 +3101,12 @@ static std::vector bpe_gpt2_preprocess(const std::string & text) { collecting_whitespace_lookahead = false; } else { - token += utf_char; + if (codepoint_type(token) == CODEPOINT_TYPE_PUNCTUATION && codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER) { + bpe_words.emplace_back(token); + token = utf_char; + } else { + token += utf_char; + } } }