Fix bugs in bpe_gpt2_preprocess (contraction, whitespace and final-token splitting)
This commit is contained in:
parent
99e5322a79
commit
0f6ad6c2f5
107
whisper.cpp
107
whisper.cpp
|
@ -2983,7 +2983,7 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
||||||
bool collecting_numeric = false;
|
bool collecting_numeric = false;
|
||||||
bool collecting_letter = false;
|
bool collecting_letter = false;
|
||||||
bool collecting_special = false;
|
bool collecting_special = false;
|
||||||
bool collecting_whitespace_lookahead = false;
|
bool collecting_whitespace = false;
|
||||||
bool collecting = false;
|
bool collecting = false;
|
||||||
|
|
||||||
std::vector<std::string> text_utf;
|
std::vector<std::string> text_utf;
|
||||||
|
@ -3006,45 +3006,50 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
||||||
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
||||||
|
|
||||||
// handling contractions
|
// handling contractions
|
||||||
if (!split_condition && bytes_remain >= 2 && token != " ") {
|
if (!split_condition && bytes_remain >= 2) {
|
||||||
// 's|'t|'m|'d
|
if (token.size() == 0 || codepoint_type(token) == CODEPOINT_TYPE_LETTER || codepoint_type(token) == CODEPOINT_TYPE_DIGIT || codepoint_type(token) == CODEPOINT_TYPE_UNIDENTIFIED || (codepoint_type(token) == CODEPOINT_TYPE_WHITESPACE && token.back() != ' ')) {
|
||||||
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
// 's|'t|'m|'d
|
||||||
split_condition = true;
|
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
||||||
}
|
split_condition = true;
|
||||||
if (split_condition) {
|
}
|
||||||
if (token.size()) {
|
if (split_condition) {
|
||||||
bpe_words.emplace_back(token); // push previous content as token
|
if (token.size()) {
|
||||||
|
bpe_words.emplace_back(token); // push previous content as token
|
||||||
|
}
|
||||||
|
token = utf_char + utf_char_next;
|
||||||
|
bpe_words.emplace_back(token);
|
||||||
|
token = "";
|
||||||
|
i++;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
token = utf_char + utf_char_next;
|
|
||||||
bpe_words.emplace_back(token);
|
|
||||||
token = "";
|
|
||||||
i++;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!split_condition && bytes_remain >= 3 && token != " ") {
|
if (!split_condition && bytes_remain >= 3) {
|
||||||
// 're|'ve|'ll
|
if (token.size() == 0 || codepoint_type(token) == CODEPOINT_TYPE_LETTER || codepoint_type(token) == CODEPOINT_TYPE_DIGIT || codepoint_type(token) == CODEPOINT_TYPE_UNIDENTIFIED || (codepoint_type(token) == CODEPOINT_TYPE_WHITESPACE && token.back() != ' ')) {
|
||||||
if (utf_char == "\'" && (
|
// 're|'ve|'ll
|
||||||
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
if (utf_char == "\'" && (
|
||||||
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
||||||
(utf_char_next == "l" && utf_char_next_next == "l"))
|
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
||||||
) {
|
(utf_char_next == "l" && utf_char_next_next == "l"))
|
||||||
split_condition = true;
|
) {
|
||||||
}
|
split_condition = true;
|
||||||
if (split_condition) {
|
}
|
||||||
// current token + next token can be defined
|
if (split_condition) {
|
||||||
if (token.size()) {
|
// current token + next token can be defined
|
||||||
bpe_words.emplace_back(token); // push previous content as token
|
if (token.size()) {
|
||||||
|
bpe_words.emplace_back(token); // push previous content as token
|
||||||
|
}
|
||||||
|
token = utf_char + utf_char_next + utf_char_next_next;
|
||||||
|
bpe_words.emplace_back(token); // the contraction
|
||||||
|
token = "";
|
||||||
|
i += 2;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
token = utf_char + utf_char_next + utf_char_next_next;
|
|
||||||
bpe_words.emplace_back(token); // the contraction
|
|
||||||
token = "";
|
|
||||||
i += 2;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!split_condition && !collecting) {
|
if (!split_condition && !collecting) {
|
||||||
|
restart:
|
||||||
if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
|
if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
|
||||||
collecting_letter = true;
|
collecting_letter = true;
|
||||||
collecting = true;
|
collecting = true;
|
||||||
|
@ -3060,13 +3065,10 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
||||||
collecting_special = true;
|
collecting_special = true;
|
||||||
collecting = true;
|
collecting = true;
|
||||||
}
|
}
|
||||||
else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
else if ((utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) || (utf_char != " " && codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
||||||
collecting_whitespace_lookahead = true;
|
collecting_whitespace = true;
|
||||||
collecting = true;
|
collecting = true;
|
||||||
}
|
}
|
||||||
else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
|
||||||
split_condition = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if (!split_condition && collecting) {
|
else if (!split_condition && collecting) {
|
||||||
if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
|
if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
|
||||||
|
@ -3078,14 +3080,17 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
||||||
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
|
else if (collecting_whitespace && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
}
|
if (split_condition) {
|
||||||
|
collecting = false;
|
||||||
if (utf_char_next == "") {
|
collecting_letter = false;
|
||||||
split_condition = true; // final
|
collecting_numeric = false;
|
||||||
token += utf_char;
|
collecting_special = false;
|
||||||
|
collecting_whitespace = false;
|
||||||
|
goto restart;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (split_condition) {
|
if (split_condition) {
|
||||||
|
@ -3093,19 +3098,13 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
||||||
bpe_words.emplace_back(token);
|
bpe_words.emplace_back(token);
|
||||||
}
|
}
|
||||||
token = utf_char;
|
token = utf_char;
|
||||||
collecting = false;
|
|
||||||
collecting_letter = false;
|
|
||||||
collecting_numeric = false;
|
|
||||||
collecting_special = false;
|
|
||||||
collecting_whitespace_lookahead = false;
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (codepoint_type(token) == CODEPOINT_TYPE_PUNCTUATION && codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER) {
|
token += utf_char;
|
||||||
bpe_words.emplace_back(token);
|
}
|
||||||
token = utf_char;
|
|
||||||
} else {
|
if (utf_char_next == "") { // final
|
||||||
token += utf_char;
|
bpe_words.emplace_back(token);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue