This commit is contained in:
bobqianic 2024-02-13 02:51:28 +00:00 committed by GitHub
parent 99e5322a79
commit 0f6ad6c2f5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -2983,7 +2983,7 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
bool collecting_numeric = false; bool collecting_numeric = false;
bool collecting_letter = false; bool collecting_letter = false;
bool collecting_special = false; bool collecting_special = false;
bool collecting_whitespace_lookahead = false; bool collecting_whitespace = false;
bool collecting = false; bool collecting = false;
std::vector<std::string> text_utf; std::vector<std::string> text_utf;
@ -3006,45 +3006,50 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
// handling contractions // handling contractions
if (!split_condition && bytes_remain >= 2 && token != " ") { if (!split_condition && bytes_remain >= 2) {
// 's|'t|'m|'d if (token.size() == 0 || codepoint_type(token) == CODEPOINT_TYPE_LETTER || codepoint_type(token) == CODEPOINT_TYPE_DIGIT || codepoint_type(token) == CODEPOINT_TYPE_UNIDENTIFIED || (codepoint_type(token) == CODEPOINT_TYPE_WHITESPACE && token.back() != ' ')) {
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { // 's|'t|'m|'d
split_condition = true; if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
} split_condition = true;
if (split_condition) { }
if (token.size()) { if (split_condition) {
bpe_words.emplace_back(token); // push previous content as token if (token.size()) {
bpe_words.emplace_back(token); // push previous content as token
}
token = utf_char + utf_char_next;
bpe_words.emplace_back(token);
token = "";
i++;
continue;
} }
token = utf_char + utf_char_next;
bpe_words.emplace_back(token);
token = "";
i++;
continue;
} }
} }
if (!split_condition && bytes_remain >= 3 && token != " ") { if (!split_condition && bytes_remain >= 3) {
// 're|'ve|'ll if (token.size() == 0 || codepoint_type(token) == CODEPOINT_TYPE_LETTER || codepoint_type(token) == CODEPOINT_TYPE_DIGIT || codepoint_type(token) == CODEPOINT_TYPE_UNIDENTIFIED || (codepoint_type(token) == CODEPOINT_TYPE_WHITESPACE && token.back() != ' ')) {
if (utf_char == "\'" && ( // 're|'ve|'ll
(utf_char_next == "r" && utf_char_next_next == "e") || if (utf_char == "\'" && (
(utf_char_next == "v" && utf_char_next_next == "e") || (utf_char_next == "r" && utf_char_next_next == "e") ||
(utf_char_next == "l" && utf_char_next_next == "l")) (utf_char_next == "v" && utf_char_next_next == "e") ||
) { (utf_char_next == "l" && utf_char_next_next == "l"))
split_condition = true; ) {
} split_condition = true;
if (split_condition) { }
// current token + next token can be defined if (split_condition) {
if (token.size()) { // current token + next token can be defined
bpe_words.emplace_back(token); // push previous content as token if (token.size()) {
bpe_words.emplace_back(token); // push previous content as token
}
token = utf_char + utf_char_next + utf_char_next_next;
bpe_words.emplace_back(token); // the contraction
token = "";
i += 2;
continue;
} }
token = utf_char + utf_char_next + utf_char_next_next;
bpe_words.emplace_back(token); // the contraction
token = "";
i += 2;
continue;
} }
} }
if (!split_condition && !collecting) { if (!split_condition && !collecting) {
restart:
if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
collecting_letter = true; collecting_letter = true;
collecting = true; collecting = true;
@ -3060,13 +3065,10 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
collecting_special = true; collecting_special = true;
collecting = true; collecting = true;
} }
else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { else if ((utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) || (utf_char != " " && codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
collecting_whitespace_lookahead = true; collecting_whitespace = true;
collecting = true; collecting = true;
} }
else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
split_condition = true;
}
} }
else if (!split_condition && collecting) { else if (!split_condition && collecting) {
if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) { if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
@ -3078,14 +3080,17 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
split_condition = true; split_condition = true;
} }
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) { else if (collecting_whitespace && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
split_condition = true; split_condition = true;
} }
} if (split_condition) {
collecting = false;
if (utf_char_next == "") { collecting_letter = false;
split_condition = true; // final collecting_numeric = false;
token += utf_char; collecting_special = false;
collecting_whitespace = false;
goto restart;
}
} }
if (split_condition) { if (split_condition) {
@ -3093,19 +3098,13 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
bpe_words.emplace_back(token); bpe_words.emplace_back(token);
} }
token = utf_char; token = utf_char;
collecting = false;
collecting_letter = false;
collecting_numeric = false;
collecting_special = false;
collecting_whitespace_lookahead = false;
} }
else { else {
if (codepoint_type(token) == CODEPOINT_TYPE_PUNCTUATION && codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER) { token += utf_char;
bpe_words.emplace_back(token); }
token = utf_char;
} else { if (utf_char_next == "") { // final
token += utf_char; bpe_words.emplace_back(token);
}
} }
} }