Fully implemented the JSON spec

2024-07-07 12:23:49 +02:00 · 2015-01-10 16:49:10 +01:00 · 2015-01-10 16:49:10 +01:00 · 5a54e46709
parent 222aacc213
commit 5a54e46709
3 changed files with 232 additions and 62 deletions
--- a/src/json.cc
+++ b/src/json.cc
@ -2049,40 +2049,61 @@ std::string json::parser::parseString()
    // the result of the parse process
    std::string result;
-    // iterate with pos_ over the whole string
+    // iterate with pos_ over the whole input until we found the end and return
-    for (; pos_ < buffer_.size(); pos_++) {
+    // or we exit via error()
    for (; pos_ < buffer_.size(); pos_++)
    {
        char currentChar = buffer_[pos_];
-        // uneven amount of backslashes means the user wants to escape something
+        if (!evenAmountOfBackslashes)
-        if (!evenAmountOfBackslashes) {
+        {
            // uneven amount of backslashes means the user wants to escape something
            // so we know there is a case such as '\X' or '\\\X' but we don't
            // know yet what X is.
            // at this point in the code, the currentChar has the value of X
            // slash, backslash and quote are copied as is
            if (   currentChar == '/'
                || currentChar == '\\'
-                || currentChar == '"') {
+                || currentChar == '"')
            {
                result += currentChar;
            } else {
                // All other characters are replaced by their respective special character
                if (currentChar == 't') {
                    result += '\t';
                } else if (currentChar == 'b') {
                    result += '\b';
                } else if (currentChar == 'f') {
                    result += '\f';
                } else if (currentChar == 'n') {
                    result += '\n';
                } else if (currentChar == 'r') {
                    result += '\r';
                } else if (currentChar == 'u') {
                    pos_++;
                    result += parseUnicodeEscape();
                } else {
                    error("expected one of \\,/,b,f,n,r,t behind backslash.");
                }
                // TODO implement \uXXXX
            }
-        } else {
+            else
-            if (currentChar == '"') {
+            {
                // All other characters are replaced by their respective special character
                if (currentChar == 't')
                    result += '\t';
                else if (currentChar == 'b')
                    result += '\b';
                else if (currentChar == 'f')
                    result += '\f';
                else if (currentChar == 'n')
                    result += '\n';
                else if (currentChar == 'r')
                    result += '\r';
                else if (currentChar == 'u')
                {
                    // \uXXXX[\uXXXX] is used for escaping unicode, which
                    // has it's own subroutine.
                    result += parseUnicodeEscape();
                    // the parsing process has brought us one step behind the
                    // unicode escape sequence:
                    // \uXXXX
                    //       ^
                    // so we need to go one character back or the parser
                    // would skip the character we are currently pointing at
                    // (as the for-loop will increment pos_ after this iteration).
                    pos_--;
                }
                else // user did something like \z and we should report a error
                    error("expected one of \\,/,b,f,n,r,t,u behind backslash.");
            }
        }
        else
        {
            if (currentChar == '"')
            {
                // currentChar is a quote, so we found the end of the string
@ -2093,7 +2114,9 @@ std::string json::parser::parseString()
                // bring the result of the parsing process back to the caller
                return result;
-            } else if (currentChar != '\\') {
+            }
            else if (currentChar != '\\')
            {
                // all non-backslash characters are added to the end of the result string.
                // the only backslashes we want in the result are the ones that are escaped (which happens above).
                result += currentChar;
@ -2121,34 +2144,74 @@ std::string json::parser::parseString()
    error("expected '\"'");
 }
 std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
-    // it's just a ASCII compatible codepoint,
+
-    // so we just interpret the point as a character
+/*!
-    if (codepoint <= 0x7f) {
+Turns a code point into it's UTF-8 representation.
 You should only pass numbers <= 0x10ffff into this function
 (everything else is a invalid code point).
@return the UTF-8 representation of the given codepoint
@pre  This method isn't accessing the members of the parser
@post This method isn't accessing the members of the parser
 */
 std::string json::parser::codepointToUTF8(unsigned int codepoint)
 {
    // this method contains a lot of bit manipulations to
    // build the bytes for UTF-8.
    // the '(... >> S) & 0xHH'-patterns are used to retrieve
    // certain bits from the code points.
    // all static casts in this method have boundary checks
    // we initialize all strings with their final length
    // (e.g. 1 to 4 bytes) to save the reallocations.
    if (codepoint <= 0x7f)
    {
        // it's just a ASCII compatible codepoint,
        // so we just interpret the point as a character
        // and return ASCII
        return std::string(1, static_cast<char>(codepoint));
    }
    // if true, we need two bytes to encode this as UTF-8
    else if (codepoint <= 0x7ff)
    {
-        std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
+        // the 0xC0 enables the two most significant two bits
-        result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        // to make this a two-byte UTF-8 character.
        std::string result(2, static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F)));
        result[1] = static_cast<char>(0x80 | (codepoint & 0x3F));
        return result;
    }
    // if true, now we need three bytes to encode this as UTF-8
    else if (codepoint <= 0xffff)
    {
-        std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
+        // the 0xE0 enables the three most significant two bits
-        result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
+        // to make this a three-byte UTF-8 character.
-        result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        std::string result(3, static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F)));
        result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        result[2] = static_cast<char>(0x80 | (codepoint & 0x3F));
        return result;
    }
-    else if (codepoint <= 0x1fffff)
+    // if true, we need maximal four bytes to encode this as UTF-8
    else if (codepoint <= 0x10ffff)
    {
-        std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
+        // the 0xF0 enables the four most significant bits
-        result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
+        // to make this a four-byte UTF-8 character.
-        result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
+        std::string result(4, static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07)));
-        result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        result[3] = static_cast<char>(0x80 | (codepoint & 0x3F));
        return result;
-    } else {
+    }
    else
    {
        // Can't be tested without direct access to this private method.
        std::string errorMessage = "Invalid codepoint: ";
        errorMessage += codepoint;
        error(errorMessage);
@ -2156,39 +2219,110 @@ std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
 }
 /*!
-Parses the JSON style unicode escape sequence (\uXXXX).
+Parses 4 hexadecimal characters as a number.
-@return the utf-8 character the escape sequence escaped
+@return the value of the number the hexadecimal characters represent.
-@pre  An opening quote \p " was read in the main parse function @ref parse.
+@pre  pos_ is pointing to the first of the 4 hexadecimal characters.
      pos_ is the position after the opening quote.
-@post The character after the closing quote \p " is the current character @ref
+@post pos_ is pointing to the character after the 4 hexadecimal characters.
      current_. Whitespace is skipped.
 */
-std::string json::parser::parseUnicodeEscape() {
+unsigned int json::parser::parse4HexCodepoint()
 {
    const auto startPos = pos_;
-    if (pos_ + 3 >= buffer_.size()) {
+
    // check if the  remaining buffer is long enough to even hold 4 characters
    if (pos_ + 3 >= buffer_.size())
    {
        error("Got end of input while parsing unicode escape sequence \\uXXXX");
    }
    // make a string that can hold the pair
    std::string hexCode(4, ' ');
-    for(; pos_ < startPos + 4; pos_++) {
+
    for(; pos_ < startPos + 4; pos_++)
    {
        // no boundary check here as we already checked above
        char currentChar = buffer_[pos_];
        // check if we have a hexadecimal character
        if (   (currentChar >= '0' && currentChar <= '9')
            || (currentChar >= 'a' && currentChar <= 'f')
-            || (currentChar >= 'A' && currentChar <= 'F')) {
+            || (currentChar >= 'A' && currentChar <= 'F'))
        {
            // all is well, we have valid hexadecimal chars
            // so we copy that char into our string
            hexCode[pos_ - startPos] = currentChar;
-        } else {
+        }
        else
        {
            error("Found non-hexadecimal character in unicode escape sequence!");
        }
    }
-    pos_--;
+    // the cast is safe as 4 hex characters can't present more than 16 bits
-    // case is safe as 4 hex characters can't present more than 16 bits
+    // the input to stoul was checked to contain only hexadecimal characters (see above)
-    return unicodeToUTF8(static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16)));
+    return static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16));
 }
 /*!
 Parses the unicode escape codes as defined in the ECMA-404.
 The escape sequence has two forms:
 1. \uXXXX
 2. \uXXXX\uYYYY
 where X and Y are a hexadecimal character (a-zA-Z0-9).
 Form 1 just contains the unicode code point in the hexadecimal number XXXX.
 Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low surrogate is YYYY.
@return the UTF-8 character this unicode escape sequence escaped.
@pre  pos_ is pointing at at the 'u' behind the first backslash.
@post pos_ is pointing at the character behind the last X (or Y in form 2).
 */
 std::string json::parser::parseUnicodeEscape()
 {
    // jump to the first hex value
    pos_++;
    // parse the hex first hex values
    unsigned int firstCodepoint = parse4HexCodepoint();
    if (firstCodepoint >= 0xD800 && firstCodepoint <= 0xDBFF)
    {
        // we found invalid code points, which means we either have a malformed input
        // or we found a high surrogate.
        // we can only find out by seeing if the next character also wants to encode
        // a unicode character (so, we have the \uXXXX\uXXXX case here).
        // jump behind the next \u
        pos_ += 2;
        // try to parse the next hex values.
        // the method does boundary checking for us, so no need to do that here
        unsigned secondCodepoint = parse4HexCodepoint();
        // ok, we have a low surrogate, check if it is a valid one
        if (secondCodepoint >= 0xDC00 && secondCodepoint <= 0xDFFF)
        {
            // calculate the final code point from the pair according to the spec
            unsigned int finalCodePoint =
                    // high surrogate occupies the most significant 22 bits
                    (firstCodepoint << 10)
                    // low surrogate occupies the least significant 15 bits
                    + secondCodepoint
                    // there is still the 0xD800, 0xDC00 and 0x10000 noise in the result
                    // so we have to substract with (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
                    - 0x35FDC00;
            // we transform the calculated point into UTF-8
            return codepointToUTF8(finalCodePoint);
        }
        else
            error("missing low surrogate");
    }
    // We have Form 1, so we just interpret the XXXX as a code point
    return codepointToUTF8(firstCodepoint);
 }
 /*!
--- a/src/json.h
+++ b/src/json.h
@ -419,8 +419,10 @@ class json
        /// parse a quoted string
        inline std::string parseString();
        /// transforms a unicode codepoint to it's UTF-8 presentation
-        inline std::string unicodeToUTF8(unsigned int codepoint);
+        inline std::string codepointToUTF8(unsigned int codepoint);
-        /// parses a unicode escape sequence
+        /// parses 4 hex characters that represent a unicode codepoint
        inline unsigned int parse4HexCodepoint();
        /// parses \uXXXX[\uXXXX] unicode escape characters
        inline std::string parseUnicodeEscape();
        /// parse a Boolean "true"
        inline void parseTrue();
--- a/test/json_unit.cc
+++ b/test/json_unit.cc
@ -1652,10 +1652,6 @@ TEST_CASE("Parser")
        CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
        CHECK(json::parse("\"\\n\"") == json("\n"));
        // escape unicode characters
        CHECK(json::parse("\"\\u002F\"") == json("/"));
        CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
        // escaping senseless stuff
        CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);
@ -1665,6 +1661,44 @@ TEST_CASE("Parser")
        CHECK_THROWS_AS(json::parse("\""), std::invalid_argument);
    }
    // Checks the parsing of \uXXXX escapes inside JSON strings: single code
    // points, UTF-16 surrogate pairs, and malformed escapes that must throw.
    SECTION("unicode_escaping")
    {
        // the valid escapes are checked with uppercase and lowercase hex digits
        // forward slash (U+002F), inside the ASCII range
        CHECK(json::parse("\"\\u002F\"") == json("/"));
        CHECK(json::parse("\"\\u002f\"") == json("/"));
        // german a umlaut (U+00E4)
        CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
        CHECK(json::parse("\"\\u00e4\"") == json(u8"\u00E4"));
        // latin small letter d with stroke (U+0111)
        CHECK(json::parse("\"\\u0111\"") == json(u8"\u0111"));
        // leftwards arrow (U+2190)
        CHECK(json::parse("\"\\u2190\"") == json(u8"\u2190"));
        // surrogate pair D80C/DC60 must decode to the single code point
        // U+13060 (an Egyptian hieroglyph)
        CHECK(json::parse("\"\\uD80C\\uDC60\"") == json(u8"\U00013060"));
        CHECK(json::parse("\"\\ud80C\\udc60\"") == json(u8"\U00013060"));
        // fewer than four hex digits behind the \u must be rejected
        CHECK_THROWS_AS(json::parse("\"\\uD80v\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\uD80 A\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\uD8v\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\uDv\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\uv\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\u\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\u\\u\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"a\\uD80vAz\""), std::invalid_argument);
        // a high surrogate that is not followed by a second escape must be rejected
        CHECK_THROWS_AS(json::parse("\"bla \\uD80C bla\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\uD80C bla bla\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"bla bla \\uD80C bla bla\""), std::invalid_argument);
        // a high surrogate followed by an escape outside the low surrogate
        // range (0xDC00-0xDFFF) must be rejected
        CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument);
        CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument);
    }
    SECTION("boolean")
    {
        // accept the exact values