Merge branch 'feature/manual_lexer' into develop

2017-05-05 18:27:56 +02:00 · 2017-05-05 18:27:56 +02:00 · 56ac7908f1
parent ecf895f2d1 dbcb032f22
commit 56ac7908f1
14 changed files with 5887 additions and 17227 deletions
--- a/.gitignore
+++ b/.gitignore
@ -30,3 +30,6 @@ test/parse_afl_fuzzer
 test/parse_cbor_fuzzer

 test/parse_msgpack_fuzzer
+
+minibench
+
--- a/Makefile
+++ b/Makefile
@ -1,9 +1,5 @@
 .PHONY: pretty clean ChangeLog.md

-# used programs
-RE2C := $(shell command -v re2c 2> /dev/null)
-SED = sed
-
 # main target
 all:
 	$(MAKE) -C test
@ -51,7 +47,8 @@ doctest:
 # -Wno-keyword-macro: unit-tests use "#define private public"
 # -Wno-deprecated-declarations: the library deprecated some functions
 # -Wno-weak-vtables: exception class is defined inline, but has virtual method
-# -Wno-range-loop-analysis: iterator_wrapper tests tests "for(const auto i...)"
+# -Wno-range-loop-analysis: iterator_wrapper tests "for(const auto i...)"
+# -Wno-float-equal: not all comparisons in the tests can be replaced by Approx
 pedantic_clang:
 	$(MAKE) json_unit CXXFLAGS="\
 		-std=c++11 \
@ -62,7 +59,8 @@ pedantic_clang:
 		-Wno-keyword-macro \
 		-Wno-deprecated-declarations \
 		-Wno-weak-vtables \
-		-Wno-range-loop-analysis"
+		-Wno-range-loop-analysis \
+		-Wno-float-equal"

 # calling GCC with most warnings
 pedantic_gcc:
@ -186,13 +184,6 @@ clang_sanitize: clean
 # maintainer targets
 ##########################################################################

-# create scanner with re2c
-re2c: src/json.hpp.re2c
-ifndef RE2C
-	$(error "re2c is not available, please install re2c")
-endif
-	$(RE2C) -W --utf-8 --encoding-policy fail --bit-vectors --nested-ifs --no-debug-info $< | $(SED) '1d' > src/json.hpp
-
 # pretty printer
 pretty:
 	astyle --style=allman --indent=spaces=4 --indent-modifiers \
@ -200,7 +191,7 @@ pretty:
 	   --indent-col1-comments --pad-oper --pad-header --align-pointer=type \
 	   --align-reference=type --add-brackets --convert-tabs --close-templates \
 	   --lineend=linux --preserve-date --suffix=none --formatted \
-	   src/json.hpp src/json.hpp.re2c test/src/*.cpp \
+	   src/json.hpp test/src/*.cpp \
 	   benchmarks/benchmarks.cpp doc/examples/*.cpp


--- a/README.md
+++ b/README.md
@ -899,7 +899,7 @@ $ make json_unit -Ctest
 $ ./test/json_unit "*"

 ===============================================================================
-All tests passed (11203022 assertions in 48 test cases)
+All tests passed (13391115 assertions in 49 test cases)
 ```

 Alternatively, you can use [CMake](https://cmake.org) and run
--- a/src/json.hpp
+++ b/src/json.hpp
--- a/src/json.hpp.re2c
+++ b/src/json.hpp.re2c
--- a/test/src/unit-cbor.cpp
+++ b/test/src/unit-cbor.cpp
@ -28,7 +28,6 @@ SOFTWARE.

 #include "catch.hpp"

-#define private public
 #include "json.hpp"
 using nlohmann::json;

@ -728,14 +727,9 @@ TEST_CASE("CBOR")
                    const auto result = json::to_cbor(j);
                    CHECK(result == expected);

-                    // restore value (reverse array for endianess)
-                    double restored;
-                    std::reverse(expected.begin(), expected.end());
-                    memcpy(&restored, expected.data(), sizeof(double));
-                    CHECK(restored == v);
-
                    // roundtrip
                    CHECK(json::from_cbor(result) == j);
+                    CHECK(json::from_cbor(result) == v);
                }
            }

@ -1166,35 +1160,35 @@ TEST_CASE("CBOR")
            CHECK_THROWS_AS(json::from_cbor(std::vector<uint8_t>({0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})), json::parse_error);

            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x18})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 1 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x19})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 2 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x19, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 2 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 3: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1a})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1a, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 3: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1a, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 4: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1a, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 5: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1b})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1b, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 3: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1b, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 4: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1b, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 5: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1b, 0x00, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 6: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1b, 0x00, 0x00, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 7: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 8: unexpected end of input");
            CHECK_THROWS_WITH(json::from_cbor(std::vector<uint8_t>({0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 9: unexpected end of input");
        }

        SECTION("unsupported bytes")
@ -1357,12 +1351,6 @@ TEST_CASE("CBOR regressions", "[!throws]")
            }
        }
    }
-
-    SECTION("improve code coverage")
-    {
-        // exotic edge case
-        CHECK_THROWS_AS(json::check_length(0xffffffffffffffffull, 0xfffffffffffffff0ull, 0xff), json::parse_error);
-    }
 }

 TEST_CASE("CBOR roundtrips", "[hide]")
@ -1756,7 +1744,7 @@ TEST_CASE("examples from RFC 7049 Appendix A")
        CHECK(json::parse("\"\\ud800\\udd51\"") == json::from_cbor(std::vector<uint8_t>({0x64, 0xf0, 0x90, 0x85, 0x91})));

        // indefinite length strings
-        CHECK(json::parse("\"streaming\"") == json::from_cbor(std::vector<uint8_t>({0x7f, 0x65, 0x73, 0x74, 0x72, 0x65, 0x61, 0x64, 0x6d, 0x69, 0x6e, 0x67, 0xff})));
+        CHECK(json::parse("\"streaming\"") == json::from_cbor(std::vector<uint8_t>({0x7f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0xff})));
    }

    SECTION("arrays")
--- a/test/src/unit-class_lexer.cpp
+++ b/test/src/unit-class_lexer.cpp
@ -32,106 +32,84 @@ SOFTWARE.
 #include "json.hpp"
 using nlohmann::json;

+// test helper: lex the C string s via an input adapter and return the token type of one scan() call
+json::lexer::token_type scan_string(const char* s);
+json::lexer::token_type scan_string(const char* s)
+{
+    return json::lexer(json::input_adapter::create(s)).scan();
+}
+
 TEST_CASE("lexer class")
 {
    SECTION("scan")
    {
        SECTION("structural characters")
        {
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("["),
-                               1).scan() == json::lexer::token_type::begin_array));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("]"),
-                               1).scan() == json::lexer::token_type::end_array));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("{"),
-                               1).scan() == json::lexer::token_type::begin_object));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("}"),
-                               1).scan() == json::lexer::token_type::end_object));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(","),
-                               1).scan() == json::lexer::token_type::value_separator));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(":"),
-                               1).scan() == json::lexer::token_type::name_separator));
+            CHECK((scan_string("[") == json::lexer::token_type::begin_array));
+            CHECK((scan_string("]") == json::lexer::token_type::end_array));
+            CHECK((scan_string("{") == json::lexer::token_type::begin_object));
+            CHECK((scan_string("}") == json::lexer::token_type::end_object));
+            CHECK((scan_string(",") == json::lexer::token_type::value_separator));
+            CHECK((scan_string(":") == json::lexer::token_type::name_separator));
        }

        SECTION("literal names")
        {
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("null"),
-                               4).scan() == json::lexer::token_type::literal_null));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("true"),
-                               4).scan() == json::lexer::token_type::literal_true));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("false"),
-                               5).scan() == json::lexer::token_type::literal_false));
+            CHECK((scan_string("null") == json::lexer::token_type::literal_null));
+            CHECK((scan_string("true") == json::lexer::token_type::literal_true));
+            CHECK((scan_string("false") == json::lexer::token_type::literal_false));
        }

        SECTION("numbers")
        {
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("0"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("2"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("3"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("4"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("5"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("6"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("7"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("8"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("9"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("0") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("1") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("2") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("3") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("4") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("5") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("6") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("7") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("8") == json::lexer::token_type::value_unsigned));
+            CHECK((scan_string("9") == json::lexer::token_type::value_unsigned));

-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-0"),
-                               2).scan() == json::lexer::token_type::value_integer));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-1"),
-                               2).scan() == json::lexer::token_type::value_integer));
+            CHECK((scan_string("-0") == json::lexer::token_type::value_integer));
+            CHECK((scan_string("-1") == json::lexer::token_type::value_integer));

-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1.1"),
-                               3).scan() == json::lexer::token_type::value_float));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-1.1"),
-                               4).scan() == json::lexer::token_type::value_float));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1E10"),
-                               4).scan() == json::lexer::token_type::value_float));
+            CHECK((scan_string("1.1") == json::lexer::token_type::value_float));
+            CHECK((scan_string("-1.1") == json::lexer::token_type::value_float));
+            CHECK((scan_string("1E10") == json::lexer::token_type::value_float));
        }

        SECTION("whitespace")
        {
            // result is end_of_input, because no token is following
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(" "),
-                               1).scan() == json::lexer::token_type::end_of_input));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\t"),
-                               1).scan() == json::lexer::token_type::end_of_input));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\n"),
-                               1).scan() == json::lexer::token_type::end_of_input));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\r"),
-                               1).scan() == json::lexer::token_type::end_of_input));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(" \t\n\r\n\t "),
-                               7).scan() == json::lexer::token_type::end_of_input));
+            CHECK((scan_string(" ") == json::lexer::token_type::end_of_input));
+            CHECK((scan_string("\t") == json::lexer::token_type::end_of_input));
+            CHECK((scan_string("\n") == json::lexer::token_type::end_of_input));
+            CHECK((scan_string("\r") == json::lexer::token_type::end_of_input));
+            CHECK((scan_string(" \t\n\r\n\t ") == json::lexer::token_type::end_of_input));
        }
    }

    SECTION("token_type_name")
    {
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::uninitialized) == "<uninitialized>"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_true) == "true literal"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_false) == "false literal"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_null) == "null literal"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::value_string) == "string literal"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::value_unsigned) == "number literal"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::value_integer) == "number literal"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::value_float) == "number literal"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::begin_array) == "'['"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::begin_object) == "'{'"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::end_array) == "']'"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::end_object) == "'}'"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::name_separator) == "':'"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::value_separator) == "','"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::parse_error) == "<parse error>"));
-        CHECK((json::lexer::token_type_name(json::lexer::token_type::end_of_input) == "end of input"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::uninitialized)) == "<uninitialized>"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::literal_true)) == "true literal"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::literal_false)) == "false literal"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::literal_null)) == "null literal"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::value_string)) == "string literal"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::value_unsigned)) == "number literal"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::value_integer)) == "number literal"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::value_float)) == "number literal"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::begin_array)) == "'['"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::begin_object)) == "'{'"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::end_array)) == "']'"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::end_object)) == "'}'"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::name_separator)) == "':'"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::value_separator)) == "','"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::parse_error)) == "<parse error>"));
+        CHECK((std::string(json::lexer::token_type_name(json::lexer::token_type::end_of_input)) == "end of input"));
    }

    SECTION("parse errors on first character")
@ -141,8 +119,7 @@ TEST_CASE("lexer class")
            // create string from the ASCII code
            const auto s = std::string(1, static_cast<char>(c));
            // store scan() result
-            const auto res = json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(s.c_str()),
-                                         1).scan();
+            const auto res = scan_string(s.c_str());

            switch (c)
            {
@ -188,12 +165,23 @@ TEST_CASE("lexer class")
        }
    }

+    SECTION("very large string")
+    {
+        // strings larger than 1024 bytes yield a resize of the lexer's yytext buffer
+        std::string s("\"");
+        s += std::string(2048, 'x');
+        s += "\"";
+        CHECK((scan_string(s.c_str()) == json::lexer::token_type::value_string));
+    }
+
+    /* NOTE: to_unicode function has been removed
    SECTION("to_unicode")
    {
        // lexer to call to_unicode on
-        json::lexer dummy_lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(""), 0);
+        json::lexer dummy_lexer("", 0);
        CHECK(dummy_lexer.to_unicode(0x1F4A9) == "💩");
        CHECK_THROWS_AS(dummy_lexer.to_unicode(0x200000), json::parse_error);
        CHECK_THROWS_WITH(dummy_lexer.to_unicode(0x200000), "[json.exception.parse_error.103] parse error: code points above 0x10FFFF are invalid");
    }
+    */
 }
--- a/test/src/unit-class_parser.cpp
+++ b/test/src/unit-class_parser.cpp
--- a/test/src/unit-convenience.cpp
+++ b/test/src/unit-convenience.cpp
@ -53,7 +53,7 @@ TEST_CASE("convenience functions")
                                      const char* escaped)
        {
            std::stringstream ss;
-            json::serializer s(ss);
+            json::serializer s(json::output_adapter<char>::create(ss));
            s.dump_escaped(original);
            CHECK(ss.str() == escaped);
        };
--- a/test/src/unit-deserialization.cpp
+++ b/test/src/unit-deserialization.cpp
@ -92,7 +92,7 @@ TEST_CASE("deserialization")
            ss2 << "[\"foo\",1,2,3,false,{\"one\":1}";
            CHECK_THROWS_AS(json::parse(ss1), json::parse_error);
            CHECK_THROWS_WITH(json::parse(ss2),
-                              "[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }

        SECTION("string")
@ -100,7 +100,7 @@ TEST_CASE("deserialization")
            json::string_t s = "[\"foo\",1,2,3,false,{\"one\":1}";
            CHECK_THROWS_AS(json::parse(s), json::parse_error);
            CHECK_THROWS_WITH(json::parse(s),
-                              "[json.exception.parse_error.101] parse error at 29: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }

        SECTION("operator<<")
@ -111,7 +111,7 @@ TEST_CASE("deserialization")
            json j;
            CHECK_THROWS_AS(j << ss1, json::parse_error);
            CHECK_THROWS_WITH(j << ss2,
-                              "[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }

        SECTION("operator>>")
@ -122,14 +122,14 @@ TEST_CASE("deserialization")
            json j;
            CHECK_THROWS_AS(ss1 >> j, json::parse_error);
            CHECK_THROWS_WITH(ss2 >> j,
-                              "[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }

        SECTION("user-defined string literal")
        {
            CHECK_THROWS_AS("[\"foo\",1,2,3,false,{\"one\":1}"_json, json::parse_error);
            CHECK_THROWS_WITH("[\"foo\",1,2,3,false,{\"one\":1}"_json,
-                              "[json.exception.parse_error.101] parse error at 29: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }
    }

--- a/test/src/unit-msgpack.cpp
+++ b/test/src/unit-msgpack.cpp
@ -676,14 +676,9 @@ TEST_CASE("MessagePack")
                    const auto result = json::to_msgpack(j);
                    CHECK(result == expected);

-                    // restore value (reverse array for endianess)
-                    double restored;
-                    std::reverse(expected.begin(), expected.end());
-                    memcpy(&restored, expected.data(), sizeof(double));
-                    CHECK(restored == v);
-
                    // roundtrip
                    CHECK(json::from_msgpack(result) == j);
+                    CHECK(json::from_msgpack(result) == v);
                }
            }
        }
@ -1038,35 +1033,35 @@ TEST_CASE("MessagePack")
            CHECK_THROWS_AS(json::from_msgpack(std::vector<uint8_t>({0xcf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})), json::parse_error);

            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcc})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 1 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcd})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 2 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcd, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 2 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 3: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xce})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xce, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 3: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xce, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 4: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xce, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 5: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcf})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcf, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 3: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcf, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 4: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcf, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 5: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcf, 0x00, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 6: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcf, 0x00, 0x00, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 7: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 8: unexpected end of input");
            CHECK_THROWS_WITH(json::from_msgpack(std::vector<uint8_t>({0xcf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})),
-                              "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                              "[json.exception.parse_error.110] parse error at 9: unexpected end of input");
        }

        SECTION("unsupported bytes")
--- a/test/src/unit-regression.cpp
+++ b/test/src/unit-regression.cpp
@ -596,7 +596,7 @@ TEST_CASE("regression tests")
        // a parse error because of the EOF.
        CHECK_THROWS_AS(ss >> j, json::parse_error);
        CHECK_THROWS_WITH(ss >> j,
-                          "[json.exception.parse_error.101] parse error at 1: parse error - unexpected end of input");
+                          "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input");
    }

    SECTION("issue #389 - Integer-overflow (OSS-Fuzz issue 267)")
@ -629,7 +629,7 @@ TEST_CASE("regression tests")
        std::vector<uint8_t> vec {0x65, 0xf5, 0x0a, 0x48, 0x21};
        CHECK_THROWS_AS(json::from_cbor(vec), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 5 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 6: unexpected end of input");
    }

    SECTION("issue #407 - Heap-buffer-overflow (OSS-Fuzz issue 343)")
@ -638,31 +638,31 @@ TEST_CASE("regression tests")
        std::vector<uint8_t> vec1 {0xcb, 0x8f, 0x0a};
        CHECK_THROWS_AS(json::from_msgpack(vec1), json::parse_error);
        CHECK_THROWS_WITH(json::from_msgpack(vec1),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 4: unexpected end of input");

        // related test case: incomplete float32
        std::vector<uint8_t> vec2 {0xca, 0x8f, 0x0a};
        CHECK_THROWS_AS(json::from_msgpack(vec2), json::parse_error);
        CHECK_THROWS_WITH(json::from_msgpack(vec2),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 4: unexpected end of input");

        // related test case: incomplete Half-Precision Float (CBOR)
        std::vector<uint8_t> vec3 {0xf9, 0x8f};
        CHECK_THROWS_AS(json::from_cbor(vec3), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec3),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 2 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 3: unexpected end of input");

        // related test case: incomplete Single-Precision Float (CBOR)
        std::vector<uint8_t> vec4 {0xfa, 0x8f, 0x0a};
        CHECK_THROWS_AS(json::from_cbor(vec4), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec4),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 4 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 4: unexpected end of input");

        // related test case: incomplete Double-Precision Float (CBOR)
        std::vector<uint8_t> vec5 {0xfb, 0x8f, 0x0a};
        CHECK_THROWS_AS(json::from_cbor(vec5), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec5),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 8 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 4: unexpected end of input");
    }

    SECTION("issue #408 - Heap-buffer-overflow (OSS-Fuzz issue 344)")
@ -671,7 +671,7 @@ TEST_CASE("regression tests")
        std::vector<uint8_t> vec1 {0x87};
        CHECK_THROWS_AS(json::from_msgpack(vec1), json::parse_error);
        CHECK_THROWS_WITH(json::from_msgpack(vec1),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 2: unexpected end of input");

        // more test cases for MessagePack
        for (auto b :
@ -705,10 +705,10 @@ TEST_CASE("regression tests")
        std::vector<uint8_t> vec2;
        CHECK_THROWS_AS(json::from_cbor(vec2), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec2),
-                          "[json.exception.parse_error.110] parse error at 1: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 1: unexpected end of input");
        CHECK_THROWS_AS(json::from_msgpack(vec2), json::parse_error);
        CHECK_THROWS_WITH(json::from_msgpack(vec2),
-                          "[json.exception.parse_error.110] parse error at 1: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 1: unexpected end of input");
    }

    SECTION("issue #411 - Heap-buffer-overflow (OSS-Fuzz issue 366)")
@ -717,19 +717,19 @@ TEST_CASE("regression tests")
        std::vector<uint8_t> vec1 {0x7f};
        CHECK_THROWS_AS(json::from_cbor(vec1), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec1),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 2: unexpected end of input");

        // related test case: empty array (indefinite length)
        std::vector<uint8_t> vec2 {0x9f};
        CHECK_THROWS_AS(json::from_cbor(vec2), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec2),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 2: unexpected end of input");

        // related test case: empty map (indefinite length)
        std::vector<uint8_t> vec3 {0xbf};
        CHECK_THROWS_AS(json::from_cbor(vec3), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec3),
-                          "[json.exception.parse_error.110] parse error at 2: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 2: unexpected end of input");
    }

    SECTION("issue #412 - Heap-buffer-overflow (OSS-Fuzz issue 367)")
@ -763,19 +763,19 @@ TEST_CASE("regression tests")
        std::vector<uint8_t> vec1 {0x7f, 0x61, 0x61};
        CHECK_THROWS_AS(json::from_cbor(vec1), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec1),
-                          "[json.exception.parse_error.110] parse error at 4: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 4: unexpected end of input");

        // related test case: nonempty array (indefinite length)
        std::vector<uint8_t> vec2 {0x9f, 0x01};
        CHECK_THROWS_AS(json::from_cbor(vec2), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec2),
-                          "[json.exception.parse_error.110] parse error at 3: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 3: unexpected end of input");

        // related test case: nonempty map (indefinite length)
        std::vector<uint8_t> vec3 {0xbf, 0x61, 0x61, 0x01};
        CHECK_THROWS_AS(json::from_cbor(vec3), json::parse_error);
        CHECK_THROWS_WITH(json::from_cbor(vec3),
-                          "[json.exception.parse_error.110] parse error at 5: cannot read 1 bytes from vector");
+                          "[json.exception.parse_error.110] parse error at 5: unexpected end of input");
    }

    SECTION("issue #414 - compare with literal 0)")
@ -921,6 +921,7 @@ TEST_CASE("regression tests")
        CHECK(j["bool_vector"].dump() == "[false,true,false,false]");
    }

+    /* NOTE: m_line_buffer is not used any more
    SECTION("issue #495 - fill_line_buffer incorrectly tests m_stream for eof but not fail or bad bits")
    {
        SECTION("setting failbit")
@ -953,6 +954,7 @@ TEST_CASE("regression tests")
            CHECK_THROWS_WITH(l.fill_line_buffer(), "[json.exception.parse_error.111] parse error: bad input stream");
        }
    }
+     */

    SECTION("issue #504 - assertion error (OSS-Fuzz 856)")
    {
--- a/test/src/unit-testsuites.cpp
+++ b/test/src/unit-testsuites.cpp
@ -77,8 +77,8 @@ TEST_CASE("compliance tests from json.org")
                })
        {
            CAPTURE(filename);
-            json j;
            std::ifstream f(filename);
+            json j;
            CHECK_THROWS_AS(f >> j, json::parse_error);
        }
    }
@ -93,8 +93,8 @@ TEST_CASE("compliance tests from json.org")
                })
        {
            CAPTURE(filename);
-            json j;
            std::ifstream f(filename);
+            json j;
            CHECK_NOTHROW(f >> j);
        }
    }
@ -305,6 +305,7 @@ TEST_CASE("compliance tests from nativejson-benchmark")
            std::string json_string( (std::istreambuf_iterator<char>(f) ),
                                     (std::istreambuf_iterator<char>()) );

+            CAPTURE(json_string);
            json j = json::parse(json_string);
            CHECK(j.dump() == json_string);
        }
--- a/test/src/unit-unicode.cpp
+++ b/test/src/unit-unicode.cpp
@ -34,17 +34,832 @@ using nlohmann::json;

 #include <fstream>

+// Builds a quoted JSON string from one to four raw bytes (-1 marks an absent byte) and checks whether json::parse accepts it.
+void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
+{
+    std::string json_string = "\"";
+
+    CAPTURE(byte1);
+    json_string.push_back(static_cast<char>(byte1));
+
+    if (-1 != byte2)
+    {
+        CAPTURE(byte2);
+        json_string.push_back(static_cast<char>(byte2));
+    }
+
+    if (-1 != byte3)
+    {
+        CAPTURE(byte3);
+        json_string.push_back(static_cast<char>(byte3));
+    }
+
+    if (-1 != byte4)
+    {
+        CAPTURE(byte4);
+        json_string.push_back(static_cast<char>(byte4));
+    }
+
+    json_string += "\"";
+
+    CAPTURE(json_string);
+
+    if (not success_expected)
+    {
+        CHECK_THROWS_AS(json::parse(json_string), json::parse_error);
+    }
+    else
+    {
+        CHECK_NOTHROW(json::parse(json_string));
+    }
+}
+
 TEST_CASE("Unicode", "[hide]")
 {
-    SECTION("full enumeration of Unicode code points")
+    SECTION("RFC 3629")
    {
-        // lexer to call to_unicode on
-        json::lexer dummy_lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(""), 0);
+        /*
+        RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
+        follows:

+            A UTF-8 string is a sequence of octets representing a sequence of UCS
+            characters.  An octet sequence is valid UTF-8 only if it matches the
+            following syntax, which is derived from the rules for encoding UTF-8
+            and is expressed in the ABNF of [RFC2234].
+
+            UTF8-octets = *( UTF8-char )
+            UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+            UTF8-1      = %x00-7F
+            UTF8-2      = %xC2-DF UTF8-tail
+            UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
+                          %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
+            UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
+                          %xF4 %x80-8F 2( UTF8-tail )
+            UTF8-tail   = %x80-BF
+        */
+
+        SECTION("ill-formed first byte")
+        {
+            for (int byte1 = 0x80; byte1 <= 0xC1; ++byte1)
+            {
+                check_utf8string(false, byte1);
+            }
+
+            for (int byte1 = 0xF5; byte1 <= 0xFF; ++byte1)
+            {
+                check_utf8string(false, byte1);
+            }
+        }
+
+        SECTION("UTF8-1 (x00-x7F)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0x00; byte1 <= 0x7F; ++byte1)
+                {
+                    // unescaped control characters are parse errors in JSON
+                    if (0x00 <= byte1 and byte1 <= 0x1F)
+                    {
+                        check_utf8string(false, byte1);
+                        continue;
+                    }
+
+                    // a lone, unescaped double quote (x22) is a parse error in JSON
+                    if (byte1 == 0x22)
+                    {
+                        check_utf8string(false, byte1);
+                        continue;
+                    }
+
+                    // a single backslash is a parse error in JSON
+                    if (byte1 == 0x5C)
+                    {
+                        check_utf8string(false, byte1);
+                        continue;
+                    }
+
+                    // all other characters are OK
+                    check_utf8string(true, byte1);
+                }
+            }
+        }
+
+        SECTION("UTF8-2 (xC2-xDF UTF8-tail)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        check_utf8string(true, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")
+            {
+                for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0x80 <= byte2 and byte2 <= 0xBF)
+                        {
+                            continue;
+                        }
+
+                        check_utf8string(false, byte1, byte2);
+                    }
+                }
+            }
+        }
+
+        SECTION("UTF8-3 (xE0 xA0-BF UTF8-tail)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
+                {
+                    for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(true, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")
+            {
+                for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: missing third byte")
+            {
+                for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
+                {
+                    for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0xA0 <= byte2 and byte2 <= 0xBF)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte")
+            {
+                for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
+                {
+                    for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+        }
+
+        SECTION("UTF8-3 (xE1-xEC UTF8-tail UTF8-tail)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(true, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")
+            {
+                for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: missing third byte")
+            {
+                for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0x80 <= byte2 and byte2 <= 0xBF)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte")
+            {
+                for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+        }
+
+        SECTION("UTF8-3 (xED x80-9F UTF8-tail)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(true, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")
+            {
+                for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: missing third byte")
+            {
+                for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0x80 <= byte2 and byte2 <= 0x9F)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte")
+            {
+                for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+        }
+
+        SECTION("UTF8-3 (xEE-xEF UTF8-tail UTF8-tail)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(true, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")
+            {
+                for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: missing third byte")
+            {
+                for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0x80 <= byte2 and byte2 <= 0xBF)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte")
+            {
+                for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+        }
+
+        SECTION("UTF8-4 (xF0 x90-BF UTF8-tail UTF8-tail)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
+                {
+                    for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(true, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")
+            {
+                for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: missing third byte")
+            {
+                for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
+                {
+                    for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing fourth byte")
+            {
+                for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
+                {
+                    for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0x90 <= byte2 and byte2 <= 0xBF)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte")
+            {
+                for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
+                {
+                    for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong fourth byte")
+            {
+                for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
+                {
+                    for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
+                            {
+                                // skip correct fourth byte (UTF8-tail, x80-xBF); the guard must test byte4, not byte3
+                                if (0x80 <= byte4 and byte4 <= 0xBF)
+                                {
+                                    continue;
+                                }
+                                // every remaining fourth byte makes the sequence ill-formed
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(true, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: missing third byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing fourth byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0x80 <= byte2 and byte2 <= 0xBF)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong fourth byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
+                            {
+                                // skip correct fourth byte (UTF8-tail, x80-xBF); the guard must test byte4, not byte3
+                                if (0x80 <= byte4 and byte4 <= 0xBF)
+                                {
+                                    continue;
+                                }
+                                // every remaining fourth byte makes the sequence ill-formed
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        SECTION("UTF8-4 (xF4 x80-8F UTF8-tail UTF8-tail)")
+        {
+            SECTION("well-formed")  // enumerates every sequence F4 80..8F 80..BF 80..BF, the form named in the enclosing section title
+            {
+                for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)  // single-value range: the only lead byte is 0xF4
+                {
+                    for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(true, byte1, byte2, byte3, byte4);  // first argument is true here and false in the ill-formed sections
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")  // passes the lead byte 0xF4 alone, with no continuation bytes
+            {
+                for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)  // single-value range: the only lead byte is 0xF4
+                {
+                    check_utf8string(false, byte1);  // NOTE(review): `false` presumably means "must be rejected" - helper is defined outside this chunk
+                }
+            }
+
+            SECTION("ill-formed: missing third byte")  // passes only two bytes: 0xF4 plus a second byte in 0x80..0x8F
+            {
+                for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)  // single-value range: the only lead byte is 0xF4
+                {
+                    for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);  // NOTE(review): `false` presumably means "must be rejected" - helper is defined outside this chunk
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing fourth byte")  // passes only three bytes: 0xF4, 0x80..0x8F, 0x80..0xBF
+            {
+                for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)  // single-value range: the only lead byte is 0xF4
+                {
+                    for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);  // NOTE(review): `false` presumably means "must be rejected" - helper is defined outside this chunk
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)  // single-value range: the only lead byte is 0xF4
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte (0x80..0x8F after 0xF4); the remaining 240 values are exercised
+                        if (0x80 <= byte2 and byte2 <= 0x8F)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);  // NOTE(review): `false` presumably means "must be rejected" - helper is defined outside this chunk
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte")
+            {
+                for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)  // single-value range: the only lead byte is 0xF4
+                {
+                    for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte (0x80..0xBF); bytes 1, 2 and 4 stay inside their looped ranges
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);  // NOTE(review): `false` presumably means "must be rejected" - helper is defined outside this chunk
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong fourth byte")
+            {
+                for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)  // single-value range: the only lead byte is 0xF4
+                {
+                    for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
+                            {
+                                // skip correct fourth byte (0x80..0xBF); the guard must test byte4 - byte3 is always in range here, so testing it would skip every case
+                                if (0x80 <= byte4 and byte4 <= 0xBF)
+                                {
+                                    continue;
+                                }
+
+                                check_utf8string(false, byte1, byte2, byte3, byte4);  // NOTE(review): `false` presumably means "must be rejected" - helper is defined outside this chunk
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    SECTION("\\uxxxx sequences")
+    {
        // create an escaped string from a code point
        const auto codepoint_to_unicode = [](std::size_t cp)
        {
-            // copd points are represented as a six-character sequence: a
+            // code points are represented as a six-character sequence: a
            // reverse solidus, followed by the lowercase letter u, followed
            // by four hexadecimal digits that encode the character's code
            // point
@ -53,70 +868,100 @@ TEST_CASE("Unicode", "[hide]")
            return ss.str();
        };

-        // generate all UTF-8 code points; in total, 1112064 code points are
-        // generated: 0x1FFFFF code points - 2048 invalid values between
-        // 0xD800 and 0xDFFF.
-        for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
+        SECTION("correct sequences")
        {
-            // The Unicode standard permanently reserves these code point
-            // values for UTF-16 encoding of the high and low surrogates, and
-            // they will never be assigned a character, so there should be no
-            // reason to encode them. The official Unicode standard says that
-            // no UTF forms, including UTF-16, can encode these code points.
-            if (cp >= 0xD800u and cp <= 0xDFFFu)
+            // generate all Unicode code points; in total, 1112064 code points are
+            // tested: 0x110000 code points (U+0000..U+10FFFF) minus the 2048
+            // surrogate values between 0xD800 and 0xDFFF, which are skipped.
+            for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
            {
-                // if we would not skip these code points, we would get a
-                // "missing low surrogate" exception
-                continue;
-            }
+                // string to store the code point as in \uxxxx format
+                std::string json_text = "\"";

-            // string to store the code point as in \uxxxx format
-            std::string escaped_string;
-            // string to store the code point as unescaped character sequence
-            std::string unescaped_string;
-
-            if (cp < 0x10000u)
-            {
-                // code points in the Basic Multilingual Plane can be
-                // represented with one \\uxxxx sequence
-                escaped_string = codepoint_to_unicode(cp);
-
-                // All Unicode characters may be placed within the quotation
-                // marks, except for the characters that must be escaped:
-                // quotation mark, reverse solidus, and the control characters
-                // (U+0000 through U+001F); we ignore these code points as
-                // they are checked with codepoint_to_unicode.
-                if (cp > 0x1f and cp != 0x22 and cp != 0x5c)
+                // decide whether to use one or two \uxxxx sequences
+                if (cp < 0x10000u)
                {
-                    unescaped_string = dummy_lexer.to_unicode(cp);
+                    // The Unicode standard permanently reserves these code point
+                    // values for UTF-16 encoding of the high and low surrogates, and
+                    // they will never be assigned a character, so there should be no
+                    // reason to encode them. The official Unicode standard says that
+                    // no UTF forms, including UTF-16, can encode these code points.
+                    if (cp >= 0xD800u and cp <= 0xDFFFu)
+                    {
+                        // if we would not skip these code points, we would get a
+                        // "missing low surrogate" exception
+                        continue;
+                    }
+
+                    // code points in the Basic Multilingual Plane can be
+                    // represented with one \uxxxx sequence
+                    json_text += codepoint_to_unicode(cp);
+                }
+                else
+                {
+                    // To escape an extended character that is not in the Basic
+                    // Multilingual Plane, the character is represented as a
+                    // 12-character sequence, encoding the UTF-16 surrogate pair
+                    const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
+                    const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
+                    json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
+                }
+
+                json_text += "\"";
+                CAPTURE(json_text);
+                CHECK_NOTHROW(json::parse(json_text));
+            }
+        }
+
+#if 0
+        SECTION("incorrect sequences")
+        {
+            SECTION("high surrogate without low surrogate")
+            {
+                // D800..DBFF are high surrogates and must be followed by low
+                // surrogates DC00..DFFF; here, nothing follows
+                for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
+                {
+                    std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
+                    CAPTURE(json_text);
+                    CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
                }
            }
-            else
+
+            SECTION("high surrogate with wrong low surrogate")
            {
-                // To escape an extended character that is not in the Basic
-                // Multilingual Plane, the character is represented as a
-                // 12-character sequence, encoding the UTF-16 surrogate pair
-                const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
-                const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
-                escaped_string = codepoint_to_unicode(codepoint1);
-                escaped_string += codepoint_to_unicode(codepoint2);
-                unescaped_string += dummy_lexer.to_unicode(codepoint1, codepoint2);
+                // D800..DBFF are high surrogates and must be followed by low
+                // surrogates DC00..DFFF; here a different sequence follows
+                for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
+                {
+                    for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
+                    {
+                        if (0xDC00u <= cp2 and cp2 <= 0xDFFFu)
+                        {
+                            continue;
+                        }
+
+                        std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
+                        CAPTURE(json_text);
+                        CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
+                    }
+                }
            }

-            // all other code points are valid and must not yield parse errors
-            CAPTURE(cp);
-            CAPTURE(escaped_string);
-            CAPTURE(unescaped_string);
+            SECTION("low surrogate without high surrogate")
+            {
+                // low surrogates DC00..DFFF must follow high surrogates; here,
+                // they occur alone
+                for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
+                {
+                    std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
+                    CAPTURE(json_text);
+                    CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
+                }
+            }

-            json j1, j2, j3, j4;
-            CHECK_NOTHROW(j1 = json::parse("\"" + escaped_string + "\""));
-            CHECK_NOTHROW(j2 = json::parse(j1.dump()));
-            CHECK(j1 == j2);
-
-            CHECK_NOTHROW(j3 = json::parse("\"" + unescaped_string + "\""));
-            CHECK_NOTHROW(j4 = json::parse(j3.dump()));
-            CHECK(j3 == j4);
        }
+#endif
    }

    SECTION("read all unicode characters")