+ moved lexer to class

This commit is contained in:
Niels 2015-02-11 09:10:28 +01:00
parent e845cd1db8
commit 8a4e127a57
3 changed files with 1010 additions and 1101 deletions

File diff suppressed because it is too large Load diff

View file

@ -14,6 +14,7 @@
#include <type_traits> #include <type_traits>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <cmath>
/*! /*!
- ObjectType trick from http://stackoverflow.com/a/9860911 - ObjectType trick from http://stackoverflow.com/a/9860911
@ -2384,9 +2385,9 @@ class basic_json
// parser // // parser //
//////////// ////////////
class parser class lexer
{ {
private: public:
/// token types for the parser /// token types for the parser
enum class token_type enum class token_type
{ {
@ -2406,238 +2407,40 @@ class basic_json
end_of_input end_of_input
}; };
/// the type of a lexer character inline lexer(const char* s) : m_content(s)
using lexer_char_t = unsigned char;
public:
/// constructor for strings
inline parser(const std::string& s) : buffer(s)
{ {
// set buffer for RE2C m_start = m_cursor = m_content;
m_cursor = reinterpret_cast<const lexer_char_t*>(buffer.c_str()); m_limit = m_content + strlen(m_content);
// set a pointer past the end of the buffer
m_limit = m_cursor + buffer.size();
// read first token
get_token();
} }
/// a parser reading from an input stream inline lexer() = default;
inline parser(std::istream& _is)
/*!max:re2c */
inline token_type scan()
{ {
while (_is) #define YYFILL(n)
{
std::string input_line;
std::getline(_is, input_line);
buffer += input_line;
}
// set buffer for RE2C
m_cursor = reinterpret_cast<const lexer_char_t*>(buffer.c_str());
// set a pointer past the end of the buffer
m_limit = m_cursor + buffer.size();
// read first token
get_token();
}
inline basic_json parse()
{
switch (last_token)
{
case (token_type::begin_object):
{
// explicitly set result to object to cope with {}
basic_json result(value_t::object);
// read next token
get_token();
// closing } -> we are done
if (last_token == token_type::end_object)
{
return result;
}
// otherwise: parse key-value pairs
do
{
// store key
expect_new(token_type::value_string);
const auto key = get_string();
// parse separator (:)
get_token();
expect_new(token_type::name_separator);
// parse value
get_token();
result[key] = parse();
// read next character
get_token();
}
while (last_token == token_type::value_separator
and get_token() == last_token);
// closing }
expect_new(token_type::end_object);
return result;
}
case (token_type::begin_array):
{
// explicitly set result to object to cope with []
basic_json result(value_t::array);
// read next token
get_token();
// closing ] -> we are done
if (last_token == token_type::end_array)
{
return result;
}
// otherwise: parse values
do
{
// parse value
result.push_back(parse());
// read next character
get_token();
}
while (last_token == token_type::value_separator
and get_token() == last_token);
// closing ]
expect_new(token_type::end_array);
return result;
}
case (token_type::literal_null):
{
return basic_json(nullptr);
}
case (token_type::value_string):
{
return basic_json(get_string());
}
case (token_type::literal_true):
{
return basic_json(true);
}
case (token_type::literal_false):
{
return basic_json(false);
}
case (token_type::value_number):
{
// The pointer m_begin points to the beginning of the
// parsed number. We pass this pointer to std::strtod which
// sets endptr to the first character past the converted
// number. If this pointer is not the same as m_cursor,
// then either more or less characters have been used
// during the comparison. This can happen for inputs like
// "01" which will be treated like number 0 followed by
// number 1.
// conversion
char* endptr;
const auto float_val = std::strtod(reinterpret_cast<const char*>(m_begin), &endptr);
// check if strtod read beyond the end of the lexem
if (reinterpret_cast<const lexer_char_t*>(endptr) != m_cursor)
{
throw std::invalid_argument(std::string("parse error - ") +
reinterpret_cast<const char*>(m_begin) + " is not a number");
}
// check if conversion loses precision
const auto int_val = static_cast<int>(float_val);
if (float_val == int_val)
{
// we basic_json not lose precision -> return int
return basic_json(int_val);
}
else
{
// we would lose precision -> returnfloat
return basic_json(float_val);
}
}
default:
{
std::string error_msg = "parse error - unexpected \'";
error_msg += static_cast<char>(m_begin[0]);
error_msg += "\' (";
error_msg += token_type_name(last_token) + ")";
throw std::invalid_argument(error_msg);
}
}
}
private:
/*!
This function implements a scanner for JSON. It is specified using
regular expressions that try to follow RFC 7159 and ECMA-404 as close
as possible. These regular expressions are then translated into a
deterministic finite automaton (DFA) by the tool RE2C. As a result, the
translated code for this function consists of a large block of code
with goto jumps.
@return the class of the next token read from the buffer
@todo Unicode support needs to be checked.
*/
inline token_type get_token()
{
// needed by RE2C
const lexer_char_t* marker = nullptr;
// set up RE2C
/*!re2c /*!re2c
re2c:labelprefix = "json_parser_"; re2c:define:YYCURSOR = m_cursor;
re2c:yyfill:enable = 0; re2c:define:YYLIMIT = m_limit;
re2c:define:YYCURSOR = m_cursor; re2c:define:YYCTYPE = char;
re2c:define:YYCTYPE = lexer_char_t; re2c:define:YYCTXMARKER = m_ctxmarker;
re2c:define:YYMARKER = marker; re2c:define:YYMARKER = m_marker;
re2c:indent:string = " "; re2c:indent:top = 1;
re2c:define:YYLIMIT = m_limit; re2c:yyfill:enable = 0;
*/
json_parser_lexer_start:
// set current to the begin of the buffer
m_begin = m_cursor;
if (m_begin == m_limit)
{
return last_token = token_type::end_of_input;
}
/*!re2c
// whitespace
ws = [ \t\n\r]*;
ws { goto json_parser_lexer_start; }
// structural characters // structural characters
"[" { return last_token = token_type::begin_array; } "[" { return token_type::begin_array; }
"]" { return last_token = token_type::end_array; } "]" { return token_type::end_array; }
"{" { return last_token = token_type::begin_object; } "{" { return token_type::begin_object; }
"}" { return last_token = token_type::end_object; } "}" { return token_type::end_object; }
"," { return last_token = token_type::value_separator; } "," { return token_type::value_separator; }
":" { return last_token = token_type::name_separator; } ":" { return token_type::name_separator; }
// literal names // literal names
"null" { return last_token = token_type::literal_null; } "null" { return token_type::literal_null; }
"true" { return last_token = token_type::literal_true; } "true" { return token_type::literal_true; }
"false" { return last_token = token_type::literal_false; } "false" { return token_type::literal_false; }
// number // number
decimal_point = [.]; decimal_point = [.];
@ -2651,7 +2454,7 @@ json_parser_lexer_start:
frac = decimal_point digit+; frac = decimal_point digit+;
int = (zero|digit_1_9 digit*); int = (zero|digit_1_9 digit*);
number = minus? int frac? exp?; number = minus? int frac? exp?;
number { return last_token = token_type::value_number; } number { return token_type::value_number; }
// string // string
quotation_mark = [\"]; quotation_mark = [\"];
@ -2660,58 +2463,16 @@ json_parser_lexer_start:
escaped = escape ([\"\\/bfnrt] | [u][0-9a-fA-F]{4}); escaped = escape ([\"\\/bfnrt] | [u][0-9a-fA-F]{4});
char = unescaped | escaped; char = unescaped | escaped;
string = quotation_mark char* quotation_mark; string = quotation_mark char* quotation_mark;
string { return last_token = token_type::value_string; } string { return token_type::value_string; }
// anything else is an error // end of file
* { return last_token = token_type::parse_error; } '\000' { return token_type::end_of_input; }
*/ */
} }
inline static std::string token_type_name(token_type t) inline std::string get_string_value() const
{ {
switch (t) return std::string(m_start, static_cast<size_t>(m_cursor - m_start));
{
case (token_type::uninitialized):
return "<uninitialized>";
case (token_type::literal_true):
return "true literal";
case (token_type::literal_false):
return "false literal";
case (token_type::literal_null):
return "null literal";
case (token_type::value_string):
return "string literal";
case (token_type::value_number):
return "number literal";
case (token_type::begin_array):
return "[";
case (token_type::begin_object):
return "{";
case (token_type::end_array):
return "]";
case (token_type::end_object):
return "}";
case (token_type::name_separator):
return ":";
case (token_type::value_separator):
return ",";
case (token_type::parse_error):
return "<parse error>";
case (token_type::end_of_input):
return "<end of input>";
}
}
inline void expect_new(token_type t)
{
if (t != last_token)
{
std::string error_msg = "parse error - unexpected \'";
error_msg += static_cast<char>(m_begin[0]);
error_msg += "\' (" + token_type_name(last_token);
error_msg += "); expected " + token_type_name(t);
throw std::invalid_argument(error_msg);
}
} }
/*! /*!
@ -2727,23 +2488,266 @@ json_parser_lexer_start:
*/ */
inline std::string get_string() const inline std::string get_string() const
{ {
return std::string( return std::string(m_start + 1, static_cast<size_t>(m_cursor - m_start - 2));
reinterpret_cast<const char*>(m_begin + 1), }
static_cast<std::size_t>(m_cursor - m_begin - 2)
); inline number_float_t get_number() const
{
// The pointer m_start points to the beginning of the
// parsed number. We pass this pointer to std::strtod which
// sets endptr to the first character past the converted
// number. If this pointer is not the same as m_cursor,
// then either more or less characters have been used
// during the conversion. This can happen for inputs like
// "01" which will be treated like number 0 followed by
// number 1.
// conversion
char* endptr;
const auto float_val = std::strtod(reinterpret_cast<const char*>(m_start), &endptr);
// check if strtod stopped anywhere other than the end of the lexeme
if (endptr != m_cursor)
{
std::cerr << get_string_value() << std::endl;
return NAN;
}
else
{
return float_val;
}
}
private:
const char* m_content = nullptr;
const char* m_start = nullptr;
const char* m_cursor = nullptr;
const char* m_limit = nullptr;
const char* m_marker = nullptr;
const char* m_ctxmarker = nullptr;
};
class parser
{
public:
/// construct a parser for JSON text held in a string
inline parser(const std::string& s)
    : m_buffer{s},
      // the lexer is handed a pointer into m_buffer, so this relies on
      // m_buffer being declared (and thus initialized) before m_lexer
      m_lexer{m_buffer.c_str()}
{
    // fetch the first token so that last_token is set before parse() runs
    get_token();
}
/// a parser reading from an input stream
inline parser(std::istream& _is)
{
    // Read the whole stream line by line. The loop is driven by the result
    // of std::getline itself, so a read that failed is never processed.
    // NOTE(review): std::getline discards the line terminators, so the last
    // token of one line and the first token of the next end up adjacent in
    // m_buffer - confirm this is intended.
    for (std::string input_line; std::getline(_is, input_line);)
    {
        m_buffer += input_line;
    }

    // initialize the lexer only now: it is given a pointer into m_buffer,
    // so the buffer must be complete before the lexer is created
    m_lexer = lexer(m_buffer.c_str());

    // read first token
    get_token();
}
/*!
@brief parse the JSON value that begins at the current token

The function dispatches on last_token. Objects and arrays are read by calling
parse() recursively for every contained value. When the function returns,
last_token is still the last token of the value just parsed (for objects and
arrays: the closing bracket); reading the following token is left to the
caller.

@return the parsed value

@throw std::invalid_argument if an unexpected token is read (here or in
expect()) or if the lexer could not convert a number lexeme
*/
inline basic_json parse()
{
    switch (last_token)
    {
        case (lexer::token_type::begin_object):
        {
            // explicitly set result to object to cope with {}
            basic_json result(value_t::object);

            // read next token
            get_token();

            // closing } -> we are done
            if (last_token == lexer::token_type::end_object)
            {
                return result;
            }

            // otherwise: parse key-value pairs
            while (true)
            {
                // store key
                expect(lexer::token_type::value_string);
                const auto key = m_lexer.get_string();

                // parse separator (:)
                get_token();
                expect(lexer::token_type::name_separator);

                // parse value
                get_token();
                result[key] = parse();

                // read the token that follows the value
                get_token();

                // no separator -> the list of members ends here
                if (last_token != lexer::token_type::value_separator)
                {
                    break;
                }

                // skip the separator (,) and continue with the next key;
                // done as a separate statement so that the result does not
                // depend on the evaluation order inside a comparison
                get_token();
            }

            // closing }
            expect(lexer::token_type::end_object);

            return result;
        }

        case (lexer::token_type::begin_array):
        {
            // explicitly set result to array to cope with []
            basic_json result(value_t::array);

            // read next token
            get_token();

            // closing ] -> we are done
            if (last_token == lexer::token_type::end_array)
            {
                return result;
            }

            // otherwise: parse values
            while (true)
            {
                // parse value
                result.push_back(parse());

                // read the token that follows the value
                get_token();

                // no separator -> the list of elements ends here
                if (last_token != lexer::token_type::value_separator)
                {
                    break;
                }

                // skip the separator (,) and continue with the next value
                get_token();
            }

            // closing ]
            expect(lexer::token_type::end_array);

            return result;
        }

        case (lexer::token_type::literal_null):
        {
            return basic_json(nullptr);
        }

        case (lexer::token_type::value_string):
        {
            return basic_json(m_lexer.get_string());
        }

        case (lexer::token_type::literal_true):
        {
            return basic_json(true);
        }

        case (lexer::token_type::literal_false):
        {
            return basic_json(false);
        }

        case (lexer::token_type::value_number):
        {
            // the lexer reports a failed conversion as NaN
            auto float_val = m_lexer.get_number();

            if (std::isnan(float_val))
            {
                throw std::invalid_argument(std::string("parse error - ") +
                                            m_lexer.get_string_value() + " is not a number");
            }

            // check if conversion loses precision
            // NOTE(review): converting a floating-point value that lies
            // outside the range of number_integer_t is undefined behavior;
            // consider a range check before this cast
            const auto int_val = static_cast<number_integer_t>(float_val);
            if (float_val == int_val)
            {
                // we will not lose precision -> return int
                return basic_json(int_val);
            }
            else
            {
                // we would lose precision -> return float
                return basic_json(float_val);
            }
        }

        default:
        {
            // any other token cannot start a JSON value
            std::string error_msg = "parse error - unexpected \'";
            error_msg += m_lexer.get_string_value();
            error_msg += "\' (";
            error_msg += token_type_name(last_token) + ")";
            throw std::invalid_argument(error_msg);
        }
    }
}
private:
/// fetch the next token from the lexer, remember it in last_token and return it
inline typename lexer::token_type get_token()
{
    return last_token = m_lexer.scan();
}
/*!
@brief return a printable name for a token type (used in error messages)

@param t  the token type to describe

@return the name of the token type; structural tokens are described by their
character, and a fixed fallback text is returned for a value that is not one
of the enumerators
*/
inline static std::string token_type_name(typename lexer::token_type t)
{
    // no default label on purpose: this way the compiler can still warn
    // about an enumerator that is missing from the list below
    switch (t)
    {
        case (lexer::token_type::uninitialized):
            return "<uninitialized>";
        case (lexer::token_type::literal_true):
            return "true literal";
        case (lexer::token_type::literal_false):
            return "false literal";
        case (lexer::token_type::literal_null):
            return "null literal";
        case (lexer::token_type::value_string):
            return "string literal";
        case (lexer::token_type::value_number):
            return "number literal";
        case (lexer::token_type::begin_array):
            return "[";
        case (lexer::token_type::begin_object):
            return "{";
        case (lexer::token_type::end_array):
            return "]";
        case (lexer::token_type::end_object):
            return "}";
        case (lexer::token_type::name_separator):
            return ":";
        case (lexer::token_type::value_separator):
            return ",";
        case (lexer::token_type::parse_error):
            return "<parse error>";
        case (lexer::token_type::end_of_input):
            return "<end of input>";
    }

    // only reached for a value that is not an enumerator; without this
    // return, control would flow off the end of a non-void function
    return "unknown token";
}
inline void expect(typename lexer::token_type t) const
{
if (t != last_token)
{
std::string error_msg = "parse error - unexpected \'";
error_msg += m_lexer.get_string_value();
error_msg += "\' (" + token_type_name(last_token);
error_msg += "); expected " + token_type_name(t);
throw std::invalid_argument(error_msg);
}
} }
private: private:
/// the buffer /// the buffer
std::string buffer; std::string m_buffer;
/// a pointer to the next character to read from the buffer
const lexer_char_t* m_cursor = nullptr;
/// a pointer past the last character of the buffer
const lexer_char_t* m_limit = nullptr;
/// a pointer to the beginning of the current token
const lexer_char_t* m_begin = nullptr;
/// the type of the last read token /// the type of the last read token
token_type last_token = token_type::uninitialized; typename lexer::token_type last_token = lexer::token_type::uninitialized;
lexer m_lexer;
}; };
}; };

View file

@ -3892,27 +3892,43 @@ TEST_CASE("deserialization")
{ {
SECTION("string") SECTION("string")
{ {
auto s = "[\"foo\",1,2,3,false,{\"one\":1}]"; // auto s = "[\"foo\",1,2,3,false,{\"one\":1}]";
// json j = json::parse(s);
// CHECK(j == json({"foo", 1, 2, 3, false, {{"one", 1}}}));
auto s = "null";
json j = json::parse(s); json j = json::parse(s);
CHECK(j == json({"foo", 1, 2, 3, false, {{"one", 1}}})); CHECK(j == json());
} }
SECTION("operator<<") SECTION("operator<<")
{ {
// std::stringstream ss;
// ss << "[\"foo\",1,2,3,false,{\"one\":1}]";
// json j;
// j << ss;
// CHECK(j == json({"foo", 1, 2, 3, false, {{"one", 1}}}));
std::stringstream ss; std::stringstream ss;
ss << "[\"foo\",1,2,3,false,{\"one\":1}]"; ss << "null";
json j; json j;
j << ss; j << ss;
CHECK(j == json({"foo", 1, 2, 3, false, {{"one", 1}}})); CHECK(j == json());
} }
SECTION("operator>>") SECTION("operator>>")
{ {
// std::stringstream ss;
// ss << "[\"foo\",1,2,3,false,{\"one\":1}]";
// json j;
// ss >> j;
// CHECK(j == json({"foo", 1, 2, 3, false, {{"one", 1}}}));
std::stringstream ss; std::stringstream ss;
ss << "[\"foo\",1,2,3,false,{\"one\":1}]"; ss << "null";
json j; json j;
ss >> j; ss >> j;
CHECK(j == json({"foo", 1, 2, 3, false, {{"one", 1}}})); CHECK(j == json());
} }
} }
@ -3980,42 +3996,42 @@ TEST_CASE("parser class")
{ {
SECTION("structural characters") SECTION("structural characters")
{ {
CHECK(json::parser("[").last_token == json::parser::token_type::begin_array); CHECK(json::parser("[").last_token == json::lexer::token_type::begin_array);
CHECK(json::parser("]").last_token == json::parser::token_type::end_array); CHECK(json::parser("]").last_token == json::lexer::token_type::end_array);
CHECK(json::parser("{").last_token == json::parser::token_type::begin_object); CHECK(json::parser("{").last_token == json::lexer::token_type::begin_object);
CHECK(json::parser("}").last_token == json::parser::token_type::end_object); CHECK(json::parser("}").last_token == json::lexer::token_type::end_object);
CHECK(json::parser(",").last_token == json::parser::token_type::value_separator); CHECK(json::parser(",").last_token == json::lexer::token_type::value_separator);
CHECK(json::parser(":").last_token == json::parser::token_type::name_separator); CHECK(json::parser(":").last_token == json::lexer::token_type::name_separator);
} }
SECTION("literal names") SECTION("literal names")
{ {
CHECK(json::parser("null").last_token == json::parser::token_type::literal_null); CHECK(json::parser("null").last_token == json::lexer::token_type::literal_null);
CHECK(json::parser("true").last_token == json::parser::token_type::literal_true); CHECK(json::parser("true").last_token == json::lexer::token_type::literal_true);
CHECK(json::parser("false").last_token == json::parser::token_type::literal_false); CHECK(json::parser("false").last_token == json::lexer::token_type::literal_false);
} }
SECTION("numbers") SECTION("numbers")
{ {
CHECK(json::parser("0").last_token == json::parser::token_type::value_number); CHECK(json::parser("0").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("1").last_token == json::parser::token_type::value_number); CHECK(json::parser("1").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("2").last_token == json::parser::token_type::value_number); CHECK(json::parser("2").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("3").last_token == json::parser::token_type::value_number); CHECK(json::parser("3").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("4").last_token == json::parser::token_type::value_number); CHECK(json::parser("4").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("5").last_token == json::parser::token_type::value_number); CHECK(json::parser("5").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("6").last_token == json::parser::token_type::value_number); CHECK(json::parser("6").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("7").last_token == json::parser::token_type::value_number); CHECK(json::parser("7").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("8").last_token == json::parser::token_type::value_number); CHECK(json::parser("8").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("9").last_token == json::parser::token_type::value_number); CHECK(json::parser("9").last_token == json::lexer::token_type::value_number);
} }
SECTION("whitespace") SECTION("whitespace")
{ {
CHECK(json::parser(" 0").last_token == json::parser::token_type::value_number); CHECK(json::parser(" 0").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("\t0").last_token == json::parser::token_type::value_number); CHECK(json::parser("\t0").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("\n0").last_token == json::parser::token_type::value_number); CHECK(json::parser("\n0").last_token == json::lexer::token_type::value_number);
CHECK(json::parser("\r0").last_token == json::parser::token_type::value_number); CHECK(json::parser("\r0").last_token == json::lexer::token_type::value_number);
CHECK(json::parser(" \t\n\r\n\t 0").last_token == json::parser::token_type::value_number); CHECK(json::parser(" \t\n\r\n\t 0").last_token == json::lexer::token_type::value_number);
} }
/* /*
@ -4049,7 +4065,7 @@ TEST_CASE("parser class")
case ('9'): case ('9'):
case ('"'): case ('"'):
{ {
CHECK(json::parser(s).last_token != json::parser::token_type::parse_error); CHECK(json::parser(s).last_token != json::lexer::token_type::parse_error);
break; break;
} }
@ -4058,13 +4074,13 @@ TEST_CASE("parser class")
case ('\n'): case ('\n'):
case ('\r'): case ('\r'):
{ {
CHECK(json::parser(s).last_token == json::parser::token_type::end_of_input); CHECK(json::parser(s).last_token == json::lexer::token_type::end_of_input);
break; break;
} }
default: default:
{ {
CHECK(json::parser(s).last_token == json::parser::token_type::parse_error); CHECK(json::parser(s).last_token == json::lexer::token_type::parse_error);
break; break;
} }
} }
@ -4093,19 +4109,19 @@ TEST_CASE("parser class")
SECTION("token_type_name") SECTION("token_type_name")
{ {
CHECK(json::parser::token_type_name(json::parser::token_type::uninitialized) == "<uninitialized>"); CHECK(json::parser::token_type_name(json::lexer::token_type::uninitialized) == "<uninitialized>");
CHECK(json::parser::token_type_name(json::parser::token_type::literal_true) == "true literal"); CHECK(json::parser::token_type_name(json::lexer::token_type::literal_true) == "true literal");
CHECK(json::parser::token_type_name(json::parser::token_type::literal_false) == "false literal"); CHECK(json::parser::token_type_name(json::lexer::token_type::literal_false) == "false literal");
CHECK(json::parser::token_type_name(json::parser::token_type::literal_null) == "null literal"); CHECK(json::parser::token_type_name(json::lexer::token_type::literal_null) == "null literal");
CHECK(json::parser::token_type_name(json::parser::token_type::value_string) == "string literal"); CHECK(json::parser::token_type_name(json::lexer::token_type::value_string) == "string literal");
CHECK(json::parser::token_type_name(json::parser::token_type::value_number) == "number literal"); CHECK(json::parser::token_type_name(json::lexer::token_type::value_number) == "number literal");
CHECK(json::parser::token_type_name(json::parser::token_type::begin_array) == "["); CHECK(json::parser::token_type_name(json::lexer::token_type::begin_array) == "[");
CHECK(json::parser::token_type_name(json::parser::token_type::begin_object) == "{"); CHECK(json::parser::token_type_name(json::lexer::token_type::begin_object) == "{");
CHECK(json::parser::token_type_name(json::parser::token_type::end_array) == "]"); CHECK(json::parser::token_type_name(json::lexer::token_type::end_array) == "]");
CHECK(json::parser::token_type_name(json::parser::token_type::end_object) == "}"); CHECK(json::parser::token_type_name(json::lexer::token_type::end_object) == "}");
CHECK(json::parser::token_type_name(json::parser::token_type::name_separator) == ":"); CHECK(json::parser::token_type_name(json::lexer::token_type::name_separator) == ":");
CHECK(json::parser::token_type_name(json::parser::token_type::value_separator) == ","); CHECK(json::parser::token_type_name(json::lexer::token_type::value_separator) == ",");
CHECK(json::parser::token_type_name(json::parser::token_type::parse_error) == "<parse error>"); CHECK(json::parser::token_type_name(json::lexer::token_type::parse_error) == "<parse error>");
CHECK(json::parser::token_type_name(json::parser::token_type::end_of_input) == "<end of input>"); CHECK(json::parser::token_type_name(json::lexer::token_type::end_of_input) == "<end of input>");
} }
} }