"""Generate a grammar that constrains output to JSON matching a JSON schema.

File: examples/json-schema-to-grammar.py

The emitted grammar uses the same notation as ``grammars/json.gbnf``
(``name ::= expression`` rules, double-quoted literals with backslash
escapes).  Only a subset of JSON schema is handled: ``oneOf``/``anyOf``,
``const``, ``enum``, objects with ``properties``, arrays with ``items`` and
the primitive types listed in ``PRIMITIVE_RULES``.
"""
import argparse
import json
import re
import sys

# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'

# Grammar rule bodies for the JSON primitive types, keyed by schema `type`.
PRIMITIVE_RULES = {
    'boolean': '("true" | "false") space',
    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
    'string': r''' "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\"" space ''',
    'null': '"null" space',
}

# Any run of characters that is not valid in a rule name is collapsed to "-".
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
# Characters that must be escaped inside a double-quoted grammar literal.
# The backslash is included because the grammar notation itself uses it as
# the escape character (see the "\\" literal in the `string` rule above);
# leaving it bare would make e.g. the JSON text `\t` read as a raw tab.
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '\\': '\\\\'}


def _child_name(parent, suffix):
    """Return the rule-name hint for a sub-schema nested under `parent`."""
    return f'{parent}-{suffix}' if parent else f'{suffix}'


class SchemaConverter:
    """Accumulates grammar rules while walking a JSON schema.

    `prop_order` maps property names to their rank; properties missing from
    the mapping sort after all ranked ones, alphabetically.
    """

    def __init__(self, prop_order):
        self._prop_order = prop_order
        # rule name -> rule body, in insertion order
        self._rules = {'space': SPACE_RULE}

    def _format_literal(self, literal):
        """Return a grammar literal matching exactly `json.dumps(literal)`."""
        serialized = json.dumps(literal)
        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
            lambda m: GRAMMAR_LITERAL_ESCAPES[m.group(0)], serialized
        )
        return f'"{escaped}"'

    def _add_rule(self, name, rule):
        """Register `rule` under a sanitized `name` and return the key used.

        If the sanitized name is already taken by a *different* rule body, a
        numeric suffix is appended until an unused name is found.
        """
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        key = esc_name
        if self._rules.get(esc_name, rule) != rule:
            i = 0
            while f'{esc_name}{i}' in self._rules:
                i += 1
            key = f'{esc_name}{i}'
        self._rules[key] = rule
        return key

    def visit(self, schema, name):
        """Add the rules for `schema` and return the name of its rule.

        `name` is the rule-name hint; an empty name denotes the top-level
        schema, whose rule is called ``root``.

        Raises:
            ValueError: if the schema uses an unsupported construct.
        """
        schema_type = schema.get('type')
        rule_name = name or 'root'

        if 'oneOf' in schema or 'anyOf' in schema:
            alternatives = schema.get('oneOf') or schema['anyOf']
            rule = ' | '.join(
                self.visit(alt_schema, _child_name(name, i))
                for i, alt_schema in enumerate(alternatives)
            )
            return self._add_rule(rule_name, rule)

        if 'const' in schema:
            return self._add_rule(rule_name, self._format_literal(schema['const']))

        if 'enum' in schema:
            rule = ' | '.join(self._format_literal(v) for v in schema['enum'])
            return self._add_rule(rule_name, rule)

        if schema_type == 'object' and 'properties' in schema:
            # TODO: `required` keyword
            prop_order = self._prop_order
            prop_pairs = sorted(
                schema['properties'].items(),
                # sort by position in prop_order (if specified) then by key
                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
            )

            rule = '"{" space'
            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
                prop_rule_name = self.visit(prop_schema, _child_name(name, prop_name))
                if i > 0:
                    rule += ' "," space'
                rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
            rule += ' "}" space'

            return self._add_rule(rule_name, rule)

        if schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], _child_name(name, 'item'))
            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
            return self._add_rule(rule_name, rule)

        # Explicit raise rather than `assert`: asserts vanish under `python -O`,
        # which would turn bad input into an unrelated KeyError below.
        if schema_type not in PRIMITIVE_RULES:
            raise ValueError(f'Unrecognized schema: {schema}')
        # Primitives share one rule per type, except when the primitive is
        # itself the top-level schema and must be named `root`.
        return self._add_rule(
            'root' if rule_name == 'root' else schema_type,
            PRIMITIVE_RULES[schema_type]
        )

    def format_grammar(self):
        """Return all collected rules, one ``name ::= rule`` entry per rule."""
        return '\n'.join(f'{name} ::= {rule}' for name, rule in self._rules.items())


def main(args_in=None):
    """Command-line entry point: read a schema, print the grammar to stdout.

    `args_in` is the argument list to parse; ``None`` means `sys.argv[1:]`.
    """
    parser = argparse.ArgumentParser(
        description='''
            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
            given JSON schema. Only a subset of JSON schema features are supported; more may be
            added in the future.
        ''',
    )
    parser.add_argument(
        '--prop-order',
        default=[],
        type=lambda s: s.split(','),
        help='''
            comma-separated property names defining the order of precedence for object properties;
            properties not specified here are given lower precedence than those that are, and are
            sorted alphabetically
        '''
    )
    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
    args = parser.parse_args(args_in)

    if args.schema == '-':
        schema = json.load(sys.stdin)
    else:
        # Close the file deterministically instead of leaking the handle.
        with open(args.schema, encoding='utf-8') as schema_file:
            schema = json.load(schema_file)

    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
    converter = SchemaConverter(prop_order)
    converter.visit(schema, '')
    print(converter.format_grammar())


if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# Appendix: the patch this script came from also changed grammars/json.gbnf.
# That hunk is preserved here for reference only (line breaks reconstructed;
# original indentation inside the rules is not recoverable).
#
# diff --git a/grammars/json.gbnf b/grammars/json.gbnf
# index 40fa2b637..a9537cdf9 100644
# --- a/grammars/json.gbnf
# +++ b/grammars/json.gbnf
# @@ -1,29 +1,25 @@
# -# Grammar for subset of JSON - doesn't support full string or number syntax
# -
# -root ::= object
# -value ::= object | array | string | number | boolean | "null"
# +root ::= object
# +value ::= object | array | string | number | ("true" | "false" | "null") ws
#
#  object ::=
#    "{" ws (
#      string ":" ws value
#      ("," ws string ":" ws value)*
# -  )? "}"
# +  )? "}" ws
#
#  array ::=
#    "[" ws (
#      value
#      ("," ws value)*
# -  )? "]"
# +  )? "]" ws
#
# -string ::=
# +string ::=
#    "\"" (
#      [^"\\] |
#      "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
#    )* "\"" ws
#
# -# Only plain integers currently
# -number ::= "-"? [0-9]+ ws
# -boolean ::= ("true" | "false") ws
# +number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
#
#  # Optional space: by convention, applied in this grammar after literal chars when allowed
#  ws ::= ([ \t\n] ws)?
# ---------------------------------------------------------------------------