From 46f7b9639375fe36e2ec9b79cf38294dfc4fea0f Mon Sep 17 00:00:00 2001
From: Harald Wolff
Date: Thu, 23 Nov 2017 13:04:51 +0100
Subject: [PATCH] WIP

---
Reviewer notes (not part of the commit message):
 - Line breaks and generic type arguments were restored in the C# hunks and
   several defects were fixed (each one is marked with a "Fix:" comment).
   Hunk sizes and the diffstat below were recomputed for the edited content;
   the blob ids on the "index" lines were not.
 - The sharp.parser.csproj hunks are reproduced as found: their XML markup is
   missing in this copy, so that part of the patch does not apply as-is.

 CharBuffer.cs               | 203 ++++++++++++++++++++++++++++++++++++
 CharGroup.cs                |  78 +++++++++++++++++
 Lexer.cs                    |  57 +++++++++++++
 LexerPathSegment.cs         | 146 ++++++++++++++++++++++++++++++++
 Parser.cs                   |  33 +++++++
 ParserFormatException.cs    |  19 ++++
 ParserPath.cs               |  34 ++++++++
 ParserPathSegment.cs        |  26 ++++++
 Token.cs                    |  34 ++++++++
 TokenDefinition.cs          |  45 +++++++++++
 TokenQueue.cs               |  25 ++++++
 UnexpectedTokenException.cs |  26 ++++
 sharp.parser.csproj         |  27 +++++-
 13 files changed, 751 insertions(+), 2 deletions(-)
 create mode 100644 CharBuffer.cs
 create mode 100644 CharGroup.cs
 create mode 100644 Lexer.cs
 create mode 100644 LexerPathSegment.cs
 create mode 100644 Parser.cs
 create mode 100644 ParserFormatException.cs
 create mode 100644 ParserPath.cs
 create mode 100644 ParserPathSegment.cs
 create mode 100644 Token.cs
 create mode 100644 TokenDefinition.cs
 create mode 100644 TokenQueue.cs
 create mode 100644 UnexpectedTokenException.cs

diff --git a/CharBuffer.cs b/CharBuffer.cs
new file mode 100644
index 0000000..e03c41b
--- /dev/null
+++ b/CharBuffer.cs
@@ -0,0 +1,203 @@
+using System;
+using sharp.extensions;
+using System.Collections.Generic;
+using System.Linq;
+namespace sharp.parser
+{
+    /// <summary>Cursor over a character array that keeps track of line and column.</summary>
+    public class CharBuffer
+    {
+        char[] chars;
+        int position;
+
+        int lineno, linepos, linestart;
+
+        // Offset of the first character of every line, then chars.Length as end sentinel.
+        int[] lineStarts;
+
+        public char[] Characters { get { return this.chars; } }
+
+        public CharBuffer(char[] source)
+        {
+            this.chars = source;
+            this.initialize();
+        }
+        public CharBuffer(string source){
+            this.chars = source.ToCharArray();
+            this.initialize();
+        }
+
+        private void initialize(){
+            this.lineno = 1;
+            this.linepos = 1;
+            this.linestart = 0;
+
+            List<int> lineStartList = new List<int>();
+            lineStartList.Add(0);
+
+            for (int n = 0; n < this.chars.Length;n++)
+            {
+                if (this.chars[n] == 0x0a){
+                    // Fix: a line starts right after the line feed, not on it.
+                    lineStartList.Add(n + 1);
+                }
+            }
+            lineStartList.Add(this.chars.Length);
+
+            this.lineStarts = lineStartList.ToArray();
+        }
+
+        // Index into lineStarts of the line containing pos (the sentinel entry is never returned).
+        private int lineIndexOf(int pos){
+            int lo = 0;
+            int hi = this.lineStarts.Length - 2;
+            while (lo < hi){
+                int mid = (lo + hi + 1) / 2;
+                if (this.lineStarts[mid] <= pos){
+                    lo = mid;
+                } else {
+                    hi = mid - 1;
+                }
+            }
+            return lo;
+        }
+
+        // Recomputes lineno/linepos/linestart after the cursor moved by other means than MoveNext().
+        private void syncLineInfo(){
+            int line = lineIndexOf(this.position);
+            this.lineno = line + 1;
+            this.linestart = this.lineStarts[line];
+            this.linepos = this.position - this.linestart + 1;
+        }
+
+        private int lineEnd(){
+            int p = position;
+            // Fix: test the bound first so a cursor at the end of the buffer does not throw.
+            while (p < this.chars.Length && this.chars[p]!='\n'){
+                p++;
+            }
+            return p;
+        }
+
+        /// <summary>Returns the line containing pos without its line feed, or null if pos is outside the buffer.</summary>
+        public string getLineAt(int pos){
+            if (pos < 0 || pos >= this.chars.Length){
+                return null;
+            }
+
+            // Fix: the old scan stopped at index 0 for every pos and so always returned the first line.
+            int n = lineIndexOf(pos);
+            int start = this.lineStarts[n];
+            int end = this.lineStarts[n + 1];
+            if (end > start && this.chars[end - 1] == '\n'){
+                end--;
+            }
+
+            return new string( this.chars.Segment(start, end - start) );
+        }
+
+        /// <summary>Returns the 1-based number of the line containing pos.</summary>
+        public int getLineNumberAt(int pos){
+            return lineIndexOf(pos) + 1;
+        }
+
+        public int Position {
+            get { return position; }
+            set {
+                // Fix: clamp at both ends and refresh the line counters (Token advances the cursor through this setter).
+                this.position = Math.Max(0, Math.Min(value, chars.Length));
+                this.syncLineInfo();
+            }
+        }
+
+        /// <summary>Advances one character; returns false at the end of the buffer.</summary>
+        public bool MoveNext(){
+            if (position < this.chars.Length){
+                position++;
+
+                this.linepos++;
+
+                if (Last == '\n'){
+                    this.linepos = 1;
+                    this.linestart = position;
+                    this.lineno++;
+                }
+
+                return true;
+            }
+            return false;
+        }
+
+        /// <summary>Steps back one character; returns false at the start of the buffer.</summary>
+        public bool MoveBack(){
+            if (position > 0){
+                position--;
+                // Fix: stepping back used to leave the line counters advanced.
+                this.syncLineInfo();
+                return true;
+            }
+            return false;
+        }
+
+        public bool EndOfBuffer(){
+            return (this.position >= this.chars.Length);
+        }
+
+        /// <summary>Skips every character with a code of 0x20 or below.</summary>
+        public void BypassWhiteSpace(){
+            // Fix: stop at the end of the buffer instead of letting Current throw.
+            while (!EndOfBuffer() && Current <= 0x20){
+                MoveNext();
+            }
+        }
+
+        public int CurrentLineNumber { get { return this.lineno; } }
+        public int CurrentLinePosition { get { return this.linepos; } }
+
+        public string CurrentLine { get { return new String(this.chars.Segment(this.linestart,lineEnd()-this.linestart)); } }
+
+        public char Last {
+            get {
+                if (position > 0) {
+                    return this.chars[position - 1];
+                };
+                throw new IndexOutOfRangeException("No character before the first one");
+            }
+        }
+        public char Current
+        {
+            get {
+                if (position < this.chars.Length){
+                    return this.chars[position];
+                }
+                throw new IndexOutOfRangeException("No character after the last one");
+            }
+        }
+        public char Next
+        {
+            get {
+                if (position < this.chars.Length-1) {
+                    return this.chars[position + 1];
+                };
+                throw new IndexOutOfRangeException("No character after the last one");
+            }
+        }
+
+        public string Following(int len)
+        {
+            return new string(this.chars.Segment(position, len));
+        }
+        public string Preceding(int len)
+        {
+            return new string(this.chars.Segment(position-len, len));
+        }
+
+        /// <summary>Consumes ch at the cursor or throws ParserFormatException if another character is there.</summary>
+        public void Pass(char ch){
+            if (Current != ch){
+                throw new ParserFormatException(String.Format("Expected {0}, but got {1}",ch,Current),lineno,linepos,CurrentLine);
+            }
+            MoveNext();
+        }
+    }
+}
diff --git a/CharGroup.cs b/CharGroup.cs
new file mode 100644
index 0000000..9cc6afa
--- /dev/null
+++ b/CharGroup.cs
@@ -0,0 +1,78 @@
+using System;
+using sharp.extensions;
+namespace sharp.parser
+{
+    /// <summary>A set of characters used to match a single input character.</summary>
+    public class CharGroup
+    {
+        public static readonly CharGroup digit = new CharGroup('0', '9');
+        public static readonly CharGroup zero = new CharGroup('0');
+        public static readonly CharGroup digit19 = new CharGroup('1', '9');
+        public static readonly CharGroup plusminus = new CharGroup(new char[] { '+', '-' });
+        // Fix: this group used to contain '+' as well, which made it identical to plusminus.
+        public static readonly CharGroup minus = new CharGroup(new char[] { '-' });
+        public static readonly CharGroup plus = new CharGroup(new char[] { '+' });
+        public static readonly CharGroup az = new CharGroup('a','z');
+        public static readonly CharGroup AZ = new CharGroup('A','Z');
+        public static readonly CharGroup aAzZ = az + AZ;
+        public static readonly CharGroup LF = new CharGroup((char)0x0A);
+        public static readonly CharGroup CR = new CharGroup((char)0x0D);
+        public static readonly CharGroup HTAB = new CharGroup((char)0x09);
+        public static readonly CharGroup WS = new CharGroup(new char[] { (char)0x09, (char)0x0A, (char)0x0B, (char)0x0C, (char)0x0D, (char)0x20});
+        public static readonly CharGroup hexdigits = new CharGroup('a','f') + new CharGroup('A','F') + digit;
+
+        char[] chars;
+
+        public CharGroup(char ch)
+        {
+            chars = new char[] { ch };
+        }
+        public CharGroup(char[] chars)
+        {
+            this.chars = chars.Segment(0);
+        }
+        /// <summary>Creates the group of all characters from first to last, both inclusive.</summary>
+        public CharGroup(char first,char last)
+        {
+            int l = (int)last - (int)first;
+            this.chars = new char[l+1];
+
+            for (int n = 0; n <= l;n++){
+                this.chars[n] = (char)(first + n);
+            }
+        }
+
+        public bool Contains(char ch){
+            foreach (char c in chars){
+                if (c == ch){
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        public bool Intersects(CharGroup other){
+            foreach (char ch in chars){
+                if (other.Contains(ch)){
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        public static CharGroup operator +(CharGroup cg1, CharGroup cg2)
+        {
+            return new CharGroup(cg1.chars.Combine(cg2.chars));
+        }
+
+        public static CharGroup operator -(CharGroup cg1, CharGroup cg2)
+        {
+            return new CharGroup(cg1.chars.Remove(cg2.chars));
+        }
+
+        public override string ToString()
+        {
+            return string.Format("[CharGroup '{0}']",new string(this.chars));
+        }
+    }
+}
diff --git a/Lexer.cs b/Lexer.cs
new file mode 100644
index 0000000..f1d5a59
--- /dev/null
+++ b/Lexer.cs
@@ -0,0 +1,57 @@
+using System;
+using System.Collections.Generic;
+namespace sharp.parser
+{
+    /// <summary>Splits a CharBuffer into tokens by trying each token definition in registration order.</summary>
+    public class Lexer
+    {
+        List<TokenDefinition> tokenDefinitions = new List<TokenDefinition>();
+
+        public Lexer(){
+        }
+        public Lexer(TokenDefinition[] tokenDefinitions)
+        {
+            this.tokenDefinitions.AddRange(tokenDefinitions);
+        }
+
+        public void AddTokenDefinition(TokenDefinition tokenDefinition)
+        {
+            this.tokenDefinitions.Add(tokenDefinition);
+        }
+        public void RemoveTokenDefinition(TokenDefinition tokenDefinition)
+        {
+            this.tokenDefinitions.Remove(tokenDefinition);
+        }
+
+        /// <summary>Tokenizes the rest of buffer; throws FormatException at the first character no definition accepts.</summary>
+        public Token[] parse(CharBuffer buffer){
+            List<Token> tokens = new List<Token>();
+
+            while (!buffer.EndOfBuffer()){
+                Token t = null;
+
+                foreach (TokenDefinition tdef in tokenDefinitions)
+                {
+                    t = tdef.tryParse(buffer);
+                    if (t != null)
+                    {
+                        break;
+                    }
+                }
+
+                if (t == null)
+                {
+                    break;
+                }
+
+                tokens.Add(t);
+            }
+
+            if (!buffer.EndOfBuffer())
+            {
+                throw new FormatException(String.Format("Unexpected character at line {0} position {1}. '{2}'", buffer.CurrentLineNumber, buffer.CurrentLinePosition, buffer.Current));
+            }
+            return tokens.ToArray();
+        }
+    }
+}
diff --git a/LexerPathSegment.cs b/LexerPathSegment.cs
new file mode 100644
index 0000000..5fd63a8
--- /dev/null
+++ b/LexerPathSegment.cs
@@ -0,0 +1,146 @@
+using System;
+using System.Collections.Generic;
+namespace sharp.parser
+{
+    /// <summary>Node of a token's matching graph: one CharGroup to match (or none, for a pure branch point) plus follower nodes.</summary>
+    public class LexerPathSegment
+    {
+        public CharGroup CharGroup { get; private set; }
+        public bool MayFinish { get; set; }
+
+        public LexerPathSegment[] Followers { get { return followers.ToArray(); } }
+        private List<LexerPathSegment> followers = new List<LexerPathSegment>();
+
+        public LexerPathSegment(){
+            this.CharGroup = null;
+        }
+
+        public LexerPathSegment(LexerPathSegment follower)
+        {
+            this.CharGroup = null;
+            this.AddFollower(follower);
+        }
+
+        public LexerPathSegment(CharGroup charGroup)
+        {
+            this.CharGroup = charGroup;
+        }
+
+        public LexerPathSegment(char[] chars){
+            this.CharGroup = new CharGroup(chars);
+        }
+
+        public LexerPathSegment(char ch)
+        {
+            this.CharGroup = new CharGroup(ch);
+        }
+
+        public LexerPathSegment(char first,char last)
+        {
+            this.CharGroup = new CharGroup(first,last);
+        }
+
+        public LexerPathSegment(CharGroup charGroup,bool mayFinish)
+        {
+            this.CharGroup = charGroup;
+            this.MayFinish = mayFinish;
+        }
+
+        public LexerPathSegment(char[] chars,bool mayFinish)
+        {
+            this.CharGroup = new CharGroup(chars);
+            this.MayFinish = mayFinish;
+        }
+
+        public LexerPathSegment(char ch,bool mayFinish)
+        {
+            this.CharGroup = new CharGroup(ch);
+            this.MayFinish = mayFinish;
+        }
+
+        public LexerPathSegment(char first, char last,bool mayFinish)
+        {
+            this.CharGroup = new CharGroup(first, last);
+            this.MayFinish = mayFinish;
+        }
+
+        public LexerPathSegment(CharGroup charGroup, LexerPathSegment follower)
+        {
+            this.CharGroup = charGroup;
+            this.AddFollower(follower);
+        }
+
+        public LexerPathSegment(char[] chars, LexerPathSegment follower)
+        {
+            this.CharGroup = new CharGroup(chars);
+            this.AddFollower(follower);
+        }
+
+        public LexerPathSegment(char ch, LexerPathSegment follower)
+        {
+            this.CharGroup = new CharGroup(ch);
+            this.AddFollower(follower);
+        }
+
+        public LexerPathSegment(char first, char last, LexerPathSegment follower)
+        {
+            this.CharGroup = new CharGroup(first, last);
+            this.AddFollower(follower);
+        }
+
+        public void AddFollower(LexerPathSegment path, params LexerPathSegment[] paths)
+        {
+            AddFollower(path);
+            foreach (LexerPathSegment p in paths)
+            {
+                AddFollower(p);
+            }
+        }
+        public void AddFollower(LexerPathSegment path)
+        {
+            followers.Add(path);
+        }
+
+        public void RemoveFollower(LexerPathSegment path)
+        {
+            followers.Remove(path);
+        }
+
+        /// <summary>Returns the number of characters matched from the buffer's cursor along this path, or -1; the cursor ends where it started.</summary>
+        public int walk(CharBuffer buffer){
+            if (this.CharGroup == null){
+                foreach (LexerPathSegment next in followers){
+                    int n = next.walk(buffer);
+                    if (n > 0){
+                        return n;
+                    }
+                }
+                return -1;
+            }
+
+            if (buffer.EndOfBuffer()){
+                // Fix: without MayFinish the old code went on to read buffer.Current, which throws at the end of input.
+                // NOTE(review): the MayFinish case still counts one character that is not there - confirm this is intended.
+                return MayFinish ? 1 : -1;
+            }
+
+            if (this.CharGroup.Contains(buffer.Current)){
+                buffer.MoveNext();
+
+                foreach (LexerPathSegment next in followers){
+                    int n = next.walk(buffer);
+                    if (n > 0){
+                        buffer.MoveBack();
+                        return n + 1;
+                    }
+                }
+                buffer.MoveBack();
+
+                if (MayFinish){
+                    return 1;
+                }
+            }
+            return -1;
+        }
+    }
+}
diff --git a/Parser.cs b/Parser.cs
new file mode 100644
index 0000000..6f43337
--- /dev/null
+++ b/Parser.cs
@@ -0,0 +1,33 @@
+using System;
+using System.Collections.Generic;
+using System.Collections;
+
+namespace sharp.parser
+{
+    /// <summary>Base class for parsers: tokenizes the source with a Lexer and hands the tokens to ParseTokens.</summary>
+    public abstract class Parser<T>
+    {
+        protected Lexer Lexer { get; private set; }
+
+        protected Parser(TokenDefinition[] tokenDefinitions)
+        {
+            this.Lexer = new Lexer(tokenDefinitions);
+        }
+
+        public Token[] Tokenize(char[] source){
+            return this.Lexer.parse(new CharBuffer(source));
+        }
+
+        public T Parse(string source){
+            return Parse(source.ToCharArray());
+        }
+
+        public T Parse(char[] source){
+            Token[] tokens = Tokenize(source);
+            return ParseTokens(tokens);
+        }
+
+        protected abstract T ParseTokens(Token[] tokens);
+
+    }
+}
diff --git a/ParserFormatException.cs b/ParserFormatException.cs
new file mode 100644
index 0000000..c3ca1ca
--- /dev/null
+++ b/ParserFormatException.cs
@@ -0,0 +1,19 @@
+using System;
+namespace sharp.parser
+{
+    /// <summary>Format error that carries the line text, line number and position it was raised for.</summary>
+    public class ParserFormatException : Exception
+    {
+        public String Line { get; private set; }
+        public int LineNumber { get; private set; }
+        public int Position { get; private set; }
+
+        public ParserFormatException(string message,int lineno,int pos,string line)
+            :base(message)
+        {
+            Line = line;
+            LineNumber = lineno;
+            Position = pos;
+        }
+    }
+}
diff --git a/ParserPath.cs b/ParserPath.cs
new file mode 100644
index 0000000..3077571
--- /dev/null
+++ b/ParserPath.cs
@@ -0,0 +1,34 @@
+using System;
+using System.Collections.Generic;
+namespace sharp.parser
+{
+    /// <summary>Base node of a parser path graph; it only manages the list of follower nodes.</summary>
+    public abstract class ParserPath
+    {
+        public ParserPath[] Followers { get { return followers.ToArray(); } }
+        private List<ParserPath> followers = new List<ParserPath>();
+
+        public ParserPath(){
+        }
+        public ParserPath(ParserPath[] followers)
+        {
+            this.followers.AddRange(followers);
+        }
+
+        public void AddFollower(ParserPath path,params ParserPath[] paths){
+            AddFollower(path);
+            foreach (ParserPath p in paths){
+                AddFollower(p);
+            }
+        }
+        public void AddFollower(ParserPath path)
+        {
+            followers.Add(path);
+        }
+
+        public void RemoveFollower(ParserPath path)
+        {
+            followers.Remove(path);
+        }
+    }
+}
diff --git a/ParserPathSegment.cs b/ParserPathSegment.cs
new file mode 100644
index 0000000..c815b5c
--- /dev/null
+++ b/ParserPathSegment.cs
@@ -0,0 +1,26 @@
+using System;
+namespace sharp.parser
+{
+    /// <summary>ParserPath node that holds a CharGroup.</summary>
+    public class ParserPathSegment : ParserPath
+    {
+        public CharGroup CharGroup { get; private set; }
+
+        public ParserPathSegment(CharGroup charGroup)
+        {
+            this.CharGroup = charGroup;
+        }
+        public ParserPathSegment(char ch)
+        {
+            this.CharGroup = new CharGroup(ch);
+        }
+        public ParserPathSegment(char[] chars)
+        {
+            this.CharGroup = new CharGroup(chars);
+        }
+        public ParserPathSegment(char first, char last)
+        {
+            this.CharGroup = new CharGroup(first, last);
+        }
+    }
+}
diff --git a/Token.cs b/Token.cs
new file mode 100644
index 0000000..b8b8656
--- /dev/null
+++ b/Token.cs
@@ -0,0 +1,34 @@
+using System;
+using System.Collections.Generic;
+using sharp.extensions;
+namespace sharp.parser
+{
+    /// <summary>A run of characters inside a CharBuffer together with the definition that matched it.</summary>
+    public class Token
+    {
+        public TokenDefinition Definition { get; private set; }
+        public CharBuffer charBuffer;
+
+        public CharBuffer Buffer { get { return charBuffer; } }
+        public int Position { get; private set; }
+        public int Len { get; private set; }
+
+        public string Value { get { return new String(charBuffer.Characters.Segment(Position, Len)); } }
+
+        /// <summary>Records len characters at the buffer's cursor as a token and moves the cursor past them.</summary>
+        public Token(TokenDefinition tdef,CharBuffer buffer,int len)
+        {
+            this.Definition = tdef;
+            this.charBuffer = buffer;
+            this.Position = buffer.Position;
+            this.Len = len;
+
+            buffer.Position += len;
+        }
+
+        public override string ToString()
+        {
+            return string.Format("[Token: {0} Value={1}]", Definition.Name, Value);
+        }
+    }
+}
diff --git a/TokenDefinition.cs b/TokenDefinition.cs
new file mode 100644
index 0000000..f1a9961
--- /dev/null
+++ b/TokenDefinition.cs
@@ -0,0 +1,45 @@
+using System;
+using System.Collections.Generic;
+namespace sharp.parser
+{
+    /// <summary>A named token kind described by one or more LexerPathSegment graphs.</summary>
+    public class TokenDefinition
+    {
+        public String Name { get; private set; }
+        public LexerPathSegment[] PathHeads { get { return this.pathHeads.ToArray(); } }
+
+        private List<LexerPathSegment> pathHeads = new List<LexerPathSegment>();
+
+        public TokenDefinition(string name)
+        {
+            this.Name = name;
+        }
+        public TokenDefinition(string name,LexerPathSegment pathHead)
+        {
+            this.Name = name;
+            this.pathHeads.Add(pathHead);
+        }
+        public TokenDefinition(string name,LexerPathSegment[] pathHeads)
+        {
+            this.Name = name;
+            this.pathHeads.AddRange(pathHeads);
+        }
+
+        /// <summary>Returns a token for the first path head that matches at the buffer's cursor, or null if none does.</summary>
+        public Token tryParse(CharBuffer buffer){
+            foreach (LexerPathSegment head in pathHeads){
+                int n = head.walk(buffer);
+                if (n > 0){
+                    return new Token(this,buffer,n);
+                }
+            }
+
+            return null;
+        }
+
+        public override string ToString()
+        {
+            return string.Format("[TokenDefinition: Name={0}]", Name);
+        }
+    }
+}
diff --git a/TokenQueue.cs b/TokenQueue.cs
new file mode 100644
index 0000000..b534f22
--- /dev/null
+++ b/TokenQueue.cs
@@ -0,0 +1,25 @@
+using System;
+using System.Collections.Generic;
+namespace sharp.parser
+{
+    /// <summary>Queue of tokens with a helper that checks the kind of the next token.</summary>
+    public class TokenQueue : Queue<Token>
+    {
+        public TokenQueue()
+        {
+        }
+
+        /// <summary>Dequeues the next token and returns it if its definition is one of tdefs; otherwise throws UnexpectedTokenException.</summary>
+        public Token Expect(params TokenDefinition[] tdefs)
+        {
+            Token t = Dequeue();
+            foreach (TokenDefinition tdef in tdefs){
+                if (t.Definition == tdef)
+                {
+                    return t;
+                }
+            }
+            throw new UnexpectedTokenException(t, tdefs);
+        }
+    }
+}
diff --git a/UnexpectedTokenException.cs b/UnexpectedTokenException.cs
new file mode 100644
index 0000000..fe865ac
--- /dev/null
+++ b/UnexpectedTokenException.cs
@@ -0,0 +1,26 @@
+using System;
+namespace sharp.parser
+{
+    /// <summary>Raised when a token is not one of the expected definitions.</summary>
+    public class UnexpectedTokenException : Exception
+    {
+        public Token Token { get; private set; }
+        public TokenDefinition[] Expected { get; private set; }
+
+        // Fix: the text is handed to the base class now; the old "new Message" property was
+        // invisible to code that handled the exception as a plain Exception.
+        public UnexpectedTokenException(Token token, TokenDefinition[] expected)
+            : base(buildMessage(token, expected))
+        {
+            this.Token = token;
+            this.Expected = expected;
+        }
+
+        private static string buildMessage(Token token, TokenDefinition[] expected)
+        {
+            // Fix: report the line the token is on instead of the hard-coded 0.
+            return string.Format("Unexpected Token in Line {0} at position {1}.\nGot {2} but should be one of [{3}]\nLine: {4}",
+                token.charBuffer.getLineNumberAt(token.Position),token.Position,token.Value,string.Join(",",expected),token.charBuffer.getLineAt(token.Position));
+        }
+    }
+}
diff --git a/sharp.parser.csproj b/sharp.parser.csproj
index 9084d52..0fb262c 100644
--- a/sharp.parser.csproj
+++ b/sharp.parser.csproj
@@ -4,10 +4,10 @@
 Debug
 x86
 {32267133-ADB7-4A85-8CF1-03CBDF53715C}
- Exe
+ Library
 sharp.parser
 sharp.parser
- v4.5
+ v4.7
 true
@@ -26,5 +26,28 @@
 4
 x86
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {97CA3CA9-98B3-4492-B072-D7A5995B68E9}
+ sharp.extensions
+
+
+
+
+
\ No newline at end of file