// Provenance: commit 577e90b54ee22a1e4d9a736f34017d80ece93844 ("Alpha Commit"),
// Harald Wolff-Thobaben, Thu, 19 Nov 2020 20:32:34 +0100.
// That commit deleted Class1.cs, added 8 lines to ln.parse.csproj (their content is not
// recoverable from the extracted text) and created the five tokenizer/*.cs files below.
// Generic type arguments were missing from the extracted text and are restored from usage.

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Reflection.Metadata.Ecma335;
using System.Runtime.InteropServices.ComTypes;
using System.Text;
using System.Text.RegularExpressions;

namespace ln.parse.tokenizer
{
    // ---- tokenizer/RegularExpressionMatcher.cs ----

    /// <summary>
    /// A <see cref="TokenMatcher"/> that recognises a token by applying a regular expression
    /// at the current position of a <see cref="SourceBuffer"/>.
    /// </summary>
    public class RegularExpressionMatcher : TokenMatcher
    {
        readonly Regex _regex;
        readonly Func<SourceBuffer, int, int, Token> _createToken;

        /// <summary>Creates a matcher that builds its tokens through a factory delegate.</summary>
        /// <param name="pattern">Regular expression; it must match at the current position to count.</param>
        /// <param name="createTokenDelegate">Factory receiving (buffer, start, length) of the match.</param>
        /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
        public RegularExpressionMatcher(string pattern, Func<SourceBuffer, int, int, Token> createTokenDelegate)
            : this(pattern)
        {
            _createToken = createTokenDelegate ?? throw new ArgumentNullException(nameof(createTokenDelegate));
        }

        /// <summary>
        /// Creates a matcher without a factory delegate; subclasses using this constructor
        /// must override <see cref="CreateToken"/>.
        /// </summary>
        /// <param name="pattern">Regular expression; it must match at the current position to count.</param>
        protected RegularExpressionMatcher(string pattern)
        {
            _regex = new Regex(pattern);
        }

        /// <summary>Builds the token for a successful match.</summary>
        /// <exception cref="InvalidOperationException">
        /// No factory delegate was supplied and the method was not overridden.
        /// </exception>
        public virtual Token CreateToken(SourceBuffer sourceBuffer, int start, int length)
        {
            if (_createToken == null)
            {
                throw new InvalidOperationException("No token factory supplied; override CreateToken or pass a delegate.");
            }

            return _createToken(sourceBuffer, start, length);
        }

        /// <summary>
        /// Tries to match the pattern exactly at <see cref="SourceBuffer.LinearPosition"/>.
        /// The buffer position is not changed. Empty matches are rejected because a token
        /// that consumes no input would stall the tokenizer.
        /// </summary>
        /// <param name="sourceBuffer">Buffer to match against.</param>
        /// <param name="token">The created token, or <c>null</c> when nothing matched.</param>
        /// <returns><c>true</c> when a non-empty match starts at the current position.</returns>
        public override bool Match(SourceBuffer sourceBuffer, out Token token)
        {
            if (sourceBuffer == null)
            {
                throw new ArgumentNullException(nameof(sourceBuffer));
            }

            int start = sourceBuffer.LinearPosition;

            // Match(input, beginning, length) treats 'beginning' as the start of the text for
            // anchors such as '^' while reporting Match.Index relative to the whole input, so
            // no substring of the remaining text has to be allocated for every attempt.
            Match match = _regex.Match(sourceBuffer.Text, start, sourceBuffer.Length - start);

            if (match.Success && match.Index == start && match.Length > 0)
            {
                token = CreateToken(sourceBuffer, start, match.Length);
                return true;
            }

            token = null;
            return false;
        }
    }

    // ---- tokenizer/SourceBuffer.cs ----

    /// <summary>A 1-based line/column location inside a <see cref="SourceBuffer"/>.</summary>
    public struct TextPosition
    {
        public int LineNo;
        public int CursorPosition;

        public TextPosition(int line, int cursor)
        {
            LineNo = line;
            CursorPosition = cursor;
        }

        /// <summary>The position of the very first character: line 1, column 1.</summary>
        public static TextPosition First => new TextPosition(1, 1);

        public override string ToString() => string.Format("{0}:{1}", LineNo, CursorPosition);
    }

    /// <summary>
    /// Immutable source text plus a movable read position, with translation from linear
    /// offsets to line/column positions.
    /// </summary>
    public class SourceBuffer
    {
        readonly string _buffer;

        // Offsets of the first character of every line, ascending; always starts with 0.
        readonly List<int> _lineStarts;

        int _linearPosition;

        /// <summary>Current read offset, in the range 0..<see cref="Length"/>.</summary>
        /// <exception cref="ArgumentOutOfRangeException">The value lies outside the buffer.</exception>
        public int LinearPosition
        {
            get => _linearPosition;
            set
            {
                if (value < 0 || value > _buffer.Length)
                {
                    throw new ArgumentOutOfRangeException(nameof(value));
                }

                _linearPosition = value;
            }
        }

        /// <summary>Line/column of <see cref="LinearPosition"/>.</summary>
        public TextPosition TextPosition => GetTextPosition(_linearPosition);

        public string Text => _buffer;

        public int Length => _buffer.Length;

        /// <summary>Creates a buffer from characters; a <c>null</c> array yields an empty buffer.</summary>
        public SourceBuffer(char[] buffer) : this(new string(buffer))
        { }

        /// <summary>Creates a buffer from everything that remains to be read from <paramref name="reader"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
        public SourceBuffer(TextReader reader)
            : this((reader ?? throw new ArgumentNullException(nameof(reader))).ReadToEnd())
        { }

        /// <summary>Creates a buffer over <paramref name="source"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>.</exception>
        public SourceBuffer(string source)
        {
            _buffer = source ?? throw new ArgumentNullException(nameof(source));
            _lineStarts = IndexLineStarts(_buffer);
        }

        /// <summary>Collects the offset at which each line begins ('\n' ends a line).</summary>
        private static List<int> IndexLineStarts(string text)
        {
            var starts = new List<int> { 0 };

            for (int i = 0; i < text.Length; i++)
            {
                if (text[i] == '\n')
                {
                    starts.Add(i + 1);
                }
            }

            return starts;
        }

        /// <summary>
        /// Translates a linear offset into a 1-based line/column position. Offsets outside
        /// the buffer are clamped to 0..<see cref="Length"/>.
        /// </summary>
        public TextPosition GetTextPosition(int linearPosition)
        {
            int offset = Math.Max(0, Math.Min(linearPosition, _buffer.Length));

            // BinarySearch returns the complement of the next larger entry when the offset is
            // not itself a line start; the containing line is the entry just before that one.
            int line = _lineStarts.BinarySearch(offset);
            if (line < 0)
            {
                line = ~line - 1;
            }

            return new TextPosition(line + 1, offset - _lineStarts[line] + 1);
        }

        public string GetText() => _buffer;
        public string GetText(int linearStart) => _buffer.Substring(linearStart);
        public string GetText(int linearStart, int length) => _buffer.Substring(linearStart, length);

        /// <summary>The text from <see cref="LinearPosition"/> to the end of the buffer.</summary>
        public string GetCurrentText() => _buffer.Substring(_linearPosition);
    }

    // ---- tokenizer/Token.cs ----

    /// <summary>A span of a <see cref="SourceBuffer"/> recognised by a <see cref="TokenMatcher"/>.</summary>
    public class Token
    {
        public SourceBuffer SourceBuffer { get; }
        public int LinearStart { get; }
        public int Length { get; }

        /// <summary>Line/column of the first character of the token.</summary>
        public TextPosition TextPosition => SourceBuffer.GetTextPosition(LinearStart);

        /// <exception cref="ArgumentNullException"><paramref name="sourceBuffer"/> is <c>null</c>.</exception>
        public Token(SourceBuffer sourceBuffer, int start, int length)
        {
            SourceBuffer = sourceBuffer ?? throw new ArgumentNullException(nameof(sourceBuffer));
            LinearStart = start;
            Length = length;
        }

        /// <summary>The source text covered by the token.</summary>
        public string Value => SourceBuffer.GetText(LinearStart, Length);

        public class IntegerToken : Token
        {
            public IntegerToken(SourceBuffer sourceBuffer, int start, int length) : base(sourceBuffer, start, length) { }
        }

        public class FloatToken : Token
        {
            public FloatToken(SourceBuffer sourceBuffer, int start, int length) : base(sourceBuffer, start, length) { }
        }

        public class StringToken : Token
        {
            public StringToken(SourceBuffer sourceBuffer, int start, int length) : base(sourceBuffer, start, length) { }
        }

        public class OperatorToken : Token
        {
            public OperatorToken(SourceBuffer sourceBuffer, int start, int length) : base(sourceBuffer, start, length) { }
        }

        public class WhiteSpaceToken : Token
        {
            public WhiteSpaceToken(SourceBuffer sourceBuffer, int start, int length) : base(sourceBuffer, start, length) { }
        }

        public class IdentifierToken : Token
        {
            public IdentifierToken(SourceBuffer sourceBuffer, int start, int length) : base(sourceBuffer, start, length) { }
        }

        public class BracketToken : Token
        {
            public BracketToken(SourceBuffer sourceBuffer, int start, int length) : base(sourceBuffer, start, length) { }
        }
    }

    // ---- tokenizer/TokenMatcher.cs ----

    /// <summary>
    /// Recognises one kind of token at the current position of a <see cref="SourceBuffer"/>.
    /// The predefined matchers are all anchored with '^'. A <see cref="Tokenizer"/> takes the
    /// first matcher that succeeds, so register <see cref="FLOAT"/> before <see cref="INTEGER"/>.
    /// </summary>
    public abstract class TokenMatcher
    {
        public TokenMatcher()
        {
        }

        /// <summary>Tries to produce a token starting at the buffer's current position.</summary>
        /// <param name="sourceBuffer">Buffer to match against.</param>
        /// <param name="token">The recognised token, or <c>null</c> when nothing matched.</param>
        /// <returns><c>true</c> when a token was recognised.</returns>
        public abstract bool Match(SourceBuffer sourceBuffer, out Token token);

        /// <summary>Optionally signed run of decimal digits.</summary>
        public static readonly TokenMatcher INTEGER = new RegularExpressionMatcher(
            @"^-?\d+",
            (sourceBuffer, start, length) => new Token.IntegerToken(sourceBuffer, start, length));

        /// <summary>Optionally signed digits, a literal '.', then optional fraction digits.</summary>
        public static readonly TokenMatcher FLOAT = new RegularExpressionMatcher(
            @"^-?\d+\.\d*",
            (sourceBuffer, start, length) => new Token.FloatToken(sourceBuffer, start, length));

        /// <summary>
        /// Double-quoted single-line string with backslash escapes. Each backslash consumes the
        /// character after it, so an escaped quote can never be taken for the terminator.
        /// </summary>
        public static readonly TokenMatcher STRING = new RegularExpressionMatcher(
            @"^""(?:\\.|[^""\\\n])*""",
            (sourceBuffer, start, length) => new Token.StringToken(sourceBuffer, start, length));

        /// <summary>ASCII letter or underscore followed by ASCII letters, digits or underscores.</summary>
        public static readonly TokenMatcher IDENTIFIER = new RegularExpressionMatcher(
            @"^[a-zA-Z_][a-zA-Z0-9_]*",
            (sourceBuffer, start, length) => new Token.IdentifierToken(sourceBuffer, start, length));

        /// <summary>One of || &amp;&amp; + - * / | &amp;; two-character operators are tried first.</summary>
        public static readonly TokenMatcher OPERATOR = new RegularExpressionMatcher(
            @"^(?:\|\||&&|[+\-*/|&])",
            (sourceBuffer, start, length) => new Token.OperatorToken(sourceBuffer, start, length));

        /// <summary>Run of whitespace characters.</summary>
        public static readonly TokenMatcher WHITESPACE = new RegularExpressionMatcher(
            @"^\s+",
            (sourceBuffer, start, length) => new Token.WhiteSpaceToken(sourceBuffer, start, length));

        /// <summary>A single brace, parenthesis or square bracket.</summary>
        public static readonly TokenMatcher BRACKET = new RegularExpressionMatcher(
            @"^[{}()\[\]]",
            (sourceBuffer, start, length) => new Token.BracketToken(sourceBuffer, start, length));
    }

    // ---- tokenizer/Tokenizer.cs ----

    /// <summary>
    /// Splits source text into tokens by trying the registered matchers, in registration
    /// order, at each position.
    /// </summary>
    public class Tokenizer
    {
        readonly List<TokenMatcher> tokenMatchers = new List<TokenMatcher>();

        public Tokenizer()
        {
        }

        /// <summary>Appends a matcher; earlier matchers take precedence. Returns this instance.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="tokenMatcher"/> is <c>null</c>.</exception>
        public Tokenizer Add(TokenMatcher tokenMatcher)
        {
            tokenMatchers.Add(tokenMatcher ?? throw new ArgumentNullException(nameof(tokenMatcher)));
            return this;
        }

        /// <summary>Removes a matcher if it is registered. Returns this instance.</summary>
        public Tokenizer Remove(TokenMatcher tokenMatcher)
        {
            tokenMatchers.Remove(tokenMatcher);
            return this;
        }

        /// <summary>Tokenizes <paramref name="source"/> from its beginning.</summary>
        public Token[] Parse(string source) => Parse(new SourceBuffer(source));

        /// <summary>
        /// Tokenizes <paramref name="sourceBuffer"/> from its current position to its end,
        /// advancing <see cref="SourceBuffer.LinearPosition"/> past every token produced.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="sourceBuffer"/> is <c>null</c>.</exception>
        /// <exception cref="FormatException">No matcher recognises the text at some position.</exception>
        public Token[] Parse(SourceBuffer sourceBuffer)
        {
            if (sourceBuffer == null)
            {
                throw new ArgumentNullException(nameof(sourceBuffer));
            }

            var tokens = new List<Token>();

            while (sourceBuffer.LinearPosition < sourceBuffer.Length)
            {
                Token token = null;

                foreach (TokenMatcher tokenMatcher in tokenMatchers)
                {
                    // A token that consumes nothing cannot make progress, so it is treated
                    // as "no match" and the next matcher gets its turn.
                    if (tokenMatcher.Match(sourceBuffer, out Token candidate) && candidate != null && candidate.Length > 0)
                    {
                        token = candidate;
                        break;
                    }
                }

                if (token == null)
                {
                    throw new FormatException(String.Format("invalid token at {0}", sourceBuffer.TextPosition));
                }

                tokens.Add(token);

                // Step over the consumed text; without this the loop would never terminate.
                sourceBuffer.LinearPosition += token.Length;
            }

            return tokens.ToArray();
        }
    }
}