REFC GedcomTokenizer instance

2020-11-27 11:30:35 +01:00 · 2020-11-27 11:30:35 +01:00 · c8e50a410b
parent 84bf8a596a
commit c8e50a410b
13 changed files with 213 additions and 108 deletions
--- a/src/main/java/de/nth/chronicle/gedcom/GedcomHeader.java
+++ b/src/main/java/de/nth/chronicle/gedcom/GedcomHeader.java
@ -5,6 +5,9 @@ import lombok.Data;
 import lombok.Getter;
 import lombok.NonNull;

+import java.time.LocalDate;
+import java.time.LocalTime;
+
@Data
@Builder(toBuilder = true)
 public class GedcomHeader {
@ -17,6 +20,22 @@ public class GedcomHeader {
    @NonNull
    private final String characterSet;

+    //SOUR
+    private final String approvedSystemId;
+    private final String sourceVersion;
+    private final String nameOfProduct;
+    private final String nameOfBusiness;
+    private final String nameOfSourceData;
+    private final LocalDate publicationDate;
+    private final String copyrightSourceData;
+
+    private final String receivingSystemName;
+    private final LocalDate transmissionDate;
+    private final LocalTime time;
+    private final String fileName;
+    private final String copyrightGedcomFile;
+    private final String language;
+    private final String contentDescription;


 }
--- a/src/main/java/de/nth/chronicle/gedcom/parser/Gedcom555Parser.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/Gedcom555Parser.java
@ -1,7 +1,6 @@
 package de.nth.chronicle.gedcom.parser;

 import de.nth.chronicle.gedcom.Gedcom;
-import de.nth.chronicle.gedcom.GedcomVersion;
 import de.nth.chronicle.gedcom.parser.records.HeaderRecordParser;

 import java.io.InputStream;
@ -19,10 +18,9 @@ public class Gedcom555Parser implements GedcomParser {
    @Override
    public Gedcom parseGedcom(InputStream stream) throws Exception {

-        GedcomTokenizer tokenizer = GedcomTokenizer.create(stream, TokenizerOptions.builder()
-                                                                                    .build());
+        GedcomTokenizer tokenizer = new GedcomTokenizer(stream);

-        List<RecordToken> tokens = tokenizer.parseRecords();
+        List<RecordToken> tokens = tokenizer.parseRecordsTokens();
        Gedcom.GedcomBuilder builder = Gedcom.builder();

        for(RecordToken token : tokens) {
@ -32,8 +30,6 @@ public class Gedcom555Parser implements GedcomParser {
                System.err.println("No Parser found for tag " + token.getTag());
            }
        }
-
-
        return builder.build();
    }

--- a/src/main/java/de/nth/chronicle/gedcom/parser/GedcomReader.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/GedcomReader.java
@ -0,0 +1,22 @@
+package de.nth.chronicle.gedcom.parser;
+
+import de.nth.chronicle.gedcom.Gedcom;
+import de.nth.chronicle.gedcom.parser.exception.GedcomException;
+
+import java.io.InputStream;
+
+/**
+ * Contract for components that produce a {@link Gedcom} from an input stream.
+ */
+public interface GedcomReader {
+
+    /**
+     * Reads the given stream and returns the resulting {@link Gedcom}.
+     *
+     * @param stream the stream to read from
+     * @return the {@link Gedcom} built from the stream
+     * @throws GedcomException the declared failure type (unchecked, it extends RuntimeException)
+     */
+    Gedcom read(InputStream stream) throws GedcomException;
+
+}
--- a/src/main/java/de/nth/chronicle/gedcom/parser/GedcomTokenizer.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/GedcomTokenizer.java
@ -1,8 +1,12 @@
 package de.nth.chronicle.gedcom.parser;

+import de.nth.chronicle.gedcom.parser.exception.GedcomException;
+import de.nth.chronicle.gedcom.parser.exception.InvalidLineException;
+import de.nth.chronicle.gedcom.parser.exception.MissingBomException;
 import de.nth.chronicle.gedcom.util.EncodingUtils;

 import java.io.BufferedReader;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
@ -15,85 +19,102 @@ class GedcomTokenizer {
    // ^\s*(\p{Digit})+\s+([a-zA-Z1-9_]+)(?:\s(.*)$)?
    public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit})+\\s+([a-zA-Z1-9_@]+)(?:\\s(.*))?");

-    private final TokenizerOptions options;
+    private String currentLine;
+    private int currentLineNumber = 1; // 1-based: the first line read is reported as line 1, as before this refactoring
+
+    private final List<RecordToken> records = new LinkedList<>();
+    private final Stack<RecordToken> stack = new Stack<>();
    private final BufferedReader reader;

-    GedcomTokenizer(BufferedReader reader, TokenizerOptions options) {
-        this.reader = reader;
-        this.options = options;
+    /**
+     * Creates a tokenizer that reads GEDCOM lines from the given stream.
+     * The charset is determined by {@link #validateEncoding(InputStream)}, which throws
+     * a MissingBomException when no charset can be derived from the stream.
+     *
+     * @param stream the stream to tokenize
+     */
+    public GedcomTokenizer(InputStream stream) {
+        // must run before the reader is created: the reader is built with the detected charset
+        Charset charset = validateEncoding(stream);
+
+        this.reader = new BufferedReader(new InputStreamReader(stream, charset));
    }

-    public List<RecordToken> parseRecords() throws Exception {
+    /** Returns the line most recently read from the input, or {@code null} if none was read yet or the end was reached. */
+    public String getCurrentLine() {
+        return this.currentLine;
+    }
+    /** Returns the tokenizer's current line counter. */
+    public int getCurrentLineNumber() {
+        return this.currentLineNumber;
+    }

-        List<RecordToken> tokens = new LinkedList<>();
-        Stack<RecordToken> stack = new Stack<>();
+    /*
+     * GEDCOM 5.5.5 Reader Rules
+     * ▪ import each line value as-is
+     * ▪ do not trim trailing white space from any GEDCOM line or line value
+     * ▪ do not trim leading white space from any line value
+     * */
+    public List<RecordToken> parseRecordsTokens() throws GedcomException {

-        /*
-        * GEDCOM 5.5.5 Reader Rules
-        * ▪ import each line value as-is
-        * ▪ do not trim trailing white space from any GEDCOM line or line value
-        * ▪ do not trim leading white space from any line value
-        * */
-        String line;
-        int lineNumber = 1;
-        while((line = reader.readLine()) != null) {
-            System.out.println("Tokenizer: tokenize line " + lineNumber);
-
-            Matcher matcher = matchLine(line);
-            if(!matcher.matches()) {
-                throw new InvalidGedcomException.InvalidLine(lineNumber, line);
-            }
-
-            int level = Integer.parseInt(matcher.group(1));
-            String tag = matcher.group(2);
-            String value = matcher.group(3);
-
-            RecordToken record = RecordToken.builder()
-                                            .level(level)
-                                            .tag(tag)
-                                            .value(value)
-                                            .build();
-
-            if(stack.isEmpty()) {
-                System.out.println("First Record: " + line);
-                stack.push(record);
-                tokens.add(record);
-            } else if(level == 0) {
-                System.out.println("New Record: " + line);
-                stack.clear();
-                stack.push(record);
-                tokens.add(record);
-            } else if(stack.peek().getLevel() == level) {
-                System.out.println("Same Level Record: " + line);
-                stack.pop();
-                if(!stack.isEmpty()) {
-                    stack.peek().getSubRecords().add(record);
-                }
-                stack.push(record);
-            } else if(stack.peek().getLevel() < level) {
-                System.out.println("Next Level Record: " + line);
-                stack.peek().getSubRecords().add(record);
-                stack.push(record);
-            } else if(stack.peek().getLevel() > level) {
-                while(stack.peek().getLevel() >= level) {
-                    stack.pop();
-                }
-                System.out.println("Higher Level Record: " + line + " ==== parent: " + stack.peek().getTag());
-                stack.peek().getSubRecords().add(record);
-                stack.push(record);
-            }
-
-            lineNumber++;
+        if(!this.records.isEmpty()) {
+            return this.records;
        }

-        return tokens;
+        while(readNextLine()) {

+            if(this.currentLine.length() > 255) {
+                throw new InvalidLineException(this.currentLineNumber, this.currentLine, "Line is too long!");
+            }
+
+            pushRecordToken(parseRecordToken());
+            this.currentLineNumber++;
+        }
+
+        return this.records;
    }

-    public static GedcomTokenizer create(InputStream stream, TokenizerOptions options) {
-        Charset charset = validateEncoding(stream);
-        System.out.println("Tokenizer: user encoding " + charset);
-        return new GedcomTokenizer(new BufferedReader(new InputStreamReader(stream, charset)), options);
+    /**
+     * Advances to the next line of the input and stores it as the current line.
+     *
+     * @return {@code true} if a line was read, {@code false} once the reader is exhausted
+     * @throws GedcomException wrapping any IOException raised by the reader
+     */
+    private boolean readNextLine() {
+        String next;
+        try {
+            next = this.reader.readLine();
+        } catch (IOException ioFailure) {
+            throw new GedcomException(ioFailure);
+        }
+        this.currentLine = next;
+        return next != null;
+    }
+
+    /**
+     * Builds a RecordToken from the current line.
+     * Regex group 1 supplies the level, group 2 the tag and group 3 the value (null when the line has none).
+     *
+     * @return the token for the current line
+     * @throws InvalidGedcomException.InvalidLine if the line does not match LINE_REGEX
+     */
+    private RecordToken parseRecordToken() {
+        Matcher lineMatch = matchLine(this.currentLine);
+        if(!lineMatch.matches()) {
+            // NOTE(review): still the legacy exception type; the length check in parseRecordsTokens uses InvalidLineException - confirm which is intended
+            throw new InvalidGedcomException.InvalidLine(this.currentLineNumber, this.currentLine);
+        }
+
+        return RecordToken.builder()
+                .level(Integer.parseInt(lineMatch.group(1)))
+                .tag(lineMatch.group(2))
+                .value(lineMatch.group(3))
+                .build();
+    }
+
+    /**
+     * Inserts a token into the record tree according to its level.
+     * <p>
+     * The stack holds the chain of currently open records (levels strictly increasing towards the top).
+     * A level-0 token, or any token arriving while the stack is empty, starts a new top-level record;
+     * every other token becomes a sub-record of the nearest open record with a lower level.
+     *
+     * @param record the token to insert
+     * @throws InvalidLineException if a token steps back to a lower level but no open record with a
+     *                              lower level exists (previously surfaced as an EmptyStackException)
+     */
+    private void pushRecordToken(RecordToken record) {
+        int level = record.getLevel();
+
+        if(this.stack.isEmpty() || level == 0) {
+            this.stack.clear();
+            this.stack.push(record);
+            this.records.add(record);
+            return;
+        }
+
+        // remember the direction before popping: only a step back to a lower level is an error
+        // when it runs out of open records
+        boolean steppingBack = level < this.stack.peek().getLevel();
+
+        // close every open record that is a sibling of, or nested deeper than, the new token
+        while(!this.stack.isEmpty() && this.stack.peek().getLevel() >= level) {
+            this.stack.pop();
+        }
+
+        if(!this.stack.isEmpty()) {
+            this.stack.peek().getSubRecords().add(record);
+        } else if(steppingBack) {
+            throw new InvalidLineException(this.currentLineNumber, this.currentLine, "Record has no parent record!");
+        }
+        // NOTE(review): a sibling of the sole open non-zero-level record is pushed without being
+        // attached or listed, exactly as before - confirm whether that should be rejected too
+        this.stack.push(record);
    }

    /**
@ -109,18 +130,18 @@ class GedcomTokenizer {
        return LINE_REGEX.matcher(line);
    }

+    /*
+    ▪ demand that file starts with a Byte Order Mark (BOM)
+    ▪ demand that the encoding is either UTF-8 or UTF-16
+    ▪ must support both UTF-8 and UTF-16 GEDCOM files
+    ▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
+    ▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
+    */
    static Charset validateEncoding(InputStream stream) {
-        /*
-            ▪ demand that file starts with a Byte Order Mark (BOM)
-            ▪ demand that the encoding is either UTF-8 or UTF-16
-            ▪ must support both UTF-8 and UTF-16 GEDCOM files
-            ▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
-            ▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
-         */
        Charset charset = EncodingUtils.getCharsetForBOM(stream);

        if(charset == null) {
-            throw new InvalidGedcomException.MissingBOM();
+            throw new MissingBomException();
        }

        return charset;
--- a/src/main/java/de/nth/chronicle/gedcom/parser/InvalidGedcomException.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/InvalidGedcomException.java
@ -18,9 +18,9 @@ public abstract class InvalidGedcomException extends RuntimeException {
        }
    }

-    public static class MissingRecord extends InvalidGedcomException {
-        public MissingRecord(String tag) {
-            super(String.format("Missing Record '%s'!", tag));
+    public static class InvalidOrMissingRecord extends InvalidGedcomException {
+        public InvalidOrMissingRecord(String tag) {
+            super(String.format("Record '%s' is missing or invalid!", tag));
        }
    }

--- a/src/main/java/de/nth/chronicle/gedcom/parser/TokenizerOptions.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/TokenizerOptions.java
@ -1,12 +0,0 @@
-package de.nth.chronicle.gedcom.parser;
-
-import lombok.Builder;
-import lombok.Getter;
-
-import java.nio.charset.Charset;
-
-@Builder
-@Getter
-public class TokenizerOptions {
-
-}
--- a/src/main/java/de/nth/chronicle/gedcom/parser/exception/GedcomException.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/exception/GedcomException.java
@ -0,0 +1,31 @@
+package de.nth.chronicle.gedcom.parser.exception;
+
+/**
+ * Unchecked base type of the GEDCOM parser exceptions in this package.
+ */
+public class GedcomException extends RuntimeException {
+
+    /** Creates an exception without message or cause. */
+    public GedcomException() {
+        super();
+    }
+
+    /** @param message the detail message */
+    public GedcomException(String message) {
+        super(message);
+    }
+
+    /** @param cause the underlying failure */
+    public GedcomException(Throwable cause) {
+        super(cause);
+    }
+
+    /**
+     * @param message the detail message
+     * @param cause the underlying failure
+     */
+    public GedcomException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+}
--- a/src/main/java/de/nth/chronicle/gedcom/parser/exception/InvalidEncodingException.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/exception/InvalidEncodingException.java
@ -0,0 +1,18 @@
+package de.nth.chronicle.gedcom.parser.exception;
+
+/**
+ * Signals, per its message, that a GEDCOM 5.5.5 file is stored in one encoding but declares another.
+ */
+public class InvalidEncodingException extends GedcomException {
+
+    private static final String MESSAGE_TEMPLATE = "GEDCOM 5.5.5 file is %s encoded but defines %s as encoding!";
+
+    /**
+     * @param expectedEncoding inserted into the message as the encoding the file "is ... encoded" in
+     * @param actualEncoding inserted into the message as the encoding the file "defines"
+     */
+    public InvalidEncodingException(String expectedEncoding, String actualEncoding) {
+        super(String.format(MESSAGE_TEMPLATE, expectedEncoding, actualEncoding));
+    }
+
+}
--- a/src/main/java/de/nth/chronicle/gedcom/parser/exception/InvalidLineException.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/exception/InvalidLineException.java
@ -0,0 +1,40 @@
+package de.nth.chronicle.gedcom.parser.exception;
+
+/**
+ * Reports a GEDCOM line that was rejected, together with its line number and the reason.
+ */
+public class InvalidLineException extends GedcomException {
+
+    private final int lineNumber;
+    private final String line;
+    private final String reason;
+
+    /**
+     * @param lineNumber number of the rejected line
+     * @param line content of the rejected line
+     * @param reason why the line was rejected; forms the start of the exception message
+     */
+    public InvalidLineException(int lineNumber, String line, String reason) {
+        super(String.format("%s '%s' at line %d", reason, line, lineNumber));
+
+        this.lineNumber = lineNumber;
+        this.line = line;
+        this.reason = reason;
+    }
+
+    /** @return number of the rejected line */
+    public int getLineNumber() {
+        return this.lineNumber;
+    }
+
+    /** @return content of the rejected line */
+    public String getLine() {
+        return this.line;
+    }
+
+    /** @return why the line was rejected */
+    public String getReason() {
+        return this.reason;
+    }
+
+}
--- a/src/main/java/de/nth/chronicle/gedcom/parser/exception/MissingBomException.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/exception/MissingBomException.java
@ -0,0 +1,14 @@
+package de.nth.chronicle.gedcom.parser.exception;
+
+/**
+ * Signals that a GEDCOM 5.5.5 file lacks a Byte Order Mark.
+ */
+public class MissingBomException extends GedcomException {
+
+    private static final String MESSAGE = "GEDCOM 5.5.5 file lacks Byte Order Mark!";
+
+    /** Creates the exception with its fixed message. */
+    public MissingBomException() {
+        super(MESSAGE);
+    }
+}
--- a/src/main/java/de/nth/chronicle/gedcom/parser/records/HeaderRecordParser.java
+++ b/src/main/java/de/nth/chronicle/gedcom/parser/records/HeaderRecordParser.java
@ -6,10 +6,13 @@ import de.nth.chronicle.gedcom.parser.GedcomRecordParser;
 import de.nth.chronicle.gedcom.parser.InvalidGedcomException;
 import de.nth.chronicle.gedcom.parser.RecordToken;

-import java.util.function.Consumer;
+import java.time.LocalDate;
+import java.util.regex.Pattern;

 public class HeaderRecordParser implements GedcomRecordParser  {

+    public static final Pattern VERSION_REGEX = Pattern.compile("(\\p{Digit}{1,3})\\.(\\p{Digit}{1,3})(?:\\.(\\p{Digit}{1,3}))?");
+
    @Override
    public void parse(RecordToken token, Gedcom.GedcomBuilder builder) {

@ -17,18 +20,35 @@ public class HeaderRecordParser implements GedcomRecordParser  {

        GedcomHeader header = GedcomHeader.builder()
                                            .characterSet(token.findFirstValue("CHAR")
-                                                                .orElseThrow(() -> new InvalidGedcomException.MissingRecord("HEAD.CHAR")))
+                                                                .orElseThrow(() -> new InvalidGedcomException.InvalidOrMissingRecord("HEAD.CHAR")))
                                            .versionNumber(token.findFirstValue("GEDC.VERS")
-                                                                .orElse(null))
+                                                                .map(this::validateVersion)
+                                                                .orElseThrow(() -> new InvalidGedcomException.InvalidOrMissingRecord("GEDC.VERS")))
                                            .gedcomForm(token.findFirstValue("GEDC.FORM")
                                                                .orElse(null))
                                            .gedcomFormVersion(token.findFirstValue("GEDC.FORM.VERS")
-                                                                .orElse(null))
+                                                                .map(this::validateVersion)
+                                                                .orElseThrow(() -> new InvalidGedcomException.InvalidOrMissingRecord("GEDC.FORM.VERS")))
+                                            .approvedSystemId(token.findFirstValue("SOUR").orElse(null))
+                                            .sourceVersion(token.findFirstValue("SOUR.VERS").orElse(null))
+                                            .nameOfProduct(token.findFirstValue("SOUR.NAME").orElse(null))
+                                            .nameOfBusiness(token.findFirstValue("SOUR.CORP").orElse(null)) //TODO address
+                                            .nameOfSourceData(token.findFirstValue("SOUR.DATA").orElse(null))
+                                            .publicationDate(token.findFirstValue("SOUR.DATA.DATE").map(LocalDate::parse).orElse(null)) //TODO LocalDate.parse expects ISO-8601 (yyyy-MM-dd); confirm the DATE value format or parse with an explicit DateTimeFormatter
+                                            //.copyrightSourceData() TODO
+
                                            .build();

        builder.header(header);

    }

+    /**
+     * Checks a version string against VERSION_REGEX.
+     *
+     * @param version the version string to check
+     * @return the unchanged string if it matches, otherwise {@code null}
+     *         (which turns the caller's Optional.map result into an empty Optional)
+     */
+    private String validateVersion(String version) {
+        boolean wellFormed = VERSION_REGEX.matcher(version).matches();
+        return wellFormed ? version : null;
+    }
+
+
+

 }
--- a/src/test/java/de/nth/chronicle/gedcom/parser/GedcomParserTests.java
+++ b/src/test/java/de/nth/chronicle/gedcom/parser/GedcomParserTests.java
@ -11,7 +11,7 @@ public class GedcomParserTests {

        GedcomParser parser = GedcomParser.getParser(GedcomVersion.VERSION_5_5_5);

-        Gedcom gedcom = parser.parseGedcom(GedcomParserTests.class.getResourceAsStream("/examples/MINIMAL555.ged"));
+        Gedcom gedcom = parser.parseGedcom(GedcomParserTests.class.getResourceAsStream("/examples/555SAMPLE.ged"));

        System.out.println(gedcom);

--- a/src/test/java/de/nth/chronicle/gedcom/parser/GedcomTokenizerTests.java
+++ b/src/test/java/de/nth/chronicle/gedcom/parser/GedcomTokenizerTests.java
@ -8,7 +8,6 @@ import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
-import java.util.Set;
 import java.util.function.Consumer;

 import static org.junit.jupiter.api.Assertions.*;
@ -51,16 +50,13 @@ public class GedcomTokenizerTests {
    void testBasicTokenizerFunctionality() throws Exception {

        InputStream stream = GedcomTokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
-        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));

-        GedcomTokenizer tokenizer = new GedcomTokenizer(reader, TokenizerOptions.builder()
-
-                                                                                        .build());
+        GedcomTokenizer tokenizer = new GedcomTokenizer(stream);

        List<RecordToken> records = null;

        try {
-            records = tokenizer.parseRecords();
+            records = tokenizer.parseRecordsTokens();
        }catch(Exception e) {
            e.printStackTrace();
        }