WIP basic parsing system

master
Niclas Thobaben 2020-11-27 22:02:28 +01:00
parent e79d6f6236
commit 796680f620
21 changed files with 406 additions and 26 deletions

View File

@ -1,5 +1,6 @@
package de.nth.chronicle.gedcom;
import de.nth.chronicle.gedcom.type.Header;
import lombok.Builder;
import lombok.Data;
@ -7,6 +8,6 @@ import lombok.Data;
@Data
public class Gedcom {
private final Header header;
}

View File

@ -1,4 +1,4 @@
package de.nth.chronicle.gedcom.parser.exception;
package de.nth.chronicle.gedcom.exception;
public class GedcomException extends RuntimeException {

View File

@ -1,4 +1,4 @@
package de.nth.chronicle.gedcom.parser.exception;
package de.nth.chronicle.gedcom.exception;
public class InvalidEncodingException extends GedcomException {

View File

@ -1,4 +1,4 @@
package de.nth.chronicle.gedcom.parser.exception;
package de.nth.chronicle.gedcom.exception;
public class InvalidLineException extends GedcomException {

View File

@ -1,4 +1,4 @@
package de.nth.chronicle.gedcom.parser.exception;
package de.nth.chronicle.gedcom.exception;
public class MissingBomException extends GedcomException {

View File

@ -0,0 +1,9 @@
package de.nth.chronicle.gedcom.exception;
/**
 * Raised when a record that the parser requires cannot be found.
 */
public class MissingRecordException extends GedcomException {

    /**
     * @param recordPath path of the missing record (for example {@code HEAD.GEDC.FORM});
     *                   it is embedded verbatim in the exception message
     */
    public MissingRecordException(String recordPath) {
        super(describe(recordPath));
    }

    /** Renders the exception message for the given record path. */
    private static String describe(String recordPath) {
        return String.format("Missing Record: '%s'!", recordPath);
    }
}

View File

@ -0,0 +1,9 @@
package de.nth.chronicle.gedcom.exception;
/**
 * Raised when the GEDCOM form named in the data is not one of the known form types.
 */
public class UnrecognisedGedcomForm extends GedcomException {

    /**
     * @param form the form value that could not be recognised; it is embedded verbatim
     *             in the exception message
     */
    public UnrecognisedGedcomForm(String form) {
        super(describe(form));
    }

    /** Renders the exception message for the given form value. */
    private static String describe(String form) {
        return String.format("Unrecognised GEDCOM Form '%s'!", form);
    }
}

View File

@ -0,0 +1,22 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.exception.GedcomException;
import de.nth.chronicle.gedcom.type.Address;
public class AddressParser implements RecordParser<Address> {
@Override
public Address parse(RecordChunk chunk) throws GedcomException {
return Address.builder()
.line1(chunk.findFirstValue("ADR1").orElse(null))
.line2(chunk.findFirstValue("ADR2").orElse(null))
.line3(chunk.findFirstValue("ADR3").orElse(null))
.city(chunk.findFirstValue("CITY").orElse(null))
.state(chunk.findFirstValue("STAE").orElse(null))
.postalCode(chunk.findFirstValue("POST").orElse(null))
.country(chunk.findFirstValue("CTRY").orElse(null))
.build();
}
}

View File

@ -1,12 +1,38 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.Gedcom;
import de.nth.chronicle.gedcom.parser.exception.GedcomException;
import de.nth.chronicle.gedcom.exception.GedcomException;
import java.io.InputStream;
import java.util.List;
public interface GedcomReader {
public class GedcomReader {
/** Context of a read operation. NOTE(review): confirm where this field is populated and consumed. */
private GedcomReaderContext context;
/** Document produced by a completed read; once set, {@code read} returns it without parsing again. */
private Gedcom result;
/** Creates a reader that has not read anything yet. */
public GedcomReader() {
}
/**
 * Reads a GEDCOM document from the given stream.
 * <p>
 * The result is cached: once a read has completed, every later call returns that same
 * {@link Gedcom} instance and the stream argument is ignored.
 *
 * @param stream source of the GEDCOM data; this method itself does not close it
 * @return the document built from the reader context's builder
 * @throws GedcomException propagated from tokenizing the stream
 */
public Gedcom read(InputStream stream) throws GedcomException {
    if (this.result != null) {
        return this.result;
    }

    Tokenizer tokenizer = new Tokenizer(stream);
    List<RecordChunk> chunks = tokenizer.parseRecordsChunks();

    // Assign the field instead of declaring a local of the same name; the local used to
    // shadow the field, which therefore stayed null for the lifetime of the reader.
    this.context = new GedcomReaderContext(chunks);

    for (RecordChunk chunk : chunks) {
        // TODO: per-record parsing is not implemented yet (work in progress).
    }

    this.result = this.context.getGedcomBuilder().build();
    return this.result;
}
public Gedcom read(InputStream stream) throws GedcomException;
}

View File

@ -0,0 +1,46 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.Gedcom;
import lombok.Getter;
import java.util.LinkedList;
import java.util.List;
/**
 * Mutable state shared while reading a GEDCOM document: the builder for the result,
 * the record chunks to walk, collected warnings and a cursor over the chunks.
 */
@Getter
public class GedcomReaderContext {

    private final Gedcom.GedcomBuilder gedcomBuilder;
    private final List<RecordChunk> recordChunks;
    private final List<Warning> warnings;

    /** Chunk returned by the latest {@link #nextChunk()} call; {@code null} before the first call and after the end. */
    private RecordChunk currentChunk;
    /** Position of the cursor; starts before the first chunk. */
    private int index = -1;

    /**
     * @param chunks record chunks the cursor iterates over; the list is stored as given
     */
    GedcomReaderContext(List<RecordChunk> chunks) {
        this.recordChunks = chunks;
        this.warnings = new LinkedList<>();
        this.gedcomBuilder = Gedcom.builder();
    }

    /**
     * @return {@code true} while a chunk exists after the current cursor position
     */
    public boolean hasNext() {
        int following = this.index + 1;
        return following < this.recordChunks.size();
    }

    /**
     * Advances the cursor by one and makes the chunk at the new position current.
     *
     * @return the new current chunk, or {@code null} when the cursor has moved past the last chunk
     */
    public RecordChunk nextChunk() {
        this.index++;
        boolean exhausted = this.index >= this.recordChunks.size();
        this.currentChunk = exhausted ? null : this.recordChunks.get(this.index);
        return this.currentChunk;
    }

    /**
     * @return the current chunk without moving the cursor
     */
    public RecordChunk getChunk() {
        return this.currentChunk;
    }
}

View File

@ -1,9 +0,0 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.Gedcom;
/**
 * Parser callback that receives a record chunk together with the GEDCOM builder.
 */
public interface GedcomRecordParser {

    /**
     * @param token   record chunk to parse
     * @param builder builder handed to the parser alongside the chunk (presumably populated from it)
     */
    void parse(RecordChunk token, Gedcom.GedcomBuilder builder);
}

View File

@ -0,0 +1,80 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.exception.GedcomException;
import de.nth.chronicle.gedcom.exception.MissingRecordException;
import de.nth.chronicle.gedcom.exception.UnrecognisedGedcomForm;
import de.nth.chronicle.gedcom.type.GedcomFormType;
import de.nth.chronicle.gedcom.type.Header;
/**
 * {@link RecordParser} that turns a header record chunk into a {@link Header}.
 */
public class HeaderParser implements RecordParser<Header> {

    /**
     * Walks the direct sub records of {@code chunk} and copies each supported tag into the header.
     *
     * @param chunk the header record chunk
     * @return the header built from the sub records
     * @throws GedcomException if a required {@code GEDC} sub record is missing or its form is unknown
     */
    @Override
    public Header parse(RecordChunk chunk) throws GedcomException {
        Header.HeaderBuilder header = Header.builder();
        for (RecordChunk child : chunk.getSubRecords()) {
            applySubRecord(child, header);
        }
        return header.build();
    }

    /** Dispatches a single header sub record on its tag; unknown tags are ignored. */
    private void applySubRecord(RecordChunk child, Header.HeaderBuilder header) {
        switch (child.getTag()) {
            case "GEDC":
                parseGedcomMeta(child, header);
                return;
            case "CHAR":
                header.encoding(child.getValue());
                return;
            case "SOUR":
                parseGedcomSource(child, header);
                return;
            case "FILE":
                header.fileName(child.getValue());
                return;
            case "LANG":
                header.language(child.getValue());
                return;
            case "COPR":
                header.copyright(child.getValue());
                return;
            case "NOTE":
                header.contentDescription(child.getValue());
                return;
            case "DATE":
            case "PLAC":
                // Recognised tags that are not mapped onto the header yet.
                return;
            default:
                return;
        }
    }

    /**
     * Reads the {@code GEDC} sub record: FORM and VERS are required, FORM.VERS is optional.
     */
    private void parseGedcomMeta(RecordChunk chunk, Header.HeaderBuilder builder) {
        String formKey = chunk.findFirstValue("FORM")
                .orElseThrow(() -> new MissingRecordException("HEAD.GEDC.FORM"));
        GedcomFormType formType = GedcomFormType.forKey(formKey);
        if (formType == null) {
            throw new UnrecognisedGedcomForm(formKey);
        }

        String version = chunk.findFirstValue("VERS")
                .orElseThrow(() -> new MissingRecordException("HEAD.GEDC.VERS"));
        String formVersion = chunk.findFirstValue("FORM.VERS").orElse(null);

        builder.gedcomMeta(Header.Meta.builder()
                .version(version)
                .formType(formType)
                .formVersion(formVersion)
                .build());
    }

    /**
     * Reads the {@code SOUR} sub record; every nested value is optional and defaults to {@code null}.
     */
    private void parseGedcomSource(RecordChunk chunk, Header.HeaderBuilder builder) {
        String systemId = chunk.getValue();
        String version = chunk.findFirstValue("VERS").orElse(null);
        String productName = chunk.findFirstValue("NAME").orElse(null);
        String businessName = chunk.findFirstValue("CORP").orElse(null);

        builder.source(Header.Source.builder()
                .systemId(systemId)
                .version(version)
                .productName(productName)
                .businessName(businessName)
                .address(chunk.findFirst("CORP.ADDR").map(RecordParser.ADDRESS::parse).orElse(null))
                .build());
    }
}

View File

@ -13,13 +13,45 @@ public class RecordChunk {
private Map<String, RecordChunk> recordIndex;
private int lineNumber;
private String sourceLine;
private int level;
private String tag;
private String value;
private RecordChunk previous, next;
@Builder.Default
private List<RecordChunk> subRecords = new LinkedList<>();
/**
 * Normalizes CONT and CONC sub records by folding their values into this record's value,
 * then removes them from the sub record list. Every sub record is normalized recursively.
 * <ul>
 *   <li>CONT: a line separator followed by the sub record's value is appended.</li>
 *   <li>CONC: the sub record's value is appended directly.</li>
 * </ul>
 * A CONT/CONC sub record without a value contributes no text of its own (CONT still
 * contributes the line separator), so blank continuation lines do not end up as the
 * literal text "null".
 * <p>
 * NOTE(review): when this record has no value, its sub records are only normalized
 * recursively; CONT/CONC children are neither merged nor removed in that case.
 */
void normalize() {
    if (this.value == null) {
        this.subRecords.forEach(RecordChunk::normalize);
        return;
    }

    StringBuilder merged = new StringBuilder(this.value);
    for (RecordChunk child : this.subRecords) {
        String childTag = child.getTag();
        // Constant-first comparison keeps a sub record without a tag from raising an NPE.
        boolean isCont = "CONT".equals(childTag);
        if (isCont || "CONC".equals(childTag)) {
            if (isCont) {
                merged.append(System.lineSeparator());
            }
            String continuation = child.getValue();
            if (continuation != null) {
                merged.append(continuation);
            }
        }
        child.normalize();
    }

    // Remove by tag rather than through a HashSet, so removal does not depend on
    // RecordChunk.equals()/hashCode().
    this.subRecords.removeIf(child ->
            "CONT".equals(child.getTag()) || "CONC".equals(child.getTag()));
    this.value = merged.toString();
}
public Optional<String> findFirstValue(String tag) {
return findFirst(tag)
.map(record -> record.getValue());

View File

@ -0,0 +1,15 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.exception.GedcomException;
import de.nth.chronicle.gedcom.type.Address;
import de.nth.chronicle.gedcom.type.Header;
/**
 * Converts a {@link RecordChunk} into a typed value.
 *
 * @param <T> type produced by the parser
 */
public interface RecordParser<T> {

    /** Shared parser instance for header records. */
    RecordParser<Header> HEADER = new HeaderParser();

    /** Shared parser instance for address records. */
    RecordParser<Address> ADDRESS = new AddressParser();

    /**
     * @param chunk record chunk to convert
     * @return the value parsed from {@code chunk}
     * @throws GedcomException if the chunk cannot be converted
     */
    T parse(RecordChunk chunk) throws GedcomException;
}

View File

@ -1,8 +1,8 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.parser.exception.GedcomException;
import de.nth.chronicle.gedcom.parser.exception.InvalidLineException;
import de.nth.chronicle.gedcom.parser.exception.MissingBomException;
import de.nth.chronicle.gedcom.exception.GedcomException;
import de.nth.chronicle.gedcom.exception.InvalidLineException;
import de.nth.chronicle.gedcom.exception.MissingBomException;
import de.nth.chronicle.gedcom.util.EncodingUtils;
import java.io.BufferedReader;
@ -14,7 +14,7 @@ import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
class GedcomTokenizer {
class Tokenizer {
// Line layout: optional leading whitespace, the level digits, a second token made of
// letters, digits, '_' and '@', then optionally one whitespace character and the rest of the line.
//   group 1: all level digits (the quantifier sits inside the group; with it outside,
//            only the last digit of a multi-digit level was captured)
//   group 2: the second token; '0' is allowed, so tokens such as "@I10@" match
//   group 3: remainder of the line, if present
// ^\s*(\p{Digit}+)\s+([a-zA-Z0-9_@]+)(?:\s(.*))?
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit}+)\\s+([a-zA-Z0-9_@]+)(?:\\s(.*))?");
@ -26,7 +26,9 @@ class GedcomTokenizer {
private final Stack<RecordChunk> stack = new Stack<>();
private final BufferedReader reader;
public GedcomTokenizer(InputStream stream) {
private RecordChunk lastChunk;
public Tokenizer(InputStream stream) {
Charset charset = validateEncoding(stream);
this.reader = new BufferedReader(new InputStreamReader(stream, charset));
@ -61,6 +63,8 @@ class GedcomTokenizer {
this.currentLineNumber++;
}
this.records.forEach(record -> record.normalize());
return this.records;
}
@ -86,6 +90,8 @@ class GedcomTokenizer {
.level(level)
.tag(tag)
.value(value)
.lineNumber(this.currentLineNumber)
.sourceLine(this.currentLine)
.build();
return record;
@ -115,6 +121,12 @@ class GedcomTokenizer {
this.stack.peek().getSubRecords().add(record);
this.stack.push(record);
}
if(this.lastChunk != null) {
this.lastChunk.setNext(record);
}
record.setPrevious(this.lastChunk);
this.lastChunk = record;
}
/**

View File

@ -0,0 +1,19 @@
package de.nth.chronicle.gedcom.parser;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.Getter;
/**
 * Immutable warning: a message plus the source line and line number it refers to.
 */
@AllArgsConstructor
@Getter
public class Warning {

    /** Text describing the warning. */
    private final String message;
    /** Source line the warning refers to. */
    private final String line;
    /** Number of {@link #line}. */
    private final int lineNumber;

    /**
     * @return the message, the quoted line and the line number in a single sentence
     */
    @Override
    public String toString() {
        final String template = "Warning: %s '%s' at line %d";
        return String.format(template, message, line, lineNumber);
    }
}

View File

@ -0,0 +1,21 @@
package de.nth.chronicle.gedcom.type;
import lombok.Builder;
import lombok.Data;
import java.util.List;
/**
 * Address data holder. Each field carries the value of the GEDCOM tag named in its
 * trailing comment; the getters, setters and builder come from the Lombok annotations.
 */
@Data
@Builder
public class Address {
private String line1; /* ADR1 <ADDRESS_LINE1> */
private String line2; /* ADR2 <ADDRESS_LINE2> */
private String line3; /* ADR3 <ADDRESS_LINE3> */
private String city; /* CITY <ADDRESS_CITY> */
private String state; /* STAE <ADDRESS_STATE> */
private String postalCode; /* POST <ADDRESS_POSTAL_CODE> */
private String country; /* CTRY <ADDRESS_COUNTRY> */
}

View File

@ -0,0 +1,22 @@
package de.nth.chronicle.gedcom.type;
/**
 * GEDCOM form types known to the parser, each identified by the key used in GEDCOM data.
 */
public enum GedcomFormType {

    LINEAGE_LINKED("LINEAGE-LINKED");

    /** Key under which this form type appears in GEDCOM data. */
    private final String key;

    GedcomFormType(String key) {
        this.key = key;
    }

    /**
     * @return the key of this form type
     */
    public String getKey() {
        return this.key;
    }

    /**
     * Looks up a form type by its key; the comparison is exact and case-sensitive.
     *
     * @param key key to look up; may be {@code null}
     * @return the matching form type, or {@code null} when no form type has that key
     */
    public static GedcomFormType forKey(String key) {
        GedcomFormType match = null;
        GedcomFormType[] candidates = values();
        for (int i = 0; i < candidates.length && match == null; i++) {
            if (candidates[i].getKey().equals(key)) {
                match = candidates[i];
            }
        }
        return match;
    }
}

View File

@ -0,0 +1,52 @@
package de.nth.chronicle.gedcom.type;
import lombok.Builder;
import lombok.Data;
import java.time.LocalDate;
import java.time.LocalTime;
/**
 * Header data holder. Each field carries the value of the GEDCOM tag named in its
 * trailing comment; the getters, setters and builders come from the Lombok annotations.
 */
@Data
@Builder
public class Header {
private String encoding; /* CHAR <CHARACTER_SET> */
private String contentDescription; /* NOTE <GEDCOM_CONTENT_DESCRIPTION> */
private String language; /* LANG <LANGUAGE_OF_TEXT> */
private String copyright; /* COPR <COPYRIGHT_GEDCOM_FILE> */
private String fileName; /* FILE <FILE_NAME> */
private LocalDate transmissionDate; /* DATE <TRANSMISSION_DATE> */
private LocalTime transmissionTime; /* TIME <TIME_VALUE> */
private Meta gedcomMeta; /* GEDC */
private Source source; /* SOUR */
/** Contents of the GEDC sub record of the header. */
@Data
@Builder
public static class Meta {
private String version; /* VERS <VERSION_NUMBER> */
private GedcomFormType formType; /* FORM <GEDCOM_FORM> */
private String formVersion; /* FORM.VERS (version nested under FORM) */
}
/** Contents of the SOUR sub record of the header. */
@Data
@Builder
public static class Source {
private String systemId; /* SOUR <APPROVED_SYSTEM_ID> */
private String version; /* VERS <VERSION_NUMBER> */
private String productName; /* NAME <NAME_OF_PRODUCT> */
private String businessName; /* CORP <NAME_OF_BUSINESS> */
private Address address; /* ADDR <ADDRESS_STRUCTURE> */
private String name; /* DATA <NAME_OF_SOURCE_DATA> */
private LocalDate publicationDate; /* DATE <PUBLICATION_DATE> */
private LocalTime publicationTime; /* TIME <PUBLICATION_TIME> */
private String copyright; /* COPR <COPYRIGHT_SOURCE_DATA> */
}
}

View File

@ -0,0 +1,23 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.Gedcom;
import org.junit.jupiter.api.Test;
import java.io.InputStream;
public class GedcomReaderTests {
@Test
void testMinimalGedcom() {
InputStream stream = GedcomReaderTests.class.getResourceAsStream("/examples/555SAMPLE.ged");
GedcomReader reader = new GedcomReader();
Gedcom gedcom = reader.read(stream);
System.out.println(gedcom);
}
}

View File

@ -43,16 +43,14 @@ public class GedcomTokenizerTests {
}
void validateLine(String line) {
assertTrue(GedcomTokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
assertTrue(Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
}
@Test
void testBasicTokenizerFunctionality() throws Exception {
InputStream stream = GedcomTokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
GedcomTokenizer tokenizer = new GedcomTokenizer(stream);
Tokenizer tokenizer = new Tokenizer(stream);
List<RecordChunk> records = null;
try {
@ -61,6 +59,8 @@ public class GedcomTokenizerTests {
e.printStackTrace();
}
assertEquals(3, records.size());
}