WIP basic parsing system
parent
e79d6f6236
commit
796680f620
|
@ -1,5 +1,6 @@
|
|||
package de.nth.chronicle.gedcom;
|
||||
|
||||
import de.nth.chronicle.gedcom.type.Header;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
|
@ -7,6 +8,6 @@ import lombok.Data;
|
|||
@Data
|
||||
public class Gedcom {
|
||||
|
||||
|
||||
private final Header header;
|
||||
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package de.nth.chronicle.gedcom.parser.exception;
|
||||
package de.nth.chronicle.gedcom.exception;
|
||||
|
||||
public class GedcomException extends RuntimeException {
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package de.nth.chronicle.gedcom.parser.exception;
|
||||
package de.nth.chronicle.gedcom.exception;
|
||||
|
||||
public class InvalidEncodingException extends GedcomException {
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package de.nth.chronicle.gedcom.parser.exception;
|
||||
package de.nth.chronicle.gedcom.exception;
|
||||
|
||||
public class InvalidLineException extends GedcomException {
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package de.nth.chronicle.gedcom.parser.exception;
|
||||
package de.nth.chronicle.gedcom.exception;
|
||||
|
||||
public class MissingBomException extends GedcomException {
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
package de.nth.chronicle.gedcom.exception;
|
||||
|
||||
public class MissingRecordException extends GedcomException {
|
||||
|
||||
public MissingRecordException(String recordPath) {
|
||||
super(String.format("Missing Record: '%s'!", recordPath));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
package de.nth.chronicle.gedcom.exception;
|
||||
|
||||
public class UnrecognisedGedcomForm extends GedcomException {
|
||||
|
||||
public UnrecognisedGedcomForm(String form) {
|
||||
super(String.format("Unrecognised GEDCOM Form '%s'!", form));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.exception.GedcomException;
|
||||
import de.nth.chronicle.gedcom.type.Address;
|
||||
|
||||
public class AddressParser implements RecordParser<Address> {
|
||||
|
||||
@Override
|
||||
public Address parse(RecordChunk chunk) throws GedcomException {
|
||||
|
||||
return Address.builder()
|
||||
.line1(chunk.findFirstValue("ADR1").orElse(null))
|
||||
.line2(chunk.findFirstValue("ADR2").orElse(null))
|
||||
.line3(chunk.findFirstValue("ADR3").orElse(null))
|
||||
.city(chunk.findFirstValue("CITY").orElse(null))
|
||||
.state(chunk.findFirstValue("STAE").orElse(null))
|
||||
.postalCode(chunk.findFirstValue("POST").orElse(null))
|
||||
.country(chunk.findFirstValue("CTRY").orElse(null))
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,12 +1,38 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.Gedcom;
|
||||
import de.nth.chronicle.gedcom.parser.exception.GedcomException;
|
||||
import de.nth.chronicle.gedcom.exception.GedcomException;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
public interface GedcomReader {
|
||||
public class GedcomReader {
|
||||
|
||||
private GedcomReaderContext context;
|
||||
private Gedcom result;
|
||||
|
||||
/**
 * Creates a reader. No state is set up here; the fields stay unset until a read happens.
 */
public GedcomReader() {
    // intentionally empty
}
|
||||
|
||||
public Gedcom read(InputStream stream) throws GedcomException {
|
||||
if(this.result != null) return this.result;
|
||||
|
||||
Tokenizer tokenizer = new Tokenizer(stream);
|
||||
List<RecordChunk> chunks = tokenizer.parseRecordsChunks();
|
||||
|
||||
GedcomReaderContext context = new GedcomReaderContext(chunks);
|
||||
Gedcom.GedcomBuilder builder = context.getGedcomBuilder();
|
||||
|
||||
for(RecordChunk chunk : chunks) {
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
this.result = context.getGedcomBuilder().build();
|
||||
return this.result;
|
||||
}
|
||||
|
||||
public Gedcom read(InputStream stream) throws GedcomException;
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.Gedcom;
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
@Getter
|
||||
public class GedcomReaderContext {
|
||||
|
||||
private final Gedcom.GedcomBuilder gedcomBuilder;
|
||||
private final List<RecordChunk> recordChunks;
|
||||
|
||||
private final List<Warning> warnings;
|
||||
|
||||
private RecordChunk currentChunk;
|
||||
private int index = -1;
|
||||
|
||||
public boolean hasNext() {
|
||||
return (this.index+1) < this.recordChunks.size();
|
||||
}
|
||||
|
||||
public RecordChunk nextChunk() {
|
||||
this.index++;
|
||||
if(this.index >= this.recordChunks.size()) {
|
||||
this.currentChunk = null;
|
||||
return null;
|
||||
}
|
||||
this.currentChunk = this.recordChunks.get(this.index);
|
||||
return this.currentChunk;
|
||||
}
|
||||
|
||||
public RecordChunk getChunk() {
|
||||
return this.currentChunk;
|
||||
}
|
||||
|
||||
GedcomReaderContext(List<RecordChunk> chunks) {
|
||||
this.gedcomBuilder = Gedcom.builder();
|
||||
this.warnings = new LinkedList<>();
|
||||
this.recordChunks = chunks;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -1,9 +0,0 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.Gedcom;
|
||||
|
||||
public interface GedcomRecordParser {
|
||||
|
||||
public void parse(RecordChunk token, Gedcom.GedcomBuilder builder);
|
||||
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.exception.GedcomException;
|
||||
import de.nth.chronicle.gedcom.exception.MissingRecordException;
|
||||
import de.nth.chronicle.gedcom.exception.UnrecognisedGedcomForm;
|
||||
import de.nth.chronicle.gedcom.type.GedcomFormType;
|
||||
import de.nth.chronicle.gedcom.type.Header;
|
||||
|
||||
public class HeaderParser implements RecordParser<Header> {
|
||||
|
||||
@Override
|
||||
public Header parse(RecordChunk chunk) throws GedcomException {
|
||||
|
||||
Header.HeaderBuilder builder = Header.builder();
|
||||
|
||||
for(RecordChunk subChunk : chunk.getSubRecords()) {
|
||||
|
||||
switch(subChunk.getTag()) {
|
||||
case "GEDC":
|
||||
parseGedcomMeta(subChunk, builder);
|
||||
break;
|
||||
case "CHAR":
|
||||
builder.encoding(subChunk.getValue());
|
||||
break;
|
||||
case "SOUR":
|
||||
parseGedcomSource(subChunk, builder);
|
||||
break;
|
||||
case "DATE":
|
||||
break;
|
||||
case "FILE":
|
||||
builder.fileName(subChunk.getValue());
|
||||
break;
|
||||
case "LANG":
|
||||
builder.language(subChunk.getValue());
|
||||
break;
|
||||
case "COPR":
|
||||
builder.copyright(subChunk.getValue());
|
||||
break;
|
||||
case "PLAC":
|
||||
break;
|
||||
case "NOTE":
|
||||
builder.contentDescription(subChunk.getValue());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
private void parseGedcomMeta(RecordChunk chunk, Header.HeaderBuilder builder) {
|
||||
|
||||
GedcomFormType formType = GedcomFormType.forKey(chunk.findFirstValue("FORM")
|
||||
.orElseThrow(() -> new MissingRecordException("HEAD.GEDC.FORM")));
|
||||
|
||||
if(formType == null) {
|
||||
throw new UnrecognisedGedcomForm(chunk.findFirstValue("FORM").orElse(null));
|
||||
}
|
||||
|
||||
builder.gedcomMeta(Header.Meta.builder()
|
||||
.version(chunk.findFirstValue("VERS")
|
||||
.orElseThrow(() -> new MissingRecordException("HEAD.GEDC.VERS")))
|
||||
.formType(formType)
|
||||
.formVersion(chunk.findFirstValue("FORM.VERS")
|
||||
.orElse(null))
|
||||
.build());
|
||||
}
|
||||
|
||||
private void parseGedcomSource(RecordChunk chunk, Header.HeaderBuilder builder) {
|
||||
|
||||
|
||||
builder.source(Header.Source.builder()
|
||||
.systemId(chunk.getValue())
|
||||
.version(chunk.findFirstValue("VERS").orElse(null))
|
||||
.productName(chunk.findFirstValue("NAME").orElse(null))
|
||||
.businessName(chunk.findFirstValue("CORP").orElse(null))
|
||||
.address(chunk.findFirst("CORP.ADDR").map(RecordParser.ADDRESS::parse).orElse(null))
|
||||
.build());
|
||||
}
|
||||
|
||||
}
|
|
@ -13,13 +13,45 @@ public class RecordChunk {
|
|||
|
||||
private Map<String, RecordChunk> recordIndex;
|
||||
|
||||
private int lineNumber;
|
||||
private String sourceLine;
|
||||
|
||||
private int level;
|
||||
private String tag;
|
||||
private String value;
|
||||
|
||||
private RecordChunk previous, next;
|
||||
|
||||
@Builder.Default
|
||||
private List<RecordChunk> subRecords = new LinkedList<>();
|
||||
|
||||
/**
|
||||
* Normalizes CONT & CONC Sub Records by appending the line value
|
||||
* to the parent record line value
|
||||
*/
|
||||
void normalize() {
|
||||
|
||||
if(this.value == null) {
|
||||
this.subRecords.forEach(record -> record.normalize());
|
||||
return;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(this.value);
|
||||
Set<RecordChunk> delete = new HashSet<>();
|
||||
for(RecordChunk chunk : this.subRecords) {
|
||||
if(chunk.getTag().equals("CONT")) {
|
||||
sb.append(System.lineSeparator()).append(chunk.getValue());
|
||||
delete.add(chunk);
|
||||
}else if(chunk.getTag().equals("CONC")) {
|
||||
sb.append(chunk.getValue());
|
||||
delete.add(chunk);
|
||||
}
|
||||
chunk.normalize();
|
||||
}
|
||||
this.subRecords.removeAll(delete);
|
||||
this.value = sb.toString();
|
||||
}
|
||||
|
||||
public Optional<String> findFirstValue(String tag) {
|
||||
return findFirst(tag)
|
||||
.map(record -> record.getValue());
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.exception.GedcomException;
|
||||
import de.nth.chronicle.gedcom.type.Address;
|
||||
import de.nth.chronicle.gedcom.type.Header;
|
||||
|
||||
public interface RecordParser<T> {
|
||||
|
||||
public T parse(RecordChunk chunk) throws GedcomException;
|
||||
|
||||
|
||||
public static RecordParser<Header> HEADER = new HeaderParser();
|
||||
public static RecordParser<Address> ADDRESS = new AddressParser();
|
||||
|
||||
}
|
|
@ -1,8 +1,8 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.parser.exception.GedcomException;
|
||||
import de.nth.chronicle.gedcom.parser.exception.InvalidLineException;
|
||||
import de.nth.chronicle.gedcom.parser.exception.MissingBomException;
|
||||
import de.nth.chronicle.gedcom.exception.GedcomException;
|
||||
import de.nth.chronicle.gedcom.exception.InvalidLineException;
|
||||
import de.nth.chronicle.gedcom.exception.MissingBomException;
|
||||
import de.nth.chronicle.gedcom.util.EncodingUtils;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
|
@ -14,7 +14,7 @@ import java.util.*;
|
|||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
class GedcomTokenizer {
|
||||
class Tokenizer {
|
||||
|
||||
// ^\s*(\p{Digit})+\s+([a-zA-Z1-9_]+)(?:\s(.*)$)?
|
||||
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit})+\\s+([a-zA-Z1-9_@]+)(?:\\s(.*))?");
|
||||
|
@ -26,7 +26,9 @@ class GedcomTokenizer {
|
|||
private final Stack<RecordChunk> stack = new Stack<>();
|
||||
private final BufferedReader reader;
|
||||
|
||||
public GedcomTokenizer(InputStream stream) {
|
||||
private RecordChunk lastChunk;
|
||||
|
||||
public Tokenizer(InputStream stream) {
|
||||
Charset charset = validateEncoding(stream);
|
||||
|
||||
this.reader = new BufferedReader(new InputStreamReader(stream, charset));
|
||||
|
@ -61,6 +63,8 @@ class GedcomTokenizer {
|
|||
this.currentLineNumber++;
|
||||
}
|
||||
|
||||
this.records.forEach(record -> record.normalize());
|
||||
|
||||
return this.records;
|
||||
}
|
||||
|
||||
|
@ -86,6 +90,8 @@ class GedcomTokenizer {
|
|||
.level(level)
|
||||
.tag(tag)
|
||||
.value(value)
|
||||
.lineNumber(this.currentLineNumber)
|
||||
.sourceLine(this.currentLine)
|
||||
.build();
|
||||
|
||||
return record;
|
||||
|
@ -115,6 +121,12 @@ class GedcomTokenizer {
|
|||
this.stack.peek().getSubRecords().add(record);
|
||||
this.stack.push(record);
|
||||
}
|
||||
|
||||
if(this.lastChunk != null) {
|
||||
this.lastChunk.setNext(record);
|
||||
}
|
||||
record.setPrevious(this.lastChunk);
|
||||
this.lastChunk = record;
|
||||
}
|
||||
|
||||
/**
|
|
@ -0,0 +1,19 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
public class Warning {
|
||||
|
||||
private final String message;
|
||||
private final String line;
|
||||
private final int lineNumber;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Warning: %s '%s' at line %d", this.message, this.line, this.lineNumber);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
package de.nth.chronicle.gedcom.type;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class Address {
|
||||
|
||||
private String line1; /* ADR1 <ADDRESS_LINE1> */
|
||||
private String line2; /* ADR2 <ADDRESS_LINE2> */
|
||||
private String line3; /* ADR3 <ADDRESS_LINE3> */
|
||||
|
||||
private String city; /* CITY <ADDRESS_CITY> */
|
||||
private String state; /* STAE <ADDRESS_STATE> */
|
||||
private String postalCode; /* POST <ADDRESS_POSTAL_CODE> */
|
||||
private String country; /* CTRY <ADDRESS_COUNTRY> */
|
||||
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
package de.nth.chronicle.gedcom.type;
|
||||
|
||||
/**
 * GEDCOM forms known to this library, each identified by the key used in the data.
 */
public enum GedcomFormType {

    LINEAGE_LINKED("LINEAGE-LINKED");

    /** Key as it appears in the data; fixed per constant. */
    private final String key;

    GedcomFormType(String key) {
        this.key = key;
    }

    /**
     * @return the key identifying this form
     */
    public String getKey() {
        return this.key;
    }

    /**
     * Looks up the form whose key equals the given string exactly (case-sensitive).
     *
     * @param key the key to look up; may be {@code null}
     * @return the matching form, or {@code null} if no constant has that key
     */
    public static GedcomFormType forKey(String key) {
        for (GedcomFormType type : values()) {
            if (type.getKey().equals(key)) {
                return type;
            }
        }
        return null;
    }

}
|
|
@ -0,0 +1,52 @@
|
|||
package de.nth.chronicle.gedcom.type;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalTime;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class Header {
|
||||
|
||||
private String encoding; /* CHAR <CHARACTER_SET> */
|
||||
private String contentDescription; /* NOTE <GEDCOM_CONTENT_DESCRIPTION> */
|
||||
private String language; /* LANG <LANGUAGE_OF_TEXT> */
|
||||
private String copyright; /* COPR <COPYRIGHT_GEDCOM_FILE> */
|
||||
private String fileName; /* FILE <FILE_NAME> */
|
||||
private LocalDate transmissionDate; /* DATE <TRANSMISSION_DATE> */
|
||||
private LocalTime transmissionTime; /* TIME <TIME_VALUE> */
|
||||
|
||||
private Meta gedcomMeta; /* GEDC */
|
||||
private Source source; /* SOUR */
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public static class Meta {
|
||||
|
||||
private String version; /* VERS <VERSION_NUMBER> */
|
||||
private GedcomFormType formType; /* FORM <GEDCOM_FORM> */
|
||||
private String formVersion; /* */
|
||||
|
||||
}
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public static class Source {
|
||||
|
||||
private String systemId; /* SOUR <APPROVED_SYSTEM_ID> */
|
||||
private String version; /* VERS <VERSION_NUMBER> */
|
||||
private String productName; /* NAME <NAME_OF_PRODUCT> */
|
||||
private String businessName; /* CORP <NAME_OF_BUSINESS> */
|
||||
private Address address; /* ADDR <ADDRESS_STRUCTURE> */
|
||||
private String name; /* DATA <NAME_OF_SOURCE_DATA> */
|
||||
private LocalDate publicationDate; /* DATE <PUBLICATION_DATE> */
|
||||
private LocalTime publicationTime; /* TIME <PUBLICATION_TIME> */
|
||||
private String copyright; /* COPR <COPYRIGHT_SOURCE_DATA> */
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.Gedcom;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
public class GedcomReaderTests {
|
||||
|
||||
@Test
|
||||
void testMinimalGedcom() {
|
||||
|
||||
InputStream stream = GedcomReaderTests.class.getResourceAsStream("/examples/555SAMPLE.ged");
|
||||
|
||||
GedcomReader reader = new GedcomReader();
|
||||
|
||||
Gedcom gedcom = reader.read(stream);
|
||||
|
||||
System.out.println(gedcom);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -43,16 +43,14 @@ public class GedcomTokenizerTests {
|
|||
}
|
||||
|
||||
void validateLine(String line) {
|
||||
assertTrue(GedcomTokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
|
||||
assertTrue(Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBasicTokenizerFunctionality() throws Exception {
|
||||
|
||||
InputStream stream = GedcomTokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
|
||||
|
||||
GedcomTokenizer tokenizer = new GedcomTokenizer(stream);
|
||||
|
||||
Tokenizer tokenizer = new Tokenizer(stream);
|
||||
List<RecordChunk> records = null;
|
||||
|
||||
try {
|
||||
|
@ -61,6 +59,8 @@ public class GedcomTokenizerTests {
|
|||
e.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
|
||||
assertEquals(3, records.size());
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue