FEAT GEDCOM Header Parser v1

master
Niclas Thobaben 2020-11-27 02:16:38 +01:00
parent 04350dd2fb
commit 84bf8a596a
14 changed files with 347 additions and 165 deletions

View File

@@ -1,7 +1,12 @@
package de.nth.chronicle.gedcom;
import lombok.Builder;
import lombok.Data;
@Builder(toBuilder = true)
@Data
public class Gedcom {
private final GedcomHeader header;
}

View File

@@ -0,0 +1,22 @@
package de.nth.chronicle.gedcom;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.NonNull;
@Data
@Builder(toBuilder = true)
public class GedcomHeader {
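// Typical GEDCOM 5.5.5 header values (illustrative only, not enforced by this class):
// versionNumber "5.5.5", gedcomForm "LINEAGE-LINKED", gedcomFormVersion "5.5.5", characterSet "UTF-8".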
private final String versionNumber;
@NonNull
private final String gedcomForm;
@NonNull
private final String gedcomFormVersion;
@NonNull
private final String characterSet;
}

View File

@@ -0,0 +1,40 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.Gedcom;
import de.nth.chronicle.gedcom.GedcomVersion;
import de.nth.chronicle.gedcom.parser.records.HeaderRecordParser;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class Gedcom555Parser implements GedcomParser {
private static final Map<String, GedcomRecordParser> RECORD_PARSER_MAP = new HashMap<>();
static {
RECORD_PARSER_MAP.put("HEAD", new HeaderRecordParser());
}
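// Record parsers are dispatched by top-level record tag; only HEAD is registered in this commit,
// so every other record type falls through to the warning branch in parseGedcom below.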
@Override
public Gedcom parseGedcom(InputStream stream) throws Exception {
GedcomTokenizer tokenizer = GedcomTokenizer.create(stream, TokenizerOptions.builder()
.build());
List<RecordToken> tokens = tokenizer.parseRecords();
Gedcom.GedcomBuilder builder = Gedcom.builder();
for(RecordToken token : tokens) {
if(RECORD_PARSER_MAP.containsKey(token.getTag())) {
RECORD_PARSER_MAP.get(token.getTag()).parse(token, builder);
}else {
System.err.println("No Parser found for tag " + token.getTag());
}
}
return builder.build();
}
}

View File

@@ -1,117 +0,0 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.util.EncodingUtils;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
class Gedcom555Tokenizer implements GedcomTokenizer {
// ^\s*(\p{Digit})+\s+([a-zA-Z1-9_]+)(?:\s(.*)$)?
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit})+\\s+([a-zA-Z1-9_@]+)(?:\\s(.*))?");
private final TokenizerOptions options;
private final BufferedReader reader;
Gedcom555Tokenizer(BufferedReader reader, TokenizerOptions options) {
this.reader = reader;
this.options = options;
}
@Override
public Set<RecordToken> parseRecords() throws Exception {
Set<RecordToken> tokens = new HashSet<>();
Stack<RecordToken> stack = new Stack<>();
/*
* GEDCOM 5.5.5 Reader Rules
* import each line value as-is
* do not trim trailing white space from any GEDCOM line or line value
* do not trim leading white space from any line value
* */
String line;
while((line = reader.readLine()) != null) {
Matcher matcher = matchLine(line);
if(!matcher.matches()) {
//TODO throw Exception
continue;
}
int level = Integer.parseInt(matcher.group(1));
String tag = matcher.group(2);
String value = matcher.group(3);
RecordToken record = RecordToken.builder()
.level(level)
.tag(tag)
.value(value)
.build();
if(stack.isEmpty()) {
stack.push(record);
tokens.add(record);
} else if(level == 0) {
stack.clear();
stack.push(record);
tokens.add(record);
} else if(stack.peek().getLevel() == level) {
stack.pop();
if(!stack.isEmpty()) {
stack.peek().getSubRecords().add(record);
}
stack.push(record);
} else if(stack.peek().getLevel() < level) {
stack.peek().getSubRecords().add(record);
stack.push(record);
}
}
return tokens;
}
public static Gedcom555Tokenizer create(InputStream stream, TokenizerOptions options) {
Charset charset = validateEncoding(stream);
return new Gedcom555Tokenizer(new BufferedReader(new InputStreamReader(stream, charset)), options);
}
/**
* Removes possible UTF-8/UTF-16 BOM and Matches a GEDCOM Line.
*
* @param line to match
* @return Regex {@link Matcher}
*/
public static Matcher matchLine(String line) {
if(line.contains("\uFEFF")) {
line = line.replace("\uFEFF", "");
}
return LINE_REGEX.matcher(line);
}
static Charset validateEncoding(InputStream stream) {
/*
demand that file starts with a Byte Order Mark (BOM)
demand that the encoding is either UTF-8 or UTF-16
must support both UTF-8 and UTF-16 GEDCOM files
must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
*/
Charset charset = EncodingUtils.getCharsetForBOM(stream);
if(charset == null) {
throw new InvalidGedcomException.MissingBOM();
}
return charset;
}
}

View File

@@ -0,0 +1,20 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.Gedcom;
import de.nth.chronicle.gedcom.GedcomVersion;
import java.io.InputStream;
public interface GedcomParser {
public Gedcom parseGedcom(InputStream stream) throws Exception;
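// Usage sketch (variable names are illustrative, not part of this commit):
//   Gedcom gedcom = GedcomParser.getDefault().parseGedcom(inputStream);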
public static GedcomParser getDefault() {
return getParser(GedcomVersion.VERSION_5_5_5);
}
public static GedcomParser getParser(GedcomVersion version) {
return new Gedcom555Parser();
}
}

View File

@@ -0,0 +1,9 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.Gedcom;
public interface GedcomRecordParser {
public void parse(RecordToken token, Gedcom.GedcomBuilder builder);
}

View File

@@ -1,9 +1,129 @@
package de.nth.chronicle.gedcom.parser;
import java.util.Set;
import de.nth.chronicle.gedcom.util.EncodingUtils;
public interface GedcomTokenizer {
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public Set<RecordToken> parseRecords() throws Exception;
class GedcomTokenizer {
// ^\s*(\p{Digit}+)\s+([a-zA-Z0-9_@]+)(?:\s(.*))?
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit}+)\\s+([a-zA-Z0-9_@]+)(?:\\s(.*))?");
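// Illustrative matches (examples, not taken from this commit's test data):
//   "0 HEAD"        -> level=0, tag="HEAD",  value=null
//   "2 VERS 5.5.5"  -> level=2, tag="VERS",  value="5.5.5"
//   "0 @U1@ SUBM"   -> level=0, tag="@U1@",  value="SUBM" (xref pointer captured as the tag)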
private final TokenizerOptions options;
private final BufferedReader reader;
GedcomTokenizer(BufferedReader reader, TokenizerOptions options) {
this.reader = reader;
this.options = options;
}
public List<RecordToken> parseRecords() throws Exception {
List<RecordToken> tokens = new LinkedList<>();
Stack<RecordToken> stack = new Stack<>();
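// The stack mirrors the current nesting path: a level-0 line starts a new top-level record,
// an equal level replaces the current sibling, a deeper level descends into a sub-record,
// and a shallower level pops back up to the matching ancestor before attaching the record.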
/*
* GEDCOM 5.5.5 Reader Rules
* import each line value as-is
* do not trim trailing white space from any GEDCOM line or line value
* do not trim leading white space from any line value
* */
String line;
int lineNumber = 1;
while((line = reader.readLine()) != null) {
System.out.println("Tokenizer: tokenize line " + lineNumber);
Matcher matcher = matchLine(line);
if(!matcher.matches()) {
throw new InvalidGedcomException.InvalidLine(lineNumber, line);
}
int level = Integer.parseInt(matcher.group(1));
String tag = matcher.group(2);
String value = matcher.group(3);
RecordToken record = RecordToken.builder()
.level(level)
.tag(tag)
.value(value)
.build();
if(stack.isEmpty()) {
System.out.println("First Record: " + line);
stack.push(record);
tokens.add(record);
} else if(level == 0) {
System.out.println("New Record: " + line);
stack.clear();
stack.push(record);
tokens.add(record);
} else if(stack.peek().getLevel() == level) {
System.out.println("Same Level Record: " + line);
stack.pop();
if(!stack.isEmpty()) {
stack.peek().getSubRecords().add(record);
}
stack.push(record);
} else if(stack.peek().getLevel() < level) {
System.out.println("Next Level Record: " + line);
stack.peek().getSubRecords().add(record);
stack.push(record);
} else if(stack.peek().getLevel() > level) {
while(stack.peek().getLevel() >= level) {
stack.pop();
}
System.out.println("Higher Level Record: " + line + " ==== parent: " + stack.peek().getTag());
stack.peek().getSubRecords().add(record);
stack.push(record);
}
lineNumber++;
}
return tokens;
}
public static GedcomTokenizer create(InputStream stream, TokenizerOptions options) {
Charset charset = validateEncoding(stream);
System.out.println("Tokenizer: user encoding " + charset);
return new GedcomTokenizer(new BufferedReader(new InputStreamReader(stream, charset)), options);
}
/**
* Removes possible UTF-8/UTF-16 BOM and Matches a GEDCOM Line.
*
* @param line to match
* @return Regex {@link Matcher}
*/
public static Matcher matchLine(String line) {
if(line.contains("\uFEFF")) {
line = line.replace("\uFEFF", "");
}
return LINE_REGEX.matcher(line);
}
static Charset validateEncoding(InputStream stream) {
/*
demand that file starts with a Byte Order Mark (BOM)
demand that the encoding is either UTF-8 or UTF-16
must support both UTF-8 and UTF-16 GEDCOM files
must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
*/
Charset charset = EncodingUtils.getCharsetForBOM(stream);
if(charset == null) {
throw new InvalidGedcomException.MissingBOM();
}
return charset;
}
}

View File

@@ -7,11 +7,21 @@ public abstract class InvalidGedcomException extends RuntimeException {
}
public static class MissingBOM extends InvalidGedcomException {
public MissingBOM() {
super("BOM is missing!");
super("GEDCOM 5.5.5 file lacks Byte Order Mark!");
}
}
public static class InvalidLine extends InvalidGedcomException {
public InvalidLine(int line, String content) {
super(String.format("Invalid Gedcom: '%s' at line %d!", content, line));
}
}
public static class MissingRecord extends InvalidGedcomException {
public MissingRecord(String tag) {
super(String.format("Missing Record '%s'!", tag));
}
}

View File

@@ -1,21 +1,66 @@
package de.nth.chronicle.gedcom.parser;
import lombok.*;
import lombok.Builder;
import lombok.Getter;
import lombok.Setter;
import java.util.HashSet;
import java.util.Set;
import java.util.*;
@Getter
@Setter
@Builder
public class RecordToken {
private Map<String, RecordToken> recordIndex;
private int level;
private String tag;
private String value;
@Builder.Default
private Set<RecordToken> subRecords = new HashSet<>();
private List<RecordToken> subRecords = new LinkedList<>();
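// A tag may be a dotted path into nested sub-records, e.g. findFirstValue("GEDC.FORM.VERS")
// on a HEAD token walks GEDC -> FORM -> VERS; resolved lookups are cached in recordIndex.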
public Optional<String> findFirstValue(String tag) {
return findFirst(tag)
.map(record -> record.getValue());
}
public Optional<RecordToken> findFirst(String tag) {
return findIndexed(tag);
}
private Optional<RecordToken> findIndexed(String tag) {
if(this.recordIndex == null) {
this.recordIndex = new HashMap<>();
}
if(this.recordIndex.containsKey(tag)) {
return Optional.of(this.recordIndex.get(tag));
}
return searchRecord(tag).map(record -> putIndex(tag, record));
}
private Optional<RecordToken> searchRecord(String tag) {
String[] pathTokens = tag.split("\\.");
RecordToken lastRecord = this;
for(String token : pathTokens) {
for(RecordToken record : lastRecord.subRecords) {
if(record.getTag().equals(token)) {
lastRecord = record;
}
}
}
if(!lastRecord.getTag().equals(pathTokens[pathTokens.length - 1])) {
lastRecord = null;
}
return Optional.ofNullable(lastRecord);
}
private RecordToken putIndex(String tag, RecordToken record) {
this.recordIndex.put(tag, record);
return record;
}
@Override
public String toString() {

View File

@@ -1,23 +0,0 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.GedcomVersion;
import java.io.InputStream;
public interface TokenizerFactory {
public GedcomTokenizer createTokenizer(InputStream stream, TokenizerOptions options);
public static TokenizerFactory forVersion(String version) {
return forVersion(GedcomVersion.forVersionString(version));
}
public static TokenizerFactory forVersion(GedcomVersion version) {
switch(version) {
case VERSION_5_5_5: return Gedcom555Tokenizer::create;
default:
throw new IllegalArgumentException(String.format("No Implementation for version '%s' found!",
version.getVersion()));
}
}
}

View File

@@ -0,0 +1,34 @@
package de.nth.chronicle.gedcom.parser.records;
import de.nth.chronicle.gedcom.Gedcom;
import de.nth.chronicle.gedcom.GedcomHeader;
import de.nth.chronicle.gedcom.parser.GedcomRecordParser;
import de.nth.chronicle.gedcom.parser.InvalidGedcomException;
import de.nth.chronicle.gedcom.parser.RecordToken;
import java.util.function.Consumer;
public class HeaderRecordParser implements GedcomRecordParser {
@Override
public void parse(RecordToken token, Gedcom.GedcomBuilder builder) {
System.out.println("Header: " + token.getSubRecords());
GedcomHeader header = GedcomHeader.builder()
.characterSet(token.findFirstValue("CHAR")
.orElseThrow(() -> new InvalidGedcomException.MissingRecord("HEAD.CHAR")))
.versionNumber(token.findFirstValue("GEDC.VERS")
.orElse(null))
.gedcomForm(token.findFirstValue("GEDC.FORM")
.orElse(null))
.gedcomFormVersion(token.findFirstValue("GEDC.FORM.VERS")
.orElse(null))
.build();
builder.header(header);
}
}

View File

@@ -17,7 +17,10 @@ public final class EncodingUtils {
stream.read(bom);
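// BOM signatures: UTF-8 = EF BB BF (3 bytes), UTF-16BE = FE FF, UTF-16LE = FF FE.
// The UTF-8 branch reads one extra byte so the full 3-byte BOM is consumed before the
// stream is handed to the reader (assuming bom is a two-byte buffer filled above).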
if(bom[0] == (byte)0xEF) return StandardCharsets.UTF_8;
if(bom[0] == (byte)0xEF) {
stream.read();
return StandardCharsets.UTF_8;
}
if(bom[0] == (byte)0xFE) return StandardCharsets.UTF_16BE;
if(bom[0] == (byte)0xFF) return StandardCharsets.UTF_16LE;

View File

@@ -0,0 +1,20 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.Gedcom;
import de.nth.chronicle.gedcom.GedcomVersion;
import org.junit.jupiter.api.Test;
public class GedcomParserTests {
@Test
void testParserMinimal() throws Exception {
GedcomParser parser = GedcomParser.getParser(GedcomVersion.VERSION_5_5_5);
Gedcom gedcom = parser.parseGedcom(GedcomParserTests.class.getResourceAsStream("/examples/MINIMAL555.ged"));
System.out.println(gedcom);
}
}

View File

@@ -1,6 +1,5 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.GedcomVersion;
import org.junit.jupiter.api.Test;
import java.io.BufferedReader;
@@ -8,15 +7,16 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
import java.util.function.Consumer;
import static org.junit.jupiter.api.Assertions.*;
public class Gedcom555TokenizerTests {
public class GedcomTokenizerTests {
void useResourceReader(String resource, Charset charset, Consumer<BufferedReader> consumer) throws Exception{
InputStream stream = Gedcom555TokenizerTests.class.getResourceAsStream(resource);
InputStream stream = GedcomTokenizerTests.class.getResourceAsStream(resource);
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charset));
consumer.accept(reader);
@@ -28,12 +28,6 @@ public class Gedcom555TokenizerTests {
useResourceReader(resource, StandardCharsets.UTF_8, consumer);
}
@Test
void testFactoryAccess() {
assertDoesNotThrow(() -> TokenizerFactory.forVersion(GedcomVersion.VERSION_5_5_5));
assertDoesNotThrow(() -> TokenizerFactory.forVersion("5.5.5"));
}
@Test
void testBasicTokenizerLineRegex() throws Exception {
useResourceReader("/examples/MINIMAL555.ged", reader -> reader.lines().forEach(this::validateLine));
@@ -50,20 +44,20 @@ public class Gedcom555TokenizerTests {
}
void validateLine(String line) {
assertTrue(Gedcom555Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
assertTrue(GedcomTokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
}
@Test
void testBasicTokenizerFunctionality() throws Exception {
InputStream stream = Gedcom555TokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
InputStream stream = GedcomTokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
Gedcom555Tokenizer tokenizer = new Gedcom555Tokenizer(reader, TokenizerOptions.builder()
GedcomTokenizer tokenizer = new GedcomTokenizer(reader, TokenizerOptions.builder()
.build());
Set<RecordToken> records = null;
List<RecordToken> records = null;
try {
records = tokenizer.parseRecords();
@@ -72,7 +66,7 @@ public class Gedcom555TokenizerTests {
}
assertEquals(3, records.size());
}
}