REFC Tokenizer Factory + Tokenizer 5.5.5 implementation

master
Niclas Thobaben 2020-11-26 23:53:49 +01:00
parent 029e0b88c1
commit c62ab4fe26
8 changed files with 122 additions and 17 deletions


@@ -0,0 +1,7 @@
package de.nth.chronicle.gedcom;
public class Gedcom {
}


@@ -0,0 +1,26 @@
package de.nth.chronicle.gedcom;
public enum GedcomVersion {
VERSION_5_5_5("5.5.5");
private String version;
GedcomVersion(String version) {
this.version = version;
}
public String getVersion() {
return this.version;
}
public static GedcomVersion forVersionString(String version) {
for(GedcomVersion ver : values()) {
if(ver.getVersion().equals(version)) {
return ver;
}
}
throw new IllegalArgumentException(String.format("No Gedcom Version '%s' found!", version));
}
}
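For illustration only (not part of the commit), forVersionString either returns the matching constant or fails loudly:

GedcomVersion ok = GedcomVersion.forVersionString("5.5.5");   // -> VERSION_5_5_5
// GedcomVersion.forVersionString("4.0") throws IllegalArgumentException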


@@ -1,22 +1,40 @@
package de.nth.chronicle.gedcom.parser.gedcom.parser;
import lombok.SneakyThrows;
package de.nth.chronicle.gedcom.parser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Tokenizer {
class Gedcom555Tokenizer implements GedcomTokenizer {
// ^\s*(\p{Digit})+\s+([a-zA-Z1-9_@]+)(?:\s(.*))?
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit})+\\s+([a-zA-Z1-9_@]+)(?:\\s(.*))?");
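As a hedged illustration (the sample line is invented for this note), the three capture groups split a GEDCOM line into level, tag or cross-reference, and optional value:

Matcher m = Gedcom555Tokenizer.LINE_REGEX.matcher("1 NAME John /Doe/");
if (m.matches()) {
    String level = m.group(1);   // "1"
    String tag   = m.group(2);   // "NAME"
    String value = m.group(3);   // "John /Doe/"; null for lines without a value, e.g. "0 HEAD"
}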
@SneakyThrows
public Set<RecordToken> parseTokens(BufferedReader reader) {
private final TokenizerOptions options;
private final BufferedReader reader;
Gedcom555Tokenizer(BufferedReader reader, TokenizerOptions options) {
this.reader = reader;
this.options = options;
}
void validateEncoding(BufferedReader reader) {
/*
demand that file starts with a Byte Order Mark (BOM)
demand that the encoding is either UTF-8 or UTF-16
must support both UTF-8 and UTF-16 GEDCOM files
must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
*/
}
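A minimal sketch only, not part of this commit: the checks listed in the comment above could run on the raw byte stream before the BufferedReader is built; the helper name and the PushbackInputStream parameter are assumptions. Needs java.io.{IOException, PushbackInputStream} and java.nio.charset.{Charset, StandardCharsets}.

static Charset sniffBom(PushbackInputStream in) throws IOException {
    byte[] bom = in.readNBytes(3);                      // read at most the 3 BOM bytes
    if (bom.length >= 3 && (bom[0] & 0xFF) == 0xEF && (bom[1] & 0xFF) == 0xBB && (bom[2] & 0xFF) == 0xBF) {
        return StandardCharsets.UTF_8;                  // UTF-8 BOM: EF BB BF
    }
    if (bom.length >= 2 && (bom[0] & 0xFF) == 0xFE && (bom[1] & 0xFF) == 0xFF) {
        if (bom.length == 3) in.unread(bom[2]);         // push back the byte after the 2-byte BOM
        return StandardCharsets.UTF_16BE;               // Big-Endian UTF-16 BOM: FE FF
    }
    if (bom.length >= 2 && (bom[0] & 0xFF) == 0xFF && (bom[1] & 0xFF) == 0xFE) {
        if (bom.length == 3) in.unread(bom[2]);
        return StandardCharsets.UTF_16LE;               // Little-Endian UTF-16 BOM: FF FE
    }
    // no BOM, or an encoding other than UTF-8/UTF-16: reject as not-GEDCOM
    throw new IllegalArgumentException("Missing or unsupported BOM; not a GEDCOM 5.5.5 file");
}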
@Override
public Set<RecordToken> parseRecords() throws Exception {
Set<RecordToken> tokens = new HashSet<>();
Stack<RecordToken> stack = new Stack<>();
@@ -69,6 +87,10 @@ public class Tokenizer {
}
public static Gedcom555Tokenizer create(InputStream stream, TokenizerOptions options) {
return new Gedcom555Tokenizer(new BufferedReader(new InputStreamReader(stream)), options);
}
/**
* Removes a possible UTF-8/UTF-16 BOM and matches a GEDCOM line.
*


@@ -0,0 +1,9 @@
package de.nth.chronicle.gedcom.parser;
import java.util.Set;
public interface GedcomTokenizer {
public Set<RecordToken> parseRecords() throws Exception;
}


@@ -1,4 +1,4 @@
package de.nth.chronicle.gedcom.parser.gedcom.parser;
package de.nth.chronicle.gedcom.parser;
import lombok.*;


@@ -0,0 +1,23 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.GedcomVersion;
import java.io.InputStream;
public interface TokenizerFactory {
public GedcomTokenizer createTokenizer(InputStream stream, TokenizerOptions options);
public static TokenizerFactory forVersion(String version) {
return forVersion(GedcomVersion.forVersionString(version));
}
public static TokenizerFactory forVersion(GedcomVersion version) {
switch(version) {
case VERSION_5_5_5: return Gedcom555Tokenizer::create;
default:
throw new IllegalArgumentException(String.format("No Implementation for version '%s' found!",
version.getVersion()));
}
}
}
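A hedged usage sketch of the factory; the file path and the empty options object are placeholders, not part of the commit:

InputStream stream = Files.newInputStream(Path.of("family.ged"));        // placeholder source
GedcomTokenizer tokenizer = TokenizerFactory
        .forVersion("5.5.5")                                              // resolves to Gedcom555Tokenizer::create
        .createTokenizer(stream, TokenizerOptions.builder().build());
Set<RecordToken> records = tokenizer.parseRecords();                      // declared to throw Exception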


@@ -0,0 +1,12 @@
package de.nth.chronicle.gedcom.parser;
import lombok.Builder;
import lombok.Getter;
@Builder
@Getter
public class TokenizerOptions {
}


@@ -1,8 +1,6 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.parser.gedcom.parser.RecordToken;
import de.nth.chronicle.gedcom.parser.gedcom.parser.Tokenizer;
import org.junit.jupiter.api.Assertions;
import de.nth.chronicle.gedcom.GedcomVersion;
import org.junit.jupiter.api.Test;
import java.io.BufferedReader;
@@ -15,10 +13,10 @@ import java.util.function.Consumer;
import static org.junit.jupiter.api.Assertions.*;
public class TokenizerTests {
public class Gedcom555TokenizerTests {
void useResourceReader(String resource, Charset charset, Consumer<BufferedReader> consumer) throws Exception{
InputStream stream = TokenizerTests.class.getResourceAsStream(resource);
InputStream stream = Gedcom555TokenizerTests.class.getResourceAsStream(resource);
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charset));
consumer.accept(reader);
@@ -30,6 +28,12 @@ public class TokenizerTests {
useResourceReader(resource, StandardCharsets.UTF_8, consumer);
}
@Test
void testFactoryAccess() {
assertDoesNotThrow(() -> TokenizerFactory.forVersion(GedcomVersion.VERSION_5_5_5));
assertDoesNotThrow(() -> TokenizerFactory.forVersion("5.5.5"));
}
@Test
void testBasicTokenizerLineRegex() throws Exception {
useResourceReader("/examples/MINIMAL555.ged", reader -> reader.lines().forEach(this::validateLine));
@@ -46,21 +50,23 @@ public class TokenizerTests {
}
void validateLine(String line) {
assertTrue(Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
assertTrue(Gedcom555Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
}
@Test
void testBasicTokenizerFunctionality() throws Exception {
InputStream stream = TokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
InputStream stream = Gedcom555TokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
Tokenizer tokenizer = new Tokenizer();
Gedcom555Tokenizer tokenizer = new Gedcom555Tokenizer(reader, TokenizerOptions.builder()
.build());
Set<RecordToken> records = null;
try {
records = tokenizer.parseTokens(reader);
records = tokenizer.parseRecords();
}catch(Exception e) {
e.printStackTrace();
}