REFC Tokenizer Factory + Tokenizer 5.5.5 implementation
parent
029e0b88c1
commit
c62ab4fe26
|
@ -0,0 +1,7 @@
|
|||
package de.nth.chronicle.gedcom;
|
||||
|
||||
/**
 * Placeholder type for a GEDCOM document.
 *
 * <p>The class declares no members yet.
 */
public class Gedcom {

}
|
|
@ -0,0 +1,26 @@
|
|||
package de.nth.chronicle.gedcom;
|
||||
|
||||
/**
 * GEDCOM specification versions known to this library.
 *
 * <p>Each constant carries the version string by which it is found in
 * {@link #forVersionString(String)}.
 */
public enum GedcomVersion {

    VERSION_5_5_5("5.5.5");

    /** Version string of this constant; assigned once in the constructor. */
    private final String version;

    GedcomVersion(String version) {
        this.version = version;
    }

    /**
     * Returns the version string associated with this constant.
     *
     * @return the version string, e.g. {@code "5.5.5"}
     */
    public String getVersion() {
        return this.version;
    }

    /**
     * Looks up the constant whose version string equals the given text.
     *
     * @param version version string to look up; a {@code null} value matches no constant
     * @return the matching constant
     * @throws IllegalArgumentException if no constant has the given version string
     */
    public static GedcomVersion forVersionString(String version) {
        for (GedcomVersion candidate : values()) {
            if (candidate.getVersion().equals(version)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException(String.format("No Gedcom Version '%s' found!", version));
    }

}
|
|
@ -1,22 +1,40 @@
|
|||
package de.nth.chronicle.gedcom.parser.gedcom.parser;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.Stack;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class Tokenizer {
|
||||
class Gedcom555Tokenizer implements GedcomTokenizer {
|
||||
|
||||
// ^\s*(\p{Digit})+\s+([a-zA-Z1-9_]+)(?:\s(.*)$)?
|
||||
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit})+\\s+([a-zA-Z1-9_@]+)(?:\\s(.*))?");
|
||||
|
||||
@SneakyThrows
|
||||
public Set<RecordToken> parseTokens(BufferedReader reader) {
|
||||
private final TokenizerOptions options;
|
||||
private final BufferedReader reader;
|
||||
|
||||
/**
 * Creates a tokenizer that keeps the given reader and options for later use.
 *
 * @param reader  character source stored in this tokenizer
 * @param options tokenizer configuration stored in this tokenizer
 */
Gedcom555Tokenizer(BufferedReader reader, TokenizerOptions options) {
    this.options = options;
    this.reader = reader;
}
|
||||
|
||||
void validateEncoding(BufferedReader reader) {
|
||||
/*
|
||||
▪ demand that file starts with a Byte Order Mark (BOM)
|
||||
▪ demand that the encoding is either UTF-8 or UTF-16
|
||||
▪ must support both UTF-8 and UTF-16 GEDCOM files
|
||||
▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
|
||||
▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<RecordToken> parseRecords() throws Exception {
|
||||
|
||||
Set<RecordToken> tokens = new HashSet<>();
|
||||
Stack<RecordToken> stack = new Stack<>();
|
||||
|
@ -69,6 +87,10 @@ public class Tokenizer {
|
|||
|
||||
}
|
||||
|
||||
public static Gedcom555Tokenizer create(InputStream stream, TokenizerOptions options) {
|
||||
return new Gedcom555Tokenizer(new BufferedReader(new InputStreamReader(stream)), options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes possible UTF-8/UTF-16 BOM and Matches a GEDCOM Line.
|
||||
*
|
|
@ -0,0 +1,9 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
public interface GedcomTokenizer {
|
||||
|
||||
public Set<RecordToken> parseRecords() throws Exception;
|
||||
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package de.nth.chronicle.gedcom.parser.gedcom.parser;
|
||||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import lombok.*;
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.GedcomVersion;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
public interface TokenizerFactory {
|
||||
|
||||
public GedcomTokenizer createTokenizer(InputStream stream, TokenizerOptions options);
|
||||
|
||||
public static TokenizerFactory forVersion(String version) {
|
||||
return forVersion(GedcomVersion.forVersionString(version));
|
||||
}
|
||||
public static TokenizerFactory forVersion(GedcomVersion version) {
|
||||
switch(version) {
|
||||
case VERSION_5_5_5: return Gedcom555Tokenizer::create;
|
||||
default:
|
||||
throw new IllegalArgumentException(String.format("No Implementation for version '%s' found!",
|
||||
version.getVersion()));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
public class TokenizerOptions {
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -1,8 +1,6 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.parser.gedcom.parser.RecordToken;
|
||||
import de.nth.chronicle.gedcom.parser.gedcom.parser.Tokenizer;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import de.nth.chronicle.gedcom.GedcomVersion;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
|
@ -15,10 +13,10 @@ import java.util.function.Consumer;
|
|||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
public class TokenizerTests {
|
||||
public class Gedcom555TokenizerTests {
|
||||
|
||||
void useResourceReader(String resource, Charset charset, Consumer<BufferedReader> consumer) throws Exception{
|
||||
InputStream stream = TokenizerTests.class.getResourceAsStream(resource);
|
||||
InputStream stream = Gedcom555TokenizerTests.class.getResourceAsStream(resource);
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charset));
|
||||
|
||||
consumer.accept(reader);
|
||||
|
@ -30,6 +28,12 @@ public class TokenizerTests {
|
|||
useResourceReader(resource, StandardCharsets.UTF_8, consumer);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFactoryAccess() {
|
||||
assertDoesNotThrow(() -> TokenizerFactory.forVersion(GedcomVersion.VERSION_5_5_5));
|
||||
assertDoesNotThrow(() -> TokenizerFactory.forVersion("5.5.5"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBasicTokenizerLineRegex() throws Exception {
|
||||
useResourceReader("/examples/MINIMAL555.ged", reader -> reader.lines().forEach(this::validateLine));
|
||||
|
@ -46,21 +50,23 @@ public class TokenizerTests {
|
|||
}
|
||||
|
||||
void validateLine(String line) {
|
||||
assertTrue(Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
|
||||
assertTrue(Gedcom555Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBasicTokenizerFunctionality() throws Exception {
|
||||
|
||||
InputStream stream = TokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
|
||||
InputStream stream = Gedcom555TokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
|
||||
|
||||
Tokenizer tokenizer = new Tokenizer();
|
||||
Gedcom555Tokenizer tokenizer = new Gedcom555Tokenizer(reader, TokenizerOptions.builder()
|
||||
|
||||
.build());
|
||||
|
||||
Set<RecordToken> records = null;
|
||||
|
||||
try {
|
||||
records = tokenizer.parseTokens(reader);
|
||||
records = tokenizer.parseRecords();
|
||||
}catch(Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
Loading…
Reference in New Issue