FEAT first tokenizer impl

Support for UTF-8 & UTF-16
master
Niclas Thobaben 2020-11-26 23:13:36 +01:00
parent c0db5247be
commit 37bd9eeb37
3 changed files with 189 additions and 0 deletions

View File

@ -0,0 +1,24 @@
package de.nth.chronicle.gedcom.parser.gedcom.parser;
import lombok.*;
import java.util.HashSet;
import java.util.Set;
@Getter
@Setter
@Builder
public class RecordToken {

    /** GEDCOM nesting level of this line (0 = top-level record). */
    private int level;
    /** GEDCOM tag (or cross-reference id) of this line. */
    private String tag;
    /** Raw line value following the tag; may be {@code null} when absent. */
    private String value;
    /** Tokens nested directly beneath this one. */
    @Builder.Default
    private Set<RecordToken> subRecords = new HashSet<>();

    /** Renders the token back in GEDCOM line form: {@code <level> <tag> <value>}. */
    @Override
    public String toString() {
        return this.level + " " + this.tag + " " + this.value;
    }
}

View File

@ -0,0 +1,90 @@
package de.nth.chronicle.gedcom.parser.gedcom.parser;
import lombok.SneakyThrows;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Tokenizer {

    // Matches a GEDCOM line: level digits, tag, optional value.
    // ^\s*(\p{Digit}+)\s+([a-zA-Z0-9_@]+)(?:\s(.*)$)?
    // NOTE(review): two fixes vs. the original pattern:
    //  - "(\\p{Digit})+" captured only the LAST digit of a multi-digit level
    //    (e.g. level 10 parsed as 0); the '+' now sits inside the group.
    //  - the tag class "[a-zA-Z1-9_@]" omitted '0', rejecting any tag or
    //    xref-id containing a zero (e.g. "@I10@").
    public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit}+)\\s+([a-zA-Z0-9_@]+)(?:\\s(.*))?");

    /**
     * Reads the stream line by line and builds a set of level-0 record tokens,
     * with lower-level lines attached as nested sub-records.
     *
     * @param reader source of GEDCOM lines (caller retains ownership, not closed here)
     * @return the top-level (level 0) record tokens
     */
    @SneakyThrows
    public Set<RecordToken> parseTokens(BufferedReader reader) {
        Set<RecordToken> tokens = new HashSet<>();
        Stack<RecordToken> stack = new Stack<>();
        /*
         * GEDCOM 5.5.5 Reader Rules
         * import each line value as-is
         * do not trim trailing white space from any GEDCOM line or line value
         * do not trim leading white space from any line value
         * */
        String line;
        while((line = reader.readLine()) != null) {
            // use matchLine() so a possible BOM is stripped; the original
            // matched LINE_REGEX directly and rejected the first line of
            // BOM-prefixed files
            Matcher matcher = matchLine(line);
            if(!matcher.matches()) {
                //TODO throw Exception
                continue;
            }
            int level = Integer.parseInt(matcher.group(1));
            String tag = matcher.group(2);
            String value = matcher.group(3);
            RecordToken record = RecordToken.builder()
                    .level(level)
                    .tag(tag)
                    .value(value)
                    .build();
            if(level == 0 || stack.isEmpty()) {
                // new top-level record (or orphaned first line, kept like before)
                stack.clear();
                stack.push(record);
                tokens.add(record);
            } else {
                // pop back to this record's parent; the loop handles level
                // decreases of more than one (e.g. 2 -> 1), which the original
                // branch chain silently dropped
                while(!stack.isEmpty() && stack.peek().getLevel() >= level) {
                    stack.pop();
                }
                if(stack.isEmpty()) {
                    // non-zero level with no parent left; surface it as a
                    // top-level token rather than losing it
                    tokens.add(record);
                } else {
                    stack.peek().getSubRecords().add(record);
                }
                stack.push(record);
            }
        }
        return tokens;
    }

    /**
     * Removes possible UTF-8/UTF-16 BOM and Matches a GEDCOM Line.
     *
     * @param line to match
     * @return Regex {@link Matcher}
     */
    public static Matcher matchLine(String line) {
        if(line.contains("\uFEFF")) {
            line = line.replace("\uFEFF", "");
        }
        return LINE_REGEX.matcher(line);
    }
}

View File

@ -0,0 +1,75 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.parser.gedcom.parser.RecordToken;
import de.nth.chronicle.gedcom.parser.gedcom.parser.Tokenizer;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.function.Consumer;
import static org.junit.jupiter.api.Assertions.*;
public class TokenizerTests {

    /**
     * Opens a classpath resource as a {@link BufferedReader} in the given charset
     * and hands it to the consumer; the reader is always closed afterwards.
     */
    void useResourceReader(String resource, Charset charset, Consumer<BufferedReader> consumer) throws Exception {
        InputStream stream = TokenizerTests.class.getResourceAsStream(resource);
        // fail with a clear message instead of an NPE when the fixture is missing
        assertNotNull(stream, () -> String.format("Missing test resource: '%s'", resource));
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charset))) {
            consumer.accept(reader);
        }
    }

    /** Convenience overload defaulting to UTF-8. */
    void useResourceReader(String resource, Consumer<BufferedReader> consumer) throws Exception {
        useResourceReader(resource, StandardCharsets.UTF_8, consumer);
    }

    @Test
    void testBasicTokenizerLineRegex() throws Exception {
        useResourceReader("/examples/MINIMAL555.ged", reader -> reader.lines().forEach(this::validateLine));
        useResourceReader("/examples/555SAMPLE.ged", reader -> reader.lines().forEach(this::validateLine));
        useResourceReader("/examples/555SAMPLE16BE.ged",
                StandardCharsets.UTF_16BE,
                reader -> reader.lines().forEach(this::validateLine));
        useResourceReader("/examples/555SAMPLE16LE.ged",
                StandardCharsets.UTF_16LE,
                reader -> reader.lines().forEach(this::validateLine));
        useResourceReader("/examples/REMARR.ged", reader -> reader.lines().forEach(this::validateLine));
        useResourceReader("/examples/SSMARR.ged", reader -> reader.lines().forEach(this::validateLine));
    }

    /** Asserts that a single line matches the tokenizer's line grammar. */
    void validateLine(String line) {
        assertTrue(Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
    }

    @Test
    void testBasicTokenizerFunctionality() throws Exception {
        InputStream stream = TokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
        assertNotNull(stream, "Missing test resource: /examples/MINIMAL555.ged");
        // explicit UTF-8 (the original used the platform default charset) and
        // try-with-resources so the reader is closed
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
            Tokenizer tokenizer = new Tokenizer();
            // let parse failures fail the test directly; the original caught
            // the exception, printed it, and then NPE'd on the null result
            Set<RecordToken> records = tokenizer.parseTokens(reader);
            records.forEach(System.out::println);
            assertEquals(3, records.size());
        }
    }
}