parent
c0db5247be
commit
37bd9eeb37
|
@ -0,0 +1,24 @@
|
|||
package de.nth.chronicle.gedcom.parser.gedcom.parser;
|
||||
|
||||
import lombok.*;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
@Builder
|
||||
public class RecordToken {
|
||||
|
||||
private int level;
|
||||
private String tag;
|
||||
private String value;
|
||||
|
||||
@Builder.Default
|
||||
private Set<RecordToken> subRecords = new HashSet<>();
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("%d %s %s", this.level, this.tag, this.value);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
package de.nth.chronicle.gedcom.parser.gedcom.parser;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.Stack;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class Tokenizer {
|
||||
|
||||
// ^\s*(\p{Digit})+\s+([a-zA-Z1-9_]+)(?:\s(.*)$)?
|
||||
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit})+\\s+([a-zA-Z1-9_@]+)(?:\\s(.*))?");
|
||||
|
||||
@SneakyThrows
|
||||
public Set<RecordToken> parseTokens(BufferedReader reader) {
|
||||
|
||||
Set<RecordToken> tokens = new HashSet<>();
|
||||
Stack<RecordToken> stack = new Stack<>();
|
||||
|
||||
/*
|
||||
* GEDCOM 5.5.5 Reader Rules
|
||||
* ▪ import each line value as-is
|
||||
* ▪ do not trim trailing white space from any GEDCOM line or line value
|
||||
* ▪ do not trim leading white space from any line value
|
||||
* */
|
||||
String line;
|
||||
while((line = reader.readLine()) != null) {
|
||||
|
||||
Matcher matcher = LINE_REGEX.matcher(line);
|
||||
|
||||
if(!matcher.matches()) {
|
||||
//TODO throw Exception
|
||||
continue;
|
||||
}
|
||||
|
||||
int level = Integer.parseInt(matcher.group(1));
|
||||
String tag = matcher.group(2);
|
||||
String value = matcher.group(3);
|
||||
|
||||
RecordToken record = RecordToken.builder()
|
||||
.level(level)
|
||||
.tag(tag)
|
||||
.value(value)
|
||||
.build();
|
||||
|
||||
if(stack.isEmpty()) {
|
||||
stack.push(record);
|
||||
tokens.add(record);
|
||||
} else if(level == 0) {
|
||||
stack.clear();
|
||||
stack.push(record);
|
||||
tokens.add(record);
|
||||
} else if(stack.peek().getLevel() == level) {
|
||||
stack.pop();
|
||||
if(!stack.isEmpty()) {
|
||||
stack.peek().getSubRecords().add(record);
|
||||
}
|
||||
stack.push(record);
|
||||
} else if(stack.peek().getLevel() < level) {
|
||||
stack.peek().getSubRecords().add(record);
|
||||
stack.push(record);
|
||||
}
|
||||
|
||||
System.out.println(stack);
|
||||
|
||||
}
|
||||
|
||||
return tokens;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes possible UTF-8/UTF-16 BOM and Matches a GEDCOM Line.
|
||||
*
|
||||
* @param line to match
|
||||
* @return Regex {@link Matcher}
|
||||
*/
|
||||
public static Matcher matchLine(String line) {
|
||||
if(line.contains("\uFEFF")) {
|
||||
line = line.replace("\uFEFF", "");
|
||||
}
|
||||
return LINE_REGEX.matcher(line);
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,75 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.parser.gedcom.parser.RecordToken;
|
||||
import de.nth.chronicle.gedcom.parser.gedcom.parser.Tokenizer;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Set;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
public class TokenizerTests {
|
||||
|
||||
void useResourceReader(String resource, Charset charset, Consumer<BufferedReader> consumer) throws Exception{
|
||||
InputStream stream = TokenizerTests.class.getResourceAsStream(resource);
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charset));
|
||||
|
||||
consumer.accept(reader);
|
||||
|
||||
reader.close();
|
||||
}
|
||||
|
||||
void useResourceReader(String resource, Consumer<BufferedReader> consumer) throws Exception{
|
||||
useResourceReader(resource, StandardCharsets.UTF_8, consumer);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBasicTokenizerLineRegex() throws Exception {
|
||||
useResourceReader("/examples/MINIMAL555.ged", reader -> reader.lines().forEach(this::validateLine));
|
||||
useResourceReader("/examples/555SAMPLE.ged", reader -> reader.lines().forEach(this::validateLine));
|
||||
useResourceReader("/examples/555SAMPLE16BE.ged",
|
||||
StandardCharsets.UTF_16BE,
|
||||
reader -> reader.lines().forEach(this::validateLine));
|
||||
useResourceReader("/examples/555SAMPLE16LE.ged",
|
||||
StandardCharsets.UTF_16LE,
|
||||
reader -> reader.lines().forEach(this::validateLine));
|
||||
useResourceReader("/examples/REMARR.ged", reader -> reader.lines().forEach(this::validateLine));
|
||||
useResourceReader("/examples/SSMARR.ged", reader -> reader.lines().forEach(this::validateLine));
|
||||
}
|
||||
|
||||
void validateLine(String line) {
|
||||
assertTrue(Tokenizer.matchLine(line).matches(), () -> String.format("Invalid Line: '%s'", line));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBasicTokenizerFunctionality() throws Exception {
|
||||
|
||||
InputStream stream = TokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
|
||||
|
||||
Tokenizer tokenizer = new Tokenizer();
|
||||
|
||||
Set<RecordToken> records = null;
|
||||
|
||||
try {
|
||||
records = tokenizer.parseTokens(reader);
|
||||
}catch(Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
records.forEach(System.out::println);
|
||||
|
||||
assertEquals(3, records.size());
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue