REFC GedcomTokenizer instance
parent
84bf8a596a
commit
c8e50a410b
|
@ -5,6 +5,9 @@ import lombok.Data;
|
|||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalTime;
|
||||
|
||||
@Data
|
||||
@Builder(toBuilder = true)
|
||||
public class GedcomHeader {
|
||||
|
@ -17,6 +20,22 @@ public class GedcomHeader {
|
|||
@NonNull
|
||||
private final String characterSet;
|
||||
|
||||
//SOUR
|
||||
private final String approvedSystemId;
|
||||
private final String sourceVersion;
|
||||
private final String nameOfProduct;
|
||||
private final String nameOfBusiness;
|
||||
private final String nameOfSourceData;
|
||||
private final LocalDate publicationDate;
|
||||
private final String copyrightSourceData;
|
||||
|
||||
private final String receivingSystemName;
|
||||
private final LocalDate transmissionDate;
|
||||
private final LocalTime time;
|
||||
private final String fileName;
|
||||
private final String copyrightGedcomFile;
|
||||
private final String language;
|
||||
private final String contentDescription;
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.Gedcom;
|
||||
import de.nth.chronicle.gedcom.GedcomVersion;
|
||||
import de.nth.chronicle.gedcom.parser.records.HeaderRecordParser;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
@ -19,10 +18,9 @@ public class Gedcom555Parser implements GedcomParser {
|
|||
@Override
|
||||
public Gedcom parseGedcom(InputStream stream) throws Exception {
|
||||
|
||||
GedcomTokenizer tokenizer = GedcomTokenizer.create(stream, TokenizerOptions.builder()
|
||||
.build());
|
||||
GedcomTokenizer tokenizer = new GedcomTokenizer(stream);
|
||||
|
||||
List<RecordToken> tokens = tokenizer.parseRecords();
|
||||
List<RecordToken> tokens = tokenizer.parseRecordsTokens();
|
||||
Gedcom.GedcomBuilder builder = Gedcom.builder();
|
||||
|
||||
for(RecordToken token : tokens) {
|
||||
|
@ -32,8 +30,6 @@ public class Gedcom555Parser implements GedcomParser {
|
|||
System.err.println("No Parser found for tag " + token.getTag());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.Gedcom;
|
||||
import de.nth.chronicle.gedcom.parser.exception.GedcomException;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
public interface GedcomReader {
|
||||
|
||||
public Gedcom read(InputStream stream) throws GedcomException;
|
||||
|
||||
}
|
|
@ -1,8 +1,12 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.parser.exception.GedcomException;
|
||||
import de.nth.chronicle.gedcom.parser.exception.InvalidLineException;
|
||||
import de.nth.chronicle.gedcom.parser.exception.MissingBomException;
|
||||
import de.nth.chronicle.gedcom.util.EncodingUtils;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.Charset;
|
||||
|
@ -15,85 +19,102 @@ class GedcomTokenizer {
|
|||
// ^\s*(\p{Digit})+\s+([a-zA-Z1-9_]+)(?:\s(.*)$)?
|
||||
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit})+\\s+([a-zA-Z1-9_@]+)(?:\\s(.*))?");
|
||||
|
||||
private final TokenizerOptions options;
|
||||
private String currentLine;
|
||||
private int currentLineNumber;
|
||||
|
||||
private final List<RecordToken> records = new LinkedList<>();
|
||||
private final Stack<RecordToken> stack = new Stack<>();
|
||||
private final BufferedReader reader;
|
||||
|
||||
GedcomTokenizer(BufferedReader reader, TokenizerOptions options) {
|
||||
this.reader = reader;
|
||||
this.options = options;
|
||||
public GedcomTokenizer(InputStream stream) {
|
||||
Charset charset = validateEncoding(stream);
|
||||
|
||||
this.reader = new BufferedReader(new InputStreamReader(stream, charset));
|
||||
}
|
||||
|
||||
public List<RecordToken> parseRecords() throws Exception {
|
||||
public String getCurrentLine() {
|
||||
return this.currentLine;
|
||||
}
|
||||
public int getCurrentLineNumber() {
|
||||
return this.currentLineNumber;
|
||||
}
|
||||
|
||||
List<RecordToken> tokens = new LinkedList<>();
|
||||
Stack<RecordToken> stack = new Stack<>();
|
||||
/*
|
||||
* GEDCOM 5.5.5 Reader Rules
|
||||
* ▪ import each line value as-is
|
||||
* ▪ do not trim trailing white space from any GEDCOM line or line value
|
||||
* ▪ do not trim leading white space from any line value
|
||||
* */
|
||||
public List<RecordToken> parseRecordsTokens() throws GedcomException {
|
||||
|
||||
/*
|
||||
* GEDCOM 5.5.5 Reader Rules
|
||||
* ▪ import each line value as-is
|
||||
* ▪ do not trim trailing white space from any GEDCOM line or line value
|
||||
* ▪ do not trim leading white space from any line value
|
||||
* */
|
||||
String line;
|
||||
int lineNumber = 1;
|
||||
while((line = reader.readLine()) != null) {
|
||||
System.out.println("Tokenizer: tokenize line " + lineNumber);
|
||||
|
||||
Matcher matcher = matchLine(line);
|
||||
if(!matcher.matches()) {
|
||||
throw new InvalidGedcomException.InvalidLine(lineNumber, line);
|
||||
}
|
||||
|
||||
int level = Integer.parseInt(matcher.group(1));
|
||||
String tag = matcher.group(2);
|
||||
String value = matcher.group(3);
|
||||
|
||||
RecordToken record = RecordToken.builder()
|
||||
.level(level)
|
||||
.tag(tag)
|
||||
.value(value)
|
||||
.build();
|
||||
|
||||
if(stack.isEmpty()) {
|
||||
System.out.println("First Record: " + line);
|
||||
stack.push(record);
|
||||
tokens.add(record);
|
||||
} else if(level == 0) {
|
||||
System.out.println("New Record: " + line);
|
||||
stack.clear();
|
||||
stack.push(record);
|
||||
tokens.add(record);
|
||||
} else if(stack.peek().getLevel() == level) {
|
||||
System.out.println("Same Level Record: " + line);
|
||||
stack.pop();
|
||||
if(!stack.isEmpty()) {
|
||||
stack.peek().getSubRecords().add(record);
|
||||
}
|
||||
stack.push(record);
|
||||
} else if(stack.peek().getLevel() < level) {
|
||||
System.out.println("Next Level Record: " + line);
|
||||
stack.peek().getSubRecords().add(record);
|
||||
stack.push(record);
|
||||
} else if(stack.peek().getLevel() > level) {
|
||||
while(stack.peek().getLevel() >= level) {
|
||||
stack.pop();
|
||||
}
|
||||
System.out.println("Higher Level Record: " + line + " ==== parent: " + stack.peek().getTag());
|
||||
stack.peek().getSubRecords().add(record);
|
||||
stack.push(record);
|
||||
}
|
||||
|
||||
lineNumber++;
|
||||
if(!this.records.isEmpty()) {
|
||||
return this.records;
|
||||
}
|
||||
|
||||
return tokens;
|
||||
while(readNextLine()) {
|
||||
|
||||
if(this.currentLine.length() > 255) {
|
||||
throw new InvalidLineException(this.currentLineNumber, this.currentLine, "Line is too long!");
|
||||
}
|
||||
|
||||
pushRecordToken(parseRecordToken());
|
||||
this.currentLineNumber++;
|
||||
}
|
||||
|
||||
return this.records;
|
||||
}
|
||||
|
||||
public static GedcomTokenizer create(InputStream stream, TokenizerOptions options) {
|
||||
Charset charset = validateEncoding(stream);
|
||||
System.out.println("Tokenizer: user encoding " + charset);
|
||||
return new GedcomTokenizer(new BufferedReader(new InputStreamReader(stream, charset)), options);
|
||||
private boolean readNextLine() {
|
||||
try {
|
||||
return (this.currentLine = this.reader.readLine()) != null;
|
||||
} catch (IOException e) {
|
||||
throw new GedcomException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private RecordToken parseRecordToken() {
|
||||
Matcher matcher = matchLine(this.currentLine);
|
||||
if(!matcher.matches()) {
|
||||
throw new InvalidGedcomException.InvalidLine(this.currentLineNumber, this.currentLine);
|
||||
}
|
||||
|
||||
int level = Integer.parseInt(matcher.group(1));
|
||||
String tag = matcher.group(2);
|
||||
String value = matcher.group(3);
|
||||
|
||||
RecordToken record = RecordToken.builder()
|
||||
.level(level)
|
||||
.tag(tag)
|
||||
.value(value)
|
||||
.build();
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
private void pushRecordToken(RecordToken record) {
|
||||
if(this.stack.isEmpty()) {
|
||||
this.stack.push(record);
|
||||
this.records.add(record);
|
||||
} else if(record.getLevel() == 0) {
|
||||
this.stack.clear();
|
||||
this.stack.push(record);
|
||||
this.records.add(record);
|
||||
} else if(record.getLevel() == this.stack.peek().getLevel()) {
|
||||
this.stack.pop();
|
||||
if(!this.stack.isEmpty()) {
|
||||
this.stack.peek().getSubRecords().add(record);
|
||||
}
|
||||
this.stack.push(record);
|
||||
} else if(record.getLevel() > this.stack.peek().getLevel()) {
|
||||
this.stack.peek().getSubRecords().add(record);
|
||||
this.stack.push(record);
|
||||
} else if(record.getLevel() < this.stack.peek().getLevel()) {
|
||||
while(record.getLevel() <= this.stack.peek().getLevel()) {
|
||||
this.stack.pop();
|
||||
}
|
||||
this.stack.peek().getSubRecords().add(record);
|
||||
this.stack.push(record);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -109,18 +130,18 @@ class GedcomTokenizer {
|
|||
return LINE_REGEX.matcher(line);
|
||||
}
|
||||
|
||||
/*
|
||||
▪ demand that file starts with a Byte Order Mark (BOM)
|
||||
▪ demand that the encoding is either UTF-8 or UTF-16
|
||||
▪ must support both UTF-8 and UTF-16 GEDCOM files
|
||||
▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
|
||||
▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
|
||||
*/
|
||||
static Charset validateEncoding(InputStream stream) {
|
||||
/*
|
||||
▪ demand that file starts with a Byte Order Mark (BOM)
|
||||
▪ demand that the encoding is either UTF-8 or UTF-16
|
||||
▪ must support both UTF-8 and UTF-16 GEDCOM files
|
||||
▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
|
||||
▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
|
||||
*/
|
||||
Charset charset = EncodingUtils.getCharsetForBOM(stream);
|
||||
|
||||
if(charset == null) {
|
||||
throw new InvalidGedcomException.MissingBOM();
|
||||
throw new MissingBomException();
|
||||
}
|
||||
|
||||
return charset;
|
||||
|
|
|
@ -18,9 +18,9 @@ public abstract class InvalidGedcomException extends RuntimeException {
|
|||
}
|
||||
}
|
||||
|
||||
public static class MissingRecord extends InvalidGedcomException {
|
||||
public MissingRecord(String tag) {
|
||||
super(String.format("Missing Record '%s'!", tag));
|
||||
public static class InvalidOrMissingRecord extends InvalidGedcomException {
|
||||
public InvalidOrMissingRecord(String tag) {
|
||||
super(String.format("Record '%s' is missing or invalid!", tag));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,12 +0,0 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
public class TokenizerOptions {
|
||||
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
package de.nth.chronicle.gedcom.parser.exception;
|
||||
|
||||
/**
 * Unchecked exception type of the GEDCOM parser; it mirrors the four standard
 * {@link RuntimeException} constructors.
 */
public class GedcomException extends RuntimeException {

    /** Creates an exception with neither a message nor a cause. */
    public GedcomException() {
        super();
    }

    /**
     * @param message description of the failure
     */
    public GedcomException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying failure
     */
    public GedcomException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause underlying failure
     */
    public GedcomException(Throwable cause) {
        super(cause);
    }

}
|
|
@ -0,0 +1,10 @@
|
|||
package de.nth.chronicle.gedcom.parser.exception;
|
||||
|
||||
public class InvalidEncodingException extends GedcomException {
|
||||
|
||||
public InvalidEncodingException(String expectedEncoding, String actualEncoding) {
|
||||
super(String.format("GEDCOM 5.5.5 file is %s encoded but defines %s as encoding!",
|
||||
expectedEncoding, actualEncoding));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
package de.nth.chronicle.gedcom.parser.exception;
|
||||
|
||||
public class InvalidLineException extends GedcomException {
|
||||
|
||||
private int lineNumber;
|
||||
private String line;
|
||||
|
||||
private String reason;
|
||||
|
||||
public InvalidLineException(int lineNumber, String line, String reason) {
|
||||
super(String.format("%s '%s' at line %d", reason, line, lineNumber));
|
||||
|
||||
this.lineNumber = lineNumber;
|
||||
this.line = line;
|
||||
this.reason = reason;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
package de.nth.chronicle.gedcom.parser.exception;
|
||||
|
||||
public class MissingBomException extends GedcomException {
|
||||
|
||||
|
||||
public MissingBomException() {
|
||||
super("GEDCOM 5.5.5 file lacks Byte Order Mark!");
|
||||
}
|
||||
}
|
|
@ -6,10 +6,13 @@ import de.nth.chronicle.gedcom.parser.GedcomRecordParser;
|
|||
import de.nth.chronicle.gedcom.parser.InvalidGedcomException;
|
||||
import de.nth.chronicle.gedcom.parser.RecordToken;
|
||||
|
||||
import java.util.function.Consumer;
|
||||
import java.time.LocalDate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class HeaderRecordParser implements GedcomRecordParser {
|
||||
|
||||
public static final Pattern VERSION_REGEX = Pattern.compile("(\\p{Digit}{1,3})\\.(\\p{Digit}{1,3})(?:\\.(\\p{Digit}{1,3}))?");
|
||||
|
||||
@Override
|
||||
public void parse(RecordToken token, Gedcom.GedcomBuilder builder) {
|
||||
|
||||
|
@ -17,18 +20,35 @@ public class HeaderRecordParser implements GedcomRecordParser {
|
|||
|
||||
GedcomHeader header = GedcomHeader.builder()
|
||||
.characterSet(token.findFirstValue("CHAR")
|
||||
.orElseThrow(() -> new InvalidGedcomException.MissingRecord("HEAD.CHAR")))
|
||||
.orElseThrow(() -> new InvalidGedcomException.InvalidOrMissingRecord("HEAD.CHAR")))
|
||||
.versionNumber(token.findFirstValue("GEDC.VERS")
|
||||
.orElse(null))
|
||||
.map(this::validateVersion)
|
||||
.orElseThrow(() -> new InvalidGedcomException.InvalidOrMissingRecord("GEDC.VERS")))
|
||||
.gedcomForm(token.findFirstValue("GEDC.FORM")
|
||||
.orElse(null))
|
||||
.gedcomFormVersion(token.findFirstValue("GEDC.FORM.VERS")
|
||||
.orElse(null))
|
||||
.map(this::validateVersion)
|
||||
.orElseThrow(() -> new InvalidGedcomException.InvalidOrMissingRecord("GEDC.VERS")))
|
||||
.approvedSystemId(token.findFirstValue("SOUR").orElse(null))
|
||||
.sourceVersion(token.findFirstValue("SOUR.VERS").orElse(null))
|
||||
.nameOfProduct(token.findFirstValue("SOUR.NAME").orElse(null))
|
||||
.nameOfBusiness(token.findFirstValue("SOUR.CORP").orElse(null)) //TODO address
|
||||
.nameOfSourceData(token.findFirstValue("SOUR.DATA.").orElse(null))
|
||||
.publicationDate(token.findFirstValue("SOUR.DATA.DATE").map(LocalDate::parse).orElse(null))
|
||||
//.copyrightSourceData() TODO
|
||||
|
||||
.build();
|
||||
|
||||
builder.header(header);
|
||||
|
||||
}
|
||||
|
||||
private String validateVersion(String version) {
|
||||
if(VERSION_REGEX.matcher(version).matches()) return version;
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ public class GedcomParserTests {
|
|||
|
||||
GedcomParser parser = GedcomParser.getParser(GedcomVersion.VERSION_5_5_5);
|
||||
|
||||
Gedcom gedcom = parser.parseGedcom(GedcomParserTests.class.getResourceAsStream("/examples/MINIMAL555.ged"));
|
||||
Gedcom gedcom = parser.parseGedcom(GedcomParserTests.class.getResourceAsStream("/examples/555SAMPLE.ged"));
|
||||
|
||||
System.out.println(gedcom);
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ import java.io.InputStreamReader;
|
|||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
@ -51,16 +50,13 @@ public class GedcomTokenizerTests {
|
|||
void testBasicTokenizerFunctionality() throws Exception {
|
||||
|
||||
InputStream stream = GedcomTokenizerTests.class.getResourceAsStream("/examples/MINIMAL555.ged");
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
|
||||
|
||||
GedcomTokenizer tokenizer = new GedcomTokenizer(reader, TokenizerOptions.builder()
|
||||
|
||||
.build());
|
||||
GedcomTokenizer tokenizer = new GedcomTokenizer(stream);
|
||||
|
||||
List<RecordToken> records = null;
|
||||
|
||||
try {
|
||||
records = tokenizer.parseRecords();
|
||||
records = tokenizer.parseRecordsTokens();
|
||||
}catch(Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue