163 lines
5.0 KiB
Java
163 lines
5.0 KiB
Java
package de.nth.chronicle.gedcom.parser;
|
|
|
|
import de.nth.chronicle.gedcom.exception.GedcomException;
|
|
import de.nth.chronicle.gedcom.exception.InvalidLineException;
|
|
import de.nth.chronicle.gedcom.exception.MissingBomException;
|
|
import de.nth.chronicle.gedcom.util.EncodingUtils;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.InputStreamReader;
|
|
import java.nio.charset.Charset;
|
|
import java.util.*;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
class Tokenizer {
|
|
|
|
// ^\s*(\p{Digit})+\s+([a-zA-Z1-9_]+)(?:\s(.*)$)?
|
|
public static final Pattern LINE_REGEX = Pattern.compile("^\\s*(\\p{Digit})+\\s+([a-zA-Z1-9_@]+)(?:\\s(.*))?");
|
|
|
|
private String currentLine;
|
|
private int currentLineNumber;
|
|
|
|
private final List<RecordChunk> records = new LinkedList<>();
|
|
private final Stack<RecordChunk> stack = new Stack<>();
|
|
private final BufferedReader reader;
|
|
|
|
private RecordChunk lastChunk;
|
|
|
|
public Tokenizer(InputStream stream) {
|
|
Charset charset = validateEncoding(stream);
|
|
|
|
this.reader = new BufferedReader(new InputStreamReader(stream, charset));
|
|
}
|
|
|
|
public String getCurrentLine() {
|
|
return this.currentLine;
|
|
}
|
|
public int getCurrentLineNumber() {
|
|
return this.currentLineNumber;
|
|
}
|
|
|
|
/*
|
|
* GEDCOM 5.5.5 Reader Rules
|
|
* ▪ import each line value as-is
|
|
* ▪ do not trim trailing white space from any GEDCOM line or line value
|
|
* ▪ do not trim leading white space from any line value
|
|
* */
|
|
public List<RecordChunk> parseRecordsChunks() throws GedcomException {
|
|
|
|
if(!this.records.isEmpty()) {
|
|
return this.records;
|
|
}
|
|
|
|
while(readNextLine()) {
|
|
|
|
if(this.currentLine.length() > 255) {
|
|
throw new InvalidLineException(this.currentLineNumber, this.currentLine, "Line is too long!");
|
|
}
|
|
|
|
pushRecordChunk(parseRecordChunk());
|
|
this.currentLineNumber++;
|
|
}
|
|
|
|
this.records.forEach(record -> record.normalize());
|
|
|
|
return this.records;
|
|
}
|
|
|
|
private boolean readNextLine() {
|
|
try {
|
|
return (this.currentLine = this.reader.readLine()) != null;
|
|
} catch (IOException e) {
|
|
throw new GedcomException(e);
|
|
}
|
|
}
|
|
|
|
private RecordChunk parseRecordChunk() {
|
|
Matcher matcher = matchLine(this.currentLine);
|
|
if(!matcher.matches()) {
|
|
throw new InvalidGedcomException.InvalidLine(this.currentLineNumber, this.currentLine);
|
|
}
|
|
|
|
int level = Integer.parseInt(matcher.group(1));
|
|
String tag = matcher.group(2);
|
|
String value = matcher.group(3);
|
|
|
|
RecordChunk record = RecordChunk.builder()
|
|
.level(level)
|
|
.tag(tag)
|
|
.value(value)
|
|
.lineNumber(this.currentLineNumber)
|
|
.sourceLine(this.currentLine)
|
|
.build();
|
|
|
|
return record;
|
|
}
|
|
|
|
private void pushRecordChunk(RecordChunk record) {
|
|
if(this.stack.isEmpty()) {
|
|
this.stack.push(record);
|
|
this.records.add(record);
|
|
} else if(record.getLevel() == 0) {
|
|
this.stack.clear();
|
|
this.stack.push(record);
|
|
this.records.add(record);
|
|
} else if(record.getLevel() == this.stack.peek().getLevel()) {
|
|
this.stack.pop();
|
|
if(!this.stack.isEmpty()) {
|
|
this.stack.peek().getSubRecords().add(record);
|
|
}
|
|
this.stack.push(record);
|
|
} else if(record.getLevel() > this.stack.peek().getLevel()) {
|
|
this.stack.peek().getSubRecords().add(record);
|
|
this.stack.push(record);
|
|
} else if(record.getLevel() < this.stack.peek().getLevel()) {
|
|
while(record.getLevel() <= this.stack.peek().getLevel()) {
|
|
this.stack.pop();
|
|
}
|
|
this.stack.peek().getSubRecords().add(record);
|
|
this.stack.push(record);
|
|
}
|
|
|
|
if(this.lastChunk != null) {
|
|
this.lastChunk.setNext(record);
|
|
}
|
|
record.setPrevious(this.lastChunk);
|
|
this.lastChunk = record;
|
|
}
|
|
|
|
/**
|
|
* Removes possible UTF-8/UTF-16 BOM and Matches a GEDCOM Line.
|
|
*
|
|
* @param line to match
|
|
* @return Regex {@link Matcher}
|
|
*/
|
|
public static Matcher matchLine(String line) {
|
|
if(line.contains("\uFEFF")) {
|
|
line = line.replace("\uFEFF", "");
|
|
}
|
|
return LINE_REGEX.matcher(line);
|
|
}
|
|
|
|
/*
|
|
▪ demand that file starts with a Byte Order Mark (BOM)
|
|
▪ demand that the encoding is either UTF-8 or UTF-16
|
|
▪ must support both UTF-8 and UTF-16 GEDCOM files
|
|
▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
|
|
▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
|
|
*/
|
|
static Charset validateEncoding(InputStream stream) {
|
|
Charset charset = EncodingUtils.getCharsetForBOM(stream);
|
|
|
|
if(charset == null) {
|
|
throw new MissingBomException();
|
|
}
|
|
|
|
return charset;
|
|
}
|
|
|
|
}
|