gedcom-api/src/main/java/de/nth/chronicle/gedcom/parser/Tokenizer.java

163 lines
5.0 KiB
Java

package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.exception.GedcomException;
import de.nth.chronicle.gedcom.exception.InvalidLineException;
import de.nth.chronicle.gedcom.exception.MissingBomException;
import de.nth.chronicle.gedcom.util.EncodingUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reads a GEDCOM stream line by line and assembles the lines into a forest of
 * {@link RecordChunk}s according to their level numbers.
 *
 * <p>Every parsed chunk is additionally linked to the chunk of the preceding and the
 * following line via {@code setPrevious}/{@code setNext}.
 */
class Tokenizer {

    // ^\s*(\p{Digit}+)\s+([a-zA-Z0-9_@]+)(?:\s(.*))?
    /**
     * Pattern of a single GEDCOM line.
     * <ul>
     *   <li>group 1: the complete level number (all digits, not just the last one)</li>
     *   <li>group 2: the tag or cross-reference id (letters, digits {@code 0-9}, {@code _}, {@code @})</li>
     *   <li>group 3: the line value after exactly one delimiter character, or {@code null} if absent</li>
     * </ul>
     */
    public static final Pattern LINE_REGEX =
            Pattern.compile("^\\s*(\\p{Digit}+)\\s+([a-zA-Z0-9_@]+)(?:\\s(.*))?");

    /** Longest line (in characters) accepted by {@link #parseRecordsChunks()}. */
    private static final int MAX_LINE_LENGTH = 255;

    /** Line most recently read from the stream; {@code null} before reading starts and after the end. */
    private String currentLine;

    /** Zero-based index of {@link #currentLine}; incremented after each line has been processed. */
    private int currentLineNumber;

    /** Root chunks in the order they were read. */
    private final List<RecordChunk> records = new ArrayList<>();

    /** Chain of currently open chunks, innermost on top; levels strictly increase towards the top. */
    private final Deque<RecordChunk> stack = new ArrayDeque<>();

    private final BufferedReader reader;

    /** Chunk of the previously processed line, used to link neighbouring chunks. */
    private RecordChunk lastChunk;

    /**
     * Creates a tokenizer that decodes {@code stream} with the charset announced by its BOM.
     *
     * @param stream GEDCOM input; it is not closed by this class
     * @throws MissingBomException if no charset could be determined from the BOM
     */
    public Tokenizer(InputStream stream) {
        Charset charset = validateEncoding(stream);
        this.reader = new BufferedReader(new InputStreamReader(stream, charset));
    }

    /**
     * @return the line most recently read, or {@code null} if none is available
     */
    public String getCurrentLine() {
        return this.currentLine;
    }

    /**
     * @return the zero-based index of the line currently being processed
     */
    public int getCurrentLineNumber() {
        return this.currentLineNumber;
    }

    /*
     * GEDCOM 5.5.5 Reader Rules
     *  ▪ import each line value as-is
     *  ▪ do not trim trailing white space from any GEDCOM line or line value
     *  ▪ do not trim leading white space from any line value
     */
    /**
     * Reads all remaining lines and returns the root chunks with their sub records attached.
     *
     * <p>If root chunks were already collected by an earlier call, that same list is returned
     * without reading again. After the last line every root chunk is normalized.
     *
     * @return the list of root chunks (the tokenizer's internal list, not a copy)
     * @throws InvalidLineException if a line is longer than 255 characters or its level
     *                              number does not fit into an {@code int}
     * @throws GedcomException      if reading from the stream fails
     */
    public List<RecordChunk> parseRecordsChunks() throws GedcomException {
        if (!this.records.isEmpty()) {
            return this.records;
        }

        while (readNextLine()) {
            if (this.currentLine.length() > MAX_LINE_LENGTH) {
                throw new InvalidLineException(this.currentLineNumber, this.currentLine, "Line is too long!");
            }
            pushRecordChunk(parseRecordChunk());
            this.currentLineNumber++;
        }

        for (RecordChunk root : this.records) {
            root.normalize();
        }
        return this.records;
    }

    /**
     * Reads the next line into {@link #currentLine}.
     *
     * @return {@code true} if a line was read, {@code false} at end of stream
     * @throws GedcomException wrapping any {@link IOException} raised by the reader
     */
    private boolean readNextLine() {
        try {
            this.currentLine = this.reader.readLine();
            return this.currentLine != null;
        } catch (IOException e) {
            throw new GedcomException(e);
        }
    }

    /**
     * Parses {@link #currentLine} into a chunk carrying level, tag, value, line number and
     * the unmodified source line.
     *
     * @return the chunk for the current line
     */
    private RecordChunk parseRecordChunk() {
        Matcher matcher = matchLine(this.currentLine);
        if (!matcher.matches()) {
            // NOTE(review): this exception type is not among the file's imports and differs from the
            // InvalidLineException used for over-long lines; kept as-is because callers may catch it.
            throw new InvalidGedcomException.InvalidLine(this.currentLineNumber, this.currentLine);
        }

        int level;
        try {
            level = Integer.parseInt(matcher.group(1));
        } catch (NumberFormatException e) {
            // Group 1 is digits only, so this can only be an overflow caused by too many digits.
            throw new InvalidLineException(this.currentLineNumber, this.currentLine, "Level number is too large!");
        }

        return RecordChunk.builder()
                .level(level)
                .tag(matcher.group(2))
                .value(matcher.group(3))
                .lineNumber(this.currentLineNumber)
                .sourceLine(this.currentLine)
                .build();
    }

    /**
     * Attaches {@code record} to the tree and links it with the previously pushed chunk.
     *
     * <p>All open chunks with a level greater than or equal to the record's level are closed.
     * The record then becomes a sub record of the innermost chunk still open, or a root chunk
     * if none is left. The latter covers level 0 records as well as records of a malformed file
     * that have no open ancestor, so no record is dropped and the stack is never read while empty.
     *
     * @param record chunk of the line just parsed
     */
    private void pushRecordChunk(RecordChunk record) {
        int level = record.getLevel();

        // Close siblings and deeper chunks; what remains on top (if anything) is the parent.
        while (!this.stack.isEmpty() && this.stack.peek().getLevel() >= level) {
            this.stack.pop();
        }

        if (this.stack.isEmpty()) {
            this.records.add(record);
        } else {
            this.stack.peek().getSubRecords().add(record);
        }
        this.stack.push(record);

        if (this.lastChunk != null) {
            this.lastChunk.setNext(record);
        }
        record.setPrevious(this.lastChunk);
        this.lastChunk = record;
    }

    /**
     * Removes every U+FEFF (byte order mark) character from the line and matches the
     * result against {@link #LINE_REGEX}.
     *
     * @param line to match; must not be {@code null}
     * @return Regex {@link Matcher} for the cleaned line (not yet matched)
     */
    public static Matcher matchLine(String line) {
        return LINE_REGEX.matcher(line.replace("\uFEFF", ""));
    }

    /*
      ▪ demand that file starts with a Byte Order Mark (BOM)
      ▪ demand that the encoding is either UTF-8 or UTF-16
      ▪ must support both UTF-8 and UTF-16 GEDCOM files
      ▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
      ▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
    */
    /**
     * Determines the charset of the stream via {@code EncodingUtils.getCharsetForBOM}.
     *
     * @param stream GEDCOM input
     * @return the charset reported for the stream's BOM
     * @throws MissingBomException if no charset was reported
     */
    static Charset validateEncoding(InputStream stream) {
        Charset charset = EncodingUtils.getCharsetForBOM(stream);
        if (charset == null) {
            throw new MissingBomException();
        }
        return charset;
    }
}