gedcom-api/src/main/java/de/nth/chronicle/gedcom/parser/Tokenizer.java

package de.nth.chronicle.gedcom.parser;

import de.nth.chronicle.gedcom.exception.GedcomException;
import de.nth.chronicle.gedcom.exception.InvalidLineException;
import de.nth.chronicle.gedcom.exception.MissingBomException;
import de.nth.chronicle.gedcom.util.EncodingUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads a GEDCOM stream line by line and assembles the lines into a tree of
 * {@link RecordChunk}s, using each line's level number to decide nesting.
 *
 * <p>Every chunk is additionally linked to the chunk of the previous and the next line
 * through {@code setPrevious}/{@code setNext}.
 *
 * <p>Instances keep mutable parsing state (current line, open-chunk stack) and are not
 * written for concurrent use.
 */
class Tokenizer {

    // ^\s*(\p{Digit}+)\s+([a-zA-Z0-9_@]+)(?:\s(.*))?
    /**
     * Pattern for one GEDCOM line.
     * <ul>
     *   <li>group 1: the level number (all of its digits)</li>
     *   <li>group 2: the token after the level, made of letters, digits, '_' and '@'</li>
     *   <li>group 3: the optional line value, separated from group 2 by exactly one white
     *       space character and otherwise taken as-is; {@code null} when absent</li>
     * </ul>
     */
    public static final Pattern LINE_REGEX =
            Pattern.compile("^\\s*(\\p{Digit}+)\\s+([a-zA-Z0-9_@]+)(?:\\s(.*))?");

    /** Longest line (in chars, as returned by the reader) that is accepted. */
    private static final int MAX_LINE_LENGTH = 255;

    /** Byte order mark as it appears after decoding (code point U+FEFF). */
    private static final char BOM = '\uFEFF';

    /** Line most recently returned by the reader; {@code null} once the end was reached. */
    private String currentLine;

    /** Zero-based index of {@link #currentLine}; incremented after each processed line. */
    private int currentLineNumber;

    /** Top-level chunks in file order; doubles as the cache for repeated parse calls. */
    private final List<RecordChunk> records = new ArrayList<>();

    /** Currently open chunks, innermost on top; levels strictly increase towards the top. */
    private final Deque<RecordChunk> stack = new ArrayDeque<>();

    private final BufferedReader reader;

    /** Chunk of the previously processed line, used to chain chunks in file order. */
    private RecordChunk lastChunk;

    /**
     * Creates a tokenizer for the given stream. The charset is the one
     * {@link #validateEncoding(InputStream)} reports for the stream.
     *
     * @param stream GEDCOM input; it is wrapped in a reader and not closed by this class
     * @throws MissingBomException if no charset could be determined for the stream
     */
    public Tokenizer(InputStream stream) {
        Charset charset = validateEncoding(stream);

        this.reader = new BufferedReader(new InputStreamReader(stream, charset));
    }

    /**
     * @return the line read most recently, or {@code null} before the first read and after
     *         the end of the input was reached
     */
    public String getCurrentLine() {
        return this.currentLine;
    }

    /**
     * @return zero-based index of the line being processed; once parsing has finished this
     *         equals the number of lines that were processed
     */
    public int getCurrentLineNumber() {
        return this.currentLineNumber;
    }

    /**
     * Parses the whole input into top-level chunks with their nested sub records and calls
     * {@code normalize()} on every top-level chunk afterwards.
     *
     * <p>GEDCOM 5.5.5 reader rules followed here:
     * <ul>
     *   <li>import each line value as-is</li>
     *   <li>do not trim trailing white space from any GEDCOM line or line value</li>
     *   <li>do not trim leading white space from any line value</li>
     * </ul>
     *
     * <p>If chunks were already collected by an earlier call, that list is returned without
     * reading again.
     *
     * @return the top-level chunks in file order (the internal list, not a copy)
     * @throws GedcomException if reading fails
     * @throws InvalidLineException if a line is longer than 255 characters, does not match
     *         {@link #LINE_REGEX}, or carries a level number that does not fit an {@code int}
     */
    public List<RecordChunk> parseRecordsChunks() throws GedcomException {

        if(!this.records.isEmpty()) {
            return this.records;
        }

        while(readNextLine()) {

            if(this.currentLine.length() > MAX_LINE_LENGTH) {
                throw new InvalidLineException(this.currentLineNumber, this.currentLine, "Line is too long!");
            }

            pushRecordChunk(parseRecordChunk());
            this.currentLineNumber++;
        }

        for(RecordChunk record : this.records) {
            record.normalize();
        }

        return this.records;
    }

    /**
     * Reads the next line into {@link #currentLine}.
     *
     * @return {@code true} if a line was read, {@code false} at the end of the input
     * @throws GedcomException wrapping the {@link IOException} if reading fails
     */
    private boolean readNextLine() {
        try {
            this.currentLine = this.reader.readLine();
        } catch (IOException e) {
            throw new GedcomException(e);
        }
        return this.currentLine != null;
    }

    /**
     * Turns {@link #currentLine} into a chunk carrying level, tag, value, line number and
     * the unmodified source line.
     *
     * @return the chunk for the current line; its value is {@code null} if the line has none
     * @throws InvalidLineException if the line does not match {@link #LINE_REGEX} or its
     *         level number does not fit an {@code int}
     */
    private RecordChunk parseRecordChunk() {
        Matcher matcher = matchLine(this.currentLine);
        if(!matcher.matches()) {
            // Same exception type as the length check so callers see one kind of line error.
            throw new InvalidLineException(this.currentLineNumber, this.currentLine,
                    "Line does not match the GEDCOM line format!");
        }

        int level;
        try {
            level = Integer.parseInt(matcher.group(1));
        } catch (NumberFormatException e) {
            // Only reachable for digit runs beyond the int range; report it as a line error
            // instead of leaking an unrelated runtime exception.
            throw new InvalidLineException(this.currentLineNumber, this.currentLine,
                    "Level number is too large!");
        }

        return RecordChunk.builder()
                .level(level)
                .tag(matcher.group(2))
                .value(matcher.group(3))
                .lineNumber(this.currentLineNumber)
                .sourceLine(this.currentLine)
                .build();
    }

    /**
     * Places the chunk into the tree and links it to the chunk of the previous line.
     *
     * <p>A level 0 chunk starts a new top-level record. Any other chunk becomes a sub record
     * of the nearest open chunk with a lower level. If no such chunk is open (the input does
     * not start at level 0), the chunk is kept as a top-level record as well, so that no
     * line is dropped and the stack is never read while empty.
     *
     * @param record chunk of the line that was just parsed
     */
    private void pushRecordChunk(RecordChunk record) {
        int level = record.getLevel();

        if(level == 0) {
            this.stack.clear();
        } else {
            // Close every open chunk that cannot be the parent (same or deeper level).
            while(!this.stack.isEmpty() && this.stack.peek().getLevel() >= level) {
                this.stack.pop();
            }
        }

        if(this.stack.isEmpty()) {
            this.records.add(record);
        } else {
            this.stack.peek().getSubRecords().add(record);
        }
        this.stack.push(record);

        // Chain chunks in file order, independent of their nesting.
        if(this.lastChunk != null) {
            this.lastChunk.setNext(record);
        }
        record.setPrevious(this.lastChunk);
        this.lastChunk = record;
    }

    /**
     * Removes a possible leading UTF-8/UTF-16 BOM and matches a GEDCOM line.
     *
     * <p>Only a BOM at the very start of the line is removed; the same code point anywhere
     * else is part of the line and is left untouched so that values are imported as-is.
     *
     * @param line to match, must not be {@code null}
     * @return Regex {@link Matcher} for {@link #LINE_REGEX}; not yet matched
     */
    public static Matcher matchLine(String line) {
        boolean startsWithBom = !line.isEmpty() && line.charAt(0) == BOM;
        return LINE_REGEX.matcher(startsWithBom ? line.substring(1) : line);
    }

    /**
     * Determines the charset of the stream via {@link EncodingUtils#getCharsetForBOM}.
     *
     * <p>GEDCOM 5.5.5 requirements this check stands for:
     * <ul>
     *   <li>demand that file starts with a Byte Order Mark (BOM)</li>
     *   <li>demand that the encoding is either UTF-8 or UTF-16</li>
     *   <li>must support both UTF-8 and UTF-16 GEDCOM files</li>
     *   <li>must support both Little-Endian and Big-Endian UTF-16 GEDCOM files</li>
     *   <li>reject files using anything else as not-GEDCOM, not even a valid GEDCOM header</li>
     * </ul>
     * NOTE(review): which BOMs are recognised and whether the helper consumes bytes from the
     * stream is decided inside {@code EncodingUtils} - confirm there.
     *
     * @param stream GEDCOM input to inspect
     * @return the charset reported by the helper, never {@code null}
     * @throws MissingBomException if the helper reports no charset
     */
    static Charset validateEncoding(InputStream stream) {
        Charset charset = EncodingUtils.getCharsetForBOM(stream);

        if(charset == null) {
            throw new MissingBomException();
        }

        return charset;
    }

}