FEAT 5.5.5 Tokenizer Charset Validation from BOM

master
Niclas Thobaben 2020-11-27 00:32:01 +01:00
parent c62ab4fe26
commit 04350dd2fb
7 changed files with 215 additions and 19 deletions

View File

@ -1,8 +1,11 @@
package de.nth.chronicle.gedcom.parser;
import de.nth.chronicle.gedcom.util.EncodingUtils;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Set;
import java.util.Stack;
@ -22,17 +25,6 @@ class Gedcom555Tokenizer implements GedcomTokenizer {
this.options = options;
}
/**
 * Encoding validation hook taking an already-constructed reader.
 * The body contains no statements; it only records the encoding
 * requirements listed below.
 *
 * @param reader reader over the GEDCOM input (not used by this method)
 */
void validateEncoding(BufferedReader reader) {
/*
Encoding requirements:
- demand that the file starts with a Byte Order Mark (BOM)
- demand that the encoding is either UTF-8 or UTF-16
- must support both UTF-8 and UTF-16 GEDCOM files
- must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
- reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
*/
}
@Override
public Set<RecordToken> parseRecords() throws Exception {
@ -88,7 +80,8 @@ class Gedcom555Tokenizer implements GedcomTokenizer {
}
/**
 * Creates a tokenizer that reads GEDCOM data from the given stream.
 * <p>
 * The charset is determined from the stream's Byte Order Mark via
 * {@link #validateEncoding(InputStream)}, and the reader is built with that
 * charset instead of the platform default.
 *
 * @param stream  source of the GEDCOM data; its leading bytes are consumed by the BOM check
 * @param options tokenizer options passed to the new instance
 * @return a tokenizer reading from {@code stream}
 * @throws InvalidGedcomException.MissingBOM if no BOM is recognised at the start of the stream
 */
public static Gedcom555Tokenizer create(InputStream stream, TokenizerOptions options) {
    // The BOM check must run before the reader is built: it selects the charset to decode with.
    Charset charset = validateEncoding(stream);
    return new Gedcom555Tokenizer(new BufferedReader(new InputStreamReader(stream, charset)), options);
}
/**
@ -104,5 +97,21 @@ class Gedcom555Tokenizer implements GedcomTokenizer {
return LINE_REGEX.matcher(line);
}
/**
 * Determines the charset of a GEDCOM stream from its Byte Order Mark.
 * <p>
 * Requirements this check belongs to:
 * <ul>
 *   <li>demand that the file starts with a Byte Order Mark (BOM)</li>
 *   <li>demand that the encoding is either UTF-8 or UTF-16</li>
 *   <li>must support both UTF-8 and UTF-16 GEDCOM files</li>
 *   <li>must support both Little-Endian and Big-Endian UTF-16 GEDCOM files</li>
 *   <li>reject files using anything else as not-GEDCOM, not even a valid GEDCOM header</li>
 * </ul>
 *
 * @param stream stream positioned at the start of the GEDCOM data
 * @return the charset reported by {@link EncodingUtils#getCharsetForBOM(InputStream)}
 * @throws InvalidGedcomException.MissingBOM if no charset could be derived from the BOM
 */
static Charset validateEncoding(InputStream stream) {
    final Charset detected = EncodingUtils.getCharsetForBOM(stream);
    if (detected != null) {
        return detected;
    }
    throw new InvalidGedcomException.MissingBOM();
}
}

View File

@ -0,0 +1,18 @@
package de.nth.chronicle.gedcom.parser;
/**
 * Abstract base for the unchecked exceptions that describe invalid GEDCOM input.
 * Concrete failure kinds are declared as nested subclasses.
 */
public abstract class InvalidGedcomException extends RuntimeException {

    /**
     * @param message human-readable description of the failure
     */
    public InvalidGedcomException(String message) {
        super(message);
    }

    /**
     * Raised when the input does not begin with a Byte Order Mark.
     */
    public static class MissingBOM extends InvalidGedcomException {

        private static final String MESSAGE = "BOM is missing!";

        public MissingBOM() {
            super(MESSAGE);
        }
    }
}

View File

@ -3,10 +3,10 @@ package de.nth.chronicle.gedcom.parser;
import lombok.Builder;
import lombok.Getter;
import java.nio.charset.Charset;
/**
 * Options object handed to the tokenizer on creation.
 * Lombok's {@code @Builder} and {@code @Getter} generate the builder and accessors;
 * no fields are declared in the class body shown here.
 */
@Builder
@Getter
public class TokenizerOptions {
}

View File

@ -0,0 +1,40 @@
package de.nth.chronicle.gedcom.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
/**
 * Static helpers for Byte Order Mark based charset detection and hex rendering of bytes.
 */
public final class EncodingUtils {

    private EncodingUtils() { /* no-op */ }

    /**
     * Detects the charset announced by the Byte Order Mark (BOM) at the start of a stream.
     * <p>
     * Recognised marks are {@code EF BB BF} (UTF-8), {@code FE FF} (UTF-16 big-endian) and
     * {@code FF FE} (UTF-16 little-endian). Every byte of a recognised BOM is consumed, so the
     * stream is positioned on the first content byte afterwards. When no BOM is recognised,
     * the bytes that were inspected have been consumed as well.
     *
     * @param stream stream positioned at its first byte
     * @return the charset matching the BOM, or {@code null} if the stream does not start with
     *         one of the recognised marks (including an empty or truncated stream)
     * @throws UncheckedIOException if reading from the stream fails
     */
    public static Charset getCharsetForBOM(InputStream stream) {
        try {
            // read() yields -1 at end of stream, which matches none of the BOM bytes below.
            int first = stream.read();
            int second = stream.read();

            if (first == 0xEF && second == 0xBB) {
                // The UTF-8 mark is three bytes long; the last one must be consumed too,
                // otherwise it would be decoded as part of the content.
                return stream.read() == 0xBF ? StandardCharsets.UTF_8 : null;
            }
            if (first == 0xFE && second == 0xFF) {
                return StandardCharsets.UTF_16BE;
            }
            if (first == 0xFF && second == 0xFE) {
                return StandardCharsets.UTF_16LE;
            }
            return null;
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Renders a byte array as a lowercase hexadecimal string, two digits per byte.
     *
     * @param bytes bytes to render
     * @return hex string of length {@code bytes.length * 2}
     */
    public static String byteArrayToHex(byte[] bytes) {
        StringBuilder sb = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            // Mask after shifting so sign extension of negative bytes is discarded.
            sb.append(Character.forDigit((b >> 4) & 0xF, 16));
            sb.append(Character.forDigit(b & 0xF, 16));
        }
        return sb.toString();
    }
}

View File

@ -71,12 +71,8 @@ public class Gedcom555TokenizerTests {
e.printStackTrace();
}
records.forEach(System.out::println);
assertEquals(3, records.size());
}
}

View File

@ -0,0 +1,36 @@
package de.nth.chronicle.gedcom.util;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
/**
 * Tests for {@link EncodingUtils}.
 */
public class EncodingUtilsTests {

    /**
     * Checks BOM detection against the sample files for UTF-8, UTF-16 little-endian,
     * UTF-16 big-endian and a file without a BOM.
     */
    @Test
    void testCharsetForBOM() throws IOException {
        // try-with-resources closes each stream even when its assertion fails.
        try (InputStream stream = openResource("/examples/555SAMPLE.ged")) {
            Assertions.assertEquals(StandardCharsets.UTF_8, EncodingUtils.getCharsetForBOM(stream));
        }

        try (InputStream stream = openResource("/examples/555SAMPLE16LE.ged")) {
            Assertions.assertEquals(StandardCharsets.UTF_16LE, EncodingUtils.getCharsetForBOM(stream));
        }

        try (InputStream stream = openResource("/examples/555SAMPLE16BE.ged")) {
            Assertions.assertEquals(StandardCharsets.UTF_16BE, EncodingUtils.getCharsetForBOM(stream));
        }

        try (InputStream stream = openResource("/examples/555SAMPLE_nobom.ged")) {
            Assertions.assertNull(EncodingUtils.getCharsetForBOM(stream));
        }
    }

    /**
     * Opens a classpath resource and fails the test with a clear message if it is absent.
     */
    private static InputStream openResource(String path) {
        InputStream stream = EncodingUtils.class.getResourceAsStream(path);
        Assertions.assertNotNull(stream, "test resource not found: " + path);
        return stream;
    }
}

View File

@ -0,0 +1,97 @@
0 HEAD
1 GEDC
2 VERS 5.5.5
2 FORM LINEAGE-LINKED
3 VERS 5.5.5
1 CHAR UTF-8
1 SOUR GS
2 NAME GEDCOM Specification
2 VERS 5.5.5
2 CORP gedcom.org
3 ADDR
4 CITY LEIDEN
3 WWW www.gedcom.org
1 DATE 2 Oct 2019
2 TIME 0:00:00
1 FILE 555Sample.ged
1 LANG English
1 SUBM @U1@
0 @U1@ SUBM
1 NAME Reldon Poulson
1 ADDR
2 ADR1 1900 43rd Street West
2 CITY Billings
2 STAE Montana
2 POST 68051
2 CTRY United States of America
1 PHON +1 (406) 555-1232
0 @I1@ INDI
1 NAME Robert Eugene /Williams/
2 SURN Williams
2 GIVN Robert Eugene
1 SEX M
1 BIRT
2 DATE 2 Oct 1822
2 PLAC Weston, Madison, Connecticut, United States of America
2 SOUR @S1@
3 PAGE Sec. 2, p. 45
1 DEAT
2 DATE 14 Apr 1905
2 PLAC Stamford, Fairfield, Connecticut, United States of America
1 BURI
2 PLAC Spring Hill Cemetery, Stamford, Fairfield, Connecticut, United States of America
1 FAMS @F1@
1 FAMS @F2@
1 RESI
2 DATE from 1900 to 1905
0 @I2@ INDI
1 NAME Mary Ann /Wilson/
2 SURN Wilson
2 GIVN Mary Ann
1 SEX F
1 BIRT
2 DATE BEF 1828
2 PLAC Connecticut, United States of America
1 FAMS @F1@
0 @I3@ INDI
1 NAME Joe /Williams/
2 SURN Williams
2 GIVN Joe
1 SEX M
1 BIRT
2 DATE 11 Jun 1861
2 PLAC Idaho Falls, Bonneville, Idaho, United States of America
1 FAMC @F1@
1 FAMC @F2@
2 PEDI adopted
1 ADOP
2 DATE 16 Mar 1864
0 @F1@ FAM
1 HUSB @I1@
1 WIFE @I2@
1 CHIL @I3@
1 MARR
2 DATE Dec 1859
2 PLAC Rapid City, Pennington, South Dakota, United States of America
0 @F2@ FAM
1 HUSB @I1@
1 CHIL @I3@
0 @S1@ SOUR
1 DATA
2 EVEN BIRT, DEAT, MARR
3 DATE FROM Jan 1820 TO DEC 1825
3 PLAC Madison, Connecticut, United States of America
2 AGNC Madison County Court
1 TITL Madison County Birth, Death, and Marriage Records
1 ABBR Madison BMD Records
1 REPO @R1@
2 CALN 13B-1234.01
0 @R1@ REPO
1 NAME Family History Library
1 ADDR
2 ADR1 35 N West Temple Street
2 CITY Salt Lake City
2 STAE Utah
2 POST 84150
2 CTRY United States of America
0 TRLR