FEAT 5.5.5 Tokenizer Charset Validation from BOM
parent
c62ab4fe26
commit
04350dd2fb
|
@ -1,8 +1,11 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
import de.nth.chronicle.gedcom.util.EncodingUtils;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.Stack;
|
||||
|
@ -22,17 +25,6 @@ class Gedcom555Tokenizer implements GedcomTokenizer {
|
|||
this.options = options;
|
||||
}
|
||||
|
||||
void validateEncoding(BufferedReader reader) {
|
||||
/*
|
||||
▪ demand that file starts with a Byte Order Mark (BOM)
|
||||
▪ demand that the encoding is either UTF-8 or UTF-16
|
||||
▪ must support both UTF-8 and UTF-16 GEDCOM files
|
||||
▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
|
||||
▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<RecordToken> parseRecords() throws Exception {
|
||||
|
||||
|
@ -88,7 +80,8 @@ class Gedcom555Tokenizer implements GedcomTokenizer {
|
|||
}
|
||||
|
||||
public static Gedcom555Tokenizer create(InputStream stream, TokenizerOptions options) {
|
||||
return new Gedcom555Tokenizer(new BufferedReader(new InputStreamReader(stream)), options);
|
||||
Charset charset = validateEncoding(stream);
|
||||
return new Gedcom555Tokenizer(new BufferedReader(new InputStreamReader(stream, charset)), options);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -104,5 +97,21 @@ class Gedcom555Tokenizer implements GedcomTokenizer {
|
|||
return LINE_REGEX.matcher(line);
|
||||
}
|
||||
|
||||
static Charset validateEncoding(InputStream stream) {
|
||||
/*
|
||||
▪ demand that file starts with a Byte Order Mark (BOM)
|
||||
▪ demand that the encoding is either UTF-8 or UTF-16
|
||||
▪ must support both UTF-8 and UTF-16 GEDCOM files
|
||||
▪ must support both Little-Endian and Big-Endian UTF-16 GEDCOM files
|
||||
▪ reject files using anything else as not-GEDCOM, not even a valid GEDCOM header
|
||||
*/
|
||||
Charset charset = EncodingUtils.getCharsetForBOM(stream);
|
||||
|
||||
if(charset == null) {
|
||||
throw new InvalidGedcomException.MissingBOM();
|
||||
}
|
||||
|
||||
return charset;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
package de.nth.chronicle.gedcom.parser;
|
||||
|
||||
/**
 * Unchecked base type for errors raised when input cannot be accepted as GEDCOM.
 * Concrete failure reasons are modelled as nested subclasses.
 */
public abstract class InvalidGedcomException extends RuntimeException {

    /**
     * @param message human-readable description of why the input was rejected
     */
    public InvalidGedcomException(String message) {
        super(message);
    }

    /**
     * Signals that the input does not carry a Byte Order Mark.
     */
    public static class MissingBOM extends InvalidGedcomException {

        /** Fixed detail message reported by every instance. */
        private static final String MESSAGE = "BOM is missing!";

        public MissingBOM() {
            super(MESSAGE);
        }
    }
}
|
|
@ -3,10 +3,10 @@ package de.nth.chronicle.gedcom.parser;
|
|||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
public class TokenizerOptions {
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
package de.nth.chronicle.gedcom.util;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
 * Static helpers for detecting the encoding of raw input and for rendering bytes as text.
 */
public final class EncodingUtils {

    private EncodingUtils() { /* no-op */ }

    /**
     * Detects the charset announced by a Byte Order Mark at the current position of
     * {@code stream}.
     *
     * <p>Recognized marks are {@code EF BB BF} (UTF-8), {@code FE FF} (UTF-16 big-endian)
     * and {@code FF FE} (UTF-16 little-endian). The complete byte sequence must match;
     * a matching first byte alone is not treated as a BOM.
     *
     * <p>On a match exactly the BOM bytes are consumed, so the next byte read from the
     * stream is the first content byte. Without a match the bytes inspected so far
     * (at most three) have been consumed and are not pushed back.
     *
     * @param stream input to inspect; must not be {@code null}
     * @return the charset for the detected BOM, or {@code null} if the stream does not
     *         start with a recognized BOM (including an empty or too-short stream)
     * @throws java.io.UncheckedIOException if reading from the stream fails
     * @throws NullPointerException if {@code stream} is {@code null}
     */
    public static Charset getCharsetForBOM(InputStream stream) {
        java.util.Objects.requireNonNull(stream, "stream");

        try {
            // Single-byte reads return -1 at end of stream, so short input can never
            // be mistaken for BOM bytes and no partial-read bookkeeping is needed.
            int first = stream.read();
            if (first == -1) {
                return null;
            }
            int second = stream.read();

            if (first == 0xFE && second == 0xFF) {
                return StandardCharsets.UTF_16BE;
            }
            if (first == 0xFF && second == 0xFE) {
                return StandardCharsets.UTF_16LE;
            }
            // The UTF-8 BOM is three bytes long; the third byte is only read once the
            // first two match, so UTF-16 input never loses a content byte.
            if (first == 0xEF && second == 0xBB && stream.read() == 0xBF) {
                return StandardCharsets.UTF_8;
            }

            return null;
        } catch (java.io.IOException e) {
            throw new java.io.UncheckedIOException(e);
        }
    }

    /**
     * Renders {@code bytes} as a lower-case hexadecimal string, two digits per byte,
     * without separators.
     *
     * @param bytes the bytes to render
     * @return the hexadecimal representation; empty for an empty array
     */
    public static String byteArrayToHex(byte[] bytes) {
        StringBuilder sb = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            // Mask after shifting: the byte is sign-extended to int before the shift.
            sb.append(Character.forDigit((b >> 4) & 0xF, 16));
            sb.append(Character.forDigit(b & 0xF, 16));
        }
        return sb.toString();
    }

}
|
|
@ -71,12 +71,8 @@ public class Gedcom555TokenizerTests {
|
|||
e.printStackTrace();
|
||||
}
|
||||
|
||||
records.forEach(System.out::println);
|
||||
|
||||
assertEquals(3, records.size());
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
package de.nth.chronicle.gedcom.util;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public class EncodingUtilsTests {
|
||||
|
||||
@Test
|
||||
void testCharsetForBOM() throws IOException {
|
||||
|
||||
InputStream stream;
|
||||
|
||||
stream = EncodingUtils.class.getResourceAsStream("/examples/555SAMPLE.ged");
|
||||
Assertions.assertEquals(StandardCharsets.UTF_8, EncodingUtils.getCharsetForBOM(stream));
|
||||
stream.close();
|
||||
|
||||
stream = EncodingUtils.class.getResourceAsStream("/examples/555SAMPLE16LE.ged");
|
||||
Assertions.assertEquals(StandardCharsets.UTF_16LE, EncodingUtils.getCharsetForBOM(stream));
|
||||
stream.close();
|
||||
|
||||
stream = EncodingUtils.class.getResourceAsStream("/examples/555SAMPLE16BE.ged");
|
||||
Assertions.assertEquals(StandardCharsets.UTF_16BE, EncodingUtils.getCharsetForBOM(stream));
|
||||
stream.close();
|
||||
|
||||
stream = EncodingUtils.class.getResourceAsStream("/examples/555SAMPLE_nobom.ged");
|
||||
Assertions.assertEquals(null, EncodingUtils.getCharsetForBOM(stream));
|
||||
stream.close();
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
0 HEAD
|
||||
1 GEDC
|
||||
2 VERS 5.5.5
|
||||
2 FORM LINEAGE-LINKED
|
||||
3 VERS 5.5.5
|
||||
1 CHAR UTF-8
|
||||
1 SOUR GS
|
||||
2 NAME GEDCOM Specification
|
||||
2 VERS 5.5.5
|
||||
2 CORP gedcom.org
|
||||
3 ADDR
|
||||
4 CITY LEIDEN
|
||||
3 WWW www.gedcom.org
|
||||
1 DATE 2 Oct 2019
|
||||
2 TIME 0:00:00
|
||||
1 FILE 555Sample.ged
|
||||
1 LANG English
|
||||
1 SUBM @U1@
|
||||
0 @U1@ SUBM
|
||||
1 NAME Reldon Poulson
|
||||
1 ADDR
|
||||
2 ADR1 1900 43rd Street West
|
||||
2 CITY Billings
|
||||
2 STAE Montana
|
||||
2 POST 68051
|
||||
2 CTRY United States of America
|
||||
1 PHON +1 (406) 555-1232
|
||||
0 @I1@ INDI
|
||||
1 NAME Robert Eugene /Williams/
|
||||
2 SURN Williams
|
||||
2 GIVN Robert Eugene
|
||||
1 SEX M
|
||||
1 BIRT
|
||||
2 DATE 2 Oct 1822
|
||||
2 PLAC Weston, Madison, Connecticut, United States of America
|
||||
2 SOUR @S1@
|
||||
3 PAGE Sec. 2, p. 45
|
||||
1 DEAT
|
||||
2 DATE 14 Apr 1905
|
||||
2 PLAC Stamford, Fairfield, Connecticut, United States of America
|
||||
1 BURI
|
||||
2 PLAC Spring Hill Cemetery, Stamford, Fairfield, Connecticut, United States of America
|
||||
1 FAMS @F1@
|
||||
1 FAMS @F2@
|
||||
1 RESI
|
||||
2 DATE from 1900 to 1905
|
||||
0 @I2@ INDI
|
||||
1 NAME Mary Ann /Wilson/
|
||||
2 SURN Wilson
|
||||
2 GIVN Mary Ann
|
||||
1 SEX F
|
||||
1 BIRT
|
||||
2 DATE BEF 1828
|
||||
2 PLAC Connecticut, United States of America
|
||||
1 FAMS @F1@
|
||||
0 @I3@ INDI
|
||||
1 NAME Joe /Williams/
|
||||
2 SURN Williams
|
||||
2 GIVN Joe
|
||||
1 SEX M
|
||||
1 BIRT
|
||||
2 DATE 11 Jun 1861
|
||||
2 PLAC Idaho Falls, Bonneville, Idaho, United States of America
|
||||
1 FAMC @F1@
|
||||
1 FAMC @F2@
|
||||
2 PEDI adopted
|
||||
1 ADOP
|
||||
2 DATE 16 Mar 1864
|
||||
0 @F1@ FAM
|
||||
1 HUSB @I1@
|
||||
1 WIFE @I2@
|
||||
1 CHIL @I3@
|
||||
1 MARR
|
||||
2 DATE Dec 1859
|
||||
2 PLAC Rapid City, Pennington, South Dakota, United States of America
|
||||
0 @F2@ FAM
|
||||
1 HUSB @I1@
|
||||
1 CHIL @I3@
|
||||
0 @S1@ SOUR
|
||||
1 DATA
|
||||
2 EVEN BIRT, DEAT, MARR
|
||||
3 DATE FROM Jan 1820 TO DEC 1825
|
||||
3 PLAC Madison, Connecticut, United States of America
|
||||
2 AGNC Madison County Court
|
||||
1 TITL Madison County Birth, Death, and Marriage Records
|
||||
1 ABBR Madison BMD Records
|
||||
1 REPO @R1@
|
||||
2 CALN 13B-1234.01
|
||||
0 @R1@ REPO
|
||||
1 NAME Family History Library
|
||||
1 ADDR
|
||||
2 ADR1 35 N West Temple Street
|
||||
2 CITY Salt Lake City
|
||||
2 STAE Utah
|
||||
2 POST 84150
|
||||
2 CTRY United States of America
|
||||
0 TRLR
|
Loading…
Reference in New Issue