(fix) generalized HTML sanitization to avoid encoding issues when replying/forwarding mails

This commit is contained in:
Ludovic Marcotte 2017-10-18 09:33:44 -04:00
parent a2f84f1358
commit 5cd3a8f245
4 changed files with 210 additions and 200 deletions

View file

@ -1,6 +1,6 @@
/* NSData+Mail.h - this file is part of SOGo
*
* Copyright (C) 2007-2015 Inverse inc.
* Copyright (C) 2007-2017 Inverse inc.
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -29,8 +29,8 @@
- (NSData *) bodyDataFromEncoding: (NSString *) encoding;
- (NSString *) bodyStringFromCharset: (NSString *) charset;
- (NSString *) decodedHeader;
/* Returns a sanitized copy of the receiver, treated as HTML bytes: the
   charset=... parameter found after a <meta tag is removed, and closing
   tags whose name is listed in theVoidTags (e.g. </br>) lose their
   leading slash. When theVoidTags is nil, a built-in list of HTML 4
   void elements is used. */
- (NSData *) sanitizedContentUsingVoidTags: (NSArray *) theVoidTags;
@end

View file

@ -1,6 +1,6 @@
/* NSData+Mail.m - this file is part of SOGo
*
* Copyright (C) 2007-2015 Inverse inc.
* Copyright (C) 2007-2017 Inverse inc.
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -18,6 +18,7 @@
* Boston, MA 02111-1307, USA.
*/
#import <Foundation/NSArray.h>
#import <Foundation/NSString.h>
#import <NGExtensions/NGBase64Coding.h>
@ -212,4 +213,195 @@
return decodedString;
}
//
// In order to avoid a libxml bug/limitation, we strip the charset= parameter
// so that libxml does not take it into account: libxml always works in UTF-8
// internally.
//
// A fix was committed by Daniel Veillard following discussions Inverse had
// with him on the issue:
//
// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870
// Author: Daniel Veillard <veillard redhat com>
// Date: Mon May 16 16:03:50 2011 +0800
//
// Add options to ignore the internal encoding
//
// For both XML and HTML, the document can provide an encoding
// either in XMLDecl in XML, or as a meta element in HTML head.
// This adds options to ignore those encodings if the encoding
// is known in advace for example if the content had been converted
// before being passed to the parser.
//
// * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
// for XML parsing
// * include/libxml/HTMLparser.h HTMLparser.c: adds the
// HTML_PARSE_IGNORE_ENC for HTML parsing
// * HTMLtree.c: fix the handling of saving when an unknown encoding is
// defined in meta document header
// * xmllint.c: add a --noenc option to activate the new parser options
//
//
//
// Returns a sanitized copy of the receiver, which is treated as raw HTML
// bytes. Two passes are performed on a mutable copy:
//
//  1. the "charset=<value>" parameter found inside a <meta ...> tag is
//     removed (at most one per <meta> tag);
//  2. closing tags whose name is listed in theVoidTags (ex: </br>) have
//     their leading slash removed (</br> becomes <br>).
//
// theVoidTags is an array of NSString tag names, compared case-insensitively.
// When nil, a default list of HTML 4 void elements is used.
//
- (NSData *) sanitizedContentUsingVoidTags: (NSArray *) theVoidTags
{
  NSMutableData *d;
  NSString *found_tag, *tag;
  NSEnumerator *tags;
  const char *bytes;
  int i, j, len;
  BOOL found_delimiter, in_meta;

  d = [NSMutableData dataWithData: self];
  bytes = [d bytes];
  len = [d length];
  i = 0;
  in_meta = NO;

  while (i < len)
    {
      // We check if we see <meta ...> in which case, we strip the charset= stuff.
      if (i < len-5)
        {
          if ((*bytes == '<') &&
              (*(bytes+1) == 'm' || *(bytes+1) == 'M') &&
              (*(bytes+2) == 'e' || *(bytes+2) == 'E') &&
              (*(bytes+3) == 't' || *(bytes+3) == 'T') &&
              (*(bytes+4) == 'a' || *(bytes+4) == 'A') &&
              (*(bytes+5) == ' '))
            in_meta = YES;
        }

      // The <meta ...> tag ends here. Without this, a "charset=" appearing
      // later in the document text would be stripped too.
      if (in_meta && *bytes == '>')
        in_meta = NO;

      // We search for something like :
      //
      // <meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
      //
      if (in_meta && i < len-9)
        {
          if ((*bytes == 'c' || *bytes == 'C') &&
              (*(bytes+1) == 'h' || *(bytes+1) == 'H') &&
              (*(bytes+2) == 'a' || *(bytes+2) == 'A') &&
              (*(bytes+3) == 'r' || *(bytes+3) == 'R') &&
              (*(bytes+4) == 's' || *(bytes+4) == 'S') &&
              (*(bytes+5) == 'e' || *(bytes+5) == 'E') &&
              (*(bytes+6) == 't' || *(bytes+6) == 'T') &&
              (*(bytes+7) == '='))
            {
              // We search until we find a quote, a space or the end of the
              // tag. Stopping on '>' prevents an unquoted value such as
              // <meta charset=utf-8> from swallowing the markup that follows.
              j = 8;
              found_delimiter = YES;

              while (*(bytes+j) != ' ' && *(bytes+j) != '"' &&
                     *(bytes+j) != '\'' && *(bytes+j) != '>')
                {
                  j++;

                  // We haven't found anything, let's leave the data untouched
                  if ((i+j) >= len)
                    {
                      in_meta = found_delimiter = NO;
                      break;
                    }
                }

              if (found_delimiter)
                {
                  [d replaceBytesInRange: NSMakeRange(i, j)
                               withBytes: NULL
                                  length: 0];

                  // The data just shrank: re-read the buffer pointer and the
                  // length so the loop never reads past the new end.
                  bytes = (const char *) [d bytes] + i;
                  len = [d length];
                  in_meta = found_delimiter = NO;
                }
            }
        }

      bytes++;
      i++;
    }

  /*
   * Replace badly formatted void tags
   *
   * A void tag that begins with a slash is considered invalid.
   * We remove the slash from those tags.
   *
   * Ex: </br> is replaced by <br>
   */
  if (!theVoidTags)
    {
      /* see http://www.w3.org/TR/html4/index/elements.html */
      theVoidTags = [[[NSArray alloc] initWithObjects: @"area", @"base",
                                      @"basefont", @"br", @"col", @"frame", @"hr",
                                      @"img", @"input", @"isindex", @"link",
                                      @"meta", @"param", @"", nil] autorelease];
    }

  bytes = [d bytes];
  len = [d length];
  i = 0;

  while (i < len)
    {
      if (i < len-3)
        {
          // Search for ending tags
          if ((*bytes == '<') && (*(bytes+1) == '/'))
            {
              // Skip "</" so that bytes points to the tag name
              i += 2;
              bytes += 2;
              j = 0;
              found_delimiter = YES;

              while (*(bytes+j) != '>')
                {
                  j++;
                  if ((i+j) >= len)
                    {
                      found_delimiter = NO;
                      break;
                    }
                }

              if (found_delimiter && j > 0)
                {
                  // Copy the ending tag name to a NSString; the result is nil
                  // when the bytes are not valid UTF-8, in which case no
                  // comparison is attempted.
                  found_tag = [[[NSString alloc] initWithBytes: bytes
                                                        length: j
                                                      encoding: NSUTF8StringEncoding] autorelease];
                  tags = [theVoidTags objectEnumerator];
                  tag = [tags nextObject];

                  while (tag && found_tag)
                    {
                      if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame)
                        {
                          // Remove the leading slash
                          //NSLog(@"Found void tag with invalid leading slash: </%@>", found_tag);
                          i--;
                          [d replaceBytesInRange: NSMakeRange(i, 1)
                                       withBytes: NULL
                                          length: 0];

                          // Re-read pointer and length after the mutation;
                          // bytes now points to the tag name again.
                          bytes = [d bytes];
                          bytes += i;
                          len = [d length];
                          break;
                        }

                      tag = [tags nextObject];
                    }

                  // Continue the parsing after end tag
                  i += j;
                  bytes += j;
                }
            }
        }

      bytes++;
      i++;
    }

  return d;
}
@end

View file

@ -106,12 +106,14 @@
//
- (NSString *) _contentForEditingFromKeys: (NSArray *) keys
{
NSArray *types;
NSDictionary *parts;
NSString *rawPart, *content, *contentKey;
SOGoUserDefaults *ud;
NSUInteger index;
NSDictionary *parts;
NSArray *types;
NSData *data;
BOOL htmlComposition, htmlContent;
NSUInteger index;
content = @"";
@ -156,7 +158,14 @@
}
}
return content;
// We strip charset= information from HTML content to avoid SOGo setting
// the encoding of the final mail to UTF-8 while keeping charset="iso-8859-1"
// in the HTML meta headers, for example. That would cause encoding display
// issues with most MUAs.
data = [[content dataUsingEncoding: NSUTF8StringEncoding] sanitizedContentUsingVoidTags: nil];
content = [[NSString alloc] initWithData: data encoding: NSUTF8StringEncoding];
return [content autorelease];
}
//

View file

@ -110,197 +110,6 @@ _xmlCharsetForCharset (NSString *charset)
return encoding;
}
//
// In order to avoid a libxml bug/limitation, we strip the charset= parameter
// so that libxml does not take it into account: libxml always works in UTF-8
// internally.
//
// A fix was committed by Daniel Veillard following discussions Inverse had
// with him on the issue:
//
// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870
// Author: Daniel Veillard <veillard redhat com>
// Date: Mon May 16 16:03:50 2011 +0800
//
// Add options to ignore the internal encoding
//
// For both XML and HTML, the document can provide an encoding
// either in XMLDecl in XML, or as a meta element in HTML head.
// This adds options to ignore those encodings if the encoding
// is known in advace for example if the content had been converted
// before being passed to the parser.
//
// * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
// for XML parsing
// * include/libxml/HTMLparser.h HTMLparser.c: adds the
// HTML_PARSE_IGNORE_ENC for HTML parsing
// * HTMLtree.c: fix the handling of saving when an unknown encoding is
// defined in meta document header
// * xmllint.c: add a --noenc option to activate the new parser options
//
//
//
// Returns a sanitized copy of theData, which is treated as raw HTML bytes.
// Two passes are performed on a mutable copy:
//
//  1. the "charset=<value>" parameter found inside a <meta ...> tag is
//     removed (at most one per <meta> tag);
//  2. closing tags whose name is listed in VoidTags (ex: </br>) have their
//     leading slash removed (</br> becomes <br>).
//
// VoidTags is initialized here with a list of HTML 4 void elements when it
// has not been set yet.
//
static NSData* _sanitizeContent(NSData *theData)
{
  NSMutableData *d;
  NSString *found_tag, *tag;
  NSEnumerator *tags;
  const char *bytes;
  int i, j, len;
  BOOL found_delimiter, in_meta;

  d = [NSMutableData dataWithData: theData];
  bytes = [d bytes];
  len = [d length];
  i = 0;
  in_meta = NO;

  while (i < len)
    {
      // We check if we see <meta ...> in which case, we strip the charset= stuff.
      if (i < len-5)
        {
          if ((*bytes == '<') &&
              (*(bytes+1) == 'm' || *(bytes+1) == 'M') &&
              (*(bytes+2) == 'e' || *(bytes+2) == 'E') &&
              (*(bytes+3) == 't' || *(bytes+3) == 'T') &&
              (*(bytes+4) == 'a' || *(bytes+4) == 'A') &&
              (*(bytes+5) == ' '))
            in_meta = YES;
        }

      // The <meta ...> tag ends here. Without this, a "charset=" appearing
      // later in the document text would be stripped too.
      if (in_meta && *bytes == '>')
        in_meta = NO;

      // We search for something like :
      //
      // <meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
      //
      if (in_meta && i < len-9)
        {
          if ((*bytes == 'c' || *bytes == 'C') &&
              (*(bytes+1) == 'h' || *(bytes+1) == 'H') &&
              (*(bytes+2) == 'a' || *(bytes+2) == 'A') &&
              (*(bytes+3) == 'r' || *(bytes+3) == 'R') &&
              (*(bytes+4) == 's' || *(bytes+4) == 'S') &&
              (*(bytes+5) == 'e' || *(bytes+5) == 'E') &&
              (*(bytes+6) == 't' || *(bytes+6) == 'T') &&
              (*(bytes+7) == '='))
            {
              // We search until we find a quote, a space or the end of the
              // tag. Stopping on '>' prevents an unquoted value such as
              // <meta charset=utf-8> from swallowing the markup that follows.
              j = 8;
              found_delimiter = YES;

              while (*(bytes+j) != ' ' && *(bytes+j) != '"' &&
                     *(bytes+j) != '\'' && *(bytes+j) != '>')
                {
                  j++;

                  // We haven't found anything, let's leave the data untouched
                  if ((i+j) >= len)
                    {
                      in_meta = found_delimiter = NO;
                      break;
                    }
                }

              if (found_delimiter)
                {
                  [d replaceBytesInRange: NSMakeRange(i, j)
                               withBytes: NULL
                                  length: 0];

                  // The data just shrank: re-read the buffer pointer and the
                  // length so the loop never reads past the new end.
                  bytes = (const char *) [d bytes] + i;
                  len = [d length];
                  in_meta = found_delimiter = NO;
                }
            }
        }

      bytes++;
      i++;
    }

  /*
   * Replace badly formatted void tags
   *
   * A void tag that begins with a slash is considered invalid.
   * We remove the slash from those tags.
   *
   * Ex: </br> is replaced by <br>
   */
  if (!VoidTags)
    {
      /* see http://www.w3.org/TR/html4/index/elements.html */
      VoidTags = [[NSArray alloc] initWithObjects: @"area", @"base",
                                  @"basefont", @"br", @"col", @"frame", @"hr",
                                  @"img", @"input", @"isindex", @"link",
                                  @"meta", @"param", @"", nil];
    }

  bytes = [d bytes];
  len = [d length];
  i = 0;

  while (i < len)
    {
      if (i < len-3)
        {
          // Search for ending tags
          if ((*bytes == '<') && (*(bytes+1) == '/'))
            {
              // Skip "</" so that bytes points to the tag name
              i += 2;
              bytes += 2;
              j = 0;
              found_delimiter = YES;

              while (*(bytes+j) != '>')
                {
                  j++;
                  if ((i+j) >= len)
                    {
                      found_delimiter = NO;
                      break;
                    }
                }

              if (found_delimiter && j > 0)
                {
                  // Copy the ending tag name to a NSString; the result is nil
                  // when the bytes are not valid UTF-8, in which case no
                  // comparison is attempted.
                  found_tag = [[[NSString alloc] initWithBytes: bytes
                                                        length: j
                                                      encoding: NSUTF8StringEncoding] autorelease];
                  tags = [VoidTags objectEnumerator];
                  tag = [tags nextObject];

                  while (tag && found_tag)
                    {
                      if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame)
                        {
                          // Remove the leading slash
                          //NSLog(@"Found void tag with invalid leading slash: </%@>", found_tag);
                          i--;
                          [d replaceBytesInRange: NSMakeRange(i, 1)
                                       withBytes: NULL
                                          length: 0];

                          // Re-read pointer and length after the mutation;
                          // bytes now points to the tag name again.
                          bytes = [d bytes];
                          bytes += i;
                          len = [d length];
                          break;
                        }

                      tag = [tags nextObject];
                    }

                  // Continue the parsing after end tag
                  i += j;
                  bytes += j;
                }
            }
        }

      bytes++;
      i++;
    }

  return d;
}
@interface _UIxHTMLMailContentHandler : NSObject <SaxContentHandler, SaxLexicalHandler>
{
NSMutableString *result;
@ -853,7 +662,7 @@ static NSData* _sanitizeContent(NSData *theData)
mail = [self clientObject];
preparsedContent = _sanitizeContent([super decodedFlatContent]);
preparsedContent = [[super decodedFlatContent] sanitizedContentUsingVoidTags: VoidTags];
parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
createXMLReaderForMimeType: @"text/html"];
@ -971,7 +780,7 @@ static NSData* _sanitizeContent(NSData *theData)
part = [self clientObject];
mail = [part mailObject];
preparsedContent = _sanitizeContent([part fetchBLOB]);
preparsedContent = [[part fetchBLOB] sanitizedContentUsingVoidTags: VoidTags];
parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
createXMLReaderForMimeType: @"text/html"];
encoding = [[part partInfo] valueForKey: @"encoding"];