(fix) generalized HTML sanitization to avoid encoding issues when replying/forwarding mails
This commit is contained in:
parent
a2f84f1358
commit
5cd3a8f245
|
@ -1,6 +1,6 @@
|
|||
/* NSData+Mail.h - this file is part of SOGo
|
||||
*
|
||||
* Copyright (C) 2007-2015 Inverse inc.
|
||||
* Copyright (C) 2007-2017 Inverse inc.
|
||||
*
|
||||
* This file is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
@ -29,8 +29,8 @@
|
|||
|
||||
- (NSData *) bodyDataFromEncoding: (NSString *) encoding;
|
||||
- (NSString *) bodyStringFromCharset: (NSString *) charset;
|
||||
|
||||
- (NSString *) decodedHeader;
|
||||
- (NSData *) sanitizedContentUsingVoidTags: (NSArray *) theVoidTags;
|
||||
|
||||
@end
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/* NSData+Mail.m - this file is part of SOGo
|
||||
*
|
||||
* Copyright (C) 2007-2015 Inverse inc.
|
||||
* Copyright (C) 2007-2017 Inverse inc.
|
||||
*
|
||||
* This file is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
@ -18,6 +18,7 @@
|
|||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#import <Foundation/NSArray.h>
|
||||
#import <Foundation/NSString.h>
|
||||
|
||||
#import <NGExtensions/NGBase64Coding.h>
|
||||
|
@ -212,4 +213,195 @@
|
|||
return decodedString;
|
||||
}
|
||||
|
||||
//
|
||||
// In order to avoid a libxml bug/limitation, we strip the charset= parameter
|
||||
// to avoid libxml to consider the charset= parameter while it works in UTF-8
|
||||
// internally, all the time.
|
||||
//
|
||||
// A fix was committed by Daniel Veillard following discussions Inverse had
|
||||
// with him on the issue:
|
||||
//
|
||||
// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870
|
||||
// Author: Daniel Veillard <veillard redhat com>
|
||||
// Date: Mon May 16 16:03:50 2011 +0800
|
||||
//
|
||||
// Add options to ignore the internal encoding
|
||||
//
|
||||
// For both XML and HTML, the document can provide an encoding
|
||||
// either in XMLDecl in XML, or as a meta element in HTML head.
|
||||
// This adds options to ignore those encodings if the encoding
|
||||
// is known in advance for example if the content had been converted
|
||||
// before being passed to the parser.
|
||||
//
|
||||
// * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
|
||||
// for XML parsing
|
||||
// * include/libxml/HTMLparser.h HTMLparser.c: adds the
|
||||
// HTML_PARSE_IGNORE_ENC for HTML parsing
|
||||
// * HTMLtree.c: fix the handling of saving when an unknown encoding is
|
||||
// defined in meta document header
|
||||
// * xmllint.c: add a --noenc option to activate the new parser options
|
||||
//
|
||||
//
|
||||
/*
 * Returns a sanitized copy of the receiver, which is treated as raw HTML
 * bytes. The receiver itself is never modified. Two passes are performed:
 *
 *  1. Inside a "<meta " tag, the first "charset=<value>" run (matched case
 *     insensitively, up to but excluding the next space, '"' or '\'') is
 *     removed, so the markup no longer announces an encoding.
 *
 *  2. Closing tags whose name is listed in theVoidTags lose their slash,
 *     e.g. "</br>" becomes "<br>". Names are compared case insensitively.
 *
 * theVoidTags: array of NSString tag names; when nil, the HTML 4 list of
 *              empty elements is used.
 *
 * Returns an autoreleased data object holding the sanitized bytes.
 */
- (NSData *) sanitizedContentUsingVoidTags: (NSArray *) theVoidTags
{
  NSMutableData *d;
  NSString *found_tag, *tag;
  NSEnumerator *tags;
  const char *bytes;
  NSUInteger i, j, len;
  BOOL found_delimiter, in_meta;

  d = [NSMutableData dataWithData: self];
  bytes = [d bytes];
  len = [d length];
  in_meta = NO;

  // Pass 1: strip the charset= parameter found inside <meta ...> tags.
  // All bounds checks are written as additions because the indices are
  // unsigned: "len - 5" would wrap around on very short inputs.
  for (i = 0; i < len; i++)
    {
      if (bytes[i] == '<')
        {
          // We check if we see <meta ...> in which case we strip the
          // charset= stuff that follows.
          if (i + 5 < len &&
              (bytes[i+1] == 'm' || bytes[i+1] == 'M') &&
              (bytes[i+2] == 'e' || bytes[i+2] == 'E') &&
              (bytes[i+3] == 't' || bytes[i+3] == 'T') &&
              (bytes[i+4] == 'a' || bytes[i+4] == 'A') &&
              (bytes[i+5] == ' '))
            in_meta = YES;
        }
      else if (bytes[i] == '>')
        {
          // The tag is closed: a "charset=" appearing later in regular
          // text must be left alone.
          in_meta = NO;
        }

      // We search for something like :
      //
      // <meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
      //
      if (in_meta && i + 9 < len &&
          (bytes[i]   == 'c' || bytes[i]   == 'C') &&
          (bytes[i+1] == 'h' || bytes[i+1] == 'H') &&
          (bytes[i+2] == 'a' || bytes[i+2] == 'A') &&
          (bytes[i+3] == 'r' || bytes[i+3] == 'R') &&
          (bytes[i+4] == 's' || bytes[i+4] == 'S') &&
          (bytes[i+5] == 'e' || bytes[i+5] == 'E') &&
          (bytes[i+6] == 't' || bytes[i+6] == 'T') &&
          (bytes[i+7] == '='))
        {
          // We search until we find a space or a quote
          j = 8;
          found_delimiter = YES;

          while (bytes[i+j] != ' ' && bytes[i+j] != '"' && bytes[i+j] != '\'')
            {
              j++;

              // We haven't found anything, let's leave the data untouched
              if (i + j >= len)
                {
                  in_meta = found_delimiter = NO;
                  break;
                }
            }

          if (found_delimiter)
            {
              [d replaceBytesInRange: NSMakeRange(i, j)
                           withBytes: NULL
                              length: 0];

              // The buffer has been mutated: refresh the cached pointer and
              // length so that we never scan past the new end of the data.
              bytes = [d bytes];
              len = [d length];
              in_meta = NO;
            }
        }
    }

  /*
   * Replace badly formatted void tags
   *
   * A void tag that begins with a slash is considered invalid.
   * We remove the slash from those tags.
   *
   * Ex: </br> is replaced by <br>
   */

  if (!theVoidTags)
    {
      /* see http://www.w3.org/TR/html4/index/elements.html */
      theVoidTags = [NSArray arrayWithObjects: @"area", @"base",
                             @"basefont", @"br", @"col", @"frame", @"hr",
                             @"img", @"input", @"isindex", @"link",
                             @"meta", @"param", @"", nil];
    }

  bytes = [d bytes];
  len = [d length];

  // Pass 2
  for (i = 0; i < len; i++)
    {
      // Search for ending tags
      if (!(i + 3 < len && bytes[i] == '<' && bytes[i+1] == '/'))
        continue;

      // i now designates the first byte of the tag name
      i += 2;
      j = 0;
      found_delimiter = YES;

      while (bytes[i+j] != '>')
        {
          j++;
          if (i + j >= len)
            {
              found_delimiter = NO;
              break;
            }
        }

      if (!found_delimiter || j == 0)
        continue;

      // Copy the tag name to a NSString; this yields nil when the bytes are
      // not valid UTF-8, in which case the tag is simply skipped.
      found_tag = [[[NSString alloc] initWithBytes: bytes + i
                                            length: j
                                          encoding: NSUTF8StringEncoding]
                    autorelease];

      if (found_tag)
        {
          tags = [theVoidTags objectEnumerator];
          while ((tag = [tags nextObject]))
            {
              if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame)
                {
                  // Remove the leading slash; afterwards i designates the
                  // first byte of the tag name again.
                  i--;
                  [d replaceBytesInRange: NSMakeRange(i, 1)
                               withBytes: NULL
                                  length: 0];
                  bytes = [d bytes];
                  len = [d length];
                  break;
                }
            }
        }

      // Continue the parsing after the end tag: i lands on '>' and the
      // loop increment steps over it.
      i += j;
    }

  return d;
}
|
||||
|
||||
@end
|
||||
|
|
|
@ -106,12 +106,14 @@
|
|||
//
|
||||
- (NSString *) _contentForEditingFromKeys: (NSArray *) keys
|
||||
{
|
||||
NSArray *types;
|
||||
NSDictionary *parts;
|
||||
NSString *rawPart, *content, *contentKey;
|
||||
SOGoUserDefaults *ud;
|
||||
NSUInteger index;
|
||||
NSDictionary *parts;
|
||||
NSArray *types;
|
||||
NSData *data;
|
||||
|
||||
BOOL htmlComposition, htmlContent;
|
||||
NSUInteger index;
|
||||
|
||||
content = @"";
|
||||
|
||||
|
@ -156,7 +158,14 @@
|
|||
}
|
||||
}
|
||||
|
||||
return content;
|
||||
// We strip charset= information from HTML content to avoid SOGo setting
|
||||
// the encoding of the final mail to UTF-8 while keeping charset="iso-8859-1"
|
||||
// in the HTML meta headers, for example. That would cause encoding display
|
||||
// issues with most MUAs.
|
||||
data = [[content dataUsingEncoding: NSUTF8StringEncoding] sanitizedContentUsingVoidTags: nil];
|
||||
content = [[NSString alloc] initWithData: data encoding: NSUTF8StringEncoding];
|
||||
|
||||
return [content autorelease];
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
@ -110,197 +110,6 @@ _xmlCharsetForCharset (NSString *charset)
|
|||
return encoding;
|
||||
}
|
||||
|
||||
//
|
||||
// In order to avoid a libxml bug/limitation, we strip the charset= parameter
|
||||
// to avoid libxml to consider the charset= parameter while it works in UTF-8
|
||||
// internally, all the time.
|
||||
//
|
||||
// A fix was committed by Daniel Veillard following discussions Inverse had
|
||||
// with him on the issue:
|
||||
//
|
||||
// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870
|
||||
// Author: Daniel Veillard <veillard redhat com>
|
||||
// Date: Mon May 16 16:03:50 2011 +0800
|
||||
//
|
||||
// Add options to ignore the internal encoding
|
||||
//
|
||||
// For both XML and HTML, the document can provide an encoding
|
||||
// either in XMLDecl in XML, or as a meta element in HTML head.
|
||||
// This adds options to ignore those encodings if the encoding
|
||||
// is known in advance for example if the content had been converted
|
||||
// before being passed to the parser.
|
||||
//
|
||||
// * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
|
||||
// for XML parsing
|
||||
// * include/libxml/HTMLparser.h HTMLparser.c: adds the
|
||||
// HTML_PARSE_IGNORE_ENC for HTML parsing
|
||||
// * HTMLtree.c: fix the handling of saving when an unknown encoding is
|
||||
// defined in meta document header
|
||||
// * xmllint.c: add a --noenc option to activate the new parser options
|
||||
//
|
||||
//
|
||||
/*
 * Returns a sanitized copy of theData, which is treated as raw HTML bytes.
 * theData itself is never modified. Two passes are performed:
 *
 *  1. Inside a "<meta " tag, the first "charset=<value>" run (matched case
 *     insensitively, up to but excluding the next space, '"' or '\'') is
 *     removed, so the markup no longer announces an encoding.
 *
 *  2. Closing tags whose name is listed in the file-level VoidTags array
 *     lose their slash, e.g. "</br>" becomes "<br>". Names are compared
 *     case insensitively. VoidTags is created here on first use, when it
 *     is still nil, and is kept afterwards.
 *
 * Returns an autoreleased data object holding the sanitized bytes.
 */
static NSData* _sanitizeContent(NSData *theData)
{
  NSMutableData *d;
  NSString *found_tag, *tag;
  NSEnumerator *tags;
  const char *bytes;
  NSUInteger i, j, len;
  BOOL found_delimiter, in_meta;

  d = [NSMutableData dataWithData: theData];
  bytes = [d bytes];
  len = [d length];
  in_meta = NO;

  // Pass 1: strip the charset= parameter found inside <meta ...> tags.
  // All bounds checks are written as additions because the indices are
  // unsigned: "len - 5" would wrap around on very short inputs.
  for (i = 0; i < len; i++)
    {
      if (bytes[i] == '<')
        {
          // We check if we see <meta ...> in which case we strip the
          // charset= stuff that follows.
          if (i + 5 < len &&
              (bytes[i+1] == 'm' || bytes[i+1] == 'M') &&
              (bytes[i+2] == 'e' || bytes[i+2] == 'E') &&
              (bytes[i+3] == 't' || bytes[i+3] == 'T') &&
              (bytes[i+4] == 'a' || bytes[i+4] == 'A') &&
              (bytes[i+5] == ' '))
            in_meta = YES;
        }
      else if (bytes[i] == '>')
        {
          // The tag is closed: a "charset=" appearing later in regular
          // text must be left alone.
          in_meta = NO;
        }

      // We search for something like :
      //
      // <meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
      //
      if (in_meta && i + 9 < len &&
          (bytes[i]   == 'c' || bytes[i]   == 'C') &&
          (bytes[i+1] == 'h' || bytes[i+1] == 'H') &&
          (bytes[i+2] == 'a' || bytes[i+2] == 'A') &&
          (bytes[i+3] == 'r' || bytes[i+3] == 'R') &&
          (bytes[i+4] == 's' || bytes[i+4] == 'S') &&
          (bytes[i+5] == 'e' || bytes[i+5] == 'E') &&
          (bytes[i+6] == 't' || bytes[i+6] == 'T') &&
          (bytes[i+7] == '='))
        {
          // We search until we find a space or a quote
          j = 8;
          found_delimiter = YES;

          while (bytes[i+j] != ' ' && bytes[i+j] != '"' && bytes[i+j] != '\'')
            {
              j++;

              // We haven't found anything, let's leave the data untouched
              if (i + j >= len)
                {
                  in_meta = found_delimiter = NO;
                  break;
                }
            }

          if (found_delimiter)
            {
              [d replaceBytesInRange: NSMakeRange(i, j)
                           withBytes: NULL
                              length: 0];

              // The buffer has been mutated: refresh the cached pointer and
              // length so that we never scan past the new end of the data.
              bytes = [d bytes];
              len = [d length];
              in_meta = NO;
            }
        }
    }

  /*
   * Replace badly formatted void tags
   *
   * A void tag that begins with a slash is considered invalid.
   * We remove the slash from those tags.
   *
   * Ex: </br> is replaced by <br>
   */

  if (!VoidTags)
    {
      /* see http://www.w3.org/TR/html4/index/elements.html */
      VoidTags = [[NSArray alloc] initWithObjects: @"area", @"base",
                                  @"basefont", @"br", @"col", @"frame", @"hr",
                                  @"img", @"input", @"isindex", @"link",
                                  @"meta", @"param", @"", nil];
    }

  bytes = [d bytes];
  len = [d length];

  // Pass 2
  for (i = 0; i < len; i++)
    {
      // Search for ending tags
      if (!(i + 3 < len && bytes[i] == '<' && bytes[i+1] == '/'))
        continue;

      // i now designates the first byte of the tag name
      i += 2;
      j = 0;
      found_delimiter = YES;

      while (bytes[i+j] != '>')
        {
          j++;
          if (i + j >= len)
            {
              found_delimiter = NO;
              break;
            }
        }

      if (!found_delimiter || j == 0)
        continue;

      // Copy the tag name to a NSString; this yields nil when the bytes are
      // not valid UTF-8, in which case the tag is simply skipped.
      found_tag = [[[NSString alloc] initWithBytes: bytes + i
                                            length: j
                                          encoding: NSUTF8StringEncoding]
                    autorelease];

      if (found_tag)
        {
          tags = [VoidTags objectEnumerator];
          while ((tag = [tags nextObject]))
            {
              if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame)
                {
                  // Remove the leading slash; afterwards i designates the
                  // first byte of the tag name again.
                  i--;
                  [d replaceBytesInRange: NSMakeRange(i, 1)
                               withBytes: NULL
                                  length: 0];
                  bytes = [d bytes];
                  len = [d length];
                  break;
                }
            }
        }

      // Continue the parsing after the end tag: i lands on '>' and the
      // loop increment steps over it.
      i += j;
    }

  return d;
}
|
||||
|
||||
@interface _UIxHTMLMailContentHandler : NSObject <SaxContentHandler, SaxLexicalHandler>
|
||||
{
|
||||
NSMutableString *result;
|
||||
|
@ -853,7 +662,7 @@ static NSData* _sanitizeContent(NSData *theData)
|
|||
|
||||
mail = [self clientObject];
|
||||
|
||||
preparsedContent = _sanitizeContent([super decodedFlatContent]);
|
||||
preparsedContent = [[super decodedFlatContent] sanitizedContentUsingVoidTags: VoidTags];
|
||||
parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
|
||||
createXMLReaderForMimeType: @"text/html"];
|
||||
|
||||
|
@ -971,7 +780,7 @@ static NSData* _sanitizeContent(NSData *theData)
|
|||
part = [self clientObject];
|
||||
mail = [part mailObject];
|
||||
|
||||
preparsedContent = _sanitizeContent([part fetchBLOB]);
|
||||
preparsedContent = [[part fetchBLOB] sanitizedContentUsingVoidTags: VoidTags];
|
||||
parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
|
||||
createXMLReaderForMimeType: @"text/html"];
|
||||
encoding = [[part partInfo] valueForKey: @"encoding"];
|
||||
|
|
Loading…
Reference in a new issue