From 5cd3a8f245fab623a2eacff606f8936ef83d1302 Mon Sep 17 00:00:00 2001 From: Ludovic Marcotte Date: Wed, 18 Oct 2017 09:33:44 -0400 Subject: [PATCH] (fix) generalized HTML sanitization to avoid encoding issues when replying/forwarding mails --- SoObjects/Mailer/NSData+Mail.h | 4 +- SoObjects/Mailer/NSData+Mail.m | 194 +++++++++++++++++++- SoObjects/Mailer/SOGoMailObject+Draft.m | 17 +- UI/MailPartViewers/UIxMailPartHTMLViewer.m | 195 +-------------------- 4 files changed, 210 insertions(+), 200 deletions(-) diff --git a/SoObjects/Mailer/NSData+Mail.h b/SoObjects/Mailer/NSData+Mail.h index 7be5af1a4..ecd844493 100644 --- a/SoObjects/Mailer/NSData+Mail.h +++ b/SoObjects/Mailer/NSData+Mail.h @@ -1,6 +1,6 @@ /* NSData+Mail.h - this file is part of SOGo * - * Copyright (C) 2007-2015 Inverse inc. + * Copyright (C) 2007-2017 Inverse inc. * * This file is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -29,8 +29,8 @@ - (NSData *) bodyDataFromEncoding: (NSString *) encoding; - (NSString *) bodyStringFromCharset: (NSString *) charset; - - (NSString *) decodedHeader; +- (NSData *) sanitizedContentUsingVoidTags: (NSArray *) theVoidTags; @end diff --git a/SoObjects/Mailer/NSData+Mail.m b/SoObjects/Mailer/NSData+Mail.m index 92f379ec4..c5b7061fd 100644 --- a/SoObjects/Mailer/NSData+Mail.m +++ b/SoObjects/Mailer/NSData+Mail.m @@ -1,6 +1,6 @@ /* NSData+Mail.m - this file is part of SOGo * - * Copyright (C) 2007-2015 Inverse inc. + * Copyright (C) 2007-2017 Inverse inc. * * This file is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,6 +18,7 @@ * Boston, MA 02111-1307, USA. 
*/ +#import #import #import @@ -212,4 +213,195 @@ return decodedString; } +// +// In order to avoid a libxml bug/limitation, we strip the charset= parameter +// to avoid libxml to consider the charset= parameter while it works in UTF-8 +// internally, all the time. +// +// A fix was commited by Daniel Veillard following discussions Inverse had +// with him on the issue: +// +// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870 +// Author: Daniel Veillard +// Date: Mon May 16 16:03:50 2011 +0800 +// +// Add options to ignore the internal encoding +// +// For both XML and HTML, the document can provide an encoding +// either in XMLDecl in XML, or as a meta element in HTML head. +// This adds options to ignore those encodings if the encoding +// is known in advace for example if the content had been converted +// before being passed to the parser. +// +// * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option +// for XML parsing +// * include/libxml/HTMLparser.h HTMLparser.c: adds the +// HTML_PARSE_IGNORE_ENC for HTML parsing +// * HTMLtree.c: fix the handling of saving when an unknown encoding is +// defined in meta document header +// * xmllint.c: add a --noenc option to activate the new parser options +// +// +- (NSData *) sanitizedContentUsingVoidTags: (NSArray *) theVoidTags +{ + NSMutableData *d; + NSString *found_tag, *tag; + NSEnumerator *tags; + const char *bytes; + char *buf; + int i, j, len; + BOOL found_delimiter, in_meta; + + d = [NSMutableData dataWithData: self]; + bytes = [d bytes]; + len = [d length]; + i = 0; + + in_meta = NO; + + while (i < len) + { + // We check if we see in which case, we substitute de charset= stuff. 
+ if (i < len-5) + { + if ((*bytes == '<') && + (*(bytes+1) == 'm' || *(bytes+1) == 'M') && + (*(bytes+2) == 'e' || *(bytes+2) == 'E') && + (*(bytes+3) == 't' || *(bytes+3) == 'T') && + (*(bytes+4) == 'a' || *(bytes+4) == 'A') && + (*(bytes+5) == ' ')) + in_meta = YES; + } + + // We search for something like : + // + // + // + if (in_meta && i < len-9) + { + if ((*bytes == 'c' || *bytes == 'C') && + (*(bytes+1) == 'h' || *(bytes+1) == 'H') && + (*(bytes+2) == 'a' || *(bytes+2) == 'A') && + (*(bytes+3) == 'r' || *(bytes+3) == 'R') && + (*(bytes+4) == 's' || *(bytes+4) == 'S') && + (*(bytes+5) == 'e' || *(bytes+5) == 'E') && + (*(bytes+6) == 't' || *(bytes+6) == 'T') && + (*(bytes+7) == '=')) + { + // We search until we find a '"' or a space + j = 8; + found_delimiter = YES; + + while (*(bytes+j) != ' ' && *(bytes+j) != '"' && *(bytes+j) != '\'') + { + j++; + + // We haven't found anything, let's return the data untouched + if ((i+j) >= len) + { + in_meta = found_delimiter = NO; + break; + } + } + + if (found_delimiter) + { + [d replaceBytesInRange: NSMakeRange(i, j) + withBytes: NULL + length: 0]; + in_meta = found_delimiter = NO; + } + } + } + + bytes++; + i++; + } + + /* + * Replace badly formatted void tags + * + * A void tag that begins with a slash is considered invalid. + * We remove the slash from those tags. + * + * Ex:
</br> is replaced by <br>
+ */ + + if (!theVoidTags) + { + /* see http://www.w3.org/TR/html4/index/elements.html */ + theVoidTags = [[[NSArray alloc] initWithObjects: @"area", @"base", + @"basefont", @"br", @"col", @"frame", @"hr", + @"img", @"input", @"isindex", @"link", + @"meta", @"param", @"", nil] autorelease]; + } + + bytes = [d bytes]; + len = [d length]; + i = 0; + while (i < len) + { + if (i < len-3) + { + // Search for ending tags + if ((*bytes == '<') && (*(bytes+1) == '/')) + { + i += 2; + bytes += 2; + j = 0; + found_delimiter = YES; + + while (*(bytes+j) != '>') + { + j++; + if ((i+j) >= len) + { + found_delimiter = NO; + break; + } + } + + if (found_delimiter && j > 0) + { + // Copy the ending tag to a NSString + buf = malloc((j+1) * sizeof(char)); + memset (buf, 0, j+1); + memcpy (buf, bytes, j); + found_tag = [NSString stringWithCString: buf encoding: NSUTF8StringEncoding]; + + tags = [theVoidTags objectEnumerator]; + tag = [tags nextObject]; + while (tag && found_tag) + { + if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame) + { + // Remove the leading slash + //NSLog(@"Found void tag with invalid leading slash: ", found_tag); + i--; + [d replaceBytesInRange: NSMakeRange(i, 1) + withBytes: NULL + length: 0]; + bytes = [d bytes]; + bytes += i; + len = [d length]; + break; + } + tag = [tags nextObject]; + } + free(buf); + + // Continue the parsing after end tag + i += j; + bytes += j; + } + } + } + + bytes++; + i++; + } + + return d; +} + @end diff --git a/SoObjects/Mailer/SOGoMailObject+Draft.m b/SoObjects/Mailer/SOGoMailObject+Draft.m index 1aa24ea5b..7ef33c42b 100644 --- a/SoObjects/Mailer/SOGoMailObject+Draft.m +++ b/SoObjects/Mailer/SOGoMailObject+Draft.m @@ -106,12 +106,14 @@ // - (NSString *) _contentForEditingFromKeys: (NSArray *) keys { - NSArray *types; - NSDictionary *parts; NSString *rawPart, *content, *contentKey; SOGoUserDefaults *ud; - NSUInteger index; + NSDictionary *parts; + NSArray *types; + NSData *data; + BOOL htmlComposition, htmlContent; + 
NSUInteger index; content = @""; @@ -156,7 +158,14 @@ } } - return content; + // We strip charset= information from HTML content to avoid SOGo setting + // the encoding of the final mail to UTF-8 while keeping charset="iso-8859-1" + // in the HTML meta headers, for example. That would cause encoding display + // issues with most MUAs. + data = [[content dataUsingEncoding: NSUTF8StringEncoding] sanitizedContentUsingVoidTags: nil]; + content = [[NSString alloc] initWithData: data encoding: NSUTF8StringEncoding]; + + return [content autorelease]; } // diff --git a/UI/MailPartViewers/UIxMailPartHTMLViewer.m b/UI/MailPartViewers/UIxMailPartHTMLViewer.m index 214260cf3..076c74ae4 100644 --- a/UI/MailPartViewers/UIxMailPartHTMLViewer.m +++ b/UI/MailPartViewers/UIxMailPartHTMLViewer.m @@ -110,197 +110,6 @@ _xmlCharsetForCharset (NSString *charset) return encoding; } -// -// In order to avoid a libxml bug/limitation, we strip the charset= parameter -// to avoid libxml to consider the charset= parameter while it works in UTF-8 -// internally, all the time. -// -// A fix was commited by Daniel Veillard following discussions Inverse had -// with him on the issue: -// -// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870 -// Author: Daniel Veillard -// Date: Mon May 16 16:03:50 2011 +0800 -// -// Add options to ignore the internal encoding -// -// For both XML and HTML, the document can provide an encoding -// either in XMLDecl in XML, or as a meta element in HTML head. -// This adds options to ignore those encodings if the encoding -// is known in advace for example if the content had been converted -// before being passed to the parser. 
-// -// * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option -// for XML parsing -// * include/libxml/HTMLparser.h HTMLparser.c: adds the -// HTML_PARSE_IGNORE_ENC for HTML parsing -// * HTMLtree.c: fix the handling of saving when an unknown encoding is -// defined in meta document header -// * xmllint.c: add a --noenc option to activate the new parser options -// -// -static NSData* _sanitizeContent(NSData *theData) -{ - NSMutableData *d; - NSString *found_tag, *tag; - NSEnumerator *tags; - const char *bytes; - char *buf; - int i, j, len; - BOOL found_delimiter, in_meta; - - d = [NSMutableData dataWithData: theData]; - bytes = [d bytes]; - len = [d length]; - i = 0; - - in_meta = NO; - - while (i < len) - { - // We check if we see in which case, we substitute de charset= stuff. - if (i < len-5) - { - if ((*bytes == '<') && - (*(bytes+1) == 'm' || *(bytes+1) == 'M') && - (*(bytes+2) == 'e' || *(bytes+2) == 'E') && - (*(bytes+3) == 't' || *(bytes+3) == 'T') && - (*(bytes+4) == 'a' || *(bytes+4) == 'A') && - (*(bytes+5) == ' ')) - in_meta = YES; - } - - // We search for something like : - // - // - // - if (in_meta && i < len-9) - { - if ((*bytes == 'c' || *bytes == 'C') && - (*(bytes+1) == 'h' || *(bytes+1) == 'H') && - (*(bytes+2) == 'a' || *(bytes+2) == 'A') && - (*(bytes+3) == 'r' || *(bytes+3) == 'R') && - (*(bytes+4) == 's' || *(bytes+4) == 'S') && - (*(bytes+5) == 'e' || *(bytes+5) == 'E') && - (*(bytes+6) == 't' || *(bytes+6) == 'T') && - (*(bytes+7) == '=')) - { - // We search until we find a '"' or a space - j = 8; - found_delimiter = YES; - - while (*(bytes+j) != ' ' && *(bytes+j) != '"' && *(bytes+j) != '\'') - { - j++; - - // We haven't found anything, let's return the data untouched - if ((i+j) >= len) - { - in_meta = found_delimiter = NO; - break; - } - } - - if (found_delimiter) - { - [d replaceBytesInRange: NSMakeRange(i, j) - withBytes: NULL - length: 0]; - in_meta = found_delimiter = NO; - } - } - } - - bytes++; - i++; - } - - /* - * 
Replace badly formatted void tags - * - * A void tag that begins with a slash is considered invalid. - * We remove the slash from those tags. - * - * Ex:
</br> is replaced by <br>
- */ - - if (!VoidTags) - { - /* see http://www.w3.org/TR/html4/index/elements.html */ - VoidTags = [[NSArray alloc] initWithObjects: @"area", @"base", - @"basefont", @"br", @"col", @"frame", @"hr", - @"img", @"input", @"isindex", @"link", - @"meta", @"param", @"", nil]; - } - - bytes = [d bytes]; - len = [d length]; - i = 0; - while (i < len) - { - if (i < len-3) - { - // Search for ending tags - if ((*bytes == '<') && (*(bytes+1) == '/')) - { - i += 2; - bytes += 2; - j = 0; - found_delimiter = YES; - - while (*(bytes+j) != '>') - { - j++; - if ((i+j) >= len) - { - found_delimiter = NO; - break; - } - } - - if (found_delimiter && j > 0) - { - // Copy the ending tag to a NSString - buf = malloc((j+1) * sizeof(char)); - memset (buf, 0, j+1); - memcpy (buf, bytes, j); - found_tag = [NSString stringWithCString: buf encoding: NSUTF8StringEncoding]; - - tags = [VoidTags objectEnumerator]; - tag = [tags nextObject]; - while (tag && found_tag) - { - if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame) - { - // Remove the leading slash - //NSLog(@"Found void tag with invalid leading slash: ", found_tag); - i--; - [d replaceBytesInRange: NSMakeRange(i, 1) - withBytes: NULL - length: 0]; - bytes = [d bytes]; - bytes += i; - len = [d length]; - break; - } - tag = [tags nextObject]; - } - free(buf); - - // Continue the parsing after end tag - i += j; - bytes += j; - } - } - } - - bytes++; - i++; - } - - return d; -} - @interface _UIxHTMLMailContentHandler : NSObject { NSMutableString *result; @@ -853,7 +662,7 @@ static NSData* _sanitizeContent(NSData *theData) mail = [self clientObject]; - preparsedContent = _sanitizeContent([super decodedFlatContent]); + preparsedContent = [[super decodedFlatContent] sanitizedContentUsingVoidTags: VoidTags]; parser = [[SaxXMLReaderFactory standardXMLReaderFactory] createXMLReaderForMimeType: @"text/html"]; @@ -971,7 +780,7 @@ static NSData* _sanitizeContent(NSData *theData) part = [self clientObject]; mail = [part mailObject]; - 
preparsedContent = _sanitizeContent([part fetchBLOB]); + preparsedContent = [[part fetchBLOB] sanitizedContentUsingVoidTags: VoidTags]; parser = [[SaxXMLReaderFactory standardXMLReaderFactory] createXMLReaderForMimeType: @"text/html"]; encoding = [[part partInfo] valueForKey: @"encoding"];