(fix) generalized HTML sanitization to avoid encoding issues when replying/forwarding mails

2017-10-18 09:33:44 -04:00 · 2017-10-18 09:33:44 -04:00 · 5cd3a8f245
parent a2f84f1358
commit 5cd3a8f245
4 changed files with 210 additions and 200 deletions
--- a/SoObjects/Mailer/NSData+Mail.h
+++ b/SoObjects/Mailer/NSData+Mail.h
@ -1,6 +1,6 @@
 /* NSData+Mail.h - this file is part of SOGo
 *
- * Copyright (C) 2007-2015 Inverse inc.
+ * Copyright (C) 2007-2017 Inverse inc.
 *
 * This file is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@ -29,8 +29,8 @@

 - (NSData *) bodyDataFromEncoding: (NSString *) encoding;
 - (NSString *) bodyStringFromCharset: (NSString *) charset;
-
 - (NSString *) decodedHeader;
+- (NSData *) sanitizedContentUsingVoidTags: (NSArray *) theVoidTags;

@end

--- a/SoObjects/Mailer/NSData+Mail.m
+++ b/SoObjects/Mailer/NSData+Mail.m
@ -1,6 +1,6 @@
 /* NSData+Mail.m - this file is part of SOGo
 *
- * Copyright (C) 2007-2015 Inverse inc.
+ * Copyright (C) 2007-2017 Inverse inc.
 *
 * This file is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@ -18,6 +18,7 @@
 * Boston, MA 02111-1307, USA.
 */

+#import <Foundation/NSArray.h>
 #import <Foundation/NSString.h>

 #import <NGExtensions/NGBase64Coding.h>
@ -212,4 +213,195 @@
  return decodedString;
 }

+//
+// In order to avoid a libxml bug/limitation, we strip the charset= parameter
+// to avoid libxml to consider the charset= parameter while it works in UTF-8
+// internally, all the time.
+//
+// A fix was committed by Daniel Veillard following discussions Inverse had
+// with him on the issue:
+//
+// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870
+// Author: Daniel Veillard <veillard redhat com>
+// Date:   Mon May 16 16:03:50 2011 +0800
+//
+//     Add options to ignore the internal encoding
+//
+//     For both XML and HTML, the document can provide an encoding
+//     either in XMLDecl in XML, or as a meta element in HTML head.
+//     This adds options to ignore those encodings if the encoding
+//     is known in advace for example if the content had been converted
+//     before being passed to the parser.
+//
+//     * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
+//       for XML parsing
+//     * include/libxml/HTMLparser.h HTMLparser.c: adds the
+//       HTML_PARSE_IGNORE_ENC for HTML parsing
+//     * HTMLtree.c: fix the handling of saving when an unknown encoding is
+//       defined in meta document header
+//     * xmllint.c: add a --noenc option to activate the new parser options
+//
+//
+// -sanitizedContentUsingVoidTags: returns a sanitized copy of the receiver,
+// which is treated as a buffer of HTML markup. The receiver itself is never
+// modified. Two passes are performed on the copy:
+//
+//  1. Inside each <meta ...> tag, the first "charset=<value>" sequence is
+//     removed. The removal stops at (and excludes) the next space, double
+//     quote, single quote or '>'. When the value itself starts with a quote,
+//     only "charset=" is removed. If no delimiter is found before the end
+//     of the data, nothing is removed.
+//
+//  2. Closing tags whose name is listed in theVoidTags (compared
+//     case-insensitively) lose their leading slash: </br> becomes <br>.
+//
+// theVoidTags is an array of NSString tag names. When it is nil, the list
+// of HTML 4 empty elements is used.
+//
+- (NSData *) sanitizedContentUsingVoidTags: (NSArray *) theVoidTags
+{
+  NSMutableData *d;
+  NSString *found_tag, *tag;
+  NSEnumerator *tags;
+  const char *bytes;
+  int i, j, len;
+  BOOL in_meta;
+
+  d = [NSMutableData dataWithData: self];
+  bytes = [d bytes];
+  len = [d length];
+  in_meta = NO;
+
+  // First pass: strip charset=... from <meta ...> tags.
+  for (i = 0; i < len; i++)
+    {
+      if (bytes[i] == '>')
+        {
+          // End of the current tag: a later "charset=" found in regular
+          // text must not be considered as part of a <meta> tag.
+          in_meta = NO;
+        }
+      else if (i + 5 < len &&
+               (bytes[i] == '<') &&
+               (bytes[i+1] == 'm' || bytes[i+1] == 'M') &&
+               (bytes[i+2] == 'e' || bytes[i+2] == 'E') &&
+               (bytes[i+3] == 't' || bytes[i+3] == 'T') &&
+               (bytes[i+4] == 'a' || bytes[i+4] == 'A') &&
+               (bytes[i+5] == ' '))
+        {
+          in_meta = YES;
+        }
+      else if (in_meta && i + 9 < len &&
+               (bytes[i] == 'c' || bytes[i] == 'C') &&
+               (bytes[i+1] == 'h' || bytes[i+1] == 'H') &&
+               (bytes[i+2] == 'a' || bytes[i+2] == 'A') &&
+               (bytes[i+3] == 'r' || bytes[i+3] == 'R') &&
+               (bytes[i+4] == 's' || bytes[i+4] == 'S') &&
+               (bytes[i+5] == 'e' || bytes[i+5] == 'E') &&
+               (bytes[i+6] == 't' || bytes[i+6] == 'T') &&
+               (bytes[i+7] == '='))
+        {
+          // We search for something like :
+          //
+          // <meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
+          //
+          // and look for the end of the charset value: a space, a quote
+          // or the end of the tag.
+          j = 8;
+          while (i + j < len &&
+                 bytes[i+j] != ' ' && bytes[i+j] != '"' &&
+                 bytes[i+j] != '\'' && bytes[i+j] != '>')
+            j++;
+
+          // When no delimiter was found, the data is left untouched.
+          if (i + j < len)
+            {
+              [d replaceBytesInRange: NSMakeRange(i, j)
+                           withBytes: NULL
+                              length: 0];
+
+              // The buffer was modified: its address and its length must
+              // be fetched again before we keep scanning.
+              bytes = [d bytes];
+              len = [d length];
+            }
+
+          in_meta = NO;
+        }
+    }
+
+  /*
+   * Replace badly formatted void tags
+   *
+   * A void tag that begins with a slash is considered invalid.
+   * We remove the slash from those tags.
+   *
+   * Ex: </br> is replaced by <br>
+   */
+
+  if (!theVoidTags)
+    {
+      /* see http://www.w3.org/TR/html4/index/elements.html */
+      theVoidTags = [NSArray arrayWithObjects: @"area", @"base",
+                             @"basefont", @"br", @"col", @"frame", @"hr",
+                             @"img", @"input", @"isindex", @"link",
+                             @"meta", @"param", @"", nil];
+    }
+
+  bytes = [d bytes];
+  len = [d length];
+
+  for (i = 0; i < len; i++)
+    {
+      // Search for ending tags
+      if (i + 3 < len && bytes[i] == '<' && bytes[i+1] == '/')
+        {
+          // i now designates the first character of the tag name
+          i += 2;
+
+          j = 0;
+          while (i + j < len && bytes[i+j] != '>')
+            j++;
+
+          // We need a closing '>' and a non-empty tag name
+          if (i + j < len && j > 0)
+            {
+              // found_tag is nil when the name is not valid UTF-8, in which
+              // case the tag is left as is.
+              found_tag = [[NSString alloc] initWithBytes: bytes + i
+                                                   length: j
+                                                 encoding: NSUTF8StringEncoding];
+
+              tags = [theVoidTags objectEnumerator];
+              while (found_tag && (tag = [tags nextObject]))
+                {
+                  if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame)
+                    {
+                      // Remove the leading slash
+                      //NSLog(@"Found void tag with invalid leading slash: </%@>", found_tag);
+                      i--;
+                      [d replaceBytesInRange: NSMakeRange(i, 1)
+                                   withBytes: NULL
+                                      length: 0];
+                      bytes = [d bytes];
+                      len = [d length];
+                      break;
+                    }
+                }
+              [found_tag release];
+
+              // Continue the parsing after end tag
+              i += j;
+            }
+        }
+    }
+
+  return d;
+}
+
@end
--- a/SoObjects/Mailer/SOGoMailObject+Draft.m
+++ b/SoObjects/Mailer/SOGoMailObject+Draft.m
@ -106,12 +106,14 @@
 //
 - (NSString *) _contentForEditingFromKeys: (NSArray *) keys
 {
-  NSArray *types;
-  NSDictionary *parts;
  NSString *rawPart, *content, *contentKey;
  SOGoUserDefaults *ud;
-  NSUInteger index;
+  NSDictionary *parts;
+  NSArray *types;
+  NSData *data;
+
  BOOL htmlComposition, htmlContent;
+  NSUInteger index;

  content = @"";

@ -156,7 +158,14 @@
        }
    }

-  return content;
+  // We strip charset= information from HTML content to avoid SOGo setting
+  // the encoding of the final mail to UTF-8 while keeping charset="iso-8859-1"
+  // in the HTML meta headers, for example. That would cause encoding display
+  // issues with most MUAs.
+  data = [[content dataUsingEncoding: NSUTF8StringEncoding] sanitizedContentUsingVoidTags: nil];
+  content = [[NSString alloc] initWithData: data  encoding: NSUTF8StringEncoding];
+
+  return [content autorelease];
 }

 //
--- a/UI/MailPartViewers/UIxMailPartHTMLViewer.m
+++ b/UI/MailPartViewers/UIxMailPartHTMLViewer.m
@ -110,197 +110,6 @@ _xmlCharsetForCharset (NSString *charset)
  return encoding;
 }

-//
-// In order to avoid a libxml bug/limitation, we strip the charset= parameter
-// to avoid libxml to consider the charset= parameter while it works in UTF-8
-// internally, all the time.
-//
-// A fix was commited by Daniel Veillard following discussions Inverse had
-// with him on the issue:
-//
-// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870
-// Author: Daniel Veillard <veillard redhat com>
-// Date:   Mon May 16 16:03:50 2011 +0800
-//
-//     Add options to ignore the internal encoding
-//  
-//     For both XML and HTML, the document can provide an encoding
-//     either in XMLDecl in XML, or as a meta element in HTML head.
-//     This adds options to ignore those encodings if the encoding
-//     is known in advace for example if the content had been converted
-//     before being passed to the parser.
-//  
-//     * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
-//       for XML parsing
-//     * include/libxml/HTMLparser.h HTMLparser.c: adds the
-//       HTML_PARSE_IGNORE_ENC for HTML parsing
-//     * HTMLtree.c: fix the handling of saving when an unknown encoding is
-//       defined in meta document header
-//     * xmllint.c: add a --noenc option to activate the new parser options
-//
-// 
-static NSData* _sanitizeContent(NSData *theData)
-{
-  NSMutableData *d;
-  NSString *found_tag, *tag;
-  NSEnumerator *tags;
-  const char *bytes;
-  char *buf;
-  int i, j, len;
-  BOOL found_delimiter, in_meta;
-
-  d = [NSMutableData dataWithData: theData];
-  bytes = [d bytes];
-  len = [d length];
-  i = 0;
-
-  in_meta = NO;
-
-  while (i < len)
-    {
-      // We check if we see <meta ...> in which case, we substitute de charset= stuff.
-      if (i < len-5)
-	{
-	  if ((*bytes == '<') &&
-	      (*(bytes+1) == 'm' || *(bytes+1) == 'M') &&
-	      (*(bytes+2) == 'e' || *(bytes+2) == 'E') &&
-	      (*(bytes+3) == 't' || *(bytes+3) == 'T') &&
-	      (*(bytes+4) == 'a' || *(bytes+4) == 'A') &&
-	      (*(bytes+5) == ' '))
-            in_meta = YES;
-	}
-      
-      // We search for something like :
-      // 
-      // <meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
-      //
-      if (in_meta && i < len-9)
-	{
-	  if ((*bytes == 'c' || *bytes == 'C') &&
-	      (*(bytes+1) == 'h' || *(bytes+1) == 'H') &&
-	      (*(bytes+2) == 'a' || *(bytes+2) == 'A') &&
-	      (*(bytes+3) == 'r' || *(bytes+3) == 'R') &&
-	      (*(bytes+4) == 's' || *(bytes+4) == 'S') &&
-	      (*(bytes+5) == 'e' || *(bytes+5) == 'E') &&
-	      (*(bytes+6) == 't' || *(bytes+6) == 'T') &&
-	      (*(bytes+7) == '='))
-	    {
-	      // We search until we find a '"' or a space
-	      j = 8;
-              found_delimiter = YES;
-
-	      while (*(bytes+j) != ' ' && *(bytes+j) != '"' && *(bytes+j) != '\'')
-		{
-		  j++;
-		  
-		  // We haven't found anything, let's return the data untouched
-		  if ((i+j) >= len)
-                    {
-                      in_meta = found_delimiter = NO;
-                      break;
-                    }
-		}
-
-              if (found_delimiter)
-                {
-                  [d replaceBytesInRange: NSMakeRange(i, j)
-                               withBytes: NULL
-                                  length: 0];
-                  in_meta = found_delimiter = NO;
-                }
-	    }
-	}
-
-      bytes++;
-      i++;
-    }
-
-  /*
-   * Replace badly formatted void tags
-   *
-   * A void tag that begins with a slash is considered invalid.
-   * We remove the slash from those tags.
-   *
-   * Ex: </br> is replaced by <br>
-   */
-
-  if (!VoidTags)
-    {
-      /* see http://www.w3.org/TR/html4/index/elements.html */
-      VoidTags = [[NSArray alloc] initWithObjects: @"area", @"base",
-                                  @"basefont", @"br", @"col", @"frame", @"hr",
-                                  @"img", @"input", @"isindex", @"link",
-                                  @"meta", @"param", @"", nil];
-    }
-
-  bytes = [d bytes];
-  len = [d length];
-  i = 0;
-  while (i < len)
-    {
-      if (i < len-3)
-	{
-          // Search for ending tags
-	  if ((*bytes == '<') && (*(bytes+1) == '/'))
-            {
-              i += 2;
-              bytes += 2;
-              j = 0;
-              found_delimiter = YES;
-
-              while (*(bytes+j) != '>')
-                {
-                  j++;
-                  if ((i+j) >= len)
-                    {
-                      found_delimiter = NO;
-                      break;
-                    }
-                }
-
-              if (found_delimiter && j > 0)
-                {
-                  // Copy the ending tag to a NSString
-                  buf = malloc((j+1) * sizeof(char));
-                  memset (buf, 0, j+1);
-                  memcpy (buf, bytes, j);
-                  found_tag = [NSString stringWithCString: buf encoding: NSUTF8StringEncoding];
-                  
-                  tags = [VoidTags objectEnumerator];
-                  tag = [tags nextObject];
-                  while (tag && found_tag)
-                    {
-                      if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame)
-                        {
-                          // Remove the leading slash
-                          //NSLog(@"Found void tag with invalid leading slash: </%@>", found_tag);
-                          i--;
-                          [d replaceBytesInRange: NSMakeRange(i, 1)
-                                       withBytes: NULL
-                                          length: 0];
-                          bytes = [d bytes];
-                          bytes += i;
-                          len = [d length];
-                          break;
-                        }
-                      tag = [tags nextObject];
-                    }
-                  free(buf);
-
-                  // Continue the parsing after end tag
-                  i += j;
-                  bytes += j;
-                }
-            }
-        }
-
-      bytes++;
-      i++;
-    }
-  
-  return d;
-}
-
@interface _UIxHTMLMailContentHandler : NSObject <SaxContentHandler, SaxLexicalHandler>
 {
  NSMutableString *result;
@ -853,7 +662,7 @@ static NSData* _sanitizeContent(NSData *theData)

  mail = [self clientObject];

-  preparsedContent = _sanitizeContent([super decodedFlatContent]);
+  preparsedContent = [[super decodedFlatContent] sanitizedContentUsingVoidTags: VoidTags];
  parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
             createXMLReaderForMimeType: @"text/html"];

@ -971,7 +780,7 @@ static NSData* _sanitizeContent(NSData *theData)
  part = [self clientObject];
  mail = [part mailObject];

-  preparsedContent = _sanitizeContent([part fetchBLOB]);
+  preparsedContent = [[part fetchBLOB] sanitizedContentUsingVoidTags: VoidTags];
  parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
             createXMLReaderForMimeType: @"text/html"];
  encoding = [[part partInfo] valueForKey: @"encoding"];