diff --git a/src/java/org/apache/poi/util/LocaleUtil.java b/src/java/org/apache/poi/util/LocaleUtil.java index 70d9a50750..c440fc96a4 100644 --- a/src/java/org/apache/poi/util/LocaleUtil.java +++ b/src/java/org/apache/poi/util/LocaleUtil.java @@ -616,5 +616,473 @@ public final class LocaleUtil { } } + /** + * Get default code page from LCID value + * + * @param lcid the LCID value + * @return the default code page + */ + public static int getDefaultCodePageFromLCID(int lcid) { + int languageId = lcid & 0xFFFF; + switch (languageId) { + case 0x0001: return 1256; + case 0x0002: return 1251; + case 0x0003: return 1252; + case 0x0004: return 936; + case 0x0005: return 1250; + case 0x0006: return 1252; + case 0x0007: return 1252; + case 0x0008: return 1253; + case 0x0009: return 1252; + case 0x000a: return 1252; + case 0x000b: return 1252; + case 0x000c: return 1252; + case 0x000d: return 1255; + case 0x000e: return 1250; + case 0x000f: return 1252; + case 0x0010: return 1252; + case 0x0011: return 932; + case 0x0012: return 949; + case 0x0013: return 1252; + case 0x0014: return 1252; + case 0x0015: return 1250; + case 0x0016: return 1252; + case 0x0017: return 1252; + case 0x0018: return 1250; + case 0x0019: return 1251; + case 0x001a: return 1250; + case 0x001b: return 1250; + case 0x001c: return 1250; + case 0x001d: return 1252; + case 0x001e: return 874; + case 0x001f: return 1254; + case 0x0020: return 1256; + case 0x0021: return 1252; + case 0x0022: return 1251; + case 0x0023: return 1251; + case 0x0024: return 1250; + case 0x0025: return 1257; + case 0x0026: return 1257; + case 0x0027: return 1257; + case 0x0028: return 1251; + case 0x0029: return 1256; + case 0x002a: return 1258; + case 0x002b: return 0; + case 0x002c: return 1254; + case 0x002d: return 1252; + case 0x002e: return 1252; + case 0x002f: return 1251; + case 0x0030: return 0; + case 0x0031: return 0; + case 0x0032: return 1252; + case 0x0033: return 32759; + case 0x0034: return 1252; + case 0x0035: return 1252; + case 0x0036: return 1252; + case 0x0037: return 0; + case 0x0038: return 1252; + case 0x0039: return 0; + case 0x003a: return 0; + case 0x003b: return 1252; + case 0x003c: return 1252; + case 0x003d: return 32759; + case 0x003e: return 1252; + case 0x003f: return 0; + case 0x0040: return 1251; + case 0x0041: return 1252; + case 0x0042: return 1250; + case 0x0043: return 1254; + case 0x0044: return 1251; + case 0x0045: return 0; + case 0x0046: return 0; + case 0x0047: return 0; + case 0x0048: return 0; + case 0x0049: return 0; + case 0x004a: return 0; + case 0x004b: return 0; + case 0x004c: return 0; + case 0x004d: return 0; + case 0x004e: return 0; + case 0x004f: return 0; + case 0x0050: return 1251; + case 0x0051: return 0; + case 0x0052: return 1252; + case 0x0053: return 0; + case 0x0054: return 0; + case 0x0055: return 0; + case 0x0056: return 1252; + case 0x0057: return 0; + case 0x0058: return 32759; + case 0x0059: return 1256; + case 0x005a: return 0; + case 0x005b: return 0; + case 0x005c: return 0; + case 0x005d: return 1252; + case 0x005e: return 0; + case 0x005f: return 1252; + case 0x0060: return 32759; + case 0x0061: return 0; + case 0x0062: return 1252; + case 0x0063: return 0; + case 0x0064: return 1252; + case 0x0065: return 0; + case 0x0066: return 32759; + case 0x0067: return 1252; + case 0x0068: return 1252; + case 0x0069: return 32759; + case 0x006a: return 1252; + case 0x006b: return 1252; + case 0x006c: return 1252; + case 0x006d: return 1251; + case 0x006e: return 1252; + case 0x006f: return 1252; + case 0x0070: return 1252; + case 0x0071: return 32759; + case 0x0072: return 0; + case 0x0073: return 0; + case 0x0074: return 1252; + case 0x0075: return 1252; + case 0x0076: return 32759; + case 0x0077: return 0; + case 0x0078: return 0; + case 0x0079: return 32759; + case 0x007a: return 1252; + case 0x007b: return 32759; + case 0x007c: return 1252; + case 0x007d: return 32759; + case 0x007e: return 1252; + case 0x007f: return 1252; + case 0x0080: return 1256; + case 0x0081: return 0; + case 0x0082: return 1252; + case 0x0083: return 1252; + case 0x0084: return 1252; + case 0x0085: return 1251; + case 0x0086: return 1252; + case 0x0087: return 1252; + case 0x0088: return 1252; + case 0x0089: return 32759; + case 0x008a: return 32759; + case 0x008b: return 32759; + case 0x008c: return 1256; + case 0x008d: return 32759; + case 0x008e: return 32759; + case 0x008f: return 32759; + case 0x0090: return 32759; + case 0x0091: return 1252; + case 0x0092: return 1256; + case 0x0093: return 32759; + case 0x0401: return 1256; + case 0x0402: return 1251; + case 0x0403: return 1252; + case 0x0404: return 950; + case 0x0405: return 1250; + case 0x0406: return 1252; + case 0x0407: return 1252; + case 0x0408: return 1253; + case 0x0409: return 1252; + case 0x040a: return 1252; + case 0x040b: return 1252; + case 0x040c: return 1252; + case 0x040d: return 1255; + case 0x040e: return 1250; + case 0x040f: return 1252; + case 0x0410: return 1252; + case 0x0411: return 932; + case 0x0412: return 949; + case 0x0413: return 1252; + case 0x0414: return 1252; + case 0x0415: return 1250; + case 0x0416: return 1252; + case 0x0417: return 1252; + case 0x0418: return 1250; + case 0x0419: return 1251; + case 0x041a: return 1250; + case 0x041b: return 1250; + case 0x041c: return 1250; + case 0x041d: return 1252; + case 0x041e: return 874; + case 0x041f: return 1254; + case 0x0420: return 1256; + case 0x0421: return 1252; + case 0x0422: return 1251; + case 0x0423: return 1251; + case 0x0424: return 1250; + case 0x0425: return 1257; + case 0x0426: return 1257; + case 0x0427: return 1257; + case 0x0428: return 1251; + case 0x0429: return 1256; + case 0x042a: return 1258; + case 0x042b: return 0; + case 0x042c: return 1254; + case 0x042d: return 1252; + case 0x042e: return 1252; + case 0x042f: return 1251; + case 0x0430: return 0; + case 0x0431: return 0; + case 0x0432: return 1252; + case 0x0433: return 32759; + case 0x0434: return 1252; + case 0x0435: return 1252; + case 0x0436: return 1252; + case 0x0437: return 0; + case 0x0438: return 1252; + case 0x0439: return 0; + case 0x043a: return 0; + case 0x043b: return 1252; + case 0x043d: return 32759; + case 0x043e: return 1252; + case 0x043f: return 0; + case 0x0440: return 1251; + case 0x0441: return 1252; + case 0x0442: return 1250; + case 0x0443: return 1254; + case 0x0444: return 1251; + case 0x0445: return 0; + case 0x0446: return 0; + case 0x0447: return 0; + case 0x0448: return 0; + case 0x0449: return 0; + case 0x044a: return 0; + case 0x044b: return 0; + case 0x044c: return 0; + case 0x044d: return 0; + case 0x044e: return 0; + case 0x044f: return 0; + case 0x0450: return 1251; + case 0x0451: return 0; + case 0x0452: return 1252; + case 0x0453: return 0; + case 0x0454: return 0; + case 0x0455: return 0; + case 0x0456: return 1252; + case 0x0457: return 0; + case 0x0458: return 32759; + case 0x0459: return 32759; + case 0x045a: return 0; + case 0x045b: return 0; + case 0x045c: return 0; + case 0x045d: return 0; + case 0x045e: return 0; + case 0x045f: return 32759; + case 0x0460: return 32759; + case 0x0461: return 0; + case 0x0462: return 1252; + case 0x0463: return 0; + case 0x0464: return 1252; + case 0x0465: return 0; + case 0x0466: return 32759; + case 0x0467: return 32759; + case 0x0468: return 1252; + case 0x0469: return 32759; + case 0x046a: return 1252; + case 0x046b: return 1252; + case 0x046c: return 1252; + case 0x046d: return 1251; + case 0x046e: return 1252; + case 0x046f: return 1252; + case 0x0470: return 1252; + case 0x0471: return 32759; + case 0x0472: return 0; + case 0x0473: return 0; + case 0x0474: return 1252; + case 0x0475: return 1252; + case 0x0476: return 32759; + case 0x0477: return 0; + case 0x0478: return 0; + case 0x0479: return 32759; + case 0x047a: return 1252; + case 0x047c: return 1252; + case 0x047e: return 1252; + case 0x0480: return 1256; + case 0x0481: return 0; + case 0x0482: return 1252; + case 0x0483: return 1252; + case 0x0484: return 1252; + case 0x0485: return 1251; + case 0x0486: return 1252; + case 0x0487: return 1252; + case 0x0488: return 1252; + case 0x048c: return 1256; + case 0x048d: return 32759; + case 0x048e: return 32759; + case 0x048f: return 32759; + case 0x0490: return 32759; + case 0x0491: return 1252; + case 0x0492: return 1256; + case 0x0493: return 32759; + case 0x0501: return 1250; + case 0x05fe: return 932; + case 0x0801: return 1256; + case 0x0803: return 1252; + case 0x0804: return 936; + case 0x0807: return 1252; + case 0x0809: return 1252; + case 0x080a: return 1252; + case 0x080c: return 1252; + case 0x0810: return 1252; + case 0x0811: return 32759; + case 0x0813: return 1252; + case 0x0814: return 1252; + case 0x0816: return 1252; + case 0x0818: return 0; + case 0x0819: return 32759; + case 0x081a: return 1250; + case 0x081d: return 1252; + case 0x0820: return 0; + case 0x0827: return 32759; + case 0x082c: return 1251; + case 0x082e: return 1252; + case 0x0832: return 1252; + case 0x083b: return 1252; + case 0x083c: return 1252; + case 0x083e: return 1252; + case 0x0843: return 1251; + case 0x0845: return 0; + case 0x0846: return 1256; + case 0x0849: return 0; + case 0x0850: return 0; + case 0x0851: return 32759; + case 0x0859: return 1256; + case 0x085d: return 1252; + case 0x085f: return 1252; + case 0x0860: return 32759; + case 0x0861: return 0; + case 0x0867: return 1252; + case 0x086b: return 1252; + case 0x0873: return 0; + case 0x09ff: return 1256; + case 0x0c01: return 1256; + case 0x0c04: return 950; + case 0x0c07: return 1252; + case 0x0c09: return 1252; + case 0x0c0a: return 1252; + case 0x0c0c: return 1252; + case 0x0c1a: return 1251; + case 0x0c3b: return 1252; + case 0x0c5f: return 32759; + case 0x0c6b: return 1252; + case 0x1001: return 1256; + case 0x1004: return 936; + case 0x1007: return 1252; + case 0x1009: return 1252; + case 0x100a: return 1252; + case 0x100c: return 1252; + case 0x101a: return 1250; + case 0x103b: return 1252; + case 0x1401: return 1256; + case 0x1404: return 950; + case 0x1407: return 1252; + case 0x1409: return 1252; + case 0x140a: return 1252; + case 0x140c: return 1252; + case 0x141a: return 1250; + case 0x143b: return 1252; + case 0x1801: return 1256; + case 0x1809: return 1252; + case 0x180a: return 1252; + case 0x180c: return 1252; + case 0x181a: return 1250; + case 0x183b: return 1252; + case 0x1c01: return 1256; + case 0x1c09: return 1252; + case 0x1c0a: return 1252; + case 0x1c0c: return 32759; + case 0x1c1a: return 1251; + case 0x1c3b: return 1252; + case 0x2001: return 1256; + case 0x2008: return 32759; + case 0x2009: return 1252; + case 0x200a: return 1252; + case 0x200c: return 0; + case 0x201a: return 1251; + case 0x203b: return 1252; + case 0x2401: return 1256; + case 0x2409: return 1252; + case 0x240a: return 1252; + case 0x240c: return 0; + case 0x241a: return 1250; + case 0x243b: return 1252; + case 0x2801: return 1256; + case 0x2809: return 1252; + case 0x280a: return 1252; + case 0x280c: return 0; + case 0x281a: return 1251; + case 0x2c01: return 1256; + case 0x2c09: return 1252; + case 0x2c0a: return 1252; + case 0x2c0c: return 0; + case 0x2c1a: return 1250; + case 0x3001: return 1256; + case 0x3009: return 1252; + case 0x300a: return 1252; + case 0x300c: return 0; + case 0x301a: return 1251; + case 0x3401: return 1256; + case 0x3409: return 1252; + case 0x340a: return 1252; + case 0x340c: return 0; + case 0x3801: return 1256; + case 0x3809: return 32759; + case 0x380a: return 1252; + case 0x380c: return 0; + case 0x3c01: return 1256; + case 0x3c09: return 0; + case 0x3c0a: return 1252; + case 0x3c0c: return 0; + case 0x4001: return 1256; + case 0x4009: return 1252; + case 0x400a: return 1252; + case 0x4401: return 32759; + case 0x4409: return 1252; + case 0x440a: return 1252; + case 0x4801: return 32759; + case 0x4809: return 1252; + case 0x480a: return 1252; + case 0x4c09: return 32759; + case 0x4c0a: return 1252; + case 0x5009: return 32759; + case 0x500a: return 1252; + case 0x5409: return 32759; + case 0x540a: return 1252; + case 0x5809: return 32759; + case 0x5c09: return 32759; + case 0x6009: return 32759; + case 0x6409: return 32759; + case 0x641a: return 1251; + case 0x681a: return 1250; + case 0x6c1a: return 1251; + case 0x701a: return 1250; + case 0x703b: return 1252; + case 0x742c: return 1251; + case 0x743b: return 1252; + case 0x7804: return 936; + case 0x7814: return 1252; + case 0x781a: return 1250; + case 0x782c: return 1254; + case 0x783b: return 1252; + case 0x7843: return 1251; + case 0x7850: return 1251; + case 0x785d: return 0; + case 0x7c04: return 950; + case 0x7c14: return 1252; + case 0x7c1a: return 1250; + case 0x7c28: return 1251; + case 0x7c2e: return 1252; + case 0x7c3b: return 1252; + case 0x7c43: return 1254; + case 0x7c46: return 1256; + case 0x7c50: return 0; + case 0x7c59: return 1256; + case 0x7c5c: return 0; + case 0x7c5d: return 1252; + case 0x7c5f: return 1252; + case 0x7c67: return 1252; + case 0x7c68: return 1252; + case 0x7c92: return 1256; + default: return 0; + } + } } diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java index 5b0941e4b3..ef2771b9fe 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java @@ -50,6 +50,7 @@ import org.apache.poi.hsmf.parsers.POIFSChunkParser; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.CodePageUtil; +import org.apache.poi.util.LocaleUtil; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; @@ -210,8 +211,21 @@ public class MAPIMessage extends POIReadOnlyDocument { * returnNullOnMissingChunk is set */ public String getHtmlBody() throws ChunkNotFoundException { - if(mainChunks.getHtmlBodyChunkBinary() != null) { - return mainChunks.getHtmlBodyChunkBinary().getAs7bitString(); + ByteChunk htmlBodyBinaryChunk = mainChunks.getHtmlBodyChunkBinary(); + if (htmlBodyBinaryChunk != null) { + List cpid = mainChunks.getProperties().get(MAPIProperty.INTERNET_CPID); + if (cpid != null && cpid.size() > 0) { + int codepage = ((LongPropertyValue) cpid.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + byte[] htmlBodyBinary = htmlBodyBinaryChunk.getValue(); + return new String(htmlBodyBinary, encoding); + } catch (UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "HTML body binary: Invalid codepage ID ", codepage, " set for the message via ", + MAPIProperty.INTERNET_CPID, ", ignoring"); + } + } + return htmlBodyBinaryChunk.getAs7bitString(); } return getStringFromChunk(mainChunks.getHtmlBodyChunkString()); } @@ -391,67 +405,86 @@ public class MAPIMessage extends POIReadOnlyDocument { *

Bug #49441 has more on why this is needed

*/ public void guess7BitEncoding() { - // First choice is a codepage property - for (MAPIProperty prop : new MAPIProperty[] { - MAPIProperty.MESSAGE_CODEPAGE, - MAPIProperty.INTERNET_CPID - }) { - List val = mainChunks.getProperties().get(prop); - if (val != null && val.size() > 0) { - int codepage = ((LongPropertyValue)val.get(0)).getValue(); - try { - String encoding = CodePageUtil.codepageToEncoding(codepage, true); - set7BitEncoding(encoding); - return; - } catch(UnsupportedEncodingException e) { - logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, - " set for the message via ", prop, ", ignoring"); - } + String generalcodepage = null; + String htmlbodycodepage = null; + String bodycodepage = null; + // + // General codepage: Message codepage property. + // + List val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_CODEPAGE); + if (val != null && val.size() > 0) { + int codepage = ((LongPropertyValue) val.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + generalcodepage = encoding; + } catch (UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ", + MAPIProperty.MESSAGE_CODEPAGE, ", ignoring"); + } + } + // + // General codepage fallback: Message locale ID property. + // + if (generalcodepage == null) { + val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_LOCALE_ID); + if (val != null && val.size() > 0) { + int lcid = ((LongPropertyValue) val.get(0)).getValue(); + int codepage = LocaleUtil.getDefaultCodePageFromLCID(lcid); + try { + if (codepage != 0) { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + generalcodepage = encoding; + } + } catch (UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, "from locale ID", lcid, " set for the message via ", + MAPIProperty.MESSAGE_LOCALE_ID, ", ignoring"); } - } - - - // Second choice is a charset on a content type header - try { + } + } + // + // General codepage fallback: Charset on a content type header. + // + if (generalcodepage == null) { + try { String[] headers = getHeaders(); - if(headers != null && headers.length > 0) { - // Look for a content type with a charset - Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); - - for(String header : headers) { - if(header.startsWith("Content-Type")) { - Matcher m = p.matcher(header); - if(m.matches()) { - // Found it! Tell all the string chunks - String charset = m.group(1); - - if (!charset.equalsIgnoreCase("utf-8")) { - set7BitEncoding(charset); - } - return; - } + if (headers != null && headers.length > 0) { + Pattern p = Pattern.compile("content-type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); + for (String header : headers) { + if (header.toLowerCase().startsWith("content-type")) { + Matcher m = p.matcher(header); + if (m.matches()) { + String encoding = m.group(1); + generalcodepage = encoding; } - } + } + } } - } catch(ChunkNotFoundException e) {} - - // Nothing suitable in the headers, try HTML - try { - String html = getHtmlBody(); - if(html != null && html.length() > 0) { - // Look for a content type in the meta headers - Pattern p = Pattern.compile( - " 0) { + int codepage = ((LongPropertyValue) val.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + htmlbodycodepage = encoding; + if (!encoding.equalsIgnoreCase("utf-8")) { + bodycodepage = encoding; } - } catch(ChunkNotFoundException e) {} - } + } catch (UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ", + MAPIProperty.INTERNET_CPID, ", ignoring"); + } + } + // + // Apply encoding + // + set7BitEncoding(generalcodepage, htmlbodycodepage, bodycodepage); + } /** * Many messages store their strings as unicode, which is @@ -464,26 +497,41 @@ public class MAPIMessage extends POIReadOnlyDocument { * @see #guess7BitEncoding() */ public void set7BitEncoding(String charset) { + set7BitEncoding(charset, charset, charset); + } + public void set7BitEncoding(String generalcharset, String htmlbodycharset, String bodycharset) { for(Chunk c : mainChunks.getChunks()) { if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); + if (c.getChunkId() == MAPIProperty.BODY_HTML.id) { + if (htmlbodycharset != null) { + ((StringChunk)c).set7BitEncoding(htmlbodycharset); + } + } + else if (c.getChunkId() == MAPIProperty.BODY.id) { + if (bodycharset != null) { + ((StringChunk)c).set7BitEncoding(bodycharset); + } + } + else if (generalcharset != null) { + ((StringChunk)c).set7BitEncoding(generalcharset); + } } } - - if (nameIdChunks!=null) { - for(Chunk c : nameIdChunks.getChunks()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); - } - } - } - - for(RecipientChunks rc : recipientChunks) { - for(Chunk c : rc.getAll()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); - } - } + if (generalcharset != null) { + if (nameIdChunks!=null) { + for(Chunk c : nameIdChunks.getChunks()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(generalcharset); + } + } + } + for(RecipientChunks rc : recipientChunks) { + for(Chunk c : rc.getAll()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(generalcharset); + } + } + } } } diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java index b7a0aed574..ba54f70101 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java @@ -512,6 +512,8 @@ public class MAPIProperty { new MAPIProperty(0x1a, ASCII_STRING, "MessageClass", "PR_MESSAGE_CLASS"); public static final MAPIProperty MESSAGE_CODEPAGE = new MAPIProperty(0x3ffd, Types.LONG, "MessageCodepage", "PR_MESSAGE_CODEPAGE"); + public static final MAPIProperty MESSAGE_LOCALE_ID = + new MAPIProperty(0x3ff1, Types.LONG, "MessageLocaleId", "PR_MESSAGE_LOCALE_ID"); public static final MAPIProperty MESSAGE_DELIVERY_ID = new MAPIProperty(0x1b, BINARY, "MessageDeliveryId", "PR_MESSAGE_DELIVERY_ID"); public static final MAPIProperty MESSAGE_DELIVERY_TIME = diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java b/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java index 77dfb5e4cb..d0d057c28f 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java @@ -39,7 +39,8 @@ import org.junit.runners.Suite; TestPOIFSChunkParser.class, TestMessageSubmissionChunkY2KRead.class, TestMessageSubmissionChunk.class, - TestExtractEmbeddedMSG.class + TestExtractEmbeddedMSG.class, + Test7BitCodepage.class }) public class AllHSMFTests { } diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java b/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java new file mode 100644 index 0000000000..0ec5eb7b56 --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java @@ -0,0 +1,85 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hsmf; + +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.poi.POIDataSamples; + +/** + * Tests to verify if code page for general properties like subject, + * text body and html body is evaluated correctly. + */ +public final class Test7BitCodepage extends TestCase { + private final MAPIMessage ascii_cp1251_lcid1049; + private final MAPIMessage ascii_utf_8_cp1252_lcid1031; + private final MAPIMessage ascii_utf_8_cp1252_lcid1031_html; + private final MAPIMessage htmlbodybinary_cp1251; + private final MAPIMessage htmlbodybinary_utf_8; + + /** + * Initialize this test, load up the messages. + * @throws Exception + */ + public Test7BitCodepage() throws IOException { + POIDataSamples samples = POIDataSamples.getHSMFInstance(); + ascii_cp1251_lcid1049 = new MAPIMessage(samples.openResourceAsStream("ASCII_CP1251_LCID1049.msg")); + ascii_utf_8_cp1252_lcid1031 = new MAPIMessage(samples.openResourceAsStream("ASCII_UTF-8_CP1252_LCID1031.msg")); + ascii_utf_8_cp1252_lcid1031_html = new MAPIMessage(samples.openResourceAsStream("ASCII_UTF-8_CP1252_LCID1031_HTML.msg")); + htmlbodybinary_cp1251 = new MAPIMessage(samples.openResourceAsStream("HTMLBodyBinary_CP1251.msg")); + htmlbodybinary_utf_8 = new MAPIMessage(samples.openResourceAsStream("HTMLBodyBinary_UTF-8.msg")); + } + + /** + * Evaluate encoding and check if the subject, text body and html body is decoded correctly. + */ + public void test7BitEncoding() throws Exception { + ascii_cp1251_lcid1049.guess7BitEncoding(); + ascii_cp1251_lcid1049.setReturnNullOnMissingChunk(true); + ascii_utf_8_cp1252_lcid1031.guess7BitEncoding(); + ascii_utf_8_cp1252_lcid1031.setReturnNullOnMissingChunk(true); + ascii_utf_8_cp1252_lcid1031_html.guess7BitEncoding(); + ascii_utf_8_cp1252_lcid1031_html.setReturnNullOnMissingChunk(true); + htmlbodybinary_cp1251.guess7BitEncoding(); + htmlbodybinary_cp1251.setReturnNullOnMissingChunk(true); + htmlbodybinary_utf_8.guess7BitEncoding(); + htmlbodybinary_utf_8.setReturnNullOnMissingChunk(true); + + assertEquals("Subject автоматически Subject", ascii_cp1251_lcid1049.getSubject()); + assertEquals("Body автоматически Body", ascii_cp1251_lcid1049.getTextBody()); + assertEquals("HTML автоматически", ascii_cp1251_lcid1049.getHtmlBody()); + + assertEquals("Subject öäü Subject", ascii_utf_8_cp1252_lcid1031.getSubject()); + assertEquals("Body öäü Body", ascii_utf_8_cp1252_lcid1031.getTextBody()); + assertNull(ascii_utf_8_cp1252_lcid1031.getHtmlBody()); + + assertEquals("Subject öäü Subject", ascii_utf_8_cp1252_lcid1031_html.getSubject()); + assertEquals("Body öäü Body", ascii_utf_8_cp1252_lcid1031_html.getTextBody()); + assertEquals("HTML öäü", ascii_utf_8_cp1252_lcid1031_html.getHtmlBody()); + + assertEquals("Subject öäü Subject", htmlbodybinary_cp1251.getSubject()); + assertNull(htmlbodybinary_cp1251.getTextBody()); + assertEquals("HTML автоматически", htmlbodybinary_cp1251.getHtmlBody()); + + assertEquals("Subject öäü Subject", htmlbodybinary_utf_8.getSubject()); + assertNull(htmlbodybinary_utf_8.getTextBody()); + assertEquals("HTML öäü", htmlbodybinary_utf_8.getHtmlBody()); + } +} diff --git a/test-data/hsmf/ASCII_CP1251_LCID1049.msg b/test-data/hsmf/ASCII_CP1251_LCID1049.msg new file mode 100644 index 0000000000..2936aa2593 Binary files /dev/null and b/test-data/hsmf/ASCII_CP1251_LCID1049.msg differ diff --git a/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031.msg b/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031.msg new file mode 100644 index 0000000000..fa61f8bb8d Binary files /dev/null and b/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031.msg differ diff --git a/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg b/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg new file mode 100644 index 0000000000..a51844b12f Binary files /dev/null and b/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg differ diff --git a/test-data/hsmf/HTMLBodyBinary_CP1251.msg b/test-data/hsmf/HTMLBodyBinary_CP1251.msg new file mode 100644 index 0000000000..13967a31e7 Binary files /dev/null and b/test-data/hsmf/HTMLBodyBinary_CP1251.msg differ diff --git a/test-data/hsmf/HTMLBodyBinary_UTF-8.msg b/test-data/hsmf/HTMLBodyBinary_UTF-8.msg new file mode 100644 index 0000000000..34bdac82f3 Binary files /dev/null and b/test-data/hsmf/HTMLBodyBinary_UTF-8.msg differ