From 0275daa5deae2e0069badd1f46268abb43fbc3dc Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Fri, 17 Mar 2023 23:35:33 +0000 Subject: [PATCH] [bug-66532] more performant way to iterate over codepoints. Thanks to Matthias Raschhofer git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1908458 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/xssf/streaming/SheetDataWriter.java | 29 ++++++++++--------- .../org/apache/poi/util/CodepointsUtil.java | 16 ++++++++++ .../apache/poi/util/TestCodepointsUtil.java | 19 ++++++++++-- 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/poi-ooxml/src/main/java/org/apache/poi/xssf/streaming/SheetDataWriter.java b/poi-ooxml/src/main/java/org/apache/poi/xssf/streaming/SheetDataWriter.java index 9352ebe133..1c4b9c0346 100644 --- a/poi-ooxml/src/main/java/org/apache/poi/xssf/streaming/SheetDataWriter.java +++ b/poi-ooxml/src/main/java/org/apache/poi/xssf/streaming/SheetDataWriter.java @@ -31,6 +31,7 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.util.Iterator; +import java.util.PrimitiveIterator; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -397,37 +398,39 @@ public class SheetDataWriter implements Closeable { return; } - for (Iterator<String> iter = CodepointsUtil.iteratorFor(s); iter.hasNext(); ) { - String codepoint = iter.next(); + int codepoint; + for (PrimitiveIterator.OfInt iter = CodepointsUtil.primitiveIterator(s); iter.hasNext(); ) { + codepoint = iter.nextInt(); switch (codepoint) { - case "<": + case '<': _out.write("&lt;"); break; - case ">": + case '>': _out.write("&gt;"); break; - case "&": + case '&': _out.write("&amp;"); break; - case "\"": + case '\"': _out.write("&quot;"); break; // Special characters - case "\n": + case '\n': _out.write("&#xa;"); break; - case "\r": + case '\r': _out.write("&#xd;"); break; - case "\t": + case '\t': _out.write("&#x9;"); break; - case "\u00A0": // NO-BREAK SPACE + case '\u00A0': // NO-BREAK SPACE
_out.write("&#xa0;"); break; default: - if (codepoint.length() == 1) { - char c = codepoint.charAt(0); + final char[] chars = Character.toChars(codepoint); + if (chars.length == 1) { + char c = chars[0]; // YK: XmlBeans silently replaces all ISO control characters ( < 32) with question marks. // the same rule applies to "not a character" symbols. if (replaceWithQuestionMark(c)) { @@ -436,7 +439,7 @@ public class SheetDataWriter implements Closeable { _out.write(c); } } else { - _out.write(codepoint); + _out.write(chars); } break; } diff --git a/poi/src/main/java/org/apache/poi/util/CodepointsUtil.java b/poi/src/main/java/org/apache/poi/util/CodepointsUtil.java index 9f2d01780c..71fcdbe85e 100644 --- a/poi/src/main/java/org/apache/poi/util/CodepointsUtil.java +++ b/poi/src/main/java/org/apache/poi/util/CodepointsUtil.java @@ -18,12 +18,28 @@ package org.apache.poi.util; import java.util.Iterator; +import java.util.PrimitiveIterator; @Internal public class CodepointsUtil { + + /** + * @param text to iterate over + * @return iterator with Strings representing the codepoints + * @see #primitiveIterator(String) a more performant iterator + */ public static Iterator<String> iteratorFor(String text) { return text.codePoints() .mapToObj(codePoint -> new String(Character.toChars(codePoint))) .iterator(); } + + /** + * @param text to iterate over + * @return iterator with ints representing the codepoints + * @since POI 5.2.4 + */ + public static PrimitiveIterator.OfInt primitiveIterator(String text) { + return text.codePoints().iterator(); + } } \ No newline at end of file diff --git a/poi/src/test/java/org/apache/poi/util/TestCodepointsUtil.java b/poi/src/test/java/org/apache/poi/util/TestCodepointsUtil.java index 3610535c95..f6d31ef5d4 100644 --- a/poi/src/test/java/org/apache/poi/util/TestCodepointsUtil.java +++ b/poi/src/test/java/org/apache/poi/util/TestCodepointsUtil.java @@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import
java.util.Iterator; import java.util.List; +import java.util.function.IntConsumer; import org.junit.jupiter.api.Test; @@ -38,8 +39,22 @@ class TestCodepointsUtil { List<String> codePoints = new ArrayList<>(); CodepointsUtil.iteratorFor(unicodeSurrogates).forEachRemaining(codePoints::add); assertEquals(17, codePoints.size()); - for(String point : codePoints){ - assertTrue(point.length() >=1 && point.length() <= 2, "codepoint " + point + "is wrong size"); + for (String point : codePoints) { + assertTrue(point.length() >= 1 && point.length() <= 2, "codepoint " + point + "is wrong size"); + } + } + + @Test + void testPrimitiveIterator() { + final String unicodeSurrogates = "\uD835\uDF4A\uD835\uDF4B\uD835\uDF4C\uD835\uDF4D\uD835\uDF4E" + + "abcdef123456"; + List<String> codePoints = new ArrayList<>(); + CodepointsUtil.primitiveIterator(unicodeSurrogates).forEachRemaining((IntConsumer) (i) -> { + codePoints.add(new String(Character.toChars(i))); + }); + assertEquals(17, codePoints.size()); + for (String point : codePoints) { + assertTrue(point.length() >= 1 && point.length() <= 2, "codepoint " + point + "is wrong size"); } }