[bug-66532] more performant way to iterate over codepoints. Thanks to Matthias Raschhofer

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1908458 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
PJ Fanning 2023-03-17 23:35:33 +00:00
parent 3e6dddaa95
commit 0275daa5de
3 changed files with 49 additions and 15 deletions

View File

@ -31,6 +31,7 @@ import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.PrimitiveIterator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -397,37 +398,39 @@ public class SheetDataWriter implements Closeable {
return;
}
for (Iterator<String> iter = CodepointsUtil.iteratorFor(s); iter.hasNext(); ) {
String codepoint = iter.next();
int codepoint;
for (PrimitiveIterator.OfInt iter = CodepointsUtil.primitiveIterator(s); iter.hasNext(); ) {
codepoint = iter.nextInt();
switch (codepoint) {
case "<":
case '<':
_out.write("&lt;");
break;
case ">":
case '>':
_out.write("&gt;");
break;
case "&":
case '&':
_out.write("&amp;");
break;
case "\"":
case '\"':
_out.write("&quot;");
break;
// Special characters
case "\n":
case '\n':
_out.write("&#xa;");
break;
case "\r":
case '\r':
_out.write("&#xd;");
break;
case "\t":
case '\t':
_out.write("&#x9;");
break;
case "\u00A0": // NO-BREAK SPACE
case '\u00A0': // NO-BREAK SPACE
_out.write("&#xa0;");
break;
default:
if (codepoint.length() == 1) {
char c = codepoint.charAt(0);
final char[] chars = Character.toChars(codepoint);
if (chars.length == 1) {
char c = chars[0];
// YK: XmlBeans silently replaces all ISO control characters ( < 32) with question marks.
// the same rule applies to "not a character" symbols.
if (replaceWithQuestionMark(c)) {
@ -436,7 +439,7 @@ public class SheetDataWriter implements Closeable {
_out.write(c);
}
} else {
_out.write(codepoint);
_out.write(chars);
}
break;
}

View File

@ -18,12 +18,28 @@
package org.apache.poi.util;
import java.util.Iterator;
import java.util.PrimitiveIterator;
@Internal
public class CodepointsUtil {

    /**
     * Walks the given text one Unicode code point at a time and hands out each
     * code point as its own {@link String}.
     *
     * @param text to iterate over
     * @return iterator with Strings representing the codepoints
     * @see #primitiveIterator(String) a more performant iterator
     */
    public static Iterator<String> iteratorFor(String text) {
        return text.codePoints()
                .mapToObj(CodepointsUtil::asString)
                .iterator();
    }

    /**
     * Walks the given text one Unicode code point at a time without wrapping
     * the code points in objects.
     *
     * @param text to iterate over
     * @return iterator with ints representing the codepoints
     * @since POI 5.2.4
     */
    public static PrimitiveIterator.OfInt primitiveIterator(String text) {
        final PrimitiveIterator.OfInt codePoints = text.codePoints().iterator();
        return codePoints;
    }

    /**
     * Converts a single code point into the String holding its UTF-16 form.
     */
    private static String asString(int codePoint) {
        final char[] utf16 = Character.toChars(codePoint);
        return new String(utf16);
    }
}

View File

@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.IntConsumer;
import org.junit.jupiter.api.Test;
@ -38,8 +39,22 @@ class TestCodepointsUtil {
List<String> codePoints = new ArrayList<>();
CodepointsUtil.iteratorFor(unicodeSurrogates).forEachRemaining(codePoints::add);
assertEquals(17, codePoints.size());
for(String point : codePoints){
assertTrue(point.length() >=1 && point.length() <= 2, "codepoint " + point + "is wrong size");
for (String point : codePoints) {
assertTrue(point.length() >= 1 && point.length() <= 2, "codepoint " + point + "is wrong size");
}
}
/**
 * Verifies that {@link CodepointsUtil#primitiveIterator(String)} yields one int per
 * Unicode code point: the five surrogate pairs at the start of the sample each count
 * as a single code point, followed by the twelve single-char characters.
 */
@Test
void testPrimitiveIterator() {
    final String unicodeSurrogates = "\uD835\uDF4A\uD835\uDF4B\uD835\uDF4C\uD835\uDF4D\uD835\uDF4E"
            + "abcdef123456";
    List<String> codePoints = new ArrayList<>();
    // The cast selects the IntConsumer overload; PrimitiveIterator.OfInt also declares
    // forEachRemaining(Consumer<? super Integer>), so a bare lambda would be ambiguous.
    CodepointsUtil.primitiveIterator(unicodeSurrogates).forEachRemaining((IntConsumer) (i) -> {
        codePoints.add(new String(Character.toChars(i)));
    });
    // 5 supplementary code points + 12 BMP characters
    assertEquals(17, codePoints.size());
    for (String point : codePoints) {
        assertTrue(point.length() >= 1 && point.length() <= 2, "codepoint " + point + " is wrong size");
    }
    // Check the decoded values as well as the sizes, so that a surrogate pair split
    // or merged incorrectly cannot slip through.
    assertEquals("\uD835\uDF4A", codePoints.get(0));
    assertEquals("a", codePoints.get(5));
    assertEquals("6", codePoints.get(16));
}