Add XSSFOptimiser - XSSF equivalent of HSSFOptimiser (#952)

* Add XSSFOptimiser - XSSF equivalent of HSSFOptimiser

Adds optimization methods for XSSF workbooks to address style explosion
issues. HSSFOptimiser exists for HSSF but no equivalent existed for XSSF.

Methods added:
- optimiseCellStyles(XSSFWorkbook): Remaps cells using duplicate styles
  to canonical (first) occurrence
- optimiseFonts(XSSFWorkbook): Updates style font references to canonical
  fonts

Implementation uses a safe remapping approach rather than removing entries
from XML, which avoids XmlValueDisconnectedException issues caused by
StylesTable's internal list becoming disconnected from CTStylesheet.

Includes comprehensive test coverage in TestXSSFOptimiser.

Closes #951

* Address PR review comments

- Add "since 6.0.0" javadoc tag to XSSFOptimiser class
- Use Locale.ROOT in String.format() to comply with forbidden-apis check
This commit is contained in:
Gili Tzabari 2025-12-01 13:30:27 -05:00 committed by GitHub
parent f44ece137f
commit c953a8ff22
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 554 additions and 0 deletions

View File

@ -0,0 +1,249 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.usermodel;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Color;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.model.StylesTable;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCellXfs;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTStylesheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTXf;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
/**
* XSSF equivalent of HSSFOptimiser.
*
* <p>Provides methods to optimise cell styles and fonts in XSSF workbooks
* by remapping cells that use duplicate styles/fonts to use canonical
* (first) occurrences.</p>
*
* <p>This helps avoid "style explosion" issues where Excel workbooks accumulate
* too many style definitions, causing performance problems or file corruption.</p>
*
* <p>Usage:</p>
* <pre>
* XSSFWorkbook workbook = new XSSFWorkbook(file);
* XSSFOptimiser.optimiseFonts(workbook);
* XSSFOptimiser.optimiseCellStyles(workbook);
* workbook.write(outputStream);
* </pre>
*
* <p>Note: Unlike HSSFOptimiser which can remove unused styles from the binary
* format, this implementation remaps cells to canonical styles without removing
* entries from the XML. This is because XSSF's StylesTable maintains a separate
* Java list from the CTStylesheet XML, and removing entries can cause
* XmlValueDisconnectedException on save. The remapping approach prevents
* further style explosion and is safe.</p>
*
* @see org.apache.poi.hssf.usermodel.HSSFOptimiser
* @since 6.0.0
*/
public final class XSSFOptimiser {
private XSSFOptimiser() {
// no instances of this class
}
/**
* Goes through the Workbook, optimising the cell styles by remapping
* cells that use duplicate styles to use the canonical (first) occurrence.
*
* <p>Two styles are considered duplicates if they have identical properties:
* font properties (not just font index), data format, alignment, borders,
* fill colors, etc.</p>
*
* <p>For best results, call {@link #optimiseFonts(XSSFWorkbook)} first.</p>
*
* @param workbook The workbook to optimise
* @return The number of cells that were remapped to canonical styles
*/
public static int optimiseCellStyles(XSSFWorkbook workbook) {
int numStyles = workbook.getNumCellStyles();
if (numStyles <= 1) {
return 0; // Nothing to optimize
}
// Step 1: Build signature map for duplicate detection
Map<String, Integer> signatureToCanonical = new LinkedHashMap<>();
Map<Integer, Integer> oldToCanonical = new HashMap<>();
for (int i = 0; i < numStyles; i++) {
XSSFCellStyle style = workbook.getCellStyleAt(i);
String signature = getStyleSignature(style);
if (signatureToCanonical.containsKey(signature)) {
// Duplicate found - map to earlier canonical style
oldToCanonical.put(i, signatureToCanonical.get(signature));
} else {
// First occurrence - this is the canonical style
signatureToCanonical.put(signature, i);
oldToCanonical.put(i, i);
}
}
// Step 2: Remap all cells to use canonical styles
int cellsRemapped = 0;
for (int sheetIdx = 0; sheetIdx < workbook.getNumberOfSheets(); sheetIdx++) {
XSSFSheet sheet = workbook.getSheetAt(sheetIdx);
for (Row row : sheet) {
if (row == null) continue;
for (Cell cell : row) {
if (cell == null) continue;
int currentIdx = cell.getCellStyle().getIndex();
int canonicalIdx = oldToCanonical.getOrDefault(currentIdx, currentIdx);
if (canonicalIdx != currentIdx) {
cell.setCellStyle(workbook.getCellStyleAt(canonicalIdx));
cellsRemapped++;
}
}
}
}
return cellsRemapped;
}
/**
* Goes through the Workbook, optimising the fonts by updating cell styles
* that reference duplicate fonts to use the canonical (first) occurrence.
*
* <p>Two fonts are considered duplicates if they have identical properties:
* name, size, bold, italic, strikeout, underline, color, typeOffset, charset.</p>
*
* @param workbook The workbook to optimise
* @return The number of duplicate fonts found
*/
public static int optimiseFonts(XSSFWorkbook workbook) {
int numFonts = workbook.getNumberOfFonts();
if (numFonts <= 1) {
return 0; // Nothing to optimize
}
// Step 1: Build signature map for duplicate detection
Map<String, Integer> signatureToCanonical = new LinkedHashMap<>();
Map<Integer, Integer> oldToCanonical = new HashMap<>();
int duplicatesFound = 0;
for (int i = 0; i < numFonts; i++) {
XSSFFont font = workbook.getFontAt(i);
String signature = getFontSignature(font);
if (signatureToCanonical.containsKey(signature)) {
oldToCanonical.put(i, signatureToCanonical.get(signature));
duplicatesFound++;
} else {
signatureToCanonical.put(signature, i);
oldToCanonical.put(i, i);
}
}
if (duplicatesFound == 0) {
return 0; // No duplicates to remap
}
// Step 2: Update font references in cell styles' CTXf entries
StylesTable stylesTable = workbook.getStylesSource();
CTStylesheet ctStylesheet = stylesTable.getCTStylesheet();
CTCellXfs ctXfs = ctStylesheet.getCellXfs();
if (ctXfs != null) {
for (int i = 0; i < ctXfs.sizeOfXfArray(); i++) {
CTXf xf = ctXfs.getXfArray(i);
if (xf.isSetFontId()) {
int oldFontId = (int) xf.getFontId();
int canonicalFontId = oldToCanonical.getOrDefault(oldFontId, oldFontId);
if (canonicalFontId != oldFontId) {
xf.setFontId(canonicalFontId);
}
}
}
}
return duplicatesFound;
}
/**
* Generate a signature string for a cell style to detect duplicates.
* Uses font PROPERTIES (not index) so styles with identical but separate fonts
* are detected as duplicates.
*/
private static String getStyleSignature(XSSFCellStyle style) {
StringBuilder sb = new StringBuilder();
// Use font properties, not index, so identical fonts are detected
XSSFFont font = style.getFont();
sb.append("font:").append(getFontSignature(font)).append(";");
sb.append("fmt:").append(style.getDataFormat()).append(";");
sb.append("align:").append(style.getAlignment()).append(";");
sb.append("valign:").append(style.getVerticalAlignment()).append(";");
sb.append("wrap:").append(style.getWrapText()).append(";");
sb.append("rotation:").append(style.getRotation()).append(";");
sb.append("indent:").append(style.getIndention()).append(";");
sb.append("borderL:").append(style.getBorderLeft()).append(";");
sb.append("borderR:").append(style.getBorderRight()).append(";");
sb.append("borderT:").append(style.getBorderTop()).append(";");
sb.append("borderB:").append(style.getBorderBottom()).append(";");
sb.append("fillFg:").append(colorToString(style.getFillForegroundColorColor())).append(";");
sb.append("fillBg:").append(colorToString(style.getFillBackgroundColorColor())).append(";");
sb.append("fillPat:").append(style.getFillPattern()).append(";");
sb.append("locked:").append(style.getLocked()).append(";");
sb.append("hidden:").append(style.getHidden()).append(";");
return sb.toString();
}
/**
* Generate a signature string for a font to detect duplicates.
*/
private static String getFontSignature(XSSFFont font) {
StringBuilder sb = new StringBuilder();
sb.append("name:").append(font.getFontName()).append(";");
sb.append("size:").append(font.getFontHeightInPoints()).append(";");
sb.append("bold:").append(font.getBold()).append(";");
sb.append("italic:").append(font.getItalic()).append(";");
sb.append("strike:").append(font.getStrikeout()).append(";");
sb.append("underline:").append(font.getUnderline()).append(";");
sb.append("color:").append(colorToString(font.getXSSFColor())).append(";");
sb.append("typeOffset:").append(font.getTypeOffset()).append(";");
sb.append("charset:").append(font.getCharSet()).append(";");
return sb.toString();
}
/**
* Convert a color to a string for comparison.
*/
private static String colorToString(Color color) {
if (color == null) {
return "null";
}
if (color instanceof XSSFColor) {
XSSFColor xssfColor = (XSSFColor) color;
byte[] rgb = xssfColor.getRGB();
if (rgb != null) {
return String.format(Locale.ROOT, "#%02X%02X%02X", rgb[0] & 0xFF, rgb[1] & 0xFF, rgb[2] & 0xFF);
}
return "indexed:" + xssfColor.getIndexed();
}
return color.toString();
}
}

View File

@ -0,0 +1,305 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.usermodel;
import static org.junit.jupiter.api.Assertions.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.DataFormat;
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.HorizontalAlignment;
import org.apache.poi.ss.usermodel.Row;
import org.junit.jupiter.api.Test;
/**
* Tests for {@link XSSFOptimiser}.
*/
class TestXSSFOptimiser {
/**
* Test that optimiseCellStyles remaps cells with duplicate styles to canonical style.
*/
@Test
void testOptimiseCellStylesRemapsDuplicates() throws Exception {
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
XSSFSheet sheet = workbook.createSheet("Test");
// Create 10 cells, each with a "new" style that is actually identical
for (int i = 0; i < 10; i++) {
Row row = sheet.createRow(i);
Cell cell = row.createCell(0);
cell.setCellValue("Value " + i);
// Create a "new" style for each cell (causes style explosion)
CellStyle style = workbook.createCellStyle();
Font font = workbook.createFont();
font.setFontName("Calibri");
font.setFontHeightInPoints((short) 11);
font.setBold(false);
style.setFont(font);
cell.setCellStyle(style);
}
int stylesBeforeOptimize = workbook.getNumCellStyles();
assertTrue(stylesBeforeOptimize > 5,
"Should have created multiple styles before optimization, got: " + stylesBeforeOptimize);
// Optimize - returns number of cells remapped
int cellsRemapped = XSSFOptimiser.optimiseCellStyles(workbook);
assertTrue(cellsRemapped > 0,
"Should have remapped cells to canonical styles, got: " + cellsRemapped);
// Verify all cells now reference the same canonical style index
int canonicalStyleIdx = sheet.getRow(0).getCell(0).getCellStyle().getIndex();
for (int i = 1; i < 10; i++) {
Cell cell = sheet.getRow(i).getCell(0);
assertEquals(canonicalStyleIdx, cell.getCellStyle().getIndex(),
"Cell " + i + " should use canonical style index");
}
// Verify cells still have correct values
for (int i = 0; i < 10; i++) {
Cell cell = sheet.getRow(i).getCell(0);
assertEquals("Value " + i, cell.getStringCellValue());
}
}
}
/**
* Test that optimiseCellStyles handles unused styles gracefully.
*/
@Test
void testOptimiseCellStylesHandlesUnusedStyles() throws Exception {
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
XSSFSheet sheet = workbook.createSheet("Test");
// Create a cell with one style
Row row = sheet.createRow(0);
Cell cell = row.createCell(0);
cell.setCellValue("Used style");
CellStyle usedStyle = workbook.createCellStyle();
usedStyle.setAlignment(HorizontalAlignment.CENTER);
cell.setCellStyle(usedStyle);
// Create several unused styles
for (int i = 0; i < 5; i++) {
CellStyle unusedStyle = workbook.createCellStyle();
unusedStyle.setAlignment(HorizontalAlignment.LEFT);
// Never apply this style to any cell
}
int stylesBeforeOptimize = workbook.getNumCellStyles();
assertTrue(stylesBeforeOptimize >= 6,
"Should have at least 6 styles (1 default + 1 used + 5 unused), got: " + stylesBeforeOptimize);
// Optimize - should not throw, returns 0 (no duplicates to remap)
int cellsRemapped = XSSFOptimiser.optimiseCellStyles(workbook);
assertEquals(0, cellsRemapped, "No duplicate styles to remap");
// Verify the used cell still has correct alignment
CellStyle cellStyle = cell.getCellStyle();
assertEquals(HorizontalAlignment.CENTER, cellStyle.getAlignment(),
"Cell should still have CENTER alignment after optimization");
}
}
/**
* Test that optimiseCellStyles preserves different styles.
*/
@Test
void testOptimiseCellStylesPreservesDifferentStyles() throws Exception {
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
XSSFSheet sheet = workbook.createSheet("Test");
// Cell with bold style
Row row1 = sheet.createRow(0);
Cell cell1 = row1.createCell(0);
cell1.setCellValue("Bold");
CellStyle boldStyle = workbook.createCellStyle();
Font boldFont = workbook.createFont();
boldFont.setBold(true);
boldStyle.setFont(boldFont);
cell1.setCellStyle(boldStyle);
// Cell with italic style
Row row2 = sheet.createRow(1);
Cell cell2 = row2.createCell(0);
cell2.setCellValue("Italic");
CellStyle italicStyle = workbook.createCellStyle();
Font italicFont = workbook.createFont();
italicFont.setItalic(true);
italicStyle.setFont(italicFont);
cell2.setCellStyle(italicStyle);
// Cell with number format
Row row3 = sheet.createRow(2);
Cell cell3 = row3.createCell(0);
cell3.setCellValue(1234.56);
CellStyle numStyle = workbook.createCellStyle();
DataFormat format = workbook.createDataFormat();
numStyle.setDataFormat(format.getFormat("#,##0.00"));
cell3.setCellStyle(numStyle);
// Optimize
XSSFOptimiser.optimiseCellStyles(workbook);
// Verify each cell retains its distinct formatting
Font font1 = workbook.getFontAt(cell1.getCellStyle().getFontIndex());
assertTrue(font1.getBold(), "Cell 1 should still be bold");
Font font2 = workbook.getFontAt(cell2.getCellStyle().getFontIndex());
assertTrue(font2.getItalic(), "Cell 2 should still be italic");
String numFormat = cell3.getCellStyle().getDataFormatString();
assertTrue(numFormat.contains("#,##0"),
"Cell 3 should still have number format, got: " + numFormat);
}
}
/**
* Test that optimiseFonts remaps style font references to canonical fonts.
*/
@Test
void testOptimiseFontsRemapsDuplicates() throws Exception {
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
XSSFSheet sheet = workbook.createSheet("Test");
// Create cells with duplicate fonts
for (int i = 0; i < 10; i++) {
Row row = sheet.createRow(i);
Cell cell = row.createCell(0);
cell.setCellValue("Value " + i);
// Create a "new" font for each cell (identical fonts)
CellStyle style = workbook.createCellStyle();
Font font = workbook.createFont();
font.setFontName("Arial");
font.setFontHeightInPoints((short) 12);
font.setBold(true);
style.setFont(font);
cell.setCellStyle(style);
}
int fontsBeforeOptimize = workbook.getNumberOfFonts();
assertTrue(fontsBeforeOptimize > 5,
"Should have created multiple fonts before optimization, got: " + fontsBeforeOptimize);
// Optimize fonts - returns number of duplicates found
int duplicatesFound = XSSFOptimiser.optimiseFonts(workbook);
assertTrue(duplicatesFound > 0,
"Should have found duplicate fonts, got: " + duplicatesFound);
// Verify cells still have bold Arial font (formatting preserved)
for (int i = 0; i < 10; i++) {
Cell cell = sheet.getRow(i).getCell(0);
Font font = workbook.getFontAt(cell.getCellStyle().getFontIndex());
assertEquals("Arial", font.getFontName(), "Font name should be preserved");
assertTrue(font.getBold(), "Font should still be bold");
}
}
}
/**
* Test that workbook is saveable after optimization.
*/
@Test
void testWorkbookSaveableAfterOptimization() throws Exception {
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
XSSFSheet sheet = workbook.createSheet("Test");
// Create style explosion
for (int i = 0; i < 20; i++) {
Row row = sheet.createRow(i);
Cell cell = row.createCell(0);
cell.setCellValue("Value " + i);
CellStyle style = workbook.createCellStyle();
Font font = workbook.createFont();
font.setFontName("Calibri");
style.setFont(font);
cell.setCellStyle(style);
}
// Optimize
XSSFOptimiser.optimiseCellStyles(workbook);
XSSFOptimiser.optimiseFonts(workbook);
// Save to byte array (should not throw XmlValueDisconnectedException)
ByteArrayOutputStream baos = new ByteArrayOutputStream();
workbook.write(baos);
byte[] savedBytes = baos.toByteArray();
assertTrue(savedBytes.length > 0, "Saved workbook should have content");
// Verify we can read it back
try (XSSFWorkbook reloaded = new XSSFWorkbook(new ByteArrayInputStream(savedBytes))) {
XSSFSheet reloadedSheet = reloaded.getSheetAt(0);
assertEquals("Value 0", reloadedSheet.getRow(0).getCell(0).getStringCellValue());
assertEquals("Value 19", reloadedSheet.getRow(19).getCell(0).getStringCellValue());
}
}
}
/**
* Test that optimization preserves formulas.
*/
@Test
void testOptimizationPreservesFormulas() throws Exception {
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
XSSFSheet sheet = workbook.createSheet("Test");
// Create cells with values
for (int i = 0; i < 5; i++) {
Row row = sheet.createRow(i);
Cell cell = row.createCell(0);
cell.setCellValue(i + 1);
CellStyle style = workbook.createCellStyle();
style.setAlignment(HorizontalAlignment.RIGHT);
cell.setCellStyle(style);
}
// Create formula cell
Row sumRow = sheet.createRow(5);
Cell sumCell = sumRow.createCell(0);
sumCell.setCellFormula("SUM(A1:A5)");
CellStyle sumStyle = workbook.createCellStyle();
Font boldFont = workbook.createFont();
boldFont.setBold(true);
sumStyle.setFont(boldFont);
sumCell.setCellStyle(sumStyle);
// Optimize
XSSFOptimiser.optimiseCellStyles(workbook);
// Verify formula is preserved
Cell formulaCell = sheet.getRow(5).getCell(0);
assertEquals(CellType.FORMULA, formulaCell.getCellType());
assertEquals("SUM(A1:A5)", formulaCell.getCellFormula());
// Verify bold style is preserved on formula cell
Font font = workbook.getFontAt(formulaCell.getCellStyle().getFontIndex());
assertTrue(font.getBold(), "Formula cell should still be bold");
}
}
}