Skip to content

Commit 0e7b475

Browse files
Modify charset aliases: stop treating ISO-8859-1 as an alias of windows-1252
1 parent 813a6eb commit 0e7b475

File tree

2 files changed

+7
-8
lines changed

2 files changed

+7
-8
lines changed

tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,9 @@ private static void addAll() {
9898
addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
9999
addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
100100
addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1",
101-
"ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987",
102-
"l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
101+
"ibm819", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
102+
addCharset(charset("ISO-8859-1"), "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591",
103+
"iso_8859-1", "iso_8859-1:1987");
103104
addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
104105
addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
105106
"iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254");

tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,7 @@ public void replacement() throws IOException {
177177

178178
@Test
179179
public void iso88591() throws IOException {
180-
// In the spec, iso-8859-1 is an alias for WINDOWS-1252
181-
assertWindows1252("<meta charset='iso-8859-1'>");
180+
assertCharset("<meta charset='iso-8859-1'>", StandardCharsets.ISO_8859_1);
182181
}
183182

184183
@Test
@@ -294,10 +293,9 @@ public void withCompactComment() throws IOException {
294293
@Test
295294
public void withCharsetInContentType() throws IOException {
296295
metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1");
297-
// ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
298-
assertWindows1252("");
299-
assertWindows1252("<meta charset='UTF-8'>");
300-
assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>");
296+
assertCharset("", StandardCharsets.ISO_8859_1);
297+
assertCharset("<meta charset='UTF-8'>", StandardCharsets.ISO_8859_1);
298+
assertCharset("<meta http-equiv='content-type' content='charset=utf-8'>", StandardCharsets.ISO_8859_1);
301299
// if a BOM is present, it has precedence over transport layer information
302300
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
303301
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);

0 commit comments

Comments
 (0)