<?php

// warning: this file is encoded in UTF-8!

/**
 * Static lookup data and helpers for character-reference handling:
 * a remapping table for certain numeric character references, a lazily
 * loaded table of named character references, and a codepoint-to-UTF-8
 * encoder.
 */
class HTML5_Data
{
    /**
     * Replacement codepoints for specific numeric character references.
     *
     * Keys are the codepoint found in the reference, values are the Unicode
     * codepoint to use instead. The trailing comment on each entry names the
     * target character.
     *
     * Possible future work (carried over from the original notes): move this
     * table into a .ser file, and/or store UTF-8 byte strings rather than
     * Unicode codepoints.
     */
    protected static $realCodepointTable = array(
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0xFFFD, // REPLACEMENT CHARACTER
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0xFFFD, // REPLACEMENT CHARACTER
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0xFFFD, // REPLACEMENT CHARACTER
        0x90 => 0xFFFD, // REPLACEMENT CHARACTER
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0xFFFD, // REPLACEMENT CHARACTER
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    /**
     * Cache for the unserialized named-character-reference table; stays
     * unset until getNamedCharacterReferences() is first called.
     */
    protected static $namedCharacterReferences;

    /**
     * Looks up the replacement codepoint for a malformed numeric character
     * reference.
     *
     * @param int $ref Codepoint taken from the character reference.
     * @return int|false The mapped Unicode codepoint, or false when the
     *                   table has no entry for $ref.
     */
    public static function getRealCodepoint($ref)
    {
        return isset(self::$realCodepointTable[$ref])
            ? self::$realCodepointTable[$ref]
            : false;
    }

    /**
     * Returns the named-character-reference table, reading and unserializing
     * 'named-character-references.ser' (located next to this file) on the
     * first call and serving the cached value afterwards.
     *
     * @return mixed Whatever unserialize() produced from the data file.
     */
    public static function getNamedCharacterReferences()
    {
        // A truthy cached value means the table has already been loaded.
        if (self::$namedCharacterReferences) {
            return self::$namedCharacterReferences;
        }

        $serialized = file_get_contents(dirname(__FILE__) . '/named-character-references.ser');
        self::$namedCharacterReferences = unserialize($serialized);

        return self::$namedCharacterReferences;
    }

    /**
     * Encodes a single Unicode codepoint as a UTF-8 byte string.
     *
     * Codepoints below zero, above 0x10FFFF, or inside the surrogate range
     * 0xD800-0xDFFF are encoded as U+FFFD REPLACEMENT CHARACTER.
     *
     * @note Adapted from HTML Purifier, which in turn took it from Feyd
     *       (public domain).
     *
     * @param int $code Unicode codepoint.
     * @return string One to four bytes of UTF-8.
     */
    public static function utf8chr($code)
    {
        $isSurrogate = ($code >= 0xD800 && $code <= 0xDFFF);
        if ($code < 0x0 || $code > 0x10FFFF || $isSurrogate) {
            // Not an encodable scalar value: emit U+FFFD instead.
            return "\xEF\xBF\xBD";
        }

        // One byte: plain ASCII.
        if ($code < 0x80) {
            return chr($code);
        }

        // Every multi-byte form ends with a continuation byte (10xxxxxx)
        // carrying the lowest six bits.
        $tail = chr(($code & 0x3F) | 0x80);

        // Two bytes: 110xxxxx 10xxxxxx.
        if ($code < 0x800) {
            return chr((($code >> 6) & 0x1F) | 0xC0) . $tail;
        }

        // Three- and four-byte forms share a second continuation byte
        // holding bits 6-11.
        $tail = chr((($code >> 6) & 0x3F) | 0x80) . $tail;

        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
        if ($code < 0x10000) {
            return chr((($code >> 12) & 0x0F) | 0xE0) . $tail;
        }

        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        return chr((($code >> 18) & 0x07) | 0xF0)
            . chr((($code >> 12) & 0x3F) | 0x80)
            . $tail;
    }
}