html5lib
diff --git a/‎docs/line-col-tracking.txt
+33-33 b/‎docs/line-col-tracking.txt
+33-33
diff --git a/‎library/HTML5/Data.php
+108-108 b/‎library/HTML5/Data.php
+108-108
diff --git a/‎library/HTML5/Parser.php
+6-6 b/‎library/HTML5/Parser.php
+6-6
@@ -1,33 +1,33 @@
-Line and column number tracking
-
-The first thing is that it's trivial to do in-efficiently. The second thing
-is that it's nontrivial to do it efficiently. We have two competing interests
-at stake:
-
-1. Code brevity and comprehensibility, which means that we factor things out,
-   make methods, sub-calls, a stream class, etc. This is what the Python and
-   Ruby implementations did.
-2. Performance, which means avoiding function calls like the plague.
-
-This is mildly manageable, until you realize that column numbers are tracked
-as per Unicode characters, not bytes. Which means that if we go for (2), we
-will end up with a large amount of duplicated code. Further complicating issues
-is the fact that the initial implementation PH5P opted to make function calls
-to retrieve characters. I personally find this unacceptable.
-
-It should be noted, however, that we're already performing a dynamic function
-lookup on every character of HTML, which is ALSO unacceptable. One possible
-way of restructuring this is turning the entire thing into a giant loop with
-a giant conditional. Note that the conditional is evaluated sequentially, so
-let's test whether or not that's more expensive.
-
-Some surprising results: the cost it takes to perform all of those string
-comparisons dwarfs the cost from calling functions. If you convert the states
-into integers, however, having a gigantic loop is slightly faster SO LONG AS
-you use the loop to get rid of a function call. However, if we pull the common
-code out of the individual state functions and place it in the looper, things
-work nicely.
-
-The conclusion, I suppose, is that we're going to keep the method-based state
-machine and save on method calls by moving the common code outside. Let's do 
-this right now.
+Line and column number tracking
+
+The first thing is that it's trivial to do inefficiently. The second thing
+is that it's nontrivial to do it efficiently. We have two competing interests
+at stake:
+
+1. Code brevity and comprehensibility, which means that we factor things out,
+   make methods, sub-calls, a stream class, etc. This is what the Python and
+   Ruby implementations did.
+2. Performance, which means avoiding function calls like the plague.
+
+This is mildly manageable, until you realize that column numbers are tracked
+per Unicode character, not per byte, which means that if we go for (2), we
+will end up with a large amount of duplicated code. Further complicating matters
+is the fact that the initial implementation, PH5P, opted to make function calls
+to retrieve characters. I personally find this unacceptable.
+
+It should be noted, however, that we're already performing a dynamic function
+lookup on every character of HTML, which is ALSO unacceptable. One possible
+way of restructuring this is turning the entire thing into a giant loop with
+a giant conditional. Note that the conditional is evaluated sequentially, so
+let's test whether or not that's more expensive.
+
+Some surprising results: the cost it takes to perform all of those string
+comparisons dwarfs the cost from calling functions. If you convert the states
+into integers, however, having a gigantic loop is slightly faster SO LONG AS
+you use the loop to get rid of a function call. However, if we pull the common
+code out of the individual state functions and place it in the looper, things
+work nicely.
+
+The conclusion, I suppose, is that we're going to keep the method-based state
+machine and save on method calls by moving the common code outside. Let's do 
+this right now.
@@ -1,108 +1,108 @@
-<?php
-
-// warning: this file is encoded in UTF-8!
-
-class HTML5_Data
-{
-    
-    // at some point this should be moved to a .ser file. Another
-    // possible optimization is to give UTF-8 bytes, not Unicode
-    // codepoints
-    protected static $realCodepointTable = array(
-        0x0D => 0x000A, // LINE FEED (LF) 
-        0x80 => 0x20AC, // EURO SIGN ('€')
-        0x81 => 0xFFFD, // REPLACEMENT CHARACTER 
-        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚') 
-        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
-        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
-        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
-        0x86 => 0x2020, // DAGGER ('†')
-        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
-        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
-        0x89 => 0x2030, // PER MILLE SIGN ('‰')
-        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
-        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
-        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
-        0x8D => 0xFFFD, // REPLACEMENT CHARACTER
-        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
-        0x8F => 0xFFFD, // REPLACEMENT CHARACTER
-        0x90 => 0xFFFD, // REPLACEMENT CHARACTER
-        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
-        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
-        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
-        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
-        0x95 => 0x2022, // BULLET ('•')
-        0x96 => 0x2013, // EN DASH ('–')
-        0x97 => 0x2014, // EM DASH ('—')
-        0x98 => 0x02DC, // SMALL TILDE ('˜')
-        0x99 => 0x2122, // TRADE MARK SIGN ('™')
-        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
-        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
-        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
-        0x9D => 0xFFFD, // REPLACEMENT CHARACTER
-        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
-        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ') 
-    );
-    
-    protected static $namedCharacterReferences;
-    
-    /**
-     * Returns the "real" Unicode codepoint of a malformed character
-     * reference.
-     */
-    public static function getRealCodepoint($ref) {
-        if (!isset(self::$realCodepointTable[$ref])) return false;
-        else return self::$realCodepointTable[$ref];
-    }
-    
-    public static function getNamedCharacterReferences() {
-        if (!self::$namedCharacterReferences) {
-            self::$namedCharacterReferences = unserialize(
-                file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
-        }
-        return self::$namedCharacterReferences;
-    }
-    
-    /**
-     * Converts a Unicode codepoint to sequence of UTF-8 bytes.
-     * @note Shamelessly stolen from HTML Purifier, which is also
-     *       shamelessly stolen from Feyd (which is in public domain).
-     */
-    public static function utf8chr($code) {
-        if($code > 0x10FFFF or $code < 0x0 or
-          ($code >= 0xD800 and $code <= 0xDFFF) ) {
-            // bits are set outside the "valid" range as defined
-            // by UNICODE 4.1.0
-            return "\xEF\xBF\xBD";
-        }
-
-        $x = $y = $z = $w = 0;
-        if ($code < 0x80) {
-            // regular ASCII character
-            $x = $code;
-        } else {
-            // set up bits for UTF-8
-            $x = ($code & 0x3F) | 0x80;
-            if ($code < 0x800) {
-               $y = (($code & 0x7FF) >> 6) | 0xC0;
-            } else {
-                $y = (($code & 0xFC0) >> 6) | 0x80;
-                if($code < 0x10000) {
-                    $z = (($code >> 12) & 0x0F) | 0xE0;
-                } else {
-                    $z = (($code >> 12) & 0x3F) | 0x80;
-                    $w = (($code >> 18) & 0x07) | 0xF0;
-                }
-            }
-        }
-        // set up the actual character
-        $ret = '';
-        if($w) $ret .= chr($w);
-        if($z) $ret .= chr($z);
-        if($y) $ret .= chr($y);
-        $ret .= chr($x);
-
-        return $ret;
-    }
-    
-}
+<?php
+
+// warning: this file is encoded in UTF-8!
+
+class HTML5_Data
+{
+    // Replacement table for malformed character references: keys 0x0D and
+    // 0x80-0x9F map to the Unicode codepoints returned by getRealCodepoint().
+    // At some point this should be moved to a .ser file. Another possible
+    // optimization is to give UTF-8 bytes, not Unicode codepoints.
+    protected static $realCodepointTable = array(
+        0x0D => 0x000A, // LINE FEED (LF); note the key 0x0D is CARRIAGE RETURN (CR)
+        0x80 => 0x20AC, // EURO SIGN ('€')
+        0x81 => 0xFFFD, // REPLACEMENT CHARACTER 
+        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚') 
+        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
+        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
+        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
+        0x86 => 0x2020, // DAGGER ('†')
+        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
+        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
+        0x89 => 0x2030, // PER MILLE SIGN ('‰')
+        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
+        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
+        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
+        0x8D => 0xFFFD, // REPLACEMENT CHARACTER
+        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
+        0x8F => 0xFFFD, // REPLACEMENT CHARACTER
+        0x90 => 0xFFFD, // REPLACEMENT CHARACTER
+        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
+        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
+        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
+        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
+        0x95 => 0x2022, // BULLET ('•')
+        0x96 => 0x2013, // EN DASH ('–')
+        0x97 => 0x2014, // EM DASH ('—')
+        0x98 => 0x02DC, // SMALL TILDE ('˜')
+        0x99 => 0x2122, // TRADE MARK SIGN ('™')
+        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
+        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
+        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
+        0x9D => 0xFFFD, // REPLACEMENT CHARACTER
+        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
+        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ') 
+    );
+    
+    protected static $namedCharacterReferences; // cache filled by getNamedCharacterReferences()
+    
+    /**
+     * Returns the "real" Unicode codepoint of a malformed character reference:
+     * the $realCodepointTable entry for $ref, or false when $ref has no entry.
+     */
+    public static function getRealCodepoint($ref) {
+        if (!isset(self::$realCodepointTable[$ref])) return false;
+        else return self::$realCodepointTable[$ref];
+    }
+    
+    public static function getNamedCharacterReferences() {
+        if (!self::$namedCharacterReferences) { // first call, or a previous load produced a falsy value
+            self::$namedCharacterReferences = unserialize( // decode once and keep the result in the static cache
+                file_get_contents(dirname(__FILE__) . '/named-character-references.ser')); // file sits next to this one
+        }
+        return self::$namedCharacterReferences; // whatever unserialize() produced
+    }
+    
+    /**
+     * Converts a Unicode codepoint to a sequence of UTF-8 bytes. Codepoints
+     * above 0x10FFFF, below 0, or in 0xD800-0xDFFF yield "\xEF\xBF\xBD".
+     * @note Shamelessly stolen from HTML Purifier, which stole it from Feyd (public domain).
+     */
+    public static function utf8chr($code) {
+        if($code > 0x10FFFF or $code < 0x0 or
+          ($code >= 0xD800 and $code <= 0xDFFF) ) {
+            // bits are set outside the "valid" range as defined
+            // by UNICODE 4.1.0
+            return "\xEF\xBF\xBD"; // UTF-8 bytes of 0xFFFD
+        }
+
+        $x = $y = $z = $w = 0; // $x is the last byte; $y, $z, $w precede it (0 = unused)
+        if ($code < 0x80) {
+            // regular ASCII character
+            $x = $code;
+        } else {
+            // multi-byte: $x is the trailing byte carrying the low 6 bits
+            $x = ($code & 0x3F) | 0x80;
+            if ($code < 0x800) {
+               $y = (($code & 0x7FF) >> 6) | 0xC0; // first byte of a 2-byte sequence
+            } else {
+                $y = (($code & 0xFC0) >> 6) | 0x80; // middle byte carrying bits 6-11
+                if($code < 0x10000) {
+                    $z = (($code >> 12) & 0x0F) | 0xE0; // first byte of a 3-byte sequence
+                } else {
+                    $z = (($code >> 12) & 0x3F) | 0x80; // middle byte carrying bits 12-17
+                    $w = (($code >> 18) & 0x07) | 0xF0; // first byte of a 4-byte sequence
+                }
+            }
+        }
+        // emit the bytes in order $w, $z, $y, $x; slots left at 0 are skipped
+        $ret = '';
+        if($w) $ret .= chr($w);
+        if($z) $ret .= chr($z);
+        if($y) $ret .= chr($y);
+        $ret .= chr($x);
+
+        return $ret;
+    }
+    
+}
@@ -1,6 +1,6 @@
-<?php
-
-class HTML5_Parser
-{
-    
-}
+<?php
+
+class HTML5_Parser
+{
+    // No members are defined in this class yet.
+}