|
22 | 22 | import com.google.javascript.jscomp.parsing.parser.util.ErrorReporter;
|
23 | 23 | import com.google.javascript.jscomp.parsing.parser.util.SourcePosition;
|
24 | 24 | import com.google.javascript.jscomp.parsing.parser.util.SourceRange;
|
| 25 | +import com.google.javascript.jscomp.parsing.parser.util.UnicodeMatch; |
25 | 26 | import java.util.ArrayList;
|
26 | 27 | import javax.annotation.Nullable;
|
27 | 28 |
|
@@ -869,111 +870,18 @@ private static String processUnicodeEscapes(String value) {
|
869 | 870 | return value;
|
870 | 871 | }
|
871 | 872 |
|
872 |
| - @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code |
| 873 | + /** |
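| 874 | + * Thin wrapper over UnicodeMatch.isJavascriptIdentifierStart (the original note called it "UnicodeRegex"); the old fast-path optimizations are presumably kept there -- TODO confirm. |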
| 875 | + */ |
873 | 876 | private static boolean isIdentifierStart(char ch) {
|
874 |
| - // Most code is written in pure ASCII, so create a fast path here. |
875 |
| - if (ch <= 127) { |
876 |
| - // Intentionally avoiding short circuiting behavior of "||" and "&&". |
877 |
| - // This minimizes branches in this code which minimizes branch prediction misses. |
878 |
| - return ((ch >= 'A' & ch <= 'Z') | (ch >= 'a' & ch <= 'z') | (ch == '_' | ch == '$')); |
879 |
| - } |
880 |
| - |
881 |
| - // Handle non-ASCII characters. |
882 |
| - // TODO(tjgq): This should include all characters with the ID_Start property. |
883 |
| - if (Character.isLetter(ch)) { |
884 |
| - return true; |
885 |
| - } |
886 |
| - |
887 |
| - // Workaround for b/36459436. |
888 |
| - // When running under GWT/J2CL, Character.isLetter only handles ASCII. |
889 |
| - // Angular relies heavily on Latin Small Letter Barred O and Greek Capital Letter Delta. |
890 |
| - // Greek letters are occasionally found in math code. |
891 |
| - // Latin letters are found in our own tests. |
892 |
| - return (ch >= 0x00C0 & ch <= 0x00D6) // Latin letters |
893 |
| - // 0x00D7 = multiplication sign, not a letter |
894 |
| - | (ch >= 0x00D8 & ch <= 0x00F6) // Latin letters |
895 |
| - // 0x00F7 = division sign, not a letter |
896 |
| - | (ch >= 0x00F8 & ch <= 0x00FF) // Latin letters |
897 |
| - | ch == 0x0275 // Latin Barred O |
898 |
| - | (ch >= 0x0391 & ch <= 0x03A1) // Greek uppercase letters |
899 |
| - // 0x03A2 = unassigned |
900 |
| - | (ch >= 0x03A3 & ch <= 0x03A9) // Remaining Greek uppercase letters |
901 |
| - | (ch >= 0x03B1 & ch <= 0x03C9); // Greek lowercase letters |
902 |
| - } |
903 |
| - |
904 |
| - // Check if char is Unicode Category "Combining spacing mark (Mc)" |
905 |
| - // This list is not exhaustive! |
906 |
| - @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code |
907 |
| - private static boolean isCombiningMark(char ch) { |
908 |
| - return ( |
909 |
| - // 0300-036F |
910 |
| - (0x0300 <= ch & ch <= 0x036F) | |
911 |
| - // 1AB0–1AFF |
912 |
| - (0x1AB0 <= ch & ch <= 0x1AFF) | |
913 |
| - // 1DC0–1DFF |
914 |
| - (0x1DC0 <= ch & ch <= 0x1DFF) | |
915 |
| - // 20D0–20FF |
916 |
| - (0x20D0 <= ch & ch <= 0x20FF) | |
917 |
| - // FE20–FE2F |
918 |
| - (0xFE20 <= ch & ch <= 0xFE2F) |
919 |
| - ); |
920 |
| - // TODO (ctjl): Implement in a more reliable and future-proofed way, i.e.: |
921 |
| - // return Character.getType(ch) == Character.NON_SPACING_MARK; |
922 |
| - } |
923 |
| - |
924 |
| - // TODO (ctjl): Implement |
925 |
| - private static boolean isConnectorPunctuation() { |
926 |
| - return true; |
| 877 | + return UnicodeMatch.isJavascriptIdentifierStart(ch); |
927 | 878 | }
|
928 |
| - |
929 |
| - // TODO (ctjl): Implement |
930 |
| - private static boolean isZeroWidthJoiner() { |
931 |
| - return true; |
932 |
| - } |
933 |
| - |
934 |
| - // TODO (ctjl): Implement |
935 |
| - private static boolean isZeroWidthNonJoiner() { |
936 |
| - return true; |
937 |
| - } |
938 |
| - |
939 |
| - @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code |
| 879 | + |
| 880 | + /** |
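| 881 | + * Thin wrapper over UnicodeMatch.isJavascriptIdentifierPart (the original note called it "UnicodeRegex"); the old fast-path optimizations are presumably kept there -- TODO confirm. |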
| 882 | + */ |
940 | 883 | private static boolean isIdentifierPart(char ch) {
|
941 |
| - /** |
942 |
| - https://www.ecma-international.org/ecma-262/5.1/#sec-7.6 |
943 |
| - IdentifierPart :: |
944 |
| - IdentifierStart |
945 |
| - ✓ isIdentifierPart() |
946 |
| -
|
947 |
| - UnicodeCombiningMark |
948 |
| - ✓ isCombiningMark() |
949 |
| -
|
950 |
| - UnicodeDigit |
951 |
| - ✓ Character.isDigit() |
952 |
| -
|
953 |
| - UnicodeConnectorPunctuation |
954 |
| - ✓ isConnectorPunctuation() |
955 |
| -
|
956 |
| - <ZWNJ> |
957 |
| - ✓ isZeroWidthNonJoiner() |
958 |
| - |
959 |
| - <ZWJ> |
960 |
| - ✓ isZeroWidthJoiner() |
961 |
| - */ |
962 |
| - |
963 |
| - // Most code is written in pure ASCII, so create a fast path here. |
964 |
| - if (ch <= 127) { |
965 |
| - return ((ch >= 'A' & ch <= 'Z') |
966 |
| - | (ch >= 'a' & ch <= 'z') |
967 |
| - | (ch >= '0' & ch <= '9') |
968 |
| - | (ch == '_' | ch == '$')); |
969 |
| - } |
970 |
| - |
971 |
| - // Handle non-ASCII characters. |
972 |
| - // TODO(tjgq): This should include all characters with the ID_Continue property, plus |
973 |
| - // TODO(ctjl): Implement remaining grammar (zero-width joiners, etc.) |
974 |
| - return isIdentifierStart(ch) |
975 |
| - || isCombiningMark(ch) |
976 |
| - || Character.isDigit(ch); |
| 884 | + return UnicodeMatch.isJavascriptIdentifierPart(ch); |
977 | 885 | }
|
978 | 886 |
|
979 | 887 | private Token scanStringLiteral(int beginIndex, char terminator) {
|
|
0 commit comments