Skip to content

Commit 696e201

Browse files
committed
Adding support for non-ASCII Unicode.
1 parent e1d5129 commit 696e201

File tree

4 files changed

+111
-103
lines changed

4 files changed

+111
-103
lines changed

Diff for: src/com/google/javascript/jscomp/parsing/parser/Scanner.java

+10-102
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import com.google.javascript.jscomp.parsing.parser.util.ErrorReporter;
2323
import com.google.javascript.jscomp.parsing.parser.util.SourcePosition;
2424
import com.google.javascript.jscomp.parsing.parser.util.SourceRange;
25+
import com.google.javascript.jscomp.parsing.parser.util.UnicodeMatch;
2526
import java.util.ArrayList;
2627
import javax.annotation.Nullable;
2728

@@ -869,111 +870,18 @@ private static String processUnicodeEscapes(String value) {
869870
return value;
870871
}
871872

872-
@SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code
873+
/**
874+
* Interface to UnicodeMatch.isJavascriptIdentifierStart. Includes old optimizations.
875+
*/
873876
private static boolean isIdentifierStart(char ch) {
874-
// Most code is written in pure ASCII, so create a fast path here.
875-
if (ch <= 127) {
876-
// Intentionally avoiding short circuiting behavior of "||" and "&&".
877-
// This minimizes branches in this code which minimizes branch prediction misses.
878-
return ((ch >= 'A' & ch <= 'Z') | (ch >= 'a' & ch <= 'z') | (ch == '_' | ch == '$'));
879-
}
880-
881-
// Handle non-ASCII characters.
882-
// TODO(tjgq): This should include all characters with the ID_Start property.
883-
if (Character.isLetter(ch)) {
884-
return true;
885-
}
886-
887-
// Workaround for b/36459436.
888-
// When running under GWT/J2CL, Character.isLetter only handles ASCII.
889-
// Angular relies heavily on Latin Small Letter Barred O and Greek Capital Letter Delta.
890-
// Greek letters are occasionally found in math code.
891-
// Latin letters are found in our own tests.
892-
return (ch >= 0x00C0 & ch <= 0x00D6) // Latin letters
893-
// 0x00D7 = multiplication sign, not a letter
894-
| (ch >= 0x00D8 & ch <= 0x00F6) // Latin letters
895-
// 0x00F7 = division sign, not a letter
896-
| (ch >= 0x00F8 & ch <= 0x00FF) // Latin letters
897-
| ch == 0x0275 // Latin Barred O
898-
| (ch >= 0x0391 & ch <= 0x03A1) // Greek uppercase letters
899-
// 0x03A2 = unassigned
900-
| (ch >= 0x03A3 & ch <= 0x03A9) // Remaining Greek uppercase letters
901-
| (ch >= 0x03B1 & ch <= 0x03C9); // Greek lowercase letters
902-
}
903-
904-
// Check if char lies in one of the Unicode combining-mark blocks listed below
// (approximates ES5.1 UnicodeCombiningMark, i.e. categories Mn and Mc).
905-
// This list is not exhaustive!
906-
@SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code
907-
private static boolean isCombiningMark(char ch) {
908-
return (
909-
// 0300-036F
910-
(0x0300 <= ch & ch <= 0x036F) |
911-
// 1AB0–1AFF
912-
(0x1AB0 <= ch & ch <= 0x1AFF) |
913-
// 1DC0–1DFF
914-
(0x1DC0 <= ch & ch <= 0x1DFF) |
915-
// 20D0–20FF
916-
(0x20D0 <= ch & ch <= 0x20FF) |
917-
// FE20–FE2F
918-
(0xFE20 <= ch & ch <= 0xFE2F)
919-
);
920-
// TODO (ctjl): Implement in a more reliable and future-proofed way, i.e.:
921-
// return Character.getType(ch) == Character.NON_SPACING_MARK;
922-
}
923-
924-
// TODO (ctjl): Implement
925-
private static boolean isConnectorPunctuation() {
926-
return true;
877+
return UnicodeMatch.isJavascriptIdentifierStart(ch);
927878
}
928-
929-
// TODO (ctjl): Implement
930-
private static boolean isZeroWidthJoiner() {
931-
return true;
932-
}
933-
934-
// TODO (ctjl): Implement
935-
private static boolean isZeroWidthNonJoiner() {
936-
return true;
937-
}
938-
939-
@SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code
879+
880+
/**
881+
* Interface to UnicodeMatch.isJavascriptIdentifierPart. Includes old optimizations.
882+
*/
940883
private static boolean isIdentifierPart(char ch) {
941-
/**
942-
https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
943-
IdentifierPart ::
944-
IdentifierStart
945-
✓ isIdentifierStart()
946-
947-
UnicodeCombiningMark
948-
✓ isCombiningMark()
949-
950-
UnicodeDigit
951-
✓ Character.isDigit()
952-
953-
UnicodeConnectorPunctuation
954-
✓ isConnectorPunctuation()
955-
956-
<ZWNJ>
957-
✓ isZeroWidthNonJoiner()
958-
959-
<ZWJ>
960-
✓ isZeroWidthJoiner()
961-
*/
962-
963-
// Most code is written in pure ASCII, so create a fast path here.
964-
if (ch <= 127) {
965-
return ((ch >= 'A' & ch <= 'Z')
966-
| (ch >= 'a' & ch <= 'z')
967-
| (ch >= '0' & ch <= '9')
968-
| (ch == '_' | ch == '$'));
969-
}
970-
971-
// Handle non-ASCII characters.
972-
// TODO(tjgq): This should include all characters with the ID_Continue property, plus
973-
// TODO(ctjl): Implement remaining grammar (zero-width joiners, etc.)
974-
return isIdentifierStart(ch)
975-
|| isCombiningMark(ch)
976-
|| Character.isDigit(ch);
884+
return UnicodeMatch.isJavascriptIdentifierPart(ch);
977885
}
978886

979887
private Token scanStringLiteral(int beginIndex, char terminator) {

0 commit comments

Comments
 (0)