Fix: handle unicode heredoc tags & Rust grapheme clusters properly (#6024)

georgesittas · web-flow · commit 718d6bbf7f40 · 2025-10-06T22:09:56.000+03:00
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
@@ -1421,7 +1421,11 @@ def _scan_string(self, start: str) -> bool:
                         raise_unmatched=not self.HEREDOC_TAG_IS_IDENTIFIER,
                     )
 
-                if tag and self.HEREDOC_TAG_IS_IDENTIFIER and (self._end or not tag.isidentifier()):
+                if (
+                    tag
+                    and self.HEREDOC_TAG_IS_IDENTIFIER
+                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
+                ):
                     if not self._end:
                         self._advance(-1)
 
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
@@ -444,13 +444,13 @@ impl<'a> TokenizerState<'a> {
 
                 if !tag.is_empty()
                     && self.settings.heredoc_tag_is_identifier
-                    && (self.is_end || !self.is_identifier(&tag))
+                    && (self.is_end || tag.chars().all(|c| c.is_ascii_digit()) || tag.chars().any(|c| c.is_whitespace()))
                 {
                     if !self.is_end {
                         self.advance(-1)?;
                     }
 
-                    self.advance(-(tag.len() as isize))?;
+                    self.advance(-(tag.chars().count() as isize))?;
                     self.add(self.token_types.heredoc_string_alternative, None)?;
                     return Ok(true);
                 }
@@ -687,9 +687,10 @@ impl<'a> TokenizerState<'a> {
                     continue;
                 }
             }
-            if self.chars(delimiter.len()) == delimiter {
-                if delimiter.len() > 1 {
-                    self.advance((delimiter.len() - 1) as isize)?;
+            let delimiter_char_count = delimiter.chars().count();
+            if self.chars(delimiter_char_count) == delimiter {
+                if delimiter_char_count > 1 {
+                    self.advance((delimiter_char_count - 1) as isize)?;
                 }
                 break;
             }
@@ -720,16 +721,6 @@ impl<'a> TokenizerState<'a> {
         name.is_alphabetic() || name == '_'
     }
 
-    fn is_identifier(&self, s: &str) -> bool {
-        s.chars().enumerate().all(|(i, c)| {
-            if i == 0 {
-                self.is_alphabetic_or_underscore(c)
-            } else {
-                self.is_alphabetic_or_underscore(c) || c.is_ascii_digit()
-            }
-        })
-    }
-
     fn is_numeric(&self, s: &str) -> bool {
         s.chars().all(|c| c.is_ascii_digit())
     }
diff --git a/tests/dialects/test_duckdb.py b/tests/dialects/test_duckdb.py
@@ -363,6 +363,10 @@ def test_duckdb(self):
         self.validate_identity(
             """SELECT '{ "family": "anatidae", "species": [ "duck", "goose", "swan", null ] }' ->> ['$.family', '$.species']""",
         )
+        self.validate_identity(
+            "SELECT $🦆$foo$🦆$",
+            "SELECT 'foo'",
+        )
         self.validate_identity(
             "SELECT * FROM t PIVOT(FIRST(t) AS t, FOR quarter IN ('Q1', 'Q2'))",
             "SELECT * FROM t PIVOT(FIRST(t) AS t FOR quarter IN ('Q1', 'Q2'))",

Original file line number	Diff line number	Diff line change
`@@ -444,13 +444,13 @@ impl<'a> TokenizerState<'a> {`
`444`	`444`
`445`	`445`	`if !tag.is_empty()`
`446`	`446`	`&& self.settings.heredoc_tag_is_identifier`
`447`		`- && (self.is_end \|\| !self.is_identifier(&tag))`
	`447`	`+ && (self.is_end \|\| tag.chars().all(\|c\| c.is_ascii_digit()) \|\| tag.chars().any(\|c\| c.is_whitespace()))`
`448`	`448`	`{`
`449`	`449`	`if !self.is_end {`
`450`	`450`	`self.advance(-1)?;`
`451`	`451`	`}`
`452`	`452`
`453`		`- self.advance(-(tag.len() as isize))?;`
	`453`	`+ self.advance(-(tag.chars().count() as isize))?;`
`454`	`454`	`self.add(self.token_types.heredoc_string_alternative, None)?;`
`455`	`455`	`return Ok(true);`
`456`	`456`	`}`
`@@ -687,9 +687,10 @@ impl<'a> TokenizerState<'a> {`
`687`	`687`	`continue;`
`688`	`688`	`}`
`689`	`689`	`}`
`690`		`- if self.chars(delimiter.len()) == delimiter {`
`691`		`- if delimiter.len() > 1 {`
`692`		`- self.advance((delimiter.len() - 1) as isize)?;`
	`690`	`+ let delimiter_char_count = delimiter.chars().count();`
	`691`	`+ if self.chars(delimiter_char_count) == delimiter {`
	`692`	`+ if delimiter_char_count > 1 {`
	`693`	`+ self.advance((delimiter_char_count - 1) as isize)?;`
`693`	`694`	`}`
`694`	`695`	`break;`
`695`	`696`	`}`
`@@ -720,16 +721,6 @@ impl<'a> TokenizerState<'a> {`
`720`	`721`	`name.is_alphabetic() \|\| name == '_'`
`721`	`722`	`}`
`722`	`723`
`723`		`- fn is_identifier(&self, s: &str) -> bool {`
`724`		`- s.chars().enumerate().all(\|(i, c)\| {`
`725`		`- if i == 0 {`
`726`		`- self.is_alphabetic_or_underscore(c)`
`727`		`- } else {`
`728`		`- self.is_alphabetic_or_underscore(c) \|\| c.is_ascii_digit()`
`729`		`- }`
`730`		`- })`
`731`		`- }`
`732`		`-`
`733`	`724`	`fn is_numeric(&self, s: &str) -> bool {`
`734`	`725`	`s.chars().all(\|c\| c.is_ascii_digit())`
`735`	`726`	`}`