Skip to content

Commit 718d6bb

Browse files
authored
Fix: handle unicode heredoc tags & Rust grapheme clusters properly (#6024)
1 parent 27a76cd commit 718d6bb

File tree

3 files changed

+15
-16
lines changed

3 files changed

+15
-16
lines changed

sqlglot/tokens.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1421,7 +1421,11 @@ def _scan_string(self, start: str) -> bool:
14211421
raise_unmatched=not self.HEREDOC_TAG_IS_IDENTIFIER,
14221422
)
14231423

1424-
if tag and self.HEREDOC_TAG_IS_IDENTIFIER and (self._end or not tag.isidentifier()):
1424+
if (
1425+
tag
1426+
and self.HEREDOC_TAG_IS_IDENTIFIER
1427+
and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
1428+
):
14251429
if not self._end:
14261430
self._advance(-1)
14271431

sqlglotrs/src/tokenizer.rs

Lines changed: 6 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -444,13 +444,13 @@ impl<'a> TokenizerState<'a> {
444444

445445
if !tag.is_empty()
446446
&& self.settings.heredoc_tag_is_identifier
447-
&& (self.is_end || !self.is_identifier(&tag))
447+
&& (self.is_end || tag.chars().all(|c| c.is_ascii_digit()) || tag.chars().any(|c| c.is_whitespace()))
448448
{
449449
if !self.is_end {
450450
self.advance(-1)?;
451451
}
452452

453-
self.advance(-(tag.len() as isize))?;
453+
self.advance(-(tag.chars().count() as isize))?;
454454
self.add(self.token_types.heredoc_string_alternative, None)?;
455455
return Ok(true);
456456
}
@@ -687,9 +687,10 @@ impl<'a> TokenizerState<'a> {
687687
continue;
688688
}
689689
}
690-
if self.chars(delimiter.len()) == delimiter {
691-
if delimiter.len() > 1 {
692-
self.advance((delimiter.len() - 1) as isize)?;
690+
let delimiter_char_count = delimiter.chars().count();
691+
if self.chars(delimiter_char_count) == delimiter {
692+
if delimiter_char_count > 1 {
693+
self.advance((delimiter_char_count - 1) as isize)?;
693694
}
694695
break;
695696
}
@@ -720,16 +721,6 @@ impl<'a> TokenizerState<'a> {
720721
name.is_alphabetic() || name == '_'
721722
}
722723

723-
fn is_identifier(&self, s: &str) -> bool {
724-
s.chars().enumerate().all(|(i, c)| {
725-
if i == 0 {
726-
self.is_alphabetic_or_underscore(c)
727-
} else {
728-
self.is_alphabetic_or_underscore(c) || c.is_ascii_digit()
729-
}
730-
})
731-
}
732-
733724
fn is_numeric(&self, s: &str) -> bool {
734725
s.chars().all(|c| c.is_ascii_digit())
735726
}

tests/dialects/test_duckdb.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -363,6 +363,10 @@ def test_duckdb(self):
363363
self.validate_identity(
364364
"""SELECT '{ "family": "anatidae", "species": [ "duck", "goose", "swan", null ] }' ->> ['$.family', '$.species']""",
365365
)
366+
self.validate_identity(
367+
"SELECT $🦆$foo$🦆$",
368+
"SELECT 'foo'",
369+
)
366370
self.validate_identity(
367371
"SELECT * FROM t PIVOT(FIRST(t) AS t, FOR quarter IN ('Q1', 'Q2'))",
368372
"SELECT * FROM t PIVOT(FIRST(t) AS t FOR quarter IN ('Q1', 'Q2'))",

0 commit comments

Comments
 (0)