No public description

tensorflower-gardener · tensorflower-gardener · commit 386429d5d30b · 2026-04-23T09:35:29.000-07:00
PiperOrigin-RevId: 904496953
diff --git a/official/nlp/tools/tokenization.py b/official/nlp/tools/tokenization.py
@@ -428,16 +428,18 @@ def preprocess_text(inputs, remove_space=True, lower=False):
     The preprocessed text.
 
   """
+  # Byte strings must be explicitly decoded to Unicode text, normally as
+  # UTF-8. A latin-1 fallback is kept for backward compatibility with
+  # legacy SentencePiece models.
+  if isinstance(inputs, six.binary_type):
+    try:
+      inputs = six.ensure_text(inputs, "utf-8")
+    except UnicodeDecodeError:
+      inputs = six.ensure_text(inputs, "latin-1")
   outputs = inputs
   if remove_space:
     outputs = " ".join(inputs.strip().split())
 
-  if six.PY2 and isinstance(outputs, str):
-    try:
-      outputs = six.ensure_text(outputs, "utf-8")
-    except UnicodeDecodeError:
-      outputs = six.ensure_text(outputs, "latin-1")
-
   outputs = unicodedata.normalize("NFKD", outputs)
   outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
   if lower:
diff --git a/official/nlp/tools/tokenization_test.py b/official/nlp/tools/tokenization_test.py
@@ -151,6 +151,18 @@ def test_is_punctuation(self):
     self.assertFalse(tokenization._is_punctuation(u"A"))
     self.assertFalse(tokenization._is_punctuation(u" "))
 
+  def test_preprocess_text(self):
+    self.assertEqual(tokenization.preprocess_text("hello world"), "hello world")
+    self.assertEqual(tokenization.preprocess_text(b"hello \xc3\xa9"), "hello e")
+    self.assertEqual(tokenization.preprocess_text(b"hello \xe9"), "hello e")
+    self.assertEqual(
+        tokenization.preprocess_text(b"hello  world", remove_space=True),
+        "hello world",
+    )
+    self.assertEqual(
+        tokenization.preprocess_text("Hello World", lower=True), "hello world"
+    )
+
 
 if __name__ == "__main__":
   tf.test.main()