Avoid refromat_transcript() on original pinyin

Before this change, there are two different ways that the code interacts with `pinyin` in a note: 1. If the `pinyin` field is empty, it uses the `hanzi` field as a source-of-truth to generate the `pinyin` field. 2. If the `pinyin` field is non-empty, it takes the contents of `pinyin` and runs `reformat_transcript()` on it. The idea of this function is that it will update (split, colorize, etc) the `pinyin` field with new information (that the user provides). This function does the splitting using a regular-expression. We have observed a bug in this logic that occurs for some words, such as 可能 which initially see the correct pinyin populated ("kě néng") but have this pinyin incorrectly change ("kěn éng") as a result of running the `reformat_transcript()` function on them. This bug can occur for any pinyin in which there are multiple acceptable regular expression splits. Bug report: Gustaf-C#55 This commit attempts to put a bandaid over the issue by avoiding repopulating the pinyin field for words if the user did not change the original hanzi transcription. Unit tests and documentation are also included in the commit.
Jacobinski · Jan 16, 2024 · dc4e703 · dc4e703
1 parent b638ce1
commit dc4e703
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 4 deletions.
diff --git a/chinese/behavior.py b/chinese/behavior.py
@@ -170,18 +170,41 @@ def fill_transcript(hanzi, note):
     n_filled = 0
     separated = split_hanzi(hanzi)
 
+    # There are three cases to consider here for each target:
+    #
+    #   1. The note has an empty field for the target. In this case, the
+    #      extension should populate the field using the hanzi.
+    #
+    #   2. The note has a populated field that matches the extension's hanzi
+    #      translation. In this case, do nothing. The reason that this case
+    #      exists is because `reformat_transcript()` uses regex to split the 
+    #      pinyin, and in some cases it will take an initially correct pinyin
+    #      and resplit it into an incorrect pinyin. For example, the extension
+    #      correctly turns 可能 into "kě néng", but `reformat_transcript()` would
+    #      incorrectly resplit it into "kěn éng".
+    #
+    #   3. The note has a populated field that does not match the extension's
+    #      Hanzi translation. This can happen when the user inputs a hanzi with
+    #      multiple definitions (such as 都: dōu = all, dū = metropolis) and the
+    #      version selected by the extension is not the version wanted by the 
+    #      user. In this case, they will override the field with a new pinyin
+    #      value (ex. du1 -> dū) and the extension will be expected to split 
+    #      and recolorize it while ignoring the original hanzi translation.
+
     for key, target, type_ in [
         ('bopomofo', 'bopomofo', 'trad'),
         ('cantonese', 'jyutping', 'trad'),
         ('pinyin', 'pinyin', 'simp'),
         ('pinyinTaiwan', 'pinyin_tw', 'trad'),
     ]:
+        trans = colorize(transcribe(separated.copy(), target, type_), target)
+        trans = hide(trans, no_tone(trans))
         if get_first(config['fields'][key], note) == '':
-            trans = colorize(transcribe(separated, target, type_), target)
-            trans = hide(trans, no_tone(trans))
             set_all(config['fields'][key], note, to=trans)
             n_filled += 1
-        else:
+        elif get_first(config['fields'][key], note) == trans:
+            continue
+        else: 
             reformat_transcript(note, key, target)
 
     return n_filled
@@ -322,7 +345,7 @@ def fill_all_rubies(hanzi, note):
     ]:
         fill_ruby(hanzi, note, trans_group, ruby_group)
 
-
+# The entry-point into the code, from edit.py `onFocusLost()`
 def update_fields(note, focus_field, fields):
     copy = dict(note)
     hanzi = get_first(config['fields']['hanzi'], copy)

diff --git a/tests/test_behavior.py b/tests/test_behavior.py
@@ -48,6 +48,68 @@ def test_issue_78(self):
         reformat_transcript(note, 'pinyin', 'pinyin')
         self.assertEqual(note['Pinyin'], expected)
 
+    def test_fill_transcript_pinyin_empty(self):
+        hanzi = '可能'
+        note = {'Hanzi': hanzi, 'Pinyin': ''}
+        # TODO: The '<!-- ken eng -->' is incorrecly split because it relies
+        #       on regexp splitting of keneng instead of using the two <span>
+        #       containers as a source-of-truth.
+        expected = (
+            '<span class="tone3">kě</span>'
+            '<span class="tone2">néng</span> '
+            '<!-- ken eng -->'
+        )
+        fill_transcript(hanzi, note)
+        self.assertEqual(note['Pinyin'], expected)
+        fill_transcript(hanzi, note) # Verify stability with a second attempt
+        self.assertEqual(note['Pinyin'], expected)
+
+    def test_fill_transcript_pinyin_unchanged(self):
+        hanzi = '可能'
+        # The contents of `pinyin_html` is taken from fill_transcript('可能').
+        # See the testcase `test_fill_transcript_pinyin_empty` above.
+        pinyin_html = (
+            '<span class="tone3">kě</span>'
+            '<span class="tone2">néng</span> '
+            '<!-- ken eng -->'
+        )
+        note = {'Hanzi': hanzi, 'Pinyin': pinyin_html}
+        fill_transcript(hanzi, note)
+        self.assertEqual(note['Pinyin'], pinyin_html)
+        fill_transcript(hanzi, note) # Verify stability with a second attempt
+        self.assertEqual(note['Pinyin'], pinyin_html)
+
+    def test_fill_transcript_pinyin_changed(self):
+        # The word 大都 has two translations:
+        #   dàdōu: "for the most part"
+        #   dàdū: "metropolis"
+        # The extension will initially give the user the pinyin "dàdū".
+        # If the user changes it to "dàdōu", the extension should split it into
+        # "dà dōu" without reverting it to the initial "dàdū" transcription.
+        hanzi = '大都'
+        note = {'Hanzi': hanzi, 'Pinyin': ''}
+        expected_initial = (
+            '<span class="tone4">dà</span>'
+            '<span class="tone1">dū</span> '
+            '<!-- da du -->'
+        )
+        fill_transcript(hanzi, note)
+        self.assertEqual(note['Pinyin'], expected_initial)
+        fill_transcript(hanzi, note) # Verify stability with a second attempt
+        self.assertEqual(note['Pinyin'], expected_initial)
+
+        # User changes the pinyin to "da4dou1"
+        note['Pinyin'] = 'da4dou1'
+        expected_final = (
+            '<span class="tone4">dà</span>'
+            '<span class="tone1">dōu</span> '
+            '<!-- da dou -->'
+        )
+        fill_transcript(hanzi, note)
+        self.assertEqual(note['Pinyin'], expected_final)
+        fill_transcript(hanzi, note) # Verify stability with a second attempt
+        self.assertEqual(note['Pinyin'], expected_final)
+
 
 class FillSound(Base):
     def test_missing_sound(self):