Skip to content

Commit

Permalink
Avoid refromat_transcript() on original pinyin
Browse files Browse the repository at this point in the history
Before this change, there are two different ways that the code interacts with
`pinyin` in a note:

  1. If the `pinyin` field is empty, it uses the `hanzi` field as a
     source-of-truth to generate the `pinyin` field.

  2. If the `pinyin` field is non-empty, it takes the contents of
     `pinyin` and runs `reformat_transcript()` on it. The idea of this
     function is that it will update (split, colorize, etc) the `pinyin`
     field with new information (that the user provides). This function
     does the splitting using a regular-expression.

We have observed a bug in this logic that occurs for some words, such as
可能 which initially see the correct pinyin populated ("kě néng") but
have this pinyin incorrectly change ("kěn éng") as a result of running
the `reformat_transcript()` function on them. This bug can occur for any
pinyin in which there are multiple acceptable regular expression splits.

Bug report: Gustaf-C#55

This commit attempts to put a bandaid over the issue by avoiding repopulating
the pinyin field for words if the user did not change the original hanzi
transcription. Unit tests and documentation are also included in the commit.
  • Loading branch information
Jacobinski committed Jan 16, 2024
1 parent b638ce1 commit dc4e703
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 4 deletions.
31 changes: 27 additions & 4 deletions chinese/behavior.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,18 +170,41 @@ def fill_transcript(hanzi, note):
n_filled = 0
separated = split_hanzi(hanzi)

# There are three cases to consider here for each target:
#
# 1. The note has an empty field for the target. In this case, the
# extension should populate the field using the hanzi.
#
# 2. The note has a populated field that matches the extension's hanzi
# translation. In this case, do nothing. The reason that this case
# exists is because `reformat_transcript()` uses regex to split the
# pinyin, and in some cases it will take an initially correct pinyin
# and resplit it into an incorrect pinyin. For example, the extension
# correctly turns 可能 into "kě néng", but `reformat_transcript()` would
# incorrectly resplit it into "kěn éng".
#
# 3. The note has a populated field that does not match the extension's
# Hanzi translation. This can happen when the user inputs a hanzi with
# multiple definitions (such as 都: dōu = all, dū = metropolis) and the
# version selected by the extension is not the version wanted by the
# user. In this case, they will override the field with a new pinyin
# value (ex. du1 -> dū) and the extension will be expected to split
# and recolorize it while ignoring the original hanzi translation.

for key, target, type_ in [
('bopomofo', 'bopomofo', 'trad'),
('cantonese', 'jyutping', 'trad'),
('pinyin', 'pinyin', 'simp'),
('pinyinTaiwan', 'pinyin_tw', 'trad'),
]:
trans = colorize(transcribe(separated.copy(), target, type_), target)
trans = hide(trans, no_tone(trans))
if get_first(config['fields'][key], note) == '':
trans = colorize(transcribe(separated, target, type_), target)
trans = hide(trans, no_tone(trans))
set_all(config['fields'][key], note, to=trans)
n_filled += 1
else:
elif get_first(config['fields'][key], note) == trans:
continue
else:
reformat_transcript(note, key, target)

return n_filled
Expand Down Expand Up @@ -322,7 +345,7 @@ def fill_all_rubies(hanzi, note):
]:
fill_ruby(hanzi, note, trans_group, ruby_group)


# The entry-point into the code, from edit.py `onFocusLost()`
def update_fields(note, focus_field, fields):
copy = dict(note)
hanzi = get_first(config['fields']['hanzi'], copy)
Expand Down
62 changes: 62 additions & 0 deletions tests/test_behavior.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,68 @@ def test_issue_78(self):
reformat_transcript(note, 'pinyin', 'pinyin')
self.assertEqual(note['Pinyin'], expected)

def test_fill_transcript_pinyin_empty(self):
hanzi = '可能'
note = {'Hanzi': hanzi, 'Pinyin': ''}
# TODO: The '<!-- ken eng -->' is incorrecly split because it relies
# on regexp splitting of keneng instead of using the two <span>
# containers as a source-of-truth.
expected = (
'<span class="tone3">kě</span>'
'<span class="tone2">néng</span> '
'<!-- ken eng -->'
)
fill_transcript(hanzi, note)
self.assertEqual(note['Pinyin'], expected)
fill_transcript(hanzi, note) # Verify stability with a second attempt
self.assertEqual(note['Pinyin'], expected)

def test_fill_transcript_pinyin_unchanged(self):
hanzi = '可能'
# The contents of `pinyin_html` is taken from fill_transcript('可能').
# See the testcase `test_fill_transcript_pinyin_empty` above.
pinyin_html = (
'<span class="tone3">kě</span>'
'<span class="tone2">néng</span> '
'<!-- ken eng -->'
)
note = {'Hanzi': hanzi, 'Pinyin': pinyin_html}
fill_transcript(hanzi, note)
self.assertEqual(note['Pinyin'], pinyin_html)
fill_transcript(hanzi, note) # Verify stability with a second attempt
self.assertEqual(note['Pinyin'], pinyin_html)

def test_fill_transcript_pinyin_changed(self):
# The word 大都 has two translations:
# dàdōu: "for the most part"
# dàdū: "metropolis"
# The extension will initially give the user the pinyin "dàdū".
# If the user changes it to "dàdōu", the extension should split it into
# "dà dōu" without reverting it to the initial "dàdū" transcription.
hanzi = '大都'
note = {'Hanzi': hanzi, 'Pinyin': ''}
expected_initial = (
'<span class="tone4">dà</span>'
'<span class="tone1">dū</span> '
'<!-- da du -->'
)
fill_transcript(hanzi, note)
self.assertEqual(note['Pinyin'], expected_initial)
fill_transcript(hanzi, note) # Verify stability with a second attempt
self.assertEqual(note['Pinyin'], expected_initial)

# User changes the pinyin to "da4dou1"
note['Pinyin'] = 'da4dou1'
expected_final = (
'<span class="tone4">dà</span>'
'<span class="tone1">dōu</span> '
'<!-- da dou -->'
)
fill_transcript(hanzi, note)
self.assertEqual(note['Pinyin'], expected_final)
fill_transcript(hanzi, note) # Verify stability with a second attempt
self.assertEqual(note['Pinyin'], expected_final)


class FillSound(Base):
def test_missing_sound(self):
Expand Down

0 comments on commit dc4e703

Please sign in to comment.