Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 70 additions & 20 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,42 @@ FSTProcessor::compoundAnalysis(UString input_word)
return filterFinals(current_state, input_word);
}

/**
 * Analyse a word, falling back to its lowercased form when the original
 * casing yields nothing.
 *
 * Attempts are made in this order and the first non-empty result wins:
 *   1. compoundAnalysis() of input_cased (only when do_decomposition is set);
 *   2. a plain walk of the transducer over the lowercased word, followed by
 *      filterFinals(); when do_decomposition is set and compoundOnlyLSymbol
 *      is non-zero, pruneStatesWithForbiddenSymbol() is applied first;
 *   3. compoundAnalysis() of the lowercased word (only when
 *      do_decomposition is set).
 *
 * @param input_cased the word with its original casing
 * @return the first non-empty analysis, or an empty string if every attempt
 *         came back empty
 */
UString
FSTProcessor::compoundAnalysisOrLowering(UString input_cased) {
  if(do_decomposition) {
    // First attempt: compound analysis with the casing left untouched.
    UString cased_compound = compoundAnalysis(input_cased);
    if(!cased_compound.empty()) {
      return cased_compound;
    }
  }

  // That may have failed due to state explosion, so retry on the lowercased string.
  UString lowered = StringUtils::tolower(input_cased);
  State state = initial_state;
  for(auto ch : lowered) {
    state.step_case(ch, beCaseSensitive(state));
    if(state.size() == 0) {
      // No live states left; stepping further cannot produce anything.
      break;
    }
  }
  if(do_decomposition && compoundOnlyLSymbol != 0) {
    state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
  }

  UString plain = filterFinals(state, lowered);
  if(!plain.empty()) {
    return plain;
  }

  // Last resort: compound analysis of the lowercased string. An empty result
  // here (or decomposition being disabled) means nothing matched at all.
  return do_decomposition ? compoundAnalysis(lowered) : UString();
}


void
Expand Down Expand Up @@ -961,17 +997,10 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
{
input_buffer.setPos(last_start + limit.i_codepoint);
UString unknown_word = sf.substr(0, limit.i_utf16);
if(do_decomposition)
UString compoundOrLower = compoundAnalysisOrLowering(unknown_word);
if(!compoundOrLower.empty())
{
UString compound = compoundAnalysis(unknown_word);
if(!compound.empty())
{
printWord(unknown_word, compound, output);
}
else
{
printUnknownWord(unknown_word, output);
}
printWord(unknown_word, compoundOrLower, output);
}
else
{
Expand All @@ -991,17 +1020,10 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
{
input_buffer.setPos(last_start + limit.i_codepoint);
UString unknown_word = sf.substr(0, limit.i_utf16);
if(do_decomposition)
UString compoundOrLower = compoundAnalysisOrLowering(unknown_word);
if(!compoundOrLower.empty())
{
UString compound = compoundAnalysis(unknown_word);
if(!compound.empty())
{
printWord(unknown_word, compound, output);
}
else
{
printUnknownWord(unknown_word, output);
}
printWord(unknown_word, compoundOrLower, output);
}
else
{
Expand Down Expand Up @@ -1781,6 +1803,34 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
result.clear();
}

if(result.empty() && (mode == gm_bilgen || mode == gm_all)) {
// Retry looking up lower-cased version, this time not using alt-override (which leads to state explosions)
State current_state = initial_state;
if (reader.readings[index].mark == '#') current_state.step('#');
bool seenTags = false;
for (size_t i = 0; i < symbols.size(); i++) {
seenTags = seenTags || alphabet.isTag(symbols[i]);
if(alphabet.isTag(symbols[i]) || beCaseSensitive(current_state)) {
current_state.step_override(symbols[i], any_char, symbols[i]);
}
else {
int32_t symbol_low = u_tolower(symbols[i]);
current_state.step_override(symbol_low, any_char, symbol_low);
}
if (current_state.isFinal(all_finals)) {
queue_start = i;
current_state.filterFinalsArray(result,
all_finals, alphabet, escaped_chars,
displayWeightsMode, maxAnalyses,
maxWeightClasses);
}
}
// if there are no tags, we only return complete matches
if ((!seenTags || mode == gm_all || mode == gm_bilgen) && queue_start + 1 < symbols.size()) {
result.clear();
}
}

UString source;
size_t queue_pos = 0;
if (reader.readings[index].mark == '#') {
Expand Down
5 changes: 5 additions & 0 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,11 @@ class FSTProcessor
static UStringView removeTags(UStringView str);
UString compoundAnalysis(UString str);

/**
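* Like compoundAnalysis(str), but if that gives no results (or decomposition
* is disabled), try analysing the lowercased version of str instead: first as
* a plain word, then as a compound. Returns an empty string if nothing matches.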
*/
UString compoundAnalysisOrLowering(UString str);

struct Indices {
size_t i_codepoint;
size_t i_utf16; // always >= i_codepoint since some codepoints take up 2 UTF-16's
Expand Down
26 changes: 26 additions & 0 deletions tests/data/big-mono.dix
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<dictionary>
<alphabet>ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­-</alphabet>
<sdefs>
<sdef n="n"/>
<sdef n="np"/>
<sdef n="def"/>
<sdef n="compound-only-L"/>
<sdef n="compound-R"/>
</sdefs>
<pardefs>

</pardefs>

<section id="main" type="standard">
<e><p><l>hjerterytmeovervåkningen</l><r>hjerterytmeovervåkning<s n="n"/><s n="def"/></r></p></e>
<e><p><l>hjerteklaff</l><r>hjerteklaff<s n="n"/><s n="compound-only-L"/></r></p></e>
<e><p><l>overvåkningen</l><r>overvåkning<s n="n"/><s n="def"/><s n="compound-R"/></r></p></e>
<e> <re>[A-ZÆØÅ]+[a-zæøåA-ZÆØÅ]+!</re><p><l/><r><s n="np"/></r></p></e>

<e><p><l>vas</l><r>vass<s n="n"/><s n="compound-only-L"/></r></p></e>
<e><p><l>senga</l><r>seng<s n="n"/><s n="def"/><s n="compound-R"/></r></p></e>
</section>


</dictionary>
24 changes: 24 additions & 0 deletions tests/lt_proc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,30 @@ class BiltransGenDebugSymbols(ProcTest):
'^ab<n><def>#c/#ab<n><def>#c$',
]

class BiltransLowerFallback(ProcTest):
    # The dictionary only has the lowercase entry, so the all-uppercase
    # lexical unit below is expected to be resolved through it.
    procdix = 'data/big-mono.dix'
    procdir = 'rl'
    procflags = ['-g', '-b', '-z']

    inputs = ['^HJERTERYTMEOVERVÅKNING<n><def>$']
    expectedOutputs = ['^HJERTERYTMEOVERVÅKNING<n><def>/hjerterytmeovervåkningen$']

class AnalysisLowerFallback(ProcTest):
    procdix = 'data/big-mono.dix'
    procdir = 'lr'
    procflags = ['-w', '-e', '-z']

    # First line: "vas" is only listed with <compound-only-L>, so both
    # casings are expected to stay unknown when standing alone.
    # Second line: the all-uppercase words are expected to get the same
    # analyses as their lowercase counterparts, both for the whole-word
    # entry and for the compound.
    inputs = [
        'Vas vas',
        'hjerterytmeovervåkningen hjerteklaffovervåkningen HJERTERYTMEOVERVÅKNINGEN HJERTEKLAFFOVERVÅKNINGEN',
    ]
    expectedOutputs = [
        '^Vas/*Vas$ ^vas/*vas$',
        '^hjerterytmeovervåkningen/hjerterytmeovervåkning<n><def>$ ^hjerteklaffovervåkningen/hjerteklaff<n>+overvåkning<n><def>$ ^HJERTERYTMEOVERVÅKNINGEN/hjerterytmeovervåkning<n><def>$ ^HJERTEKLAFFOVERVÅKNINGEN/hjerteklaff<n>+overvåkning<n><def>$',
    ]


# These fail on some systems:
#from null_flush_invalid_stream_format import *