diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 9698f3db..d1548a64 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -715,6 +715,42 @@ FSTProcessor::compoundAnalysis(UString input_word) return filterFinals(current_state, input_word); } +UString +FSTProcessor::compoundAnalysisOrLowering(UString input_cased) { + if(do_decomposition) { + // Try compound analysis without altering casing: + UString compound = compoundAnalysis(input_cased); + if(!compound.empty()) { + return compound; + } + } + // If we failed due to state explosion, we may try again with the lowercased string: + UString input_lowered = StringUtils::tolower(input_cased); + State current_state = initial_state; + for(unsigned int i=0; i= i_codepoint since some codepoints take up 2 UTF-16's diff --git a/tests/data/big-mono.dix b/tests/data/big-mono.dix new file mode 100644 index 00000000..f583869d --- /dev/null +++ b/tests/data/big-mono.dix @@ -0,0 +1,26 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + + + + + + + +
+

hjerterytmeovervåkningenhjerterytmeovervåkning

+

hjerteklaffhjerteklaff

+

overvåkningenovervåkning

+ [A-ZÆØÅ]+[a-zæøåA-ZÆØÅ]+!

+ +

vasvass

+

sengaseng

+
+ + +
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index ac0f9410..e5cbf476 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -531,6 +531,30 @@ class BiltransGenDebugSymbols(ProcTest): '^ab#c/#ab#c$', ] +class BiltransLowerFallback(ProcTest): + procdix = 'data/big-mono.dix' + procdir = 'rl' + procflags = ['-g', '-b', '-z'] + inputs = [ + '^HJERTERYTMEOVERVÅKNING$', + ] + expectedOutputs = [ + '^HJERTERYTMEOVERVÅKNING/hjerterytmeovervåkningen$', + ] + +class AnalysisLowerFallback(ProcTest): + procdix = 'data/big-mono.dix' + procdir = 'lr' + procflags = ['-w', '-e', '-z'] + inputs = [ + 'Vas vas', + 'hjerterytmeovervåkningen hjerteklaffovervåkningen HJERTERYTMEOVERVÅKNINGEN HJERTEKLAFFOVERVÅKNINGEN', + ] + expectedOutputs = [ + '^Vas/*Vas$ ^vas/*vas$', + '^hjerterytmeovervåkningen/hjerterytmeovervåkning$ ^hjerteklaffovervåkningen/hjerteklaff+overvåkning$ ^HJERTERYTMEOVERVÅKNINGEN/hjerterytmeovervåkning$ ^HJERTEKLAFFOVERVÅKNINGEN/hjerteklaff+overvåkning$' + ] + # These fail on some systems: #from null_flush_invalid_stream_format import *