diff --git a/lttoolbox/CMakeLists.txt b/lttoolbox/CMakeLists.txt
index b9b79ec8..6874e163 100644
--- a/lttoolbox/CMakeLists.txt
+++ b/lttoolbox/CMakeLists.txt
@@ -21,6 +21,7 @@ set(LIBLTTOOLBOX_HEADERS
node.h
pattern_list.h
regexp_compiler.h
+ reusable_state.h
serialiser.h
sorted_vector.h
sorted_vector.hpp
@@ -54,6 +55,7 @@ set(LIBLTTOOLBOX_SOURCES
node.cc
pattern_list.cc
regexp_compiler.cc
+ reusable_state.cc
sorted_vector.cc
state.cc
stream_reader.cc
diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc
index d1548a64..a87f5898 100644
--- a/lttoolbox/fst_processor.cc
+++ b/lttoolbox/fst_processor.cc
@@ -476,6 +476,20 @@ FSTProcessor::filterFinals(const State& state, UStringView casefrom)
uppercase, firstupper, 0);
}
+/**
+ * ReusableState counterpart of filterFinals(const State&, UStringView).
+ *
+ * Unless dictionaryCase is set, two capitalisation flags are derived from
+ * casefrom: firstupper (the first character is uppercase) and uppercase
+ * (there is more than one character and both the first and the last one
+ * are uppercase). The flags are forwarded, together with the processor's
+ * own settings, to ReusableState::filterFinals().
+ *
+ * @param state    state whose filterFinals() is invoked
+ * @param casefrom string whose capitalisation is inspected; callers are
+ *                 expected to pass a non-empty view, but an empty one is
+ *                 tolerated and leaves both flags false
+ * @return the value produced by state.filterFinals()
+ */
+UString
+FSTProcessor::filterFinals(const ReusableState& state, UStringView casefrom)
+{
+  bool firstupper = false, uppercase = false;
+  const size_t len = casefrom.size();
+  // Position 0 of an empty view is out of range, so the characters are
+  // only inspected when there is at least one of them.
+  if (!dictionaryCase && len > 0) {
+    firstupper = u_isupper(casefrom[0]);
+    uppercase = (len > 1 && firstupper && u_isupper(casefrom[len - 1]));
+  }
+  return state.filterFinals(all_finals, alphabet, escaped_chars,
+                            displayWeightsMode, maxAnalyses, maxWeightClasses,
+                            uppercase, firstupper, 0);
+}
+
void
FSTProcessor::writeEscaped(UStringView str, UFILE *output)
{
@@ -674,6 +688,7 @@ void
FSTProcessor::initBiltrans()
{
initGeneration();
+ escaped_chars.insert('*'); // on top of the initGeneration() setup, register '*' in escaped_chars for bilingual mode
}
@@ -803,7 +818,8 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
bool last_incond = false;
bool last_postblank = false;
bool last_preblank = false;
- State current_state = initial_state;
+ ReusableState current_state;
+ current_state.init(&root);
UString lf; // analysis (lexical form and tags)
UString sf; // surface form
UString lf_spcmp; // space compound analysis
@@ -1044,7 +1060,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
}
}
- current_state = initial_state;
+ current_state.init(&root);
lf.clear();
sf.clear();
last_start = input_buffer.getPos();
@@ -1265,7 +1281,8 @@ FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode)
{
StreamReader reader(&input);
reader.alpha = &alphabet;
- State current_state;
+ ReusableState current_state;
+ current_state.init(&root);
while (!reader.at_eof) {
reader.next();
@@ -1322,7 +1339,7 @@ FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode)
break;
}
if (!skip) {
- current_state = initial_state;
+ current_state.init(&root);
for (auto& sym : reader.readings[0].symbols) {
if (!alphabet.isTag(sym) && u_isupper(sym) &&
!beCaseSensitive(current_state)) {
@@ -1410,7 +1427,8 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
size_t cur_word = 0;
size_t cur_pos = 0;
size_t match_pos = 0;
- State current_state = initial_state;
+ ReusableState current_state;
+ current_state.init(&root);
UString last_match;
int space_diff = 0;
@@ -1590,7 +1608,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
firstupper = false;
have_first = false;
have_second = false;
- current_state = initial_state;
+ current_state.init(&root);
}
}
}
@@ -1728,6 +1746,8 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
StreamReader reader(&input);
reader.alpha = &alphabet;
reader.add_unknowns = true;
+ ReusableState current_state;
+ current_state.init(&root);
size_t index = (biltransSurfaceForms || biltransSurfaceFormsKeep ? 1 : 0);
@@ -1769,7 +1789,7 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
continue;
}
- State current_state = initial_state;
+ current_state.reinit(&root);
bool firstupper = (symbols[0] > 0 && u_isupper(symbols[0]));
bool uppercase = (firstupper && symbols.size() > 1 &&
@@ -1791,11 +1811,11 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
}
if (current_state.isFinal(all_finals)) {
queue_start = i;
- current_state.filterFinalsArray(result,
- all_finals, alphabet, escaped_chars,
- displayWeightsMode, maxAnalyses,
- maxWeightClasses, uppercase,
- firstupper, 0);
+ result = current_state.filterFinalsArray(all_finals, alphabet,
+ escaped_chars,
+ displayWeightsMode, maxAnalyses,
+ maxWeightClasses, uppercase,
+ firstupper, 0);
}
}
// if there are no tags, we only return complete matches
@@ -1847,7 +1867,12 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
u_fputc('/', output);
if (!result.empty()) {
- write(compose(result, source.substr(queue_pos)), output);
+ UString queue = source.substr(queue_pos);
+ for (auto& piece : result) {
+ u_fputc('/', output);
+ write(piece, output);
+ write(queue, output);
+ }
} else {
u_fputc((mode == gm_all ? '#' : '@'), output);
write(source, output);
diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h
index 04c379c9..f706d071 100644
--- a/lttoolbox/fst_processor.h
+++ b/lttoolbox/fst_processor.h
@@ -24,6 +24,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -328,6 +329,7 @@ class FSTProcessor
* Assumes that casefrom is non-empty
*/
UString filterFinals(const State& state, UStringView casefrom);
+ UString filterFinals(const ReusableState& state, UStringView casefrom);
/**
* Write a string to an output stream,
@@ -456,11 +458,11 @@ class FSTProcessor
*
* @return running with --case-sensitive or state size exceeds max
*/
- bool beCaseSensitive(const State& state) {
+ bool beCaseSensitive(size_t size) {
if(caseSensitive) {
return true;
}
- else if(state.size() < max_case_insensitive_state_size) {
+ else if(size < max_case_insensitive_state_size) {
return false; // ie. do case-folding
}
else {
@@ -473,6 +475,10 @@ class FSTProcessor
}
}
+  /**
+   * Convenience overloads of beCaseSensitive(size_t): each one simply
+   * forwards s.size() of the given state object.
+   */
+  bool beCaseSensitive(const State& s)
+  {
+    return beCaseSensitive(s.size());
+  }
+  bool beCaseSensitive(const ReusableState& s)
+  {
+    return beCaseSensitive(s.size());
+  }
void appendEscaped(UString& to, const UString& from) {
for(auto &c : from) {
if (escaped_chars.find(c) != escaped_chars.end()) {
diff --git a/lttoolbox/node.h b/lttoolbox/node.h
index 34ae538b..b9221544 100644
--- a/lttoolbox/node.h
+++ b/lttoolbox/node.h
@@ -23,6 +23,7 @@
#include
-
\ No newline at end of file
+
diff --git a/tests/data/weird-caps.dix b/tests/data/weird-caps.dix
new file mode 100644
index 00000000..59185b92
--- /dev/null
+++ b/tests/data/weird-caps.dix
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py
index e5cbf476..95146ee2 100644
--- a/tests/lt_proc/__init__.py
+++ b/tests/lt_proc/__init__.py
@@ -494,13 +494,25 @@ class SlashesInTags(ProcTest):
'^\\*lobwana1.1<3/4>$',
'^\\*lobwana1.1<1/2>$',
'^\\*lobwana1.1<3/4>$']
- expectedOutputs = ['^\\*lobwana1.1<1/2>/*lopwana1.1<1/2>$',
+ expectedOutputs = ['^\\*lobwana1.1<1/2>/\\*lopwana1.1<1/2>$',
'^\\*lobwana1.1<3/4>/@\\*lobwana1.1<3/4>$',
- '^\\*lobwana1.1<1/2>/*lopwana1.1<1/2>$',
+ '^\\*lobwana1.1<1/2>/\\*lopwana1.1<1/2>$',
'^\\*lobwana1.1<3/4>/@\\*lobwana1.1<3/4>$',
- '^\\*lobwana1.1<1/2>/*lopwana1.1<1/2>$',
+ '^\\*lobwana1.1<1/2>/\\*lopwana1.1<1/2>$',
'^\\*lobwana1.1<3/4>/@\\*lobwana1.1<3/4>$']
+class GeneratorCaps(ProcTest):
+    # Capitalisation variants of one word run through lt-proc with
+    # -g (generation) and -z (null flush) against weird-caps.dix.
+    procdix = 'data/weird-caps.dix'
+    procdir = 'lr'
+    procflags = ['-g', '-z']
+    # The two lists have the same length; presumably ProcTest pairs the
+    # i-th input with the i-th expected output -- confirm in ProcTest.
+    inputs = [
+        '^iPad$',
+        '^IPad$',
+        '^iPaD$',
+        '^iPAD$',
+        '^IPAD$',
+        '^ipad$',
+    ]
+    expectedOutputs = [
+        'iPad',
+        'IPAD',
+        'iPad',
+        'iPad',
+        'IPAD',
+        '#ipad',
+    ]
+
class BiltransAnyChar(ProcTest):
procdix = 'data/pass-through.lsx'
procflags = ['-b', '-z']