From 0b26cbd5c76e1df523e647455f6315ae248840b4 Mon Sep 17 00:00:00 2001 From: zdenop Date: Sun, 10 Aug 2025 19:00:11 +0200 Subject: [PATCH] Fixed the GetUTF8Text method to return nullptr instead of asserting when best_choice is nullptr --- src/ccmain/ltrresultiterator.cpp | 57 +++++++++++++++++++++----------- src/ccmain/output.cpp | 19 +++++++---- src/ccmain/resultiterator.cpp | 4 ++- 3 files changed, 52 insertions(+), 28 deletions(-) diff --git a/src/ccmain/ltrresultiterator.cpp b/src/ccmain/ltrresultiterator.cpp index 84ed913a2f..3807bd9ec1 100644 --- a/src/ccmain/ltrresultiterator.cpp +++ b/src/ccmain/ltrresultiterator.cpp @@ -49,7 +49,9 @@ char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const { std::string text; PAGE_RES_IT res_it(*it_); WERD_CHOICE *best_choice = res_it.word()->best_choice; - ASSERT_HOST(best_choice != nullptr); + if (best_choice == nullptr) { + return nullptr; // No recognition results available + } if (level == RIL_SYMBOL) { text = res_it.word()->BestUTF8(blob_index_, false); } else if (level == RIL_WORD) { @@ -61,7 +63,9 @@ char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const { do { // for each text line in a paragraph do { // for each word in a text line best_choice = res_it.word()->best_choice; - ASSERT_HOST(best_choice != nullptr); + if (best_choice == nullptr) { + break; // Skip words without recognition results + } text += best_choice->unichar_string(); text += " "; res_it.forward(); @@ -104,16 +108,20 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const { case RIL_BLOCK: do { best_choice = res_it.word()->best_choice; - mean_certainty += best_choice->certainty(); - ++certainty_count; + if (best_choice != nullptr) { + mean_certainty += best_choice->certainty(); + ++certainty_count; + } res_it.forward(); } while (res_it.block() == res_it.prev_block()); break; case RIL_PARA: do { best_choice = res_it.word()->best_choice; - mean_certainty += best_choice->certainty(); - ++certainty_count; + if (best_choice != nullptr) { + mean_certainty += best_choice->certainty(); + ++certainty_count; + } res_it.forward(); } while (res_it.block() == res_it.prev_block() && res_it.row()->row->para() == res_it.prev_row()->row->para()); @@ -121,20 +129,26 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const { case RIL_TEXTLINE: do { best_choice = res_it.word()->best_choice; - mean_certainty += best_choice->certainty(); - ++certainty_count; + if (best_choice != nullptr) { + mean_certainty += best_choice->certainty(); + ++certainty_count; + } res_it.forward(); } while (res_it.row() == res_it.prev_row()); break; case RIL_WORD: best_choice = res_it.word()->best_choice; - mean_certainty = best_choice->certainty(); - certainty_count = 1; + if (best_choice != nullptr) { + mean_certainty = best_choice->certainty(); + certainty_count = 1; + } break; case RIL_SYMBOL: best_choice = res_it.word()->best_choice; - mean_certainty = best_choice->certainty(blob_index_); - certainty_count = 1; + if (best_choice != nullptr) { + mean_certainty = best_choice->certainty(blob_index_); + certainty_count = 1; + } } if (certainty_count > 0) { mean_certainty /= certainty_count; @@ -226,8 +240,8 @@ StrongScriptDirection LTRResultIterator::WordDirection() const { // Returns true if the current word was found in a dictionary. bool LTRResultIterator::WordIsFromDictionary() const { - if (it_->word() == nullptr) { - return false; // Already at the end! + if (it_->word() == nullptr || it_->word()->best_choice == nullptr) { + return false; // Already at the end or no recognition results! } int permuter = it_->word()->best_choice->permuter(); return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM || permuter == USER_DAWG_PERM; @@ -243,8 +257,8 @@ int LTRResultIterator::BlanksBeforeWord() const { // Returns true if the current word is numeric. bool LTRResultIterator::WordIsNumeric() const { - if (it_->word() == nullptr) { - return false; // Already at the end! + if (it_->word() == nullptr || it_->word()->best_choice == nullptr) { + return false; // Already at the end or no recognition results! } int permuter = it_->word()->best_choice->permuter(); return permuter == NUMBER_PERM; @@ -315,8 +329,11 @@ char *LTRResultIterator::WordNormedUTF8Text() const { if (it_->word() == nullptr) { return nullptr; // Already at the end! } - std::string ocr_text; WERD_CHOICE *best_choice = it_->word()->best_choice; + if (best_choice == nullptr) { + return nullptr; // No recognition results available + } + std::string ocr_text; const UNICHARSET *unicharset = it_->word()->uch_set; for (unsigned i = 0; i < best_choice->length(); ++i) { ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i)); @@ -341,7 +358,7 @@ const char *LTRResultIterator::WordLattice(int *lattice_size) const { // If iterating at a higher level object than symbols, eg words, then // this will return the attributes of the first symbol in that word. bool LTRResultIterator::SymbolIsSuperscript() const { - if (cblob_it_ == nullptr && it_->word() != nullptr) { + if (cblob_it_ == nullptr && it_->word() != nullptr && it_->word()->best_choice != nullptr) { return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUPERSCRIPT; } return false; @@ -351,7 +368,7 @@ bool LTRResultIterator::SymbolIsSuperscript() const { // If iterating at a higher level object than symbols, eg words, then // this will return the attributes of the first symbol in that word. bool LTRResultIterator::SymbolIsSubscript() const { - if (cblob_it_ == nullptr && it_->word() != nullptr) { + if (cblob_it_ == nullptr && it_->word() != nullptr && it_->word()->best_choice != nullptr) { return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT; } return false; @@ -361,7 +378,7 @@ bool LTRResultIterator::SymbolIsSubscript() const { // If iterating at a higher level object than symbols, eg words, then // this will return the attributes of the first symbol in that word. bool LTRResultIterator::SymbolIsDropcap() const { - if (cblob_it_ == nullptr && it_->word() != nullptr) { + if (cblob_it_ == nullptr && it_->word() != nullptr && it_->word()->best_choice != nullptr) { return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP; } return false; diff --git a/src/ccmain/output.cpp b/src/ccmain/output.cpp index a530c15e40..639591ca3a 100644 --- a/src/ccmain/output.cpp +++ b/src/ccmain/output.cpp @@ -103,7 +103,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, const UNICHARSET &uchset = *word->uch_set; UNICHAR_ID space = uchset.unichar_to_id(" "); - if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) && + if ((word->unlv_crunch_mode != CR_NONE || (word->best_choice != nullptr && word->best_choice->empty())) && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { bool need_reject = false; if ((word->unlv_crunch_mode != CR_DELETE) && @@ -149,7 +149,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) && !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && - (word->best_choice->unichar_id(0) == space)) { + (word->best_choice != nullptr && word->best_choice->unichar_id(0) == space)) { /* Prevent adjacent tilde across words - we know that adjacent tildes within words have been removed */ word->MergeAdjacentBlobs(0); @@ -157,7 +157,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) { stats_.last_char_was_tilde = false; } else { - if (word->reject_map.length() > 0) { + if (word->reject_map.length() > 0 && word->best_choice != nullptr) { if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) { stats_.last_char_was_tilde = true; } else { @@ -169,15 +169,17 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, /* else it is unchanged as there are no output chars */ } - ASSERT_HOST(word->best_choice->length() == word->reject_map.length()); + if (word->best_choice != nullptr) { + ASSERT_HOST(word->best_choice->length() == word->reject_map.length()); + } set_unlv_suspects(word); check_debug_pt(word, 120); - if (tessedit_rejection_debug) { + if (tessedit_rejection_debug && word->best_choice != nullptr) { tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(), dict_word(*(word->best_choice))); } - if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) { + if ((!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) && word->best_choice != nullptr) { if (tessedit_zero_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (unsigned i = 0; i < word->best_choice->length(); ++i) { @@ -250,7 +252,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? ; } - if (i < word->reject_map.length()) { + if (i < word->reject_map.length() && word->best_choice != nullptr) { return word->best_choice->unichar_id(i); } else { return word->uch_set->unichar_to_id(unrecognised_char.c_str()); @@ -268,6 +270,9 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? * tessedit_minimal_rejection. *************************************************************************/ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { + if (word_res->best_choice == nullptr) { + return; // No recognition results available + } int len = word_res->reject_map.length(); const WERD_CHOICE &word = *(word_res->best_choice); const UNICHARSET &uchset = *word.unicharset(); diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp index 1fe4584298..399ef0ff60 100644 --- a/src/ccmain/resultiterator.cpp +++ b/src/ccmain/resultiterator.cpp @@ -706,7 +706,9 @@ void ResultIterator::AppendUTF8WordText(std::string *text) const { if (!it_->word()) { return; } - ASSERT_HOST(it_->word()->best_choice != nullptr); + if (it_->word()->best_choice == nullptr) { + return; // No recognition results available + } bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; if (at_beginning_of_minor_run_) { *text += reading_direction_is_ltr ? kLRM : kRLM;