tesseract-ocr · zdenop · Aug 10, 2025 · stweil · Aug 10, 2025 · stweil
diff --git a/src/ccmain/ltrresultiterator.cpp b/src/ccmain/ltrresultiterator.cpp
@@ -49,7 +49,9 @@ char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
   std::string text;
   PAGE_RES_IT res_it(*it_);
   WERD_CHOICE *best_choice = res_it.word()->best_choice;
-  ASSERT_HOST(best_choice != nullptr);
+  if (best_choice == nullptr) { // replaces the former ASSERT_HOST on this condition
+    return nullptr; // No recognition results for the current word, so no text is returned
+  }
   if (level == RIL_SYMBOL) {
     text = res_it.word()->BestUTF8(blob_index_, false);
   } else if (level == RIL_WORD) {
@@ -61,7 +63,9 @@ char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
       do {            // for each text line in a paragraph
         do {          // for each word in a text line
           best_choice = res_it.word()->best_choice;
-          ASSERT_HOST(best_choice != nullptr);
-          text += best_choice->unichar_string();
-          text += " ";
+          // A word without recognition results adds no text but is still stepped over.
+          if (best_choice != nullptr) {
+            text += best_choice->unichar_string();
+            text += " ";
+          }
           res_it.forward();
@@ -104,37 +108,47 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const {
     case RIL_BLOCK:
       do {
         best_choice = res_it.word()->best_choice;
-        mean_certainty += best_choice->certainty();
-        ++certainty_count;
+        if (best_choice != nullptr) { // words without a best_choice add nothing to the sum
+          mean_certainty += best_choice->certainty();
+          ++certainty_count; // counts only the words that were summed
+        }
         res_it.forward();
       } while (res_it.block() == res_it.prev_block());
       break;
     case RIL_PARA:
       do {
         best_choice = res_it.word()->best_choice;
-        mean_certainty += best_choice->certainty();
-        ++certainty_count;
+        if (best_choice != nullptr) { // same null guard as the block-level loop
+          mean_certainty += best_choice->certainty();
+          ++certainty_count;
+        }
         res_it.forward();
       } while (res_it.block() == res_it.prev_block() &&
                res_it.row()->row->para() == res_it.prev_row()->row->para());
       break;
     case RIL_TEXTLINE:
       do {
         best_choice = res_it.word()->best_choice;
-        mean_certainty += best_choice->certainty();
-        ++certainty_count;
+        if (best_choice != nullptr) { // same null guard as the block-level loop
+          mean_certainty += best_choice->certainty();
+          ++certainty_count;
+        }
         res_it.forward();
       } while (res_it.row() == res_it.prev_row());
       break;
     case RIL_WORD:
       best_choice = res_it.word()->best_choice;
-      mean_certainty = best_choice->certainty();
-      certainty_count = 1;
+      if (best_choice != nullptr) { // otherwise mean_certainty and certainty_count stay unchanged
+        mean_certainty = best_choice->certainty();
+        certainty_count = 1;
+      }
       break;
     case RIL_SYMBOL:
       best_choice = res_it.word()->best_choice;
-      mean_certainty = best_choice->certainty(blob_index_);
-      certainty_count = 1;
+      if (best_choice != nullptr) { // otherwise mean_certainty and certainty_count stay unchanged
+        mean_certainty = best_choice->certainty(blob_index_); // certainty queried at blob_index_
+        certainty_count = 1;
+      }
   }
   if (certainty_count > 0) {
     mean_certainty /= certainty_count;
@@ -226,8 +240,8 @@ StrongScriptDirection LTRResultIterator::WordDirection() const {
 
 // Returns true if the current word was found in a dictionary.
 bool LTRResultIterator::WordIsFromDictionary() const {
-  if (it_->word() == nullptr) {
-    return false; // Already at the end!
+  if (it_->word() == nullptr || it_->word()->best_choice == nullptr) { // word is checked before its best_choice
+    return false; // At the end, or the word has no best_choice to read a permuter from.
   }
   int permuter = it_->word()->best_choice->permuter();
   return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM || permuter == USER_DAWG_PERM;
@@ -243,8 +257,8 @@ int LTRResultIterator::BlanksBeforeWord() const {
 
 // Returns true if the current word is numeric.
 bool LTRResultIterator::WordIsNumeric() const {
-  if (it_->word() == nullptr) {
-    return false; // Already at the end!
+  if (it_->word() == nullptr || it_->word()->best_choice == nullptr) { // word is checked before its best_choice
+    return false; // At the end, or the word has no best_choice to read a permuter from.
   }
   int permuter = it_->word()->best_choice->permuter();
   return permuter == NUMBER_PERM;
@@ -315,8 +329,11 @@ char *LTRResultIterator::WordNormedUTF8Text() const {
   if (it_->word() == nullptr) {
     return nullptr; // Already at the end!
   }
-  std::string ocr_text;
   WERD_CHOICE *best_choice = it_->word()->best_choice;
+  if (best_choice == nullptr) { // checked before the loop below dereferences best_choice
+    return nullptr; // No recognition results for the current word
+  }
+  std::string ocr_text; // declared after both early returns; filled by the loop below
   const UNICHARSET *unicharset = it_->word()->uch_set;
   for (unsigned i = 0; i < best_choice->length(); ++i) {
     ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
@@ -341,7 +358,7 @@ const char *LTRResultIterator::WordLattice(int *lattice_size) const {
 // If iterating at a higher level object than symbols, eg words, then
 // this will return the attributes of the first symbol in that word.
 bool LTRResultIterator::SymbolIsSuperscript() const {
-  if (cblob_it_ == nullptr && it_->word() != nullptr) {
+  if (cblob_it_ == nullptr && it_->word() != nullptr && it_->word()->best_choice != nullptr) { // BlobPosition needs a best_choice
     return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
   }
   return false;
@@ -351,7 +368,7 @@ bool LTRResultIterator::SymbolIsSuperscript() const {
 // If iterating at a higher level object than symbols, eg words, then
 // this will return the attributes of the first symbol in that word.
 bool LTRResultIterator::SymbolIsSubscript() const {
-  if (cblob_it_ == nullptr && it_->word() != nullptr) {
+  if (cblob_it_ == nullptr && it_->word() != nullptr && it_->word()->best_choice != nullptr) { // BlobPosition needs a best_choice
     return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
   }
   return false;
@@ -361,7 +378,7 @@ bool LTRResultIterator::SymbolIsSubscript() const {
 // If iterating at a higher level object than symbols, eg words, then
 // this will return the attributes of the first symbol in that word.
 bool LTRResultIterator::SymbolIsDropcap() const {
-  if (cblob_it_ == nullptr && it_->word() != nullptr) {
+  if (cblob_it_ == nullptr && it_->word() != nullptr && it_->word()->best_choice != nullptr) { // BlobPosition needs a best_choice
     return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
   }
   return false;

diff --git a/src/ccmain/output.cpp b/src/ccmain/output.cpp
@@ -103,7 +103,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
   const UNICHARSET &uchset = *word->uch_set;
   UNICHAR_ID space = uchset.unichar_to_id(" ");
 
-  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
+  if ((word->unlv_crunch_mode != CR_NONE || (word->best_choice != nullptr && word->best_choice->empty())) && // NOTE(review): a null best_choice is not counted as empty here - confirm intended
       !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
     bool need_reject = false;
     if ((word->unlv_crunch_mode != CR_DELETE) &&
@@ -149,15 +149,15 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
 
   if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
       !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
-      (word->best_choice->unichar_id(0) == space)) {
+      (word->best_choice != nullptr && word->best_choice->unichar_id(0) == space)) { // first unichar is read only when best_choice exists
     /* Prevent adjacent tilde across words - we know that adjacent tildes within
    words have been removed */
     word->MergeAdjacentBlobs(0);
   }
   if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
     stats_.last_char_was_tilde = false;
   } else {
-    if (word->reject_map.length() > 0) {
+    if (word->reject_map.length() > 0 && word->best_choice != nullptr) { // last unichar is read only when best_choice exists
       if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
         stats_.last_char_was_tilde = true;
       } else {
@@ -169,15 +169,17 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
     /* else it is unchanged as there are no output chars */
   }
 
-  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
+  if (word->best_choice != nullptr) { // the length invariant is only asserted when a best_choice exists
+    ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
+  }
 
   set_unlv_suspects(word);
   check_debug_pt(word, 120);
-  if (tessedit_rejection_debug) {
+  if (tessedit_rejection_debug && word->best_choice != nullptr) { // the debug print dereferences best_choice
     tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
             dict_word(*(word->best_choice)));
   }
-  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
+  if ((!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) && word->best_choice != nullptr) { // the loop below reads best_choice->length()
     if (tessedit_zero_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       for (unsigned i = 0; i < word->best_choice->length(); ++i) {
@@ -250,7 +252,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
     ;
   }
 
-  if (i < word->reject_map.length()) {
+  if (i < word->reject_map.length() && word->best_choice != nullptr) { // a null best_choice takes the unrecognised_char branch
     return word->best_choice->unichar_id(i);
   } else {
     return word->uch_set->unichar_to_id(unrecognised_char.c_str());
@@ -268,6 +270,9 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
  * tessedit_minimal_rejection.
  *************************************************************************/
 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
+  if (word_res->best_choice == nullptr) { // best_choice is bound to a reference just below
+    return; // No recognition results, so the function does nothing for this word
+  }
   int len = word_res->reject_map.length();
   const WERD_CHOICE &word = *(word_res->best_choice);
   const UNICHARSET &uchset = *word.unicharset();

diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp
@@ -706,7 +706,9 @@ void ResultIterator::AppendUTF8WordText(std::string *text) const {
   if (!it_->word()) {
     return;
   }
-  ASSERT_HOST(it_->word()->best_choice != nullptr);
+  if (it_->word()->best_choice == nullptr) { // replaces the former ASSERT_HOST on this condition
+    return; // No recognition results: *text is left untouched, no direction mark is appended
+  }
   bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
   if (at_beginning_of_minor_run_) {
     *text += reading_direction_is_ltr ? kLRM : kRLM;