tesseract  5.0.0
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract () override
 
Dict & getDict () override
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Image * mutable_pix_binary ()
 
Image pix_binary () const
 
Image pix_grey () const
 
void set_pix_grey (Image grey_pix)
 
Image pix_original () const
 
void set_pix_original (Image original_pix)
 
Image BestPix () const
 
void set_pix_thresholds (Image thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Image scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Image color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
bool AnyLSTMLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Image *photo_mask_pix, Image *music_mask_pix)
 
void PrerecAllWordsPar (const std::vector< WordData > &words)
 
bool TrainLineRecognizer (const char *input_imagename, const std::string &output_basename, BLOCK_LIST *block_list)
 
void TrainFromBoxes (const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
 
ImageData * GetLineData (const TBOX &line_box, const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, int start_box, int end_box, const BLOCK &block)
 
ImageData * GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
 
void LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
 
void SearchWords (PointerVector< WERD_RES > *words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, std::vector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, std::vector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< bool > *overlapped_any_blob, std::vector< C_BLOB * > *target_blobs)
 
void AssignDiacriticsToNewBlobs (const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< C_BLOB * > *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const std::vector< C_OUTLINE * > &outlines, int num_outlines, std::vector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const std::vector< bool > &ok_outlines, const std::vector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
bool recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
bool check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
bool acceptable_number_string (const char *s, const char *lengths)
 
int16_t count_alphanums (const WERD_CHOICE &word)
 
int16_t count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
 
int init_tesseract (const std::string &datapath, const std::string &language, OcrEngineMode oem)
 
int init_tesseract_internal (const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
 
void SetupUniversalFontIds ()
 
void recognize_page (std::string &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
 
void ParseLanguageString (const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
bool process_cmd_win_event (int32_t cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
bool word_display (PAGE_RES_IT *pr_it)
 
bool word_bln_display (PAGE_RES_IT *pr_it)
 
bool word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
bool word_set_display (PAGE_RES_IT *pr_it)
 
bool word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, int16_t pass)
 
bool one_ell_conflict (WERD_RES *word_res, bool update_map)
 
int16_t first_alphanum_index (const char *word, const char *word_lengths)
 
int16_t first_alphanum_offset (const char *word, const char *word_lengths)
 
int16_t alpha_count (const char *word, const char *word_lengths)
 
bool word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
int16_t count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
bool non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, int16_t pass)
 
int16_t safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
bool word_adaptable (WERD_RES *word, uint16_t mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, bool ok_dict_word)
 
bool potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
int16_t word_blob_quality (WERD_RES *word)
 
void word_char_quality (WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word)
 
int16_t count_outline_errs (char c, int16_t outline_count)
 
int16_t word_outline_errs (WERD_RES *word)
 
bool terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, int16_t &delete_mode)
 
int16_t failure_count (WERD_RES *word)
 
bool noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const char *filename, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const std::vector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, std::vector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const std::vector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const std::vector< BLOB_CHOICE_LIST * > *choices, int choices_pos, unsigned choices_length, const std::vector< UNICHAR_ID > &target_text, unsigned text_index, float rating, std::vector< int > *segmentation, float *best_rating, std::vector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const std::string &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
 BOOL_VAR_H (tessedit_resegment_from_boxes)
 
 BOOL_VAR_H (tessedit_resegment_from_line_boxes)
 
 BOOL_VAR_H (tessedit_train_from_boxes)
 
 BOOL_VAR_H (tessedit_make_boxes_from_boxes)
 
 BOOL_VAR_H (tessedit_train_line_recognizer)
 
 BOOL_VAR_H (tessedit_dump_pageseg_images)
 
 BOOL_VAR_H (tessedit_do_invert)
 
 INT_VAR_H (tessedit_pageseg_mode)
 
 INT_VAR_H (thresholding_method)
 
 BOOL_VAR_H (thresholding_debug)
 
 double_VAR_H (thresholding_window_size)
 
 double_VAR_H (thresholding_kfactor)
 
 double_VAR_H (thresholding_tile_size)
 
 double_VAR_H (thresholding_smooth_kernel_size)
 
 double_VAR_H (thresholding_score_fraction)
 
 INT_VAR_H (tessedit_ocr_engine_mode)
 
 STRING_VAR_H (tessedit_char_blacklist)
 
 STRING_VAR_H (tessedit_char_whitelist)
 
 STRING_VAR_H (tessedit_char_unblacklist)
 
 BOOL_VAR_H (tessedit_ambigs_training)
 
 INT_VAR_H (pageseg_devanagari_split_strategy)
 
 INT_VAR_H (ocr_devanagari_split_strategy)
 
 STRING_VAR_H (tessedit_write_params_to_file)
 
 BOOL_VAR_H (tessedit_adaption_debug)
 
 INT_VAR_H (bidi_debug)
 
 INT_VAR_H (applybox_debug)
 
 INT_VAR_H (applybox_page)
 
 STRING_VAR_H (applybox_exposure_pattern)
 
 BOOL_VAR_H (applybox_learn_chars_and_char_frags_mode)
 
 BOOL_VAR_H (applybox_learn_ngrams_mode)
 
 BOOL_VAR_H (tessedit_display_outwords)
 
 BOOL_VAR_H (tessedit_dump_choices)
 
 BOOL_VAR_H (tessedit_timing_debug)
 
 BOOL_VAR_H (tessedit_fix_fuzzy_spaces)
 
 BOOL_VAR_H (tessedit_unrej_any_wd)
 
 BOOL_VAR_H (tessedit_fix_hyphens)
 
 BOOL_VAR_H (tessedit_enable_doc_dict)
 
 BOOL_VAR_H (tessedit_debug_fonts)
 
 INT_VAR_H (tessedit_font_id)
 
 BOOL_VAR_H (tessedit_debug_block_rejection)
 
 BOOL_VAR_H (tessedit_enable_bigram_correction)
 
 BOOL_VAR_H (tessedit_enable_dict_correction)
 
 INT_VAR_H (tessedit_bigram_debug)
 
 BOOL_VAR_H (enable_noise_removal)
 
 INT_VAR_H (debug_noise_removal)
 
 double_VAR_H (noise_cert_basechar)
 
 double_VAR_H (noise_cert_disjoint)
 
 double_VAR_H (noise_cert_punc)
 
 double_VAR_H (noise_cert_factor)
 
 INT_VAR_H (noise_maxperblob)
 
 INT_VAR_H (noise_maxperword)
 
 INT_VAR_H (debug_x_ht_level)
 
 STRING_VAR_H (chs_leading_punct)
 
 STRING_VAR_H (chs_trailing_punct1)
 
 STRING_VAR_H (chs_trailing_punct2)
 
 double_VAR_H (quality_rej_pc)
 
 double_VAR_H (quality_blob_pc)
 
 double_VAR_H (quality_outline_pc)
 
 double_VAR_H (quality_char_pc)
 
 INT_VAR_H (quality_min_initial_alphas_reqd)
 
 INT_VAR_H (tessedit_tess_adaption_mode)
 
 BOOL_VAR_H (tessedit_minimal_rej_pass1)
 
 BOOL_VAR_H (tessedit_test_adaption)
 
 BOOL_VAR_H (test_pt)
 
 double_VAR_H (test_pt_x)
 
 double_VAR_H (test_pt_y)
 
 INT_VAR_H (multilang_debug_level)
 
 INT_VAR_H (paragraph_debug_level)
 
 BOOL_VAR_H (paragraph_text_based)
 
 BOOL_VAR_H (lstm_use_matrix)
 
 STRING_VAR_H (outlines_odd)
 
 STRING_VAR_H (outlines_2)
 
 BOOL_VAR_H (tessedit_good_quality_unrej)
 
 BOOL_VAR_H (tessedit_use_reject_spaces)
 
 double_VAR_H (tessedit_reject_doc_percent)
 
 double_VAR_H (tessedit_reject_block_percent)
 
 double_VAR_H (tessedit_reject_row_percent)
 
 double_VAR_H (tessedit_whole_wd_rej_row_percent)
 
 BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds)
 
 BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds)
 
 BOOL_VAR_H (tessedit_dont_blkrej_good_wds)
 
 BOOL_VAR_H (tessedit_dont_rowrej_good_wds)
 
 INT_VAR_H (tessedit_preserve_min_wd_len)
 
 BOOL_VAR_H (tessedit_row_rej_good_docs)
 
 double_VAR_H (tessedit_good_doc_still_rowrej_wd)
 
 BOOL_VAR_H (tessedit_reject_bad_qual_wds)
 
 BOOL_VAR_H (tessedit_debug_doc_rejection)
 
 BOOL_VAR_H (tessedit_debug_quality_metrics)
 
 BOOL_VAR_H (bland_unrej)
 
 double_VAR_H (quality_rowrej_pc)
 
 BOOL_VAR_H (unlv_tilde_crunching)
 
 BOOL_VAR_H (hocr_font_info)
 
 BOOL_VAR_H (hocr_char_boxes)
 
 BOOL_VAR_H (crunch_early_merge_tess_fails)
 
 BOOL_VAR_H (crunch_early_convert_bad_unlv_chs)
 
 double_VAR_H (crunch_terrible_rating)
 
 BOOL_VAR_H (crunch_terrible_garbage)
 
 double_VAR_H (crunch_poor_garbage_cert)
 
 double_VAR_H (crunch_poor_garbage_rate)
 
 double_VAR_H (crunch_pot_poor_rate)
 
 double_VAR_H (crunch_pot_poor_cert)
 
 double_VAR_H (crunch_del_rating)
 
 double_VAR_H (crunch_del_cert)
 
 double_VAR_H (crunch_del_min_ht)
 
 double_VAR_H (crunch_del_max_ht)
 
 double_VAR_H (crunch_del_min_width)
 
 double_VAR_H (crunch_del_high_word)
 
 double_VAR_H (crunch_del_low_word)
 
 double_VAR_H (crunch_small_outlines_size)
 
 INT_VAR_H (crunch_rating_max)
 
 INT_VAR_H (crunch_pot_indicators)
 
 BOOL_VAR_H (crunch_leave_ok_strings)
 
 BOOL_VAR_H (crunch_accept_ok)
 
 BOOL_VAR_H (crunch_leave_accept_strings)
 
 BOOL_VAR_H (crunch_include_numerals)
 
 INT_VAR_H (crunch_leave_lc_strings)
 
 INT_VAR_H (crunch_leave_uc_strings)
 
 INT_VAR_H (crunch_long_repetitions)
 
 INT_VAR_H (crunch_debug)
 
 INT_VAR_H (fixsp_non_noise_limit)
 
 double_VAR_H (fixsp_small_outlines_size)
 
 BOOL_VAR_H (tessedit_prefer_joined_punct)
 
 INT_VAR_H (fixsp_done_mode)
 
 INT_VAR_H (debug_fix_space_level)
 
 STRING_VAR_H (numeric_punctuation)
 
 INT_VAR_H (x_ht_acceptance_tolerance)
 
 INT_VAR_H (x_ht_min_change)
 
 INT_VAR_H (superscript_debug)
 
 double_VAR_H (superscript_worse_certainty)
 
 double_VAR_H (superscript_bettered_certainty)
 
 double_VAR_H (superscript_scaledown_ratio)
 
 double_VAR_H (subscript_max_y_top)
 
 double_VAR_H (superscript_min_y_bottom)
 
 BOOL_VAR_H (tessedit_write_block_separators)
 
 BOOL_VAR_H (tessedit_write_rep_codes)
 
 BOOL_VAR_H (tessedit_write_unlv)
 
 BOOL_VAR_H (tessedit_create_txt)
 
 BOOL_VAR_H (tessedit_create_hocr)
 
 BOOL_VAR_H (tessedit_create_alto)
 
 BOOL_VAR_H (tessedit_create_lstmbox)
 
 BOOL_VAR_H (tessedit_create_tsv)
 
 BOOL_VAR_H (tessedit_create_wordstrbox)
 
 BOOL_VAR_H (tessedit_create_pdf)
 
 BOOL_VAR_H (textonly_pdf)
 
 INT_VAR_H (jpg_quality)
 
 INT_VAR_H (user_defined_dpi)
 
 INT_VAR_H (min_characters_to_try)
 
 STRING_VAR_H (unrecognised_char)
 
 INT_VAR_H (suspect_level)
 
 INT_VAR_H (suspect_short_words)
 
 BOOL_VAR_H (suspect_constrain_1Il)
 
 double_VAR_H (suspect_rating_per_ch)
 
 double_VAR_H (suspect_accept_rating)
 
 BOOL_VAR_H (tessedit_minimal_rejection)
 
 BOOL_VAR_H (tessedit_zero_rejection)
 
 BOOL_VAR_H (tessedit_word_for_word)
 
 BOOL_VAR_H (tessedit_zero_kelvin_rejection)
 
 INT_VAR_H (tessedit_reject_mode)
 
 BOOL_VAR_H (tessedit_rejection_debug)
 
 BOOL_VAR_H (tessedit_flip_0O)
 
 double_VAR_H (tessedit_lower_flip_hyphen)
 
 double_VAR_H (tessedit_upper_flip_hyphen)
 
 BOOL_VAR_H (rej_trust_doc_dawg)
 
 BOOL_VAR_H (rej_1Il_use_dict_word)
 
 BOOL_VAR_H (rej_1Il_trust_permuter_type)
 
 BOOL_VAR_H (rej_use_tess_accepted)
 
 BOOL_VAR_H (rej_use_tess_blanks)
 
 BOOL_VAR_H (rej_use_good_perm)
 
 BOOL_VAR_H (rej_use_sensible_wd)
 
 BOOL_VAR_H (rej_alphas_in_number_perm)
 
 double_VAR_H (rej_whole_of_mostly_reject_word_fract)
 
 INT_VAR_H (tessedit_image_border)
 
 STRING_VAR_H (ok_repeated_ch_non_alphanum_wds)
 
 STRING_VAR_H (conflict_set_I_l_1)
 
 INT_VAR_H (min_sane_x_ht_pixels)
 
 BOOL_VAR_H (tessedit_create_boxfile)
 
 INT_VAR_H (tessedit_page_number)
 
 BOOL_VAR_H (tessedit_write_images)
 
 BOOL_VAR_H (interactive_display_mode)
 
 STRING_VAR_H (file_type)
 
 BOOL_VAR_H (tessedit_override_permuter)
 
 STRING_VAR_H (tessedit_load_sublangs)
 
 BOOL_VAR_H (tessedit_use_primary_params_model)
 
 double_VAR_H (min_orientation_margin)
 
 BOOL_VAR_H (textord_tabfind_show_vlines)
 
 BOOL_VAR_H (textord_use_cjk_fp_model)
 
 BOOL_VAR_H (poly_allow_detailed_fx)
 
 BOOL_VAR_H (tessedit_init_config_only)
 
 BOOL_VAR_H (textord_equation_detect)
 
 BOOL_VAR_H (textord_tabfind_vertical_text)
 
 BOOL_VAR_H (textord_tabfind_force_vertical_text)
 
 double_VAR_H (textord_tabfind_vertical_text_ratio)
 
 double_VAR_H (textord_tabfind_aligned_gap_fraction)
 
 INT_VAR_H (tessedit_parallelize)
 
 BOOL_VAR_H (preserve_interword_spaces)
 
 STRING_VAR_H (page_separator)
 
 INT_VAR_H (lstm_choice_mode)
 
 INT_VAR_H (lstm_choice_iterations)
 
 double_VAR_H (lstm_rating_coefficient)
 
 BOOL_VAR_H (pageseg_apply_music_mask)
 
FILE * init_recog_training (const char *filename)
 
void recog_training_segmented (const char *filename, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7: all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

bool digit_or_numeric_punct (WERD_RES *word, int char_position)
 
int16_t eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

int16_t fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
int16_t worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
transform_to_next_perm()

Examines the current word list to find the smallest word gap size. Then walks the word list closing any gaps of this size by either inserting new combination words or extending existing ones.

The routine COULD be limited to stop it building words longer than N blobs.

If there are no more gaps then it DELETES the entire list and returns the empty list to cause termination.

void dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
 
bool fixspace_thinks_word_done (WERD_RES *word)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 BOOL_VAR_H (merge_fragments_in_matrix)
 
 BOOL_VAR_H (wordrec_enable_assoc)
 
 BOOL_VAR_H (force_word_assoc)
 
 INT_VAR_H (repair_unchopped_blobs)
 
 double_VAR_H (tessedit_certainty_threshold)
 
 INT_VAR_H (chop_debug)
 
 BOOL_VAR_H (chop_enable)
 
 BOOL_VAR_H (chop_vertical_creep)
 
 INT_VAR_H (chop_split_length)
 
 INT_VAR_H (chop_same_distance)
 
 INT_VAR_H (chop_min_outline_points)
 
 INT_VAR_H (chop_seam_pile_size)
 
 BOOL_VAR_H (chop_new_seam_pile)
 
 INT_VAR_H (chop_inside_angle)
 
 INT_VAR_H (chop_min_outline_area)
 
 double_VAR_H (chop_split_dist_knob)
 
 double_VAR_H (chop_overlap_knob)
 
 double_VAR_H (chop_center_knob)
 
 INT_VAR_H (chop_centered_maxwidth)
 
 double_VAR_H (chop_sharpness_knob)
 
 double_VAR_H (chop_width_change_knob)
 
 double_VAR_H (chop_ok_split)
 
 double_VAR_H (chop_good_split)
 
 INT_VAR_H (chop_x_y_weight)
 
 BOOL_VAR_H (assume_fixed_pitch_char_segment)
 
 INT_VAR_H (wordrec_debug_level)
 
 INT_VAR_H (wordrec_max_join_chunks)
 
 BOOL_VAR_H (wordrec_skip_no_truth_words)
 
 BOOL_VAR_H (wordrec_debug_blamer)
 
 BOOL_VAR_H (wordrec_run_blamer)
 
 INT_VAR_H (segsearch_debug_level)
 
 INT_VAR_H (segsearch_max_pain_points)
 
 INT_VAR_H (segsearch_max_futile_classifications)
 
 double_VAR_H (segsearch_max_char_wh_ratio)
 
 BOOL_VAR_H (save_alt_choices)
 
 Wordrec ()
 
 ~Wordrec () override=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, std::vector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const std::vector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void program_editup (const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (int32_t elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, ScrollView::Color color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const std::vector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const std::vector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const std::vector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, unsigned *blob_number)
 
SEAM * improve_one_blob (const std::vector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, unsigned *blob_number)
 
SEAM * chop_one_blob (const std::vector< TBOX > &boxes, const std::vector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, unsigned *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, std::vector< SegSearchPending > *pending)
 
int select_blob_to_split (const std::vector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
 ~Classify () override
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
int GetFontinfoId (ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, std::vector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
 
ADAPT_TEMPLATES_STRUCT * ReadAdaptedTemplates (TFile *File)
 
void ConvertProto (PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class)
 
INT_TEMPLATES_STRUCT * CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class, ADAPT_TEMPLATES_STRUCT *Templates)
 
void AmbigClassifier (const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES_STRUCT *templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box, const std::vector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
std::string ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, std::vector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES_STRUCT *adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES_STRUCT * ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const char *filename)
 
 BOOL_VAR_H (allow_blob_division)
 
 BOOL_VAR_H (prioritize_division)
 
 BOOL_VAR_H (classify_enable_learning)
 
 INT_VAR_H (classify_debug_level)
 
 INT_VAR_H (classify_norm_method)
 
 double_VAR_H (classify_char_norm_range)
 
 double_VAR_H (classify_max_rating_ratio)
 
 double_VAR_H (classify_max_certainty_margin)
 
 BOOL_VAR_H (tess_cn_matching)
 
 BOOL_VAR_H (tess_bn_matching)
 
 BOOL_VAR_H (classify_enable_adaptive_matcher)
 
 BOOL_VAR_H (classify_use_pre_adapted_templates)
 
 BOOL_VAR_H (classify_save_adapted_templates)
 
 BOOL_VAR_H (classify_enable_adaptive_debugger)
 
 BOOL_VAR_H (classify_nonlinear_norm)
 
 INT_VAR_H (matcher_debug_level)
 
 INT_VAR_H (matcher_debug_flags)
 
 INT_VAR_H (classify_learning_debug_level)
 
 double_VAR_H (matcher_good_threshold)
 
 double_VAR_H (matcher_reliable_adaptive_result)
 
 double_VAR_H (matcher_perfect_threshold)
 
 double_VAR_H (matcher_bad_match_pad)
 
 double_VAR_H (matcher_rating_margin)
 
 double_VAR_H (matcher_avg_noise_size)
 
 INT_VAR_H (matcher_permanent_classes_min)
 
 INT_VAR_H (matcher_min_examples_for_prototyping)
 
 INT_VAR_H (matcher_sufficient_examples_for_prototyping)
 
 double_VAR_H (matcher_clustering_max_angle_delta)
 
 double_VAR_H (classify_misfit_junk_penalty)
 
 double_VAR_H (rating_scale)
 
 double_VAR_H (certainty_scale)
 
 double_VAR_H (tessedit_class_miss_scale)
 
 double_VAR_H (classify_adapted_pruning_factor)
 
 double_VAR_H (classify_adapted_pruning_threshold)
 
 INT_VAR_H (classify_adapt_proto_threshold)
 
 INT_VAR_H (classify_adapt_feature_threshold)
 
 BOOL_VAR_H (disable_character_fragments)
 
 double_VAR_H (classify_character_fragments_garbage_certainty_threshold)
 
 BOOL_VAR_H (classify_debug_character_fragments)
 
 BOOL_VAR_H (matcher_debug_separate_windows)
 
 STRING_VAR_H (classify_learn_debug_str)
 
 INT_VAR_H (classify_class_pruner_threshold)
 
 INT_VAR_H (classify_class_pruner_multiplier)
 
 INT_VAR_H (classify_cp_cutoff_strength)
 
 INT_VAR_H (classify_integer_matcher_multiplier)
 
 BOOL_VAR_H (classify_bln_numeric_mode)
 
 double_VAR_H (speckle_large_max_size)
 
 double_VAR_H (speckle_rating_penalty)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
 ~CCStruct () override
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const std::string &argv0, const std::string &basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 
 INT_VAR_H (ambigs_debug_level)
 
 BOOL_VAR_H (use_ambigs_for_adaption)
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, std::vector< INT_FEATURE_STRUCT > *bl_features, std::vector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, std::vector< int > *outline_cn_counts)
 
- Public Attributes inherited from tesseract::Wordrec
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
INT_TEMPLATES_STRUCT * PreTrainedTemplates = nullptr
 
ADAPT_TEMPLATES_STRUCT * AdaptedTemplates = nullptr
 
ADAPT_TEMPLATES_STRUCT * BackupAdaptedTemplates = nullptr
 
BIT_VECTOR AllProtosOn = nullptr
 
BIT_VECTOR AllConfigsOn = nullptr
 
BIT_VECTOR AllConfigsOff = nullptr
 
BIT_VECTOR TempProtoMask = nullptr
 
NORM_PROTOS * NormProtos = nullptr
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
bool EnableLearning = true
 
- Public Attributes inherited from tesseract::CCUtil
std::string datadir
 
std::string imagebasename
 
std::string lang
 
std::string language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
std::string imagefile
 
std::string directory
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, std::vector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, std::vector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, std::vector< SegSearchPending > &pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, std::string &blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_ = nullptr
 

Detailed Description

Definition at line 178 of file tesseractclass.h.

Constructor & Destructor Documentation

◆ Tesseract()

tesseract::Tesseract::Tesseract ( )

Definition at line 53 of file tesseractclass.cpp.

54  : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
55  "Take segmentation and labeling from box file", this->params())
56  , BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
57  "Conversion of word/line box file to char box file", this->params())
58  , BOOL_MEMBER(tessedit_train_from_boxes, false, "Generate training data from boxed chars",
59  this->params())
60  , BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars",
61  this->params())
62  , BOOL_MEMBER(tessedit_train_line_recognizer, false,
63  "Break input into lines and remap boxes if present", this->params())
64  , BOOL_MEMBER(tessedit_dump_pageseg_images, false,
65  "Dump intermediate images made during page segmentation", this->params())
66  , BOOL_MEMBER(tessedit_do_invert, true, "Try inverting the image in `LSTMRecognizeWord`",
67  this->params())
68  ,
69  // The default for pageseg_mode is the old behaviour, so as not to
70  // upset anything that relies on that.
71  INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
72  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, "
73  "4=column,"
74  " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
75  "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
76  " (Values from PageSegMode enum in tesseract/publictypes.h)",
77  this->params())
78  , INT_MEMBER(thresholding_method,
79  static_cast<int>(ThresholdMethod::Otsu),
80  "Thresholding method: 0 = Otsu, 1 = LeptonicaOtsu, 2 = "
81  "Sauvola",
82  this->params())
83  , BOOL_MEMBER(thresholding_debug, false,
84  "Debug the thresholding process",
85  this->params())
86  , double_MEMBER(thresholding_window_size, 0.33,
87  "Window size for measuring local statistics (to be "
88  "multiplied by image DPI). "
89  "This parameter is used by the Sauvola thresolding method",
90  this->params())
91  , double_MEMBER(thresholding_kfactor, 0.34,
92  "Factor for reducing threshold due to variance. "
93  "This parameter is used by the Sauvola thresolding method."
94  " Normal range: 0.2-0.5",
95  this->params())
96  , double_MEMBER(thresholding_tile_size, 0.33,
97  "Desired tile size (to be multiplied by image DPI). "
98  "This parameter is used by the LeptonicaOtsu thresolding "
99  "method",
100  this->params())
101  , double_MEMBER(thresholding_smooth_kernel_size, 0.0,
102  "Size of convolution kernel applied to threshold array "
103  "(to be multiplied by image DPI). Use 0 for no smoothing. "
104  "This parameter is used by the LeptonicaOtsu thresolding "
105  "method",
106  this->params())
107  , double_MEMBER(thresholding_score_fraction, 0.1,
108  "Fraction of the max Otsu score. "
109  "This parameter is used by the LeptonicaOtsu thresolding "
110  "method. "
111  "For standard Otsu use 0.0, otherwise 0.1 is recommended",
112  this->params())
113  , INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
114  "Which OCR engine(s) to run (Tesseract, LSTM, both)."
115  " Defaults to loading and running the most accurate"
116  " available.",
117  this->params())
118  , STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize",
119  this->params())
120  , STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params())
121  , STRING_MEMBER(tessedit_char_unblacklist, "",
122  "List of chars to override tessedit_char_blacklist", this->params())
123  , BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities",
124  this->params())
125  , INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
126  "Whether to use the top-line splitting process for Devanagari "
127  "documents while performing page-segmentation.",
128  this->params())
129  , INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
130  "Whether to use the top-line splitting process for Devanagari "
131  "documents while performing ocr.",
132  this->params())
133  , STRING_MEMBER(tessedit_write_params_to_file, "", "Write all parameters to the given file.",
134  this->params())
135  , BOOL_MEMBER(tessedit_adaption_debug, false,
136  "Generate and print debug"
137  " information for adaption",
138  this->params())
139  , INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params())
140  , INT_MEMBER(applybox_debug, 1, "Debug level", this->params())
141  , INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params())
142  , STRING_MEMBER(applybox_exposure_pattern, ".exp",
143  "Exposure value follows"
144  " this pattern in the image filename. The name of the image"
145  " files are expected to be in the form"
146  " [lang].[fontname].exp[num].tif",
147  this->params())
148  , BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
149  "Learn both character fragments (as is done in the"
150  " special low exposure mode) as well as unfragmented"
151  " characters.",
152  this->params())
153  , BOOL_MEMBER(applybox_learn_ngrams_mode, false,
154  "Each bounding box"
155  " is assumed to contain ngrams. Only learn the ngrams"
156  " whose outlines overlap horizontally.",
157  this->params())
158  , BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params())
159  , BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", this->params())
160  , BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", this->params())
161  , BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params())
162  , BOOL_MEMBER(tessedit_unrej_any_wd, false, "Don't bother with word plausibility",
163  this->params())
164  , BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params())
165  , BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary",
166  this->params())
167  , BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params())
168  , INT_MEMBER(tessedit_font_id, 0, "Font ID to use or zero", this->params())
169  , BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params())
170  , BOOL_MEMBER(tessedit_enable_bigram_correction, true,
171  "Enable correction based on the word bigram dictionary.", this->params())
172  , BOOL_MEMBER(tessedit_enable_dict_correction, false,
173  "Enable single word correction based on the dictionary.", this->params())
174  , INT_MEMBER(tessedit_bigram_debug, 0, "Amount of debug output for bigram correction.",
175  this->params())
176  , BOOL_MEMBER(enable_noise_removal, true,
177  "Remove and conditionally reassign small outlines when they"
178  " confuse layout analysis, determining diacritics vs noise",
179  this->params())
180  , INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", this->params())
181  ,
182  // Worst (min) certainty, for which a diacritic is allowed to make the
183  // base
184  // character worse and still be included.
185  double_MEMBER(noise_cert_basechar, -8.0, "Hingepoint for base char certainty", this->params())
186  ,
187  // Worst (min) certainty, for which a non-overlapping diacritic is allowed
188  // to make the base character worse and still be included.
189  double_MEMBER(noise_cert_disjoint, -1.0, "Hingepoint for disjoint certainty", this->params())
190  ,
191  // Worst (min) certainty, for which a diacritic is allowed to make a new
192  // stand-alone blob.
193  double_MEMBER(noise_cert_punc, -3.0, "Threshold for new punc char certainty", this->params())
194  ,
195  // Factor of certainty margin for adding diacritics to not count as worse.
196  double_MEMBER(noise_cert_factor, 0.375, "Scaling on certainty diff from Hingepoint",
197  this->params())
198  , INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", this->params())
199  , INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", this->params())
200  , INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params())
201  , STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", this->params())
202  , STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", this->params())
203  , STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", this->params())
204  , double_MEMBER(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit", this->params())
205  , double_MEMBER(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit", this->params())
206  , double_MEMBER(quality_outline_pc, 1.0, "good_quality_doc lte outline error limit",
207  this->params())
208  , double_MEMBER(quality_char_pc, 0.95, "good_quality_doc gte good char limit", this->params())
209  , INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", this->params())
210  , INT_MEMBER(tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess",
211  this->params())
212  , BOOL_MEMBER(tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output",
213  this->params())
214  , BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", this->params())
215  , BOOL_MEMBER(test_pt, false, "Test for point", this->params())
216  , double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params())
217  , double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params())
218  , INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.", this->params())
219  , INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params())
220  , BOOL_MEMBER(paragraph_text_based, true,
221  "Run paragraph detection on the post-text-recognition "
222  "(more accurate)",
223  this->params())
224  , BOOL_MEMBER(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm", this->params())
225  , STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params())
226  , STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", this->params())
227  , BOOL_MEMBER(tessedit_good_quality_unrej, true, "Reduce rejection on good docs",
228  this->params())
229  , BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", this->params())
230  , double_MEMBER(tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc",
231  this->params())
232  , double_MEMBER(tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block",
233  this->params())
234  , double_MEMBER(tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row",
235  this->params())
236  , double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
237  "Number of row rejects in whole word rejects"
238  " which prevents whole row rejection",
239  this->params())
240  , BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
241  "Only rej partially rejected words in block rejection", this->params())
242  , BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
243  "Only rej partially rejected words in row rejection", this->params())
244  , BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric",
245  this->params())
246  , BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric",
247  this->params())
248  , INT_MEMBER(tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this",
249  this->params())
250  , BOOL_MEMBER(tessedit_row_rej_good_docs, true, "Apply row rejection to good docs",
251  this->params())
252  , double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
253  "rej good doc wd if more than this fraction rejected", this->params())
254  , BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds", this->params())
255  , BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", this->params())
256  , BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file",
257  this->params())
258  , BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks", this->params())
259  , double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params())
260  , BOOL_MEMBER(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch", this->params())
261  , BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params())
262  , BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
263  this->params())
264  , BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params())
265  , BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params())
266  , double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params())
267  , BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params())
268  , double_MEMBER(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this", this->params())
269  , double_MEMBER(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this", this->params())
270  , double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", this->params())
271  , double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", this->params())
272  , double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", this->params())
273  , double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", this->params())
274  , double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", this->params())
275  , double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", this->params())
276  , double_MEMBER(crunch_del_min_width, 3.0, "Del if word width lt xht x this", this->params())
277  , double_MEMBER(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl", this->params())
278  , double_MEMBER(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl", this->params())
279  , double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", this->params())
280  , INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", this->params())
281  , INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params())
282  , BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params())
283  , BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params())
284  , BOOL_MEMBER(crunch_leave_accept_strings, false, "Don't pot crunch sensible strings",
285  this->params())
286  , BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params())
287  , INT_MEMBER(crunch_leave_lc_strings, 4, "Don't crunch words with long lower case strings",
288  this->params())
289  , INT_MEMBER(crunch_leave_uc_strings, 4, "Don't crunch words with long lower case strings",
290  this->params())
291  , INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params())
292  , INT_MEMBER(crunch_debug, 0, "As it says", this->params())
293  , INT_MEMBER(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?", this->params())
294  , double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", this->params())
295  , BOOL_MEMBER(tessedit_prefer_joined_punct, false, "Reward punctuation joins", this->params())
296  , INT_MEMBER(fixsp_done_mode, 1, "What constitutes done for spacing", this->params())
297  , INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", this->params())
298  , STRING_MEMBER(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers", this->params())
299  , INT_MEMBER(x_ht_acceptance_tolerance, 8,
300  "Max allowed deviation of blob top outside of font data", this->params())
301  , INT_MEMBER(x_ht_min_change, 8, "Min change in xht before actually trying it", this->params())
302  , INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", this->params())
303  , double_MEMBER(superscript_worse_certainty, 2.0,
304  "How many times worse "
305  "certainty does a superscript position glyph need to be for "
306  "us to try classifying it as a char with a different "
307  "baseline?",
308  this->params())
309  , double_MEMBER(superscript_bettered_certainty, 0.97,
310  "What reduction in "
311  "badness do we think sufficient to choose a superscript "
312  "over what we'd thought. For example, a value of 0.6 means "
313  "we want to reduce badness of certainty by at least 40%",
314  this->params())
315  , double_MEMBER(superscript_scaledown_ratio, 0.4,
316  "A superscript scaled down more than this is unbelievably "
317  "small. For example, 0.3 means we expect the font size to "
318  "be no smaller than 30% of the text line font size.",
319  this->params())
320  , double_MEMBER(subscript_max_y_top, 0.5,
321  "Maximum top of a character measured as a multiple of "
322  "x-height above the baseline for us to reconsider whether "
323  "it's a subscript.",
324  this->params())
325  , double_MEMBER(superscript_min_y_bottom, 0.3,
326  "Minimum bottom of a character measured as a multiple of "
327  "x-height above the baseline for us to reconsider whether "
328  "it's a superscript.",
329  this->params())
330  , BOOL_MEMBER(tessedit_write_block_separators, false, "Write block separators in output",
331  this->params())
332  , BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", this->params())
333  , BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params())
334  , BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
335  , BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
336  , BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
337  , BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
338  this->params())
339  , BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())
340  , BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
341  this->params())
342  , BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params())
343  , BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
344  this->params())
345  , INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())
346  , INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())
347  , INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",
348  this->params())
349  , STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params())
350  , INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params())
351  , INT_MEMBER(suspect_short_words, 2, "Don't suspect dict wds longer than this", this->params())
352  , BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params())
353  , double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params())
354  , double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params())
355  , BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params())
356  , BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params())
357  , BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD",
358  this->params())
359  , BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, "Don't reject ANYTHING AT ALL",
360  this->params())
361  , INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params())
362  , BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", this->params())
363  , BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", this->params())
364  , double_MEMBER(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test", this->params())
365  , double_MEMBER(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test", this->params())
366  , BOOL_MEMBER(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector", this->params())
367  , BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params())
368  , BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params())
369  , BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params())
370  , BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", this->params())
371  , BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", this->params())
372  , BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", this->params())
373  , BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", this->params())
374  , double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract", this->params())
375  , INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", this->params())
376  , STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej", this->params())
377  , STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", this->params())
378  , INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", this->params())
379  , BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", this->params())
380  , INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages, else specific page to process",
381  this->params())
382  , BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params())
383  , BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", this->params())
384  , STRING_MEMBER(file_type, ".tif", "Filename extension", this->params())
385  , BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params())
386  , STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one",
387  this->params())
388  , BOOL_MEMBER(tessedit_use_primary_params_model, false,
389  "In multilingual mode use params model of the"
390  " primary language",
391  this->params())
392  , double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin",
393  this->params())
394  , BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
395  , BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
396  , BOOL_MEMBER(poly_allow_detailed_fx, false,
397  "Allow feature extractors to see the original outline", this->params())
398  , BOOL_INIT_MEMBER(tessedit_init_config_only, false,
399  "Only initialize with the config file. Useful if the "
400  "instance is not going to be used for OCR but say only "
401  "for layout analysis.",
402  this->params())
403 #ifndef DISABLED_LEGACY_ENGINE
404  , BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", this->params())
405 #endif // ndef DISABLED_LEGACY_ENGINE
406  , BOOL_MEMBER(textord_tabfind_vertical_text, true, "Enable vertical detection", this->params())
407  , BOOL_MEMBER(textord_tabfind_force_vertical_text, false, "Force using vertical text page mode",
408  this->params())
409  , double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,
410  "Fraction of textlines deemed vertical to use vertical page "
411  "mode",
412  this->params())
413  , double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,
414  "Fraction of height used as a minimum gap for aligned blobs.", this->params())
415  , INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params())
416  , BOOL_MEMBER(preserve_interword_spaces, false, "Preserve multiple interword spaces",
417  this->params())
418  , STRING_MEMBER(page_separator, "\f", "Page separator (default is form feed control character)",
419  this->params())
420  , INT_MEMBER(lstm_choice_mode, 0,
421  "Allows to include alternative symbols choices in the hOCR output. "
422  "Valid input values are 0, 1 and 2. 0 is the default value. "
423  "With 1 the alternative symbol choices per timestep are included. "
424  "With 2 alternative symbol choices are extracted from the CTC "
425  "process instead of the lattice. The choices are mapped per "
426  "character.",
427  this->params())
428  , INT_MEMBER(lstm_choice_iterations, 5,
429  "Sets the number of cascading iterations for the Beamsearch in "
430  "lstm_choice_mode. Note that lstm_choice_mode must be set to a "
431  "value greater than 0 to produce results.",
432  this->params())
433  , double_MEMBER(lstm_rating_coefficient, 5,
434  "Sets the rating coefficient for the lstm choices. The smaller the "
435  "coefficient, the better are the ratings for each choice and less "
436  "information is lost due to the cut off at 0. The standard value is "
437  "5",
438  this->params())
439  , BOOL_MEMBER(pageseg_apply_music_mask, false,
440  "Detect music staff and remove intersecting components", this->params())
441  ,
442 
443  backup_config_file_(nullptr)
444  , pix_binary_(nullptr)
445  , pix_grey_(nullptr)
446  , pix_original_(nullptr)
447  , pix_thresholds_(nullptr)
448  , source_resolution_(0)
449  , textord_(this)
450  , right_to_left_(false)
451  , scaled_color_(nullptr)
452  , scaled_factor_(-1)
453  , deskew_(1.0f, 0.0f)
454  , reskew_(1.0f, 0.0f)
455  , most_recently_used_(this)
456  , font_table_size_(0)
457 #ifndef DISABLED_LEGACY_ENGINE
458  , equ_detect_(nullptr)
459 #endif // ndef DISABLED_LEGACY_ENGINE
460  , lstm_recognizer_(nullptr)
461  , train_line_page_num_(0) {}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:368
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:376
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:378
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:374
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:372
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:370
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:168
ParamsVectors * params()
Definition: ccutil.h:53

◆ ~Tesseract()

tesseract::Tesseract::~Tesseract ( )
override

Definition at line 463 of file tesseractclass.cpp.

463  {
464  Clear();
465  pix_original_.destroy();
466  end_tesseract();
467  for (auto *lang : sub_langs_) {
468  delete lang;
469  }
470  delete lstm_recognizer_;
471  lstm_recognizer_ = nullptr;
472 }
void destroy()
Definition: image.cpp:32
std::string lang
Definition: ccutil.h:59

Member Function Documentation

◆ acceptable_number_string()

bool tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 386 of file output.cpp.

386  {
387  bool prev_digit = false;
388 
389  if (*lengths == 1 && *s == '(') {
390  s++;
391  }
392 
393  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
394  s++;
395  }
396 
397  for (; *s != '\0'; s += *(lengths++)) {
398  if (unicharset.get_isdigit(s, *lengths)) {
399  prev_digit = true;
400  } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
401  prev_digit = false;
402  } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
403  ((*s == '%') || (*s == ')'))) {
404  return true;
405  } else if (prev_digit && *lengths == 1 && (*s == '%') &&
406  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
407  (*(s + *lengths + *(lengths + 1)) == '\0')) {
408  return true;
409  } else {
410  return false;
411  }
412  }
413  return true;
414 }
UNICHARSET unicharset
Definition: ccutil.h:61
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

◆ acceptable_word_string()

ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1704 of file control.cpp.

1705  {
1706  int i = 0;
1707  int offset = 0;
1708  int leading_punct_count;
1709  int upper_count = 0;
1710  int hyphen_pos = -1;
1711  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1712 
1713  if (strlen(lengths) > 20) {
1714  return word_type;
1715  }
1716 
1717  /* Single Leading punctuation char*/
1718 
1719  if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) {
1720  offset += lengths[i++];
1721  }
1722  leading_punct_count = i;
1723 
1724  /* Initial cap */
1725  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1726  offset += lengths[i++];
1727  upper_count++;
1728  }
1729  if (upper_count > 1) {
1730  word_type = AC_UPPER_CASE;
1731  } else {
1732  /* Lower case word, possibly with an initial cap */
1733  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1734  offset += lengths[i++];
1735  }
1736  if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
1737  goto not_a_word;
1738  }
1739  /*
1740 Allow a single hyphen in a lower case word
1741 - don't trust upper case - I've seen several cases of "H" -> "I-I"
1742 */
1743  if (lengths[i] == 1 && s[offset] == '-') {
1744  hyphen_pos = i;
1745  offset += lengths[i++];
1746  if (s[offset] != '\0') {
1747  while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) {
1748  offset += lengths[i++];
1749  }
1750  if (i < hyphen_pos + 3) {
1751  goto not_a_word;
1752  }
1753  }
1754  } else {
1755  /* Allow "'s" in NON hyphenated lower case words */
1756  if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 &&
1757  (s[offset + lengths[i]] == 's')) {
1758  offset += lengths[i++];
1759  offset += lengths[i++];
1760  }
1761  }
1762  if (upper_count > 0) {
1763  word_type = AC_INITIAL_CAP;
1764  } else {
1765  word_type = AC_LOWER_CASE;
1766  }
1767  }
1768 
1769  /* Up to two different, constrained trailing punctuation chars */
1770  if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) {
1771  offset += lengths[i++];
1772  }
1773  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
1774  chs_trailing_punct2.contains(s[offset])) {
1775  offset += lengths[i++];
1776  }
1777 
1778  if (s[offset] != '\0') {
1779  word_type = AC_UNACCEPTABLE;
1780  }
1781 
1782 not_a_word:
1783 
1784  if (word_type == AC_UNACCEPTABLE) {
1785  /* Look for abbreviation string */
1786  i = 0;
1787  offset = 0;
1788  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1789  word_type = AC_UC_ABBREV;
1790  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
1791  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1792  offset += lengths[i++];
1793  offset += lengths[i++];
1794  }
1795  } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1796  word_type = AC_LC_ABBREV;
1797  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
1798  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1799  offset += lengths[i++];
1800  offset += lengths[i++];
1801  }
1802  }
1803  if (s[offset] != '\0') {
1804  word_type = AC_UNACCEPTABLE;
1805  }
1806  }
1807 
1808  return word_type;
1809 }
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
@ AC_UC_ABBREV
A.B.C.
Definition: control.h:34
@ AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:32
@ AC_LC_ABBREV
a.b.c.
Definition: control.h:33
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:31
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:30

◆ alpha_count()

int16_t tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 483 of file reject.cpp.

483  {
484  int16_t i;
485  int16_t offset;
486  int16_t count = 0;
487 
488  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489  if (unicharset.get_isalpha(word + offset, word_lengths[i])) {
490  count++;
491  }
492  }
493  return count;
494 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497

◆ ambigs_classify_and_output()

void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT *  pr_it,
FILE *  output_file 
)

Definition at line 203 of file recogtraining.cpp.

204  {
205  // Classify word.
206  fflush(stdout);
207  WordData word_data(*pr_it);
208  SetupWordPassN(1, &word_data);
209  classify_word_and_language(1, pr_it, &word_data);
210  WERD_RES *werd_res = word_data.word;
211  WERD_CHOICE *best_choice = werd_res->best_choice;
212  ASSERT_HOST(best_choice != nullptr);
213 
214  // Compute the number of unichars in the label.
215  std::vector<UNICHAR_ID> encoding;
216  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
217  tprintf("Not outputting illegal unichar %s\n", label);
218  return;
219  }
220 
221  // Dump all paths through the ratings matrix (which is normally small).
222  int dim = werd_res->ratings->dimension();
223  const auto **blob_choices = new const BLOB_CHOICE *[dim];
224  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
225  delete[] blob_choices;
226 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1302
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:166
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239

◆ AnyLSTMLang()

bool tesseract::Tesseract::AnyLSTMLang ( ) const
inline

Definition at line 302 of file tesseractclass.h.

302  {
303  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
304  return true;
305  }
306  for (auto &lang : sub_langs_) {
307  if (lang->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
308  return true;
309  }
310  }
311  return false;
312  }
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:266

◆ AnyTessLang()

bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 290 of file tesseractclass.h.

290  {
291  if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
292  return true;
293  }
294  for (auto &lang : sub_langs_) {
295  if (lang->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
296  return true;
297  }
298  }
299  return false;
300  }

◆ ApplyBoxes()

PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const char *  filename,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 110 of file applybox.cpp.

111  {
112  std::vector<TBOX> boxes;
113  std::vector<std::string> texts, full_texts;
114  if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
115  return nullptr; // Can't do it.
116  }
117 
118  const int box_count = boxes.size();
119  int box_failures = 0;
120 
121  // In word mode, we use the boxes to make a word for each box, but
122  // in blob mode we use the existing words and maximally chop them first.
123  PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
124  clear_any_old_text(block_list);
125 
126  for (int i = 0; i < box_count; i++) {
127  bool foundit = false;
128  if (page_res != nullptr) {
129  foundit =
130  ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
131  (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
132  } else {
133  foundit = ResegmentWordBox(block_list, boxes[i],
134  (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
135  }
136  if (!foundit) {
137  box_failures++;
138  ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
139  }
140  }
141 
142  if (page_res == nullptr) {
143  // In word/line mode, we now maximally chop all the words and resegment
144  // them with the classifier.
145  page_res = SetupApplyBoxes(boxes, block_list);
146  ReSegmentByClassification(page_res);
147  }
148  if (applybox_debug > 0) {
149  tprintf("APPLY_BOXES:\n");
150  tprintf(" Boxes read from boxfile: %6d\n", box_count);
151  if (box_failures > 0) {
152  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
153  }
154  }
155  TidyUp(page_res);
156  return page_res;
157 }
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:75
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
void TidyUp(PAGE_RES *page_res)
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:310
void ReSegmentByClassification(PAGE_RES *page_res)
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
PAGE_RES * SetupApplyBoxes(const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:197

◆ ApplyBoxTraining()

void tesseract::Tesseract::ApplyBoxTraining ( const std::string &  fontname,
PAGE_RES *  page_res 
)

◆ AssignDiacriticsToNewBlobs()

void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const std::vector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
std::vector< bool > *  word_wanted,
std::vector< C_BLOB * > *  target_blobs 
)

Definition at line 1036 of file control.cpp.

1039  {
1040  std::vector<bool> blob_wanted;
1041  word_wanted->clear();
1042  word_wanted->resize(outlines.size());
1043  target_blobs->clear();
1044  target_blobs->resize(outlines.size());
1045  // Check for outlines that need to be turned into stand-alone blobs.
1046  for (unsigned i = 0; i < outlines.size(); ++i) {
1047  if (outlines[i] == nullptr) {
1048  continue;
1049  }
1050  // Get a set of adjacent outlines that don't overlap any existing blob.
1051  blob_wanted.clear();
1052  blob_wanted.resize(outlines.size());
1053  int num_blob_outlines = 0;
1054  TBOX total_ol_box(outlines[i]->bounding_box());
1055  while (i < outlines.size() && outlines[i] != nullptr) {
1056  blob_wanted[i] = true;
1057  total_ol_box += outlines[i]->bounding_box();
1058  ++i;
1059  ++num_blob_outlines;
1060  }
1061  // Find the insertion point.
1062  C_BLOB_IT blob_it(real_word->cblob_list());
1063  while (!blob_it.at_last() &&
1064  blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {
1065  blob_it.forward();
1066  }
1067  // Choose which combination of them we actually want and where to put
1068  // them.
1069  if (debug_noise_removal) {
1070  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1071  }
1072  C_BLOB *left_blob = blob_it.data();
1073  TBOX left_box = left_blob->bounding_box();
1074  C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1075  if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1076  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1077  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,
1078  num_blob_outlines, &blob_wanted)) {
1079  if (debug_noise_removal) {
1080  tprintf("Added to left blob\n");
1081  }
1082  for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1083  if (blob_wanted[j]) {
1084  (*word_wanted)[j] = true;
1085  (*target_blobs)[j] = left_blob;
1086  }
1087  }
1088  } else if (right_blob != nullptr &&
1089  (!left_box.x_overlap(total_ol_box) ||
1090  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1091  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,
1092  num_blob_outlines, &blob_wanted)) {
1093  if (debug_noise_removal) {
1094  tprintf("Added to right blob\n");
1095  }
1096  for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1097  if (blob_wanted[j]) {
1098  (*word_wanted)[j] = true;
1099  (*target_blobs)[j] = right_blob;
1100  }
1101  }
1102  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,
1103  num_blob_outlines, &blob_wanted)) {
1104  if (debug_noise_removal) {
1105  tprintf("Fitted between blobs\n");
1106  }
1107  for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1108  if (blob_wanted[j]) {
1109  (*word_wanted)[j] = true;
1110  (*target_blobs)[j] = nullptr;
1111  }
1112  }
1113  }
1114  }
1115 }
@ TBOX
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const std::vector< C_OUTLINE * > &outlines, int num_outlines, std::vector< bool > *ok_outlines)
Definition: control.cpp:1120

◆ AssignDiacriticsToOverlappingBlobs()

void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const std::vector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
std::vector< bool > *  word_wanted,
std::vector< bool > *  overlapped_any_blob,
std::vector< C_BLOB * > *  target_blobs 
)

Definition at line 981 of file control.cpp.

985  {
986  std::vector<bool> blob_wanted;
987  word_wanted->clear();
988  word_wanted->resize(outlines.size());
989  overlapped_any_blob->clear();
990  overlapped_any_blob->resize(outlines.size());
991  target_blobs->clear();
992  target_blobs->resize(outlines.size());
993  // For each real blob, find the outlines that seriously overlap it.
994  // A single blob could be several merged characters, so there can be quite
995  // a few outlines overlapping, and the full engine needs to be used to chop
996  // and join to get a sensible result.
997  C_BLOB_IT blob_it(real_word->cblob_list());
998  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
999  C_BLOB *blob = blob_it.data();
1000  const TBOX blob_box = blob->bounding_box();
1001  blob_wanted.clear();
1002  blob_wanted.resize(outlines.size());
1003  int num_blob_outlines = 0;
1004  for (unsigned i = 0; i < outlines.size(); ++i) {
1005  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) {
1006  blob_wanted[i] = true;
1007  (*overlapped_any_blob)[i] = true;
1008  ++num_blob_outlines;
1009  }
1010  }
1011  if (debug_noise_removal) {
1012  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1013  blob_box.print();
1014  }
1015  // If any outlines overlap the blob, and not too many, classify the blob
1016  // (using the full engine, languages and all), and choose the maximal
1017  // combination of outlines that doesn't hurt the end-result classification
1018  // by too much. Mark them as wanted.
1019  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1020  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,
1021  num_blob_outlines, &blob_wanted)) {
1022  for (unsigned i = 0; i < blob_wanted.size(); ++i) {
1023  if (blob_wanted[i]) {
1024  // Claim the outline and record where it is going.
1025  (*word_wanted)[i] = true;
1026  (*target_blobs)[i] = blob;
1027  }
1028  }
1029  }
1030  }
1031  }
1032 }

◆ AutoPageSeg()

int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract *  osd_tess,
OSResults *  osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritics/noise blobs, that would confuse layout analysis by causing textline overlap, are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 201 of file pagesegmain.cpp.

202  {
203  Image photomask_pix = nullptr;
204  Image musicmask_pix = nullptr;
205  // The blocks made by the ColumnFinder. Moved to blocks before return.
206  BLOCK_LIST found_blocks;
207  TO_BLOCK_LIST temp_blocks;
208 
209  ColumnFinder *finder = SetupPageSegAndDetectOrientation(
210  pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
211  pageseg_apply_music_mask ? &musicmask_pix : nullptr);
212  int result = 0;
213  if (finder != nullptr) {
214  TO_BLOCK_IT to_block_it(&temp_blocks);
215  TO_BLOCK *to_block = to_block_it.data();
216  if (musicmask_pix != nullptr) {
217  // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
218  // blocks separately. For now combine with photomask_pix.
219  photomask_pix |= musicmask_pix;
220  }
221 #ifndef DISABLED_LEGACY_ENGINE
222  if (equ_detect_) {
223  finder->SetEquationDetect(equ_detect_);
224  }
225 #endif // ndef DISABLED_LEGACY_ENGINE
226  result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
227  photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
228  &found_blocks, diacritic_blobs, to_blocks);
229  if (result >= 0) {
230  finder->GetDeskewVectors(&deskew_, &reskew_);
231  }
232  delete finder;
233  }
234  photomask_pix.destroy();
235  musicmask_pix.destroy();
236  if (result < 0) {
237  return result;
238  }
239 
240  blocks->clear();
241  BLOCK_IT block_it(blocks);
242  // Move the found blocks to the input/output blocks.
243  block_it.add_list_after(&found_blocks);
244  return result;
245 }
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Image *photo_mask_pix, Image *music_mask_pix)

◆ BelievableSuperscript()

bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES &  word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output.
[in] word: The word whose best_choice we're evaluating.
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 503 of file superscript.cpp.

504  {
505  unsigned initial_ok_run_count = 0;
506  unsigned ok_run_count = 0;
507  float worst_certainty = 0.0f;
508  const WERD_CHOICE &wc = *word.best_choice;
509 
510  const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table();
511  for (unsigned i = 0; i < wc.length(); i++) {
512  TBLOB *blob = word.rebuild_word->blobs[i];
513  UNICHAR_ID unichar_id = wc.unichar_id(i);
514  float char_certainty = wc.certainty(i);
515  bool bad_certainty = char_certainty < certainty_threshold;
516  bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
517  bool is_italic = word.fontinfo && word.fontinfo->is_italic();
518  BLOB_CHOICE *choice = word.GetBlobChoice(i);
519  if (choice && fontinfo_table.size() > 0) {
520  // Get better information from the specific choice, if available.
521  int font_id1 = choice->fontinfo_id();
522  bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.at(font_id1).is_italic() : false;
523  int font_id2 = choice->fontinfo_id2();
524  is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.at(font_id2).is_italic());
525  }
526 
527  float height_fraction = 1.0f;
528  float char_height = blob->bounding_box().height();
529  float normal_height = char_height;
530  if (wc.unicharset()->top_bottom_useful()) {
531  int min_bot, max_bot, min_top, max_top;
532  wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top);
533  float hi_height = max_top - max_bot;
534  float lo_height = min_top - min_bot;
535  normal_height = (hi_height + lo_height) / 2;
536  if (normal_height >= kBlnXHeight) {
537  // Only ding characters that we have decent information for because
538  // they're supposed to be normal sized, not tiny specks or dashes.
539  height_fraction = char_height / normal_height;
540  }
541  }
542  bool bad_height = height_fraction < superscript_scaledown_ratio;
543 
544  if (debug) {
545  if (is_italic) {
546  tprintf(" Rejecting: superscript is italic.\n");
547  }
548  if (is_punc) {
549  tprintf(" Rejecting: punctuation present.\n");
550  }
551  const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
552  if (bad_certainty) {
553  tprintf(
554  " Rejecting: don't believe character %s with certainty %.2f "
555  "which is less than threshold %.2f\n",
556  char_str, char_certainty, certainty_threshold);
557  }
558  if (bad_height) {
559  tprintf(
560  " Rejecting: character %s seems too small @ %.2f versus "
561  "expected %.2f\n",
562  char_str, char_height, normal_height);
563  }
564  }
565  if (bad_certainty || bad_height || is_punc || is_italic) {
566  if (ok_run_count == i) {
567  initial_ok_run_count = ok_run_count;
568  }
569  ok_run_count = 0;
570  } else {
571  ok_run_count++;
572  }
573  if (char_certainty < worst_certainty) {
574  worst_certainty = char_certainty;
575  }
576  }
577  bool all_ok = ok_run_count == wc.length();
578  if (all_ok && debug) {
579  tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
580  }
581  if (!all_ok) {
582  if (left_ok) {
583  *left_ok = initial_ok_run_count;
584  }
585  if (right_ok) {
586  *right_ok = ok_run_count;
587  }
588  }
589  return all_ok;
590 }
const int kBlnXHeight
Definition: normalis.h:33
int UNICHAR_ID
Definition: unichar.h:36
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:324

◆ BestPix()

Image tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 238 of file tesseractclass.h.

238  {
239  if (pixGetWidth(pix_original_) == ImageWidth()) {
240  return pix_original_;
241  } else if (pix_grey_ != nullptr) {
242  return pix_grey_;
243  } else {
244  return pix_binary_;
245  }
246  }

◆ bigram_correction_pass()

void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES *  page_res)

Definition at line 456 of file control.cpp.

456  {
457  PAGE_RES_IT word_it(page_res);
458 
459  WERD_RES *w_prev = nullptr;
460  WERD_RES *w = word_it.word();
461  while (true) {
462  w_prev = w;
463  while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {
464  // advance word_it, skipping over parts of combos
465  }
466  if (!word_it.word()) {
467  break;
468  }
469  w = word_it.word();
470  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
471  continue;
472  }
473  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
474  if (tessedit_bigram_debug) {
475  tprintf("Skipping because one of the words is W_REP_CHAR\n");
476  }
477  continue;
478  }
479  // Two words sharing the same language model, excellent!
480  std::vector<WERD_CHOICE *> overrides_word1;
481  std::vector<WERD_CHOICE *> overrides_word2;
482 
483  const auto orig_w1_str = w_prev->best_choice->unichar_string();
484  const auto orig_w2_str = w->best_choice->unichar_string();
485  WERD_CHOICE prev_best(w->uch_set);
486  {
487  int w1start, w1end;
488  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
489  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
490  }
491  WERD_CHOICE this_best(w->uch_set);
492  {
493  int w2start, w2end;
494  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
495  this_best = w->best_choice->shallow_copy(w2start, w2end);
496  }
497 
498  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
499  if (tessedit_bigram_debug) {
500  tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
501  orig_w2_str.c_str());
502  }
503  continue;
504  }
505  if (tessedit_bigram_debug > 2) {
506  tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
507  }
508  if (tessedit_bigram_debug > 1) {
509  if (!w_prev->best_choices.singleton()) {
510  w_prev->PrintBestChoices();
511  }
512  if (!w->best_choices.singleton()) {
513  w->PrintBestChoices();
514  }
515  }
516  float best_rating = 0.0;
517  int best_idx = 0;
518  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
519  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
520  WERD_CHOICE *p1 = prev_it.data();
521  WERD_CHOICE strip1(w->uch_set);
522  {
523  int p1start, p1end;
524  p1->GetNonSuperscriptSpan(&p1start, &p1end);
525  strip1 = p1->shallow_copy(p1start, p1end);
526  }
527  WERD_CHOICE_IT w_it(&w->best_choices);
528  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
529  WERD_CHOICE *p2 = w_it.data();
530  WERD_CHOICE strip2(w->uch_set);
531  {
532  int p2start, p2end;
533  p2->GetNonSuperscriptSpan(&p2start, &p2end);
534  strip2 = p2->shallow_copy(p2start, p2end);
535  }
536  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
537  overrides_word1.push_back(p1);
538  overrides_word2.push_back(p2);
539  if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {
540  best_rating = p1->rating() + p2->rating();
541  best_idx = overrides_word1.size() - 1;
542  }
543  }
544  }
545  }
546  if (!overrides_word1.empty()) {
547  // Excellent, we have some bigram matches.
548  if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&
549  EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {
550  if (tessedit_bigram_debug > 1) {
551  tprintf(
552  "Top choice \"%s %s\" verified (sans case) by bigram "
553  "model.\n",
554  orig_w1_str.c_str(), orig_w2_str.c_str());
555  }
556  continue;
557  }
558  const auto new_w1_str = overrides_word1[best_idx]->unichar_string();
559  const auto new_w2_str = overrides_word2[best_idx]->unichar_string();
560  if (new_w1_str != orig_w1_str) {
561  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
562  }
563  if (new_w2_str != orig_w2_str) {
564  w->ReplaceBestChoice(overrides_word2[best_idx]);
565  }
566  if (tessedit_bigram_debug > 0) {
567  std::string choices_description;
568  int num_bigram_choices = overrides_word1.size() * overrides_word2.size();
569  if (num_bigram_choices == 1) {
570  choices_description = "This was the unique bigram choice.";
571  } else {
572  if (tessedit_bigram_debug > 1) {
573  std::string bigrams_list;
574  const int kMaxChoicesToPrint = 20;
575  for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {
576  if (i > 0) {
577  bigrams_list += ", ";
578  }
579  WERD_CHOICE *p1 = overrides_word1[i];
580  WERD_CHOICE *p2 = overrides_word2[i];
581  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
582  }
583  choices_description = "There were many choices: {";
584  choices_description += bigrams_list;
585  choices_description += "}";
586  } else {
587  choices_description += "There were " + std::to_string(num_bigram_choices);
588  choices_description += " compatible bigrams.";
589  }
590  }
591  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
592  orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
593  choices_description.c_str());
594  }
595  }
596  }
597 }
@ W_REP_CHAR
repeated character
Definition: werd.h:40
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:773

◆ blamer_pass()

void tesseract::Tesseract::blamer_pass ( PAGE_RES *  page_res)

Definition at line 683 of file control.cpp.

683  {
684  if (!wordrec_run_blamer) {
685  return;
686  }
687  PAGE_RES_IT page_res_it(page_res);
688  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
689  WERD_RES *word = page_res_it.word();
690  BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
691  page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
692  }
693  tprintf("Blame reasons:\n");
694  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
695  tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(static_cast<IncorrectResultReason>(bl)),
696  page_res->blame_reasons[bl]);
697  }
698  if (page_res->misadaption_log.size() > 0) {
699  tprintf("Misadaption log:\n");
700  for (auto &log : page_res->misadaption_log) {
701  tprintf("%s\n", log.c_str());
702  }
703  }
704 }
IncorrectResultReason
Definition: blamer.h:56
@ IRR_NUM_REASONS
Definition: blamer.h:103
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:540

◆ blob_feature_display()

void tesseract::Tesseract::blob_feature_display ( PAGE_RES *  page_res,
const TBOX &  selection_box 
)

Definition at line 912 of file pgedit.cpp.

912  {
913 # ifndef DISABLED_LEGACY_ENGINE
914  PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);
915  if (it != nullptr) {
916  WERD_RES *word_res = it->word();
917  word_res->x_height = it->row()->row->x_height();
918  word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
919  classify_bln_numeric_mode, textord_use_cjk_fp_model,
920  poly_allow_detailed_fx, it->row()->row, it->block()->block);
921  TWERD *bln_word = word_res->chopped_word;
922  TBLOB *bln_blob = bln_word->blobs[0];
923  INT_FX_RESULT_STRUCT fx_info;
924  std::vector<INT_FEATURE_STRUCT> bl_features;
925  std::vector<INT_FEATURE_STRUCT> cn_features;
926  Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features, &cn_features,
927  &fx_info, nullptr);
928  // Display baseline features.
929  ScrollView *bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
930  ClearFeatureSpaceWindow(baseline, bl_win);
931  for (auto &bl_feature : bl_features) {
932  RenderIntFeature(bl_win, &bl_feature, ScrollView::GREEN);
933  }
934  bl_win->Update();
935  // Display cn features.
936  ScrollView *cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
937  ClearFeatureSpaceWindow(character, cn_win);
938  for (auto &cn_feature : cn_features) {
939  RenderIntFeature(cn_win, &cn_feature, ScrollView::GREEN);
940  }
941  cn_win->Update();
942 
943  it->DeleteCurrentWord();
944  delete it;
945  }
946 # endif // ndef DISABLED_LEGACY_ENGINE
947 }
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:38
@ character
Definition: mfoutline.h:53
@ baseline
Definition: mfoutline.h:53
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1622
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1500
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:887
Image BestPix() const
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, std::vector< INT_FEATURE_STRUCT > *bl_features, std::vector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, std::vector< int > *outline_cn_counts)
Definition: intfx.cpp:436

◆ blob_noise_score()

float tesseract::Tesseract::blob_noise_score ( TBLOB *  blob)

Definition at line 772 of file fixspace.cpp.

772  {
773  TBOX box; // BB of outline
774  int16_t outline_count = 0;
775  int16_t max_dimension;
776  int16_t largest_outline_dimension = 0;
777 
778  for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
779  outline_count++;
780  box = ol->bounding_box();
781  if (box.height() > box.width()) {
782  max_dimension = box.height();
783  } else {
784  max_dimension = box.width();
785  }
786 
787  if (largest_outline_dimension < max_dimension) {
788  largest_outline_dimension = max_dimension;
789  }
790  }
791 
792  if (outline_count > 5) {
793  // penalise LOTS of blobs
794  largest_outline_dimension *= 2;
795  }
796 
797  box = blob->bounding_box();
798  if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) {
799  // Lax blob is if high or low
800  largest_outline_dimension /= 2;
801  }
802 
803  return largest_outline_dimension;
804 }
const int kBlnBaselineOffset
Definition: normalis.h:34

◆ BOOL_VAR_H() [1/91]

tesseract::Tesseract::BOOL_VAR_H ( applybox_learn_chars_and_char_frags_mode  )

◆ BOOL_VAR_H() [2/91]

tesseract::Tesseract::BOOL_VAR_H ( applybox_learn_ngrams_mode  )

◆ BOOL_VAR_H() [3/91]

tesseract::Tesseract::BOOL_VAR_H ( bland_unrej  )

◆ BOOL_VAR_H() [4/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_accept_ok  )

◆ BOOL_VAR_H() [5/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_early_convert_bad_unlv_chs  )

◆ BOOL_VAR_H() [6/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_early_merge_tess_fails  )

◆ BOOL_VAR_H() [7/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_include_numerals  )

◆ BOOL_VAR_H() [8/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_leave_accept_strings  )

◆ BOOL_VAR_H() [9/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_leave_ok_strings  )

◆ BOOL_VAR_H() [10/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_terrible_garbage  )

◆ BOOL_VAR_H() [11/91]

tesseract::Tesseract::BOOL_VAR_H ( enable_noise_removal  )

◆ BOOL_VAR_H() [12/91]

tesseract::Tesseract::BOOL_VAR_H ( hocr_char_boxes  )

◆ BOOL_VAR_H() [13/91]

tesseract::Tesseract::BOOL_VAR_H ( hocr_font_info  )

◆ BOOL_VAR_H() [14/91]

tesseract::Tesseract::BOOL_VAR_H ( interactive_display_mode  )

◆ BOOL_VAR_H() [15/91]

tesseract::Tesseract::BOOL_VAR_H ( lstm_use_matrix  )

◆ BOOL_VAR_H() [16/91]

tesseract::Tesseract::BOOL_VAR_H ( pageseg_apply_music_mask  )

◆ BOOL_VAR_H() [17/91]

tesseract::Tesseract::BOOL_VAR_H ( paragraph_text_based  )

◆ BOOL_VAR_H() [18/91]

tesseract::Tesseract::BOOL_VAR_H ( poly_allow_detailed_fx  )

◆ BOOL_VAR_H() [19/91]

tesseract::Tesseract::BOOL_VAR_H ( preserve_interword_spaces  )

◆ BOOL_VAR_H() [20/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_1Il_trust_permuter_type  )

◆ BOOL_VAR_H() [21/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_1Il_use_dict_word  )

◆ BOOL_VAR_H() [22/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_alphas_in_number_perm  )

◆ BOOL_VAR_H() [23/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_trust_doc_dawg  )

◆ BOOL_VAR_H() [24/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_use_good_perm  )

◆ BOOL_VAR_H() [25/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_use_sensible_wd  )

◆ BOOL_VAR_H() [26/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_use_tess_accepted  )

◆ BOOL_VAR_H() [27/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_use_tess_blanks  )

◆ BOOL_VAR_H() [28/91]

tesseract::Tesseract::BOOL_VAR_H ( suspect_constrain_1Il  )

◆ BOOL_VAR_H() [29/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_adaption_debug  )

◆ BOOL_VAR_H() [30/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_ambigs_training  )

◆ BOOL_VAR_H() [31/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_alto  )

◆ BOOL_VAR_H() [32/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_boxfile  )

◆ BOOL_VAR_H() [33/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_hocr  )

◆ BOOL_VAR_H() [34/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_lstmbox  )

◆ BOOL_VAR_H() [35/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_pdf  )

◆ BOOL_VAR_H() [36/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_tsv  )

◆ BOOL_VAR_H() [37/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_txt  )

◆ BOOL_VAR_H() [38/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_wordstrbox  )

◆ BOOL_VAR_H() [39/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_debug_block_rejection  )

◆ BOOL_VAR_H() [40/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_debug_doc_rejection  )

◆ BOOL_VAR_H() [41/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_debug_fonts  )

◆ BOOL_VAR_H() [42/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_debug_quality_metrics  )

◆ BOOL_VAR_H() [43/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_display_outwords  )

◆ BOOL_VAR_H() [44/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_do_invert  )

◆ BOOL_VAR_H() [45/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_dont_blkrej_good_wds  )

◆ BOOL_VAR_H() [46/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_dont_rowrej_good_wds  )

◆ BOOL_VAR_H() [47/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_dump_choices  )

◆ BOOL_VAR_H() [48/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_dump_pageseg_images  )

◆ BOOL_VAR_H() [49/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_enable_bigram_correction  )

◆ BOOL_VAR_H() [50/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_enable_dict_correction  )

◆ BOOL_VAR_H() [51/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_enable_doc_dict  )

◆ BOOL_VAR_H() [52/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_fix_fuzzy_spaces  )

◆ BOOL_VAR_H() [53/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_fix_hyphens  )

◆ BOOL_VAR_H() [54/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_flip_0O  )

◆ BOOL_VAR_H() [55/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_good_quality_unrej  )

◆ BOOL_VAR_H() [56/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_init_config_only  )

◆ BOOL_VAR_H() [57/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_make_boxes_from_boxes  )

◆ BOOL_VAR_H() [58/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_minimal_rej_pass1  )

◆ BOOL_VAR_H() [59/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_minimal_rejection  )

◆ BOOL_VAR_H() [60/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_override_permuter  )

◆ BOOL_VAR_H() [61/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_prefer_joined_punct  )

◆ BOOL_VAR_H() [62/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_preserve_blk_rej_perfect_wds  )

◆ BOOL_VAR_H() [63/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_preserve_row_rej_perfect_wds  )

◆ BOOL_VAR_H() [64/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_reject_bad_qual_wds  )

◆ BOOL_VAR_H() [65/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_rejection_debug  )

◆ BOOL_VAR_H() [66/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_resegment_from_boxes  )

◆ BOOL_VAR_H() [67/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_resegment_from_line_boxes  )

◆ BOOL_VAR_H() [68/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_row_rej_good_docs  )

◆ BOOL_VAR_H() [69/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_test_adaption  )

◆ BOOL_VAR_H() [70/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_timing_debug  )

◆ BOOL_VAR_H() [71/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_train_from_boxes  )

◆ BOOL_VAR_H() [72/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_train_line_recognizer  )

◆ BOOL_VAR_H() [73/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_unrej_any_wd  )

◆ BOOL_VAR_H() [74/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_use_primary_params_model  )

◆ BOOL_VAR_H() [75/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_use_reject_spaces  )

◆ BOOL_VAR_H() [76/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_word_for_word  )

◆ BOOL_VAR_H() [77/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_write_block_separators  )

◆ BOOL_VAR_H() [78/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_write_images  )

◆ BOOL_VAR_H() [79/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_write_rep_codes  )

◆ BOOL_VAR_H() [80/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_write_unlv  )

◆ BOOL_VAR_H() [81/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_zero_kelvin_rejection  )

◆ BOOL_VAR_H() [82/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_zero_rejection  )

◆ BOOL_VAR_H() [83/91]

tesseract::Tesseract::BOOL_VAR_H ( test_pt  )

◆ BOOL_VAR_H() [84/91]

tesseract::Tesseract::BOOL_VAR_H ( textonly_pdf  )

◆ BOOL_VAR_H() [85/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_equation_detect  )

◆ BOOL_VAR_H() [86/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_tabfind_force_vertical_text  )

◆ BOOL_VAR_H() [87/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_tabfind_show_vlines  )

◆ BOOL_VAR_H() [88/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_tabfind_vertical_text  )

◆ BOOL_VAR_H() [89/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_use_cjk_fp_model  )

◆ BOOL_VAR_H() [90/91]

tesseract::Tesseract::BOOL_VAR_H ( thresholding_debug  )

◆ BOOL_VAR_H() [91/91]

tesseract::Tesseract::BOOL_VAR_H ( unlv_tilde_crunching  )

◆ break_noisiest_blob_word()

void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.

Definition at line 621 of file fixspace.cpp.

621  {
622  WERD_RES_IT word_it(&words);
623  WERD_RES_IT worst_word_it;
624  float worst_noise_score = 9999;
625  int worst_blob_index = -1; // Noisiest blob of noisiest wd
626  int blob_index; // of wds noisiest blob
627  float noise_score; // of wds noisiest blob
628  WERD_RES *word_res;
629  C_BLOB_IT blob_it;
630  C_BLOB_IT rej_cblob_it;
631  C_BLOB_LIST new_blob_list;
632  C_BLOB_IT new_blob_it;
633  C_BLOB_IT new_rej_cblob_it;
634  WERD *new_word;
635  int16_t start_of_noise_blob;
636  int16_t i;
637 
638  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
639  blob_index = worst_noise_blob(word_it.data(), &noise_score);
640  if (blob_index > -1 && worst_noise_score > noise_score) {
641  worst_noise_score = noise_score;
642  worst_blob_index = blob_index;
643  worst_word_it = word_it;
644  }
645  }
646  if (worst_blob_index < 0) {
647  words.clear(); // signal termination
648  return;
649  }
650 
651  /* Now split the worst_word_it */
652 
653  word_res = worst_word_it.data();
654 
655  /* Move blobs before noise blob to a new bloblist */
656 
657  new_blob_it.set_to_list(&new_blob_list);
658  blob_it.set_to_list(word_res->word->cblob_list());
659  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
660  new_blob_it.add_after_then_move(blob_it.extract());
661  }
662  start_of_noise_blob = blob_it.data()->bounding_box().left();
663  delete blob_it.extract(); // throw out noise blob
664 
665  new_word = new WERD(&new_blob_list, word_res->word);
666  new_word->set_flag(W_EOL, false);
667  word_res->word->set_flag(W_BOL, false);
668  word_res->word->set_blanks(1); // After break
669 
670  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
671  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
672  for (; (!rej_cblob_it.empty() &&
673  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
674  rej_cblob_it.forward()) {
675  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
676  }
677 
678  auto *new_word_res = new WERD_RES(new_word);
679  new_word_res->combination = true;
680  worst_word_it.add_before_then_move(new_word_res);
681 
682  word_res->ClearResults();
683 }
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:685

◆ build_menu_new()

SVMenuNode * tesseract::Tesseract::build_menu_new ( )

build_menu()

Construct the menu tree used by the command window

Definition at line 274 of file pgedit.cpp.

274  {
275  SVMenuNode *parent_menu;
276  auto *root_menu_item = new SVMenuNode();
277 
278  SVMenuNode *modes_menu_item = root_menu_item->AddChild("MODES");
279 
280  modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
281  modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
282  modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
283  modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
284  modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
285  modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
286  modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
287  modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
288 
289  parent_menu = root_menu_item->AddChild("DISPLAY");
290 
291  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
292  parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
293  parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
294  parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
295  parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
296  parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
297  parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
298  parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
299  parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
300  parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
301  parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
302  parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
303  parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
304  parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
305  parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
306 
307  parent_menu = root_menu_item->AddChild("OTHER");
308 
309  parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
310  parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
311  parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
312  parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
313  parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
314  parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
315 
316  return root_menu_item;
317 }
@ SHOW_SUBSCRIPT_CMD_EVENT
Definition: pgedit.cpp:69
@ DEBUG_WERD_CMD_EVENT
Definition: pgedit.cpp:53
@ SHOW_UNDERLINE_CMD_EVENT
Definition: pgedit.cpp:73
@ SHOW_SERIF_CMD_EVENT
Definition: pgedit.cpp:75
@ BASELINES_CMD_EVENT
Definition: pgedit.cpp:62
@ SHOW_BOLD_CMD_EVENT
Definition: pgedit.cpp:72
@ BLAMER_CMD_EVENT
Definition: pgedit.cpp:54
@ SHOW_BLN_WERD_CMD_EVENT
Definition: pgedit.cpp:52
@ RECOG_PSEUDO
Definition: pgedit.cpp:67
@ SHOW_SUPERSCRIPT_CMD_EVENT
Definition: pgedit.cpp:70
@ BL_NORM_CMD_EVENT
Definition: pgedit.cpp:58
@ REFRESH_CMD_EVENT
Definition: pgedit.cpp:64
@ BITMAP_CMD_EVENT
Definition: pgedit.cpp:59
@ DUMP_WERD_CMD_EVENT
Definition: pgedit.cpp:50
@ SHOW_BLOB_FEATURES
Definition: pgedit.cpp:68
@ SHOW_POINT_CMD_EVENT
Definition: pgedit.cpp:51
@ IMAGE_CMD_EVENT
Definition: pgedit.cpp:60
@ RECOG_WERDS
Definition: pgedit.cpp:66
@ SHOW_DROPCAPS_CMD_EVENT
Definition: pgedit.cpp:77
@ SHOW_FIXEDPITCH_CMD_EVENT
Definition: pgedit.cpp:74
@ CHANGE_DISP_CMD_EVENT
Definition: pgedit.cpp:49
@ CORRECT_TEXT_CMD_EVENT
Definition: pgedit.cpp:56
@ BOUNDING_BOX_CMD_EVENT
Definition: pgedit.cpp:55
@ BLOCKS_CMD_EVENT
Definition: pgedit.cpp:61
@ POLYGONAL_CMD_EVENT
Definition: pgedit.cpp:57
@ UNIFORM_DISP_CMD_EVENT
Definition: pgedit.cpp:63
@ QUIT_CMD_EVENT
Definition: pgedit.cpp:65
@ SHOW_SMALLCAPS_CMD_EVENT
Definition: pgedit.cpp:76
@ SHOW_ITALIC_CMD_EVENT
Definition: pgedit.cpp:71

◆ check_debug_pt()

bool tesseract::Tesseract::check_debug_pt ( WERD_RES *  word,
int  location 
)

Definition at line 1811 of file control.cpp.

1811  {
1812  bool show_map_detail = false;
1813  int16_t i;
1814 
1815  if (!test_pt) {
1816  return false;
1817  }
1818 
1819  tessedit_rejection_debug.set_value(false);
1820  debug_x_ht_level.set_value(0);
1821 
1822  if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
1823  if (location < 0) {
1824  return true; // For breakpoint use
1825  }
1826  tessedit_rejection_debug.set_value(true);
1827  debug_x_ht_level.set_value(2);
1828  tprintf("\n\nTESTWD::");
1829  switch (location) {
1830  case 0:
1831  tprintf("classify_word_pass1 start\n");
1832  word->word->print();
1833  break;
1834  case 10:
1835  tprintf("make_reject_map: initial map");
1836  break;
1837  case 20:
1838  tprintf("make_reject_map: after NN");
1839  break;
1840  case 30:
1841  tprintf("classify_word_pass2 - START");
1842  break;
1843  case 40:
1844  tprintf("classify_word_pass2 - Pre Xht");
1845  break;
1846  case 50:
1847  tprintf("classify_word_pass2 - END");
1848  show_map_detail = true;
1849  break;
1850  case 60:
1851  tprintf("fixspace");
1852  break;
1853  case 70:
1854  tprintf("MM pass START");
1855  break;
1856  case 80:
1857  tprintf("MM pass END");
1858  break;
1859  case 90:
1860  tprintf("After Poor quality rejection");
1861  break;
1862  case 100:
1863  tprintf("unrej_good_quality_words - START");
1864  break;
1865  case 110:
1866  tprintf("unrej_good_quality_words - END");
1867  break;
1868  case 120:
1869  tprintf("Write results pass");
1870  show_map_detail = true;
1871  break;
1872  }
1873  if (word->best_choice != nullptr) {
1874  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
1875  word->reject_map.print(debug_fp);
1876  tprintf("\n");
1877  if (show_map_detail) {
1878  tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
1879  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1880  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1881  word->reject_map[i].full_print(debug_fp);
1882  }
1883  }
1884  } else {
1885  tprintf("null best choice\n");
1886  }
1887  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1888  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1889  return true;
1890  } else {
1891  return false;
1892  }
1893 }
FILE * debug_fp
Definition: tessvars.cpp:24

◆ classify_word_and_language()

void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT *  pr_it,
WordData *  word_data 
)

Definition at line 1302 of file control.cpp.

1302  {
1303 #ifdef DISABLED_LEGACY_ENGINE
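1304  WordRecognizer recognizer = &Tesseract::classify_word_pass1;
1305 #else
1306  WordRecognizer recognizer =
1307  pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;
1308 #endif // def DISABLED_LEGACY_ENGINE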
1309 
1310  // Best result so far.
1311  PointerVector<WERD_RES> best_words;
1312  // Points to the best result. May be word or in lang_words.
1313  const WERD_RES *word = word_data->word;
1314  clock_t start_t = clock();
1315  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1316  if (debug) {
1317  tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
1318  most_recently_used_->lang.c_str());
1319  word->word->bounding_box().print();
1320  }
1321  if (word->done) {
1322  // If done on pass1, leave it as-is.
1323  if (!word->tess_failed) {
1324  most_recently_used_ = word->tesseract;
1325  }
1326  return;
1327  }
1328  auto sub = sub_langs_.size();
1329  if (most_recently_used_ != this) {
1330  // Get the index of the most_recently_used_.
1331  for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
1332  }
1333  }
1334  most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
1335  &best_words);
1336  Tesseract *best_lang_tess = most_recently_used_;
1337  if (!WordsAcceptable(best_words)) {
1338  // Try all the other languages to see if they are any better.
1339  if (most_recently_used_ != this &&
1340  this->RetryWithLanguage(*word_data, recognizer, debug,
1341  &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
1342  best_lang_tess = this;
1343  }
1344  for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
1345  if (most_recently_used_ != sub_langs_[i] &&
1346  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
1347  &best_words) > 0) {
1348  best_lang_tess = sub_langs_[i];
1349  }
1350  }
1351  }
1352  most_recently_used_ = best_lang_tess;
1353  if (!best_words.empty()) {
1354  if (best_words.size() == 1 && !best_words[0]->combination) {
1355  // Move the best single result to the main word.
1356  word_data->word->ConsumeWordResults(best_words[0]);
1357  } else {
1358  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1359  word_data->word = best_words.back();
1360  pr_it->ReplaceCurrentWord(&best_words);
1361  }
1362  ASSERT_HOST(word_data->word->box_word != nullptr);
1363  } else {
1364  tprintf("no best words!!\n");
1365  }
1366  clock_t ocr_t = clock();
1367  if (tessedit_timing_debug) {
1368  tprintf("%s (ocr took %.2f sec)\n", word_data->word->best_choice->unichar_string().c_str(),
1369  static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
1370  }
1371 }
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1379
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:873
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1535

◆ classify_word_pass1()

void tesseract::Tesseract::classify_word_pass1 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1379 of file control.cpp.

1380  {
1381  ROW *row = word_data.row;
1382  BLOCK *block = word_data.block;
1383  prev_word_best_choice_ =
1384  word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1385 #ifdef DISABLED_LEGACY_ENGINE
1386  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1387 #else
1388  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
1389  tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
1390 #endif // def DISABLED_LEGACY_ENGINE
1391  if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1392  LSTMRecognizeWord(*block, row, *in_word, out_words);
1393  if (!out_words->empty()) {
1394  return; // Successful lstm recognition.
1395  }
1396  }
1397  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1398  // No fallback allowed, so use a fake.
1399  (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1400  return;
1401  }
1402 
1403 #ifndef DISABLED_LEGACY_ENGINE
1404  // Fall back to tesseract for failed words or odd words.
1405  (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,
1406  classify_bln_numeric_mode, textord_use_cjk_fp_model,
1407  poly_allow_detailed_fx, row, block);
1408 #endif // ndef DISABLED_LEGACY_ENGINE
1409  }
1410 
1411 #ifndef DISABLED_LEGACY_ENGINE
1412  WERD_RES *word = *in_word;
1413  match_word_pass_n(1, word, row, block);
1414  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1415  word->tess_would_adapt = AdaptableWord(word);
1416  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1417 
1418  if (adapt_ok) {
1419  // Send word to adaptive classifier for training.
1420  word->BestChoiceToCorrectText();
1421  LearnWord(nullptr, word);
1422  // Mark misadaptions if running blamer.
1423  if (word->blamer_bundle != nullptr) {
1424  word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);
1425  }
1426  }
1427 
1428  if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
1429  tess_add_doc_word(word->best_choice);
1430  }
1431  }
1432 #endif // ndef DISABLED_LEGACY_ENGINE
1433 }
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:268
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:230
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1589
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:73
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:34
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:811
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262
const UNICHARSET & GetUnicharset() const
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387

◆ classify_word_pass2()

void tesseract::Tesseract::classify_word_pass2 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1535 of file control.cpp.

1536  {
1537  // Return if we do not want to run Tesseract.
1538  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1539  return;
1540  }
1541 #ifndef DISABLED_LEGACY_ENGINE
1542  ROW *row = word_data.row;
1543  BLOCK *block = word_data.block;
1544  WERD_RES *word = *in_word;
1545  prev_word_best_choice_ =
1546  word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1547 
1548  check_debug_pt(word, 30);
1549  if (!word->done) {
1550  word->caps_height = 0.0;
1551  if (word->x_height == 0.0f) {
1552  word->x_height = row->x_height();
1553  }
1554  match_word_pass_n(2, word, row, block);
1555  check_debug_pt(word, 40);
1556  }
1557 
1558  SubAndSuperscriptFix(word);
1559 
1560  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1561  if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
1562  block->classify_rotation().y() == 0.0f) {
1563  // Use the tops and bottoms since they are available.
1564  TrainedXheightFix(word, block, row);
1565  }
1566  }
1567 # ifndef GRAPHICS_DISABLED
1568  if (tessedit_display_outwords) {
1569  if (fx_win == nullptr) {
1570  create_fx_win();
1571  }
1572  clear_fx_win();
1573  word->rebuild_word->plot(fx_win);
1574  TBOX wbox = word->rebuild_word->bounding_box();
1575  fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
1576  ScrollView::Update();
1577  }
1578 # endif
1579  check_debug_pt(word, 50);
1580 #endif // ndef DISABLED_LEGACY_ENGINE
1581 }
void clear_fx_win()
Definition: drawfx.cpp:61
void create_fx_win()
Definition: drawfx.cpp:50
ScrollView * fx_win
Definition: drawfx.cpp:42
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1811
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1455
bool script_has_xheight() const
Definition: unicharset.h:959
bool top_bottom_useful() const
Definition: unicharset.h:555
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:759
static void Update()
Definition: scrollview.cpp:713

◆ ClassifyBlobAsWord()

float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
std::string &  best_str,
float *  c2 
)

Definition at line 1252 of file control.cpp.

1253  {
1254  WERD *real_word = pr_it->word()->word;
1255  WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),
1256  C_BLOB::deep_copy(blob));
1257  WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1258  // Get a new iterator that points to the new word.
1259  PAGE_RES_IT it(pr_it->page_res);
1260  while (it.word() != word_res && it.word() != nullptr) {
1261  it.forward();
1262  }
1263  ASSERT_HOST(it.word() == word_res);
1264  WordData wd(it);
1265  // Force full initialization.
1266  SetupWordPassN(1, &wd);
1267  classify_word_and_language(pass_n, &it, &wd);
1268  if (debug_noise_removal) {
1269  if (wd.word->raw_choice != nullptr) {
1270  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(),
1271  wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height());
1272  } else {
1273  tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1274  wd.row->x_height());
1275  }
1276  }
1277  float cert = 0.0f;
1278  if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1279  cert = wd.word->raw_choice->certainty();
1280  float rat = wd.word->raw_choice->rating();
1281  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1282  best_str = wd.word->raw_choice->unichar_string();
1283  } else {
1284  *c2 = 0.0f;
1285  best_str.clear();
1286  }
1287  it.DeleteCurrentWord();
1288  pr_it->ResetWordIterator();
1289  return cert;
1290 }
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:118

◆ ClassifyBlobPlusOutlines()

float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const std::vector< bool > &  ok_outlines,
const std::vector< C_OUTLINE * > &  outlines,
int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
std::string &  best_str 
)

Definition at line 1207 of file control.cpp.

1209  {
1210  C_OUTLINE_IT ol_it;
1211  C_OUTLINE *first_to_keep = nullptr;
1212  C_BLOB *local_blob = nullptr;
1213  if (blob != nullptr) {
1214  // Add the required outlines to the blob.
1215  ol_it.set_to_list(blob->out_list());
1216  first_to_keep = ol_it.data();
1217  }
1218  for (unsigned i = 0; i < ok_outlines.size(); ++i) {
1219  if (ok_outlines[i]) {
1220  // This outline is to be added.
1221  if (blob == nullptr) {
1222  local_blob = new C_BLOB(outlines[i]);
1223  blob = local_blob;
1224  ol_it.set_to_list(blob->out_list());
1225  } else {
1226  ol_it.add_before_stay_put(outlines[i]);
1227  }
1228  }
1229  }
1230  float c2;
1231  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1232  ol_it.move_to_first();
1233  if (first_to_keep == nullptr) {
1234  // We created blob. Empty its outlines and delete it.
1235  for (; !ol_it.empty(); ol_it.forward()) {
1236  ol_it.extract();
1237  }
1238  delete local_blob;
1239  cert = -c2;
1240  } else {
1241  // Remove the outlines that we put in.
1242  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1243  ol_it.extract();
1244  }
1245  }
1246  return cert;
1247 }
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, float *c2)
Definition: control.cpp:1252

◆ Clear()

void tesseract::Tesseract::Clear ( )

Definition at line 483 of file tesseractclass.cpp.

483  {
484  std::string debug_name = imagebasename + "_debug.pdf";
485  pixa_debug_.WritePDF(debug_name.c_str());
486  pix_binary_.destroy();
487  pix_grey_.destroy();
488  pix_thresholds_.destroy();
489  scaled_color_.destroy();
490  deskew_ = FCOORD(1.0f, 0.0f);
491  reskew_ = FCOORD(1.0f, 0.0f);
492  splitter_.Clear();
493  scaled_factor_ = -1;
494  for (auto &sub_lang : sub_langs_) {
495  sub_lang->Clear();
496  }
497 }
void WritePDF(const char *filename)
Definition: debugpixa.h:42
std::string imagebasename
Definition: ccutil.h:58

◆ ComputeCompatibleXheight()

float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES *  word_res,
float *  baseline_shift 
)

Definition at line 105 of file fixxht.cpp.

105  {
106  STATS top_stats(0, UINT8_MAX);
107  STATS shift_stats(-UINT8_MAX, UINT8_MAX);
108  int bottom_shift = 0;
109  int num_blobs = word_res->rebuild_word->NumBlobs();
110  do {
111  top_stats.clear();
112  shift_stats.clear();
113  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
114  TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
115  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
116  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
117  int top = blob->bounding_box().top() + bottom_shift;
118  // Clip the top to the limit of normalized feature space.
119  if (top >= INT_FEAT_RANGE) {
120  top = INT_FEAT_RANGE - 1;
121  }
122  int bottom = blob->bounding_box().bottom() + bottom_shift;
123  int min_bottom, max_bottom, min_top, max_top;
124  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
125  // Chars with a wild top range would mess up the result so ignore them.
126  if (max_top - min_top > kMaxCharTopRange) {
127  continue;
128  }
129  int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
130  top - (max_top + x_ht_acceptance_tolerance));
131  int height = top - kBlnBaselineOffset;
132  if (debug_x_ht_level >= 2) {
133  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
134  unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
135  max_top, bottom, top);
136  }
137  // Use only chars that fit in the expected bottom range, and where
138  // the range of tops is sensibly near the xheight.
139  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
140  bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset &&
141  max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) {
142  // Compute the x-height position using proportionality between the
143  // actual height and expected height.
144  int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
145  int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
146  if (debug_x_ht_level >= 2) {
147  tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
148  }
149  // The range of expected heights gets a vote equal to the distance
150  // of the actual top from the expected top.
151  for (int y = min_xht; y <= max_xht; ++y) {
152  top_stats.add(y, misfit_dist);
153  }
154  } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
155  bottom - x_ht_acceptance_tolerance > max_bottom) &&
156  bottom_shift == 0) {
157  // Get the range of required bottom shift.
158  int min_shift = min_bottom - bottom;
159  int max_shift = max_bottom - bottom;
160  if (debug_x_ht_level >= 2) {
161  tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
162  }
163  // The range of expected shifts gets a vote equal to the min distance
164  // of the actual bottom from the expected bottom, spread over the
165  // range of its acceptance.
166  int misfit_weight = abs(min_shift);
167  if (max_shift > min_shift) {
168  misfit_weight /= max_shift - min_shift;
169  }
170  for (int y = min_shift; y <= max_shift; ++y) {
171  shift_stats.add(y, misfit_weight);
172  }
173  } else {
174  if (bottom_shift == 0) {
175  // Things with bottoms that are already ok need to say so, on the
176  // 1st iteration only.
177  shift_stats.add(0, kBlnBaselineOffset);
178  }
179  if (debug_x_ht_level >= 2) {
180  tprintf(" already OK\n");
181  }
182  }
183  }
184  }
185  if (shift_stats.get_total() > top_stats.get_total()) {
186  bottom_shift = IntCastRounded(shift_stats.median());
187  if (debug_x_ht_level >= 2) {
188  tprintf("Applying bottom shift=%d\n", bottom_shift);
189  }
190  }
191  } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());
192  // Baseline shift is opposite sign to the bottom shift.
193  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
194  if (debug_x_ht_level >= 2) {
195  tprintf("baseline shift=%g\n", *baseline_shift);
196  }
197  if (top_stats.get_total() == 0) {
198  return bottom_shift != 0 ? word_res->x_height : 0.0f;
199  }
200  // The new xheight is just the median vote, which is then scaled out
201  // of BLN space back to pixel space to get the x-height in pixel space.
202  float new_xht = top_stats.median();
203  if (debug_x_ht_level >= 2) {
204  tprintf("Median xht=%f\n", new_xht);
205  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
206  new_xht / word_res->denorm.y_scale());
207  }
208  // The xheight must change by at least x_ht_min_change to be used.
209  if (std::fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {
210  return new_xht / word_res->denorm.y_scale();
211  } else {
212  return bottom_shift != 0 ? word_res->x_height : 0.0f;
213  }
214 }
#define INT_FEAT_RANGE
Definition: float2int.h:27
int IntCastRounded(double x)
Definition: helpers.h:175
int DivRounded(int a, int b)
Definition: helpers.h:167
const int kMaxCharTopRange
Definition: fixxht.cpp:69
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586

◆ convert_bad_unlv_chs()

void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES *  word_res)

Definition at line 594 of file docqual.cpp.

594  {
595  int i;
596  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
597  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
598  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
599  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
600  for (i = 0; i < word_res->reject_map.length(); ++i) {
601  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
602  word_res->best_choice->set_unichar_id(unichar_dash, i);
603  if (word_res->reject_map[i].accepted()) {
604  word_res->reject_map[i].setrej_unlv_rej();
605  }
606  }
607  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
608  word_res->best_choice->set_unichar_id(unichar_space, i);
609  if (word_res->reject_map[i].accepted()) {
610  word_res->reject_map[i].setrej_unlv_rej();
611  }
612  }
613  }
614 }

◆ ConvertStringToUnichars()

bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
std::vector< UNICHAR_ID > *  class_ids 
)

◆ CorrectClassifyWords()

void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES page_res)

◆ count_alphanums() [1/2]

int16_t tesseract::Tesseract::count_alphanums ( const WERD_CHOICE &  word)

Definition at line 375 of file output.cpp.

375  {
376  int count = 0;
377  for (unsigned i = 0; i < word.length(); ++i) {
378  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
379  word.unicharset()->get_isdigit(word.unichar_id(i))) {
380  count++;
381  }
382  }
383  return count;
384 }

◆ count_alphanums() [2/2]

int16_t tesseract::Tesseract::count_alphanums ( WERD_RES *  word)

Definition at line 542 of file reject.cpp.

542  {
543  int count = 0;
544  const WERD_CHOICE *best_choice = word_res->best_choice;
545  for (unsigned i = 0; i < word_res->reject_map.length(); ++i) {
546  if ((word_res->reject_map[i].accepted()) &&
547  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
548  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
549  count++;
550  }
551  }
552  return count;
553 }

◆ count_alphas()

int16_t tesseract::Tesseract::count_alphas ( const WERD_CHOICE &  word)

Definition at line 365 of file output.cpp.

365  {
366  int count = 0;
367  for (unsigned i = 0; i < word.length(); ++i) {
368  if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
369  count++;
370  }
371  }
372  return count;
373 }

◆ count_outline_errs()

int16_t tesseract::Tesseract::count_outline_errs ( char  c,
int16_t  outline_count 
)

Definition at line 107 of file docqual.cpp.

107  {
108  int expected_outline_count;
109 
110  if (outlines_odd.contains(c)) {
111  return 0; // Don't use this char
112  } else if (outlines_2.contains(c)) {
113  expected_outline_count = 2;
114  } else {
115  expected_outline_count = 1;
116  }
117  return abs(outline_count - expected_outline_count);
118 }

◆ CountMisfitTops()

int tesseract::Tesseract::CountMisfitTops ( WERD_RES *  word_res)

Definition at line 72 of file fixxht.cpp.

72  {
73  int bad_blobs = 0;
74  int num_blobs = word_res->rebuild_word->NumBlobs();
75  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
76  TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
77  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
78  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
79  int top = blob->bounding_box().top();
80  if (top >= INT_FEAT_RANGE) {
81  top = INT_FEAT_RANGE - 1;
82  }
83  int min_bottom, max_bottom, min_top, max_top;
84  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
85  if (max_top - min_top > kMaxCharTopRange) {
86  continue;
87  }
88  bool bad =
89  top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance;
90  if (bad) {
91  ++bad_blobs;
92  }
93  if (debug_x_ht_level >= 1) {
94  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
95  unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top,
96  static_cast<int>(x_ht_acceptance_tolerance));
97  }
98  }
99  }
100  return bad_blobs;
101 }

◆ debug_word()

void tesseract::Tesseract::debug_word ( PAGE_RES *  page_res,
const TBOX &  selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 639 of file pgedit.cpp.

639  {
640 # ifndef DISABLED_LEGACY_ENGINE
641  ResetAdaptiveClassifier();
642 # endif
643  recog_all_words(page_res, nullptr, &selection_box, word_config_.c_str(), 0);
644 }
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287

◆ dictionary_correction_pass()

void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES *  page_res)

Definition at line 2069 of file control.cpp.

2069  {
2070  PAGE_RES_IT word_it(page_res);
2071  for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {
2072  if (word->best_choices.singleton()) {
2073  continue; // There are no alternates.
2074  }
2075 
2076  const WERD_CHOICE *best = word->best_choice;
2077  if (word->tesseract->getDict().valid_word(*best) != 0) {
2078  continue; // The best choice is in the dictionary.
2079  }
2080 
2081  WERD_CHOICE_IT choice_it(&word->best_choices);
2082  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
2083  WERD_CHOICE *alternate = choice_it.data();
2084  if (word->tesseract->getDict().valid_word(*alternate)) {
2085  // The alternate choice is in the dictionary.
2086  if (tessedit_bigram_debug) {
2087  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2088  best->unichar_string().c_str(), alternate->unichar_string().c_str());
2089  }
2090  // Replace the 'best' choice with a better choice.
2091  word->ReplaceBestChoice(alternate);
2092  break;
2093  }
2094  }
2095  }
2096 }

◆ digit_or_numeric_punct()

bool tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES *  word,
int  char_position 
)

Definition at line 366 of file fixspace.cpp.

366  {
367  int i;
368  int offset;
369 
370  for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]) {
371  ;
372  }
373  return (
374  word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
375  word->best_choice->unichar_lengths()[i]) ||
376  (word->best_choice->permuter() == NUMBER_PERM &&
377  numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));
378 }
@ NUMBER_PERM
Definition: ratngs.h:238

◆ do_re_display()

void tesseract::Tesseract::do_re_display ( bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_painter)

do_re_display()

Redisplay page

Definition at line 324 of file pgedit.cpp.

324  {
325  int block_count = 1;
326 
327  image_win->Clear();
328  if (display_image) {
329  image_win->Draw(pix_binary_, 0, 0);
330  }
331 
332  image_win->Brush(ScrollView::NONE);
333  PAGE_RES_IT pr_it(current_page_res);
334  for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
335  (this->*word_painter)(&pr_it);
336  if (display_baselines && pr_it.row() != pr_it.prev_row()) {
337  pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
338  }
339  if (display_blocks && pr_it.block() != pr_it.prev_block()) {
340  pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
341  }
342  }
343  image_win->Update();
344 }
void Draw(Image image, int x_pos, int y_pos)
Definition: scrollview.cpp:767
void Brush(Color color)
Definition: scrollview.cpp:729

◆ doc_and_block_rejection()

void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT &  page_res_it,
bool  good_quality_doc 
)

Definition at line 210 of file docqual.cpp.

211  {
212  int16_t block_no = 0;
213  int16_t row_no = 0;
214  BLOCK_RES *current_block;
215  ROW_RES *current_row;
216 
217  bool rej_word;
218  bool prev_word_rejected;
219  int16_t char_quality = 0;
220  int16_t accepted_char_quality;
221 
222  if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
223  tessedit_reject_doc_percent) {
224  reject_whole_page(page_res_it);
225  if (tessedit_debug_doc_rejection) {
226  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
227  page_res_it.page_res->rej_count);
228  }
229  } else {
230  if (tessedit_debug_doc_rejection) {
231  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count,
232  page_res_it.page_res->rej_count);
233  }
234 
235  /* Walk blocks testing for block rejection */
236 
237  page_res_it.restart_page();
238  WERD_RES *word;
239  while ((word = page_res_it.word()) != nullptr) {
240  current_block = page_res_it.block();
241  block_no = current_block->block->pdblk.index();
242  if (current_block->char_count > 0 &&
243  (current_block->rej_count * 100.0 / current_block->char_count) >
244  tessedit_reject_block_percent) {
245  if (tessedit_debug_block_rejection) {
246  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
247  current_block->char_count, current_block->rej_count);
248  }
249  prev_word_rejected = false;
250  while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
251  if (tessedit_preserve_blk_rej_perfect_wds) {
252  rej_word = word->reject_map.reject_count() > 0 ||
253  word->reject_map.length() < tessedit_preserve_min_wd_len;
254  if (rej_word && tessedit_dont_blkrej_good_wds &&
255  word->reject_map.length() >= tessedit_preserve_min_wd_len &&
256  acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
257  word->best_choice->unichar_lengths().c_str()) !=
258  AC_UNACCEPTABLE) {
259  word_char_quality(word, &char_quality, &accepted_char_quality);
260  rej_word = char_quality != word->reject_map.length();
261  }
262  } else {
263  rej_word = true;
264  }
265  if (rej_word) {
266  /*
267  Reject spacing if both current and prev words are rejected.
268  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
269  generated more space errors.
270 */
271  if (tessedit_use_reject_spaces && prev_word_rejected &&
272  page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
273  word->reject_spaces = true;
274  }
275  word->reject_map.rej_word_block_rej();
276  }
277  prev_word_rejected = rej_word;
278  page_res_it.forward();
279  }
280  } else {
281  if (tessedit_debug_block_rejection) {
282  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
283  page_res_it.block()->char_count, page_res_it.block()->rej_count);
284  }
285 
286  /* Walk rows in block testing for row rejection */
287  row_no = 0;
288  while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
289  current_row = page_res_it.row();
290  row_no++;
291  /* Reject whole row if:
292  fraction of chars on row which are rejected exceed a limit AND
293  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
294  limit
295 */
296  if (current_row->char_count > 0 &&
297  (current_row->rej_count * 100.0 / current_row->char_count) >
298  tessedit_reject_row_percent &&
299  (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
300  tessedit_whole_wd_rej_row_percent) {
301  if (tessedit_debug_block_rejection) {
302  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
303  current_row->char_count, current_row->rej_count);
304  }
305  prev_word_rejected = false;
306  while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
307  /* Preserve words on good docs unless they are mostly rejected*/
308  if (!tessedit_row_rej_good_docs && good_quality_doc) {
309  rej_word = word->reject_map.reject_count() /
310  static_cast<float>(word->reject_map.length()) >
311  tessedit_good_doc_still_rowrej_wd;
312  } else if (tessedit_preserve_row_rej_perfect_wds) {
313  /* Preserve perfect words anyway */
314  rej_word = word->reject_map.reject_count() > 0 ||
315  word->reject_map.length() < tessedit_preserve_min_wd_len;
316  if (rej_word && tessedit_dont_rowrej_good_wds &&
317  word->reject_map.length() >= tessedit_preserve_min_wd_len &&
318  acceptable_word_string(
319  *word->uch_set, word->best_choice->unichar_string().c_str(),
320  word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
321  word_char_quality(word, &char_quality, &accepted_char_quality);
322  rej_word = char_quality != word->reject_map.length();
323  }
324  } else {
325  rej_word = true;
326  }
327  if (rej_word) {
328  /*
329  Reject spacing if both current and prev words are rejected.
330  NOTE - this is NOT restricted to FUZZY spaces. - When tried
331  this generated more space errors.
332 */
333  if (tessedit_use_reject_spaces && prev_word_rejected &&
334  page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
335  word->reject_spaces = true;
336  }
337  word->reject_map.rej_word_row_rej();
338  }
339  prev_word_rejected = rej_word;
340  page_res_it.forward();
341  }
342  } else {
343  if (tessedit_debug_block_rejection) {
344  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
345  current_row->char_count, current_row->rej_count);
346  }
347  while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
348  page_res_it.forward();
349  }
350  }
351  }
352  }
353  }
354  }
355 }
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:363
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1704

◆ dont_allow_1Il()

void tesseract::Tesseract::dont_allow_1Il ( WERD_RES *  word)

Definition at line 513 of file reject.cpp.

513  {
514  int word_len = word->reject_map.length();
515  const char *s = word->best_choice->unichar_string().c_str();
516  const char *lengths = word->best_choice->unichar_lengths().c_str();
517  bool accepted_1Il = false;
518 
519  for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
520  if (word->reject_map[i].accepted()) {
521  if (conflict_set_I_l_1.contains(s[offset])) {
522  accepted_1Il = true;
523  } else {
524  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
525  word->uch_set->get_isdigit(s + offset, lengths[i])) {
526  return; // >=1 non 1Il ch accepted
527  }
528  }
529  }
530  }
531  if (!accepted_1Il) {
532  return; // Nothing to worry about
533  }
534 
535  for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
536  if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
537  word->reject_map[i].setrej_postNN_1Il();
538  }
539  }
540 }

◆ double_VAR_H() [1/49]

tesseract::Tesseract::double_VAR_H ( crunch_del_cert  )

◆ double_VAR_H() [2/49]

tesseract::Tesseract::double_VAR_H ( crunch_del_high_word  )

◆ double_VAR_H() [3/49]

tesseract::Tesseract::double_VAR_H ( crunch_del_low_word  )

◆ double_VAR_H() [4/49]

tesseract::Tesseract::double_VAR_H ( crunch_del_max_ht  )

◆ double_VAR_H() [5/49]

tesseract::Tesseract::double_VAR_H ( crunch_del_min_ht  )

◆ double_VAR_H() [6/49]

tesseract::Tesseract::double_VAR_H ( crunch_del_min_width  )

◆ double_VAR_H() [7/49]

tesseract::Tesseract::double_VAR_H ( crunch_del_rating  )

◆ double_VAR_H() [8/49]

tesseract::Tesseract::double_VAR_H ( crunch_poor_garbage_cert  )

◆ double_VAR_H() [9/49]

tesseract::Tesseract::double_VAR_H ( crunch_poor_garbage_rate  )

◆ double_VAR_H() [10/49]

tesseract::Tesseract::double_VAR_H ( crunch_pot_poor_cert  )

◆ double_VAR_H() [11/49]

tesseract::Tesseract::double_VAR_H ( crunch_pot_poor_rate  )

◆ double_VAR_H() [12/49]

tesseract::Tesseract::double_VAR_H ( crunch_small_outlines_size  )

◆ double_VAR_H() [13/49]

tesseract::Tesseract::double_VAR_H ( crunch_terrible_rating  )

◆ double_VAR_H() [14/49]

tesseract::Tesseract::double_VAR_H ( fixsp_small_outlines_size  )

◆ double_VAR_H() [15/49]

tesseract::Tesseract::double_VAR_H ( lstm_rating_coefficient  )

◆ double_VAR_H() [16/49]

tesseract::Tesseract::double_VAR_H ( min_orientation_margin  )

◆ double_VAR_H() [17/49]

tesseract::Tesseract::double_VAR_H ( noise_cert_basechar  )

◆ double_VAR_H() [18/49]

tesseract::Tesseract::double_VAR_H ( noise_cert_disjoint  )

◆ double_VAR_H() [19/49]

tesseract::Tesseract::double_VAR_H ( noise_cert_factor  )

◆ double_VAR_H() [20/49]

tesseract::Tesseract::double_VAR_H ( noise_cert_punc  )

◆ double_VAR_H() [21/49]

tesseract::Tesseract::double_VAR_H ( quality_blob_pc  )

◆ double_VAR_H() [22/49]

tesseract::Tesseract::double_VAR_H ( quality_char_pc  )

◆ double_VAR_H() [23/49]

tesseract::Tesseract::double_VAR_H ( quality_outline_pc  )

◆ double_VAR_H() [24/49]

tesseract::Tesseract::double_VAR_H ( quality_rej_pc  )

◆ double_VAR_H() [25/49]

tesseract::Tesseract::double_VAR_H ( quality_rowrej_pc  )

◆ double_VAR_H() [26/49]

tesseract::Tesseract::double_VAR_H ( rej_whole_of_mostly_reject_word_fract  )

◆ double_VAR_H() [27/49]

tesseract::Tesseract::double_VAR_H ( subscript_max_y_top  )

◆ double_VAR_H() [28/49]

tesseract::Tesseract::double_VAR_H ( superscript_bettered_certainty  )

◆ double_VAR_H() [29/49]

tesseract::Tesseract::double_VAR_H ( superscript_min_y_bottom  )

◆ double_VAR_H() [30/49]

tesseract::Tesseract::double_VAR_H ( superscript_scaledown_ratio  )

◆ double_VAR_H() [31/49]

tesseract::Tesseract::double_VAR_H ( superscript_worse_certainty  )

◆ double_VAR_H() [32/49]

tesseract::Tesseract::double_VAR_H ( suspect_accept_rating  )

◆ double_VAR_H() [33/49]

tesseract::Tesseract::double_VAR_H ( suspect_rating_per_ch  )

◆ double_VAR_H() [34/49]

tesseract::Tesseract::double_VAR_H ( tessedit_good_doc_still_rowrej_wd  )

◆ double_VAR_H() [35/49]

tesseract::Tesseract::double_VAR_H ( tessedit_lower_flip_hyphen  )

◆ double_VAR_H() [36/49]

tesseract::Tesseract::double_VAR_H ( tessedit_reject_block_percent  )

◆ double_VAR_H() [37/49]

tesseract::Tesseract::double_VAR_H ( tessedit_reject_doc_percent  )

◆ double_VAR_H() [38/49]

tesseract::Tesseract::double_VAR_H ( tessedit_reject_row_percent  )

◆ double_VAR_H() [39/49]

tesseract::Tesseract::double_VAR_H ( tessedit_upper_flip_hyphen  )

◆ double_VAR_H() [40/49]

tesseract::Tesseract::double_VAR_H ( tessedit_whole_wd_rej_row_percent  )

◆ double_VAR_H() [41/49]

tesseract::Tesseract::double_VAR_H ( test_pt_x  )

◆ double_VAR_H() [42/49]

tesseract::Tesseract::double_VAR_H ( test_pt_y  )

◆ double_VAR_H() [43/49]

tesseract::Tesseract::double_VAR_H ( textord_tabfind_aligned_gap_fraction  )

◆ double_VAR_H() [44/49]

tesseract::Tesseract::double_VAR_H ( textord_tabfind_vertical_text_ratio  )

◆ double_VAR_H() [45/49]

tesseract::Tesseract::double_VAR_H ( thresholding_kfactor  )

◆ double_VAR_H() [46/49]

tesseract::Tesseract::double_VAR_H ( thresholding_score_fraction  )

◆ double_VAR_H() [47/49]

tesseract::Tesseract::double_VAR_H ( thresholding_smooth_kernel_size  )

◆ double_VAR_H() [48/49]

tesseract::Tesseract::double_VAR_H ( thresholding_tile_size  )

◆ double_VAR_H() [49/49]

tesseract::Tesseract::double_VAR_H ( thresholding_window_size  )

◆ dump_words()

void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
int16_t  score,
int16_t  mode,
bool  improved 
)

Definition at line 467 of file fixspace.cpp.

467  {
468  WERD_RES_IT word_res_it(&perm);
469 
470  if (debug_fix_space_level > 0) {
471  if (mode == 1) {
472  stats_.dump_words_str = "";
473  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
474  if (!word_res_it.data()->part_of_combo) {
475  stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string();
476  stats_.dump_words_str += ' ';
477  }
478  }
479  }
480 
481  if (debug_fix_space_level > 1) {
482  switch (mode) {
483  case 1:
484  tprintf("EXTRACTED (%d): \"", score);
485  break;
486  case 2:
487  tprintf("TESTED (%d): \"", score);
488  break;
489  case 3:
490  tprintf("RETURNED (%d): \"", score);
491  break;
492  }
493 
494  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
495  if (!word_res_it.data()->part_of_combo) {
496  tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
497  static_cast<int>(word_res_it.data()->best_choice->permuter()));
498  }
499  }
500  tprintf("\"\n");
501  } else if (improved) {
502  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
503  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
504  if (!word_res_it.data()->part_of_combo) {
505  tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
506  static_cast<int>(word_res_it.data()->best_choice->permuter()));
507  }
508  }
509  tprintf("\"\n");
510  }
511  }
512 }

◆ end_tesseract()

void tesseract::Tesseract::end_tesseract ( )

Definition at line 459 of file tessedit.cpp.

459  {
460  end_recog();
461 }

◆ eval_word_spacing()

int16_t tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 260 of file fixspace.cpp.

260  {
261  WERD_RES_IT word_res_it(&word_res_list);
262  int16_t total_score = 0;
263  int16_t word_count = 0;
264  int16_t done_word_count = 0;
265  int i;
266  int16_t offset;
267  int16_t prev_word_score = 0;
268  bool prev_word_done = false;
269  bool prev_char_1 = false; // prev ch a "1/I/l"?
270  bool prev_char_digit = false; // prev ch 2..9 or 0
271  const char *punct_chars = "!\"`',.:;";
272  bool prev_char_punct = false;
273 
274  do {
275  // current word
276  WERD_RES *word = word_res_it.data();
277  bool word_done = fixspace_thinks_word_done(word);
278  word_count++;
279  if (word->tess_failed) {
280  total_score += prev_word_score;
281  if (prev_word_done) {
282  done_word_count++;
283  }
284  prev_word_score = 0;
285  prev_char_1 = false;
286  prev_char_digit = false;
287  prev_word_done = false;
288  } else {
289  /*
290  Can we add the prev word score and potentially count this word?
291  Yes IF it didn't end in a 1 when the first char of this word is a digit
292  AND it didn't end in a digit when the first char of this word is a 1
293 */
294  auto word_len = word->reject_map.length();
295  bool current_word_ok_so_far = false;
296  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
297  (prev_char_digit &&
298  ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
299  word->best_choice->unichar_string()[0] == '1') ||
300  (!word_done &&
301  conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
302  total_score += prev_word_score;
303  if (prev_word_done) {
304  done_word_count++;
305  }
306  current_word_ok_so_far = word_done;
307  }
308 
309  if (current_word_ok_so_far) {
310  prev_word_done = true;
311  prev_word_score = word_len;
312  } else {
313  prev_word_done = false;
314  prev_word_score = 0;
315  }
316 
317  /* Add 1 to total score for every joined 1 regardless of context and
318  rejtn */
319  for (i = 0, prev_char_1 = false; i < word_len; i++) {
320  bool current_char_1 = word->best_choice->unichar_string()[i] == '1';
321  if (prev_char_1 || (current_char_1 && (i > 0))) {
322  total_score++;
323  }
324  prev_char_1 = current_char_1;
325  }
326 
327  /* Add 1 to total score for every joined punctuation regardless of context
328  and rejtn */
329  if (tessedit_prefer_joined_punct) {
330  for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
331  offset += word->best_choice->unichar_lengths()[i++]) {
332  bool current_char_punct =
333  strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
334  if (prev_char_punct || (current_char_punct && i > 0)) {
335  total_score++;
336  }
337  prev_char_punct = current_char_punct;
338  }
339  }
340  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
341  for (i = 0, offset = 0; i < word_len - 1;
342  offset += word->best_choice->unichar_lengths()[i++]) {
343  ;
344  }
345  prev_char_1 =
346  ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
347  (!word_done &&
348  conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
349  }
350  /* Find next word */
351  do {
352  word_res_it.forward();
353  } while (word_res_it.data()->part_of_combo);
354  } while (!word_res_it.at_first());
355  total_score += prev_word_score;
356  if (prev_word_done) {
357  done_word_count++;
358  }
359  if (done_word_count == word_count) {
360  return PERFECT_WERDS;
361  } else {
362  return total_score;
363  }
364 }
#define PERFECT_WERDS
Definition: fixspace.cpp:48
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:514
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:366

◆ failure_count()

int16_t tesseract::Tesseract::failure_count ( WERD_RES *  word)

Definition at line 895 of file docqual.cpp.

895  {
896  const char *str = word->best_choice->unichar_string().c_str();
897  int tess_rejs = 0;
898 
899  for (; *str != '\0'; str++) {
900  if (*str == ' ') {
901  tess_rejs++;
902  }
903  }
904  return tess_rejs;
905 }

◆ FindSegmentation()

bool tesseract::Tesseract::FindSegmentation ( const std::vector< UNICHAR_ID > &  target_text,
WERD_RES *  word_res 
)

◆ first_alphanum_index()

int16_t tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 457 of file reject.cpp.

457  {
458  int16_t i;
459  int16_t offset;
460 
461  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
462  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
463  unicharset.get_isdigit(word + offset, word_lengths[i])) {
464  return i;
465  }
466  }
467  return -1;
468 }

◆ first_alphanum_offset()

int16_t tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 470 of file reject.cpp.

470  {
471  int16_t i;
472  int16_t offset;
473 
474  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
475  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
476  unicharset.get_isdigit(word + offset, word_lengths[i])) {
477  return offset;
478  }
479  }
480  return -1;
481 }

◆ fix_fuzzy_space_list()

void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW *  row,
BLOCK *  block 
)

Definition at line 171 of file fixspace.cpp.

171  {
172  int16_t best_score;
173  WERD_RES_LIST current_perm;
174  int16_t current_score;
175  bool improved = false;
176 
177  best_score = eval_word_spacing(best_perm); // default score
178  dump_words(best_perm, best_score, 1, improved);
179 
180  if (best_score != PERFECT_WERDS) {
181  initialise_search(best_perm, current_perm);
182  }
183 
184  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
185  match_current_words(current_perm, row, block);
186  current_score = eval_word_spacing(current_perm);
187  dump_words(current_perm, current_score, 2, improved);
188  if (current_score > best_score) {
189  best_perm.clear();
190  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
191  best_score = current_score;
192  improved = true;
193  }
194  if (current_score < PERFECT_WERDS) {
195  transform_to_next_perm(current_perm);
196  }
197  }
198  dump_words(best_perm, best_score, 3, improved);
199 }
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:391
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:201
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:467
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:260
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:218
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:655

◆ fix_fuzzy_spaces()

void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC *  monitor,
int32_t  word_count,
PAGE_RES *  page_res 
)

Definition at line 77 of file fixspace.cpp.

77  {
78  BLOCK_RES_IT block_res_it;
79  ROW_RES_IT row_res_it;
80  WERD_RES_IT word_res_it_from;
81  WERD_RES_IT word_res_it_to;
82  WERD_RES *word_res;
83  WERD_RES_LIST fuzzy_space_words;
84  int16_t new_length;
85  bool prevent_null_wd_fixsp; // DON'T process blobless wds
86  int32_t word_index; // current word
87 
88  block_res_it.set_to_list(&page_res->block_res_list);
89  word_index = 0;
90  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
91  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
92  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
93  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
94  while (!word_res_it_from.at_last()) {
95  word_res = word_res_it_from.data();
96  while (!word_res_it_from.at_last() &&
97  !(word_res->combination ||
98  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
99  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
100  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
101  word_res = word_res_it_from.forward();
102  word_index++;
103  if (monitor != nullptr) {
104  monitor->ocr_alive = true;
105  monitor->progress = 90 + 5 * word_index / word_count;
106  if (monitor->deadline_exceeded() ||
107  (monitor->cancel != nullptr &&
108  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
109  return;
110  }
111  }
112  }
113 
114  if (!word_res_it_from.at_last()) {
115  word_res_it_to = word_res_it_from;
116  prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();
117  if (check_debug_pt(word_res, 60)) {
118  debug_fix_space_level.set_value(10);
119  }
120  word_res_it_to.forward();
121  word_index++;
122  if (monitor != nullptr) {
123  monitor->ocr_alive = true;
124  monitor->progress = 90 + 5 * word_index / word_count;
125  if (monitor->deadline_exceeded() ||
126  (monitor->cancel != nullptr &&
127  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
128  return;
129  }
130  }
131  while (!word_res_it_to.at_last() &&
132  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
133  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
134  if (check_debug_pt(word_res, 60)) {
135  debug_fix_space_level.set_value(10);
136  }
137  if (word_res->word->cblob_list()->empty()) {
138  prevent_null_wd_fixsp = true;
139  }
140  word_res = word_res_it_to.forward();
141  }
142  if (check_debug_pt(word_res, 60)) {
143  debug_fix_space_level.set_value(10);
144  }
145  if (word_res->word->cblob_list()->empty()) {
146  prevent_null_wd_fixsp = true;
147  }
148  if (prevent_null_wd_fixsp) {
149  word_res_it_from = word_res_it_to;
150  } else {
151  fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
152  fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,
153  block_res_it.data()->block);
154  new_length = fuzzy_space_words.length();
155  word_res_it_from.add_list_before(&fuzzy_space_words);
156  for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
157  word_res_it_from.forward();
158  }
159  }
160  if (test_pt) {
161  debug_fix_space_level.set_value(0);
162  }
163  }
164  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
165  // Last word in row
166  }
167  }
168  }
169 }
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:545
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:171

◆ fix_noisy_space_list()

void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW *  row,
BLOCK *  block 
)

Definition at line 577 of file fixspace.cpp.

577  {
578  int16_t best_score;
579  WERD_RES_IT best_perm_it(&best_perm);
580  WERD_RES_LIST current_perm;
581  WERD_RES_IT current_perm_it(&current_perm);
582  WERD_RES *old_word_res;
583  int16_t current_score;
584  bool improved = false;
585 
586  best_score = fp_eval_word_spacing(best_perm); // default score
587 
588  dump_words(best_perm, best_score, 1, improved);
589 
590  old_word_res = best_perm_it.data();
591  // Even deep_copy doesn't copy the underlying WERD unless its combination
592  // flag is true!.
593  old_word_res->combination = true; // Kludge to force deep copy
594  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
595  old_word_res->combination = false; // Undo kludge
596 
597  break_noisiest_blob_word(current_perm);
598 
599  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
600  match_current_words(current_perm, row, block);
601  current_score = fp_eval_word_spacing(current_perm);
602  dump_words(current_perm, current_score, 2, improved);
603  if (current_score > best_score) {
604  best_perm.clear();
605  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
606  best_score = current_score;
607  improved = true;
608  }
609  if (current_score < PERFECT_WERDS) {
610  break_noisiest_blob_word(current_perm);
611  }
612  }
613  dump_words(best_perm, best_score, 3, improved);
614 }
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:621
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:837

◆ fix_rep_char()

void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT *  page_res_it)

fix_rep_char(): The word is a repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1665 of file control.cpp.

1665  {
1666  WERD_RES *word_res = page_res_it->word();
1667  const WERD_CHOICE &word = *(word_res->best_choice);
1668 
1669  // Find the frequency of each unique character in the word.
1670  SortHelper<UNICHAR_ID> rep_ch(word.length());
1671  for (unsigned i = 0; i < word.length(); ++i) {
1672  rep_ch.Add(word.unichar_id(i), 1);
1673  }
1674 
1675  // Find the most frequent result.
1676  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1677  int max_count = rep_ch.MaxCount(&maxch_id);
1678  // Find the best exemplar of a classifier result for maxch_id.
1679  BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
1680  if (best_choice == nullptr) {
1681  tprintf("Failed to find a choice for %s, occurring %d times\n",
1682  word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1683  return;
1684  }
1685  word_res->done = true;
1686 
1687  // Measure the mean space.
1688  int gap_count = 0;
1689  WERD *werd = word_res->word;
1690  C_BLOB_IT blob_it(werd->cblob_list());
1691  C_BLOB *prev_blob = blob_it.data();
1692  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1693  C_BLOB *blob = blob_it.data();
1694  int gap = blob->bounding_box().left();
1695  gap -= prev_blob->bounding_box().right();
1696  ++gap_count;
1697  prev_blob = blob;
1698  }
1699  // Just correct existing classification.
1700  CorrectRepcharChoices(best_choice, word_res);
1701  word_res->reject_map.initialise(word.length());
1702 }

◆ fix_sp_fp_word()

void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW *  row,
BLOCK *  block 
)

Definition at line 545 of file fixspace.cpp.

545  {
546  WERD_RES *word_res;
547  WERD_RES_LIST sub_word_list;
548  WERD_RES_IT sub_word_list_it(&sub_word_list);
549  int16_t blob_index;
550  int16_t new_length;
551  float junk;
552 
553  word_res = word_res_it.data();
554  if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo ||
555  !word_res->word->flag(W_DONT_CHOP)) {
556  return;
557  }
558 
559  blob_index = worst_noise_blob(word_res, &junk);
560  if (blob_index < 0) {
561  return;
562  }
563 
564  if (debug_fix_space_level > 1) {
565  tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str());
566  }
567  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
568  sub_word_list_it.add_after_stay_put(word_res_it.extract());
569  fix_noisy_space_list(sub_word_list, row, block);
570  new_length = sub_word_list.length();
571  word_res_it.add_list_before(&sub_word_list);
572  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
573  word_res_it.forward();
574  }
575 }
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:577

◆ fixspace_thinks_word_done()

bool tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES *  word)

Definition at line 514 of file fixspace.cpp.

514  {
515  if (word->done) {
516  return true;
517  }
518 
519  /*
520  Use all the standard pass 2 conditions for mode 5 in set_done() in
521  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
522  CARE WHETHER WE HAVE of/at on/an etc.
523 */
524  if (fixsp_done_mode > 0 &&
525  (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
526  fixsp_done_mode == 3) &&
527  (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&
528  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
529  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
530  (word->best_choice->permuter() == USER_DAWG_PERM) ||
531  (word->best_choice->permuter() == NUMBER_PERM))) {
532  return true;
533  } else {
534  return false;
535  }
536 }
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:243

◆ flip_0O()

void tesseract::Tesseract::flip_0O ( WERD_RES *  word)

Definition at line 660 of file reject.cpp.

660  {
661  WERD_CHOICE *best_choice = word_res->best_choice;
662  TBOX out_box;
663 
664  if (!tessedit_flip_0O) {
665  return;
666  }
667 
668  auto num_blobs = word_res->rebuild_word->NumBlobs();
669  for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
670  TBLOB *blob = word_res->rebuild_word->blobs[i];
671  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
672  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
673  out_box = blob->bounding_box();
674  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
675  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
676  return; // Beware words with sub/superscripts
677  }
678  }
679  }
680  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
681  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
682  if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
683  unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
684  return; // 0 or O are not present/enabled in unicharset
685  }
686  for (unsigned i = 1; i < best_choice->length(); ++i) {
687  if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
688  /* A0A */
689  if ((i + 1) < best_choice->length() &&
690  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
691  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
692  best_choice->set_unichar_id(unichar_O, i);
693  }
694  /* A00A */
695  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
696  (i + 1) < best_choice->length() &&
697  (best_choice->unichar_id(i + 1) == unichar_0 ||
698  best_choice->unichar_id(i + 1) == unichar_O) &&
699  (i + 2) < best_choice->length() &&
700  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
701  best_choice->set_unichar_id(unichar_O, i);
702  i++;
703  }
704  /* AA0<non digit or end of word> */
705  if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
706  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
707  (((i + 1) < best_choice->length() &&
708  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
709  !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
710  !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
711  (i == best_choice->length() - 1))) {
712  best_choice->set_unichar_id(unichar_O, i);
713  }
714  /* 9O9 */
715  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
716  (i + 1) < best_choice->length() &&
717  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
718  best_choice->set_unichar_id(unichar_0, i);
719  }
720  /* 9OOO */
721  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
722  (i + 2) < best_choice->length() &&
723  (best_choice->unichar_id(i + 1) == unichar_0 ||
724  best_choice->unichar_id(i + 1) == unichar_O) &&
725  (best_choice->unichar_id(i + 2) == unichar_0 ||
726  best_choice->unichar_id(i + 2) == unichar_O)) {
727  best_choice->set_unichar_id(unichar_0, i);
728  best_choice->set_unichar_id(unichar_0, i + 1);
729  best_choice->set_unichar_id(unichar_0, i + 2);
730  i += 2;
731  }
732  /* 9OO<non upper> */
733  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
734  (i + 2) < best_choice->length() &&
735  (best_choice->unichar_id(i + 1) == unichar_0 ||
736  best_choice->unichar_id(i + 1) == unichar_O) &&
737  !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
738  best_choice->set_unichar_id(unichar_0, i);
739  best_choice->set_unichar_id(unichar_0, i + 1);
740  i++;
741  }
742  /* 9O<non upper> */
743  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
744  (i + 1) < best_choice->length() &&
745  !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
746  best_choice->set_unichar_id(unichar_0, i);
747  }
748  /* 9[.,]OOO.. */
749  if ((i > 1) &&
750  (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
751  word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
752  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
753  best_choice->unichar_id(i - 2) == unichar_O)) {
754  if (best_choice->unichar_id(i - 2) == unichar_O) {
755  best_choice->set_unichar_id(unichar_0, i - 2);
756  }
757  while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
758  best_choice->unichar_id(i) == unichar_0)) {
759  best_choice->set_unichar_id(unichar_0, i);
760  i++;
761  }
762  i--;
763  }
764  }
765  }
766 }
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:768
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:772

◆ flip_hyphens()

void tesseract::Tesseract::flip_hyphens ( WERD_RES *  word)

Definition at line 602 of file reject.cpp.

602  {
603  WERD_CHOICE *best_choice = word_res->best_choice;
604  int prev_right = -9999;
605  int next_left;
606  TBOX out_box;
607  float aspect_ratio;
608 
609  if (tessedit_lower_flip_hyphen <= 1) {
610  return;
611  }
612 
613  auto num_blobs = word_res->rebuild_word->NumBlobs();
614  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
615  for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
616  TBLOB *blob = word_res->rebuild_word->blobs[i];
617  out_box = blob->bounding_box();
618  if (i + 1 == num_blobs) {
619  next_left = 9999;
620  } else {
621  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
622  }
623  // Don't touch small or touching blobs - it is too dangerous.
624  if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&
625  (out_box.right() < next_left)) {
626  aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
627  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
628  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
629  word_res->uch_set->contains_unichar_id(unichar_dash) &&
630  word_res->uch_set->get_enabled(unichar_dash)) {
631  /* Certain HYPHEN */
632  best_choice->set_unichar_id(unichar_dash, i);
633  if (word_res->reject_map[i].rejected()) {
634  word_res->reject_map[i].setrej_hyphen_accept();
635  }
636  }
637  if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
638  // Suspected HYPHEN
639  word_res->reject_map[i].setrej_hyphen();
640  }
641  } else if (best_choice->unichar_id(i) == unichar_dash) {
642  if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
643  word_res->reject_map[i].setrej_hyphen_accept();
644  }
645  // Certain HYPHEN
646 
647  if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
648  // Suspected HYPHEN
649  word_res->reject_map[i].setrej_hyphen();
650  }
651  }
652  }
653  prev_right = out_box.right();
654  }
655 }

◆ font_recognition_pass()

void tesseract::Tesseract::font_recognition_pass ( PAGE_RES *  page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 2015 of file control.cpp.

2015  {
2016  PAGE_RES_IT page_res_it(page_res);
2017  WERD_RES *word; // current word
2018  STATS doc_fonts(0, font_table_size_); // font counters
2019 
2020  // Gather font id statistics.
2021  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2022  word = page_res_it.word();
2023  if (word->fontinfo != nullptr) {
2024  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2025  }
2026  if (word->fontinfo2 != nullptr) {
2027  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2028  }
2029  }
2030  int16_t doc_font; // modal font
2031  int8_t doc_font_count; // modal font
2032  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2033  if (doc_font_count == 0) {
2034  return;
2035  }
2036  // Get the modal font pointer.
2037  const FontInfo *modal_font = nullptr;
2038  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2039  word = page_res_it.word();
2040  if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2041  modal_font = word->fontinfo;
2042  break;
2043  }
2044  if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2045  modal_font = word->fontinfo2;
2046  break;
2047  }
2048  }
2049  ASSERT_HOST(modal_font != nullptr);
2050 
2051  // Assign modal font to weak words.
2052  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2053  word = page_res_it.word();
2054  const int length = word->best_choice->length();
2055 
2056  const int count = word->fontinfo_id_count;
2057  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2058  word->fontinfo = modal_font;
2059  // Counts only get 1 as it came from the doc.
2060  word->fontinfo_id_count = 1;
2061  }
2062  }
2063 }

◆ fp_eval_word_spacing()

int16_t tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

fp_eval_word_spacing() Evaluation function for fixed pitch word lists.

Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars.

Definition at line 837 of file fixspace.cpp.

837  {
838  WERD_RES_IT word_it(&word_res_list);
839  WERD_RES *word;
840  int16_t score = 0;
841  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
842 
843  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
844  word = word_it.data();
845  if (word->rebuild_word == nullptr) {
846  continue; // Can't handle cube words.
847  }
848  if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
849  word->best_choice->permuter() == FREQ_DAWG_PERM ||
850  word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
851  auto num_blobs = word->rebuild_word->NumBlobs();
852  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
853  for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
854  TBLOB *blob = word->rebuild_word->blobs[i];
855  if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
856  score -= 1; // penalise possibly erroneous non-space
857  } else if (word->reject_map[i].accepted()) {
858  score++;
859  }
860  }
861  }
862  }
863  if (score < 0) {
864  score = 0;
865  }
866  return score;
867 }
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:772
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593

◆ garbage_word()

GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES *  word,
bool  ok_dict_word 
)

Definition at line 616 of file docqual.cpp.

616  {
617  enum STATES {
618  JUNK,
619  FIRST_UPPER,
620  FIRST_LOWER,
621  FIRST_NUM,
622  SUBSEQUENT_UPPER,
623  SUBSEQUENT_LOWER,
624  SUBSEQUENT_NUM
625  };
626  const char *str = word->best_choice->unichar_string().c_str();
627  const char *lengths = word->best_choice->unichar_lengths().c_str();
628  STATES state = JUNK;
629  int len = 0;
630  int isolated_digits = 0;
631  int isolated_alphas = 0;
632  int bad_char_count = 0;
633  int tess_rejs = 0;
634  int dodgy_chars = 0;
635  int ok_chars;
636  UNICHAR_ID last_char = -1;
637  int alpha_repetition_count = 0;
638  int longest_alpha_repetition_count = 0;
639  int longest_lower_run_len = 0;
640  int lower_string_count = 0;
641  int longest_upper_run_len = 0;
642  int upper_string_count = 0;
643  int total_alpha_count = 0;
644  int total_digit_count = 0;
645 
646  for (; *str != '\0'; str += *(lengths++)) {
647  len++;
648  if (word->uch_set->get_isupper(str, *lengths)) {
649  total_alpha_count++;
650  switch (state) {
651  case SUBSEQUENT_UPPER:
652  case FIRST_UPPER:
653  state = SUBSEQUENT_UPPER;
654  upper_string_count++;
655  if (longest_upper_run_len < upper_string_count) {
656  longest_upper_run_len = upper_string_count;
657  }
658  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
659  alpha_repetition_count++;
660  if (longest_alpha_repetition_count < alpha_repetition_count) {
661  longest_alpha_repetition_count = alpha_repetition_count;
662  }
663  } else {
664  last_char = word->uch_set->unichar_to_id(str, *lengths);
665  alpha_repetition_count = 1;
666  }
667  break;
668  case FIRST_NUM:
669  isolated_digits++;
670  // Fall through.
671  default:
672  state = FIRST_UPPER;
673  last_char = word->uch_set->unichar_to_id(str, *lengths);
674  alpha_repetition_count = 1;
675  upper_string_count = 1;
676  break;
677  }
678  } else if (word->uch_set->get_islower(str, *lengths)) {
679  total_alpha_count++;
680  switch (state) {
681  case SUBSEQUENT_LOWER:
682  case FIRST_LOWER:
683  state = SUBSEQUENT_LOWER;
684  lower_string_count++;
685  if (longest_lower_run_len < lower_string_count) {
686  longest_lower_run_len = lower_string_count;
687  }
688  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
689  alpha_repetition_count++;
690  if (longest_alpha_repetition_count < alpha_repetition_count) {
691  longest_alpha_repetition_count = alpha_repetition_count;
692  }
693  } else {
694  last_char = word->uch_set->unichar_to_id(str, *lengths);
695  alpha_repetition_count = 1;
696  }
697  break;
698  case FIRST_NUM:
699  isolated_digits++;
700  // Fall through.
701  default:
702  state = FIRST_LOWER;
703  last_char = word->uch_set->unichar_to_id(str, *lengths);
704  alpha_repetition_count = 1;
705  lower_string_count = 1;
706  break;
707  }
708  } else if (word->uch_set->get_isdigit(str, *lengths)) {
709  total_digit_count++;
710  switch (state) {
711  case FIRST_NUM:
712  state = SUBSEQUENT_NUM;
713  case SUBSEQUENT_NUM:
714  break;
715  case FIRST_UPPER:
716  case FIRST_LOWER:
717  isolated_alphas++;
718  // Fall through.
719  default:
720  state = FIRST_NUM;
721  break;
722  }
723  } else {
724  if (*lengths == 1 && *str == ' ') {
725  tess_rejs++;
726  } else {
727  bad_char_count++;
728  }
729  switch (state) {
730  case FIRST_NUM:
731  isolated_digits++;
732  break;
733  case FIRST_UPPER:
734  case FIRST_LOWER:
735  isolated_alphas++;
736  default:
737  break;
738  }
739  state = JUNK;
740  }
741  }
742 
743  switch (state) {
744  case FIRST_NUM:
745  isolated_digits++;
746  break;
747  case FIRST_UPPER:
748  case FIRST_LOWER:
749  isolated_alphas++;
750  default:
751  break;
752  }
753 
754  if (crunch_include_numerals) {
755  total_alpha_count += total_digit_count - isolated_digits;
756  }
757 
758  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
759  longest_alpha_repetition_count < crunch_long_repetitions) {
760  if ((crunch_accept_ok &&
761  acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
762  longest_lower_run_len > crunch_leave_lc_strings ||
763  longest_upper_run_len > crunch_leave_uc_strings) {
764  return G_NEVER_CRUNCH;
765  }
766  }
767  if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
768  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
769  word->best_choice->permuter() == FREQ_DAWG_PERM ||
770  word->best_choice->permuter() == USER_DAWG_PERM ||
771  word->best_choice->permuter() == NUMBER_PERM ||
772  acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
773  return G_OK;
774  }
775 
776  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;
777 
778  if (crunch_debug > 3) {
779  tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
780  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count,
781  isolated_digits, isolated_alphas, tess_rejs);
782  }
783  if (bad_char_count == 0 && tess_rejs == 0 &&
784  (len > isolated_digits + isolated_alphas || len <= 2)) {
785  return G_OK;
786  }
787 
788  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
789  return G_TERRIBLE;
790  }
791 
792  if (len > 4) {
793  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
794  if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
795  return G_DODGY;
796  } else {
797  return G_OK;
798  }
799  } else {
800  dodgy_chars = 2 * tess_rejs + bad_char_count;
801  if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
802  return G_DODGY;
803  } else {
804  return G_OK;
805  }
806  }
807 }
@ G_TERRIBLE
Definition: docqual.h:30
@ G_NEVER_CRUNCH
Definition: docqual.h:30
@ G_OK
Definition: docqual.h:30
@ G_DODGY
Definition: docqual.h:30

◆ get_rep_char()

UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES *  word)

Definition at line 247 of file output.cpp.

247  { // what char is repeated?
248  int i;
249  for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) {
250  ;
251  }
252 
253  if (i < word->reject_map.length()) {
254  return word->best_choice->unichar_id(i);
255  } else {
256  return word->uch_set->unichar_to_id(unrecognised_char.c_str());
257  }
258 }

◆ get_sub_lang()

Tesseract* tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 286 of file tesseractclass.h.

286  {
287  return sub_langs_[index];
288  }

◆ getDict()

Dict & tesseract::Tesseract::getDict ( )
overridevirtual

Reimplemented from tesseract::Classify.

Definition at line 474 of file tesseractclass.cpp.

474  {
475  if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang()) {
476  if (lstm_recognizer_ && lstm_recognizer_->GetDict()) {
477  return *lstm_recognizer_->GetDict();
478  }
479  }
480  return Classify::getDict();
481 }
bool AnyLSTMLang() const
virtual Dict & getDict()
Definition: classify.h:98
const Dict * GetDict() const

◆ GetLineData()

ImageData * tesseract::Tesseract::GetLineData ( const TBOX &  line_box,
const std::vector< TBOX > &  boxes,
const std::vector< std::string > &  texts,
int  start_box,
int  end_box,
const BLOCK &  block 
)

Definition at line 133 of file linerec.cpp.

135  {
136  TBOX revised_box;
137  ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box);
138  if (image_data == nullptr) {
139  return nullptr;
140  }
141  image_data->set_page_number(applybox_page);
142  // Copy the boxes and shift them so they are relative to the image.
143  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
144  ICOORD shift = -revised_box.botleft();
145  std::vector<TBOX> line_boxes;
146  std::vector<std::string> line_texts;
147  for (int b = start_box; b < end_box; ++b) {
148  TBOX box = boxes[b];
149  box.rotate(block_rotation);
150  box.move(shift);
151  line_boxes.push_back(box);
152  line_texts.push_back(texts[b]);
153  }
154  std::vector<int> page_numbers(line_boxes.size(), applybox_page);
155  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
156  return image_data;
157 }
const int kImagePadding
Definition: imagedata.h:39
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:165

◆ GetRectImage()

ImageData * tesseract::Tesseract::GetRectImage ( const TBOX &  box,
const BLOCK &  block,
int  padding,
TBOX *  revised_box 
) const

Definition at line 165 of file linerec.cpp.

166  {
167  TBOX wbox = box;
168  wbox.pad(padding, padding);
169  *revised_box = wbox;
170  // Number of clockwise 90 degree rotations needed to get back to tesseract
171  // coords from the clipped image.
172  int num_rotations = 0;
173  if (block.re_rotation().y() > 0.0f) {
174  num_rotations = 1;
175  } else if (block.re_rotation().x() < 0.0f) {
176  num_rotations = 2;
177  } else if (block.re_rotation().y() < 0.0f) {
178  num_rotations = 3;
179  }
180  // Handle two cases automatically: 1 the box came from the block, 2 the box
181  // came from a box file, and refers to the image, which the block may not.
182  if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
183  revised_box->rotate(block.re_rotation());
184  }
185  // Now revised_box always refers to the image.
186  // BestPix is never colormapped, but may be of any depth.
187  Image pix = BestPix();
188  int width = pixGetWidth(pix);
189  int height = pixGetHeight(pix);
190  TBOX image_box(0, 0, width, height);
191  // Clip to image bounds;
192  *revised_box &= image_box;
193  if (revised_box->null_box()) {
194  return nullptr;
195  }
196  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
197  revised_box->height());
198  Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
199  boxDestroy(&clip_box);
200  if (box_pix == nullptr) {
201  return nullptr;
202  }
203  if (num_rotations > 0) {
204  Image rot_pix = pixRotateOrth(box_pix, num_rotations);
205  box_pix.destroy();
206  box_pix = rot_pix;
207  }
208  // Convert sub-8-bit images to 8 bit.
209  int depth = pixGetDepth(box_pix);
210  if (depth < 8) {
211  Image grey;
212  grey = pixConvertTo8(box_pix, false);
213  box_pix.destroy();
214  box_pix = grey;
215  }
216  bool vertical_text = false;
217  if (num_rotations > 0) {
218  // Rotated the clipped revised box back to internal coordinates.
219  FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
220  revised_box->rotate(rotation);
221  if (num_rotations != 2) {
222  vertical_text = true;
223  }
224  }
225  return new ImageData(vertical_text, box_pix);
226 }

◆ GetSubAndSuperscriptCandidates()

void tesseract::Tesseract::GetSubAndSuperscriptCandidates ( const WERD_RES *  word,
int *  num_rebuilt_leading,
ScriptPos *  leading_pos,
float *  leading_certainty,
int *  num_rebuilt_trailing,
ScriptPos *  trailing_pos,
float *  trailing_certainty,
float *  avg_certainty,
float *  unlikely_threshold 
)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.

Parameters
[in] word: The word to examine.
[out] num_rebuilt_leading: The number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out] leading_pos: "super" or "sub" (for debugging).
[out] leading_certainty: The worst certainty in the leading blobs.
[out] num_rebuilt_trailing: The number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out] trailing_pos: "super" or "sub" (for debugging).
[out] trailing_certainty: The worst certainty in the trailing blobs.
[out] avg_certainty: The average certainty of "normal" blobs in the word.
[out] unlikely_threshold: The threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 250 of file superscript.cpp.

254  {
255  *avg_certainty = *unlikely_threshold = 0.0f;
256  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
257  *leading_certainty = *trailing_certainty = 0.0f;
258 
259  int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
260  int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
261 
262  // Step one: Get an average certainty for "normally placed" characters.
263 
264  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
265  *leading_pos = *trailing_pos = SP_NORMAL;
266  int leading_outliers = 0;
267  int trailing_outliers = 0;
268  int num_normal = 0;
269  float normal_certainty_total = 0.0f;
270  float worst_normal_certainty = 0.0f;
271  ScriptPos last_pos = SP_NORMAL;
272  int num_blobs = word->rebuild_word->NumBlobs();
273  for (int b = 0; b < num_blobs; ++b) {
274  TBOX box = word->rebuild_word->blobs[b]->bounding_box();
275  ScriptPos pos = SP_NORMAL;
276  if (box.bottom() >= super_y_bottom) {
277  pos = SP_SUPERSCRIPT;
278  } else if (box.top() <= sub_y_top) {
279  pos = SP_SUBSCRIPT;
280  }
281  if (pos == SP_NORMAL) {
282  if (word->best_choice->unichar_id(b) != 0) {
283  float char_certainty = word->best_choice->certainty(b);
284  if (char_certainty < worst_normal_certainty) {
285  worst_normal_certainty = char_certainty;
286  }
287  num_normal++;
288  normal_certainty_total += char_certainty;
289  }
290  if (trailing_outliers == b) {
291  leading_outliers = trailing_outliers;
292  *leading_pos = last_pos;
293  }
294  trailing_outliers = 0;
295  } else {
296  if (last_pos == pos) {
297  trailing_outliers++;
298  } else {
299  trailing_outliers = 1;
300  }
301  }
302  last_pos = pos;
303  }
304  *trailing_pos = last_pos;
305  if (num_normal >= 3) { // throw out the worst as an outlier.
306  num_normal--;
307  normal_certainty_total -= worst_normal_certainty;
308  }
309  if (num_normal > 0) {
310  *avg_certainty = normal_certainty_total / num_normal;
311  *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
312  }
313  if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) {
314  return;
315  }
316 
317  // Step two: Try to split off bits of the word that are both outliers
318  // and have much lower certainty than average
319  // Calculate num_leading and leading_certainty.
320  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers;
321  (*num_rebuilt_leading)++) {
322  float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
323  if (char_certainty > *unlikely_threshold) {
324  break;
325  }
326  if (char_certainty < *leading_certainty) {
327  *leading_certainty = char_certainty;
328  }
329  }
330 
331  // Calculate num_trailing and trailing_certainty.
332  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
333  *num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) {
334  int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
335  float char_certainty = word->best_choice->certainty(blob_idx);
336  if (char_certainty > *unlikely_threshold) {
337  break;
338  }
339  if (char_certainty < *trailing_certainty) {
340  *trailing_certainty = char_certainty;
341  }
342  }
343 }
@ SP_SUBSCRIPT
Definition: ratngs.h:250
@ SP_NORMAL
Definition: ratngs.h:250
@ SP_SUPERSCRIPT
Definition: ratngs.h:250

◆ ImageHeight()

int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 260 of file tesseractclass.h.

260  {
261  return pixGetHeight(pix_binary_);
262  }

◆ ImageWidth()

int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 257 of file tesseractclass.h.

257  {
258  return pixGetWidth(pix_binary_);
259  }

◆ init_recog_training()

FILE * tesseract::Tesseract::init_recog_training ( const char *  filename)

Definition at line 36 of file recogtraining.cpp.

36  {
37  if (tessedit_ambigs_training) {
38  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39  tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
40  // Explore all segmentations.
41  getDict().stopper_no_acceptable_choices.set_value(true);
42  }
43 
44  std::string output_fname = filename;
45  const char *lastdot = strrchr(output_fname.c_str(), '.');
46  if (lastdot != nullptr) {
47  output_fname[lastdot - output_fname.c_str()] = '\0';
48  }
49  output_fname += ".txt";
50  FILE *output_file = fopen(output_fname.c_str(), "a+");
51  if (output_file == nullptr) {
52  tprintf("Error: Could not open file %s\n", output_fname.c_str());
53  ASSERT_HOST(output_file);
54  }
55  return output_file;
56 }
Dict & getDict() override

◆ init_tesseract() [1/2]

int tesseract::Tesseract::init_tesseract ( const std::string &  arg0,
const std::string &  textbase,
const std::string &  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const std::vector< std::string > *  vars_vec,
const std::vector< std::string > *  vars_values,
bool  set_only_non_debug_params,
TessdataManager *  mgr 
)

Definition at line 291 of file tessedit.cpp.

295  {
296  std::vector<std::string> langs_to_load;
297  std::vector<std::string> langs_not_to_load;
298  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
299 
300  for (auto *lang : sub_langs_) {
301  delete lang;
302  }
303 
304  // Set the basename, compute the data directory.
305  main_setup(arg0, textbase);
306 
307  sub_langs_.clear();
308  // Find the first loadable lang and load into this.
309  // Add any languages that this language requires
310  bool loaded_primary = false;
311  // Load the rest into sub_langs_.
312  // A range based for loop does not work here because langs_to_load
313  // might be changed in the loop when a new submodel is found.
314  for (auto &lang_to_load : langs_to_load) {
315  if (!IsStrInList(lang_to_load, langs_not_to_load)) {
316  const char *lang_str = lang_to_load.c_str();
317  Tesseract *tess_to_init;
318  if (!loaded_primary) {
319  tess_to_init = this;
320  } else {
321  tess_to_init = new Tesseract;
322  tess_to_init->main_setup(arg0, textbase);
323  }
324 
325  int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
326  configs_size, vars_vec, vars_values,
327  set_only_non_debug_params, mgr);
328  // Forget that language, but keep any reader we were given.
329  mgr->Clear();
330 
331  if (!loaded_primary) {
332  if (result < 0) {
333  tprintf("Failed loading language '%s'\n", lang_str);
334  } else {
335  ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
336  &langs_not_to_load);
337  loaded_primary = true;
338  }
339  } else {
340  if (result < 0) {
341  tprintf("Failed loading language '%s'\n", lang_str);
342  delete tess_to_init;
343  } else {
344  sub_langs_.push_back(tess_to_init);
345  // Add any languages that this language requires
346  ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
347  &langs_not_to_load);
348  }
349  }
350  }
351  }
352  if (!loaded_primary && !langs_to_load.empty()) {
353  tprintf("Tesseract couldn't load any languages!\n");
354  return -1; // Couldn't load any language!
355  }
356 #ifndef DISABLED_LEGACY_ENGINE
357  if (!sub_langs_.empty()) {
358  // In multilingual mode word ratings have to be directly comparable,
359  // so use the same language model weights for all languages:
360  // use the primary language's params model if
361  // tessedit_use_primary_params_model is set,
362  // otherwise use default language model weights.
363  if (tessedit_use_primary_params_model) {
364  for (auto &sub_lang : sub_langs_) {
365  sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
366  }
367  tprintf("Using params model of the primary language\n");
368  } else {
369  this->language_model_->getParamsModel().Clear();
370  for (auto &sub_lang : sub_langs_) {
371  sub_lang->language_model_->getParamsModel().Clear();
372  }
373  }
374  }
375 
376  SetupUniversalFontIds();
377 #endif // ndef DISABLED_LEGACY_ENGINE
378  return 0;
379 }
void ParseLanguageString(const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load)
Definition: tessedit.cpp:246
void SetupUniversalFontIds()
Definition: tessedit.cpp:438
void main_setup(const std::string &argv0, const std::string &basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:40
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:382

◆ init_tesseract() [2/2]

int tesseract::Tesseract::init_tesseract ( const std::string &  datapath,
const std::string &  language,
OcrEngineMode  oem 
)
inline

Definition at line 500 of file tesseractclass.h.

500  {
501  TessdataManager mgr;
502  return init_tesseract(datapath, {}, language, oem, nullptr, 0, nullptr, nullptr, false, &mgr);
503  }
int init_tesseract(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:291

◆ init_tesseract_internal()

int tesseract::Tesseract::init_tesseract_internal ( const std::string &  arg0,
const std::string &  textbase,
const std::string &  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const std::vector< std::string > *  vars_vec,
const std::vector< std::string > *  vars_values,
bool  set_only_non_debug_params,
TessdataManager *  mgr 
)

Definition at line 397 of file tessedit.cpp.

402  {
403  if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
404  vars_values, set_only_non_debug_params, mgr)) {
405  return -1;
406  }
407  if (tessedit_init_config_only) {
408  return 0;
409  }
410  // If only LSTM will be used, skip loading Tesseract classifier's
411  // pre-trained templates and dictionary.
412  bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
413  program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
414  return 0; // Normal exit
415 }
bool init_tesseract_lang_data(const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:78
void program_editup(const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:39

◆ init_tesseract_lang_data()

bool tesseract::Tesseract::init_tesseract_lang_data ( const std::string &  arg0,
const std::string &  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const std::vector< std::string > *  vars_vec,
const std::vector< std::string > *  vars_values,
bool  set_only_non_debug_params,
TessdataManager *  mgr 
)

Definition at line 78 of file tessedit.cpp.

83  {
84  // Set the language data path prefix
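85  lang = !language.empty() ? language : "eng";
86  language_data_path_prefix = datadir;
87  language_data_path_prefix += lang;
88  language_data_path_prefix += ".";
89 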
90  // Initialize TessdataManager.
91  std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
92  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
93  tprintf("Error opening data file %s\n", tessdata_path.c_str());
94  tprintf(
95  "Please make sure the TESSDATA_PREFIX environment variable is set"
96  " to your \"tessdata\" directory.\n");
97  return false;
98  }
99 #ifdef DISABLED_LEGACY_ENGINE
100  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
101 #else
102  if (oem == OEM_DEFAULT) {
103  // Set the engine mode from availability, which can then be overridden by
104  // the config file when we read it below.
105  if (!mgr->IsLSTMAvailable()) {
106  tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
107  } else if (!mgr->IsBaseAvailable()) {
108  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
109  } else {
110  tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
111  }
112  }
113 #endif // ndef DISABLED_LEGACY_ENGINE
114 
115  // If a language specific config file (lang.config) exists, load it in.
116  TFile fp;
117  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
118  ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
119  }
120 
121  SetParamConstraint set_params_constraint =
122  set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
123  // Load tesseract variables from config files. This is done after loading
124  // language-specific variables from [lang].traineddata file, so that custom
125  // config files can override values in [lang].traineddata file.
126  for (int i = 0; i < configs_size; ++i) {
127  read_config_file(configs[i], set_params_constraint);
128  }
129 
130  // Set params specified in vars_vec (done after setting params from config
131  // files, so that params in vars_vec can override those from files).
132  if (vars_vec != nullptr && vars_values != nullptr) {
133  for (unsigned i = 0; i < vars_vec->size(); ++i) {
134  if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
135  set_params_constraint, this->params())) {
136  tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
137  }
138  }
139  }
140 
141  if (!tessedit_write_params_to_file.empty()) {
142  FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
143  if (params_file != nullptr) {
144  ParamUtils::PrintParams(params_file, this->params());
145  fclose(params_file);
146  } else {
147  tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
148  }
149  }
150 
151 #ifndef DISABLED_LEGACY_ENGINE
152  // Determine which ocr engine(s) should be loaded and used for recognition.
153  if (oem != OEM_DEFAULT) {
154  tessedit_ocr_engine_mode.set_value(oem);
155  }
156 #endif
157 
158  // If we are only loading the config file (and so not planning on doing any
159  // recognition) then there's nothing else do here.
160  if (tessedit_init_config_only) {
161  return true;
162  }
163 
164 // The various OcrEngineMode settings (see tesseract/publictypes.h) determine
165 // which engine-specific data files need to be loaded. If LSTM_ONLY is
166 // requested, the base Tesseract files are *Not* required.
167 #ifdef DISABLED_LEGACY_ENGINE
168  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
169 #else
170  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
171  tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
172 #endif // ndef DISABLED_LEGACY_ENGINE
173  if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
174  lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
175  ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
176  } else {
177  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
178  tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
179  }
180  }
181 
182  // Load the unicharset
183  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
184  // Avoid requiring a unicharset when we aren't running base tesseract.
185  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
186  }
187 #ifndef DISABLED_LEGACY_ENGINE
188  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
189  tprintf(
190  "Error: Tesseract (legacy) engine requested, but components are "
191  "not present in %s!!\n",
192  tessdata_path.c_str());
193  return false;
194  }
195 #endif // ndef DISABLED_LEGACY_ENGINE
196  if (unicharset.size() > MAX_NUM_CLASSES) {
197  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
198  return false;
199  }
200  right_to_left_ = unicharset.major_right_to_left();
201 
202 #ifndef DISABLED_LEGACY_ENGINE
203 
204  // Setup initial unichar ambigs table and read universal ambigs.
205  UNICHARSET encoder_unicharset;
206  encoder_unicharset.CopyFrom(unicharset);
207  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
208  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
209 
210  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
211  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
212  use_ambigs_for_adaption, &unicharset);
213  }
214 
215  // Init ParamsModel.
216  // Load pass1 and pass2 weights (for now these two sets are the same, but in
217  // the future separate sets of weights can be generated).
218  for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
219  language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
220  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
221  if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
222  return false;
223  }
224  }
225  }
226 #endif // ndef DISABLED_LEGACY_ENGINE
227 
228  return true;
229 }
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
SetParamConstraint
Definition: params.h:38
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
Definition: params.h:41
@ SET_PARAM_CONSTRAINT_NONE
Definition: params.h:39
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_LANG_CONFIG
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:48
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:64
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:51
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:72
std::string language_data_path_prefix
Definition: ccutil.h:60
std::string datadir
Definition: ccutil.h:57
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:63
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:51
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:164
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:81
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
bool major_right_to_left() const
Definition: unicharset.cpp:983
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:438
size_t size() const
Definition: unicharset.h:355
bool Load(const ParamsVectors *params, const std::string &lang, TessdataManager *mgr)

◆ INT_VAR_H() [1/43]

tesseract::Tesseract::INT_VAR_H ( applybox_debug  )

◆ INT_VAR_H() [2/43]

tesseract::Tesseract::INT_VAR_H ( applybox_page  )

◆ INT_VAR_H() [3/43]

tesseract::Tesseract::INT_VAR_H ( bidi_debug  )

◆ INT_VAR_H() [4/43]

tesseract::Tesseract::INT_VAR_H ( crunch_debug  )

◆ INT_VAR_H() [5/43]

tesseract::Tesseract::INT_VAR_H ( crunch_leave_lc_strings  )

◆ INT_VAR_H() [6/43]

tesseract::Tesseract::INT_VAR_H ( crunch_leave_uc_strings  )

◆ INT_VAR_H() [7/43]

tesseract::Tesseract::INT_VAR_H ( crunch_long_repetitions  )

◆ INT_VAR_H() [8/43]

tesseract::Tesseract::INT_VAR_H ( crunch_pot_indicators  )

◆ INT_VAR_H() [9/43]

tesseract::Tesseract::INT_VAR_H ( crunch_rating_max  )

◆ INT_VAR_H() [10/43]

tesseract::Tesseract::INT_VAR_H ( debug_fix_space_level  )

◆ INT_VAR_H() [11/43]

tesseract::Tesseract::INT_VAR_H ( debug_noise_removal  )

◆ INT_VAR_H() [12/43]

tesseract::Tesseract::INT_VAR_H ( debug_x_ht_level  )

◆ INT_VAR_H() [13/43]

tesseract::Tesseract::INT_VAR_H ( fixsp_done_mode  )

◆ INT_VAR_H() [14/43]

tesseract::Tesseract::INT_VAR_H ( fixsp_non_noise_limit  )

◆ INT_VAR_H() [15/43]

tesseract::Tesseract::INT_VAR_H ( jpg_quality  )

◆ INT_VAR_H() [16/43]

tesseract::Tesseract::INT_VAR_H ( lstm_choice_iterations  )

◆ INT_VAR_H() [17/43]

tesseract::Tesseract::INT_VAR_H ( lstm_choice_mode  )

◆ INT_VAR_H() [18/43]

tesseract::Tesseract::INT_VAR_H ( min_characters_to_try  )

◆ INT_VAR_H() [19/43]

tesseract::Tesseract::INT_VAR_H ( min_sane_x_ht_pixels  )

◆ INT_VAR_H() [20/43]

tesseract::Tesseract::INT_VAR_H ( multilang_debug_level  )

◆ INT_VAR_H() [21/43]

tesseract::Tesseract::INT_VAR_H ( noise_maxperblob  )

◆ INT_VAR_H() [22/43]

tesseract::Tesseract::INT_VAR_H ( noise_maxperword  )

◆ INT_VAR_H() [23/43]

tesseract::Tesseract::INT_VAR_H ( ocr_devanagari_split_strategy  )

◆ INT_VAR_H() [24/43]

tesseract::Tesseract::INT_VAR_H ( pageseg_devanagari_split_strategy  )

◆ INT_VAR_H() [25/43]

tesseract::Tesseract::INT_VAR_H ( paragraph_debug_level  )

◆ INT_VAR_H() [26/43]

tesseract::Tesseract::INT_VAR_H ( quality_min_initial_alphas_reqd  )

◆ INT_VAR_H() [27/43]

tesseract::Tesseract::INT_VAR_H ( superscript_debug  )

◆ INT_VAR_H() [28/43]

tesseract::Tesseract::INT_VAR_H ( suspect_level  )

◆ INT_VAR_H() [29/43]

tesseract::Tesseract::INT_VAR_H ( suspect_short_words  )

◆ INT_VAR_H() [30/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_bigram_debug  )

◆ INT_VAR_H() [31/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_font_id  )

◆ INT_VAR_H() [32/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_image_border  )

◆ INT_VAR_H() [33/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_ocr_engine_mode  )

◆ INT_VAR_H() [34/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_page_number  )

◆ INT_VAR_H() [35/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_pageseg_mode  )

◆ INT_VAR_H() [36/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_parallelize  )

◆ INT_VAR_H() [37/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_preserve_min_wd_len  )

◆ INT_VAR_H() [38/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_reject_mode  )

◆ INT_VAR_H() [39/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_tess_adaption_mode  )

◆ INT_VAR_H() [40/43]

tesseract::Tesseract::INT_VAR_H ( thresholding_method  )

◆ INT_VAR_H() [41/43]

tesseract::Tesseract::INT_VAR_H ( user_defined_dpi  )

◆ INT_VAR_H() [42/43]

tesseract::Tesseract::INT_VAR_H ( x_ht_acceptance_tolerance  )

◆ INT_VAR_H() [43/43]

tesseract::Tesseract::INT_VAR_H ( x_ht_min_change  )

◆ join_words()

void tesseract::Tesseract::join_words ( WERD_RES * word,
WERD_RES * word2,
BlamerBundle * orig_bb 
) const

Definition at line 216 of file tfacepp.cpp.

216  {
217  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
218  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
219  // Tack the word2 outputs onto the end of the word outputs.
220  word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
221  word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
222  word2->chopped_word->blobs.clear();
223  word2->rebuild_word->blobs.clear();
224  TPOINT split_pt;
225  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
226  split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4;
227  // Move the word2 seams onto the end of the word1 seam_array.
228  // Since the seam list is one element short, an empty seam marking the
229  // end of the last blob in the first word is needed first.
230  word->seam_array.push_back(new SEAM(0.0f, split_pt));
231  word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
232  word2->seam_array.clear();
233  // Fix widths and gaps.
234  word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
235  word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
236  // Fix the ratings matrix.
237  int rat1 = word->ratings->dimension();
238  int rat2 = word2->ratings->dimension();
239  word->ratings->AttachOnCorner(word2->ratings);
240  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
241  word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
242  // Append the word choices.
243  *word->raw_choice += *word2->raw_choice;
244 
245  // How many alt choices from each should we try to get?
246  const int kAltsPerPiece = 2;
247  // When do we start throwing away extra alt choices?
248  const int kTooManyAltChoices = 100;
249 
250  // Construct the cartesian product of the best_choices of word(1) and word2.
251  WERD_CHOICE_LIST joined_choices;
252  WERD_CHOICE_IT jc_it(&joined_choices);
253  WERD_CHOICE_IT bc1_it(&word->best_choices);
254  WERD_CHOICE_IT bc2_it(&word2->best_choices);
255  int num_word1_choices = word->best_choices.length();
256  int total_joined_choices = num_word1_choices;
257  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
258  // word2 choices, and put them in the joined_choices list. The 1st word2
259  // choice gets added to the original word1 choices in-place after we have
260  // finished with them.
261  int bc2_index = 1;
262  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
263  if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) {
264  break;
265  }
266  int bc1_index = 0;
267  for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) {
268  if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) {
269  break;
270  }
271  auto *wc = new WERD_CHOICE(*bc1_it.data());
272  *wc += *bc2_it.data();
273  jc_it.add_after_then_move(wc);
274  ++total_joined_choices;
275  }
276  }
277  // Now that we've filled in as many alternates as we want, paste the best
278  // choice for word2 onto the original word alt_choices.
279  bc1_it.move_to_first();
280  bc2_it.move_to_first();
281  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
282  *bc1_it.data() += *bc2_it.data();
283  }
284  bc1_it.move_to_last();
285  bc1_it.add_list_after(&joined_choices);
286 
287  // Restore the pointer to original blamer bundle and combine blamer
288  // information recorded in the splits.
289  if (orig_bb != nullptr) {
290  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);
291  delete word->blamer_bundle;
292  word->blamer_bundle = orig_bb;
293  }
294  word->SetupBoxWord();
295  word->reject_map.initialise(word->box_word->length());
296  delete word2;
297 }
@ TPOINT

◆ LSTMRecognizeWord()

void tesseract::Tesseract::LSTMRecognizeWord ( const BLOCK & block,
ROW * row,
WERD_RES * word,
PointerVector< WERD_RES > *  words 
)

Definition at line 230 of file linerec.cpp.

231  {
232  TBOX word_box = word->word->bounding_box();
233  // Get the word image - no frills.
234  if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {
235  // In single word mode, use the whole image without any other row/word
236  // interpretation.
237  word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
238  } else {
239  float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
240  if (baseline + row->descenders() < word_box.bottom()) {
241  word_box.set_bottom(baseline + row->descenders());
242  }
243  if (baseline + row->x_height() + row->ascenders() > word_box.top()) {
244  word_box.set_top(baseline + row->x_height() + row->ascenders());
245  }
246  }
247  ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
248  if (im_data == nullptr) {
249  return;
250  }
251 
252  bool do_invert = tessedit_do_invert;
253  lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
254  kWorstDictCertainty / kCertaintyScale, word_box, words,
255  lstm_choice_mode, lstm_choice_iterations);
256  delete im_data;
257  SearchWords(words);
258 }
const float kWorstDictCertainty
Definition: linerec.cpp:35
@ PSM_SINGLE_WORD
Treat the image as a single word.
Definition: publictypes.h:170
const float kCertaintyScale
Definition: linerec.cpp:33
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:263
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0, int lstm_choice_amount=5)

◆ make_reject_map()

void tesseract::Tesseract::make_reject_map ( WERD_RES * word,
ROW * row,
int16_t  pass 
)

Definition at line 96 of file reject.cpp.

96  {
97  flip_0O(word);
98  check_debug_pt(word, -1); // For trap only
99  set_done(word, pass); // Set acceptance
100  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
101  reject_blanks(word);
102  /*
103 0: Rays original heuristic - the baseline
104 */
105  if (tessedit_reject_mode == 0) {
106  if (!word->done) {
107  reject_poor_matches(word);
108  }
109  } else if (tessedit_reject_mode == 5) {
110  /*
111 5: Reject I/1/l from words where there is no strong contextual confirmation;
112  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
113  and the whole of any words which are very small
114 */
115  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
116  word->reject_map.rej_word_small_xht();
117  } else {
118  one_ell_conflict(word, true);
119  /*
120  Originally the code here just used the done flag. Now I have duplicated
121  and unpacked the conditions for setting the done flag so that each
122  mechanism can be turned on or off independently. This works WITHOUT
123  affecting the done flag setting.
124 */
125  if (rej_use_tess_accepted && !word->tess_accepted) {
126  word->reject_map.rej_word_not_tess_accepted();
127  }
128 
129  if (rej_use_tess_blanks &&
130  (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
131  word->reject_map.rej_word_contains_blanks();
132  }
133 
134  WERD_CHOICE *best_choice = word->best_choice;
135  if (rej_use_good_perm) {
136  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
137  best_choice->permuter() == FREQ_DAWG_PERM ||
138  best_choice->permuter() == USER_DAWG_PERM) &&
139  (!rej_use_sensible_wd ||
140  acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
141  best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
142  // PASSED TEST
143  } else if (best_choice->permuter() == NUMBER_PERM) {
144  if (rej_alphas_in_number_perm) {
145  for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
146  offset += best_choice->unichar_lengths()[i++]) {
147  if (word->reject_map[i].accepted() &&
148  word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
149  best_choice->unichar_lengths()[i])) {
150  word->reject_map[i].setrej_bad_permuter();
151  }
152  // rej alpha
153  }
154  }
155  } else {
156  word->reject_map.rej_word_bad_permuter();
157  }
158  }
159  /* Ambig word rejection was here once !!*/
160  }
161  } else {
162  tprintf("BAD tessedit_reject_mode\n");
163  ASSERT_HOST("Fatal error encountered!" == nullptr);
164  }
165 
166  if (tessedit_image_border > -1) {
167  reject_edge_blobs(word);
168  }
169 
170  check_debug_pt(word, 10);
171  if (tessedit_rejection_debug) {
172  tprintf("Permuter Type = %d\n", word->best_choice->permuter());
173  tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(),
174  word->best_choice->rating());
175  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
176  }
177 
178  flip_hyphens(word);
179  check_debug_pt(word, 20);
180 }
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:208
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:182
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:260
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:287
void set_done(WERD_RES *word, int16_t pass)
Definition: reject.cpp:62
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:602
void flip_0O(WERD_RES *word)
Definition: reject.cpp:660
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:86

◆ match_current_words()

void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW * row,
BLOCK * block 
)

Definition at line 218 of file fixspace.cpp.

218  {
219  WERD_RES_IT word_it(&words);
220  WERD_RES *word;
221  // Since we are not using PAGE_RES to iterate over words, we need to update
222  // prev_word_best_choice_ before calling classify_word_pass2().
223  prev_word_best_choice_ = nullptr;
224  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
225  word = word_it.data();
226  if ((!word->part_of_combo) && (word->box_word == nullptr)) {
227  WordData word_data(block, row, word);
228  SetupWordPassN(2, &word_data);
229  classify_word_and_language(2, nullptr, &word_data);
230  }
231  prev_word_best_choice_ = word->best_choice;
232  }
233 }

◆ match_word_pass_n()

void tesseract::Tesseract::match_word_pass_n ( int  pass_n,
WERD_RES * word,
ROW * row,
BLOCK * block 
)

match_word_pass_n

Baseline normalize the word and pass it to Tess.

Definition at line 1589 of file control.cpp.

1589  {
1590  if (word->tess_failed) {
1591  return;
1592  }
1593  tess_segment_pass_n(pass_n, word);
1594 
1595  if (!word->tess_failed) {
1596  if (!word->word->flag(W_REP_CHAR)) {
1597  word->fix_quotes();
1598  if (tessedit_fix_hyphens) {
1599  word->fix_hyphens();
1600  }
1601  /* Don't trust fix_quotes! - though I think I've fixed the bug */
1602  if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
1603  tprintf(
1604  "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1605  " #Blobs=%u\n",
1606  word->best_choice->debug_string().c_str(), word->best_choice->length(),
1607  word->box_word->length());
1608  }
1609  word->tess_accepted = tess_acceptable_word(word);
1610 
1611  // Also sets word->done flag
1612  make_reject_map(word, row, pass_n);
1613  }
1614  }
1615  set_word_fonts(word);
1616 
1617  ASSERT_HOST(word->raw_choice != nullptr);
1618 }
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1927
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:64
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
Definition: reject.cpp:96

◆ MaximallyChopWord()

void tesseract::Tesseract::MaximallyChopWord ( const std::vector< TBOX > &  boxes,
BLOCK * block,
ROW * row,
WERD_RES * word_res 
)

Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.

Definition at line 231 of file applybox.cpp.

232  {
233  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
234  classify_bln_numeric_mode, textord_use_cjk_fp_model,
235  poly_allow_detailed_fx, row, block)) {
236  word_res->CloneChoppedToRebuild();
237  return;
238  }
239  if (chop_debug) {
240  tprintf("Maximally chopping word at:");
241  word_res->word->bounding_box().print();
242  }
243  std::vector<BLOB_CHOICE *> blob_choices;
244  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
245  auto rating = static_cast<float>(INT8_MAX);
246  for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
247  // The rating and certainty are not quite arbitrary. Since
248  // select_blob_to_chop uses the worst certainty to choose, they all have
249  // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
250  // in here, and then divide by e each time they are chopped, which
251  // should guarantee a set of unequal values for the whole tree of blobs
252  // produced, however much chopping is required. The chops are thus only
253  // limited by the ability of the chopper to find suitable chop points,
254  // and not by the value of the certainties.
255  auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
256  blob_choices.push_back(choice);
257  rating -= 0.125f;
258  }
259  const double e = exp(1.0); // The base of natural logs.
260  unsigned blob_number;
261  int right_chop_index = 0;
262  if (!assume_fixed_pitch_char_segment) {
263  // We only chop if the language is not fixed pitch like CJK.
264  SEAM *seam = nullptr;
265  while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
266  word_res->InsertSeam(blob_number, seam);
267  BLOB_CHOICE *left_choice = blob_choices[blob_number];
268  rating = left_choice->rating() / e;
269  left_choice->set_rating(rating);
270  left_choice->set_certainty(-rating);
271  // combine confidence w/ serial #
272  auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
273  0.0f, 0.0f, BCC_FAKE);
274  blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
275  }
276  }
277  word_res->CloneChoppedToRebuild();
278  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
279 }
@ BCC_FAKE
Definition: ratngs.h:53
SEAM * chop_one_blob(const std::vector< TBOX > &boxes, const std::vector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, unsigned *blob_number)
Definition: chopper.cpp:367

◆ mutable_pix_binary()

Image* tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 204 of file tesseractclass.h.

204  {
205  pix_binary_.destroy();
206  return &pix_binary_;
207  }

◆ mutable_textord()

Textord* tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 276 of file tesseractclass.h.

276  {
277  return &textord_;
278  }

◆ nn_match_word()

void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)

◆ nn_recover_rejects()

void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)

◆ noise_outlines()

bool tesseract::Tesseract::noise_outlines ( TWERD * word)

Definition at line 907 of file docqual.cpp.

907  {
908  TBOX box; // BB of outline
909  int16_t outline_count = 0;
910  int16_t small_outline_count = 0;
911  int16_t max_dimension;
912  float small_limit = kBlnXHeight * crunch_small_outlines_size;
913 
914  for (unsigned b = 0; b < word->NumBlobs(); ++b) {
915  TBLOB *blob = word->blobs[b];
916  for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
917  outline_count++;
918  box = ol->bounding_box();
919  if (box.height() > box.width()) {
920  max_dimension = box.height();
921  } else {
922  max_dimension = box.width();
923  }
924  if (max_dimension < small_limit) {
925  small_outline_count++;
926  }
927  }
928  }
929  return small_outline_count >= outline_count;
930 }

◆ non_0_digit()

bool tesseract::Tesseract::non_0_digit ( const UNICHARSET & ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 772 of file reject.cpp.

772  {
773  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
774 }

◆ non_O_upper()

bool tesseract::Tesseract::non_O_upper ( const UNICHARSET & ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 768 of file reject.cpp.

768  {
769  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
770 }

◆ num_sub_langs()

int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 283 of file tesseractclass.h.

283  {
284  return sub_langs_.size();
285  }

◆ one_ell_conflict()

bool tesseract::Tesseract::one_ell_conflict ( WERD_RES * word_res,
bool  update_map 
)

Definition at line 287 of file reject.cpp.

287  {
288  const char *word;
289  const char *lengths;
290  int16_t word_len; // its length
291  int16_t first_alphanum_index_;
292  int16_t first_alphanum_offset_;
293  int16_t i;
294  int16_t offset;
295  bool non_conflict_set_char; // non conf set a/n?
296  bool conflict = false;
297  bool allow_1s;
298  ACCEPTABLE_WERD_TYPE word_type;
299  bool dict_perm_type;
300  bool dict_word_ok;
301  int dict_word_type;
302 
303  word = word_res->best_choice->unichar_string().c_str();
304  lengths = word_res->best_choice->unichar_lengths().c_str();
305  word_len = strlen(lengths);
306  /*
307  If there are no occurrences of the conflict set characters then the word
308  is OK.
309 */
310  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
311  return false;
312  }
313 
314  /*
315  There is a conflict if there are NO other (confirmed) alphanumerics apart
316  from those in the conflict set.
317 */
318 
319  for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
320  offset += lengths[i++]) {
321  non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
322  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
323  !conflict_set_I_l_1.contains(word[offset]);
324  }
325  if (!non_conflict_set_char) {
326  if (update_map) {
327  reject_I_1_L(word_res);
328  }
329  return true;
330  }
331 
332  /*
333  If the word is accepted by a dawg permuter, and the first alpha character
334  is "I" or "l", check to see if the alternative is also a dawg word. If it
335  is, then there is a potential error otherwise the word is ok.
336 */
337 
338  dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
339  (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
340  (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
341  (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
342  dict_word_type = dict_word(*(word_res->best_choice));
343  dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
344 
345  if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
346  (dict_perm_type && dict_word_ok)) {
347  first_alphanum_index_ = first_alphanum_index(word, lengths);
348  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
349  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
350  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
351  if (safe_dict_word(word_res) > 0) {
352  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
353  if (update_map) {
354  word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
355  }
356  return true;
357  } else {
358  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
359  return false;
360  }
361  }
362 
363  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
364  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
365  if (safe_dict_word(word_res) > 0) {
366  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
367  if (update_map) {
368  word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
369  }
370  return true;
371  } else {
372  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
373  return false;
374  }
375  }
376  return false;
377  }
378 
379  /*
380  NEW 1Il code. The old code relied on permuter types too much. In fact,
381  tess will use TOP_CHOICE permute for good things like "palette".
382  In this code the string is examined independently to see if it looks like
383  a well formed word.
384 */
385 
386  /*
387  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
388  dictionary word.
389 */
390  first_alphanum_index_ = first_alphanum_index(word, lengths);
391  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
392  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
393  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
394  if (safe_dict_word(word_res) > 0) {
395  return false;
396  } else {
397  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
398  }
399  } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
400  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
401  if (safe_dict_word(word_res) > 0) {
402  return false;
403  } else {
404  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
405  }
406  }
407  /*
408  For strings containing digits:
409  If there are no alphas OR the numeric permuter liked the word,
410  reject any non 1 conflict chs
411  Else reject all conflict chs
412 */
413  if (word_contains_non_1_digit(word, lengths)) {
414  allow_1s =
415  (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
416 
417  int16_t offset;
418  conflict = false;
419  for (i = 0, offset = 0; word[offset] != '\0';
420  offset += word_res->best_choice->unichar_lengths()[i++]) {
421  if ((!allow_1s || (word[offset] != '1')) &&
422  conflict_set_I_l_1.contains(word[offset])) {
423  if (update_map) {
424  word_res->reject_map[i].setrej_1Il_conflict();
425  }
426  conflict = true;
427  }
428  }
429  return conflict;
430  }
431  /*
432  For anything else. See if it conforms to an acceptable word type. If so,
433  treat accordingly.
434 */
435  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
436  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
437  first_alphanum_index_ = first_alphanum_index(word, lengths);
438  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
439  if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
440  if (update_map) {
441  word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
442  }
443  return true;
444  } else {
445  return false;
446  }
447  } else if (word_type == AC_UPPER_CASE) {
448  return false;
449  } else {
450  if (update_map) {
451  reject_I_1_L(word_res);
452  }
453  return true;
454  }
455 }
@ DOC_DAWG_PERM
Definition: ratngs.h:241
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:457
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:470
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:483
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:195
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:496

◆ output_pass()

void tesseract::Tesseract::output_pass ( PAGE_RES_IT & page_res_it,
const TBOX * target_word_box 
)

Definition at line 39 of file output.cpp.

40  {
41  BLOCK_RES *block_of_last_word;
42  bool force_eol; // During output
43  BLOCK *nextblock; // block of next word
44  WERD *nextword; // next word
45 
46  page_res_it.restart_page();
47  block_of_last_word = nullptr;
48  while (page_res_it.word() != nullptr) {
49  check_debug_pt(page_res_it.word(), 120);
50 
51  if (target_word_box) {
52  TBOX current_word_box = page_res_it.word()->word->bounding_box();
53  FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
54  (current_word_box.bottom() + current_word_box.top()) / 2);
55  if (!target_word_box->contains(center_pt)) {
56  page_res_it.forward();
57  continue;
58  }
59  }
60  if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
61  block_of_last_word = page_res_it.block();
62  }
63 
64  force_eol =
65  (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
66  (page_res_it.next_word() == nullptr);
67 
68  if (page_res_it.next_word() != nullptr) {
69  nextword = page_res_it.next_word()->word;
70  } else {
71  nextword = nullptr;
72  }
73  if (page_res_it.next_block() != nullptr) {
74  nextblock = page_res_it.next_block()->block;
75  } else {
76  nextblock = nullptr;
77  }
78  // regardless of tilde crunching
79  write_results(page_res_it,
80  determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
81  nextword, nextblock),
82  force_eol);
83  page_res_it.forward();
84  }
85 }
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:207
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:99

◆ ParseLanguageString()

void tesseract::Tesseract::ParseLanguageString ( const std::string &  lang_str,
std::vector< std::string > *  to_load,
std::vector< std::string > *  not_to_load 
)

Definition at line 246 of file tessedit.cpp.

247  {
248  std::string remains(lang_str);
249  // Look whether the model file uses a prefix which must be applied to
250  // included model files as well.
251  std::regex e("(.*)/[^/]*");
252  std::cmatch cm;
253  std::string prefix;
254  if (std::regex_match(lang.c_str(), cm, e, std::regex_constants::match_default)) {
255  // A prefix was found.
256  prefix = cm[1].str() + "/";
257  }
258  while (!remains.empty()) {
259  // Find the start of the lang code and which vector to add to.
260  const char *start = remains.c_str();
261  while (*start == '+') {
262  ++start;
263  }
264  std::vector<std::string> *target = to_load;
265  if (*start == '~') {
266  target = not_to_load;
267  ++start;
268  }
269  // Find the index of the end of the lang code in string start.
270  int end = strlen(start);
271  const char *plus = strchr(start, '+');
272  if (plus != nullptr && plus - start < end) {
273  end = plus - start;
274  }
275  std::string lang_code(start);
276  lang_code.resize(end);
277  std::string next(start + end);
278  remains = next;
279  lang_code = prefix + lang_code;
280  // Check whether lang_code is already in the target vector and add.
281  if (!IsStrInList(lang_code, *target)) {
282  target->push_back(lang_code);
283  }
284  }
285 }

◆ pgeditor_main()

void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES * page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 354 of file pgedit.cpp.

354  {
355  current_page_res = page_res;
356  if (current_page_res->block_res_list.empty()) {
357  return;
358  }
359 
360  recog_done = false;
361  stillRunning = true;
362 
363  build_image_window(width, height);
364  word_display_mode.set(DF_EDGE_STEP);
366 # ifndef GRAPHICS_DISABLED
367  pe = new ParamsEditor(this, image_win);
368 # endif
369  PGEventHandler pgEventHandler(this);
370 
371  image_win->AddEventHandler(&pgEventHandler);
372  image_win->AddMessageBox();
373 
374  SVMenuNode *svMenuRoot = build_menu_new();
375 
376  svMenuRoot->BuildMenu(image_win);
377  image_win->SetVisible(true);
378 
379  image_win->AwaitEvent(SVET_DESTROY);
380  image_win->AddEventHandler(nullptr);
381 }
@ SVET_DESTROY
Definition: scrollview.h:53
@ DF_EDGE_STEP
Edge steps.
Definition: werd.h:51
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:274
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:899
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:324
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
void SetVisible(bool visible)
Definition: scrollview.cpp:528
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:418
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:445

◆ pix_binary()

Image tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 208 of file tesseractclass.h.

208  {
209  return pix_binary_;
210  }

◆ pix_grey()

Image tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 211 of file tesseractclass.h.

211  {
212  return pix_grey_;
213  }

◆ pix_original()

Image tesseract::Tesseract::pix_original ( ) const
inline

Definition at line 218 of file tesseractclass.h.

218  {
219  return pix_original_;
220  }

◆ potential_word_crunch()

bool tesseract::Tesseract::potential_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level,
bool  ok_dict_word 
)

Definition at line 488 of file docqual.cpp.

489  {
490  float rating_per_ch;
491  int adjusted_len;
492  const char *str = word->best_choice->unichar_string().c_str();
493  const char *lengths = word->best_choice->unichar_lengths().c_str();
494  bool word_crunchable;
495  int poor_indicator_count = 0;
496 
497  word_crunchable =
498  !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
499  (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);
500 
501  adjusted_len = word->reject_map.length();
502  if (adjusted_len > 10) {
503  adjusted_len = 10;
504  }
505  rating_per_ch = word->best_choice->rating() / adjusted_len;
506 
507  if (rating_per_ch > crunch_pot_poor_rate) {
508  if (crunch_debug > 2) {
509  tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
510  }
511  poor_indicator_count++;
512  }
513 
514  if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
515  if (crunch_debug > 2) {
516  tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
517  }
518  poor_indicator_count++;
519  }
520 
521  if (garbage_level != G_OK) {
522  if (crunch_debug > 2) {
523  tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
524  }
525  poor_indicator_count++;
526  }
527  return poor_indicator_count >= crunch_pot_indicators;
528 }

◆ PreenXHeights()

void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST *  block_list)

Any row xheight that is significantly different from the median is set to the median.

Definition at line 174 of file applybox.cpp.

174  {
175  const double median_xheight = MedianXHeight(block_list);
176  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
177  // Strip all fuzzy space markers to simplify the PAGE_RES.
178  BLOCK_IT b_it(block_list);
179  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
180  BLOCK *block = b_it.data();
181  ROW_IT r_it(block->row_list());
182  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
183  ROW *row = r_it.data();
184  const double diff = fabs(row->x_height() - median_xheight);
185  if (diff > max_deviation) {
186  if (applybox_debug) {
187  tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
188  }
189  row->set_x_height(static_cast<float>(median_xheight));
190  }
191  }
192  }
193 }
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:36

◆ PrepareForPageseg()

void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 551 of file tesseractclass.cpp.

551  {
552  textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
553  // Find the max splitter strategy over all langs.
554  auto max_pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
555  static_cast<int32_t>(pageseg_devanagari_split_strategy));
556  for (auto &sub_lang : sub_langs_) {
557  auto pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
558  static_cast<int32_t>(sub_lang->pageseg_devanagari_split_strategy));
559  if (pageseg_strategy > max_pageseg_strategy) {
560  max_pageseg_strategy = pageseg_strategy;
561  }
562  sub_lang->pix_binary_.destroy();
563  sub_lang->pix_binary_ = pix_binary().clone();
564  }
565  // Perform shiro-rekha (top-line) splitting and replace the current image by
566  // the newly split image.
567  splitter_.set_orig_pix(pix_binary());
568  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
569  if (splitter_.Split(true, &pixa_debug_)) {
570  ASSERT_HOST(splitter_.splitted_image());
571  pix_binary_.destroy();
572  pix_binary_ = splitter_.splitted_image().clone();
573  }
574 }
Image pix_binary() const
Image clone() const
Definition: image.cpp:24
void set_pageseg_split_strategy(SplitStrategy strategy)
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:101

◆ PrepareForTessOCR()

void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract *  osd_tess,
OSResults *  osr 
)

Definition at line 581 of file tesseractclass.cpp.

581  {
582  // Find the max splitter strategy over all langs.
583  auto max_ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
584  static_cast<int32_t>(ocr_devanagari_split_strategy));
585  for (auto &sub_lang : sub_langs_) {
586  auto ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
587  static_cast<int32_t>(sub_lang->ocr_devanagari_split_strategy));
588  if (ocr_strategy > max_ocr_strategy) {
589  max_ocr_strategy = ocr_strategy;
590  }
591  }
592  // Utilize the segmentation information available.
593  splitter_.set_segmentation_block_list(block_list);
594  splitter_.set_ocr_split_strategy(max_ocr_strategy);
595  // Run the splitter for OCR
596  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
597  // Restore pix_binary to the binarized original pix for future reference.
598  ASSERT_HOST(splitter_.orig_pix());
599  pix_binary_.destroy();
600  pix_binary_ = splitter_.orig_pix().clone();
601  // If the pageseg and ocr strategies are different, refresh the block list
602  // (from the last SegmentImage call) with blobs from the real image to be used
603  // for OCR.
604  if (splitter_.HasDifferentSplitStrategies()) {
605  BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_), pixGetHeight(pix_binary_));
606  Image pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : splitter_.orig_pix();
607  extract_edges(pix_for_ocr, &block);
608  splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
609  }
610  // The splitter isn't needed any more after this, so save memory by clearing.
611  splitter_.Clear();
612 }
void extract_edges(Image pix, BLOCK *block)
Definition: edgblob.cpp:347
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
void set_segmentation_block_list(BLOCK_LIST *block_list)
void set_ocr_split_strategy(SplitStrategy strategy)

◆ PrerecAllWordsPar()

void tesseract::Tesseract::PrerecAllWordsPar ( const std::vector< WordData > &  words)

Definition at line 38 of file par_control.cpp.

38  {
39  // Prepare all the blobs.
40  std::vector<BlobData> blobs;
41  for (const auto &w : words) {
42  if (w.word->ratings != nullptr && w.word->ratings->get(0, 0) == nullptr) {
43  for (size_t s = 0; s < w.lang_words.size(); ++s) {
44  Tesseract *sub = s < sub_langs_.size() ? sub_langs_[s] : this;
45  const WERD_RES &word = *w.lang_words[s];
46  for (unsigned b = 0; b < word.chopped_word->NumBlobs(); ++b) {
47  blobs.emplace_back(b, sub, word);
48  }
49  }
50  }
51  }
52  // Pre-classify all the blobs.
53  if (tessedit_parallelize > 1) {
54 #ifdef _OPENMP
55 # pragma omp parallel for num_threads(10)
56 #endif // _OPENMP
57  // NOLINTNEXTLINE(modernize-loop-convert)
58  for (size_t b = 0; b < blobs.size(); ++b) {
59  *blobs[b].choices =
60  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", ScrollView::WHITE, nullptr);
61  }
62  } else {
63  // TODO(AMD) parallelize this.
64  for (auto &blob : blobs) {
65  *blob.choices = blob.tesseract->classify_blob(blob.blob, "par", ScrollView::WHITE, nullptr);
66  }
67  }
68 }

◆ process_cmd_win_event()

bool tesseract::Tesseract::process_cmd_win_event ( int32_t  cmd_event,
char *  new_value 
)

process_cmd_win_event()

Process a command returned from the command window (just call the appropriate command handler).

Definition at line 390 of file pgedit.cpp.

393  {
394  char msg[160];
395  bool exit = false;
396 
397  color_mode = CM_RAINBOW;
398 
399  // Run recognition on the full page if needed.
400  switch (cmd_event) {
401  case BLAMER_CMD_EVENT:
405  case SHOW_BOLD_CMD_EVENT:
411  if (!recog_done) {
412  recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
413  recog_done = true;
414  }
415  break;
416  default:
417  break;
418  }
419 
420  char *parameter;
421 
422  switch (cmd_event) {
423  case NULL_CMD_EVENT:
424  break;
425 
427  case DUMP_WERD_CMD_EVENT:
430  case RECOG_WERDS:
431  case RECOG_PSEUDO:
432  case SHOW_BLOB_FEATURES:
433  mode = static_cast<CMD_EVENTS>(cmd_event);
434  break;
436  mode = DEBUG_WERD_CMD_EVENT;
437  parameter = image_win->ShowInputDialog("Config File Name");
438  word_config_ = parameter;
439  delete[] parameter;
440  break;
442  if (new_value[0] == 'T') {
443  word_display_mode.set(DF_BOX);
444  } else {
445  word_display_mode.reset(DF_BOX);
446  }
447  mode = CHANGE_DISP_CMD_EVENT;
448  break;
449  case BLAMER_CMD_EVENT:
450  if (new_value[0] == 'T') {
451  word_display_mode.set(DF_BLAMER);
452  } else {
453  word_display_mode.reset(DF_BLAMER);
454  }
456  mode = CHANGE_DISP_CMD_EVENT;
457  break;
459  if (new_value[0] == 'T') {
460  word_display_mode.set(DF_TEXT);
461  } else {
462  word_display_mode.reset(DF_TEXT);
463  }
464  mode = CHANGE_DISP_CMD_EVENT;
465  break;
466  case POLYGONAL_CMD_EVENT:
467  if (new_value[0] == 'T') {
468  word_display_mode.set(DF_POLYGONAL);
469  } else {
470  word_display_mode.reset(DF_POLYGONAL);
471  }
472  mode = CHANGE_DISP_CMD_EVENT;
473  break;
474  case BL_NORM_CMD_EVENT:
475  if (new_value[0] == 'T') {
476  word_display_mode.set(DF_BN_POLYGONAL);
477  } else {
478  word_display_mode.reset(DF_BN_POLYGONAL);
479  }
480  mode = CHANGE_DISP_CMD_EVENT;
481  break;
482  case BITMAP_CMD_EVENT:
483  if (new_value[0] == 'T') {
484  word_display_mode.set(DF_EDGE_STEP);
485  } else {
486  word_display_mode.reset(DF_EDGE_STEP);
487  }
488  mode = CHANGE_DISP_CMD_EVENT;
489  break;
492  break;
493  case IMAGE_CMD_EVENT:
494  display_image = (new_value[0] == 'T');
496  break;
497  case BLOCKS_CMD_EVENT:
498  display_blocks = (new_value[0] == 'T');
500  break;
501  case BASELINES_CMD_EVENT:
502  display_baselines = (new_value[0] == 'T');
504  break;
506  color_mode = CM_SUBSCRIPT;
508  break;
510  color_mode = CM_SUPERSCRIPT;
512  break;
514  color_mode = CM_ITALIC;
516  break;
517  case SHOW_BOLD_CMD_EVENT:
518  color_mode = CM_BOLD;
520  break;
522  color_mode = CM_UNDERLINE;
524  break;
526  color_mode = CM_FIXEDPITCH;
528  break;
530  color_mode = CM_SERIF;
532  break;
534  color_mode = CM_SMALLCAPS;
536  break;
538  color_mode = CM_DROPCAPS;
540  break;
541  case REFRESH_CMD_EVENT:
543  break;
544  case QUIT_CMD_EVENT:
545  exit = true;
547  break;
548 
549  default:
550  snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
551  image_win->AddMessage(msg);
552  break;
553  }
554  return exit;
555 }
@ NULL_CMD_EVENT
Definition: pgedit.cpp:48
@ CM_ITALIC
Definition: pgedit.cpp:84
@ CM_SUBSCRIPT
Definition: pgedit.cpp:82
@ CM_RAINBOW
Definition: pgedit.cpp:81
@ CM_FIXEDPITCH
Definition: pgedit.cpp:87
@ CM_BOLD
Definition: pgedit.cpp:85
@ CM_SMALLCAPS
Definition: pgedit.cpp:89
@ CM_SUPERSCRIPT
Definition: pgedit.cpp:83
@ CM_SERIF
Definition: pgedit.cpp:88
@ CM_DROPCAPS
Definition: pgedit.cpp:90
@ CM_UNDERLINE
Definition: pgedit.cpp:86
@ DF_POLYGONAL
Polygonal approximation.
Definition: werd.h:50
@ DF_BLAMER
Blamer information.
Definition: werd.h:53
@ DF_BOX
Bounding box.
Definition: werd.h:48
@ DF_BN_POLYGONAL
BL-normalised polygonal approximation.
Definition: werd.h:52
@ DF_TEXT
Correct ascii.
Definition: werd.h:49
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:701
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:735
void AddMessage(const char *message)
Definition: scrollview.cpp:546
static void Exit()
Definition: scrollview.cpp:572

◆ process_image_event()

void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.

Definition at line 566 of file pgedit.cpp.

567  {
568  // The following variable should remain static, since it is used by
569  // debug editor, which uses a single Tesseract instance.
570  static ICOORD down;
571  ICOORD up;
572  TBOX selection_box;
573  char msg[80];
574 
575  switch (event.type) {
576  case SVET_SELECTION:
577  if (event.type == SVET_SELECTION) {
578  down.set_x(event.x + event.x_size);
579  down.set_y(event.y + event.y_size);
580  if (mode == SHOW_POINT_CMD_EVENT) {
581  show_point(current_page_res, event.x, event.y);
582  }
583  }
584 
585  up.set_x(event.x);
586  up.set_y(event.y);
587 
588  selection_box = TBOX(down, up);
589 
590  switch (mode) {
592  process_selected_words(current_page_res, selection_box,
594  break;
595  case DUMP_WERD_CMD_EVENT:
596  process_selected_words(current_page_res, selection_box,
598  break;
600  process_selected_words(current_page_res, selection_box,
602  break;
604  debug_word(current_page_res, selection_box);
605  break;
607  break; // ignore up event
608 
609  case RECOG_WERDS:
610 # ifndef DISABLED_LEGACY_ENGINE
611  image_win->AddMessage("Recogging selected words");
612  this->process_selected_words(current_page_res, selection_box,
614 # endif // ndef DISABLED_LEGACY_ENGINE
615  break;
616  case RECOG_PSEUDO:
617  image_win->AddMessage("Recogging selected blobs");
618  recog_pseudo_word(current_page_res, selection_box);
619  break;
620  case SHOW_BLOB_FEATURES:
621  blob_feature_display(current_page_res, selection_box);
622  break;
623 
624  default:
625  sprintf(msg, "Mode %d not yet implemented", mode);
626  image_win->AddMessage(msg);
627  break;
628  }
629  default:
630  break;
631  }
632 }
@ SVET_SELECTION
Definition: scrollview.h:56
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:76
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62
bool word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:676
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
bool word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:876
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:666
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:639
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:912

◆ process_selected_words()

void tesseract::Tesseract::process_selected_words ( PAGE_RES *  page_res,
TBOX &  selection_box,
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_processor 
)

Definition at line 30 of file pagewalk.cpp.

32  {
33  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr; page_res_it.forward()) {
34  WERD *word = page_res_it.word()->word;
35  if (word->bounding_box().overlap(selection_box)) {
36  if (!(this->*word_processor)(&page_res_it)) {
37  return;
38  }
39  }
40  }
41 }

◆ ProcessTargetWord()

bool tesseract::Tesseract::ProcessTargetWord ( const TBOX word_box,
const TBOX target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 118 of file control.cpp.

119  {
120  if (word_config != nullptr) {
121  if (word_box.major_overlap(target_word_box)) {
122  if (backup_config_file_ == nullptr) {
123  backup_config_file_ = kBackUpConfigFile;
124  FILE *config_fp = fopen(backup_config_file_, "wb");
125  if (config_fp == nullptr) {
126  tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
127  } else {
128  ParamUtils::PrintParams(config_fp, params());
129  fclose(config_fp);
130  }
132  }
133  } else {
134  if (backup_config_file_ != nullptr) {
136  backup_config_file_ = nullptr;
137  }
138  }
139  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
140  return false;
141  }
142  return true;
143 }
const char *const kBackUpConfigFile
Definition: control.cpp:47
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:40
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41

◆ quality_based_rejection()

void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
bool  good_quality_doc 
)

Definition at line 120 of file docqual.cpp.

120  {
121  if ((tessedit_good_quality_unrej && good_quality_doc)) {
122  unrej_good_quality_words(page_res_it);
123  }
124  doc_and_block_rejection(page_res_it, good_quality_doc);
125  if (unlv_tilde_crunching) {
126  tilde_crunch(page_res_it);
127  tilde_delete(page_res_it);
128  }
129 }
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:530
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:373
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:210
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:142

◆ read_config_file()

void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 48 of file tessedit.cpp.

48  {
49  std::string path = datadir;
50  path += "configs/";
51  path += filename;
52  FILE *fp;
53  if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
54  fclose(fp);
55  } else {
56  path = datadir;
57  path += "tessconfigs/";
58  path += filename;
59  if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
60  fclose(fp);
61  } else {
62  path = filename;
63  }
64  }
65  ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
66 }

◆ ReassignDiacritics()

bool tesseract::Tesseract::ReassignDiacritics ( int  pass,
PAGE_RES_IT *  pr_it,
bool *  make_next_word_fuzzy 
)

Definition at line 914 of file control.cpp.

914  {
915  *make_next_word_fuzzy = false;
916  WERD *real_word = pr_it->word()->word;
917  if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||
918  real_word->rej_cblob_list()->length() > noise_maxperword) {
919  return false;
920  }
921  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
922  // Get the noise outlines into a vector with matching bool map.
923  std::vector<C_OUTLINE *> outlines;
924  real_word->GetNoiseOutlines(&outlines);
925  std::vector<bool> word_wanted;
926  std::vector<bool> overlapped_any_blob;
927  std::vector<C_BLOB *> target_blobs;
928  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
929  &overlapped_any_blob, &target_blobs);
930  // Filter the outlines that overlapped any blob and put them into the word
931  // now. This simplifies the remaining task and also makes it more accurate
932  // as it has more completed blobs to work on.
933  std::vector<bool> wanted;
934  std::vector<C_BLOB *> wanted_blobs;
935  std::vector<C_OUTLINE *> wanted_outlines;
936  int num_overlapped = 0;
937  int num_overlapped_used = 0;
938  for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {
939  if (overlapped_any_blob[i]) {
940  ++num_overlapped;
941  if (word_wanted[i]) {
942  ++num_overlapped_used;
943  }
944  wanted.push_back(word_wanted[i]);
945  wanted_blobs.push_back(target_blobs[i]);
946  wanted_outlines.push_back(outlines[i]);
947  outlines[i] = nullptr;
948  }
949  }
950  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
951  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);
952  int non_overlapped = 0;
953  int non_overlapped_used = 0;
954  for (unsigned i = 0; i < word_wanted.size(); ++i) {
955  if (word_wanted[i]) {
956  ++non_overlapped_used;
957  }
958  if (outlines[i] != nullptr) {
959  ++non_overlapped_used;
960  }
961  }
962  if (debug_noise_removal) {
963  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", num_overlapped_used,
964  num_overlapped, non_overlapped_used, non_overlapped);
965  real_word->bounding_box().print();
966  }
967  // Now we have decided which outlines we want, put them into the real_word.
968  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
969  pr_it->MakeCurrentWordFuzzy();
970  }
971  // TODO(rays) Parts of combos have a deep copy of the real word, and need
972  // to have their noise outlines moved/assigned in the same way!!
973  return num_overlapped_used != 0 || non_overlapped_used != 0;
974 }
void AssignDiacriticsToNewBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< C_BLOB * > *target_blobs)
Definition: control.cpp:1036
void AssignDiacriticsToOverlappingBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< bool > *overlapped_any_blob, std::vector< C_BLOB * > *target_blobs)
Definition: control.cpp:981
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124

◆ recog_all_words()

bool tesseract::Tesseract::recog_all_words ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond, ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies the rectangle to which extraction is restricted
dopasses: 0 - all, 1 - just pass 1, 2 - passes 2 and higher

Definition at line 287 of file control.cpp.

289  {
290  PAGE_RES_IT page_res_it(page_res);
291 
292  if (tessedit_minimal_rej_pass1) {
293  tessedit_test_adaption.set_value(true);
294  tessedit_minimal_rejection.set_value(true);
295  }
296 
297  if (dopasses == 0 || dopasses == 1) {
298  page_res_it.restart_page();
299  // ****************** Pass 1 *******************
300 
301 #ifndef DISABLED_LEGACY_ENGINE
302  // If the adaptive classifier is full switch to one we prepared earlier,
303  // ie on the previous page. If the current adaptive classifier is non-empty,
304  // prepare a backup starting at this page, in case it fills up. Do all this
305  // independently for each language.
306  if (AdaptiveClassifierIsFull()) {
308  } else if (!AdaptiveClassifierIsEmpty()) {
310  }
311  // Now check the sub-langs as well.
312  for (auto &lang : sub_langs_) {
313  if (lang->AdaptiveClassifierIsFull()) {
314  lang->SwitchAdaptiveClassifier();
315  } else if (!lang->AdaptiveClassifierIsEmpty()) {
316  lang->StartBackupAdaptiveClassifier();
317  }
318  }
319 
320 #endif // ndef DISABLED_LEGACY_ENGINE
321 
322  // Set up all words ready for recognition, so that if parallelism is on
323  // all the input and output classes are ready to run the classifier.
324  std::vector<WordData> words;
325  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
326 #ifndef DISABLED_LEGACY_ENGINE
327  if (tessedit_parallelize) {
328  PrerecAllWordsPar(words);
329  }
330 #endif // ndef DISABLED_LEGACY_ENGINE
331 
332  stats_.word_count = words.size();
333 
334  stats_.dict_words = 0;
335  stats_.doc_blob_quality = 0;
336  stats_.doc_outline_errs = 0;
337  stats_.doc_char_quality = 0;
338  stats_.good_char_count = 0;
339  stats_.doc_good_char_quality = 0;
340 
341  most_recently_used_ = this;
342  // Run pass 1 word recognition.
343  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
344  return false;
345  }
346  // Pass 1 post-processing.
347  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
348  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
349  fix_rep_char(&page_res_it);
350  continue;
351  }
352 
353  // Count dict words.
354  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
355  ++(stats_.dict_words);
356  }
357 
358  // Update misadaption log (we only need to do it on pass 1, since
359  // adaption only happens on this pass).
360  if (page_res_it.word()->blamer_bundle != nullptr &&
361  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
362  page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
363  }
364  }
365  }
366 
367  if (dopasses == 1) {
368  return true;
369  }
370 
371 #ifndef DISABLED_LEGACY_ENGINE
372 
373  // ****************** Pass 2 *******************
374  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
375  page_res_it.restart_page();
376  std::vector<WordData> words;
377  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
378  if (tessedit_parallelize) {
379  PrerecAllWordsPar(words);
380  }
381  most_recently_used_ = this;
382  // Run pass 2 word recognition.
383  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
384  return false;
385  }
386  }
387 
388  // The next passes are only required for Tess-only.
389  if (AnyTessLang() && !AnyLSTMLang()) {
390  // ****************** Pass 3 *******************
391  // Fix fuzzy spaces.
392 
393  if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
394  !right_to_left()) {
395  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
396  }
397 
398  // ****************** Pass 4 *******************
399  if (tessedit_enable_dict_correction) {
400  dictionary_correction_pass(page_res);
401  }
402  if (tessedit_enable_bigram_correction) {
403  bigram_correction_pass(page_res);
404  }
405 
406  // ****************** Pass 5,6 *******************
407  rejection_passes(page_res, monitor, target_word_box, word_config);
408 
409  // ****************** Pass 8 *******************
410  font_recognition_pass(page_res);
411 
412  // ****************** Pass 9 *******************
413  // Check the correctness of the final results.
414  blamer_pass(page_res);
415  script_pos_pass(page_res);
416  }
417 
418 #endif // ndef DISABLED_LEGACY_ENGINE
419 
420  // Write results pass.
421  // This is now redundant, but retained commented so show how to obtain
422  // bounding boxes and style information.
423 
424 #ifndef DISABLED_LEGACY_ENGINE
425  // changed by jetsoft
426  // needed for dll to output memory structure
427  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
428  output_pass(page_res_it, target_word_box);
429  }
430 // end jetsoft
431 #endif // ndef DISABLED_LEGACY_ENGINE
432 
433  const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
434  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
435 
436  // Remove empty words, as these mess up the result iterators.
437  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
438  const WERD_RES *word = page_res_it.word();
439  const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
440  ? page_res_it.block()->block->pdblk.poly_block()
441  : nullptr;
442  if (word->best_choice == nullptr || word->best_choice->empty() ||
443  (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
444  page_res_it.DeleteCurrentWord();
445  }
446  }
447 
448  if (monitor != nullptr) {
449  monitor->progress = 100;
450  }
451  return true;
452 }
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:456
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1665
void PrerecAllWordsPar(const std::vector< WordData > &words)
Definition: par_control.cpp:38
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:599
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:39
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, std::vector< WordData > *words)
Definition: control.cpp:198
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2069
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2015
bool AnyTessLang() const
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:77
bool right_to_left() const
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:707
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:683
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, std::vector< WordData > *words)
Definition: control.cpp:146
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:268
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:625
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:609
bool AdaptiveClassifierIsFull() const
Definition: classify.h:265
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:264

◆ recog_interactive()

bool tesseract::Tesseract::recog_interactive ( PAGE_RES_IT *  pr_it)

Recognize a single word in interactive mode.

Parameters
pr_it: the page results iterator

Definition at line 76 of file control.cpp.

76  {
77  WordData word_data(*pr_it);
78  SetupWordPassN(2, &word_data);
79  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
80  if (lstm_recognizer_ == nullptr) {
81 #ifndef DISABLED_LEGACY_ENGINE
82  classify_word_and_language(2, pr_it, &word_data);
83 #endif // ndef DISABLED_LEGACY_ENGINE
84  } else {
85  classify_word_and_language(1, pr_it, &word_data);
86  }
87 #ifndef DISABLED_LEGACY_ENGINE
88  if (tessedit_debug_quality_metrics) {
89  int16_t char_qual;
90  int16_t good_char_qual;
91  WERD_RES *word_res = pr_it->word();
92  word_char_quality(word_res, &char_qual, &good_char_qual);
93  tprintf(
94  "\n%d chars; word_blob_quality: %d; outline_errs: %d; "
95  "char_quality: %d; good_char_quality: %d\n",
96  word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res),
97  char_qual, good_char_qual);
98  }
99 #endif // ndef DISABLED_LEGACY_ENGINE
100  return true;
101 }
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:51
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:62

◆ recog_pseudo_word()

void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES *  page_res,
TBOX &  selection_box 
)

Definition at line 62 of file control.cpp.

62  {
63  PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);
64  if (it != nullptr) {
66  it->DeleteCurrentWord();
67  delete it;
68  }
69 }

◆ recog_training_segmented()

void tesseract::Tesseract::recog_training_segmented ( const char *  filename,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 86 of file recogtraining.cpp.

87  {
88  std::string box_fname = filename;
89  const char *lastdot = strrchr(box_fname.c_str(), '.');
90  if (lastdot != nullptr) {
91  box_fname[lastdot - box_fname.c_str()] = '\0';
92  }
93  box_fname += ".box";
94  // ReadNextBox() will close box_file
95  FILE *box_file = fopen(box_fname.c_str(), "r");
96  if (box_file == nullptr) {
97  tprintf("Error: Could not open file %s\n", box_fname.c_str());
98  ASSERT_HOST(box_file);
99  }
100 
101  PAGE_RES_IT page_res_it;
102  page_res_it.page_res = page_res;
103  page_res_it.restart_page();
104  std::string label;
105 
106  // Process all the words on this page.
107  TBOX tbox; // tesseract-identified box
108  TBOX bbox; // box from the box file
109  bool keep_going;
110  int line_number = 0;
111  int examined_words = 0;
112  do {
113  keep_going = read_t(&page_res_it, &tbox);
114  keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
115  // Align bottom left points of the TBOXes.
116  while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
117  if (bbox.bottom() < tbox.bottom()) {
118  page_res_it.forward();
119  keep_going = read_t(&page_res_it, &tbox);
120  } else {
121  keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
122  }
123  }
124  while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
125  if (bbox.left() > tbox.left()) {
126  page_res_it.forward();
127  keep_going = read_t(&page_res_it, &tbox);
128  } else {
129  keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
130  }
131  }
132  // OCR the word if top right points of the TBOXes are similar.
133  if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
134  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
135  ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
136  examined_words++;
137  }
138  page_res_it.forward();
139  } while (keep_going);
140 
141  // Set up scripts on all of the words that did not get sent to
142  // ambigs_classify_and_output. They all should have, but if all the
143  // werd_res's don't get uch_sets, tesseract will crash when you try
144  // to iterate over them. :-(
145  int total_words = 0;
146  for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
147  if (page_res_it.word()) {
148  if (page_res_it.word()->uch_set == nullptr) {
149  page_res_it.word()->SetupFake(unicharset);
150  }
151  total_words++;
152  }
153  }
154  if (examined_words < 0.85 * total_words) {
155  tprintf(
156  "TODO(antonova): clean up recog_training_segmented; "
157  " It examined only a small fraction of the ambigs image.\n");
158  }
159  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
160 }
const int16_t kMaxBoxEdgeDiff
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:146
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)

◆ recog_word()

void tesseract::Tesseract::recog_word ( WERD_RES *  word)

Definition at line 37 of file tfacepp.cpp.

37  {
38  if (wordrec_skip_no_truth_words &&
39  (word->blamer_bundle == nullptr ||
40  word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
41  if (classify_debug_level) {
42  tprintf("No truth for word - skipping\n");
43  }
44  word->tess_failed = true;
45  return;
46  }
47  ASSERT_HOST(!word->chopped_word->blobs.empty());
48  recog_word_recursive(word);
49  word->SetupBoxWord();
50  ASSERT_HOST(static_cast<unsigned>(word->best_choice->length()) == word->box_word->length());
51  // Check that the ratings matrix size matches the sum of all the
52  // segmentation states.
53  if (!word->StatesAllValid()) {
54  tprintf("Not all words have valid states relative to ratings matrix!!");
55  word->DebugWordChoices(true, nullptr);
56  ASSERT_HOST(word->StatesAllValid());
57  }
58  if (tessedit_override_permuter) {
59  /* Override the permuter type if a straight dictionary check disagrees. */
60  uint8_t perm_type = word->best_choice->permuter();
61  if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) &&
62  (perm_type != USER_DAWG_PERM)) {
63  uint8_t real_dict_perm_type = dict_word(*word->best_choice);
64  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) ||
65  (real_dict_perm_type == USER_DAWG_PERM)) &&
66  (alpha_count(word->best_choice->unichar_string().c_str(),
67  word->best_choice->unichar_lengths().c_str()) > 0)) {
68  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
69  }
70  }
71  if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {
72  tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter());
73  }
74  }
75  // Factored out from control.cpp
76  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
77  if (word->best_choice == nullptr || word->best_choice->empty() ||
78  strspn(word->best_choice->unichar_string().c_str(), " ") ==
79  word->best_choice->length()) {
80  word->tess_failed = true;
81  word->reject_map.initialise(word->box_word->length());
82  word->reject_map.rej_word_tess_failure();
83  } else {
84  word->tess_failed = false;
85  }
86 }
@ IRR_NO_TRUTH
Definition: blamer.h:98
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:94

◆ recog_word_recursive()

void tesseract::Tesseract::recog_word_recursive ( WERD_RES *  word)

Definition at line 94 of file tfacepp.cpp.

94  {
95  auto word_length = word->chopped_word->NumBlobs(); // no of blobs
96  if (word_length > MAX_UNDIVIDED_LENGTH) {
97  return split_and_recog_word(word);
98  }
99  cc_recog(word);
100  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
101 
102  // Do sanity checks and minor fixes on best_choice.
103  if (word->best_choice->length() > word_length) {
104  word->best_choice->make_bad(); // should never happen
105  tprintf(
106  "recog_word: Discarded long string \"%s\""
107  " (%d characters vs %d blobs)\n",
108  word->best_choice->unichar_string().c_str(), word->best_choice->length(), word_length);
109  tprintf("Word is at:");
110  word->word->bounding_box().print();
111  }
112  if (word->best_choice->length() < word_length) {
113  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
114  while (word->best_choice->length() < word_length) {
115  word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty());
116  }
117  }
118 }
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:28
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:126
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
void cc_recog(WERD_RES *word)
Definition: tface.cpp:119

◆ RecogAllWordsPassN()

bool tesseract::Tesseract::RecogAllWordsPassN ( int  pass_n,
ETEXT_DESC *  monitor,
PAGE_RES_IT *  pr_it,
std::vector< WordData > *  words 
)

Definition at line 198 of file control.cpp.

199  {
200  // TODO(rays) Before this loop can be parallelized (it would yield a massive
201  // speed-up) all remaining member globals need to be converted to local/heap
202  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
203  // added. The results will be significantly different with adaption on, and
204  // deterioration will need investigation.
205  pr_it->restart_page();
206  for (unsigned w = 0; w < words->size(); ++w) {
207  WordData *word = &(*words)[w];
208  if (w > 0) {
209  word->prev_word = &(*words)[w - 1];
210  }
211  if (monitor != nullptr) {
212  monitor->ocr_alive = true;
213  if (pass_n == 1) {
214  monitor->progress = 70 * w / words->size();
215  } else {
216  monitor->progress = 70 + 30 * w / words->size();
217  }
218  if (monitor->progress_callback2 != nullptr) {
219  TBOX box = pr_it->word()->word->bounding_box();
220  (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
221  }
222  if (monitor->deadline_exceeded() ||
223  (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
224  // Timeout. Fake out the rest of the words.
225  for (; w < words->size(); ++w) {
226  (*words)[w].word->SetupFake(unicharset);
227  }
228  return false;
229  }
230  }
231  if (word->word->tess_failed) {
232  unsigned s;
233  for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
234  }
235  // If all are failed, skip it. Image words are skipped by this test.
236  if (s > word->lang_words.size()) {
237  continue;
238  }
239  }
240  // Sync pr_it with the WordData.
241  while (pr_it->word() != nullptr && pr_it->word() != word->word) {
242  pr_it->forward();
243  }
244  ASSERT_HOST(pr_it->word() != nullptr);
245  bool make_next_word_fuzzy = false;
246 #ifndef DISABLED_LEGACY_ENGINE
247  if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
248  // Needs to be setup again to see the new outlines in the chopped_word.
249  SetupWordPassN(pass_n, word);
250  }
251 #endif // ndef DISABLED_LEGACY_ENGINE
252 
253  classify_word_and_language(pass_n, pr_it, word);
254  if (tessedit_dump_choices || debug_noise_removal) {
255  tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
256  word->word->best_choice->debug_string().c_str());
257  }
258  pr_it->forward();
259  if (make_next_word_fuzzy && pr_it->word() != nullptr) {
260  pr_it->MakeCurrentWordFuzzy();
261  }
262  }
263  return true;
264 }
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:914

◆ recognize_page()

void tesseract::Tesseract::recognize_page ( std::string &  image_name)

◆ reject_edge_blobs()

void tesseract::Tesseract::reject_edge_blobs ( WERD_RES *  word)

Definition at line 260 of file reject.cpp.

260  {
261  TBOX word_box = word->word->bounding_box();
262  // Use the box_word as it is already denormed back to image coordinates.
263  int blobcount = word->box_word->length();
264 
265  if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||
266  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
267  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
268  ASSERT_HOST(word->reject_map.length() == blobcount);
269  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
270  TBOX blob_box = word->box_word->BlobBox(blobindex);
271  if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||
272  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
273  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
274  word->reject_map[blobindex].setrej_edge_char();
275  // Close to edge
276  }
277  }
278  }
279 }

◆ reject_I_1_L()

void tesseract::Tesseract::reject_I_1_L ( WERD_RES *  word)

Definition at line 195 of file reject.cpp.

195  {
196  int16_t i;
197  int16_t offset;
198 
199  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
200  offset += word->best_choice->unichar_lengths()[i], i += 1) {
201  if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {
202  // rej 1Il conflict
203  word->reject_map[i].setrej_1Il_conflict();
204  }
205  }
206 }

◆ reject_mostly_rejects()

void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES *  word)

Definition at line 556 of file reject.cpp.

556  {
557  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
558 
559  if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
560  rej_whole_of_mostly_reject_word_fract) {
561  word->reject_map.rej_word_mostly_rej();
562  }
563 }

◆ rejection_passes()

void tesseract::Tesseract::rejection_passes ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config 
)

Definition at line 599 of file control.cpp.

600  {
601  PAGE_RES_IT page_res_it(page_res);
602  // ****************** Pass 5 *******************
603  // Gather statistics on rejects.
604  int word_index = 0;
605  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
606  WERD_RES *word = page_res_it.word();
607  word_index++;
608  if (monitor != nullptr) {
609  monitor->ocr_alive = true;
610  monitor->progress = 95 + 5 * word_index / stats_.word_count;
611  }
612  if (word->rebuild_word == nullptr) {
613  // Word was not processed by tesseract.
614  page_res_it.forward();
615  continue;
616  }
617  check_debug_pt(word, 70);
618 
619  // changed by jetsoft
620  // specific to its needs to extract one word when need
621  if (target_word_box &&
622  !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
623  page_res_it.forward();
624  continue;
625  }
626  // end jetsoft
627 
628  page_res_it.rej_stat_word();
629  const int chars_in_word = word->reject_map.length();
630  const int rejects_in_word = word->reject_map.reject_count();
631 
632  const int blob_quality = word_blob_quality(word);
633  stats_.doc_blob_quality += blob_quality;
634  const int outline_errs = word_outline_errs(word);
635  stats_.doc_outline_errs += outline_errs;
636  int16_t all_char_quality;
637  int16_t accepted_all_char_quality;
638  word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
639  stats_.doc_char_quality += all_char_quality;
640  const uint8_t permuter_type = word->best_choice->permuter();
641  if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
642  (permuter_type == USER_DAWG_PERM)) {
643  stats_.good_char_count += chars_in_word - rejects_in_word;
644  stats_.doc_good_char_quality += accepted_all_char_quality;
645  }
646  check_debug_pt(word, 80);
647  if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
648  word->reject_map.rej_word_bad_quality();
649  }
650  check_debug_pt(word, 90);
651  page_res_it.forward();
652  }
653 
654  if (tessedit_debug_quality_metrics) {
655  tprintf(
656  "QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
657  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
658  page_res->char_count, page_res->rej_count,
659  page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
660  stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
661  stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
662  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
663  stats_.doc_good_char_quality,
664  (stats_.good_char_count > 0)
665  ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
666  : 0.0);
667  }
668  bool good_quality_doc =
669  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
670  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
671  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
672  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);
673 
674  // ****************** Pass 6 *******************
675  // Do whole document or whole block rejection pass
676  if (!tessedit_test_adaption) {
677  quality_based_rejection(page_res_it, good_quality_doc);
678  }
679 }
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:120
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:118

◆ repeated_nonalphanum_wd()

bool tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES *  word,
ROW row 
)

Definition at line 565 of file reject.cpp.

565  {
566  if (word->best_choice->unichar_lengths().length() <= 1) {
567  return false;
568  }
569 
570  if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) {
571  return false;
572  }
573 
574  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
575  for (unsigned i = 1; i < word->best_choice->length(); ++i) {
576  if (word->best_choice->unichar_id(i) != uch_id) {
577  return false;
578  }
579  }
580 
581  int16_t char_quality;
582  int16_t accepted_char_quality;
583  word_char_quality(word, &char_quality, &accepted_char_quality);
584 
585  if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&
586  (char_quality == accepted_char_quality)) {
587  return true;
588  } else {
589  return false;
590  }
591 }

◆ ReportFailedBox()

void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

◆ ReportXhtFixResult()

void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES *  word,
WERD_RES *  new_word 
)

Definition at line 1436 of file control.cpp.

1437  {
1438  tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(),
1439  word->best_choice->debug_string().c_str());
1440  word->reject_map.print(debug_fp);
1441  tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(),
1442  new_word->best_choice->debug_string().c_str());
1443  new_word->reject_map.print(debug_fp);
1444  tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT",
1445  new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1446  accept_new_word ? "ACCEPTED" : "");
1447 }

◆ ReSegmentByClassification()

void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES page_res)

◆ ResegmentCharBox()

bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES page_res,
const TBOX prev_box,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.

Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.

This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.

Definition at line 310 of file applybox.cpp.

◆ ResegmentWordBox()

bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

◆ ResetAdaptiveClassifier()

void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 507 of file tesseractclass.cpp.

507  {
508  ResetAdaptiveClassifierInternal();
509  for (auto &sub_lang : sub_langs_) {
510  sub_lang->ResetAdaptiveClassifierInternal();
511  }
512 }
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:596

◆ ResetDocumentDictionary()

void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 517 of file tesseractclass.cpp.

517  {
518  getDict().ResetDocumentDictionary();
519  for (auto &sub_lang : sub_langs_) {
520  sub_lang->getDict().ResetDocumentDictionary();
521  }
522 }
void ResetDocumentDictionary()
Definition: dict.h:297

◆ reskew()

const FCOORD& tesseract::Tesseract::reskew ( ) const
inline

Definition at line 200 of file tesseractclass.h.

200  {
201  return reskew_;
202  }

◆ RetryWithLanguage()

int tesseract::Tesseract::RetryWithLanguage ( const WordData word_data,
WordRecognizer  recognizer,
bool  debug,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  best_words 
)

Definition at line 873 of file control.cpp.

874  {
875  if (debug) {
876  tprintf("Trying word using lang %s, oem %d\n", lang.c_str(),
877  static_cast<int>(tessedit_ocr_engine_mode));
878  }
879  // Run the recognizer on the word.
880  PointerVector<WERD_RES> new_words;
881  (this->*recognizer)(word_data, in_word, &new_words);
882  if (new_words.empty()) {
883  // Transfer input word to new_words, as the classifier must have put
884  // the result back in the input.
885  new_words.push_back(*in_word);
886  *in_word = nullptr;
887  }
888  if (debug) {
889  for (unsigned i = 0; i < new_words.size(); ++i) {
890  new_words[i]->DebugTopChoice("Lang result");
891  }
892  }
893  // Initial version is a bit of a hack based on better certainty and rating
894  // or a dictionary vs non-dictionary word.
895  return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,
896  &new_words, best_words);
897 }

◆ right_to_left()

bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 280 of file tesseractclass.h.

280  {
281  return right_to_left_;
282  }

◆ RunOldFixXht()

bool tesseract::Tesseract::RunOldFixXht ( WERD_RES word,
BLOCK block,
ROW row 
)

◆ safe_dict_word()

int16_t tesseract::Tesseract::safe_dict_word ( const WERD_RES *  werd_res)

Definition at line 593 of file reject.cpp.

593  {
594  const WERD_CHOICE &word = *werd_res->best_choice;
595  int dict_word_type = werd_res->tesseract->dict_word(word);
596  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
597 }

◆ scaled_color()

Image tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 263 of file tesseractclass.h.

263  {
264  return scaled_color_;
265  }

◆ scaled_factor()

int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 266 of file tesseractclass.h.

266  {
267  return scaled_factor_;
268  }

◆ script_pos_pass()

void tesseract::Tesseract::script_pos_pass ( PAGE_RES page_res)

Definition at line 707 of file control.cpp.

707  {
708  PAGE_RES_IT page_res_it(page_res);
709  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
710  WERD_RES *word = page_res_it.word();
711  if (word->word->flag(W_REP_CHAR)) {
712  page_res_it.forward();
713  continue;
714  }
715  const float x_height = page_res_it.block()->block->x_height();
716  float word_x_height = word->x_height;
717  if (word_x_height < word->best_choice->min_x_height() ||
718  word_x_height > word->best_choice->max_x_height()) {
719  word_x_height =
720  (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;
721  }
722  // Test for small caps. Word capheight must be close to block xheight,
723  // and word must contain no lower case letters, and at least one upper case.
724  const double small_cap_xheight = x_height * kXHeightCapRatio;
725  const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
726  if (word->uch_set->script_has_xheight() &&
727  small_cap_xheight - small_cap_delta <= word_x_height &&
728  word_x_height <= small_cap_xheight + small_cap_delta) {
729  // Scan for upper/lower.
730  int num_upper = 0;
731  int num_lower = 0;
732  for (unsigned i = 0; i < word->best_choice->length(); ++i) {
733  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
734  ++num_upper;
735  } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
736  ++num_lower;
737  }
738  }
739  if (num_upper > 0 && num_lower == 0) {
740  word->small_caps = true;
741  }
742  }
743  word->SetScriptPositions();
744  }
745 }
static const double kXHeightCapRatio
Definition: ccstruct.h:37

◆ SearchForText()

void tesseract::Tesseract::SearchForText ( const std::vector< BLOB_CHOICE_LIST * > *  choices,
int  choices_pos,
unsigned  choices_length,
const std::vector< UNICHAR_ID > &  target_text,
unsigned  text_index,
float  rating,
std::vector< int > *  segmentation,
float *  best_rating,
std::vector< int > *  best_segmentation 
)

◆ SearchWords()

void tesseract::Tesseract::SearchWords ( PointerVector< WERD_RES > *  words)

Definition at line 263 of file linerec.cpp.

263  {
264  // Run the segmentation search on the network outputs and make a BoxWord
265  // for each of the output words.
266  // If we drop a word as junk, then there is always a space in front of the
267  // next.
268  const Dict *stopper_dict = lstm_recognizer_->GetDict();
269  if (stopper_dict == nullptr) {
270  stopper_dict = &getDict();
271  }
272  for (unsigned w = 0; w < words->size(); ++w) {
273  WERD_RES *word = (*words)[w];
274  if (word->best_choice == nullptr) {
275  // It is a dud.
276  word->SetupFake(lstm_recognizer_->GetUnicharset());
277  } else {
278  // Set the best state.
279  for (unsigned i = 0; i < word->best_choice->length(); ++i) {
280  int length = word->best_choice->state(i);
281  word->best_state.push_back(length);
282  }
283  word->reject_map.initialise(word->best_choice->length());
284  word->tess_failed = false;
285  word->tess_accepted = true;
286  word->tess_would_adapt = false;
287  word->done = true;
288  word->tesseract = this;
289  float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
290  word_certainty *= kCertaintyScale;
291  if (getDict().stopper_debug_level >= 1) {
292  tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
293  word->best_choice->certainty(), word->space_certainty,
294  std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,
295  word_certainty);
296  word->best_choice->print();
297  }
298  word->best_choice->set_certainty(word_certainty);
299 
300  word->tess_accepted = stopper_dict->AcceptableResult(word);
301  }
302  }
303 }

◆ SegmentPage()

int tesseract::Tesseract::SegmentPage ( const char *  input_file,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.

Definition at line 101 of file pagesegmain.cpp.

102  {
103  ASSERT_HOST(pix_binary_ != nullptr);
104  int width = pixGetWidth(pix_binary_);
105  int height = pixGetHeight(pix_binary_);
106  // Get page segmentation mode.
107  auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
108  // If a UNLV zone file can be found, use that instead of segmentation.
109  if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file[0] != '\0') {
110  std::string name = input_file;
111  const char *lastdot = strrchr(name.c_str(), '.');
112  if (lastdot != nullptr) {
113  name[lastdot - name.c_str()] = '\0';
114  }
115  read_unlv_file(name, width, height, blocks);
116  }
117  if (blocks->empty()) {
118  // No UNLV file present. Work according to the PageSegMode.
119  // First make a single block covering the whole image.
120  BLOCK_IT block_it(blocks);
121  auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
122  block->set_right_to_left(right_to_left());
123  block_it.add_to_end(block);
124  } else {
125  // UNLV file present. Use PSM_SINGLE_BLOCK.
126  pageseg_mode = PSM_SINGLE_BLOCK;
127  }
128  // The diacritic_blobs holds noise blobs that may be diacritics. They
129  // are separated out on areas of the image that seem noisy and short-circuit
130  // the layout process, going straight from the initial partition creation
131  // right through to after word segmentation, where they are added to the
132  // rej_cblobs list of the most appropriate word. From there classification
133  // will determine whether they are used.
134  BLOBNBOX_LIST diacritic_blobs;
135  int auto_page_seg_ret_val = 0;
136  TO_BLOCK_LIST to_blocks;
137  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
138  PSM_SPARSE(pageseg_mode)) {
139  auto_page_seg_ret_val =
140  AutoPageSeg(pageseg_mode, blocks, &to_blocks,
141  enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
142  if (pageseg_mode == PSM_OSD_ONLY) {
143  return auto_page_seg_ret_val;
144  }
145  // To create blobs from the image region bounds uncomment this line:
146  // to_blocks.clear(); // Uncomment to go back to the old mode.
147  } else {
148  deskew_ = FCOORD(1.0f, 0.0f);
149  reskew_ = FCOORD(1.0f, 0.0f);
150  if (pageseg_mode == PSM_CIRCLE_WORD) {
151  Image pixcleaned = RemoveEnclosingCircle(pix_binary_);
152  if (pixcleaned != nullptr) {
153  pix_binary_.destroy();
154  pix_binary_ = pixcleaned;
155  }
156  }
157  }
158 
159  if (auto_page_seg_ret_val < 0) {
160  return -1;
161  }
162 
163  if (blocks->empty()) {
164  if (textord_debug_tabfind) {
165  tprintf("Empty page\n");
166  }
167  return 0; // AutoPageSeg found an empty page.
168  }
169  bool splitting = pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
170  bool cjk_mode = textord_use_cjk_fp_model;
171 
172  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_,
173  pix_grey_, splitting || cjk_mode, &diacritic_blobs, blocks, &to_blocks);
174  return auto_page_seg_ret_val;
175 }
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:188
@ PSM_CIRCLE_WORD
Treat the image as a single word in a circle.
Definition: publictypes.h:171
@ PSM_OSD_ONLY
Orientation and script detection only.
Definition: publictypes.h:160
bool read_unlv_file(std::string &name, int32_t xsize, int32_t ysize, BLOCK_LIST *blocks)
Definition: blread.cpp:36
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:194
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:197
int textord_debug_tabfind
Definition: alignedblob.cpp:29
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:200
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:177

◆ SelectGoodDiacriticOutlines()

bool tesseract::Tesseract::SelectGoodDiacriticOutlines ( int  pass,
float  certainty_threshold,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
const std::vector< C_OUTLINE * > &  outlines,
int  num_outlines,
std::vector< bool > *  ok_outlines 
)

Definition at line 1120 of file control.cpp.

1123  {
1124  std::string best_str;
1125  float target_cert = certainty_threshold;
1126  if (blob != nullptr) {
1127  float target_c2;
1128  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);
1129  if (debug_noise_removal) {
1130  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
1131  target_c2);
1132  blob->bounding_box().print();
1133  }
1134  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1135  }
1136  std::vector<bool> test_outlines = *ok_outlines;
1137  // Start with all the outlines in.
1138  std::string all_str;
1139  std::vector<bool> best_outlines = *ok_outlines;
1140  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);
1141  if (debug_noise_removal) {
1142  TBOX ol_box;
1143  for (unsigned i = 0; i < test_outlines.size(); ++i) {
1144  if (test_outlines[i]) {
1145  ol_box += outlines[i]->bounding_box();
1146  }
1147  }
1148  tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
1149  best_cert - target_cert);
1150  ol_box.print();
1151  }
1152  // Iteratively zero out the bit that improves the certainty the most, until
1153  // we get past the threshold, have zero bits, or fail to improve.
1154  int best_index = 0; // To zero out.
1155  while (num_outlines > 1 && best_index >= 0 &&
1156  (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1157  // Find the best bit to zero out.
1158  best_index = -1;
1159  for (unsigned i = 0; i < outlines.size(); ++i) {
1160  if (test_outlines[i]) {
1161  test_outlines[i] = false;
1162  std::string str;
1163  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);
1164  if (debug_noise_removal) {
1165  TBOX ol_box;
1166  for (unsigned j = 0; j < outlines.size(); ++j) {
1167  if (test_outlines[j]) {
1168  ol_box += outlines[j]->bounding_box();
1169  }
1170  tprintf("%c", test_outlines[j] ? 'T' : 'F');
1171  }
1172  tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
1173  cert - target_cert);
1174  ol_box.print();
1175  }
1176  if (cert > best_cert) {
1177  best_cert = cert;
1178  best_index = i;
1179  best_outlines = test_outlines;
1180  }
1181  test_outlines[i] = true;
1182  }
1183  }
1184  if (best_index >= 0) {
1185  test_outlines[best_index] = false;
1186  --num_outlines;
1187  }
1188  }
1189  if (best_cert >= target_cert) {
1190  // Save the best combination.
1191  *ok_outlines = best_outlines;
1192  if (debug_noise_removal) {
1193  tprintf("%s noise combination ", blob ? "Adding" : "New");
1194  for (auto best_outline : best_outlines) {
1195  tprintf("%c", best_outline ? 'T' : 'F');
1196  }
1197  tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert);
1198  }
1199  return true;
1200  }
1201 
1202  return false;
1203 }
float ClassifyBlobPlusOutlines(const std::vector< bool > &ok_outlines, const std::vector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str)
Definition: control.cpp:1207

◆ set_done()

void tesseract::Tesseract::set_done ( WERD_RES * word,
int16_t  pass 
)

Definition at line 62 of file reject.cpp.

62  {
63  word->done =
64  word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
65  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
66  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
67  word->best_choice->permuter() == FREQ_DAWG_PERM ||
68  word->best_choice->permuter() == USER_DAWG_PERM;
69  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
70  one_ell_conflict(word, false)) {
71  if (tessedit_rejection_debug) {
72  tprintf("one_ell_conflict detected\n");
73  }
74  word->done = false;
75  }
76  if (word->done &&
77  ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
78  if (tessedit_rejection_debug) {
79  tprintf("non-dict or ambig word detected\n");
80  }
81  word->done = false;
82  }
83  if (tessedit_rejection_debug) {
84  tprintf("set_done(): done=%d\n", word->done);
85  word->best_choice->print("");
86  }
87 }

◆ set_pix_grey()

void tesseract::Tesseract::set_pix_grey ( Image  grey_pix)
inline

Definition at line 214 of file tesseractclass.h.

214  {
215  pix_grey_.destroy();
216  pix_grey_ = grey_pix;
217  }

◆ set_pix_original()

void tesseract::Tesseract::set_pix_original ( Image  original_pix)
inline

Definition at line 222 of file tesseractclass.h.

222  {
223  pix_original_.destroy();
224  pix_original_ = original_pix;
225  // Clone to sublangs as well.
226  for (auto &lang : sub_langs_) {
227  lang->set_pix_original(original_pix ? original_pix.clone() : nullptr);
228  }
229  }

◆ set_pix_thresholds()

void tesseract::Tesseract::set_pix_thresholds ( Image  thresholds)
inline

Definition at line 247 of file tesseractclass.h.

247  {
248  pix_thresholds_.destroy();
249  pix_thresholds_ = thresholds;
250  }

◆ set_source_resolution()

void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 254 of file tesseractclass.h.

254  {
255  source_resolution_ = ppi;
256  }

◆ set_unlv_suspects()

void tesseract::Tesseract::set_unlv_suspects ( WERD_RES * word)

Definition at line 270 of file output.cpp.

270  {
271  int len = word_res->reject_map.length();
272  const WERD_CHOICE &word = *(word_res->best_choice);
273  const UNICHARSET &uchset = *word.unicharset();
274  int i;
275  float rating_per_ch;
276 
277  if (suspect_level == 0) {
278  for (i = 0; i < len; i++) {
279  if (word_res->reject_map[i].rejected()) {
280  word_res->reject_map[i].setrej_minimal_rej_accept();
281  }
282  }
283  return;
284  }
285 
286  if (suspect_level >= 3) {
287  return; // Use defaults
288  }
289 
290  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
291 
292  if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
293  /* Unreject alphas in dictionary words */
294  for (i = 0; i < len; ++i) {
295  if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
296  word_res->reject_map[i].setrej_minimal_rej_accept();
297  }
298  }
299  }
300 
301  rating_per_ch = word.rating() / word_res->reject_map.length();
302 
303  if (rating_per_ch >= suspect_rating_per_ch) {
304  return; // Don't touch bad ratings
305  }
306 
307  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
308  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
309  for (i = 0; i < len; ++i) {
310  if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
311  word_res->reject_map[i].setrej_minimal_rej_accept();
312  }
313  }
314  }
315 
316  for (i = 0; i < len; i++) {
317  if (word_res->reject_map[i].rejected()) {
318  if (word_res->reject_map[i].flag(R_DOC_REJ)) {
319  word_res->reject_map[i].setrej_minimal_rej_accept();
320  }
321  if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
322  word_res->reject_map[i].setrej_minimal_rej_accept();
323  }
324  if (word_res->reject_map[i].flag(R_ROW_REJ)) {
325  word_res->reject_map[i].setrej_minimal_rej_accept();
326  }
327  }
328  }
329 
330  if (suspect_level == 2) {
331  return;
332  }
333 
334  if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
335  for (i = 0; i < len; i++) {
336  if (word_res->reject_map[i].rejected()) {
337  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
338  word_res->reject_map[i].flag(R_POSTNN_1IL))) {
339  word_res->reject_map[i].setrej_minimal_rej_accept();
340  }
341 
342  if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
343  word_res->reject_map[i].setrej_minimal_rej_accept();
344  }
345  }
346  }
347  }
348 
349  if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
350  word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
351  acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
352  if (word_res->reject_map.length() > suspect_short_words) {
353  for (i = 0; i < len; i++) {
354  if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
355  word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
356  word_res->reject_map[i].flag(R_POSTNN_1IL) ||
357  word_res->reject_map[i].flag(R_MM_REJECT))) {
358  word_res->reject_map[i].setrej_minimal_rej_accept();
359  }
360  }
361  }
362  }
363 }
@ R_ROW_REJ
Definition: rejctmap.h:81
@ R_DOC_REJ
Definition: rejctmap.h:79
@ R_BLOCK_REJ
Definition: rejctmap.h:80
@ R_POSTNN_1IL
Definition: rejctmap.h:57
@ R_MM_REJECT
Definition: rejctmap.h:59
@ R_1IL_CONFLICT
Definition: rejctmap.h:56
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:386
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:365

◆ set_word_fonts()

void tesseract::Tesseract::set_word_fonts ( WERD_RES * word)

set_word_fonts

Get the fonts for the word.

Definition at line 1927 of file control.cpp.

1927  {
1928  // Don't try to set the word fonts for an lstm word, as the configs
1929  // will be meaningless.
1930  if (word->chopped_word == nullptr) {
1931  return;
1932  }
1933  ASSERT_HOST(word->best_choice != nullptr);
1934 
1935 #ifndef DISABLED_LEGACY_ENGINE
1936  const int fontinfo_size = fontinfo_table_.size();
1937  if (fontinfo_size == 0) {
1938  return;
1939  }
1940  if (tessedit_font_id > 0) {
1941  if (tessedit_font_id >= fontinfo_size) {
1942  tprintf("Error, invalid font ID provided: must be below %d.\n"
1943  "Falling back to font auto-detection.\n", fontinfo_size);
1944  } else {
1945  word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
1946  word->fontinfo2 = nullptr;
1947  word->fontinfo_id_count = INT8_MAX;
1948  word->fontinfo_id2_count = 0;
1949  return;
1950  }
1951  }
1952  std::vector<int> font_total_score(fontinfo_size);
1953 
1954  // Compute the font scores for the word
1955  if (tessedit_debug_fonts) {
1956  tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
1957  }
1958  for (unsigned b = 0; b < word->best_choice->length(); ++b) {
1959  const BLOB_CHOICE *choice = word->GetBlobChoice(b);
1960  if (choice == nullptr) {
1961  continue;
1962  }
1963  auto &fonts = choice->fonts();
1964  for (auto &f : fonts) {
1965  const int fontinfo_id = f.fontinfo_id;
1966  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1967  font_total_score[fontinfo_id] += f.score;
1968  }
1969  }
1970  }
1971  // Find the top and 2nd choice for the word.
1972  int score1 = 0, score2 = 0;
1973  int16_t font_id1 = -1, font_id2 = -1;
1974  for (int f = 0; f < fontinfo_size; ++f) {
1975  if (tessedit_debug_fonts && font_total_score[f] > 0) {
1976  tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]);
1977  }
1978  if (font_total_score[f] > score1) {
1979  score2 = score1;
1980  font_id2 = font_id1;
1981  score1 = font_total_score[f];
1982  font_id1 = f;
1983  } else if (font_total_score[f] > score2) {
1984  score2 = font_total_score[f];
1985  font_id2 = f;
1986  }
1987  }
1988  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;
1989  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;
1990  // Each score has a limit of UINT16_MAX, so divide by that to get the number
1991  // of "votes" for that font, ie number of perfect scores.
1992  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
1993  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
1994  if (score1 > 0) {
1995  const FontInfo fi = fontinfo_table_.at(font_id1);
1996  if (tessedit_debug_fonts) {
1997  if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
1998  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name,
1999  word->fontinfo_id_count, fontinfo_table_.at(font_id2).name,
2000  word->fontinfo_id2_count);
2001  } else {
2002  tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count);
2003  }
2004  }
2005  }
2006 #endif // ndef DISABLED_LEGACY_ENGINE
2007 }
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:435

◆ SetBlackAndWhitelist()

void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 524 of file tesseractclass.cpp.

524  {
525  // Set the white and blacklists (if any)
526  unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
527  tessedit_char_whitelist.c_str(),
528  tessedit_char_unblacklist.c_str());
529  if (lstm_recognizer_) {
530  UNICHARSET &lstm_unicharset = lstm_recognizer_->GetUnicharset();
531  lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
532  tessedit_char_whitelist.c_str(),
533  tessedit_char_unblacklist.c_str());
534  }
535  // Black and white lists should apply to all loaded classifiers.
536  for (auto &sub_lang : sub_langs_) {
537  sub_lang->unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
538  tessedit_char_whitelist.c_str(),
539  tessedit_char_unblacklist.c_str());
540  if (sub_lang->lstm_recognizer_) {
541  UNICHARSET &lstm_unicharset = sub_lang->lstm_recognizer_->GetUnicharset();
542  lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
543  tessedit_char_whitelist.c_str(),
544  tessedit_char_unblacklist.c_str());
545  }
546  }
547 }
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)

◆ SetEquationDetect()

void tesseract::Tesseract::SetEquationDetect ( EquationDetect * detector)

Definition at line 501 of file tesseractclass.cpp.

501  {
502  equ_detect_ = detector;
503  equ_detect_->SetLangTesseract(this);
504 }
void SetLangTesseract(Tesseract *lang_tesseract)

◆ SetScaledColor()

void tesseract::Tesseract::SetScaledColor ( int  factor,
Image  color 
)
inline

Definition at line 269 of file tesseractclass.h.

269  {
270  scaled_factor_ = factor;
271  scaled_color_ = color;
272  }

◆ SetupAllWordsPassN()

void tesseract::Tesseract::SetupAllWordsPassN ( int  pass_n,
const TBOX * target_word_box,
const char *  word_config,
PAGE_RES * page_res,
std::vector< WordData > *  words 
)

If tesseract is to be run, sets the words up ready for it.

Definition at line 146 of file control.cpp.

147  {
148  // Prepare all the words.
149  PAGE_RES_IT page_res_it(page_res);
150  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
151  if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),
152  *target_word_box, word_config, 1)) {
153  words->push_back(WordData(page_res_it));
154  }
155  }
156  // Setup all the words for recognition with polygonal approximation.
157  for (unsigned w = 0; w < words->size(); ++w) {
158  SetupWordPassN(pass_n, &(*words)[w]);
159  if (w > 0) {
160  (*words)[w].prev_word = &(*words)[w - 1];
161  }
162  }
163 }

◆ SetupApplyBoxes()

PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const std::vector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.

Definition at line 197 of file applybox.cpp.

197  {
198  PreenXHeights(block_list);
199  // Strip all fuzzy space markers to simplify the PAGE_RES.
200  BLOCK_IT b_it(block_list);
201  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
202  BLOCK *block = b_it.data();
203  ROW_IT r_it(block->row_list());
204  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
205  ROW *row = r_it.data();
206  WERD_IT w_it(row->word_list());
207  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
208  WERD *word = w_it.data();
209  if (word->cblob_list()->empty()) {
210  delete w_it.extract();
211  } else {
212  word->set_flag(W_FUZZY_SP, false);
213  word->set_flag(W_FUZZY_NON, false);
214  }
215  }
216  }
217  }
218  auto *page_res = new PAGE_RES(false, block_list, nullptr);
219  PAGE_RES_IT pr_it(page_res);
220  WERD_RES *word_res;
221  while ((word_res = pr_it.word()) != nullptr) {
222  MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
223  pr_it.forward();
224  }
225  return page_res;
226 }
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:174
void MaximallyChopWord(const std::vector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:231

◆ SetupPageSegAndDetectOrientation()

ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
Tesseract * osd_tess,
OSResults * osr,
TO_BLOCK_LIST *  to_blocks,
Image * photo_mask_pix,
Image * music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 272 of file pagesegmain.cpp.

276  {
277  int vertical_x = 0;
278  int vertical_y = 1;
279  TabVector_LIST v_lines;
280  TabVector_LIST h_lines;
281  ICOORD bleft(0, 0);
282 
283  ASSERT_HOST(pix_binary_ != nullptr);
284  if (tessedit_dump_pageseg_images) {
285  pixa_debug_.AddPix(pix_binary_, "PageSegInput");
286  }
287  // Leptonica is used to find the rule/separator lines in the input.
288  LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
289  &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
290  if (tessedit_dump_pageseg_images) {
291  pixa_debug_.AddPix(pix_binary_, "NoLines");
292  }
293  // Leptonica is used to find a mask of the photo regions in the input.
294  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
295  if (tessedit_dump_pageseg_images) {
296  Image pix_no_image_ = nullptr;
297  if (*photo_mask_pix != nullptr) {
298  pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
299  } else {
300  pix_no_image_ = pix_binary_.clone();
301  }
302  pixa_debug_.AddPix(pix_no_image_, "NoImages");
303  pix_no_image_.destroy();
304  }
305  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
306  v_lines.clear();
307  }
308 
309  // The rest of the algorithm uses the usual connected components.
310  textord_.find_components(pix_binary_, blocks, to_blocks);
311 
312  TO_BLOCK_IT to_block_it(to_blocks);
313  // There must be exactly one input block.
314  // TODO(rays) handle new textline finding with a UNLV zone file.
315  ASSERT_HOST(to_blocks->singleton());
316  TO_BLOCK *to_block = to_block_it.data();
317  TBOX blkbox = to_block->block->pdblk.bounding_box();
318  ColumnFinder *finder = nullptr;
319  int estimated_resolution = source_resolution_;
320  if (source_resolution_ == kMinCredibleResolution) {
321  // Try to estimate resolution from typical body text size.
322  int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
323  if (res > estimated_resolution && res < kMaxCredibleResolution) {
324  estimated_resolution = res;
325  tprintf("Estimating resolution as %d\n", estimated_resolution);
326  }
327  }
328 
329  if (to_block->line_size >= 2) {
330  finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(),
331  blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model,
332  textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x,
333  vertical_y);
334 
335  finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
336 
337 #ifndef DISABLED_LEGACY_ENGINE
338 
339  if (equ_detect_) {
340  equ_detect_->LabelSpecialText(to_block);
341  }
342 
343  BLOBNBOX_CLIST osd_blobs;
344  // osd_orientation is the number of 90 degree rotations to make the
345  // characters upright. (See tesseract/osdetect.h for precise definition.)
346  // We want the text lines horizontal, (vertical text indicates vertical
347  // textlines) which may conflict (eg vertically written CJK).
348  int osd_orientation = 0;
349  bool vertical_text =
350  textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
351  if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) {
352  vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block,
353  &osd_blobs);
354  }
355  if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
356  std::vector<int> osd_scripts;
357  if (osd_tess != this) {
358  // We are running osd as part of layout analysis, so constrain the
359  // scripts to those allowed by *this.
360  AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
361  for (auto &lang : sub_langs_) {
362  AddAllScriptsConverted(lang->unicharset, osd_tess->unicharset, &osd_scripts);
363  }
364  }
365  os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
366  if (pageseg_mode == PSM_OSD_ONLY) {
367  delete finder;
368  return nullptr;
369  }
370  osd_orientation = osr->best_result.orientation_id;
371  double osd_score = osr->orientations[osd_orientation];
372  double osd_margin = min_orientation_margin * 2;
373  for (int i = 0; i < 4; ++i) {
374  if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) {
375  osd_margin = osd_score - osr->orientations[i];
376  }
377  }
378  int best_script_id = osr->best_result.script_id;
379  const char *best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id);
380  bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
381  best_script_id == osd_tess->unicharset.hiragana_sid() ||
382  best_script_id == osd_tess->unicharset.katakana_sid() ||
383  strcmp("Japanese", best_script_str) == 0 ||
384  strcmp("Korean", best_script_str) == 0 || strcmp("Hangul", best_script_str) == 0;
385  if (cjk) {
386  finder->set_cjk_script(true);
387  }
388  if (osd_margin < min_orientation_margin) {
389  // The margin is weak.
390  if (!cjk && !vertical_text && osd_orientation == 2) {
391  // upside down latin text is improbable with such a weak margin.
392  tprintf(
393  "OSD: Weak margin (%.2f), horiz textlines, not CJK: "
394  "Don't rotate.\n",
395  osd_margin);
396  osd_orientation = 0;
397  } else {
398  tprintf(
399  "OSD: Weak margin (%.2f) for %d blob text block, "
400  "but using orientation anyway: %d\n",
401  osd_margin, osd_blobs.length(), osd_orientation);
402  }
403  }
404  }
405  osd_blobs.shallow_clear();
406  finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
407 
408 #endif // ndef DISABLED_LEGACY_ENGINE
409  }
410 
411  return finder;
412 }
constexpr int kResolutionEstimationFactor
Definition: publictypes.h:45
@ PSM_SINGLE_BLOCK_VERT_TEXT
Definition: publictypes.h:166
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
int os_detect_blobs(const std::vector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:274
constexpr int kMaxCredibleResolution
Definition: publictypes.h:40
constexpr int kMinCredibleResolution
Definition: publictypes.h:38
int LabelSpecialText(TO_BLOCK *to_block) override
void AddPix(const Image pix, const char *caption)
Definition: debugpixa.h:32
static Image FindImages(Image pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:63
static void FindAndRemoveLines(int resolution, bool debug, Image pix, int *vertical_x, int *vertical_y, Image *pix_music_mask, TabVector_LIST *v_lines, TabVector_LIST *h_lines)
Definition: linefind.cpp:240
void find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:211

◆ SetupUniversalFontIds()

void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 438 of file tessedit.cpp.

438  {
439  // Note that we can get away with bitwise copying FontInfo in
440  // all_fonts, as it is a temporary structure and we avoid setting the
441  // delete callback.
442  UnicityTable<FontInfo> all_fonts;
443 
444  // Create the universal ID table.
445  CollectFonts(get_fontinfo_table(), &all_fonts);
446  for (auto &sub_lang : sub_langs_) {
447  CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
448  }
449  // Assign ids from the table to each font table.
450  AssignIds(all_fonts, &get_fontinfo_table());
451  for (auto &sub_lang : sub_langs_) {
452  AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
453  }
454  font_table_size_ = all_fonts.size();
455 }

◆ SetupWordPassN()

void tesseract::Tesseract::SetupWordPassN ( int  pass_n,
WordData * word 
)

Definition at line 166 of file control.cpp.

166  {
167  if (pass_n == 1 || !word->word->done) {
168  if (pass_n == 1) {
169  word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,
170  nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,
171  poly_allow_detailed_fx, word->row, word->block);
172  } else if (pass_n == 2) {
173  // TODO(rays) Should we do this on pass1 too?
174  word->word->caps_height = 0.0;
175  if (word->word->x_height == 0.0f) {
176  word->word->x_height = word->row->x_height();
177  }
178  }
179  word->lang_words.truncate(0);
180  for (unsigned s = 0; s <= sub_langs_.size(); ++s) {
181  // The sub_langs_.size() entry is for the master language.
182  Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
183  auto *word_res = new WERD_RES;
184  word_res->InitForRetryRecognition(*word->word);
185  word->lang_words.push_back(word_res);
186  // LSTM doesn't get setup for pass2.
187  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
188  word_res->SetupForRecognition(
189  lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,
190  lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,
191  lang_t->poly_allow_detailed_fx, word->row, word->block);
192  }
193  }
194  }
195 }

◆ SetupWordScripts()

void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)

◆ source_resolution()

int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 251 of file tesseractclass.h.

251  {
252  return source_resolution_;
253  }

◆ split_and_recog_word()

void tesseract::Tesseract::split_and_recog_word ( WERD_RES * word)

Definition at line 126 of file tfacepp.cpp.

126  {
127  // Find the biggest blob gap in the chopped_word.
128  int bestgap = -INT32_MAX;
129  int split_index = 0;
130  for (unsigned b = 1; b < word->chopped_word->NumBlobs(); ++b) {
131  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
132  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
133  int gap = blob_box.left() - prev_box.right();
134  if (gap > bestgap) {
135  bestgap = gap;
136  split_index = b;
137  }
138  }
139  ASSERT_HOST(split_index > 0);
140 
141  WERD_RES *word2 = nullptr;
142  BlamerBundle *orig_bb = nullptr;
143  split_word(word, split_index, &word2, &orig_bb);
144 
145  // Recognize the first part of the word.
146  recog_word_recursive(word);
147  // Recognize the second part of the word.
148  recog_word_recursive(word2);
149 
150  join_words(word, word2, orig_bb);
151 }
void split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:163
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:216

◆ split_word()

void tesseract::Tesseract::split_word ( WERD_RES * word,
unsigned  split_pt,
WERD_RES **  right_piece,
BlamerBundle **  orig_blamer_bundle 
) const

Definition at line 163 of file tfacepp.cpp.

164  {
165  ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
166 
167  // Save a copy of the blamer bundle so we can try to reconstruct it below.
168  BlamerBundle *orig_bb = word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
169 
170  auto *word2 = new WERD_RES(*word);
171 
172  // blow away the copied chopped_word, as we want to work with
173  // the blobs from the input chopped_word so seam_arrays can be merged.
174  TWERD *chopped = word->chopped_word;
175  auto *chopped2 = new TWERD;
176  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
177  for (auto i = split_pt; i < chopped->NumBlobs(); ++i) {
178  chopped2->blobs.push_back(chopped->blobs[i]);
179  }
180  chopped->blobs.resize(split_pt);
181  word->chopped_word = nullptr;
182  delete word2->chopped_word;
183  word2->chopped_word = nullptr;
184 
185  const UNICHARSET &unicharset = *word->uch_set;
186  word->ClearResults();
187  word2->ClearResults();
188  word->chopped_word = chopped;
189  word2->chopped_word = chopped2;
190  word->SetupBasicsFromChoppedWord(unicharset);
191  word2->SetupBasicsFromChoppedWord(unicharset);
192 
193  // Try to adjust the blamer bundle.
194  if (orig_bb != nullptr) {
195  // TODO(rays) Looks like a leak to me.
196  // orig_bb should take, rather than copy.
197  word->blamer_bundle = new BlamerBundle();
198  word2->blamer_bundle = new BlamerBundle();
199  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
200  word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer,
201  word->blamer_bundle, word2->blamer_bundle);
202  }
203 
204  *right_piece = word2;
205  *orig_blamer_bundle = orig_bb;
206 }

◆ STRING_VAR_H() [1/17]

tesseract::Tesseract::STRING_VAR_H ( applybox_exposure_pattern  )

◆ STRING_VAR_H() [2/17]

tesseract::Tesseract::STRING_VAR_H ( chs_leading_punct  )

◆ STRING_VAR_H() [3/17]

tesseract::Tesseract::STRING_VAR_H ( chs_trailing_punct1  )

◆ STRING_VAR_H() [4/17]

tesseract::Tesseract::STRING_VAR_H ( chs_trailing_punct2  )

◆ STRING_VAR_H() [5/17]

tesseract::Tesseract::STRING_VAR_H ( conflict_set_I_l_1  )

◆ STRING_VAR_H() [6/17]

tesseract::Tesseract::STRING_VAR_H ( file_type  )

◆ STRING_VAR_H() [7/17]

tesseract::Tesseract::STRING_VAR_H ( numeric_punctuation  )

◆ STRING_VAR_H() [8/17]

tesseract::Tesseract::STRING_VAR_H ( ok_repeated_ch_non_alphanum_wds  )

◆ STRING_VAR_H() [9/17]

tesseract::Tesseract::STRING_VAR_H ( outlines_2  )

◆ STRING_VAR_H() [10/17]

tesseract::Tesseract::STRING_VAR_H ( outlines_odd  )

◆ STRING_VAR_H() [11/17]

tesseract::Tesseract::STRING_VAR_H ( page_separator  )

◆ STRING_VAR_H() [12/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_char_blacklist  )

◆ STRING_VAR_H() [13/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_char_unblacklist  )

◆ STRING_VAR_H() [14/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_char_whitelist  )

◆ STRING_VAR_H() [15/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_load_sublangs  )

◆ STRING_VAR_H() [16/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_write_params_to_file  )

◆ STRING_VAR_H() [17/17]

tesseract::Tesseract::STRING_VAR_H ( unrecognised_char  )

◆ SubAndSuperscriptFix()

bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES * word)

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns
Whether we modified the given word.

Definition at line 108 of file superscript.cpp.

108  {
109  if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) {
110  return false;
111  }
112  int num_leading, num_trailing;
113  ScriptPos sp_leading, sp_trailing;
114  float leading_certainty, trailing_certainty;
115  float avg_certainty, unlikely_threshold;
116 
117  // Calculate the number of whole suspicious characters at the edges.
118  GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing,
119  &sp_trailing, &trailing_certainty, &avg_certainty,
120  &unlikely_threshold);
121 
122  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
123  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
124 
125  int num_blobs = word->best_choice->length();
126 
127  // Calculate the remainder (partial characters) at the edges.
128  // This accounts for us having classified the best version of
129  // a word as [speaker?'] when it was instead [speaker.^{21}]
130  // (that is we accidentally thought the 2 was attached to the period).
131  int num_remainder_leading = 0, num_remainder_trailing = 0;
132  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
133  int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
134  int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
135  int last_word_char = num_blobs - 1 - num_trailing;
136  float last_char_certainty = word->best_choice->certainty(last_word_char);
137  if (word->best_choice->unichar_id(last_word_char) != 0 &&
138  last_char_certainty <= unlikely_threshold) {
139  ScriptPos rpos;
140  YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos,
141  &num_remainder_trailing);
142  if (num_trailing > 0 && rpos != sp_trailing) {
143  num_remainder_trailing = 0;
144  }
145  if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {
146  trailing_certainty = last_char_certainty;
147  }
148  }
149  bool another_blob_available =
150  (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;
151  int first_char_certainty = word->best_choice->certainty(num_leading);
152  if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 &&
153  first_char_certainty <= unlikely_threshold) {
154  ScriptPos lpos;
155  YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,
156  nullptr, nullptr);
157  if (num_leading > 0 && lpos != sp_leading) {
158  num_remainder_leading = 0;
159  }
160  if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {
161  leading_certainty = first_char_certainty;
162  }
163  }
164  }
165 
166  // If nothing to do, bail now.
167  if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {
168  return false;
169  }
170 
171  if (superscript_debug >= 1) {
172  tprintf("Candidate for superscript detection: %s (",
173  word->best_choice->unichar_string().c_str());
174  if (num_leading || num_remainder_leading) {
175  tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos);
176  }
177  if (num_trailing || num_remainder_trailing) {
178  tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos);
179  }
180  tprintf(")\n");
181  }
182  if (superscript_debug >= 3) {
183  word->best_choice->print();
184  }
185  if (superscript_debug >= 2) {
186  tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ", avg_certainty,
187  unlikely_threshold);
188  if (num_leading) {
189  tprintf("Orig. leading (min): %.2f ", leading_certainty);
190  }
191  if (num_trailing) {
192  tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
193  }
194  tprintf("\n");
195  }
196 
197  // We've now calculated the number of rebuilt blobs we want to carve off.
198  // However, split_word() works from TBLOBs in chopped_word, so we need to
199  // convert to those.
200  int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
201  int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
202 
203  int retry_leading = 0;
204  int retry_trailing = 0;
205  bool is_good = false;
206  WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading,
207  num_chopped_trailing, trailing_certainty, sp_trailing,
208  word, &is_good, &retry_leading, &retry_trailing);
209  if (is_good) {
210  word->ConsumeWordResults(revised);
211  } else if (retry_leading || retry_trailing) {
212  int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);
213  int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);
214  WERD_RES *revised2 = TrySuperscriptSplits(
215  retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,
216  trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);
217  if (is_good) {
218  word->ConsumeWordResults(revised2);
219  }
220  delete revised2;
221  }
222  delete revised;
223  return is_good;
224 }
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)

◆ terrible_word_crunch()

bool tesseract::Tesseract::terrible_word_crunch ( WERD_RES * word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 450 of file docqual.cpp.

450  {
451  float rating_per_ch;
452  int adjusted_len;
453  int crunch_mode = 0;
454 
455  if (word->best_choice->unichar_string().empty() ||
456  (strspn(word->best_choice->unichar_string().c_str(), " ") ==
457  word->best_choice->unichar_string().size())) {
458  crunch_mode = 1;
459  } else {
460  adjusted_len = word->reject_map.length();
461  if (adjusted_len > crunch_rating_max) {
462  adjusted_len = crunch_rating_max;
463  }
464  rating_per_ch = word->best_choice->rating() / adjusted_len;
465 
466  if (rating_per_ch > crunch_terrible_rating) {
467  crunch_mode = 2;
468  } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
469  crunch_mode = 3;
470  } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
471  (garbage_level != G_OK)) {
472  crunch_mode = 4;
473  } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {
474  crunch_mode = 5;
475  }
476  }
477  if (crunch_mode > 0) {
478  if (crunch_debug > 2) {
479  tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
480  word->best_choice->unichar_string().c_str());
481  }
482  return true;
483  } else {
484  return false;
485  }
486 }

◆ tess_acceptable_word()

bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES * word)

Definition at line 64 of file tessbox.cpp.

64  {
65  return getDict().AcceptableResult(word);
66 }
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:111

◆ tess_add_doc_word()

void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE * word_choice)

Definition at line 73 of file tessbox.cpp.

73  {
74  getDict().add_document_word(*word_choice);
75 }
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:647

◆ tess_segment_pass_n()

void tesseract::Tesseract::tess_segment_pass_n ( int  pass_n,
WERD_RES * word
)

Definition at line 32 of file tessbox.cpp.

32  {
33  int saved_enable_assoc = 0;
34  int saved_chop_enable = 0;
35 
36  if (word->word->flag(W_DONT_CHOP)) {
37  saved_enable_assoc = wordrec_enable_assoc;
38  saved_chop_enable = chop_enable;
39  wordrec_enable_assoc.set_value(false);
40  chop_enable.set_value(false);
41  }
42  if (pass_n == 1) {
43  set_pass1();
44  } else {
45  set_pass2();
46  }
47  recog_word(word);
48  if (word->best_choice == nullptr) {
49  word->SetupFake(*word->uch_set);
50  }
51  if (word->word->flag(W_DONT_CHOP)) {
52  wordrec_enable_assoc.set_value(saved_enable_assoc);
53  chop_enable.set_value(saved_chop_enable);
54  }
55 }
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:37
void set_pass1()
Definition: tface.cpp:97
void set_pass2()
Definition: tface.cpp:108

◆ TestNewNormalization()

bool tesseract::Tesseract::TestNewNormalization ( int  original_misfits,
float  baseline_shift,
float  new_x_ht,
WERD_RES * word,
BLOCK * block,
ROW * row
)

Definition at line 1488 of file control.cpp.

1489  {
1490  bool accept_new_x_ht = false;
1491  WERD_RES new_x_ht_word(word->word);
1492  if (word->blamer_bundle != nullptr) {
1493  new_x_ht_word.blamer_bundle = new BlamerBundle();
1494  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1495  }
1496  new_x_ht_word.x_height = new_x_ht;
1497  new_x_ht_word.baseline_shift = baseline_shift;
1498  new_x_ht_word.caps_height = 0.0;
1499  new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1500  classify_bln_numeric_mode, textord_use_cjk_fp_model,
1501  poly_allow_detailed_fx, row, block);
1502  match_word_pass_n(2, &new_x_ht_word, row, block);
1503  if (!new_x_ht_word.tess_failed) {
1504  int new_misfits = CountMisfitTops(&new_x_ht_word);
1505  if (debug_x_ht_level >= 1) {
1506  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,
1507  word->x_height, new_misfits, new_x_ht);
1508  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(),
1509  word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),
1510  new_x_ht_word.best_choice->certainty());
1511  }
1512  // The misfits must improve and either the rating or certainty.
1513  accept_new_x_ht = new_misfits < original_misfits &&
1514  (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||
1515  new_x_ht_word.best_choice->rating() < word->best_choice->rating());
1516  if (debug_x_ht_level >= 1) {
1517  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1518  }
1519  }
1520  if (accept_new_x_ht) {
1521  word->ConsumeWordResults(&new_x_ht_word);
1522  return true;
1523  }
1524  return false;
1525 }
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:72
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1436

◆ textord()

const Textord& tesseract::Tesseract::textord ( ) const
inline

Definition at line 273 of file tesseractclass.h.

273  {
274  return textord_;
275  }

◆ TidyUp()

void tesseract::Tesseract::TidyUp ( PAGE_RES page_res)

◆ tilde_crunch()

void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT & page_res_it)

Definition at line 373 of file docqual.cpp.

373  {
374  WERD_RES *word;
375  GARBAGE_LEVEL garbage_level;
376  PAGE_RES_IT copy_it;
377  bool prev_potential_marked = false;
378  bool found_terrible_word = false;
379  bool ok_dict_word;
380 
381  page_res_it.restart_page();
382  while (page_res_it.word() != nullptr) {
383  POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
384  if (pb != nullptr && !pb->IsText()) {
385  page_res_it.forward();
386  continue;
387  }
388  word = page_res_it.word();
389 
390  if (crunch_early_convert_bad_unlv_chs) {
391  convert_bad_unlv_chs(word);
392  }
393 
394  if (crunch_early_merge_tess_fails) {
395  word->merge_tess_fails();
396  }
397 
398  if (word->reject_map.accept_count() != 0) {
399  found_terrible_word = false;
400  // Forget earlier potential crunches
401  prev_potential_marked = false;
402  } else {
403  ok_dict_word = safe_dict_word(word);
404  garbage_level = garbage_word(word, ok_dict_word);
405 
406  if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {
407  if (crunch_debug > 0) {
408  tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
409  }
410  word->unlv_crunch_mode = CR_KEEP_SPACE;
411  if (prev_potential_marked) {
412  while (copy_it.word() != word) {
413  if (crunch_debug > 0) {
414  tprintf("P1 CRUNCHING: \"%s\"\n",
415  copy_it.word()->best_choice->unichar_string().c_str());
416  }
417  copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
418  copy_it.forward();
419  }
420  prev_potential_marked = false;
421  }
422  found_terrible_word = true;
423  } else if ((garbage_level != G_NEVER_CRUNCH) &&
424  (potential_word_crunch(word, garbage_level, ok_dict_word))) {
425  if (found_terrible_word) {
426  if (crunch_debug > 0) {
427  tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
428  }
429  word->unlv_crunch_mode = CR_KEEP_SPACE;
430  } else if (!prev_potential_marked) {
431  copy_it = page_res_it;
432  prev_potential_marked = true;
433  if (crunch_debug > 1) {
434  tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
435  }
436  }
437  } else {
438  found_terrible_word = false;
439  // Forget earlier potential crunches
440  prev_potential_marked = false;
441  if (crunch_debug > 2) {
442  tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
443  }
444  }
445  }
446  page_res_it.forward();
447  }
448 }
@ CR_KEEP_SPACE
Definition: pageres.h:160
GARBAGE_LEVEL
Definition: docqual.h:30
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:616
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:594
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:488
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:450

◆ tilde_delete()

void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT & page_res_it)

Definition at line 530 of file docqual.cpp.

530  {
531  WERD_RES *word;
532  PAGE_RES_IT copy_it;
533  bool deleting_from_bol = false;
534  bool marked_delete_point = false;
535  int16_t debug_delete_mode;
536  CRUNCH_MODE delete_mode;
537  int16_t x_debug_delete_mode;
538  CRUNCH_MODE x_delete_mode;
539 
540  page_res_it.restart_page();
541  while (page_res_it.word() != nullptr) {
542  word = page_res_it.word();
543 
544  delete_mode = word_deletable(word, debug_delete_mode);
545  if (delete_mode != CR_NONE) {
546  if (word->word->flag(W_BOL) || deleting_from_bol) {
547  if (crunch_debug > 0) {
548  tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
549  word->best_choice->unichar_string().c_str());
550  }
551  word->unlv_crunch_mode = delete_mode;
552  deleting_from_bol = true;
553  } else if (word->word->flag(W_EOL)) {
554  if (marked_delete_point) {
555  while (copy_it.word() != word) {
556  x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);
557  if (crunch_debug > 0) {
558  tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
559  copy_it.word()->best_choice->unichar_string().c_str());
560  }
561  copy_it.word()->unlv_crunch_mode = x_delete_mode;
562  copy_it.forward();
563  }
564  }
565  if (crunch_debug > 0) {
566  tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
567  word->best_choice->unichar_string().c_str());
568  }
569  word->unlv_crunch_mode = delete_mode;
570  deleting_from_bol = false;
571  marked_delete_point = false;
572  } else {
573  if (!marked_delete_point) {
574  copy_it = page_res_it;
575  marked_delete_point = true;
576  }
577  }
578  } else {
579  deleting_from_bol = false;
580  // Forget earlier potential crunches
581  marked_delete_point = false;
582  }
583  /*
584  The following step has been left till now as the tess fails are used to
585  determine if the word is deletable.
586 */
587  if (!crunch_early_merge_tess_fails) {
588  word->merge_tess_fails();
589  }
590  page_res_it.forward();
591  }
592 }
@ CR_NONE
Definition: pageres.h:160
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:825

◆ TrainedXheightFix()

bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES * word,
BLOCK * block,
ROW * row
)

Definition at line 1455 of file control.cpp.

1455  {
1456  int original_misfits = CountMisfitTops(word);
1457  if (original_misfits == 0) {
1458  return false;
1459  }
1460  float baseline_shift = 0.0f;
1461  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1462  if (baseline_shift != 0.0f) {
1463  // Try the shift on its own first.
1464  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) {
1465  return false;
1466  }
1467  original_misfits = CountMisfitTops(word);
1468  if (original_misfits > 0) {
1469  float new_baseline_shift;
1470  // Now recompute the new x_height.
1471  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1472  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1473  // No test of return value here, as we are definitely making a change
1474  // to the word by shifting the baseline.
1475  TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row);
1476  }
1477  }
1478  return true;
1479  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1480  return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row);
1481  } else {
1482  return false;
1483  }
1484 }
const double kMinRefitXHeightFraction
Definition: control.cpp:51
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:105
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1488

◆ TrainFromBoxes()

void tesseract::Tesseract::TrainFromBoxes ( const std::vector< TBOX > &  boxes,
const std::vector< std::string > &  texts,
BLOCK_LIST *  block_list,
DocumentData * training_data
)

Definition at line 76 of file linerec.cpp.

77  {
78  auto box_count = boxes.size();
79  // Process all the text lines in this page, as defined by the boxes.
80  unsigned end_box = 0;
81  // Don't let \t, which marks newlines in the box file, get into the line
82  // content, as that makes the line unusable in training.
83  while (end_box < texts.size() && texts[end_box] == "\t") {
84  ++end_box;
85  }
86  for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
87  // Find the textline of boxes starting at start and their bounding box.
88  TBOX line_box = boxes[start_box];
89  std::string line_str = texts[start_box];
90  for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
91  line_box += boxes[end_box];
92  line_str += texts[end_box];
93  }
94  // Find the most overlapping block.
95  BLOCK *best_block = nullptr;
96  int best_overlap = 0;
97  BLOCK_IT b_it(block_list);
98  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
99  BLOCK *block = b_it.data();
100  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
101  continue; // Not a text block.
102  }
103  TBOX block_box = block->pdblk.bounding_box();
104  block_box.rotate(block->re_rotation());
105  if (block_box.major_overlap(line_box)) {
106  TBOX overlap_box = line_box.intersection(block_box);
107  if (overlap_box.area() > best_overlap) {
108  best_overlap = overlap_box.area();
109  best_block = block;
110  }
111  }
112  }
113  ImageData *imagedata = nullptr;
114  if (best_block == nullptr) {
115  tprintf("No block overlapping textline: %s\n", line_str.c_str());
116  } else {
117  imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
118  }
119  if (imagedata != nullptr) {
120  training_data->AddPageToDocument(imagedata);
121  }
122  // Don't let \t, which marks newlines in the box file, get into the line
123  // content, as that makes the line unusable in training.
124  while (end_box < texts.size() && texts[end_box] == "\t") {
125  ++end_box;
126  }
127  }
128 }
ImageData * GetLineData(const TBOX &line_box, const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:133

◆ TrainLineRecognizer()

bool tesseract::Tesseract::TrainLineRecognizer ( const char *  input_imagename,
const std::string &  output_basename,
BLOCK_LIST *  block_list 
)

Definition at line 41 of file linerec.cpp.

42  {
43  std::string lstmf_name = output_basename + ".lstmf";
44  DocumentData images(lstmf_name);
45  if (applybox_page > 0) {
46  // Load existing document for the previous pages.
47  if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
48  tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
49  return false;
50  }
51  }
52  std::vector<TBOX> boxes;
53  std::vector<std::string> texts;
54  // Get the boxes for this page, if there are any.
55  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||
56  boxes.empty()) {
57  tprintf("Failed to read boxes from %s\n", input_imagename);
58  return false;
59  }
60  TrainFromBoxes(boxes, texts, block_list, &images);
61  if (images.PagesSize() == 0) {
62  tprintf("Failed to read pages from %s\n", input_imagename);
63  return false;
64  }
65  images.Shuffle();
66  if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
67  tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
68  return false;
69  }
70  return true;
71 }
void TrainFromBoxes(const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:76

◆ TrySuperscriptSplits()

WERD_RES * tesseract::Tesseract::TrySuperscriptSplits ( int  num_chopped_leading,
float  leading_certainty,
ScriptPos  leading_pos,
int  num_chopped_trailing,
float  trailing_certainty,
ScriptPos  trailing_pos,
WERD_RES * word,
bool *  is_good,
int *  retry_rebuild_leading,
int *  retry_rebuild_trailing 
)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters
[in] num_chopped_leading: how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript)
[in] leading_certainty: the (minimum) certainty had by the characters in the original leading section.
[in] leading_pos: "super" or "sub" (for debugging)
[in] num_chopped_trailing: how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript)
[in] trailing_certainty: the (minimum) certainty had by the characters in the original trailing section.
[in] trailing_pos: "super" or "sub" (for debugging)
[in] word: the word to try to chop up.
[out] is_good: do we believe our result?
[out] retry_rebuild_leading, retry_rebuild_trailing: If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.
Returns
A word which is the result of re-recognizing as asked.

Definition at line 369 of file superscript.cpp.

373  {
374  int num_chopped = word->chopped_word->NumBlobs();
375 
376  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
377 
378  // Chop apart the word into up to three pieces.
379 
380  BlamerBundle *bb0 = nullptr;
381  BlamerBundle *bb1 = nullptr;
382  WERD_RES *prefix = nullptr;
383  WERD_RES *core = nullptr;
384  WERD_RES *suffix = nullptr;
385  if (num_chopped_leading > 0) {
386  prefix = new WERD_RES(*word);
387  split_word(prefix, num_chopped_leading, &core, &bb0);
388  } else {
389  core = new WERD_RES(*word);
390  }
391 
392  if (num_chopped_trailing > 0) {
393  int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
394  split_word(core, split_pt, &suffix, &bb1);
395  }
396 
397  // Recognize the pieces in turn.
398  int saved_cp_multiplier = classify_class_pruner_multiplier;
399  int saved_im_multiplier = classify_integer_matcher_multiplier;
400  if (prefix) {
401  // Turn off Tesseract's y-position penalties for the leading superscript.
402  classify_class_pruner_multiplier.set_value(0);
403  classify_integer_matcher_multiplier.set_value(0);
404 
405  // Adjust our expectations about the baseline for this prefix.
406  if (superscript_debug >= 3) {
407  tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
408  }
409  recog_word_recursive(prefix);
410  if (superscript_debug >= 2) {
411  tprintf(" The leading bits look like %s %s\n", ScriptPosToString(leading_pos),
412  prefix->best_choice->unichar_string().c_str());
413  }
414 
415  // Restore the normal y-position penalties.
416  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
417  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
418  }
419 
420  if (superscript_debug >= 3) {
421  tprintf(" recognizing middle %d chopped blobs\n",
422  num_chopped - num_chopped_leading - num_chopped_trailing);
423  }
424 
425  if (suffix) {
426  // Turn off Tesseract's y-position penalties for the trailing superscript.
427  classify_class_pruner_multiplier.set_value(0);
428  classify_integer_matcher_multiplier.set_value(0);
429 
430  if (superscript_debug >= 3) {
431  tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
432  }
433  recog_word_recursive(suffix);
434  if (superscript_debug >= 2) {
435  tprintf(" The trailing bits look like %s %s\n", ScriptPosToString(trailing_pos),
436  suffix->best_choice->unichar_string().c_str());
437  }
438 
439  // Restore the normal y-position penalties.
440  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
441  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
442  }
443 
444  // Evaluate whether we think the results are believably better
445  // than what we already had.
446  bool good_prefix =
447  !prefix || BelievableSuperscript(superscript_debug >= 1, *prefix,
448  superscript_bettered_certainty * leading_certainty,
449  retry_rebuild_leading, nullptr);
450  bool good_suffix =
451  !suffix || BelievableSuperscript(superscript_debug >= 1, *suffix,
452  superscript_bettered_certainty * trailing_certainty, nullptr,
453  retry_rebuild_trailing);
454 
455  *is_good = good_prefix && good_suffix;
456  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
457  // None of it is any good. Quit now.
458  delete core;
459  delete prefix;
460  delete suffix;
461  delete bb1;
462  return nullptr;
463  }
464  recog_word_recursive(core);
465 
466  // Now paste the results together into core.
467  if (suffix) {
468  suffix->SetAllScriptPositions(trailing_pos);
469  join_words(core, suffix, bb1);
470  }
471  if (prefix) {
472  prefix->SetAllScriptPositions(leading_pos);
473  join_words(prefix, core, bb0);
474  core = prefix;
475  prefix = nullptr;
476  }
477 
478  if (superscript_debug >= 1) {
479  tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
480  core->best_choice->unichar_string().c_str());
481  }
482  return core;
483 }
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:193
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const

◆ unrej_good_chs()

void tesseract::Tesseract::unrej_good_chs ( WERD_RES * word)

Definition at line 98 of file docqual.cpp.

98  {
99  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
100  word->rebuild_word->blobs.empty()) {
101  using namespace std::placeholders; // for _1
102  word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
103  std::bind(acceptIfGoodQuality, word, _1));
104  }
105 }

◆ unrej_good_quality_words()

void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT & page_res_it)

Definition at line 142 of file docqual.cpp.

143  {
144  WERD_RES *word;
145  ROW_RES *current_row;
146  BLOCK_RES *current_block;
147  int i;
148 
149  page_res_it.restart_page();
150  while (page_res_it.word() != nullptr) {
151  check_debug_pt(page_res_it.word(), 100);
152  if (bland_unrej) {
153  word = page_res_it.word();
154  for (i = 0; i < word->reject_map.length(); i++) {
155  if (word->reject_map[i].accept_if_good_quality()) {
156  word->reject_map[i].setrej_quality_accept();
157  }
158  }
159  page_res_it.forward();
160  } else if ((page_res_it.row()->char_count > 0) &&
161  ((page_res_it.row()->rej_count /
162  static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
163  word = page_res_it.word();
164  if (word->reject_map.quality_recoverable_rejects() &&
165  (tessedit_unrej_any_wd ||
166  acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
167  word->best_choice->unichar_lengths().c_str()) !=
168  AC_UNACCEPTABLE)) {
169  unrej_good_chs(word);
170  }
171  page_res_it.forward();
172  } else {
173  // Skip to end of dodgy row.
174  current_row = page_res_it.row();
175  while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) {
176  page_res_it.forward();
177  }
178  }
179  check_debug_pt(page_res_it.word(), 110);
180  }
181  page_res_it.restart_page();
182  page_res_it.page_res->char_count = 0;
183  page_res_it.page_res->rej_count = 0;
184  current_block = nullptr;
185  current_row = nullptr;
186  while (page_res_it.word() != nullptr) {
187  if (current_block != page_res_it.block()) {
188  current_block = page_res_it.block();
189  current_block->char_count = 0;
190  current_block->rej_count = 0;
191  }
192  if (current_row != page_res_it.row()) {
193  current_row = page_res_it.row();
194  current_row->char_count = 0;
195  current_row->rej_count = 0;
196  current_row->whole_word_rej_count = 0;
197  }
198  page_res_it.rej_stat_word();
199  page_res_it.forward();
200  }
201 }
void unrej_good_chs(WERD_RES *word)
Definition: docqual.cpp:98

◆ word_adaptable()

bool tesseract::Tesseract::word_adaptable ( WERD_RES * word,
uint16_t  mode 
)

Definition at line 34 of file adaptions.cpp.

35  {
36  if (tessedit_adaption_debug) {
37  tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
38  word->best_choice->unichar_string().c_str(), word->best_choice->rating(),
39  word->best_choice->certainty());
40  }
41 
42  bool status = false;
43  std::bitset<16> flags(mode);
44 
45  enum MODES {
46  ADAPTABLE_WERD,
47  ACCEPTABLE_WERD,
48  CHECK_DAWGS,
49  CHECK_SPACES,
50  CHECK_ONE_ELL_CONFLICT,
51  CHECK_AMBIG_WERD
52  };
53 
54  /*
55 0: NO adaption
56 */
57  if (mode == 0) {
58  if (tessedit_adaption_debug) {
59  tprintf("adaption disabled\n");
60  }
61  return false;
62  }
63 
64  if (flags[ADAPTABLE_WERD]) {
65  status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
66  if (tessedit_adaption_debug && !status) {
67  tprintf("tess_would_adapt bit is false\n");
68  }
69  }
70 
71  if (flags[ACCEPTABLE_WERD]) {
72  status |= word->tess_accepted;
73  if (tessedit_adaption_debug && !status) {
74  tprintf("tess_accepted bit is false\n");
75  }
76  }
77 
78  if (!status) { // If not set then
79  return false; // ignore other checks
80  }
81 
82  if (flags[CHECK_DAWGS] && (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
83  (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
84  (word->best_choice->permuter() != USER_DAWG_PERM) &&
85  (word->best_choice->permuter() != NUMBER_PERM)) {
86  if (tessedit_adaption_debug) {
87  tprintf("word not in dawgs\n");
88  }
89  return false;
90  }
91 
92  if (flags[CHECK_ONE_ELL_CONFLICT] && one_ell_conflict(word, false)) {
93  if (tessedit_adaption_debug) {
94  tprintf("word has ell conflict\n");
95  }
96  return false;
97  }
98 
99  if (flags[CHECK_SPACES] &&
100  (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
101  if (tessedit_adaption_debug) {
102  tprintf("word contains spaces\n");
103  }
104  return false;
105  }
106 
107  if (flags[CHECK_AMBIG_WERD] && word->best_choice->dangerous_ambig_found()) {
108  if (tessedit_adaption_debug) {
109  tprintf("word is ambiguous\n");
110  }
111  return false;
112  }
113 
114  if (tessedit_adaption_debug) {
115  tprintf("returning status %d\n", status);
116  }
117  return status;
118 }

◆ word_blank_and_set_display()

bool tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT * pr_it)

word_blank_and_set_display() Word processor

Blank display of word then redisplay word according to current display mode settings

Definition at line 666 of file pgedit.cpp.

666  {
667  pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK, ScrollView::BLACK);
668  return word_set_display(pr_it);
669 }

◆ word_bln_display()

bool tesseract::Tesseract::word_bln_display ( PAGE_RES_IT * pr_it)

word_bln_display()

Normalize word and display in word window

Definition at line 676 of file pgedit.cpp.

676  {
677  WERD_RES *word_res = pr_it->word();
678  if (word_res->chopped_word == nullptr) {
679  // Setup word normalization parameters.
680  word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
681  classify_bln_numeric_mode, textord_use_cjk_fp_model,
682  poly_allow_detailed_fx, pr_it->row()->row, pr_it->block()->block);
683  }
684  bln_word_window_handle()->Clear();
685  display_bln_lines(bln_word_window_handle(), ScrollView::CYAN, 1.0, 0.0f, -1000.0f, 1000.0f);
686  C_BLOB_IT it(word_res->word->cblob_list());
688  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
689  it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN, bln_word_window_handle());
690  color = WERD::NextColor(color);
691  }
692  bln_word_window_handle()->Update();
693  return true;
694 }
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:298

◆ word_blob_quality()

int16_t tesseract::Tesseract::word_blob_quality ( WERD_RES * word)

Definition at line 51 of file docqual.cpp.

51  {
52  int16_t match_count = 0;
53  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
54  !word->rebuild_word->blobs.empty()) {
55  using namespace std::placeholders; // for _1
56  word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
57  std::bind(countMatchingBlobs, match_count, _1));
58  }
59  return match_count;
60 }

◆ word_char_quality()

void tesseract::Tesseract::word_char_quality ( WERD_RES * word,
int16_t *  match_count,
int16_t *  accepted_match_count 
)

Definition at line 81 of file docqual.cpp.

82  {
83  *match_count = 0;
84  *accepted_match_count = 0;
85  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
86  !word->rebuild_word->blobs.empty()) {
87  using namespace std::placeholders; // for _1
88  word->bln_boxes->ProcessMatchedBlobs(
89  *word->rebuild_word,
90  std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1));
91  }
92 }

◆ word_contains_non_1_digit()

bool tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 496 of file reject.cpp.

496  {
497  int16_t i;
498  int16_t offset;
499 
500  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
501  if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&
502  (word_lengths[i] != 1 || word[offset] != '1')) {
503  return true;
504  }
505  }
506  return false;
507 }

◆ word_deletable()

CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES * word,
int16_t &  delete_mode 
)

Definition at line 825 of file docqual.cpp.

825  {
826  int word_len = word->reject_map.length();
827  float rating_per_ch;
828  TBOX box; // BB of word
829 
830  if (word->unlv_crunch_mode == CR_NONE) {
831  delete_mode = 0;
832  return CR_NONE;
833  }
834 
835  if (word_len == 0) {
836  delete_mode = 1;
837  return CR_DELETE;
838  }
839 
840  if (word->rebuild_word != nullptr) {
841  // Cube leaves rebuild_word nullptr.
842  box = word->rebuild_word->bounding_box();
843  if (box.height() < crunch_del_min_ht * kBlnXHeight) {
844  delete_mode = 4;
845  return CR_DELETE;
846  }
847 
848  if (noise_outlines(word->rebuild_word)) {
849  delete_mode = 5;
850  return CR_DELETE;
851  }
852  }
853 
854  if ((failure_count(word) * 1.5) > word_len) {
855  delete_mode = 2;
856  return CR_LOOSE_SPACE;
857  }
858 
859  if (word->best_choice->certainty() < crunch_del_cert) {
860  delete_mode = 7;
861  return CR_LOOSE_SPACE;
862  }
863 
864  rating_per_ch = word->best_choice->rating() / word_len;
865 
866  if (rating_per_ch > crunch_del_rating) {
867  delete_mode = 8;
868  return CR_LOOSE_SPACE;
869  }
870 
871  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
872  delete_mode = 9;
873  return CR_LOOSE_SPACE;
874  }
875 
876  if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
877  delete_mode = 10;
878  return CR_LOOSE_SPACE;
879  }
880 
881  if (box.height() > crunch_del_max_ht * kBlnXHeight) {
882  delete_mode = 11;
883  return CR_LOOSE_SPACE;
884  }
885 
886  if (box.width() < crunch_del_min_width * kBlnXHeight) {
887  delete_mode = 3;
888  return CR_LOOSE_SPACE;
889  }
890 
891  delete_mode = 0;
892  return CR_NONE;
893 }
@ CR_LOOSE_SPACE
Definition: pageres.h:160
@ CR_DELETE
Definition: pageres.h:160
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:907
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:895

◆ word_display()

bool tesseract::Tesseract::word_display ( PAGE_RES_IT *  pr_it)

word_display() Word Processor

Display a word according to its display modes

Definition at line 701 of file pgedit.cpp.

701  {
702  WERD_RES *word_res = pr_it->word();
703  WERD *word = word_res->word;
704  TBOX word_bb; // word bounding box
705  int word_height; // ht of word BB
706  bool displayed_something = false;
707  float shift; // from bot left
708 
709  if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
710 # ifndef DISABLED_LEGACY_ENGINE
711  BoxWord *box_word = word_res->box_word;
712  WERD_CHOICE *best_choice = word_res->best_choice;
713  int length = box_word->length();
714  if (word_res->fontinfo == nullptr) {
715  return false;
716  }
717  const FontInfo &font_info = *word_res->fontinfo;
718  for (int i = 0; i < length; ++i) {
719  ScrollView::Color color = ScrollView::GREEN;
720  switch (color_mode) {
721  case CM_SUBSCRIPT:
722  if (best_choice->BlobPosition(i) == SP_SUBSCRIPT) {
723  color = ScrollView::RED;
724  }
725  break;
726  case CM_SUPERSCRIPT:
727  if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT) {
728  color = ScrollView::RED;
729  }
730  break;
731  case CM_ITALIC:
732  if (font_info.is_italic()) {
733  color = ScrollView::RED;
734  }
735  break;
736  case CM_BOLD:
737  if (font_info.is_bold()) {
738  color = ScrollView::RED;
739  }
740  break;
741  case CM_FIXEDPITCH:
742  if (font_info.is_fixed_pitch()) {
743  color = ScrollView::RED;
744  }
745  break;
746  case CM_SERIF:
747  if (font_info.is_serif()) {
748  color = ScrollView::RED;
749  }
750  break;
751  case CM_SMALLCAPS:
752  if (word_res->small_caps) {
753  color = ScrollView::RED;
754  }
755  break;
756  case CM_DROPCAPS:
757  if (best_choice->BlobPosition(i) == SP_DROPCAP) {
758  color = ScrollView::RED;
759  }
760  break;
761  // TODO(rays) underline is currently completely unsupported.
762  case CM_UNDERLINE:
763  default:
764  break;
765  }
766  image_win->Pen(color);
767  TBOX box = box_word->BlobBox(i);
768  image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
769  }
770  return true;
771 # else
772  return false;
773 # endif // ndef DISABLED_LEGACY_ENGINE
774  }
775  /*
776  Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
777  etc. are to keep the compiler happy.
778 */
779  // display bounding box
780  if (word->display_flag(DF_BOX)) {
781  word->bounding_box().plot(image_win,
782  static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
783  static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
784 
785  auto c = static_cast<ScrollView::Color>((int32_t)editor_image_blob_bb_color);
786  image_win->Pen(c);
787  // cblob iterator
788  C_BLOB_IT c_it(word->cblob_list());
789  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
790  c_it.data()->bounding_box().plot(image_win);
791  }
792  displayed_something = true;
793  }
794 
795  // display edge steps
796  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
797  word->plot(image_win); // rainbow colors
798  displayed_something = true;
799  }
800 
801  // display poly approx
802  if (word->display_flag(DF_POLYGONAL)) {
803  // need to convert
804  TWERD *tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
805  tword->plot(image_win);
806  delete tword;
807  displayed_something = true;
808  }
809 
810  // Display correct text and blamer information.
811  std::string text;
812  std::string blame;
813  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
814  text = word->text();
815  }
816  if (word->display_flag(DF_BLAMER) &&
817  !(word_res->blamer_bundle != nullptr &&
818  word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
819  text = "";
820  const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
821  if (blamer_bundle == nullptr) {
822  text += "NULL";
823  } else {
824  text = blamer_bundle->TruthString();
825  }
826  text += " -> ";
827  std::string best_choice_str;
828  if (word_res->best_choice == nullptr) {
829  best_choice_str = "NULL";
830  } else {
831  word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
832  }
833  text += best_choice_str;
834  IncorrectResultReason reason =
835  (blamer_bundle == nullptr) ? IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
836  ASSERT_HOST(reason < IRR_NUM_REASONS);
837  blame += " [";
838  blame += BlamerBundle::IncorrectReasonName(reason);
839  blame += "]";
840  }
841  if (text.length() > 0) {
842  word_bb = word->bounding_box();
843  image_win->Pen(ScrollView::RED);
844  word_height = word_bb.height();
845  int text_height = 0.50 * word_height;
846  if (text_height > 20) {
847  text_height = 20;
848  }
849  image_win->TextAttributes("Arial", text_height, false, false, false);
850  shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
851  image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height, text.c_str());
852  if (blame.length() > 0) {
853  image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height - text_height,
854  blame.c_str());
855  }
856 
857  displayed_something = true;
858  }
859 
860  if (!displayed_something) { // display BBox anyway
861  word->bounding_box().plot(image_win,
862  static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
863  static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
864  }
865  return true;
866 }
@ SP_DROPCAP
Definition: ratngs.h:250
@ IRR_CORRECT
Definition: blamer.h:58
@ IRR_PAGE_LAYOUT
Definition: blamer.h:77
int editor_image_word_bb_color
Definition: pgedit.cpp:123
int editor_image_blob_bb_color
Definition: pgedit.cpp:124
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:778
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:623
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:648
void Pen(Color color)
Definition: scrollview.cpp:723
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:589

◆ word_dumper()

bool tesseract::Tesseract::word_dumper ( PAGE_RES_IT *  pr_it)

word_dumper()

Dump members to the debug window

Definition at line 876 of file pgedit.cpp.

876  {
877  if (pr_it->block()->block != nullptr) {
878  tprintf("\nBlock data...\n");
879  pr_it->block()->block->print(nullptr, false);
880  }
881  tprintf("\nRow data...\n");
882  pr_it->row()->row->print(nullptr);
883  tprintf("\nWord data...\n");
884  WERD_RES *word_res = pr_it->word();
885  word_res->word->print();
886  if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&
887  word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
888  tprintf("Current blamer debug: %s\n", word_res->blamer_bundle->debug().c_str());
889  }
890  return true;
891 }

◆ word_outline_errs()

int16_t tesseract::Tesseract::word_outline_errs ( WERD_RES *  word)

Definition at line 62 of file docqual.cpp.

62  {
63  int16_t i = 0;
64  int16_t err_count = 0;
65 
66  if (word->rebuild_word != nullptr) {
67  for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
68  TBLOB *blob = word->rebuild_word->blobs[b];
69  err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
70  i++;
71  }
72  }
73  return err_count;
74 }
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:107

◆ word_set_display()

bool tesseract::Tesseract::word_set_display ( PAGE_RES_IT *  pr_it)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 899 of file pgedit.cpp.

899  {
900  WERD *word = pr_it->word()->word;
901  word->set_display_flag(DF_BOX, word_display_mode[DF_BOX]);
902  word->set_display_flag(DF_TEXT, word_display_mode[DF_TEXT]);
903  word->set_display_flag(DF_POLYGONAL, word_display_mode[DF_POLYGONAL]);
904  word->set_display_flag(DF_EDGE_STEP, word_display_mode[DF_EDGE_STEP]);
905  word->set_display_flag(DF_BN_POLYGONAL, word_display_mode[DF_BN_POLYGONAL]);
906  word->set_display_flag(DF_BLAMER, word_display_mode[DF_BLAMER]);
907  return word_display(pr_it);
908 }

◆ worst_noise_blob()

int16_t tesseract::Tesseract::worst_noise_blob ( WERD_RES *  word_res,
float *  worst_noise_score 
)

Definition at line 685 of file fixspace.cpp.

685  {
686  float noise_score[512];
687  int min_noise_blob; // 1st contender
688  int max_noise_blob; // last contender
689  int non_noise_count;
690  int worst_noise_blob; // Worst blob
691  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
692  float non_noise_limit = kBlnXHeight * 0.8;
693 
694  if (word_res->rebuild_word == nullptr) {
695  return -1; // Can't handle cube words.
696  }
697 
698  // Normalised.
699  auto blob_count = word_res->box_word->length();
700  ASSERT_HOST(blob_count <= 512);
701  if (blob_count < 5) {
702  return -1; // too short to split
703  }
704 
705  /* Get the noise scores for all blobs */
706 
707 #ifndef SECURE_NAMES
708  if (debug_fix_space_level > 5) {
709  tprintf("FP fixspace Noise metrics for \"%s\": ",
710  word_res->best_choice->unichar_string().c_str());
711  }
712 #endif
713 
714  for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
715  TBLOB *blob = word_res->rebuild_word->blobs[i];
716  if (word_res->reject_map[i].accepted()) {
717  noise_score[i] = non_noise_limit;
718  } else {
719  noise_score[i] = blob_noise_score(blob);
720  }
721 
722  if (debug_fix_space_level > 5) {
723  tprintf("%1.1f ", noise_score[i]);
724  }
725  }
726  if (debug_fix_space_level > 5) {
727  tprintf("\n");
728  }
729 
730  /* Now find the worst one which is far enough away from the end of the word */
731 
732  non_noise_count = 0;
733  int i;
734  for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
735  if (noise_score[i] >= non_noise_limit) {
736  non_noise_count++;
737  }
738  }
739  if (non_noise_count < fixsp_non_noise_limit) {
740  return -1;
741  }
742 
743  min_noise_blob = i;
744 
745  non_noise_count = 0;
746  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
747  if (noise_score[i] >= non_noise_limit) {
748  non_noise_count++;
749  }
750  }
751  if (non_noise_count < fixsp_non_noise_limit) {
752  return -1;
753  }
754 
755  max_noise_blob = i;
756 
757  if (min_noise_blob > max_noise_blob) {
758  return -1;
759  }
760 
761  *worst_noise_score = small_limit;
762  worst_noise_blob = -1;
763  for (auto i = min_noise_blob; i <= max_noise_blob; i++) {
764  if (noise_score[i] < *worst_noise_score) {
765  worst_noise_blob = i;
766  *worst_noise_score = noise_score[i];
767  }
768  }
769  return worst_noise_blob;
770 }

◆ write_results()

void tesseract::Tesseract::write_results ( PAGE_RES_IT &  page_res_it,
char  newline_type,
bool  force_eol 
)

Definition at line 99 of file output.cpp.

101  { // override tilde crunch?
102  WERD_RES *word = page_res_it.word();
103  const UNICHARSET &uchset = *word->uch_set;
104  bool need_reject = false;
105  UNICHAR_ID space = uchset.unichar_to_id(" ");
106 
107  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
108  !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
109  if ((word->unlv_crunch_mode != CR_DELETE) &&
110  (!stats_.tilde_crunch_written ||
111  ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
112  !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
113  if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
114  !word->word->flag(W_FUZZY_SP)) {
115  stats_.last_char_was_tilde = false;
116  }
117  need_reject = true;
118  }
119  if ((need_reject && !stats_.last_char_was_tilde) ||
120  (force_eol && stats_.write_results_empty_block)) {
121  /* Write a reject char - mark as rejected unless zero_rejection mode */
122  stats_.last_char_was_tilde = true;
123  stats_.tilde_crunch_written = true;
124  stats_.last_char_was_newline = false;
125  stats_.write_results_empty_block = false;
126  }
127 
128  if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
129  stats_.tilde_crunch_written = false;
130  stats_.last_char_was_newline = true;
131  stats_.last_char_was_tilde = false;
132  }
133 
134  if (force_eol) {
135  stats_.write_results_empty_block = true;
136  }
137  return;
138  }
139 
140  /* NORMAL PROCESSING of non tilde crunched words */
141 
142  stats_.tilde_crunch_written = false;
143  if (newline_type) {
144  stats_.last_char_was_newline = true;
145  } else {
146  stats_.last_char_was_newline = false;
147  }
148  stats_.write_results_empty_block = force_eol; // about to write a real word
149 
150  if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
151  !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
152  (word->best_choice->unichar_id(0) == space)) {
153  /* Prevent adjacent tilde across words - we know that adjacent tildes within
154  words have been removed */
155  word->MergeAdjacentBlobs(0);
156  }
157  if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
158  stats_.last_char_was_tilde = false;
159  } else {
160  if (word->reject_map.length() > 0) {
161  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
162  stats_.last_char_was_tilde = true;
163  } else {
164  stats_.last_char_was_tilde = false;
165  }
166  } else if (word->word->space() > 0) {
167  stats_.last_char_was_tilde = false;
168  }
169  /* else it is unchanged as there are no output chars */
170  }
171 
172  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
173 
174  set_unlv_suspects(word);
175  check_debug_pt(word, 120);
176  if (tessedit_rejection_debug) {
177  tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
178  dict_word(*(word->best_choice)));
179  }
180  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
181  if (tessedit_zero_rejection) {
182  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
183  for (unsigned i = 0; i < word->best_choice->length(); ++i) {
184  if (word->reject_map[i].rejected()) {
185  word->reject_map[i].setrej_minimal_rej_accept();
186  }
187  }
188  }
189  if (tessedit_minimal_rejection) {
190  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
191  for (unsigned i = 0; i < word->best_choice->length(); ++i) {
192  if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
193  word->reject_map[i].setrej_minimal_rej_accept();
194  }
195  }
196  }
197  }
198 }
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:270

The documentation for this class was generated from the following files: