/* -*-C-*- ******************************************************************************** * * File: pieces.c (Formerly pieces.c) * Description: * Author: Mark Seaman, OCR Technology * Created: Fri Oct 16 14:37:00 1987 * Modified: Mon May 20 12:12:35 1991 (Mark Seaman) marks@hpgrlt * Language: C * Package: N/A * Status: Reusable Software Component * * (c) Copyright 1987, Hewlett-Packard Company. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * *********************************************************************************/ /*---------------------------------------------------------------------- I n c l u d e s ----------------------------------------------------------------------*/ #include "blobs.h" #include "freelist.h" #include "helpers.h" #include "matchtab.h" #include "matrix.h" #include "ndminx.h" #include "plotseg.h" #include "ratngs.h" #include "seam.h" #include "states.h" #include "wordclass.h" #include "wordrec.h" // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H #include "config_auto.h" #endif /*---------------------------------------------------------------------- F u n c t i o n s ----------------------------------------------------------------------*/ /********************************************************************** * bounds_of_piece * * Find the bounds of the piece that will be created by joining the * requested collection of pieces together. **********************************************************************/ TBOX bounds_of_piece(TBOX *bounds, inT16 start, inT16 end) { TBOX all_together = bounds[start]; for (int x = start + 1; x <= end; x++) { all_together += bounds[x]; } return all_together; } /********************************************************************** * classify_piece * * Create a larger piece from a collection of smaller ones. Classify * it and return the results. Take the large piece apart to leave * the collection of small pieces un modified. **********************************************************************/ namespace tesseract { BLOB_CHOICE_LIST *Wordrec::classify_piece(TBLOB *pieces, const DENORM& denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle) { BLOB_CHOICE_LIST *choices; TBLOB *blob; inT16 x; join_pieces(pieces, seams, start, end); for (blob = pieces, x = 0; x < start; x++) { blob = blob->next; } choices = classify_blob(blob, denorm, "pieces:", White, blamer_bundle); break_pieces(blob, seams, start, end); #ifndef GRAPHICS_DISABLED if (wordrec_display_segmentations > 2) { STATE current_state; SEARCH_STATE chunk_groups; set_n_ones (¤t_state, array_count(seams)); chunk_groups = bin_to_chunks(¤t_state, array_count(seams)); display_segmentation(pieces, chunk_groups); window_wait(segm_window); memfree(chunk_groups); } #endif return (choices); } template int SortByUnicharID(const void *void1, const void *void2) { const BLOB_CHOICE *p1 = *reinterpret_cast(void1); const BLOB_CHOICE *p2 = *reinterpret_cast(void2); return p1->unichar_id() - p2->unichar_id(); } template int SortByRating(const void *void1, const void *void2) { const BLOB_CHOICE *p1 = *reinterpret_cast(void1); const BLOB_CHOICE *p2 = *reinterpret_cast(void2); if (p1->rating() < p2->rating()) return 1; return -1; } /********************************************************************** * fill_filtered_fragment_list * * Filter the fragment list so that the filtered_choices only contain * fragments that are in the correct position. choices is the list * that we are going to filter. fragment_pos is the position in the * fragment that we are looking for and num_frag_parts is the the * total number of pieces. The result will be appended to * filtered_choices. **********************************************************************/ void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices) { BLOB_CHOICE_IT filtered_choices_it(filtered_choices); BLOB_CHOICE_IT choices_it(choices); for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) { UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id); if (frag != NULL && frag->get_pos() == fragment_pos && frag->get_total() == num_frag_parts) { // Recover the unichar_id of the unichar that this fragment is // a part of BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data()); int original_unichar = unicharset.unichar_to_id(frag->get_unichar()); b->set_unichar_id(original_unichar); filtered_choices_it.add_to_end(b); } } filtered_choices->sort(SortByUnicharID); } /********************************************************************** * merge_and_put_fragment_lists * * Merge the fragment lists in choice_lists and append it to the * ratings matrix. **********************************************************************/ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) { BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts]; for (int i = 0; i < num_frag_parts; i++) { choice_lists_it[i].set_to_list(&choice_lists[i]); choice_lists_it[i].mark_cycle_pt(); } BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column); if (merged_choice == NULL) merged_choice = new BLOB_CHOICE_LIST; bool end_of_list = false; BLOB_CHOICE_IT merged_choice_it(merged_choice); while (!end_of_list) { // Find the maximum unichar_id of the current entry the iterators // are pointing at UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id(); int max_list = 0; for (int i = 0; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); if (max_unichar_id < unichar_id) { max_unichar_id = unichar_id; max_list = i; } } // Move the each iterators until it gets to an entry that has a // value greater than or equal to max_unichar_id for (int i = 0; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); while (!choice_lists_it[i].cycled_list() && unichar_id < max_unichar_id) { choice_lists_it[i].forward(); unichar_id = choice_lists_it[i].data()->unichar_id(); } if (choice_lists_it[i].cycled_list()) { end_of_list = true; break; } } if (end_of_list) break; // Checks if the fragments are parts of the same character UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id(); bool same_unichar = true; for (int i = 1; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); if (unichar_id != first_unichar_id) { same_unichar = false; break; } } if (same_unichar) { // Add the merged character to the result UNICHAR_ID merged_unichar_id = first_unichar_id; inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id(); inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2(); inT16 merged_min_xheight = choice_lists_it[0].data()->min_xheight(); inT16 merged_max_xheight = choice_lists_it[0].data()->max_xheight(); int merged_script_id = choice_lists_it[0].data()->script_id(); bool merged_adapted = choice_lists_it[0].data()->adapted(); float merged_rating = 0, merged_certainty = 0; for (int i = 0; i < num_frag_parts; i++) { float rating = choice_lists_it[i].data()->rating(); float certainty = choice_lists_it[i].data()->certainty(); if (i == 0 || certainty < merged_certainty) merged_certainty = certainty; merged_rating += rating; choice_lists_it[i].forward(); if (choice_lists_it[i].cycled_list()) end_of_list = true; IntersectRange(choice_lists_it[i].data()->min_xheight(), choice_lists_it[i].data()->max_xheight(), &merged_min_xheight, &merged_max_xheight); } merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id, merged_rating, merged_certainty, merged_fontinfo_id, merged_fontinfo_id2, merged_script_id, merged_min_xheight, merged_max_xheight, merged_adapted)); } } if (classify_debug_level) print_ratings_list("Merged Fragments", merged_choice, unicharset); if (merged_choice->empty()) delete merged_choice; else ratings->put(row, column, merged_choice); delete [] choice_lists_it; } /********************************************************************** * get_fragment_lists * * Recursively go through the ratings matrix to find lists of fragments * to be merged in the function merge_and_put_fragment_lists. * current_frag is the postion of the piece we are looking for. * current_row is the row in the rating matrix we are currently at. * start is the row we started initially, so that we can know where * to append the results to the matrix. num_frag_parts is the total * number of pieces we are looking for and num_blobs is the size of the * ratings matrix. **********************************************************************/ void Wordrec::get_fragment_lists(inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists) { if (current_frag == num_frag_parts) { merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts, choice_lists, ratings); return; } for (inT16 x = current_row; x < num_blobs; x++) { BLOB_CHOICE_LIST *choices = ratings->get(current_row, x); if (choices == NULL) continue; fill_filtered_fragment_list(choices, current_frag, num_frag_parts, &choice_lists[current_frag]); if (!choice_lists[current_frag].empty()) { get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts, num_blobs, ratings, choice_lists); choice_lists[current_frag].clear(); } } } /********************************************************************** * merge_fragments * * Try to merge fragments in the ratings matrix and put the result in * the corresponding row and column **********************************************************************/ void Wordrec::merge_fragments(MATRIX *ratings, inT16 num_blobs) { BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks]; for (inT16 start = 0; start < num_blobs; start++) { for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks; frag_parts++) { get_fragment_lists(0, start, start, frag_parts, num_blobs, ratings, choice_lists); } } // Delete fragments from the rating matrix for (inT16 x = 0; x < num_blobs; x++) { for (inT16 y = x; y < num_blobs; y++) { BLOB_CHOICE_LIST *choices = ratings->get(x, y); if (choices != NULL) { BLOB_CHOICE_IT choices_it(choices); for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) { UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id); if (frag != NULL) delete choices_it.extract(); } } } } } /********************************************************************** * get_piece_rating * * Check to see if this piece has already been classified. If it has * return that rating. Otherwise build the piece from the smaller * pieces, classify it, store the rating for later, and take the piece * apart again. **********************************************************************/ BLOB_CHOICE_LIST *Wordrec::get_piece_rating(MATRIX *ratings, TBLOB *blobs, const DENORM& denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle) { BLOB_CHOICE_LIST *choices = ratings->get(start, end); if (choices == NOT_CLASSIFIED) { choices = classify_piece(blobs, denorm, seams, start, end, blamer_bundle); ratings->put(start, end, choices); if (wordrec_debug_level > 1) { tprintf("get_piece_rating(): updated ratings matrix\n"); ratings->print(getDict().getUnicharset()); } } return (choices); } /********************************************************************** * record_blob_bounds * * Set up and initialize an array that holds the bounds of a set of * blobs. Caller should delete[] the array. **********************************************************************/ TBOX *Wordrec::record_blob_bounds(TBLOB *blobs) { int nblobs = count_blobs(blobs); TBOX *bboxes = new TBOX[nblobs]; inT16 x = 0; for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) { bboxes[x] = blob->bounding_box(); x++; } return bboxes; } /********************************************************************** * record_piece_ratings * * Save the choices for all the pieces that have been classified into * a matrix that can be used to look them up later. A two dimensional * matrix is created. The indices correspond to the starting and * ending initial piece number. **********************************************************************/ MATRIX *Wordrec::record_piece_ratings(TBLOB *blobs) { inT16 num_blobs = count_blobs(blobs); TBOX *bounds = record_blob_bounds(blobs); MATRIX *ratings = new MATRIX(num_blobs); for (int x = 0; x < num_blobs; x++) { for (int y = x; y < num_blobs; y++) { TBOX piecebox = bounds_of_piece(bounds, x, y); BLOB_CHOICE_LIST *choices = blob_match_table.get_match_by_box(piecebox); if (choices != NULL) { ratings->put(x, y, choices); } } } if (merge_fragments_in_matrix) merge_fragments(ratings, num_blobs); delete []bounds; return ratings; } } // namespace tesseract