classify.h (Tesseract 3.02)
// File:        classify.h
// Description: classify class.
// Author:      Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
#define TESSERACT_CLASSIFY_CLASSIFY_H__

#include "adaptive.h"
#include "ccstruct.h"
#include "classify.h"
#include "dict.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "intfx.h"
#include "intmatcher.h"
#include "normalis.h"
#include "ratngs.h"
#include "ocrfeatures.h"
#include "unicity_table.h"

class ScrollView;
class WERD_CHOICE;
class WERD_RES;
struct ADAPT_RESULTS;
struct NORM_PROTOS;

static const int kUnknownFontinfoId = -1;
static const int kBlankFontinfoId = -2;

namespace tesseract {

struct ShapeRating;
class ShapeTable;

// How segmented is a blob. In this enum, character refers to a classifiable
// unit, but that is too long and character is usually easier to understand.
enum CharSegmentationType {
  CST_FRAGMENT,  // A partial character.
  CST_WHOLE,     // A correctly segmented character.
  CST_IMPROPER,  // More than one but less than 2 characters.
  CST_NGRAM      // Multiple characters.
};
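
// Illustrative reading of the categories (the example blobs are assumed, for
// exposition only): a blob holding the joined pair "rn" would be CST_NGRAM,
// a blob holding exactly one "m" is CST_WHOLE, a blob holding an "m" plus a
// piece of the next letter is CST_IMPROPER, and a blob holding only half of
// a chopped "m" is CST_FRAGMENT.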

class Classify : public CCStruct {
 public:
  Classify();
  virtual ~Classify();
  Dict& getDict() {
    return dict_;
  }

  const ShapeTable* shape_table() const {
    return shape_table_;
  }

  /* adaptive.cpp ************************************************************/
  ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
  int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId);
  // Runs the class pruner from int_templates on the given features, returning
  // the number of classes output in results.
  //   int_templates          Class pruner tables
  //   num_features           Number of features in blob
  //   features               Array of features
  //   normalization_factors  (input) Array of int_templates->NumClasses fudge
  //                          factors from blob normalization process.
  //                          (Indexed by CLASS_INDEX)
  //   expected_num_features  (input) Array of int_templates->NumClasses
  //                          expected number of features for each class.
  //                          (Indexed by CLASS_INDEX)
  //   results                (output) Sorted array of pruned classes. Must be
  //                          sized to take the maximum possible number of
  //                          outputs: int_templates->NumClasses.
  int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
                   int num_features,
                   const INT_FEATURE_STRUCT* features,
                   const uinT8* normalization_factors,
                   const uinT16* expected_num_features,
                   CP_RESULT_STRUCT* results);
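  //
  // Usage sketch (illustrative only; the local variables below are assumed by
  // this example, not declared in this header):
  //
  //   // Per the contract above, results must hold int_templates->NumClasses.
  //   CP_RESULT_STRUCT* results =
  //       new CP_RESULT_STRUCT[int_templates->NumClasses];
  //   int num_classes = PruneClasses(int_templates, num_features, features,
  //                                  normalization_factors,
  //                                  expected_num_features, results);
  //   // results[0 .. num_classes - 1] now holds the pruned classes, sorted.
  //   delete [] results;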
  void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
                      CLASS_CUTOFF_ARRAY Cutoffs);
  void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
  void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
  ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
  /* normmatch.cpp ************************************************************/
  FLOAT32 ComputeNormMatch(CLASS_ID ClassId,
                           const FEATURE_STRUCT& feature, BOOL8 DebugMatch);
  void FreeNormProtos();
  NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
  /* protos.cpp ***************************************************************/
  void ReadClassFile();
  void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
  INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
                                   const UNICHARSET& target_unicharset);
  /* adaptmatch.cpp ***********************************************************/

  // Learn the given word using its chopped_word, seam_array, denorm,
  // box_word, best_state, and correct_text to learn both correctly and
  // incorrectly segmented blobs. If filename is not NULL, then LearnBlob
  // is called and the data will be written to a file for static training.
  // Otherwise AdaptToBlob is called for adaption within a document.
  // If rejmap is not NULL, then only chars with a rejmap entry of '1' will
  // be learned, otherwise all chars with good correct_text are learned.
  void LearnWord(const char* filename, const char *rejmap, WERD_RES *word);
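  //
  // Call sketch (illustrative; "classify.tr" is an assumed example filename):
  //
  //   LearnWord(NULL, NULL, word);           // adapt in-document (AdaptToBlob)
  //   LearnWord("classify.tr", NULL, word);  // write features for static training
  //   LearnWord(NULL, rejmap, word);         // learn only chars marked '1' in rejmap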

  // Builds a blob of length fragments, from the word, starting at start,
  // and then learns it, as having the given correct_text.
  // If filename is not NULL, then LearnBlob
  // is called and the data will be written to a file for static training.
  // Otherwise AdaptToBlob is called for adaption within a document.
  // threshold is a magic number required by AdaptToChar and generated by
  // GetAdaptThresholds.
  // Although it can be partly inferred from the string, segmentation is
  // provided to explicitly clarify the character segmentation.
  void LearnPieces(const char* filename, int start, int length,
                   float threshold, CharSegmentationType segmentation,
                   const char* correct_text, WERD_RES *word);
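  //
  // Sketch (illustrative; start, adapt_threshold and the correct_text "m"
  // are assumptions for this example):
  //
  //   // Learn blob `start` on its own as a correctly segmented "m",
  //   // adapting in-document because filename is NULL.
  //   LearnPieces(NULL, start, 1, adapt_threshold, CST_WHOLE, "m", word);
  //   // A length > 1 joins that many fragments into one blob before learning.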
  void InitAdaptiveClassifier(bool load_pre_trained_templates);
  void InitAdaptedClass(TBLOB *Blob,
                        const DENORM& denorm,
                        CLASS_ID ClassId,
                        int FontinfoId,
                        ADAPT_CLASS Class,
                        ADAPT_TEMPLATES Templates);
  void AdaptToPunc(TBLOB *Blob,
                   const DENORM& denorm,
                   CLASS_ID ClassId,
                   int FontinfoId,
                   FLOAT32 Threshold);
  void AmbigClassifier(TBLOB *Blob,
                       const DENORM& denorm,
                       INT_TEMPLATES Templates,
                       ADAPT_CLASS *Classes,
                       UNICHAR_ID *Ambiguities,
                       ADAPT_RESULTS *Results);
  void MasterMatcher(INT_TEMPLATES templates,
                     inT16 num_features,
                     const INT_FEATURE_STRUCT* features,
                     const uinT8* norm_factors,
                     ADAPT_CLASS* classes,
                     int debug,
                     int num_classes,
                     const TBOX& blob_box,
                     CLASS_PRUNER_RESULTS results,
                     ADAPT_RESULTS* final_results);
  // Converts configs to fonts, and if the result is not adapted, and a
  // shape_table_ is present, the shape is expanded to include all
  // unichar_ids represented, before applying a set of corrections to the
  // distance rating in int_result (see ComputeCorrectedRating).
  // The results are added to the final_results output.
  void ExpandShapesAndApplyCorrections(ADAPT_CLASS* classes,
                                       bool debug,
                                       int class_id,
                                       int bottom, int top,
                                       float cp_rating,
                                       int blob_length,
                                       const uinT8* cn_factors,
                                       INT_RESULT_STRUCT& int_result,
                                       ADAPT_RESULTS* final_results);
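  //
  // Illustrative expansion (the shape contents shown are assumed example
  // data): if the matched config of class_id maps to a shape_table_ entry
  // containing {('m', font 3), ('rn', font 7)}, the single int_result is
  // expanded into one corrected result for 'm' and one for 'rn' before being
  // added to final_results.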
  // Applies a set of corrections to the distance im_rating,
  // including the cn_correction, miss penalty and additional penalty
  // for non-alnums being vertical misfits. Returns the corrected distance.
  double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
                                double im_rating, int feature_misses,
                                int bottom, int top,
                                int blob_length, const uinT8* cn_factors);
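  //
  // Schematic form of the correction (an assumed sketch; the helper names
  // below are placeholders, not functions declared in this header):
  //
  //   corrected = cn_correction(im_rating, cn_factors[unichar_id], blob_length)
  //               + miss_penalty(feature_misses)
  //               + vertical_misfit_penalty(bottom, top)   // non-alnums only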
  void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
                               ADAPT_RESULTS *Results,
                               BLOB_CHOICE_LIST *Choices);
  void AddNewResult(ADAPT_RESULTS *results,
                    CLASS_ID class_id,
                    int shape_id,
                    FLOAT32 rating,
                    bool adapted,
                    int config,
                    int fontinfo_id,
                    int fontinfo_id2);
  int GetAdaptiveFeatures(TBLOB *Blob,
                          INT_FEATURE_ARRAY IntFeatures,
                          FEATURE_SET *FloatFeatures);

#ifndef GRAPHICS_DISABLED
  void DebugAdaptiveClassifier(TBLOB *Blob,
                               const DENORM& denorm,
                               ADAPT_RESULTS *Results);
#endif
  void GetAdaptThresholds(TWERD * Word,
                          const DENORM& denorm,
                          const WERD_CHOICE& BestChoice,
                          const WERD_CHOICE& BestRawChoice,
                          FLOAT32 Thresholds[]);

  PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
                             int NumBadFeat,
                             FEATURE_ID BadFeat[],
                             INT_CLASS IClass,
                             ADAPT_CLASS Class,
                             BIT_VECTOR TempProtoMask);
  int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
                             CLASS_ID ClassId,
                             int FontinfoId,
                             int NumFeatures,
                             INT_FEATURE_ARRAY Features,
                             FEATURE_SET FloatFeatures);
  void MakePermanent(ADAPT_TEMPLATES Templates,
                     CLASS_ID ClassId,
                     int ConfigId,
                     const DENORM& denorm,
                     TBLOB *Blob);
  void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
  void RemoveExtraPuncs(ADAPT_RESULTS *Results);
  void RemoveBadMatches(ADAPT_RESULTS *Results);
  void SetAdaptiveThreshold(FLOAT32 Threshold);
  void ShowBestMatchFor(TBLOB *Blob,
                        const DENORM& denorm,
                        CLASS_ID ClassId,
                        int shape_id,
                        BOOL8 AdaptiveOn,
                        BOOL8 PreTrainedOn,
                        ADAPT_RESULTS *Results);
  // Returns a string for the classifier class_id: either the corresponding
  // unicharset debug_str or the shape_table_ debug str.
  STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
                           int class_id, int config_id) const;
  // Converts a classifier class_id index with a config ID to:
  // shape_table_ present: a shape_table_ index OR
  // No shape_table_: a font ID.
  // Without shape training, each class_id, config pair represents a single
  // unichar id/font combination, so this function looks up the corresponding
  // font id.
  // With shape training, each class_id, config pair represents a single
  // shape table index, so the fontset_table stores the shape table index,
  // and the shape_table_ must be consulted to obtain the actual unichar_id/
  // font combinations that the shape represents.
  int ClassAndConfigIDToFontOrShapeID(int class_id,
                                      int int_result_config) const;
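  //
  // Sketch of the two cases (illustrative; the GetShape and get() calls are
  // assumptions about the surrounding APIs, not guaranteed by this header):
  //
  //   int id = ClassAndConfigIDToFontOrShapeID(class_id, int_result_config);
  //   if (shape_table_ != NULL) {
  //     // id indexes shape_table_, e.g. shape_table_->GetShape(id).
  //   } else {
  //     // id is a font id, e.g. get_fontinfo_table().get(id).
  //   }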
  // Converts a shape_table_ index to a classifier class_id index (not a
  // unichar-id!). Uses a search, so not fast.
  int ShapeIDToClassID(int shape_id) const;
  UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
                                 const DENORM& denorm,
                                 ADAPT_TEMPLATES Templates,
                                 ADAPT_RESULTS *Results);
  int CharNormClassifier(TBLOB *Blob,
                         const DENORM& denorm,
                         INT_TEMPLATES Templates,
                         ADAPT_RESULTS *Results);

  // As CharNormClassifier, but operates on a TrainingSample and outputs to
  // a GenericVector of ShapeRating without conversion to classes.
  int CharNormTrainingSample(bool pruner_only, const TrainingSample& sample,
                             GenericVector<ShapeRating>* results);
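  //
  // Sketch (illustrative; obtaining `sample` is outside this header):
  //
  //   GenericVector<ShapeRating> ratings;
  //   CharNormTrainingSample(false, sample, &ratings);
  //   // ratings holds ShapeRating entries rather than BLOB_CHOICEs; passing
  //   // pruner_only == true presumably stops after the class pruner stage.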
  UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
                             const DENORM& denorm,
                             CLASS_ID CorrectClass);
  void DoAdaptiveMatch(TBLOB *Blob,
                       const DENORM& denorm,
                       ADAPT_RESULTS *Results);
  void AdaptToChar(TBLOB *Blob,
                   const DENORM& denorm,
                   CLASS_ID ClassId,
                   int FontinfoId,
                   FLOAT32 Threshold);
  void DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm,
                          INT_CLASS_STRUCT* int_class);
  int AdaptableWord(TWERD *Word,
                    const WERD_CHOICE &BestChoiceWord,
                    const WERD_CHOICE &RawChoiceWord);
  void EndAdaptiveClassifier();
  void PrintAdaptiveStatistics(FILE *File);
  void SettupPass1();
  void SettupPass2();
  void AdaptiveClassifier(TBLOB *Blob,
                          const DENORM& denorm,
                          BLOB_CHOICE_LIST *Choices,
                          CLASS_PRUNER_RESULTS cp_results);
  void ClassifyAsNoise(ADAPT_RESULTS *Results);

  int GetBaselineFeatures(TBLOB *Blob,
                          const DENORM& denorm,
                          INT_TEMPLATES Templates,
                          INT_FEATURE_ARRAY IntFeatures,
                          uinT8* CharNormArray,
                          inT32 *BlobLength);
  int GetCharNormFeatures(TBLOB *Blob,
                          const DENORM& denorm,
                          INT_TEMPLATES Templates,
                          INT_FEATURE_ARRAY IntFeatures,
                          uinT8* PrunerNormArray,
                          uinT8* CharNormArray,
                          inT32 *BlobLength,
                          inT32 *FeatureOutlineIndex);
  // Computes the char_norm_array for the unicharset and, if not NULL, the
  // pruner_array as appropriate according to the existence of the shape_table.
  // The norm_feature is deleted as it is almost certainly no longer needed.
  void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
                             INT_TEMPLATES_STRUCT* templates,
                             uinT8* char_norm_array,
                             uinT8* pruner_array);
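  //
  // Calling sketch (illustrative; the array sizes are inferred from the
  // comments above and from PruneClasses, not stated here):
  //
  //   uinT8* char_norm_array = new uinT8[unicharset.size()];
  //   uinT8* pruner_array = new uinT8[templates->NumClasses];
  //   ComputeCharNormArrays(norm_feature, templates,
  //                         char_norm_array, pruner_array);
  //   // norm_feature is deleted by the call; pruner_array may be NULL if
  //   // pruner normalization factors are not needed.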

  bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
  void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, TBLOB *Blob);

  bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
  bool LooksLikeGarbage(const DENORM& denorm, TBLOB *blob);
  void RefreshDebugWindow(ScrollView **win, const char *msg,
                          int y_offset, const TBOX &wbox);
  /* float2int.cpp ************************************************************/
  void ClearCharNormArray(uinT8* char_norm_array);
  void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
                               uinT8* char_norm_array);
  void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
  /* intproto.cpp *************************************************************/
  INT_TEMPLATES ReadIntTemplates(FILE *File);
  void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
                         const UNICHARSET& target_unicharset);
  CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
                           bool* pretrained_on, int* shape_id);
  void ShowMatchDisplay();
  /* font detection ***********************************************************/
  UnicityTable<FontInfo>& get_fontinfo_table() {
    return fontinfo_table_;
  }
  UnicityTable<FontSet>& get_fontset_table() {
    return fontset_table_;
  }
  /* mfoutline.cpp ***********************************************************/
  void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale);
  /* outfeat.cpp ***********************************************************/
  /* picofeat.cpp ***********************************************************/


  // Member variables.

  // Parameters.
  BOOL_VAR_H(prioritize_division, FALSE,
             "Prioritize blob division over chopping");
  INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
  BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
  INT_VAR_H(classify_debug_level, 0, "Classify debug level");

  /* mfoutline.cpp ***********************************************************/
  /* control knobs used to control normalization of outlines */
  INT_VAR_H(classify_norm_method, character, "Normalization Method ...");
  double_VAR_H(classify_char_norm_range, 0.2,
               "Character Normalization Range ...");
  double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...");
  double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
  double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
  double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");

  /* adaptmatch.cpp ***********************************************************/
  BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
  BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
  BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
  BOOL_VAR_H(classify_use_pre_adapted_templates, 0,
             "Use pre-adapted classifier templates");
  BOOL_VAR_H(classify_save_adapted_templates, 0,
             "Save adapted templates to a file");
  BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
  INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
  INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
  INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
  double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
  double_VAR_H(matcher_great_threshold, 0.0, "Great Match (0-1)");
  double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
  double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
  double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
  double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
  INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
  INT_VAR_H(matcher_min_examples_for_prototyping, 3,
            "Reliable Config Threshold");
  INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5,
            "Enable adaption even if the ambiguities have not been seen");
  double_VAR_H(matcher_clustering_max_angle_delta, 0.015,
               "Maximum angle delta for prototype clustering");
  double_VAR_H(classify_misfit_junk_penalty, 0.0,
               "Penalty to apply when a non-alnum is vertically out of "
               "its expected textline position");
  double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
  double_VAR_H(tessedit_class_miss_scale, 0.00390625,
               "Scale factor for features not used");
  INT_VAR_H(classify_adapt_proto_threshold, 230,
            "Threshold for good protos during adaptive 0-255");
  INT_VAR_H(classify_adapt_feature_threshold, 230,
            "Threshold for good features during adaptive 0-255");
  BOOL_VAR_H(disable_character_fragments, TRUE,
             "Do not include character fragments in the"
             " results of the classifier");
  double_VAR_H(classify_character_fragments_garbage_certainty_threshold, -3.0,
               "Exclude fragments that do not match any whole character"
               " with at least this certainty");
  BOOL_VAR_H(classify_debug_character_fragments, FALSE,
             "Bring up graphical debugging windows for fragments training");
  BOOL_VAR_H(matcher_debug_separate_windows, FALSE,
             "Use two different windows for debugging the matching: "
             "One for the protos and one for the features.");
  STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");

  /* intmatcher.cpp **********************************************************/
  INT_VAR_H(classify_class_pruner_threshold, 229,
            "Class Pruner Threshold 0-255");
  INT_VAR_H(classify_class_pruner_multiplier, 15,
            "Class Pruner Multiplier 0-255: ");
  INT_VAR_H(classify_cp_cutoff_strength, 7,
            "Class Pruner CutoffStrength: ");
  INT_VAR_H(classify_integer_matcher_multiplier, 10,
            "Integer Matcher Multiplier 0-255: ");

  // Use class variables to hold onto built-in templates and adapted templates.
  INT_TEMPLATES PreTrainedTemplates;
  ADAPT_TEMPLATES AdaptedTemplates;

  // Create dummy proto and config masks for use with the built-in templates.
  BIT_VECTOR AllProtosOn;
  BIT_VECTOR PrunedProtos;
  BIT_VECTOR AllConfigsOn;
  BIT_VECTOR AllProtosOff;
  BIT_VECTOR AllConfigsOff;
  BIT_VECTOR TempProtoMask;
  bool EnableLearning;
  /* normmatch.cpp */
  NORM_PROTOS *NormProtos;
  /* font detection ***********************************************************/
  UnicityTable<FontInfo> fontinfo_table_;
  // Without shape training, each class_id, config pair represents a single
  // unichar id/font combination, so each fontset_table_ entry holds font ids
  // for each config in the class.
  // With shape training, each class_id, config pair represents a single
  // shape_table_ index, so the fontset_table_ stores the shape_table_ index,
  // and the shape_table_ must be consulted to obtain the actual unichar_id/
  // font combinations that the shape represents.
  UnicityTable<FontSet> fontset_table_;

  INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
  BOOL_VAR_H(classify_bln_numeric_mode, 0,
             "Assume the input is numbers [0-9].");

 protected:
  IntegerMatcher im_;
  FEATURE_DEFS_STRUCT feature_defs_;
  // If a shape_table_ is present, it is used to remap classifier output in
  // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
  // mean an index to the shape_table_ and the choices returned are *all* the
  // shape_table_ entries at that index.
  ShapeTable* shape_table_;

 private:

  Dict dict_;

  /* variables used to hold performance statistics */
  int AdaptiveMatcherCalls;
  int BaselineClassifierCalls;
  int CharNormClassifierCalls;
  int AmbigClassifierCalls;
  int NumWordsAdaptedTo;
  int NumCharsAdaptedTo;
  int NumBaselineClassesTried;
  int NumCharNormClassesTried;
  int NumAmbigClassesTried;
  int NumClassesOutput;
  int NumAdaptationsFailed;

  /* variables used to hold onto extracted features. This is used
     to map from the old scheme in which baseline features and char norm
     features are extracted separately, to the new scheme in which they
     are extracted at the same time. */
  bool FeaturesHaveBeenExtracted;
  bool FeaturesOK;
  INT_FEATURE_ARRAY BaselineFeatures;
  INT_FEATURE_ARRAY CharNormFeatures;
  INT_FX_RESULT_STRUCT FXInfo;

  // Expected number of features in the class pruner, used to penalize
  // unknowns that have too few features (like a c being classified as e) so
  // it doesn't recognize everything as '@' or '#'.
  // CharNormCutoffs is for the static classifier (with no shapetable).
  // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
  // value in the adaptive classifier. Both are indexed by unichar_id.
  // shapetable_cutoffs_ provides a similar value for each shape in the
  // shape_table_.
  uinT16* CharNormCutoffs;
  uinT16* BaselineCutoffs;
  GenericVector<uinT16> shapetable_cutoffs_;
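  // Worked illustration with assumed numbers: if CharNormCutoffs for 'e'
  // expects roughly 50 features and a blob yields only 20, the shortfall is
  // presumably fed to PruneClasses via expected_num_features and penalizes
  // 'e', so a sparse 'c' cannot cheaply match as an 'e'.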
  ScrollView* learn_debug_win_;
  ScrollView* learn_fragmented_word_debug_win_;
  ScrollView* learn_fragments_debug_win_;
};
}  // namespace tesseract

#endif  // TESSERACT_CLASSIFY_CLASSIFY_H__