Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
mastertrainer.h
Go to the documentation of this file.
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
// File: mastertrainer.h
// Description: Trainer to build the MasterClassifier.
// Author: Ray Smith
// Created: Wed Nov 03 18:07:01 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
21
22
#ifndef TESSERACT_TRAINING_MASTERTRAINER_H__
23
#define TESSERACT_TRAINING_MASTERTRAINER_H__
24
28
#include "classify.h"
#include "cluster.h"
#include "intfx.h"
#include "elst.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "indexmapbidi.h"
#include "intfeaturespace.h"
#include "intfeaturemap.h"
#include "intmatcher.h"
#include "params.h"
#include "shapetable.h"
#include "trainingsample.h"
#include "trainingsampleset.h"
#include "unicharset.h"
43
44
namespace
tesseract
{
45
46
// Forward declaration: ShapeClassifier is only used through pointers in the
// declarations below, so its full definition is not needed in this header.
class
ShapeClassifier;
47
48
// Simple struct to hold the distance between two shapes during clustering.
// shape1 and shape2 identify the two shapes being compared (presumably
// indices into a ShapeTable -- TODO confirm against the clustering code) and
// distance is the distance measured between them.
struct ShapeDist {
  // Default constructor: both shape ids are 0 and the distance is 0.
  ShapeDist() : shape1(0), shape2(0), distance(0.0f) {}
  // Constructs a record for the pair (s1, s2) separated by dist.
  ShapeDist(int s1, int s2, float dist)
    : shape1(s1), shape2(s2), distance(dist) {}

  // Sort operator to sort in ascending order of distance.
  // Only the distance takes part in the comparison; the shape ids are
  // ignored, so two records with equal distance compare equivalent.
  bool operator<(const ShapeDist& other) const {
    return distance < other.distance;
  }

  int shape1;      // First shape of the pair.
  int shape2;      // Second shape of the pair.
  float distance;  // Distance between shape1 and shape2.
};
63
64
// Class to encapsulate training processes that use the TrainingSampleSet.
65
// Initially supports shape clustering and mftrainining.
66
// Other important features of the MasterTrainer are conditioning the data
67
// by outlier elimination, replication with perturbation, and serialization.
68
class
MasterTrainer
{
69
public
:
70
MasterTrainer
(
NormalizationMode
norm_mode,
bool
shape_analysis,
71
bool
replicate_samples,
int
debug_level);
72
~MasterTrainer
();
73
74
// Writes to the given file. Returns false in case of error.
75
bool
Serialize
(FILE* fp)
const
;
76
// Reads from the given file. Returns false in case of error.
77
// If swap is true, assumes a big/little-endian swap is needed.
78
bool
DeSerialize
(
bool
swap, FILE* fp);
79
80
// Loads an initial unicharset, or sets one up if the file cannot be read.
81
void
LoadUnicharset
(
const
char
*
filename
);
82
83
// Sets the feature space definition.
84
void
SetFeatureSpace
(
const
IntFeatureSpace
& fs) {
85
feature_space_ = fs;
86
feature_map_.
Init
(fs);
87
}
88
89
// Reads the samples and their features from the given file,
90
// adding them to the trainer with the font_id from the content of the file.
91
// If verification, then these are verification samples, not training.
92
void
ReadTrainingSamples
(FILE *fp,
93
const
FEATURE_DEFS_STRUCT
&
feature_defs
,
94
bool
verification);
95
96
// Adds the given single sample to the trainer, setting the classid
97
// appropriately from the given unichar_str.
98
void
AddSample
(
bool
verification,
const
char
* unichar_str,
99
TrainingSample
*
sample
);
100
101
// Loads all pages from the given tif filename and append to page_images_.
102
// Must be called after ReadTrainingSamples, as the current number of images
103
// is used as an offset for page numbers in the samples.
104
void
LoadPageImages
(
const
char
* filename);
105
106
// Cleans up the samples after initial load from the tr files, and prior to
107
// saving the MasterTrainer:
108
// Remaps fragmented chars if running shape anaylsis.
109
// Sets up the samples appropriately for class/fontwise access.
110
// Deletes outlier samples.
111
void
PostLoadCleanup
();
112
113
// Gets the samples ready for training. Use after both
114
// ReadTrainingSamples+PostLoadCleanup or DeSerialize.
115
// Re-indexes the features and computes canonical and cloud features.
116
void
PreTrainingSetup
();
117
118
// Sets up the master_shapes_ table, which tells which fonts should stay
119
// together until they get to a leaf node classifier.
120
void
SetupMasterShapes
();
121
122
// Adds the junk_samples_ to the main samples_ set. Junk samples are initially
123
// fragments and n-grams (all incorrectly segmented characters).
124
// Various training functions may result in incorrectly segmented characters
125
// being added to the unicharset of the main samples, perhaps because they
126
// form a "radical" decomposition of some (Indic) grapheme, or because they
127
// just look the same as a real character (like rn/m)
128
// This function moves all the junk samples, to the main samples_ set, but
129
// desirable junk, being any sample for which the unichar already exists in
130
// the samples_ unicharset gets the unichar-ids re-indexed to match, but
131
// anything else gets re-marked as unichar_id 0 (space character) to identify
132
// it as junk to the error counter.
133
void
IncludeJunk
();
134
135
// Replicates the samples and perturbs them if the enable_replication_ flag
136
// is set. MUST be used after the last call to OrganizeByFontAndClass on
137
// the training samples, ie after IncludeJunk if it is going to be used, as
138
// OrganizeByFontAndClass will eat the replicated samples into the regular
139
// samples.
140
void
ReplicateAndRandomizeSamplesIfRequired
();
141
142
// Loads the basic font properties file into fontinfo_table_.
143
// Returns false on failure.
144
bool
LoadFontInfo
(
const
char
* filename);
145
146
// Loads the xheight font properties file into xheights_.
147
// Returns false on failure.
148
bool
LoadXHeights
(
const
char
* filename);
149
150
// Reads spacing stats from filename and adds them to fontinfo_table.
151
// Returns false on failure.
152
bool
AddSpacingInfo
(
const
char
*filename);
153
154
// Returns the font id corresponding to the given font name.
155
// Returns -1 if the font cannot be found.
156
int
GetFontInfoId
(
const
char
* font_name);
157
// Returns the font_id of the closest matching font name to the given
158
// filename. It is assumed that a substring of the filename will match
159
// one of the fonts. If more than one is matched, the longest is returned.
160
int
GetBestMatchingFontInfoId
(
const
char
* filename);
161
162
// Sets up a flat shapetable with one shape per class/font combination.
163
void
SetupFlatShapeTable
(
ShapeTable
* shape_table);
164
165
// Sets up a Clusterer for mftraining on a single shape_id.
166
// Call FreeClusterer on the return value after use.
167
CLUSTERER
*
SetupForClustering
(
const
ShapeTable
& shape_table,
168
const
FEATURE_DEFS_STRUCT
&
feature_defs
,
169
int
shape_id,
int
* num_samples);
170
171
// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
172
// to the given inttemp_file, and the corresponding pffmtable.
173
// The unicharset is the original encoding of graphemes, and shape_set should
174
// match the size of the shape_table, and may possibly be totally fake.
175
void
WriteInttempAndPFFMTable
(
const
UNICHARSET
&
unicharset
,
176
const
UNICHARSET
& shape_set,
177
const
ShapeTable
& shape_table,
178
CLASS_STRUCT
* float_classes,
179
const
char
* inttemp_file,
180
const
char
* pffmtable_file);
181
182
const
UNICHARSET
&
unicharset
()
const
{
183
return
samples_.
unicharset
();
184
}
185
TrainingSampleSet
*
GetSamples
() {
186
return
&samples_;
187
}
188
const
ShapeTable
&
master_shapes
()
const
{
189
return
master_shapes_;
190
}
191
192
// Generates debug output relating to the canonical distance between the
193
// two given UTF8 grapheme strings.
194
void
DebugCanonical
(
const
char
* unichar_str1,
const
char
* unichar_str2);
195
#ifndef GRAPHICS_DISABLED
196
// Debugging for cloud/canonical features.
197
// Displays a Features window containing:
198
// If unichar_str2 is in the unicharset, and canonical_font is non-negative,
199
// displays the canonical features of the char/font combination in red.
200
// If unichar_str1 is in the unicharset, and cloud_font is non-negative,
201
// displays the cloud feature of the char/font combination in green.
202
// The canonical features are drawn first to show which ones have no
203
// matches in the cloud features.
204
// Until the features window is destroyed, each click in the features window
205
// will display the samples that have that feature in a separate window.
206
void
DisplaySamples
(
const
char
* unichar_str1,
int
cloud_font,
207
const
char
* unichar_str2,
int
canonical_font);
208
#endif // GRAPHICS_DISABLED
209
210
// Tests the given test_classifier on the internal samples.
211
// See TestClassifier for details.
212
void
TestClassifierOnSamples
(
int
report_level,
213
bool
replicate_samples,
214
ShapeClassifier
* test_classifier,
215
STRING
* report_string);
216
// Tests the given test_classifier on the given samples
217
// report_levels:
218
// 0 = no output.
219
// 1 = bottom-line error rate.
220
// 2 = bottom-line error rate + time.
221
// 3 = font-level error rate + time.
222
// 4 = list of all errors + short classifier debug output on 16 errors.
223
// 5 = list of all errors + short classifier debug output on 25 errors.
224
// If replicate_samples is true, then the test is run on an extended test
225
// sample including replicated and systematically perturbed samples.
226
// If report_string is non-NULL, a summary of the results for each font
227
// is appended to the report_string.
228
double
TestClassifier
(
int
report_level,
229
bool
replicate_samples,
230
TrainingSampleSet
* samples,
231
ShapeClassifier
* test_classifier,
232
STRING
* report_string);
233
234
// Returns the average (in some sense) distance between the two given
235
// shapes, which may contain multiple fonts and/or unichars.
236
// This function is public to facilitate testing.
237
float
ShapeDistance
(
const
ShapeTable
& shapes,
int
s1,
int
s2);
238
239
private
:
240
// Replaces samples that are always fragmented with the corresponding
241
// fragment samples.
242
void
ReplaceFragmentedSamples();
243
244
// Runs a hierarchical agglomerative clustering to merge shapes in the given
245
// shape_table, while satisfying the given constraints:
246
// * End with at least min_shapes left in shape_table,
247
// * No shape shall have more than max_shape_unichars in it,
248
// * Don't merge shapes where the distance between them exceeds max_dist.
249
void
ClusterShapes(
int
min_shapes,
int
max_shape_unichars,
250
float
max_dist,
ShapeTable
* shape_table);
251
252
private
:
253
NormalizationMode
norm_mode_;
254
// Character set we are training for.
255
UNICHARSET
unicharset_;
256
// Original feature space. Subspace mapping is contained in feature_map_.
257
IntFeatureSpace
feature_space_;
258
TrainingSampleSet
samples_;
259
TrainingSampleSet
junk_samples_;
260
TrainingSampleSet
verify_samples_;
261
// Master shape table defines what fonts stay together until the leaves.
262
ShapeTable
master_shapes_;
263
// Flat shape table has each unichar/font id pair in a separate shape.
264
ShapeTable
flat_shapes_;
265
// Font metrics gathered from multiple files.
266
UnicityTable<FontInfo>
fontinfo_table_;
267
// Array of xheights indexed by font ids in fontinfo_table_;
268
GenericVector<int>
xheights_;
269
270
// Non-serialized data initialized by other means or used temporarily
271
// during loading of training samples.
272
// Number of different class labels in unicharset_.
273
int
charsetsize_;
274
// Flag to indicate that we are running shape analysis and need fragments
275
// fixing.
276
bool
enable_shape_anaylsis_;
277
// Flag to indicate that sample replication is required.
278
bool
enable_replication_;
279
// Flag to indicate that junk should be included in samples_.
280
bool
include_junk_;
281
// Array of classids of fragments that replace the correctly segmented chars.
282
int
* fragments_;
283
// Classid of previous correctly segmented sample that was added.
284
int
prev_unichar_id_;
285
// Debug output control.
286
int
debug_level_;
287
// Feature map used to construct reduced feature spaces for compact
288
// classifiers.
289
IntFeatureMap
feature_map_;
290
// Vector of Pix pointers used for classifiers that need the image.
291
// Indexed by page_num_ in the samples.
292
// These images are owned by the trainer and need to be pixDestroyed.
293
GenericVector<Pix*>
page_images_;
294
};
295
296
}
// namespace tesseract.
297
298
#endif
mnt
data
src
tesseract-ocr
classify
mastertrainer.h
Generated on Thu Nov 1 2012 20:19:47 for Tesseract by
1.8.1