Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
scanutils.cpp
Go to the documentation of this file.
1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // The fscanf, vfscanf and creat functions are implemented so that their
6 // functionality is mostly like their stdio counterparts. However, currently
7 // these functions do not use any buffering, making them rather slow.
8 // File streams are thus processed one character at a time.
9 // Although the implementations of the scanf functions do lack a few minor
10 // features, they should be sufficient for their use in tesseract.
11 //
12 // Licensed under the Apache License, Version 2.0 (the "License");
13 // you may not use this file except in compliance with the License.
14 // You may obtain a copy of the License at
15 // http://www.apache.org/licenses/LICENSE-2.0
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 // See the License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #ifdef EMBEDDED
23 
24 #include <ctype.h>
25 #include <stdarg.h>
26 #include <stddef.h>
27 #include <inttypes.h>
28 #include <string.h>
29 #include <limits.h>
30 #include <stdio.h>
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include <fcntl.h>
34 
35 #include "scanutils.h"
36 #include "tprintf.h"
37 
// Bit flags collected while parsing a single conversion specification.
enum Flags {
  FL_SPLAT = 1 << 0,  // '*' seen: drop the value, do not assign
  FL_INV   = 1 << 1,  // Character set is inverted ('^')
  FL_WIDTH = 1 << 2,  // An explicit field width was given
  FL_MINUS = 1 << 3,  // Negative number
};
44 
// Integer "rank" selected by the length modifiers of a conversion.
// The values from RANK_CHAR to RANK_LONGLONG are consecutive because the
// 'h' and 'l' modifiers step the rank down or up by one.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT,          // -1
  RANK_INT,            //  0, the default rank
  RANK_LONG,           //  1
  RANK_LONGLONG,       //  2
  RANK_PTR = INT_MAX   // Special value used for pointers
};
53 
54 const enum Ranks kMinRank = RANK_CHAR;
55 const enum Ranks kMaxRank = RANK_LONGLONG;
56 
57 const enum Ranks kIntMaxRank = RANK_LONGLONG;
58 const enum Ranks kSizeTRank = RANK_LONG;
59 const enum Ranks kPtrDiffRank = RANK_LONG;
60 
// Reason why format processing stopped early.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF = 1,   // Hit EOF
  BAIL_ERR = 2    // Conversion mismatch
};
66 
67 // Helper functions ------------------------------------------------------------
// Returns the number of bits in a long.
inline size_t LongBit() {
  const size_t bits_per_byte = CHAR_BIT;
  return sizeof(long) * bits_per_byte;
}
71 
// Consumes whitespace from the stream and returns the first character that
// is not whitespace (or EOF). That character is pushed back, so it is still
// the next one to be read.
static inline int
SkipSpace(FILE *s)
{
  int next = fgetc(s);
  while (isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s);  // Make sure next char is available for reading
  return next;
}
80 
81 static inline void
82 SetBit(unsigned long *bitmap, unsigned int bit)
83 {
84  bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
85 }
86 
87 static inline int
88 TestBit(unsigned long *bitmap, unsigned int bit)
89 {
90  return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
91 }
92 
// Maps a character to its numeric digit value: '0'-'9' give 0-9 and letters
// of either case give 10-35. Returns -1 for any other character.
static inline int DigitValue(int ch)
{
  if ('0' <= ch && ch <= '9')
    return ch - '0';
  if ('A' <= ch && ch <= 'Z')
    return 10 + (ch - 'A');
  if ('a' <= ch && ch <= 'z')
    return 10 + (ch - 'a');
  return -1;
}
105 
106 // IO (re-)implementations -----------------------------------------------------
107 uintmax_t streamtoumax(FILE* s, int base)
108 {
109  int minus = 0;
110  uintmax_t v = 0;
111  int d, c = 0;
112 
113  for (c = fgetc(s);
114  isspace(static_cast<unsigned char>(c)) && (c != EOF);
115  c = fgetc(s))
116 
117  // Single optional + or -
118  if (c == '-' || c == '+') {
119  minus = (c == '-');
120  c = fgetc(s);
121  }
122 
123  // Assign correct base
124  if (base == 0) {
125  if (c == '0') {
126  c = fgetc(s);
127  if (c == 'x' || c == 'X') {
128  base = 16;
129  c = fgetc(s);
130  } else {
131  base = 8;
132  }
133  }
134  } else if (base == 16) {
135  if (c == '0') {
136  c = fgetc(s);
137  if (c == 'x' && c == 'X') c = fgetc(s);
138  }
139  }
140 
141  // Actual number parsing
142  for (; (c != EOF) && (d = DigitValue(c)) >= 0 && d < base; c = fgetc(s))
143  v = v*base + d;
144 
145  ungetc(c, s);
146  return minus ? -v : v;
147 }
148 
149 double streamtofloat(FILE* s)
150 {
151  int minus = 0;
152  int v = 0;
153  int d, c = 0;
154  int k = 1;
155  int w = 0;
156 
157  for (c = fgetc(s);
158  isspace(static_cast<unsigned char>(c)) && (c != EOF);
159  c = fgetc(s));
160 
161  // Single optional + or -
162  if (c == '-' || c == '+') {
163  minus = (c == '-');
164  c = fgetc(s);
165  }
166 
167  // Actual number parsing
168  for (; (c != EOF) && (d = DigitValue(c)) >= 0; c = fgetc(s))
169  v = v*10 + d;
170  if (c == '.') {
171  for (c = fgetc(s); (c != EOF) && (d = DigitValue(c)) >= 0; c = fgetc(s)) {
172  w = w*10 + d;
173  k *= 10;
174  }
175  } else if (c == 'e' || c == 'E')
176  tprintf("WARNING: Scientific Notation not supported!");
177 
178  ungetc(c, s);
179  double f = static_cast<double>(v)
180  + static_cast<double>(w) / static_cast<double>(k);
181 
182  return minus ? -f : f;
183 }
184 
185 double strtofloat(const char* s)
186 {
187  int minus = 0;
188  int v = 0;
189  int d;
190  int k = 1;
191  int w = 0;
192 
193  while(*s && isspace(static_cast<unsigned char>(*s))) s++;
194 
195  // Single optional + or -
196  if (*s == '-' || *s == '+') {
197  minus = (*s == '-');
198  s++;
199  }
200 
201  // Actual number parsing
202  for (; *s && (d = DigitValue(*s)) >= 0; s++)
203  v = v*10 + d;
204  if (*s == '.') {
205  for (++s; *s && (d = DigitValue(*s)) >= 0; s++) {
206  w = w*10 + d;
207  k *= 10;
208  }
209  } else if (*s == 'e' || *s == 'E')
210  tprintf("WARNING: Scientific Notation not supported!");
211 
212  double f = static_cast<double>(v)
213  + static_cast<double>(w) / static_cast<double>(k);
214 
215  return minus ? -f : f;
216 }
217 
// Variadic front end: packs the variable arguments into a va_list, hands
// them to vfscanf and returns whatever vfscanf returns.
int fscanf(FILE* stream, const char *format, ...)
{
  va_list args;
  va_start(args, format);
  const int result = vfscanf(stream, format, args);
  va_end(args);
  return result;
}
229 
230 int vfscanf(FILE* stream, const char *format, va_list ap)
231 {
232  const char *p = format;
233  char ch;
234  int q = 0;
235  uintmax_t val = 0;
236  int rank = RANK_INT; // Default rank
237  unsigned int width = UINT_MAX;
238  int base;
239  int flags = 0;
240  enum {
241  ST_NORMAL, // Ground state
242  ST_FLAGS, // Special flags
243  ST_WIDTH, // Field width
244  ST_MODIFIERS, // Length or conversion modifiers
245  ST_MATCH_INIT, // Initial state of %[ sequence
246  ST_MATCH, // Main state of %[ sequence
247  ST_MATCH_RANGE, // After - in a %[ sequence
248  } state = ST_NORMAL;
249  char *sarg = NULL; // %s %c or %[ string argument
250  enum Bail bail = BAIL_NONE;
251  int sign;
252  int converted = 0; // Successful conversions
253  unsigned long matchmap[((1 << CHAR_BIT)+(LongBit()-1))/LongBit()];
254  int matchinv = 0; // Is match map inverted?
255  unsigned char range_start = 0;
256  off_t start_off = ftell(stream);
257 
258  // Skip leading spaces
259  SkipSpace(stream);
260 
261  while ((ch = *p++) && !bail) {
262  switch (state) {
263  case ST_NORMAL:
264  if (ch == '%') {
265  state = ST_FLAGS;
266  flags = 0; rank = RANK_INT; width = UINT_MAX;
267  } else if (isspace(static_cast<unsigned char>(ch))) {
268  SkipSpace(stream);
269  } else {
270  if (fgetc(stream) != ch)
271  bail = BAIL_ERR; // Match failure
272  }
273  break;
274 
275  case ST_FLAGS:
276  switch (ch) {
277  case '*':
278  flags |= FL_SPLAT;
279  break;
280 
281  case '0' ... '9':
282  width = (ch-'0');
283  state = ST_WIDTH;
284  flags |= FL_WIDTH;
285  break;
286 
287  default:
288  state = ST_MODIFIERS;
289  p--; // Process this character again
290  break;
291  }
292  break;
293 
294  case ST_WIDTH:
295  if (ch >= '0' && ch <= '9') {
296  width = width*10+(ch-'0');
297  } else {
298  state = ST_MODIFIERS;
299  p--; // Process this character again
300  }
301  break;
302 
303  case ST_MODIFIERS:
304  switch (ch) {
305  // Length modifiers - nonterminal sequences
306  case 'h':
307  rank--; // Shorter rank
308  break;
309  case 'l':
310  rank++; // Longer rank
311  break;
312  case 'j':
313  rank = kIntMaxRank;
314  break;
315  case 'z':
316  rank = kSizeTRank;
317  break;
318  case 't':
319  rank = kPtrDiffRank;
320  break;
321  case 'L':
322  case 'q':
323  rank = RANK_LONGLONG; // long double/long long
324  break;
325 
326  default:
327  // Output modifiers - terminal sequences
328  state = ST_NORMAL; // Next state will be normal
329  if (rank < kMinRank) // Canonicalize rank
330  rank = kMinRank;
331  else if (rank > kMaxRank)
332  rank = kMaxRank;
333 
334  switch (ch) {
335  case 'P': // Upper case pointer
336  case 'p': // Pointer
337  rank = RANK_PTR;
338  base = 0; sign = 0;
339  goto scan_int;
340 
341  case 'i': // Base-independent integer
342  base = 0; sign = 1;
343  goto scan_int;
344 
345  case 'd': // Decimal integer
346  base = 10; sign = 1;
347  goto scan_int;
348 
349  case 'o': // Octal integer
350  base = 8; sign = 0;
351  goto scan_int;
352 
353  case 'u': // Unsigned decimal integer
354  base = 10; sign = 0;
355  goto scan_int;
356 
357  case 'x': // Hexadecimal integer
358  case 'X':
359  base = 16; sign = 0;
360  goto scan_int;
361 
362  case 'n': // Number of characters consumed
363  val = ftell(stream) - start_off;
364  goto set_integer;
365 
366  scan_int:
367  q = SkipSpace(stream);
368  if ( q <= 0 ) {
369  bail = BAIL_EOF;
370  break;
371  }
372  val = streamtoumax(stream, base);
373  converted++;
374  // fall through
375 
376  set_integer:
377  if (!(flags & FL_SPLAT)) {
378  switch(rank) {
379  case RANK_CHAR:
380  *va_arg(ap, unsigned char *)
381  = static_cast<unsigned char>(val);
382  break;
383  case RANK_SHORT:
384  *va_arg(ap, unsigned short *)
385  = static_cast<unsigned short>(val);
386  break;
387  case RANK_INT:
388  *va_arg(ap, unsigned int *)
389  = static_cast<unsigned int>(val);
390  break;
391  case RANK_LONG:
392  *va_arg(ap, unsigned long *)
393  = static_cast<unsigned long>(val);
394  break;
395  case RANK_LONGLONG:
396  *va_arg(ap, unsigned long long *)
397  = static_cast<unsigned long long>(val);
398  break;
399  case RANK_PTR:
400  *va_arg(ap, void **)
401  = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
402  break;
403  }
404  }
405  break;
406 
407  case 'f': // Preliminary float value parsing
408  case 'g':
409  case 'G':
410  case 'e':
411  case 'E':
412  q = SkipSpace(stream);
413  if (q <= 0) {
414  bail = BAIL_EOF;
415  break;
416  }
417 
418  {
419  double fval = streamtofloat(stream);
420  switch(rank) {
421  case RANK_INT:
422  *va_arg(ap, float *) = static_cast<float>(fval);
423  break;
424  case RANK_LONG:
425  *va_arg(ap, double *) = static_cast<double>(fval);
426  break;
427  }
428  converted++;
429  }
430  break;
431 
432  case 'c': // Character
433  width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
434  sarg = va_arg(ap, char *);
435  while (width--) {
436  if ((q = fgetc(stream)) <= 0) {
437  bail = BAIL_EOF;
438  break;
439  }
440  *sarg++ = q;
441  }
442  if (!bail)
443  converted++;
444  break;
445 
446  case 's': // String
447  {
448  char *sp;
449  sp = sarg = va_arg(ap, char *);
450  while (width--) {
451  q = fgetc(stream);
452  if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
453  ungetc(q, stream);
454  break;
455  }
456  *sp++ = q;
457  }
458  if (sarg != sp) {
459  *sp = '\0'; // Terminate output
460  converted++;
461  } else {
462  bail = BAIL_EOF;
463  }
464  }
465  break;
466 
467  case '[': // Character range
468  sarg = va_arg(ap, char *);
469  state = ST_MATCH_INIT;
470  matchinv = 0;
471  memset(matchmap, 0, sizeof matchmap);
472  break;
473 
474  case '%': // %% sequence
475  if (fgetc(stream) != '%' )
476  bail = BAIL_ERR;
477  break;
478 
479  default: // Anything else
480  bail = BAIL_ERR; // Unknown sequence
481  break;
482  }
483  }
484  break;
485 
486  case ST_MATCH_INIT: // Initial state for %[ match
487  if (ch == '^' && !(flags & FL_INV)) {
488  matchinv = 1;
489  } else {
490  SetBit(matchmap, static_cast<unsigned char>(ch));
491  state = ST_MATCH;
492  }
493  break;
494 
495  case ST_MATCH: // Main state for %[ match
496  if (ch == ']') {
497  goto match_run;
498  } else if (ch == '-') {
499  range_start = static_cast<unsigned char>(ch);
500  state = ST_MATCH_RANGE;
501  } else {
502  SetBit(matchmap, static_cast<unsigned char>(ch));
503  }
504  break;
505 
506  case ST_MATCH_RANGE: // %[ match after -
507  if (ch == ']') {
508  SetBit(matchmap, static_cast<unsigned char>('-'));
509  goto match_run;
510  } else {
511  int i;
512  for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
513  SetBit(matchmap, i);
514  state = ST_MATCH;
515  }
516  break;
517 
518  match_run: // Match expression finished
519  char* oarg = sarg;
520  while (width) {
521  q = fgetc(stream);
522  unsigned char qc = static_cast<unsigned char>(q);
523  if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
524  ungetc(q, stream);
525  break;
526  }
527  *sarg++ = q;
528  }
529  if (oarg != sarg) {
530  *sarg = '\0';
531  converted++;
532  } else {
533  bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
534  }
535  break;
536  }
537  }
538 
539  if (bail == BAIL_EOF && !converted)
540  converted = -1; // Return EOF (-1)
541 
542  return converted;
543 }
544 
// Implemented as open() for writing only: the file is created if it does
// not exist and truncated if it does. Returns the result of open().
int creat(const char *pathname, mode_t mode)
{
  const int open_flags = O_WRONLY | O_CREAT | O_TRUNC;
  return open(pathname, open_flags, mode);
}
549 
550 #endif // EMBEDDED