aboutsummaryrefslogtreecommitdiff
path: root/ccutil/ambigs.h
blob: e3bde2fd990a5a55c06cb8737a23f9d52727bec1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
///////////////////////////////////////////////////////////////////////
// File:        ambigs.h
// Description: Constants, flags, functions for dealing with
//              ambiguities (training and recognition).
// Author:      Daria Antonova
// Created:     Mon Aug 23 11:26:43 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CCUTIL_AMBIGS_H_
#define TESSERACT_CCUTIL_AMBIGS_H_

#include "elst.h"
#include "tprintf.h"
#include "unichar.h"
#include "unicharset.h"
#include "genericvector.h"

#define MAX_AMBIG_SIZE    10

extern INT_VAR_H(global_ambigs_debug_level, 0,
                 "Debug level for unichar ambiguities");
extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
                  "Use definite ambiguities when running character classifier");

namespace tesseract {

static const int kUnigramAmbigsBufferSize = 1000;
static const char kAmbigNgramSeparator[] = { ' ', '\0' };
static const char kAmbigDelimiters[] = "\t ";
static const char kIllegalMsg[] =
  "Illegal ambiguity specification on line %d\n";
static const char kIllegalUnicharMsg[] =
  "Illegal unichar %s in ambiguity specification\n";

enum AmbigType {
  NOT_AMBIG,        // the ngram pair is not ambiguous
  REPLACE_AMBIG,    // ocred ngram should always be substituted with correct
  DEFINITE_AMBIG,   // add correct ngram to the classifier results (1-1)
  SIMILAR_AMBIG,    // use pairwise classifier for ocred/correct pair (1-1)
  CASE_AMBIG,       // this is a case ambiguity (1-1)

  AMBIG_TYPE_COUNT  // number of enum entries
};

// A collection of utility functions for arrays of UNICHAR_IDs that are
// terminated by INVALID_UNICHAR_ID.
class UnicharIdArrayUtils {
 public:
  // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
  // less than length of array2, if any array1[i] is less than array2[i].
  // Returns 0 if the arrays are equal, 1 otherwise.
  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
  static inline int compare(const UNICHAR_ID array1[],
                            const UNICHAR_ID array2[]) {
    const UNICHAR_ID *ptr1 = array1;
    const UNICHAR_ID *ptr2 = array2;
    while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
      if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
      ++ptr1;
      ++ptr2;
    }
    if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
    return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
  }

  // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
  // and that dst has enough space for all the elements from src.
  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
    int i = 0;
    do {
      dst[i] = src[i];
    } while (dst[i++] != INVALID_UNICHAR_ID);
    return i - 1;
  }

  // Prints unichars corresponding to the unichar_ids in the given array.
  // The function assumes that array is terminated by INVALID_UNICHAR_ID.
  static inline void print(const UNICHAR_ID array[],
                           const UNICHARSET &unicharset) {
    const UNICHAR_ID *ptr = array;
    if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
    while (*ptr != INVALID_UNICHAR_ID) {
      tprintf("%s ", unicharset.id_to_unichar(*ptr++));
    }
    tprintf("( ");
    ptr = array;
    while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
    tprintf(")\n");
  }
};

// AMBIG_SPEC_LIST stores a list of dangerous ambigs that
// start with the same unichar (e.g. r->t rn->m rr1->m).
class AmbigSpec : public ELIST_LINK {
 public:
  AmbigSpec();
  ~AmbigSpec() {}

  // Comparator function for sorting AmbigSpec_LISTs. The lists will
  // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
  // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
  static int compare_ambig_specs(const void *spec1, const void *spec2) {
    const AmbigSpec *s1 =
      *reinterpret_cast<const AmbigSpec * const *>(spec1);
    const AmbigSpec *s2 =
      *reinterpret_cast<const AmbigSpec * const *>(spec2);
    return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
  }

  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
  UNICHAR_ID correct_ngram_id;
  AmbigType type;
  int wrong_ngram_size;
};
ELISTIZEH(AmbigSpec);

// AMBIG_TABLE[i] stores a set of ambiguities whose
// wrong ngram starts with unichar id i.
typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
typedef GenericVector<UNICHAR_ID> UnicharIdVector;

class UnicharAmbigs {
 public:
  UnicharAmbigs() {}
  ~UnicharAmbigs() {
    replace_ambigs_.delete_data_pointers();
    dang_ambigs_.delete_data_pointers();
    one_to_one_definite_ambigs_.delete_data_pointers();
  }

  const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
  const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }

  // Fills in two ambiguity tables (replaceable and dangerous) with information
  // read from the ambigs file. An ambiguity table is an array of lists.
  // The array is indexed by a class id. Each entry in the table provides
  // a list of potential ambiguities which can start with the corresponding
  // character. For example the ambiguity "rn -> m", would be located in the
  // table at index of unicharset.unichar_to_id('r').
  // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
  // one_to_one_definite_ambigs_. This vector is also indexed by the class id
  // of the wrong part of the ambiguity and each entry contains a vector of
  // unichar ids that are ambiguous to it.
  void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
                         UNICHARSET *unicharset);

  // Return definite 1-1 ambigs.
  const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
    if (one_to_one_definite_ambigs_.empty()) return NULL;
    return one_to_one_definite_ambigs_[unichar_id];
  }

 private:

  bool ParseAmbiguityLine(int line_num, int version,
                          const UNICHARSET &unicharset, char *buffer,
                          int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
                          int *ReplacementAmbigPartSize,
                          char *ReplacementString, int *type);
  void InsertIntoTable(UnicharAmbigsVector &table,
                       int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
                       int ReplacementAmbigPartSize,
                       const char *ReplacementString, int type,
                       AmbigSpec *ambig_spec, UNICHARSET *unicharset);
  UnicharAmbigsVector dang_ambigs_;
  UnicharAmbigsVector replace_ambigs_;
  GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
};

}  // namespace tesseract

#endif  // TESSERACT_CCUTIL_AMBIGS_H_