Tesseract 3.0 initial commit

Signed-off-by: Alan Viverette <alanv@google.com>
author: Alan Viverette <alanv@google.com> 2009-08-04 18:36:23 -0700
committer: Alan Viverette <alanv@google.com> 2009-08-04 18:42:39 -0700
commit: f8d326e45038968a99db48306ecd083f08dd65e3 (patch)
tree: 752c581c148d50e051c6d77810d8fb2efa9574b3 /ccutil/unicharset.h
parent: d544c9231465999ad600ec13614b4d69d351798d (diff)
download: tesseract-f8d326e45038968a99db48306ecd083f08dd65e3.tar.gz
1 files changed, 117 insertions, 13 deletions
diff --git a/ccutil/unicharset.h b/ccutil/unicharset.h
index 713ae03..9765ec1 100644
--- a/ccutil/unicharset.h
+++ b/ccutil/unicharset.h
@@ -20,9 +20,11 @@
 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
 #define TESSERACT_CCUTIL_UNICHARSET_H__
 
+#include "assert.h"
 #include "strngs.h"
 #include "unichar.h"
 #include "unicharmap.h"
+#include "varable.h"
 
 class CHAR_FRAGMENT {
  public:
@@ -119,7 +121,6 @@ class CHAR_FRAGMENT {
 // by a unique number, from 0 to (size - 1).
 class UNICHARSET {
  public:
-
   // Create an empty UNICHARSET
   UNICHARSET();
 
@@ -160,20 +161,20 @@ class UNICHARSET {
 
   // Return true if the given unichar id exists within the set.
   // Relies on the fact that unichar ids are contiguous in the unicharset.
-  bool contains_unichar_id(UNICHAR_ID unichar_id) {
+  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
     return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used;
   }
 
   // Return true if the given unichar representation exists within the set.
-  bool contains_unichar(const char* const unichar_repr);
-  bool contains_unichar(const char* const unichar_repr, int length);
+  bool contains_unichar(const char* const unichar_repr) const;
+  bool contains_unichar(const char* const unichar_repr, int length) const;
 
   // Return true if the given unichar representation corresponds to the given
   // UNICHAR_ID within the set.
-  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr);
+  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
 
   // Delete CHAR_FRAGMENTs stored in properties of unichars array.
-  void delete_fragments() {
+  void delete_pointers_in_unichars() {
     for (int i = 0; i < size_used; ++i) {
       if (unichars[i].properties.fragment != NULL) {
         delete unichars[i].properties.fragment;
@@ -191,7 +192,7 @@ class UNICHARSET {
       script_table = 0;
       script_table_size_reserved = 0;
       script_table_size_used = 0;
-      delete_fragments();
+      delete_pointers_in_unichars();
       delete[] unichars;
       unichars = 0;
       size_reserved = 0;
@@ -208,13 +209,34 @@ class UNICHARSET {
   // Reserve enough memory space for the given number of UNICHARS
   void reserve(int unichars_number);
 
-  // Save the content of the UNICHARSET to the given file. Return true if the
-  // operation is successful.
-  bool save_to_file(const char* const filename) const;
+  // Opens the file indicated by filename and saves unicharset to that file.
+  // Returns true if the operation is successful.
+  bool save_to_file(const char * const filename) const {
+    FILE* file = fopen(filename, "w+");
+    if (file == NULL) return false;
+    bool result = save_to_file(file);
+    fclose(file);
+    return result;
+  }
+
+  // Saves the content of the UNICHARSET to the given file.
+  // Returns true if the operation is successful.
+  bool save_to_file(FILE *file) const;
 
-  // Load the UNICHARSET from the given file. The previous data is lost. Return
-  // true if the operation is successful.
-  bool load_from_file(const char* const filename);
+  // Opens the file indicated by filename and loads the UNICHARSET
+  // from the given file. The previous data is lost.
+  // Returns true if the operation is successful.
+  bool load_from_file(const char* const filename) {
+    FILE* file = fopen(filename, "r");
+    if (file == NULL) return false;
+    bool result = load_from_file(file);
+    fclose(file);
+    return result;
+  }
+
+  // Loads the UNICHARSET from the given file. The previous data is lost.
+  // Returns true if the operation is successful.
+  bool load_from_file(FILE *file);
 
   // Set a whitelist and/or blacklist of characters to recognize.
   // An empty or NULL whitelist enables everything (minus any blacklist).
@@ -245,12 +267,27 @@ class UNICHARSET {
     unichars[unichar_id].properties.isdigit = value;
   }
 
+  // Set the ispunctuation property of the given unichar to the given value.
+  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
+    unichars[unichar_id].properties.ispunctuation = value;
+  }
+
+  // Set the isngram property of the given unichar to the given value.
+  void set_isngram(UNICHAR_ID unichar_id, bool value) {
+    unichars[unichar_id].properties.isngram = value;
+  }
+
   // Set the script name of the given unichar to the given value.
   // Value is copied and thus can be a temporary;
   void set_script(UNICHAR_ID unichar_id, const char* value) {
     unichars[unichar_id].properties.script_id = add_script(value);
   }
 
+  // Set other_case unichar id in the properties for the given unichar id.
+  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
+    unichars[unichar_id].properties.other_case = other_case;
+  }
+
   // Return the isalpha property of the given unichar.
   bool get_isalpha(UNICHAR_ID unichar_id) const {
     return unichars[unichar_id].properties.isalpha;
@@ -271,6 +308,16 @@ class UNICHARSET {
     return unichars[unichar_id].properties.isdigit;
   }
 
+  // Return the ispunctuation property of the given unichar.
+  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.ispunctuation;
+  }
+
+  // Return the isngram property of the given unichar.
+  bool get_isngram(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.isngram;
+  }
+
   // Return the script name of the given unichar.
   // The returned pointer will always be the same for the same script, it's
   // managed by unicharset and thus MUST NOT be deleted
@@ -278,6 +325,23 @@ class UNICHARSET {
     return unichars[unichar_id].properties.script_id;
   }
 
+  // Get other_case unichar id in the properties for the given unichar id.
+  UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.other_case;
+  }
+
+  // Returns UNICHAR_ID of the corresponding lower-case unichar.
+  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
+    if (unichars[unichar_id].properties.islower) return unichar_id;
+    return unichars[unichar_id].properties.other_case;
+  }
+
+  // Returns UNICHAR_ID of the corresponding upper-case unichar.
+  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
+    if (unichars[unichar_id].properties.isupper) return unichar_id;
+    return unichars[unichar_id].properties.other_case;
+  }
+
   // Return a pointer to the CHAR_FRAGMENT class if the given
   // unichar id represents a character fragment.
   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
@@ -304,6 +368,11 @@ class UNICHARSET {
     return get_isdigit(unichar_to_id(unichar_repr));
   }
 
+  // Return the ispunctuation property of the given unichar representation.
+  bool get_ispunctuation(const char* const unichar_repr) const {
+    return get_ispunctuation(unichar_to_id(unichar_repr));
+  }
+
   // Return the script name of the given unichar representation.
   // The returned pointer will always be the same for the same script, it's
   // managed by unicharset and thus MUST NOT be deleted
@@ -349,6 +418,13 @@ class UNICHARSET {
     return get_isdigit(unichar_to_id(unichar_repr, length));
   }
 
+  // Return the ispunctuation property of the given unichar representation.
+  // Only the first length characters from unichar_repr are used.
+  bool get_ispunctuation(const char* const unichar_repr,
+                          int length) const {
+    return get_ispunctuation(unichar_to_id(unichar_repr, length));
+  }
+
   // Return the script name of the given unichar representation.
   // Only the first length characters from unichar_repr are used.
   // The returned pointer will always be the same for the same script, it's
@@ -370,6 +446,13 @@ class UNICHARSET {
     return script_table[id];
   }
 
+  // Returns the id from the name of the script, or 0 if script is not found.
+  // Note that this is an expensive operation since it involves iteratively
+  // comparing strings in the script table.  To avoid dependency on STL, we
+  // won't use a hash.  Instead, the calling function can use this to lookup
+  // and save the ID for relevant scripts for fast comparisons later.
+  int get_script_id_from_name(const char* script_name) const;
+
   // Return true if the given script is the null script
   bool is_null_script(const char* script) const {
     return script == null_script;
@@ -385,6 +468,14 @@ class UNICHARSET {
     return unichars[unichar_id].properties.enabled;
   }
 
+
+  int null_sid() const { return null_sid_; }
+  int common_sid() const { return common_sid_; }
+  int latin_sid() const { return latin_sid_; }
+  int cyrillic_sid() const { return cyrillic_sid_; }
+  int greek_sid() const { return greek_sid_; }
+  int han_sid() const { return han_sid_; }
+
  private:
 
   struct UNICHAR_PROPERTIES {
@@ -392,8 +483,11 @@ class UNICHARSET {
     bool  islower;
     bool  isupper;
     bool  isdigit;
+    bool  ispunctuation;
+    bool  isngram;
     bool  enabled;
     int   script_id;
+    UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar
 
     // Contains meta information about the fragment if a unichar represents
     // a fragment of a character, otherwise should be set to NULL.
@@ -415,6 +509,16 @@ class UNICHARSET {
   int script_table_size_used;
   int script_table_size_reserved;
   const char* null_script;
+
+  // A few convenient script name-to-id mappings without using a hash.
+  // These are initialized when unicharset file is loaded.  Anything
+  // missing from this list can be looked up using get_script_id_from_name.
+  int null_sid_;
+  int common_sid_;
+  int latin_sid_;
+  int cyrillic_sid_;
+  int greek_sid_;
+  int han_sid_;
 };
 
 #endif  // TESSERACT_CCUTIL_UNICHARSET_H__
author	Alan Viverette <alanv@google.com>	2009-08-04 18:36:23 -0700
committer	Alan Viverette <alanv@google.com>	2009-08-04 18:42:39 -0700
commit	f8d326e45038968a99db48306ecd083f08dd65e3 (patch)
tree	752c581c148d50e051c6d77810d8fb2efa9574b3 /ccutil/unicharset.h
parent	d544c9231465999ad600ec13614b4d69d351798d (diff)
download	tesseract-f8d326e45038968a99db48306ecd083f08dd65e3.tar.gz