aboutsummaryrefslogtreecommitdiff
path: root/ccutil/tessdatamanager.cpp
diff options
context:
space:
mode:
authorAlan Viverette <alanv@google.com>2009-08-04 18:36:23 -0700
committerAlan Viverette <alanv@google.com>2009-08-04 18:42:39 -0700
commitf8d326e45038968a99db48306ecd083f08dd65e3 (patch)
tree752c581c148d50e051c6d77810d8fb2efa9574b3 /ccutil/tessdatamanager.cpp
parentd544c9231465999ad600ec13614b4d69d351798d (diff)
downloadtesseract-f8d326e45038968a99db48306ecd083f08dd65e3.tar.gz
Tesseract 3.0 initial commmit
Signed-off-by: Alan Viverette <alanv@google.com>
Diffstat (limited to 'ccutil/tessdatamanager.cpp')
-rw-r--r--ccutil/tessdatamanager.cpp203
1 files changed, 203 insertions, 0 deletions
diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp
new file mode 100644
index 0000000..1c8dc61
--- /dev/null
+++ b/ccutil/tessdatamanager.cpp
@@ -0,0 +1,203 @@
+///////////////////////////////////////////////////////////////////////
+// File: tessdatamanager.cpp
+// Description: Functions to handle loading/combining tesseract data files.
+// Author: Daria Antonova
+// Created: Wed Jun 03 11:26:43 PST 2009
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tessdatamanager.h"
+
+#include <stdio.h>
+
+#include "serialis.h"
+#include "strngs.h"
+#include "tprintf.h"
+#include "varable.h"
+// Dawg-loading switches, all defaulting to true; none is read in the code below.
+BOOL_VAR(global_load_system_dawg, true, "Load system word dawg.");
+BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg.");
+BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns.");
+BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns.");
+// Nonzero makes TessdataManager::Init() print the offset table it has loaded.
+INT_VAR(global_tessdata_manager_debug_level, 0,
+ "Debug level for TessdataManager functions.");
+
+namespace tesseract {
+
+// Opens the combined data file data_file_name and loads its header into
+// actual_tessdata_num_entries_ and offset_table_. The header is an inT32
+// entry count followed by that many inT64 offsets. data_file_ is left open.
+//
+// A count larger than kMaxNumTessdataEntries is taken to mean that the file
+// was written with the opposite byte order, in which case the count and every
+// offset are byte-swapped.
+//
+// Prints a message and exits if the file cannot be opened or the header
+// cannot be read in full; asserts that the entry count lies in
+// [0, TESSDATA_NUM_ENTRIES].
+void TessdataManager::Init(const char *data_file_name) {
+  int i;
+  data_file_ = fopen(data_file_name, "rb");
+  if (data_file_ == NULL) {
+    tprintf("Error openning data file %s\n", data_file_name);
+    exit(1);
+  }
+  // A short read would leave the count uninitialized, so treat it as fatal.
+  if (fread(&actual_tessdata_num_entries_, sizeof(inT32), 1,
+            data_file_) != 1) {
+    tprintf("Error reading entry count from data file %s\n", data_file_name);
+    exit(1);
+  }
+  bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
+  if (swap) {
+    actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
+  }
+  // A negative count would be converted to a huge unsigned element count by
+  // fread() below and overrun offset_table_, so reject it explicitly.
+  ASSERT_HOST(actual_tessdata_num_entries_ >= 0);
+  ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
+  size_t num_offsets = static_cast<size_t>(actual_tessdata_num_entries_);
+  if (fread(offset_table_, sizeof(inT64), num_offsets,
+            data_file_) != num_offsets) {
+    tprintf("Error reading offset table from data file %s\n", data_file_name);
+    exit(1);
+  }
+  if (swap) {
+    for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
+      offset_table_[i] = reverse64(offset_table_[i]);
+    }
+  }
+  if (global_tessdata_manager_debug_level) {
+    tprintf("TessdataManager loaded %d types of tesseract data files.\n",
+            actual_tessdata_num_entries_);
+    for (i = 0; i < actual_tessdata_num_entries_; ++i) {
+      tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
+    }
+  }
+}
+
+// Opens the file whose name is language_data_path_prefix followed by
+// file_suffix, in text mode ("r") when text_file is true and in binary mode
+// ("rb") otherwise, and returns the resulting stream.
+// If the file cannot be opened, returns NULL when required_file is false;
+// when required_file is true, prints an error and exits instead.
+FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
+                                  const char *file_suffix, bool required_file,
+                                  bool text_file) {
+  STRING path = language_data_path_prefix;
+  path += file_suffix;
+  const char *open_mode = text_file ? "r" : "rb";
+  FILE *stream = fopen(path.string(), open_mode);
+  if (stream != NULL) {
+    return stream;
+  }
+  // Only a missing required file is fatal; optional files just yield NULL.
+  if (required_file) {
+    tprintf("Error openning required file %s\n", path.string());
+    exit(1);
+  }
+  return NULL;
+}
+
+// Copies everything from the current position of input_file up to its end
+// into output_file, 1024 bytes at a time. Neither stream is closed.
+// If newline_end is true, asserts that the last byte copied was '\n'; an
+// input that yields no bytes therefore fails the assertion.
+// Prints a message and exits if a chunk cannot be written in full or if
+// reading stops because of an error rather than end of file.
+void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
+                               bool newline_end) {
+  // Fixed-size stack buffer: there is nothing to free, so nothing is leaked
+  // on the error paths below.
+  char chunk[1024];
+  size_t bytes_read;
+  char last_char = 0x0;
+  while ((bytes_read = fread(chunk, sizeof(char),
+                             sizeof(chunk), input_file)) > 0) {
+    // A short write would silently truncate the combined file.
+    if (fwrite(chunk, sizeof(char), bytes_read, output_file) != bytes_read) {
+      tprintf("Error writing to output file\n");
+      exit(1);
+    }
+    last_char = chunk[bytes_read - 1];
+  }
+  // fread() returns 0 both at end of file and on error; tell them apart.
+  if (ferror(input_file)) {
+    tprintf("Error reading from input file\n");
+    exit(1);
+  }
+  if (newline_end) {
+    ASSERT_HOST(last_char == '\n');
+  }
+}
+
+// Combines the individual data files named language_data_path_prefix plus a
+// per-type suffix into the single file output_filename.
+//
+// Output layout: an inT32 holding TESSDATA_NUM_ENTRIES, then
+// TESSDATA_NUM_ENTRIES inT64 offsets, then the contents of each input file
+// that was found. The offset of a type whose file is missing stays -1.
+//
+// The unicharset file is required. The pffmtable and normproto files are
+// required only when the inttemp file is present, and are not looked for
+// otherwise. All other files are optional. Files copied with
+// newline_end == true (lang config, unicharset, ambigs, pffmtable,
+// normproto) must end in '\n'.
+//
+// Prints a message and exits if the output file cannot be created or its
+// header cannot be written. On success prints the resulting offset table.
+void TessdataManager::CombineDataFiles(
+    const char *language_data_path_prefix,
+    const char *output_filename) {
+  FILE *file_ptr;
+  int i;
+  inT64 offset_table[TESSDATA_NUM_ENTRIES];
+  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
+  FILE *output_file = fopen(output_filename, "wb");
+  // Without this check an unwritable path would pass NULL to fseek() below.
+  if (output_file == NULL) {
+    tprintf("Error opening output file %s\n", output_filename);
+    exit(1);
+  }
+  // Leave some space for recording the offset_table.
+  fseek(output_file,
+        sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
+
+  // Record language-specific tesseract config file.
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kLangConfigFileSuffix, false, true);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, true);
+    fclose(file_ptr);
+  }
+
+  // Record unicharset.
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kUnicharsetFileSuffix, true, true);
+  offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
+  CopyFile(file_ptr, output_file, true);
+  fclose(file_ptr);
+
+  // Record ambiguities.
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kAmbigsFileSuffix, false, true);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_AMBIGS] = ftell(output_file);
+    CopyFile(file_ptr, output_file, true);
+    fclose(file_ptr);
+  }
+
+  // Record inttemp.
+  file_ptr =
+      GetFilePtr(language_data_path_prefix,
+                 kBuiltInTemplatesFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_INTTEMP] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+
+    // Record pffmtable.
+    file_ptr = GetFilePtr(language_data_path_prefix,
+                          kBuiltInCutoffsFileSuffix, true, true);
+    offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
+    CopyFile(file_ptr, output_file, true);
+    fclose(file_ptr);
+
+    // Record normproto.
+    file_ptr = GetFilePtr(language_data_path_prefix,
+                          kNormProtoFileSuffix, true, true);
+    offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
+    CopyFile(file_ptr, output_file, true);
+    fclose(file_ptr);
+  }
+
+  // Record dawgs.
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kPuncDawgFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+  }
+
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kSystemDawgFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+  }
+
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kNumberDawgFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+  }
+
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kFreqDawgFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+  }
+
+  // Go back and fill in the header space reserved at the start of the file.
+  fseek(output_file, 0, SEEK_SET);
+  inT32 num_entries = TESSDATA_NUM_ENTRIES;
+  size_t table_size = static_cast<size_t>(TESSDATA_NUM_ENTRIES);
+  // A file with a missing or partial header is unusable, so fail loudly.
+  if (fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
+      fwrite(offset_table, sizeof(inT64), table_size,
+             output_file) != table_size) {
+    tprintf("Error writing header of output file %s\n", output_filename);
+    exit(1);
+  }
+  fclose(output_file);
+
+  tprintf("TessdataManager combined tesseract data files.\n");
+  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+    tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
+  }
+}
+
+} // namespace tesseract