diff options
author | Alan Viverette <alanv@google.com> | 2009-08-04 18:36:23 -0700 |
---|---|---|
committer | Alan Viverette <alanv@google.com> | 2009-08-04 18:42:39 -0700 |
commit | f8d326e45038968a99db48306ecd083f08dd65e3 (patch) | |
tree | 752c581c148d50e051c6d77810d8fb2efa9574b3 /ccutil/tessdatamanager.cpp | |
parent | d544c9231465999ad600ec13614b4d69d351798d (diff) | |
download | tesseract-f8d326e45038968a99db48306ecd083f08dd65e3.tar.gz |
Tesseract 3.0 initial commit
Signed-off-by: Alan Viverette <alanv@google.com>
Diffstat (limited to 'ccutil/tessdatamanager.cpp')
-rw-r--r-- | ccutil/tessdatamanager.cpp | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp new file mode 100644 index 0000000..1c8dc61 --- /dev/null +++ b/ccutil/tessdatamanager.cpp @@ -0,0 +1,203 @@ +/////////////////////////////////////////////////////////////////////// +// File: tessdatamanager.cpp +// Description: Functions to handle loading/combining tesseract data files. +// Author: Daria Antonova +// Created: Wed Jun 03 11:26:43 PST 2009 +// +// (C) Copyright 2009, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#include "tessdatamanager.h" + +#include <stdio.h> + +#include "serialis.h" +#include "strngs.h" +#include "tprintf.h" +#include "varable.h" + +BOOL_VAR(global_load_system_dawg, true, "Load system word dawg."); +BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg."); +BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns."); +BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns."); + +INT_VAR(global_tessdata_manager_debug_level, 0, + "Debug level for TessdataManager functions."); + +namespace tesseract { + +void TessdataManager::Init(const char *data_file_name) { + int i; + data_file_ = fopen(data_file_name, "rb"); + if (data_file_ == NULL) { + tprintf("Error openning data file %s\n", data_file_name); + exit(1); + } + fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_); + bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries); + if (swap) { + actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_); + } + ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES); + fread(offset_table_, sizeof(inT64), + actual_tessdata_num_entries_, data_file_); + if (swap) { + for (i = 0 ; i < actual_tessdata_num_entries_; ++i) { + offset_table_[i] = reverse64(offset_table_[i]); + } + } + if (global_tessdata_manager_debug_level) { + tprintf("TessdataManager loaded %d types of tesseract data files.\n", + actual_tessdata_num_entries_); + for (i = 0; i < actual_tessdata_num_entries_; ++i) { + tprintf("Offset for type %d is %lld\n", i, offset_table_[i]); + } + } +} + +FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix, + const char *file_suffix, bool required_file, + bool text_file) { + STRING file_name = language_data_path_prefix; + file_name += file_suffix; + FILE *file_ptr = fopen(file_name.string(), text_file ? 
"r" : "rb"); + if (required_file && (file_ptr == NULL)) { + tprintf("Error openning required file %s\n", file_name.string()); + exit(1); + } + return file_ptr; +} + +void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, + bool newline_end) { + int buffer_size = 1024; + char *chunk = new char[buffer_size]; + int bytes_read; + char last_char = 0x0; + while ((bytes_read = fread(chunk, sizeof(char), + buffer_size, input_file))) { + fwrite(chunk, sizeof(char), bytes_read, output_file); + last_char = chunk[bytes_read-1]; + } + if (newline_end) ASSERT_HOST(last_char == '\n'); + delete[] chunk; +} + +void TessdataManager::CombineDataFiles( + const char *language_data_path_prefix, + const char *output_filename) { + FILE *file_ptr; + STRING file_name; + int i; + inT64 offset_table[TESSDATA_NUM_ENTRIES]; + for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; + FILE *output_file = fopen(output_filename, "wb"); + // Leave some space for recording the offset_table. + fseek(output_file, + sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); + + // Record language-specific tesseract config file. + file_ptr = GetFilePtr(language_data_path_prefix, + kLangConfigFileSuffix, false, true); + if (file_ptr != NULL) { + offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file); + CopyFile(file_ptr, output_file, true); + fclose(file_ptr); + } + + // Record unicharset. + file_ptr = GetFilePtr(language_data_path_prefix, + kUnicharsetFileSuffix, true, true); + offset_table[TESSDATA_UNICHARSET] = ftell(output_file); + CopyFile(file_ptr, output_file, true); + fclose(file_ptr); + + // Record ambiguities. + file_ptr = GetFilePtr(language_data_path_prefix, + kAmbigsFileSuffix, false, true); + if (file_ptr != NULL) { + offset_table[TESSDATA_AMBIGS] = ftell(output_file); + CopyFile(file_ptr, output_file, true); + fclose(file_ptr); + } + + // Record inttemp. 
+ file_ptr = + GetFilePtr(language_data_path_prefix, + kBuiltInTemplatesFileSuffix, false, false); + if (file_ptr != NULL) { + offset_table[TESSDATA_INTTEMP] = ftell(output_file); + CopyFile(file_ptr, output_file, false); + fclose(file_ptr); + + // Record pffmtable. + file_ptr = GetFilePtr(language_data_path_prefix, + kBuiltInCutoffsFileSuffix, true, true); + offset_table[TESSDATA_PFFMTABLE] = ftell(output_file); + CopyFile(file_ptr, output_file, true); + fclose(file_ptr); + + // Record normproto. + file_ptr = GetFilePtr(language_data_path_prefix, + kNormProtoFileSuffix, true, true); + offset_table[TESSDATA_NORMPROTO] = ftell(output_file); + CopyFile(file_ptr, output_file, true); + fclose(file_ptr); + } + + // Record dawgs. + file_ptr = GetFilePtr(language_data_path_prefix, + kPuncDawgFileSuffix, false, false); + if (file_ptr != NULL) { + offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file); + CopyFile(file_ptr, output_file, false); + fclose(file_ptr); + } + + file_ptr = GetFilePtr(language_data_path_prefix, + kSystemDawgFileSuffix, false, false); + if (file_ptr != NULL) { + offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file); + CopyFile(file_ptr, output_file, false); + fclose(file_ptr); + } + + file_ptr = GetFilePtr(language_data_path_prefix, + kNumberDawgFileSuffix, false, false); + if (file_ptr != NULL) { + offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file); + CopyFile(file_ptr, output_file, false); + fclose(file_ptr); + } + + file_ptr = GetFilePtr(language_data_path_prefix, + kFreqDawgFileSuffix, false, false); + if (file_ptr != NULL) { + offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file); + CopyFile(file_ptr, output_file, false); + fclose(file_ptr); + } + + fseek(output_file, 0, SEEK_SET); + inT32 num_entries = TESSDATA_NUM_ENTRIES; + fwrite(&num_entries, sizeof(inT32), 1, output_file); + fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file); + fclose(output_file); + + tprintf("TessdataManager combined tesseract data files.\n"); + 
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + tprintf("Offset for type %d is %lld\n", i, offset_table[i]); + } +} + +} // namespace tesseract |