diff options
author | Ian Hodson <idh@google.com> | 2012-05-30 21:27:06 +0100 |
---|---|---|
committer | Ian Hodson <idh@google.com> | 2012-05-30 22:47:36 +0100 |
commit | f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2 (patch) | |
tree | b131ed907f9b2d5af09c0983b651e9e69bc6aab9 /src/include/fst/extensions/far/compile-strings.h | |
parent | a92766f0a6ba4fac46cd6fd3856ef20c3b204f0d (diff) | |
download | openfst-f4c12fce1ee58e670f9c3fce46c40296ba9ee8a2.tar.gz |
Add openfst to external, as used by GoogleTTSandroid-sdk-support_r11android-cts-4.2_r2android-cts-4.2_r1android-cts-4.1_r4android-cts-4.1_r2android-cts-4.1_r1android-4.2_r1android-4.2.2_r1.2android-4.2.2_r1.1android-4.2.2_r1android-4.2.1_r1.2android-4.2.1_r1.1android-4.2.1_r1android-4.1.2_r2.1android-4.1.2_r2android-4.1.2_r1android-4.1.1_r6.1android-4.1.1_r6android-4.1.1_r5android-4.1.1_r4android-4.1.1_r3android-4.1.1_r2android-4.1.1_r1.1android-4.1.1_r1tools_r22tools_r21jb-releasejb-mr1.1-releasejb-mr1.1-dev-plus-aospjb-mr1.1-devjb-mr1-releasejb-mr1-dev-plus-aospjb-mr1-devjb-mr0-releasejb-dev
Moved from GoogleTTS
Change-Id: I6bc6bdadaa53bd0f810b88443339f6d899502cc8
Diffstat (limited to 'src/include/fst/extensions/far/compile-strings.h')
-rw-r--r-- | src/include/fst/extensions/far/compile-strings.h | 271 |
1 files changed, 271 insertions, 0 deletions
diff --git a/src/include/fst/extensions/far/compile-strings.h b/src/include/fst/extensions/far/compile-strings.h new file mode 100644 index 0000000..d7f4d6b --- /dev/null +++ b/src/include/fst/extensions/far/compile-strings.h @@ -0,0 +1,271 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Authors: allauzen@google.com (Cyril Allauzen) +// ttai@google.com (Terry Tai) +// jpr@google.com (Jake Ratkiewicz) + + +#ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ +#define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ + +#include <libgen.h> +#include <string> +#include <vector> +using std::vector; + +#include <fst/extensions/far/far.h> +#include <fst/string.h> + +namespace fst { + +// Construct a reader that provides FSTs from a file (stream) either on a +// line-by-line basis or on a per-stream basis. Note that the freshly +// constructed reader is already set to the first input. +// +// Sample Usage: +// for (StringReader<Arc> reader(...); !reader.Done(); reader.Next()) { +// Fst *fst = reader.GetVectorFst(); +// } +template <class A> +class StringReader { + public: + typedef A Arc; + typedef typename A::Label Label; + typedef typename A::Weight Weight; + typedef typename StringCompiler<A>::TokenType TokenType; + + enum EntryType { LINE = 1, FILE = 2 }; + + StringReader(istream &istrm, + const string &source, + EntryType entry_type, + TokenType token_type, + bool allow_negative_labels, + const SymbolTable *syms = 0, + Label unknown_label = kNoStateId) + : nline_(0), strm_(istrm), source_(source), entry_type_(entry_type), + token_type_(token_type), done_(false), + compiler_(token_type, syms, unknown_label, allow_negative_labels) { + Next(); // Initialize the reader to the first input. + } + + bool Done() { + return done_; + } + + void Next() { + VLOG(1) << "Processing source " << source_ << " at line " << nline_; + if (!strm_) { // We're done if we have no more input. + done_ = true; + return; + } + if (entry_type_ == LINE) { + getline(strm_, content_); + ++nline_; + } else { + content_.clear(); + string line; + while (getline(strm_, line)) { + ++nline_; + content_.append(line); + content_.append("\n"); + } + } + if (!strm_ && content_.empty()) // We're also done if we read off all the + done_ = true; // whitespace at the end of a file. + } + + VectorFst<A> *GetVectorFst() { + VectorFst<A> *fst = new VectorFst<A>; + if (compiler_(content_, fst)) { + return fst; + } else { + delete fst; + return NULL; + } + } + + CompactFst<A, StringCompactor<A> > *GetCompactFst() { + CompactFst<A, StringCompactor<A> > *fst = + new CompactFst<A, StringCompactor<A> >; + if (compiler_(content_, fst)) { + return fst; + } else { + delete fst; + return NULL; + } + } + + private: + size_t nline_; + istream &strm_; + string source_; + EntryType entry_type_; + TokenType token_type_; + bool done_; + StringCompiler<A> compiler_; + string content_; // The actual content of the input stream's next FST. + + DISALLOW_COPY_AND_ASSIGN(StringReader); +}; + +// Compute the minimal length required to encode each line number as a decimal +// number. +int KeySize(const char *filename); + +template <class Arc> +void FarCompileStrings(const vector<string> &in_fnames, + const string &out_fname, + const string &fst_type, + const FarType &far_type, + int32 generate_keys, + FarEntryType fet, + FarTokenType tt, + const string &symbols_fname, + const string &unknown_symbol, + bool allow_negative_labels, + bool file_list_input, + const string &key_prefix, + const string &key_suffix) { + typename StringReader<Arc>::EntryType entry_type; + if (fet == FET_LINE) { + entry_type = StringReader<Arc>::LINE; + } else if (fet == FET_FILE) { + entry_type = StringReader<Arc>::FILE; + } else { + FSTERROR() << "FarCompileStrings: unknown entry type"; + return; + } + + typename StringCompiler<Arc>::TokenType token_type; + if (tt == FTT_SYMBOL) { + token_type = StringCompiler<Arc>::SYMBOL; + } else if (tt == FTT_BYTE) { + token_type = StringCompiler<Arc>::BYTE; + } else if (tt == FTT_UTF8) { + token_type = StringCompiler<Arc>::UTF8; + } else { + FSTERROR() << "FarCompileStrings: unknown token type"; + return; + } + + bool compact; + if (fst_type.empty() || (fst_type == "vector")) { + compact = false; + } else if (fst_type == "compact") { + compact = true; + } else { + FSTERROR() << "FarCompileStrings: unknown fst type: " + << fst_type; + return; + } + + const SymbolTable *syms = 0; + typename Arc::Label unknown_label = kNoLabel; + if (!symbols_fname.empty()) { + syms = SymbolTable::ReadText(symbols_fname, + allow_negative_labels); + if (!syms) { + FSTERROR() << "FarCompileStrings: error reading symbol table: " + << symbols_fname; + return; + } + if (!unknown_symbol.empty()) { + unknown_label = syms->Find(unknown_symbol); + if (unknown_label == kNoLabel) { + FSTERROR() << "FarCompileStrings: unknown label \"" << unknown_label + << "\" missing from symbol table: " << symbols_fname; + return; + } + } + } + + FarWriter<Arc> *far_writer = + FarWriter<Arc>::Create(out_fname, far_type); + if (!far_writer) return; + + vector<string> inputs; + if (file_list_input) { + for (int i = 1; i < in_fnames.size(); ++i) { + ifstream istrm(in_fnames[i].c_str()); + string str; + while (getline(istrm, str)) + inputs.push_back(str); + } + } else { + inputs = in_fnames; + } + + for (int i = 0, n = 0; i < inputs.size(); ++i) { + int key_size = generate_keys ? generate_keys : + (entry_type == StringReader<Arc>::FILE ? 1 : + KeySize(inputs[i].c_str())); + ifstream istrm(inputs[i].c_str()); + + for (StringReader<Arc> reader( + istrm, inputs[i], entry_type, token_type, + allow_negative_labels, syms, unknown_label); + !reader.Done(); + reader.Next()) { + ++n; + const Fst<Arc> *fst; + if (compact) + fst = reader.GetCompactFst(); + else + fst = reader.GetVectorFst(); + if (!fst) { + FSTERROR() << "FarCompileStrings: compiling string number " << n + << " in file " << inputs[i] << " failed with token_type = " + << (tt == FTT_BYTE ? "byte" : + (tt == FTT_UTF8 ? "utf8" : + (tt == FTT_SYMBOL ? "symbol" : "unknown"))) + << " and entry_type = " + << (fet == FET_LINE ? "line" : + (fet == FET_FILE ? "file" : "unknown")); + delete far_writer; + delete syms; + return; + } + ostringstream keybuf; + keybuf.width(key_size); + keybuf.fill('0'); + keybuf << n; + string key; + if (generate_keys > 0) { + key = keybuf.str(); + } else { + char* filename = new char[inputs[i].size() + 1]; + strcpy(filename, inputs[i].c_str()); + key = basename(filename); + if (entry_type != StringReader<Arc>::FILE) { + key += "-"; + key += keybuf.str(); + } + delete[] filename; + } + far_writer->Add(key_prefix + key + key_suffix, *fst); + delete fst; + } + if (generate_keys == 0) + n = 0; + } + + delete far_writer; +} + +} // namespace fst + + +#endif // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ |