aboutsummaryrefslogtreecommitdiff
path: root/src/include/fst/extensions/far
diff options
context:
space:
mode:
authorIan Hodson <idh@google.com>2012-05-30 21:27:06 +0100
committerIan Hodson <idh@google.com>2012-05-30 22:47:36 +0100
commitf4c12fce1ee58e670f9c3fce46c40296ba9ee8a2 (patch)
treeb131ed907f9b2d5af09c0983b651e9e69bc6aab9 /src/include/fst/extensions/far
parenta92766f0a6ba4fac46cd6fd3856ef20c3b204f0d (diff)
downloadopenfst-jb-mr1-dev.tar.gz
Moved from GoogleTTS Change-Id: I6bc6bdadaa53bd0f810b88443339f6d899502cc8
Diffstat (limited to 'src/include/fst/extensions/far')
-rw-r--r--src/include/fst/extensions/far/compile-strings.h271
-rw-r--r--src/include/fst/extensions/far/create.h87
-rw-r--r--src/include/fst/extensions/far/extract.h85
-rw-r--r--src/include/fst/extensions/far/far.h360
-rw-r--r--src/include/fst/extensions/far/farlib.h31
-rw-r--r--src/include/fst/extensions/far/farscript.h234
-rw-r--r--src/include/fst/extensions/far/info.h128
-rw-r--r--src/include/fst/extensions/far/main.h43
-rw-r--r--src/include/fst/extensions/far/print-strings.h126
-rw-r--r--src/include/fst/extensions/far/stlist.h304
-rw-r--r--src/include/fst/extensions/far/sttable.h370
11 files changed, 2039 insertions, 0 deletions
diff --git a/src/include/fst/extensions/far/compile-strings.h b/src/include/fst/extensions/far/compile-strings.h
new file mode 100644
index 0000000..d7f4d6b
--- /dev/null
+++ b/src/include/fst/extensions/far/compile-strings.h
@@ -0,0 +1,271 @@
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Authors: allauzen@google.com (Cyril Allauzen)
+// ttai@google.com (Terry Tai)
+// jpr@google.com (Jake Ratkiewicz)
+
+
+#ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
+#define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
+
+#include <libgen.h>
+#include <string>
+#include <vector>
+using std::vector;
+
+#include <fst/extensions/far/far.h>
+#include <fst/string.h>
+
+namespace fst {
+
+// Construct a reader that provides FSTs from a file (stream) either on a
+// line-by-line basis or on a per-stream basis. Note that the freshly
+// constructed reader is already set to the first input.
+//
+// Sample Usage:
+// for (StringReader<Arc> reader(...); !reader.Done(); reader.Next()) {
+// Fst *fst = reader.GetVectorFst();
+// }
+template <class A>
+class StringReader {
+ public:
+  typedef A Arc;
+  typedef typename A::Label Label;
+  typedef typename A::Weight Weight;
+  typedef typename StringCompiler<A>::TokenType TokenType;
+
+  // LINE: every line of the stream is one entry.
+  // FILE: the whole remaining stream is a single entry.
+  enum EntryType { LINE = 1, FILE = 2 };
+
+  // Binds the reader to `istrm` (which must outlive the reader) and
+  // immediately loads the first entry. `source` is only used for logging.
+  // The remaining arguments are forwarded to the StringCompiler.
+  // NOTE(review): the default for `unknown_label` used to be kNoStateId;
+  // kNoLabel is the label sentinel used by FarCompileStrings and is presumably
+  // the same value -- confirm.
+  StringReader(istream &istrm,
+               const string &source,
+               EntryType entry_type,
+               TokenType token_type,
+               bool allow_negative_labels,
+               const SymbolTable *syms = 0,
+               Label unknown_label = kNoLabel)
+      : nline_(0), strm_(istrm), source_(source), entry_type_(entry_type),
+        token_type_(token_type), done_(false),
+        compiler_(token_type, syms, unknown_label, allow_negative_labels) {
+    Next();  // Initialize the reader to the first input.
+  }
+
+  // True once there is no further entry to compile.
+  bool Done() const {
+    return done_;
+  }
+
+  // Loads the next entry into the internal buffer, or marks the reader as
+  // done when the stream is exhausted.
+  void Next() {
+    VLOG(1) << "Processing source " << source_ << " at line " << nline_;
+    if (!strm_) {  // We're done if we have no more input.
+      done_ = true;
+      return;
+    }
+    if (entry_type_ == LINE) {
+      getline(strm_, content_);
+      ++nline_;
+    } else {
+      // FILE mode: concatenate every remaining line, restoring the newline
+      // that getline strips.
+      content_.clear();
+      string line;
+      while (getline(strm_, line)) {
+        ++nline_;
+        content_.append(line);
+        content_.append("\n");
+      }
+    }
+    // We're also done if the read hit the end of the stream without
+    // producing any content (e.g. after the final newline of a file).
+    if (!strm_ && content_.empty())
+      done_ = true;
+  }
+
+  // Compiles the current entry into a newly allocated VectorFst. Returns
+  // NULL if compilation fails; otherwise the caller owns the result.
+  VectorFst<A> *GetVectorFst() {
+    return CompileContent<VectorFst<A> >();
+  }
+
+  // Compiles the current entry into a newly allocated string CompactFst.
+  // Returns NULL if compilation fails; otherwise the caller owns the result.
+  CompactFst<A, StringCompactor<A> > *GetCompactFst() {
+    return CompileContent<CompactFst<A, StringCompactor<A> > >();
+  }
+
+ private:
+  // Shared implementation of the Get*Fst() accessors: allocates an F, runs
+  // the string compiler on the current entry and frees the F on failure.
+  template <class F>
+  F *CompileContent() {
+    F *fst = new F;
+    if (!compiler_(content_, fst)) {
+      delete fst;
+      return NULL;
+    }
+    return fst;
+  }
+
+  size_t nline_;                // Number of lines consumed so far.
+  istream &strm_;               // Input stream (not owned).
+  string source_;               // Name of the input, for logging.
+  EntryType entry_type_;
+  TokenType token_type_;
+  bool done_;
+  StringCompiler<A> compiler_;
+  string content_;  // The actual content of the input stream's next FST.
+
+  DISALLOW_COPY_AND_ASSIGN(StringReader);
+};
+
+// Compute the minimal length required to encode each line number as a decimal
+// number.
+int KeySize(const char *filename);
+
+// Compiles the strings found in the input files into FSTs and writes them to
+// the FST archive `out_fname`.
+//
+//   in_fnames        input files; when `file_list_input` is set, each of them
+//                    instead lists one input file name per line.
+//   fst_type         "" or "vector" for VectorFst, "compact" for CompactFst.
+//   far_type         archive format, forwarded to FarWriter::Create.
+//   generate_keys    if > 0, keys are the running entry number zero-padded to
+//                    this width; otherwise keys are the input's basename,
+//                    followed by "-<number>" for line entries.
+//   fet, tt          entry granularity and token type.
+//   symbols_fname    optional text symbol table.
+//   unknown_symbol   optional symbol, looked up in the symbol table, whose
+//                    label is handed to the string compiler.
+//   key_prefix/key_suffix  wrapped around every generated key.
+//
+// Errors are reported through FSTERROR() and abort the operation.
+template <class Arc>
+void FarCompileStrings(const vector<string> &in_fnames,
+                       const string &out_fname,
+                       const string &fst_type,
+                       const FarType &far_type,
+                       int32 generate_keys,
+                       FarEntryType fet,
+                       FarTokenType tt,
+                       const string &symbols_fname,
+                       const string &unknown_symbol,
+                       bool allow_negative_labels,
+                       bool file_list_input,
+                       const string &key_prefix,
+                       const string &key_suffix) {
+  typename StringReader<Arc>::EntryType entry_type;
+  if (fet == FET_LINE) {
+    entry_type = StringReader<Arc>::LINE;
+  } else if (fet == FET_FILE) {
+    entry_type = StringReader<Arc>::FILE;
+  } else {
+    FSTERROR() << "FarCompileStrings: unknown entry type";
+    return;
+  }
+
+  typename StringCompiler<Arc>::TokenType token_type;
+  if (tt == FTT_SYMBOL) {
+    token_type = StringCompiler<Arc>::SYMBOL;
+  } else if (tt == FTT_BYTE) {
+    token_type = StringCompiler<Arc>::BYTE;
+  } else if (tt == FTT_UTF8) {
+    token_type = StringCompiler<Arc>::UTF8;
+  } else {
+    FSTERROR() << "FarCompileStrings: unknown token type";
+    return;
+  }
+
+  bool compact;
+  if (fst_type.empty() || (fst_type == "vector")) {
+    compact = false;
+  } else if (fst_type == "compact") {
+    compact = true;
+  } else {
+    FSTERROR() << "FarCompileStrings: unknown fst type: "
+               << fst_type;
+    return;
+  }
+
+  // `syms` is owned by this function from here on; every exit path below
+  // must release it.
+  const SymbolTable *syms = 0;
+  typename Arc::Label unknown_label = kNoLabel;
+  if (!symbols_fname.empty()) {
+    syms = SymbolTable::ReadText(symbols_fname,
+                                 allow_negative_labels);
+    if (!syms) {
+      FSTERROR() << "FarCompileStrings: error reading symbol table: "
+                 << symbols_fname;
+      return;
+    }
+    if (!unknown_symbol.empty()) {
+      unknown_label = syms->Find(unknown_symbol);
+      if (unknown_label == kNoLabel) {
+        // Report the symbol that was looked up, not the (always kNoLabel)
+        // lookup result.
+        FSTERROR() << "FarCompileStrings: unknown label \"" << unknown_symbol
+                   << "\" missing from symbol table: " << symbols_fname;
+        delete syms;
+        return;
+      }
+    }
+  }
+
+  FarWriter<Arc> *far_writer =
+      FarWriter<Arc>::Create(out_fname, far_type);
+  if (!far_writer) {
+    delete syms;
+    return;
+  }
+
+  // Resolve the actual list of input files.
+  vector<string> inputs;
+  if (file_list_input) {
+    // Every entry of in_fnames is a list file, including the first one.
+    for (size_t i = 0; i < in_fnames.size(); ++i) {
+      ifstream istrm(in_fnames[i].c_str());
+      string str;
+      while (getline(istrm, str))
+        inputs.push_back(str);
+    }
+  } else {
+    inputs = in_fnames;
+  }
+
+  int n = 0;  // Running entry number used for key generation.
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    int key_size = generate_keys ? generate_keys :
+        (entry_type == StringReader<Arc>::FILE ? 1 :
+         KeySize(inputs[i].c_str()));
+    ifstream istrm(inputs[i].c_str());
+
+    for (StringReader<Arc> reader(
+             istrm, inputs[i], entry_type, token_type,
+             allow_negative_labels, syms, unknown_label);
+         !reader.Done();
+         reader.Next()) {
+      ++n;
+      const Fst<Arc> *fst;
+      if (compact)
+        fst = reader.GetCompactFst();
+      else
+        fst = reader.GetVectorFst();
+      if (!fst) {
+        FSTERROR() << "FarCompileStrings: compiling string number " << n
+                   << " in file " << inputs[i] << " failed with token_type = "
+                   << (tt == FTT_BYTE ? "byte" :
+                       (tt == FTT_UTF8 ? "utf8" :
+                        (tt == FTT_SYMBOL ? "symbol" : "unknown")))
+                   << " and entry_type = "
+                   << (fet == FET_LINE ? "line" :
+                       (fet == FET_FILE ? "file" : "unknown"));
+        delete far_writer;
+        delete syms;
+        return;
+      }
+      ostringstream keybuf;
+      keybuf.width(key_size);
+      keybuf.fill('0');
+      keybuf << n;
+      string key;
+      if (generate_keys > 0) {
+        key = keybuf.str();
+      } else {
+        // basename() may modify its argument, so hand it a private,
+        // NUL-terminated copy of the path.
+        vector<char> path(inputs[i].begin(), inputs[i].end());
+        path.push_back('\0');
+        key = basename(&path[0]);
+        if (entry_type != StringReader<Arc>::FILE) {
+          key += "-";
+          key += keybuf.str();
+        }
+      }
+      far_writer->Add(key_prefix + key + key_suffix, *fst);
+      delete fst;
+    }
+    // Without generated keys the numbering restarts for every input file.
+    if (generate_keys == 0)
+      n = 0;
+  }
+
+  delete far_writer;
+  delete syms;
+}
+
+} // namespace fst
+
+
+#endif // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
diff --git a/src/include/fst/extensions/far/create.h b/src/include/fst/extensions/far/create.h
new file mode 100644
index 0000000..edb31e7
--- /dev/null
+++ b/src/include/fst/extensions/far/create.h
@@ -0,0 +1,87 @@
+// create.h
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: riley@google.com (Michael Riley)
+// Modified: jpr@google.com (Jake Ratkiewicz) to use new dispatch
+//
+// \file
+// Creates a finite-state archive from component FSTs. Includes
+// helper function for farcreate.cc that templates the main on the arc
+// type to support multiple and extensible arc types.
+//
+
+#ifndef FST_EXTENSIONS_FAR_CREATE_H__
+#define FST_EXTENSIONS_FAR_CREATE_H__
+
+#include <libgen.h>
+#include <string>
+#include <vector>
+using std::vector;
+
+#include <fst/extensions/far/far.h>
+
+namespace fst {
+
+// Creates the FST archive `out_fname` from the FSTs stored in the input files.
+//
+//   in_fnames        FST files; when `file_list_input` is set, each of them
+//                    instead lists one FST file name per line.
+//   generate_keys    if > 0, keys are the 1-based input index zero-padded to
+//                    this width; otherwise the key is the input's basename.
+//   far_type         archive format, forwarded to FarWriter::Create.
+//   key_prefix/key_suffix  wrapped around every key.
+//
+// Stops at the first input that cannot be read.
+template <class Arc>
+void FarCreate(const vector<string> &in_fnames,
+               const string &out_fname,
+               const int32 generate_keys,
+               const bool file_list_input,
+               const FarType &far_type,
+               const string &key_prefix,
+               const string &key_suffix) {
+  FarWriter<Arc> *far_writer =
+      FarWriter<Arc>::Create(out_fname, far_type);
+  if (!far_writer) return;
+
+  // Resolve the actual list of input files.
+  vector<string> inputs;
+  if (file_list_input) {
+    // Every entry of in_fnames is a list file, including the first one.
+    for (size_t i = 0; i < in_fnames.size(); ++i) {
+      ifstream istrm(in_fnames[i].c_str());
+      string str;
+      while (getline(istrm, str))
+        inputs.push_back(str);
+    }
+  } else {
+    inputs = in_fnames;
+  }
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    Fst<Arc> *ifst = Fst<Arc>::Read(inputs[i]);
+    if (!ifst) {
+      delete far_writer;  // Do not leak the writer on a failed read.
+      return;
+    }
+    string key;
+    if (generate_keys > 0) {
+      ostringstream keybuf;
+      keybuf.width(generate_keys);
+      keybuf.fill('0');
+      keybuf << i + 1;
+      key = keybuf.str();
+    } else {
+      // basename() may modify its argument, so hand it a private,
+      // NUL-terminated copy of the path.
+      vector<char> path(inputs[i].begin(), inputs[i].end());
+      path.push_back('\0');
+      key = basename(&path[0]);
+    }
+
+    far_writer->Add(key_prefix + key + key_suffix, *ifst);
+    delete ifst;
+  }
+
+  delete far_writer;
+}
+
+} // namespace fst
+
+#endif // FST_EXTENSIONS_FAR_CREATE_H__
diff --git a/src/include/fst/extensions/far/extract.h b/src/include/fst/extensions/far/extract.h
new file mode 100644
index 0000000..022ca60
--- /dev/null
+++ b/src/include/fst/extensions/far/extract.h
@@ -0,0 +1,85 @@
+// extract.h
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: riley@google.com (Michael Riley)
+// Modified: jpr@google.com (Jake Ratkiewicz) to use the new arc-dispatch
+
+// \file
+// Extracts component FSTs from a finite-state archive.
+//
+
+#ifndef FST_EXTENSIONS_FAR_EXTRACT_H__
+#define FST_EXTENSIONS_FAR_EXTRACT_H__
+
+#include <string>
+#include <vector>
+using std::vector;
+
+#include <fst/extensions/far/far.h>
+
+namespace fst {
+
+// Writes the FSTs of the archive(s) `ifilenames` to individual files.
+//
+//   generate_filenames  if non-zero, output names are the 1-based position of
+//                       the entry zero-padded to this width; otherwise the
+//                       entry key is used, with ".<n>" appended to the n-th
+//                       consecutive repetition of a key.
+//   begin_key           if non-empty, extraction starts at the position
+//                       returned by FarReader::Find(begin_key).
+//   end_key             if non-empty, extraction stops before the first key
+//                       that compares greater than end_key.
+//   filename_prefix/filename_suffix  wrapped around every output name.
+template<class Arc>
+void FarExtract(const vector<string> &ifilenames,
+                const int32 &generate_filenames,
+                const string &begin_key,
+                const string &end_key,
+                const string &filename_prefix,
+                const string &filename_suffix) {
+  FarReader<Arc> *far_reader = FarReader<Arc>::Open(ifilenames);
+  if (!far_reader) return;
+
+  if (!begin_key.empty())
+    far_reader->Find(begin_key);
+
+  string okey;   // Key of the previous entry.
+  int nrep = 0;  // How many times in a row the current key has repeated.
+  for (int i = 1; !far_reader->Done(); far_reader->Next(), ++i) {
+    string key = far_reader->GetKey();
+    if (!end_key.empty() && end_key < key)
+      break;
+    const Fst<Arc> &fst = far_reader->GetFst();
+
+    if (key == okey)
+      ++nrep;
+    else
+      nrep = 0;
+
+    okey = key;
+
+    string ofilename;
+    if (generate_filenames) {
+      ostringstream tmp;
+      tmp.width(generate_filenames);
+      tmp.fill('0');
+      tmp << i;
+      ofilename = tmp.str();
+    } else {
+      if (nrep > 0) {
+        // Disambiguate repeated keys so they do not overwrite each other.
+        ostringstream tmp;
+        tmp << '.' << nrep;
+        key += tmp.str();
+      }
+      ofilename = key;
+    }
+    fst.Write(filename_prefix + ofilename + filename_suffix);
+  }
+
+  delete far_reader;  // The reader returned by Open() is owned here.
+}
+
+} // namespace fst
+
+#endif // FST_EXTENSIONS_FAR_EXTRACT_H__
diff --git a/src/include/fst/extensions/far/far.h b/src/include/fst/extensions/far/far.h
new file mode 100644
index 0000000..82b9e5c
--- /dev/null
+++ b/src/include/fst/extensions/far/far.h
@@ -0,0 +1,360 @@
+// far.h
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: riley@google.com (Michael Riley)
+//
+// \file
+// Finite-State Transducer (FST) archive classes.
+//
+
+#ifndef FST_EXTENSIONS_FAR_FAR_H__
+#define FST_EXTENSIONS_FAR_FAR_H__
+
+#include <fst/extensions/far/stlist.h>
+#include <fst/extensions/far/sttable.h>
+#include <fst/fst.h>
+#include <fst/vector-fst.h>
+
+namespace fst {
+
+// Granularity of string input: FarCompileStrings maps FET_LINE to
+// StringReader::LINE and FET_FILE to StringReader::FILE.
+enum FarEntryType { FET_LINE, FET_FILE };
+// Tokenization of string input: FarCompileStrings maps these onto the
+// StringCompiler token types SYMBOL, BYTE and UTF8 respectively.
+enum FarTokenType { FTT_SYMBOL, FTT_BYTE, FTT_UTF8 };
+
+// FST archive header class. Read() probes a file and records the archive
+// format and the arc type of the FSTs it contains.
+class FarHeader {
+ public:
+  // Archive format name set by the last successful Read(): "sttable" or
+  // "stlist". Empty before any successful Read().
+  const string &FarType() const { return fartype_; }
+
+  // Arc type recorded in the archive's FST header, or "unknown" if that
+  // header carries no arc type. Empty before any successful Read().
+  const string &ArcType() const { return arctype_; }
+
+  // Inspects `filename` and fills in the archive and arc types. Returns
+  // false if the name is empty or the file is neither an STTable nor an
+  // STList.
+  bool Read(const string &filename) {
+    FstHeader fsthdr;
+    if (filename.empty()) {  // Header reading unsupported on stdin.
+      return false;
+    } else if (IsSTTable(filename)) {  // Check if STTable
+      ReadSTTableHeader(filename, &fsthdr);
+      fartype_ = "sttable";
+      arctype_ = fsthdr.ArcType().empty() ? "unknown" : fsthdr.ArcType();
+      return true;
+    } else if (IsSTList(filename)) {  // Check if STList
+      ReadSTListHeader(filename, &fsthdr);
+      fartype_ = "stlist";
+      arctype_ = fsthdr.ArcType().empty() ? "unknown" : fsthdr.ArcType();
+      return true;
+    }
+    return false;
+  }
+
+ private:
+  string fartype_;
+  string arctype_;
+};
+
+// Archive formats. With FAR_DEFAULT, FarWriter::Create picks the format based
+// on whether the filename is empty. FAR_SSTABLE has no case in
+// FarWriter::Create and is reported there as an unknown far type.
+enum FarType { FAR_DEFAULT = 0, FAR_STTABLE = 1, FAR_STLIST = 2,
+ FAR_SSTABLE = 3 };
+
+// This class creates an archive of FSTs. It is an abstract interface;
+// instances are obtained through Create().
+template <class A>
+class FarWriter {
+ public:
+ typedef A Arc;
+
+ // Creates a new (empty) FST archive; returns NULL on error.
+ static FarWriter *Create(const string &filename, FarType type = FAR_DEFAULT);
+
+ // Adds an FST to the end of an archive. Keys must be non-empty and
+ // in lexicographic order. FSTs must have a suitable write method.
+ virtual void Add(const string &key, const Fst<A> &fst) = 0;
+
+ // Returns the archive format written by the concrete implementation.
+ virtual FarType Type() const = 0;
+
+ // Returns the error status of the writer, as reported by the concrete
+ // implementation.
+ virtual bool Error() const = 0;
+
+ virtual ~FarWriter() {}
+
+ protected:
+ // Only subclasses may construct a writer.
+ FarWriter() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(FarWriter);
+};
+
+
+// This class iterates through an existing archive of FSTs. It is an abstract
+// interface; instances are obtained through Open().
+template <class A>
+class FarReader {
+ public:
+ typedef A Arc;
+
+ // Opens an existing FST archive in a single file; returns NULL on error.
+ // Sets current position to the beginning of the archive.
+ static FarReader *Open(const string &filename);
+
+ // Opens an existing FST archive in multiple files; returns NULL on error.
+ // Sets current position to the beginning of the archive.
+ static FarReader *Open(const vector<string> &filenames);
+
+ // Resets current position to beginning of archive.
+ virtual void Reset() = 0;
+
+ // Sets current position to first entry >= key. Returns true if a match.
+ virtual bool Find(const string &key) = 0;
+
+ // Current position at end of archive?
+ virtual bool Done() const = 0;
+
+ // Move current position to next FST.
+ virtual void Next() = 0;
+
+ // Returns key at the current position. This reference is invalidated if
+ // the current position in the archive is changed.
+ virtual const string &GetKey() const = 0;
+
+ // Returns FST at the current position. This reference is invalidated if
+ // the current position in the archive is changed.
+ virtual const Fst<A> &GetFst() const = 0;
+
+ // Returns the archive format read by the concrete implementation.
+ virtual FarType Type() const = 0;
+
+ // Returns the error status of the reader, as reported by the concrete
+ // implementation.
+ virtual bool Error() const = 0;
+
+ virtual ~FarReader() {}
+
+ protected:
+ // Only subclasses may construct a reader.
+ FarReader() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(FarReader);
+};
+
+
+// Functor that serializes an FST to an output stream using
+// default-constructed write options. Used as the entry writer of the
+// STTable/STList writers.
+template <class A>
+class FstWriter {
+ public:
+  void operator()(ostream &strm, const Fst<A> &fst) const {
+    FstWriteOptions default_opts;
+    fst.Write(strm, default_opts);
+  }
+};
+
+
+// FarWriter implementation that stores the archive through an STTableWriter.
+template <class A>
+class STTableFarWriter : public FarWriter<A> {
+  typedef STTableWriter<Fst<A>, FstWriter<A> > Writer;
+
+ public:
+  typedef A Arc;
+
+  // Creates an STTable-backed archive writer for `filename`. The name is
+  // taken by const reference (no copy), matching the readers' Open().
+  static STTableFarWriter *Create(const string &filename) {
+    Writer *writer = Writer::Create(filename);
+    return new STTableFarWriter(writer);
+  }
+
+  // Appends `fst` under `key` by forwarding to the underlying writer.
+  void Add(const string &key, const Fst<A> &fst) { writer_->Add(key, fst); }
+
+  FarType Type() const { return FAR_STTABLE; }
+
+  // Reports the error status of the underlying writer.
+  bool Error() const { return writer_->Error(); }
+
+  ~STTableFarWriter() { delete writer_; }
+
+ private:
+  // Takes ownership of `writer`.
+  explicit STTableFarWriter(Writer *writer) : writer_(writer) {}
+
+  Writer *writer_;  // Owned; released in the destructor.
+
+  DISALLOW_COPY_AND_ASSIGN(STTableFarWriter);
+};
+
+
+// FarWriter implementation that stores the archive through an STListWriter.
+template <class A>
+class STListFarWriter : public FarWriter<A> {
+  typedef STListWriter<Fst<A>, FstWriter<A> > Writer;
+
+ public:
+  typedef A Arc;
+
+  // Creates an STList-backed archive writer for `filename`. The name is
+  // taken by const reference (no copy), matching the readers' Open().
+  static STListFarWriter *Create(const string &filename) {
+    Writer *writer = Writer::Create(filename);
+    return new STListFarWriter(writer);
+  }
+
+  // Appends `fst` under `key` by forwarding to the underlying writer.
+  void Add(const string &key, const Fst<A> &fst) { writer_->Add(key, fst); }
+
+  FarType Type() const { return FAR_STLIST; }
+
+  // Reports the error status of the underlying writer.
+  bool Error() const { return writer_->Error(); }
+
+  ~STListFarWriter() { delete writer_; }
+
+ private:
+  // Takes ownership of `writer`.
+  explicit STListFarWriter(Writer *writer) : writer_(writer) {}
+
+  Writer *writer_;  // Owned; released in the destructor.
+
+  DISALLOW_COPY_AND_ASSIGN(STListFarWriter);
+};
+
+
+// Factory for archive writers. FAR_DEFAULT resolves to an STList writer when
+// `filename` is empty and to an STTable writer otherwise; any type other than
+// FAR_DEFAULT, FAR_STTABLE and FAR_STLIST is logged as an error and yields a
+// null pointer.
+template <class A>
+FarWriter<A> *FarWriter<A>::Create(const string &filename, FarType type) {
+  if (type == FAR_DEFAULT)
+    type = filename.empty() ? FAR_STLIST : FAR_STTABLE;
+
+  if (type == FAR_STTABLE)
+    return STTableFarWriter<A>::Create(filename);
+  if (type == FAR_STLIST)
+    return STListFarWriter<A>::Create(filename);
+
+  LOG(ERROR) << "FarWriter::Create: unknown far type";
+  return 0;
+}
+
+
+// Functor that deserializes an FST from an input stream using
+// default-constructed read options. Used as the entry reader of the
+// STTable/STList readers.
+template <class A>
+class FstReader {
+ public:
+  Fst<A> *operator()(istream &strm) const {
+    Fst<A> *fst = Fst<A>::Read(strm, FstReadOptions());
+    return fst;
+  }
+};
+
+
+// FarReader implementation that reads the archive through an STTableReader.
+// Every operation is forwarded to the wrapped reader.
+template <class A>
+class STTableFarReader : public FarReader<A> {
+  typedef STTableReader<Fst<A>, FstReader<A> > Reader;
+
+ public:
+  typedef A Arc;
+
+  // Opens a single-file archive.
+  static STTableFarReader *Open(const string &filename) {
+    // TODO: error check
+    return new STTableFarReader(Reader::Open(filename));
+  }
+
+  // Opens an archive spread over several files.
+  static STTableFarReader *Open(const vector<string> &filenames) {
+    // TODO: error check
+    return new STTableFarReader(Reader::Open(filenames));
+  }
+
+  void Reset() { reader_->Reset(); }
+
+  bool Find(const string &key) { return reader_->Find(key); }
+
+  bool Done() const { return reader_->Done(); }
+
+  void Next() { reader_->Next(); }
+
+  const string &GetKey() const { return reader_->GetKey(); }
+
+  const Fst<A> &GetFst() const { return reader_->GetEntry(); }
+
+  FarType Type() const { return FAR_STTABLE; }
+
+  bool Error() const { return reader_->Error(); }
+
+  ~STTableFarReader() { delete reader_; }
+
+ private:
+  // Takes ownership of `reader`.
+  explicit STTableFarReader(Reader *reader) : reader_(reader) {}
+
+  Reader *reader_;  // Owned; released in the destructor.
+
+  DISALLOW_COPY_AND_ASSIGN(STTableFarReader);
+};
+
+
+// FarReader implementation that reads the archive through an STListReader.
+// Every operation is forwarded to the wrapped reader.
+template <class A>
+class STListFarReader : public FarReader<A> {
+  typedef STListReader<Fst<A>, FstReader<A> > Reader;
+
+ public:
+  typedef A Arc;
+
+  // Opens a single-file archive.
+  static STListFarReader *Open(const string &filename) {
+    // TODO: error check
+    return new STListFarReader(Reader::Open(filename));
+  }
+
+  // Opens an archive spread over several files.
+  static STListFarReader *Open(const vector<string> &filenames) {
+    // TODO: error check
+    return new STListFarReader(Reader::Open(filenames));
+  }
+
+  void Reset() { reader_->Reset(); }
+
+  bool Find(const string &key) { return reader_->Find(key); }
+
+  bool Done() const { return reader_->Done(); }
+
+  void Next() { reader_->Next(); }
+
+  const string &GetKey() const { return reader_->GetKey(); }
+
+  const Fst<A> &GetFst() const { return reader_->GetEntry(); }
+
+  FarType Type() const { return FAR_STLIST; }
+
+  bool Error() const { return reader_->Error(); }
+
+  ~STListFarReader() { delete reader_; }
+
+ private:
+  // Takes ownership of `reader`.
+  explicit STListFarReader(Reader *reader) : reader_(reader) {}
+
+  Reader *reader_;  // Owned; released in the destructor.
+
+  DISALLOW_COPY_AND_ASSIGN(STListFarReader);
+};
+
+
+// Opens a single-file archive. An empty name is handed to the STList reader;
+// otherwise the file is probed as STTable first, then as STList. Returns a
+// null pointer if the file is neither.
+template <class A>
+FarReader<A> *FarReader<A>::Open(const string &filename) {
+  FarReader<A> *reader = 0;
+  if (filename.empty()) {
+    reader = STListFarReader<A>::Open(filename);
+  } else if (IsSTTable(filename)) {
+    reader = STTableFarReader<A>::Open(filename);
+  } else if (IsSTList(filename)) {
+    reader = STListFarReader<A>::Open(filename);
+  }
+  return reader;
+}
+
+
+// Opens a multi-file archive. The format is decided from the first file name
+// only: an empty name selects the STList reader; otherwise the file is probed
+// as STTable first, then as STList. Returns a null pointer for an empty list
+// or an unrecognized first file.
+template <class A>
+FarReader<A> *FarReader<A>::Open(const vector<string> &filenames) {
+  if (filenames.empty())
+    return 0;
+
+  const string &first = filenames[0];
+  if (first.empty())
+    return STListFarReader<A>::Open(filenames);
+  if (IsSTTable(first))
+    return STTableFarReader<A>::Open(filenames);
+  if (IsSTList(first))
+    return STListFarReader<A>::Open(filenames);
+  return 0;
+}
+
+} // namespace fst
+
+#endif // FST_EXTENSIONS_FAR_FAR_H__
diff --git a/src/include/fst/extensions/far/farlib.h b/src/include/fst/extensions/far/farlib.h
new file mode 100644
index 0000000..91ba224
--- /dev/null
+++ b/src/include/fst/extensions/far/farlib.h
@@ -0,0 +1,31 @@
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: jpr@google.com (Jake Ratkiewicz)
+
+// A finite-state archive (FAR) is used to store an indexable collection of
+// FSTs in a single file. Utilities are provided to create FARs from FSTs,
+// to iterate over FARs, and to extract specific FSTs from FARs.
+
+#ifndef FST_EXTENSIONS_FAR_FARLIB_H_
+#define FST_EXTENSIONS_FAR_FARLIB_H_
+
+#include <fst/extensions/far/far.h>
+#include <fst/extensions/far/compile-strings.h>
+#include <fst/extensions/far/create.h>
+#include <fst/extensions/far/extract.h>
+#include <fst/extensions/far/info.h>
+#include <fst/extensions/far/print-strings.h>
+
+#endif // FST_EXTENSIONS_FAR_FARLIB_H_
diff --git a/src/include/fst/extensions/far/farscript.h b/src/include/fst/extensions/far/farscript.h
new file mode 100644
index 0000000..9c3b1ca
--- /dev/null
+++ b/src/include/fst/extensions/far/farscript.h
@@ -0,0 +1,234 @@
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: jpr@google.com (Jake Ratkiewicz)
+
+// Convenience file for including all of the FAR operations,
+// or registering them for new arc types.
+
+#ifndef FST_EXTENSIONS_FAR_FARSCRIPT_H_
+#define FST_EXTENSIONS_FAR_FARSCRIPT_H_
+
+#include <vector>
+using std::vector;
+#include <string>
+
+#include <fst/script/arg-packs.h>
+#include <fst/extensions/far/compile-strings.h>
+#include <fst/extensions/far/create.h>
+#include <fst/extensions/far/extract.h>
+#include <fst/extensions/far/info.h>
+#include <fst/extensions/far/print-strings.h>
+#include <fst/extensions/far/far.h>
+
+#include <fst/types.h>
+
+namespace fst {
+namespace script {
+
+// Argument bundle for FarCompileStrings. The string and vector members are
+// references to the caller's objects, not copies: this is only safe because
+// the struct is used solely to hand the arguments further down the call
+// graph while the caller's objects are still alive. Do not store it or use
+// it for anything else without making sure that still holds.
+struct FarCompileStringsArgs {
+  const vector<string> &in_fnames;
+  const string &out_fname;
+  const string &fst_type;
+  const FarType &far_type;
+  const int32 generate_keys;
+  const FarEntryType fet;
+  const FarTokenType tt;
+  const string &symbols_fname;
+  const string &unknown_symbol;
+  const bool allow_negative_labels;
+  const bool file_list_input;
+  const string &key_prefix;
+  const string &key_suffix;
+
+  FarCompileStringsArgs(const vector<string> &in_fnames_,
+                        const string &out_fname_,
+                        const string &fst_type_,
+                        const FarType &far_type_,
+                        int32 generate_keys_,
+                        FarEntryType fet_,
+                        FarTokenType tt_,
+                        const string &symbols_fname_,
+                        const string &unknown_symbol_,
+                        bool allow_negative_labels_,
+                        bool file_list_input_,
+                        const string &key_prefix_,
+                        const string &key_suffix_)
+      : in_fnames(in_fnames_),
+        out_fname(out_fname_),
+        fst_type(fst_type_),
+        far_type(far_type_),
+        generate_keys(generate_keys_),
+        fet(fet_),
+        tt(tt_),
+        symbols_fname(symbols_fname_),
+        unknown_symbol(unknown_symbol_),
+        allow_negative_labels(allow_negative_labels_),
+        file_list_input(file_list_input_),
+        key_prefix(key_prefix_),
+        key_suffix(key_suffix_) {}
+};
+
+// Arc-typed entry point for the script layer: unpacks the argument bundle and
+// calls fst::FarCompileStrings<Arc>.
+template <class Arc>
+void FarCompileStrings(FarCompileStringsArgs *args) {
+  const FarCompileStringsArgs &a = *args;
+  fst::FarCompileStrings<Arc>(
+      a.in_fnames, a.out_fname, a.fst_type, a.far_type,
+      a.generate_keys, a.fet, a.tt,
+      a.symbols_fname, a.unknown_symbol,
+      a.allow_negative_labels, a.file_list_input,
+      a.key_prefix, a.key_suffix);
+}
+
+void FarCompileStrings(
+ const vector<string> &in_fnames,
+ const string &out_fname,
+ const string &arc_type,
+ const string &fst_type,
+ const FarType &far_type,
+ int32 generate_keys,
+ FarEntryType fet,
+ FarTokenType tt,
+ const string &symbols_fname,
+ const string &unknown_symbol,
+ bool allow_negative_labels,
+ bool file_list_input,
+ const string &key_prefix,
+ const string &key_suffix);
+
+
+// Argument bundle for FarCreate. The string and vector members are references
+// to the caller's objects, not copies: this is only safe because the struct
+// is used solely to hand the arguments further down the call graph while the
+// caller's objects are still alive. Do not store it or use it for anything
+// else without making sure that still holds.
+struct FarCreateArgs {
+  const vector<string> &in_fnames;
+  const string &out_fname;
+  const int32 generate_keys;
+  const bool file_list_input;
+  const FarType &far_type;
+  const string &key_prefix;
+  const string &key_suffix;
+
+  FarCreateArgs(const vector<string> &in_fnames_,
+                const string &out_fname_,
+                const int32 generate_keys_,
+                const bool file_list_input_,
+                const FarType &far_type_,
+                const string &key_prefix_,
+                const string &key_suffix_)
+      : in_fnames(in_fnames_),
+        out_fname(out_fname_),
+        generate_keys(generate_keys_),
+        file_list_input(file_list_input_),
+        far_type(far_type_),
+        key_prefix(key_prefix_),
+        key_suffix(key_suffix_) {}
+};
+
+// Arc-typed entry point for the script layer: unpacks the argument bundle and
+// calls fst::FarCreate<Arc>.
+template<class Arc>
+void FarCreate(FarCreateArgs *args) {
+  const FarCreateArgs &a = *args;
+  fst::FarCreate<Arc>(a.in_fnames, a.out_fname,
+                      a.generate_keys, a.file_list_input,
+                      a.far_type,
+                      a.key_prefix, a.key_suffix);
+}
+
+void FarCreate(const vector<string> &in_fnames,
+ const string &out_fname,
+ const string &arc_type,
+ const int32 generate_keys,
+ const bool file_list_input,
+ const FarType &far_type,
+ const string &key_prefix,
+ const string &key_suffix);
+
+
+typedef args::Package<const vector<string> &, int32,
+ const string&, const string&, const string&,
+ const string&> FarExtractArgs;
+
+// Arc-typed entry point for the script layer: forwards the six packaged
+// arguments, in order, to fst::FarExtract<Arc> (input file names,
+// generate_filenames, begin key, end key, filename prefix, filename suffix).
+template<class Arc>
+void FarExtract(FarExtractArgs *args) {
+  FarExtractArgs &packed = *args;
+  fst::FarExtract<Arc>(packed.arg1, packed.arg2,
+                       packed.arg3, packed.arg4,
+                       packed.arg5, packed.arg6);
+}
+
+void FarExtract(const vector<string> &ifilenames,
+ const string &arc_type,
+ int32 generate_filenames, const string &begin_key,
+ const string &end_key, const string &filename_prefix,
+ const string &filename_suffix);
+
+typedef args::Package<const vector<string> &, const string &,
+ const string &, const bool> FarInfoArgs;
+
+// Arc-typed entry point for the script layer: forwards the four packaged
+// arguments, in order, to fst::FarInfo<Arc> (input file names, begin key,
+// end key, list_fsts flag).
+template <class Arc>
+void FarInfo(FarInfoArgs *args) {
+  FarInfoArgs &packed = *args;
+  fst::FarInfo<Arc>(packed.arg1, packed.arg2, packed.arg3, packed.arg4);
+}
+
+void FarInfo(const vector<string> &filenames,
+ const string &arc_type,
+ const string &begin_key,
+ const string &end_key,
+ const bool list_fsts);
+
+// Argument bundle for FarPrintStrings. The string and vector members are
+// references to the caller's objects, not copies, so the bundle must not
+// outlive them.
+struct FarPrintStringsArgs {
+  const vector<string> &ifilenames;
+  const FarEntryType entry_type;
+  const FarTokenType token_type;
+  const string &begin_key;
+  const string &end_key;
+  const bool print_key;
+  const string &symbols_fname;
+  const int32 generate_filenames;
+  const string &filename_prefix;
+  const string &filename_suffix;
+
+  FarPrintStringsArgs(const vector<string> &ifilenames_,
+                      const FarEntryType entry_type_,
+                      const FarTokenType token_type_,
+                      const string &begin_key_,
+                      const string &end_key_,
+                      const bool print_key_,
+                      const string &symbols_fname_,
+                      const int32 generate_filenames_,
+                      const string &filename_prefix_,
+                      const string &filename_suffix_)
+      : ifilenames(ifilenames_),
+        entry_type(entry_type_),
+        token_type(token_type_),
+        begin_key(begin_key_),
+        end_key(end_key_),
+        print_key(print_key_),
+        symbols_fname(symbols_fname_),
+        generate_filenames(generate_filenames_),
+        filename_prefix(filename_prefix_),
+        filename_suffix(filename_suffix_) {}
+};
+
+// Arc-typed entry point for the script layer: unpacks the argument bundle and
+// calls fst::FarPrintStrings<Arc>.
+template <class Arc>
+void FarPrintStrings(FarPrintStringsArgs *args) {
+  const FarPrintStringsArgs &a = *args;
+  fst::FarPrintStrings<Arc>(
+      a.ifilenames, a.entry_type, a.token_type,
+      a.begin_key, a.end_key, a.print_key,
+      a.symbols_fname, a.generate_filenames,
+      a.filename_prefix, a.filename_suffix);
+}
+
+
+void FarPrintStrings(const vector<string> &ifilenames,
+ const string &arc_type,
+ const FarEntryType entry_type,
+ const FarTokenType token_type,
+ const string &begin_key,
+ const string &end_key,
+ const bool print_key,
+ const string &symbols_fname,
+ const int32 generate_filenames,
+ const string &filename_prefix,
+ const string &filename_suffix);
+
+} // namespace script
+} // namespace fst
+
+
+// Registers all five FAR script operations (FarCompileStrings, FarCreate,
+// FarExtract, FarInfo, FarPrintStrings) for the given arc type. The last
+// registration has no trailing semicolon, so the invocation supplies it.
+#define REGISTER_FST_FAR_OPERATIONS(ArcType) \
+ REGISTER_FST_OPERATION(FarCompileStrings, ArcType, FarCompileStringsArgs); \
+ REGISTER_FST_OPERATION(FarCreate, ArcType, FarCreateArgs); \
+ REGISTER_FST_OPERATION(FarExtract, ArcType, FarExtractArgs); \
+ REGISTER_FST_OPERATION(FarInfo, ArcType, FarInfoArgs); \
+ REGISTER_FST_OPERATION(FarPrintStrings, ArcType, FarPrintStringsArgs)
+
+#endif // FST_EXTENSIONS_FAR_FARSCRIPT_H_
diff --git a/src/include/fst/extensions/far/info.h b/src/include/fst/extensions/far/info.h
new file mode 100644
index 0000000..f010546
--- /dev/null
+++ b/src/include/fst/extensions/far/info.h
@@ -0,0 +1,128 @@
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: allauzen@google.com (Cyril Allauzen)
+// Modified: jpr@google.com (Jake Ratkiewicz)
+
+#ifndef FST_EXTENSIONS_FAR_INFO_H_
+#define FST_EXTENSIONS_FAR_INFO_H_
+
+#include <iomanip>
+#include <set>
+#include <string>
+#include <vector>
+using std::vector;
+
+#include <fst/extensions/far/far.h>
+#include <fst/extensions/far/main.h> // For FarTypeToString
+
+namespace fst {
+
+// Adds the number of states and arcs of 'fst' to '*nstate' and '*narc'.
+// The counters are incremented, never reset, so a caller can accumulate
+// totals over several FSTs by passing the same counters repeatedly.
+template <class Arc>
+void CountStatesAndArcs(const Fst<Arc> &fst, size_t *nstate, size_t *narc) {
+  StateIterator<Fst<Arc> > states(fst);
+  while (!states.Done()) {
+    ArcIterator<Fst<Arc> > arcs(fst, states.Value());
+    while (!arcs.Done()) {
+      arcs.Next();
+      ++(*narc);
+    }
+    states.Next();
+    ++(*nstate);
+  }
+}
+
+// Per-FST record collected by FarInfo() when the contents of an archive are
+// listed one FST per line.
+struct KeyInfo {
+  string key;     // Key under which the FST is stored
+  string type;    // FST type name (FarInfo() stores Fst::Type() here)
+  size_t nstate;  // Number of states
+  size_t narc;    // Number of arcs
+
+  // The strings are taken by const reference so that building a record makes
+  // exactly one copy of each instead of two. The counts default to zero so
+  // they can be filled in afterwards, e.g. by CountStatesAndArcs().
+  KeyInfo(const string &k, const string &t, int64 ns = 0, int64 na = 0)
+      : key(k), type(t), nstate(ns), narc(na) {}
+};
+
+// Prints information about the FSTs stored in the FAR(s) 'filenames' to
+// standard output.
+//
+//   begin_key  if non-empty, the reader is first positioned with
+//              Find(begin_key).
+//   end_key    if non-empty, iteration stops at the first key greater than
+//              'end_key'.
+//   list_fsts  false: print a summary (far type, arc type, the set of FST
+//              types, number of FSTs, total states and total arcs);
+//              true: print one line per FST with its key, type, number of
+//              states and number of arcs.
+//
+// Returns silently if the archive cannot be opened.
+template <class Arc>
+void FarInfo(const vector<string> &filenames, const string &begin_key,
+             const string &end_key, const bool list_fsts) {
+  FarReader<Arc> *far_reader = FarReader<Arc>::Open(filenames);
+  if (!far_reader) return;
+
+  if (!begin_key.empty())
+    far_reader->Find(begin_key);
+
+  // Per-FST records; only filled in listing mode. Held by value so nothing
+  // has to be released by hand.
+  vector<KeyInfo> infos;
+  size_t nfst = 0, nstate = 0, narc = 0;
+  set<string> fst_types;
+  for (; !far_reader->Done(); far_reader->Next()) {
+    string key = far_reader->GetKey();
+    if (!end_key.empty() && end_key < key)
+      break;
+    ++nfst;
+    const Fst<Arc> &fst = far_reader->GetFst();
+    fst_types.insert(fst.Type());
+    if (list_fsts) {
+      KeyInfo info(key, fst.Type());
+      CountStatesAndArcs(fst, &info.nstate, &info.narc);
+      nstate += info.nstate;
+      narc += info.narc;  // Arcs belong in the arc total, not the state total
+      infos.push_back(info);
+    } else {
+      CountStatesAndArcs(fst, &nstate, &narc);
+    }
+  }
+
+  if (!list_fsts) {
+    cout << std::left << setw(50) << "far type"
+         << FarTypeToString(far_reader->Type()) << endl;
+    cout << std::left << setw(50) << "arc type" << Arc::Type() << endl;
+    cout << std::left << setw(50) << "fst type";
+    for (set<string>::const_iterator iter = fst_types.begin();
+         iter != fst_types.end();
+         ++iter) {
+      if (iter != fst_types.begin())
+        cout << ",";
+      cout << *iter;
+    }
+    cout << endl;
+    cout << std::left << setw(50) << "# of FSTs" << nfst << endl;
+    cout << std::left << setw(50) << "total # of states" << nstate << endl;
+    cout << std::left << setw(50) << "total # of arcs" << narc << endl;
+  } else {
+    // Column widths start at a minimum and grow to fit the widest value.
+    int wkey = 10, wtype = 10, wnstate = 16, wnarc = 16;
+    for (size_t i = 0; i < infos.size(); ++i) {
+      const KeyInfo &info = infos[i];
+      const int key_width = static_cast<int>(info.key.size()) + 2;
+      if (key_width > wkey)
+        wkey = key_width;
+      const int type_width = static_cast<int>(info.type.size()) + 2;
+      if (type_width > wtype)
+        wtype = type_width;
+      // log10(0) is -infinity, so a zero count never widens its column.
+      if (info.nstate > 0) {
+        const int nstate_width = static_cast<int>(
+            ceil(log10(static_cast<double>(info.nstate)))) + 2;
+        if (nstate_width > wnstate)
+          wnstate = nstate_width;
+      }
+      if (info.narc > 0) {
+        const int narc_width = static_cast<int>(
+            ceil(log10(static_cast<double>(info.narc)))) + 2;
+        if (narc_width > wnarc)
+          wnarc = narc_width;
+      }
+    }
+
+    cout << std::left << setw(wkey) << "key" << setw(wtype) << "type"
+         << std::right << setw(wnstate) << "# of states"
+         << setw(wnarc) << "# of arcs" << endl;
+
+    for (size_t i = 0; i < infos.size(); ++i) {
+      const KeyInfo &info = infos[i];
+      cout << std::left << setw(wkey) << info.key << setw(wtype) << info.type
+           << std::right << setw(wnstate) << info.nstate
+           << setw(wnarc) << info.narc << endl;
+    }
+  }
+
+  // The reader returned by Open() is owned by this function.
+  delete far_reader;
+}
+
+} // namespace fst
+
+
+#endif // FST_EXTENSIONS_FAR_INFO_H_
diff --git a/src/include/fst/extensions/far/main.h b/src/include/fst/extensions/far/main.h
new file mode 100644
index 0000000..00ccfef
--- /dev/null
+++ b/src/include/fst/extensions/far/main.h
@@ -0,0 +1,43 @@
+// main.h
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: riley@google.com (Michael Riley)
+//
+// \file
+// Classes and functions for registering and invoking Far main
+// functions that support multiple and extensible arc types.
+
+#ifndef FST_EXTENSIONS_FAR_MAIN_H__
+#define FST_EXTENSIONS_FAR_MAIN_H__
+
+#include <fst/extensions/far/far.h>
+
+namespace fst {
+
+FarEntryType StringToFarEntryType(const string &s);
+FarTokenType StringToFarTokenType(const string &s);
+
+// Return the 'FarType' value corresponding to a far type name.
+FarType FarTypeFromString(const string &str);
+
+// Return the textual name corresponding to a 'FarType'.
+string FarTypeToString(FarType type);
+
+string LoadArcTypeFromFar(const string& far_fname);
+string LoadArcTypeFromFst(const string& far_fname);
+
+} // namespace fst
+
+#endif // FST_EXTENSIONS_FAR_MAIN_H__
diff --git a/src/include/fst/extensions/far/print-strings.h b/src/include/fst/extensions/far/print-strings.h
new file mode 100644
index 0000000..aff1e51
--- /dev/null
+++ b/src/include/fst/extensions/far/print-strings.h
@@ -0,0 +1,126 @@
+// print-strings.h
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: allauzen@google.com (Cyril Allauzen)
+// Modified by: jpr@google.com (Jake Ratkiewicz)
+//
+// \file
+// Output as strings the string FSTs in a finite-state archive.
+
+#ifndef FST_EXTENSIONS_FAR_PRINT_STRINGS_H__
+#define FST_EXTENSIONS_FAR_PRINT_STRINGS_H__
+
+#include <string>
+#include <vector>
+using std::vector;
+
+#include <fst/extensions/far/far.h>
+#include <fst/string.h>
+
+namespace fst {
+
+// Outputs, as strings, the string FSTs stored in the FAR(s) 'ifilenames'.
+//
+//   entry_type          FET_LINE: one string per line on standard output,
+//                       preceded by the key and a tab when 'print_key' is
+//                       set. FET_FILE: each string goes to its own file.
+//   far_token_type      selects symbol, byte or UTF-8 rendering.
+//   begin_key, end_key  if non-empty, start at Find(begin_key) and stop at
+//                       the first key greater than 'end_key'.
+//   symbols_fname       if non-empty, text symbol table handed to the
+//                       string printer.
+//   generate_filenames  FET_FILE only. Non-zero: the file name is the 1-based
+//                       index of the visited entry, zero-padded to this
+//                       width. Zero: the file name is the key, followed by
+//                       ".N" for the N-th consecutive repeat of that key.
+//   filename_prefix, filename_suffix
+//                       added around the generated file name.
+//
+// Errors are reported through FSTERROR() and end the function early. The
+// symbol table and the reader are released on every path past their
+// creation.
+template <class Arc>
+void FarPrintStrings(
+    const vector<string> &ifilenames, const FarEntryType entry_type,
+    const FarTokenType far_token_type, const string &begin_key,
+    const string &end_key, const bool print_key, const string &symbols_fname,
+    const int32 generate_filenames, const string &filename_prefix,
+    const string &filename_suffix) {
+
+  typename StringPrinter<Arc>::TokenType token_type;
+  if (far_token_type == FTT_SYMBOL) {
+    token_type = StringPrinter<Arc>::SYMBOL;
+  } else if (far_token_type == FTT_BYTE) {
+    token_type = StringPrinter<Arc>::BYTE;
+  } else if (far_token_type == FTT_UTF8) {
+    token_type = StringPrinter<Arc>::UTF8;
+  } else {
+    FSTERROR() << "FarPrintStrings: unknown token type";
+    return;
+  }
+
+  const SymbolTable *syms = 0;
+  if (!symbols_fname.empty()) {
+    // allow negative flag?
+    syms = SymbolTable::ReadText(symbols_fname, true);
+    if (!syms) {
+      FSTERROR() << "FarPrintStrings: error reading symbol table: "
+                 << symbols_fname;
+      return;
+    }
+  }
+
+  StringPrinter<Arc> string_printer(token_type, syms);
+
+  FarReader<Arc> *far_reader = FarReader<Arc>::Open(ifilenames);
+  if (!far_reader) {
+    delete syms;  // Do not leak the symbol table when the FAR cannot be read
+    return;
+  }
+
+  if (!begin_key.empty())
+    far_reader->Find(begin_key);
+
+  string okey;   // Key of the previous entry
+  int nrep = 0;  // Number of consecutive repeats of the current key so far
+  for (int i = 1; !far_reader->Done(); far_reader->Next(), ++i) {
+    string key = far_reader->GetKey();
+    if (!end_key.empty() && end_key < key)
+      break;
+    if (okey == key)
+      ++nrep;
+    else
+      nrep = 0;
+    okey = key;
+
+    const Fst<Arc> &fst = far_reader->GetFst();
+    string str;
+    VLOG(2) << "Handling key: " << key;
+    string_printer(fst, &str);
+
+    if (entry_type == FET_LINE) {
+      if (print_key)
+        cout << key << "\t";
+      cout << str << endl;
+    } else if (entry_type == FET_FILE) {
+      stringstream sstrm;
+      if (generate_filenames) {
+        sstrm.fill('0');
+        sstrm << std::right << setw(generate_filenames) << i;
+      } else {
+        sstrm << key;
+        if (nrep > 0)
+          sstrm << "." << nrep;
+      }
+
+      string filename;
+      filename = filename_prefix + sstrm.str() + filename_suffix;
+
+      ofstream ostrm(filename.c_str());
+      if (!ostrm) {
+        FSTERROR() << "FarPrintStrings: Can't open file:" << filename;
+        break;  // Fall through to the common cleanup below
+      }
+      ostrm << str;
+      if (token_type == StringPrinter<Arc>::SYMBOL)
+        ostrm << "\n";
+    }
+  }
+
+  // Both objects are owned by this function; release them on every exit.
+  delete syms;
+  delete far_reader;
+}
+
+
+
+} // namespace fst
+
+#endif // FST_EXTENSIONS_FAR_PRINT_STRINGS_H__
diff --git a/src/include/fst/extensions/far/stlist.h b/src/include/fst/extensions/far/stlist.h
new file mode 100644
index 0000000..4738181
--- /dev/null
+++ b/src/include/fst/extensions/far/stlist.h
@@ -0,0 +1,304 @@
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: allauzen@google.com (Cyril Allauzen)
+//
+// \file
+// A generic (string,type) list file format.
+//
+// This is a stripped-down version of STTable that does
+// not support the Find() operation but that does support
+// reading/writing from standard in/out.
+
+#ifndef FST_EXTENSIONS_FAR_STLIST_H_
+#define FST_EXTENSIONS_FAR_STLIST_H_
+
+#include <iostream>
+#include <fstream>
+#include <fst/util.h>
+
+#include <algorithm>
+#include <functional>
+#include <queue>
+#include <string>
+#include <utility>
+using std::pair; using std::make_pair;
+#include <vector>
+using std::vector;
+
+namespace fst {
+
+static const int32 kSTListMagicNumber = 5656924;
+static const int32 kSTListFileVersion = 1;
+
+// String-type list writing class for objects of type 'T' using functor 'W'
+// to write an object of type 'T' to a stream. 'W' must conform to the
+// following interface:
+//
+//   struct Writer {
+//     void operator()(ostream &, const T &) const;
+//   };
+//
+// The output starts with the magic number and file version, continues with
+// one (key, entry) pair per Add() call and is terminated by an empty key
+// written from the destructor. An empty file name selects standard output.
+template <class T, class W>
+class STListWriter {
+ public:
+  typedef T EntryType;
+  typedef W EntryWriter;
+
+  // Opens 'filename' (or standard output if empty) and writes the header.
+  // Sets the error flag if the stream is in a failed state afterwards.
+  explicit STListWriter(const string &filename)
+      : stream_(
+            filename.empty() ? &std::cout :
+            new ofstream(filename.c_str(), ofstream::out | ofstream::binary)),
+        error_(false) {
+    WriteType(*stream_, kSTListMagicNumber);
+    WriteType(*stream_, kSTListFileVersion);
+    // Test the stream itself: the pointer is never null, so checking it
+    // would never detect a file that could not be opened or written.
+    if (!*stream_) {
+      FSTERROR() << "STListWriter::STListWriter: error writing to file: "
+                 << filename;
+      error_ = true;
+    }
+  }
+
+  // Returns a new writer owned by the caller.
+  static STListWriter<T, W> *Create(const string &filename) {
+    return new STListWriter<T, W>(filename);
+  }
+
+  // Appends the pair ('key', 't'). Keys must be non-empty and must not
+  // decrease from one call to the next; a violation sets the error flag.
+  // Once the error flag is set, nothing more is written.
+  void Add(const string &key, const T &t) {
+    if (key == "") {
+      FSTERROR() << "STListWriter::Add: key empty: " << key;
+      error_ = true;
+    } else if (key < last_key_) {
+      FSTERROR() << "STListWriter::Add: key disorder: " << key;
+      error_ = true;
+    }
+    if (error_) return;
+    last_key_ = key;
+    WriteType(*stream_, key);
+    entry_writer_(*stream_, t);
+  }
+
+  bool Error() const { return error_; }
+
+  // Writes the terminating empty key and releases the stream unless it is
+  // standard output.
+  ~STListWriter() {
+    WriteType(*stream_, string());
+    if (stream_ != &std::cout)
+      delete stream_;
+  }
+
+ private:
+  EntryWriter entry_writer_;  // Write functor for 'EntryType'
+  ostream *stream_;           // Output stream
+  string last_key_;           // Last key
+  bool error_;
+
+  DISALLOW_COPY_AND_ASSIGN(STListWriter);
+};
+
+
+// String-type list reading class for objects of type 'T' using functor 'R'
+// to read an object of type 'T' from a stream. 'R' must conform to the
+// following interface:
+//
+//   struct Reader {
+//     T *operator()(istream &) const;
+//   };
+//
+// Several lists can be opened at once; their entries are visited in
+// increasing key order by keeping the next unread key of every stream in a
+// min-heap. An empty file name selects standard input, which may appear at
+// most once. Reset() and Find() are not supported and set the error flag.
+template <class T, class R>
+class STListReader {
+ public:
+  typedef T EntryType;
+  typedef R EntryReader;
+
+  // Opens every file, validates its header, reads its first key and then
+  // reads the entry that belongs to the smallest key. On failure the error
+  // flag is set and construction stops at the offending file.
+  explicit STListReader(const vector<string> &filenames)
+      : sources_(filenames), entry_(0), error_(false) {
+    streams_.resize(filenames.size(), 0);
+    bool has_stdin = false;
+    for (size_t i = 0; i < filenames.size(); ++i) {
+      if (filenames[i].empty()) {
+        if (!has_stdin) {
+          streams_[i] = &std::cin;
+          sources_[i] = "stdin";
+          has_stdin = true;
+        } else {
+          FSTERROR() << "STListReader::STListReader: stdin should only "
+                     << "appear once in the input file list.";
+          error_ = true;
+          return;
+        }
+      } else {
+        streams_[i] = new ifstream(
+            filenames[i].c_str(), ifstream::in | ifstream::binary);
+      }
+      int32 magic_number = 0, file_version = 0;
+      ReadType(*streams_[i], &magic_number);
+      ReadType(*streams_[i], &file_version);
+      if (magic_number != kSTListMagicNumber) {
+        FSTERROR() << "STListReader::STListReader: wrong file type: "
+                   << filenames[i];
+        error_ = true;
+        return;
+      }
+      if (file_version != kSTListFileVersion) {
+        FSTERROR() << "STListReader::STListReader: wrong file version: "
+                   << filenames[i];
+        error_ = true;
+        return;
+      }
+      string key;
+      ReadType(*streams_[i], &key);
+      // An empty key is the end-of-list marker: such a stream has no entry.
+      if (!key.empty())
+        heap_.push(make_pair(key, i));
+      if (!*streams_[i]) {
+        FSTERROR() << "STListReader: error reading file: " << sources_[i];
+        error_ = true;
+        return;
+      }
+    }
+    if (heap_.empty()) return;
+    ReadTopEntry();
+  }
+
+  ~STListReader() {
+    for (size_t i = 0; i < streams_.size(); ++i) {
+      if (streams_[i] != &std::cin)
+        delete streams_[i];
+    }
+    if (entry_)
+      delete entry_;
+  }
+
+  // Both overloads return a new reader owned by the caller.
+  static STListReader<T, R> *Open(const string &filename) {
+    vector<string> filenames;
+    filenames.push_back(filename);
+    return new STListReader<T, R>(filenames);
+  }
+
+  static STListReader<T, R> *Open(const vector<string> &filenames) {
+    return new STListReader<T, R>(filenames);
+  }
+
+  // Unsupported: reports an error and sets the error flag.
+  void Reset() {
+    FSTERROR()
+        << "STListReader::Reset: stlist does not support reset operation";
+    error_ = true;
+  }
+
+  // Unsupported: reports an error, sets the error flag and returns false.
+  bool Find(const string &key) {
+    FSTERROR()
+        << "STListReader::Find: stlist does not support find operation";
+    error_ = true;
+    return false;
+  }
+
+  bool Done() const {
+    return error_ || heap_.empty();
+  }
+
+  // Advances to the entry with the next smallest key. Does nothing once the
+  // reader is done; calling top()/pop() on an empty heap would be undefined.
+  void Next() {
+    if (error_ || heap_.empty()) return;
+    size_t current = heap_.top().second;
+    string key;
+    heap_.pop();
+    // The stream of the entry just consumed is positioned at its next key.
+    ReadType(*(streams_[current]), &key);
+    if (!*streams_[current]) {
+      FSTERROR() << "STListReader: error reading file: "
+                 << sources_[current];
+      error_ = true;
+      return;
+    }
+    if (!key.empty())
+      heap_.push(make_pair(key, current));
+
+    if (!heap_.empty())
+      ReadTopEntry();
+  }
+
+  // Only valid while Done() is false.
+  const string &GetKey() const {
+    return heap_.top().first;
+  }
+
+  // Only valid while Done() is false.
+  const EntryType &GetEntry() const {
+    return *entry_;
+  }
+
+  bool Error() const { return error_; }
+
+ private:
+  // Replaces 'entry_' by the entry read from the stream whose key is at the
+  // top of the heap; sets the error flag if that read fails. The heap must
+  // not be empty.
+  void ReadTopEntry() {
+    const size_t current = heap_.top().second;
+    if (entry_)
+      delete entry_;
+    entry_ = entry_reader_(*streams_[current]);
+    if (!entry_ || !*streams_[current]) {
+      FSTERROR() << "STListReader: error reading entry for key: "
+                 << heap_.top().first << ", file: " << sources_[current];
+      error_ = true;
+    }
+  }
+
+  EntryReader entry_reader_;   // Read functor for 'EntryType'
+  vector<istream*> streams_;   // Input streams
+  vector<string> sources_;     // and corresponding file names
+  priority_queue<
+    pair<string, size_t>, vector<pair<string, size_t> >,
+    greater<pair<string, size_t> > > heap_;  // (Key, stream id) heap
+  mutable EntryType *entry_;   // Pointer to the currently read entry
+  bool error_;
+
+  DISALLOW_COPY_AND_ASSIGN(STListReader);
+};
+
+
+// String-type list header reading function template on the entry header
+// type 'H' having a member function:
+//   Read(istream &strm, const string &filename);
+// Checks that 'filename' is an STList and calls H::Read() on the first
+// entry in the STList, passing "<filename>:<key>" as the source name.
+// Returns false, after logging the reason, if the file cannot be opened,
+// is not an STList of the supported version or cannot be read.
+// Does not support reading from stdin.
+template <class H>
+bool ReadSTListHeader(const string &filename, H *header) {
+  if (filename.empty()) {
+    LOG(ERROR) << "ReadSTListHeader: reading header not supported on stdin";
+    return false;
+  }
+  ifstream strm(filename.c_str(), ifstream::in | ifstream::binary);
+  // Report an unreadable file as such rather than as a wrong file type.
+  if (!strm) {
+    LOG(ERROR) << "ReadSTListHeader: could not open file: " << filename;
+    return false;
+  }
+  int32 magic_number = 0, file_version = 0;
+  ReadType(strm, &magic_number);
+  ReadType(strm, &file_version);
+  if (magic_number != kSTListMagicNumber) {
+    LOG(ERROR) << "ReadSTListHeader: wrong file type: " << filename;
+    return false;
+  }
+  if (file_version != kSTListFileVersion) {
+    LOG(ERROR) << "ReadSTListHeader: wrong file version: " << filename;
+    return false;
+  }
+  // The first key directly follows the file header.
+  string key;
+  ReadType(strm, &key);
+  header->Read(strm, filename + ":" + key);
+  if (!strm) {
+    LOG(ERROR) << "ReadSTListHeader: error reading file: " << filename;
+    return false;
+  }
+  return true;
+}
+
+bool IsSTList(const string &filename);
+
+} // namespace fst
+
+#endif // FST_EXTENSIONS_FAR_STLIST_H_
diff --git a/src/include/fst/extensions/far/sttable.h b/src/include/fst/extensions/far/sttable.h
new file mode 100644
index 0000000..3a03133
--- /dev/null
+++ b/src/include/fst/extensions/far/sttable.h
@@ -0,0 +1,370 @@
+// sttable.h
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: allauzen@google.com (Cyril Allauzen)
+//
+// \file
+// A generic string-to-type table file format
+//
+// This is not meant as a generalization of SSTable. This is more of
+// a simple replacement for SSTable in order to provide an open-source
+// implementation of the FAR format for the external version of the
+// FST Library.
+
+#ifndef FST_EXTENSIONS_FAR_STTABLE_H_
+#define FST_EXTENSIONS_FAR_STTABLE_H_
+
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <fst/util.h>
+
+namespace fst {
+
+static const int32 kSTTableMagicNumber = 2125656924;
+static const int32 kSTTableFileVersion = 1;
+
+// String-to-type table writing class for objects of type 'T' using functor
+// 'W' to write an object of type 'T' to a stream. 'W' must conform to the
+// following interface:
+//
+//   struct Writer {
+//     void operator()(ostream &, const T &) const;
+//   };
+//
+// The file starts with the magic number and file version, continues with one
+// (key, entry) pair per Add() call and ends with the list of positions of
+// those pairs followed by their count, both written from the destructor.
+template <class T, class W>
+class STTableWriter {
+ public:
+  typedef T EntryType;
+  typedef W EntryWriter;
+
+  // Opens 'filename' for binary output and writes the file header; sets the
+  // error flag if the stream is not usable afterwards.
+  explicit STTableWriter(const string &filename)
+      : stream_(filename.c_str(), ofstream::out | ofstream::binary),
+        error_(false) {
+    WriteType(stream_, kSTTableMagicNumber);
+    WriteType(stream_, kSTTableFileVersion);
+    if (stream_) return;
+    FSTERROR() << "STTableWriter::STTableWriter: error writing to file: "
+               << filename;
+    error_=true;
+  }
+
+  // Returns a new writer owned by the caller, or 0 for an empty file name
+  // since standard output cannot hold a table.
+  static STTableWriter<T, W> *Create(const string &filename) {
+    if (!filename.empty())
+      return new STTableWriter<T, W>(filename);
+    LOG(ERROR) << "STTableWriter: writing to standard out unsupported.";
+    return 0;
+  }
+
+  // Appends the pair ('key', 't') and records where it starts. Keys must be
+  // non-empty and must not decrease between calls; a violation sets the
+  // error flag, after which nothing more is written.
+  void Add(const string &key, const T &t) {
+    if (key.empty()) {
+      FSTERROR() << "STTableWriter::Add: key empty: " << key;
+      error_ = true;
+      return;
+    }
+    if (key < last_key_) {
+      FSTERROR() << "STTableWriter::Add: key disorder: " << key;
+      error_ = true;
+      return;
+    }
+    if (error_) return;  // An earlier failure makes the writer inert
+    last_key_ = key;
+    positions_.push_back(stream_.tellp());
+    WriteType(stream_, key);
+    entry_writer_(stream_, t);
+  }
+
+  bool Error() const { return error_; }
+
+  // Writes the position index and then the number of entries.
+  ~STTableWriter() {
+    const int64 num_entries = static_cast<int64>(positions_.size());
+    WriteType(stream_, positions_);
+    WriteType(stream_, num_entries);
+  }
+
+ private:
+  EntryWriter entry_writer_;  // Write functor for 'EntryType'
+  ofstream stream_;           // Output stream
+  vector<int64> positions_;   // Position in file of each key-entry pair
+  string last_key_;           // Last key
+  bool error_;
+
+  DISALLOW_COPY_AND_ASSIGN(STTableWriter);
+};
+
+
+// String-to-type table reading class for objects of type 'T' using functor
+// 'R' to read an object of type 'T' from a stream. 'R' must conform to the
+// following interface:
+//
+//   struct Reader {
+//     T *operator()(istream &) const;
+//   };
+//
+// Several tables can be opened at once; their entries are visited in
+// increasing key order by merging the tables through a heap of stream IDs
+// ordered by each stream's lowest unread key. Tables without any entry are
+// accepted and simply contribute nothing.
+template <class T, class R>
+class STTableReader {
+ public:
+  typedef T EntryType;
+  typedef R EntryReader;
+
+  // Opens every file, validates its header and loads its position index,
+  // then positions the reader on the entry with the smallest key. On failure
+  // the error flag is set and construction stops at the offending file.
+  explicit STTableReader(const vector<string> &filenames)
+      : sources_(filenames), current_(0), entry_(0), error_(false) {
+    compare_ = new Compare(&keys_);
+    keys_.resize(filenames.size());
+    streams_.resize(filenames.size(), 0);
+    positions_.resize(filenames.size());
+    for (size_t i = 0; i < filenames.size(); ++i) {
+      streams_[i] = new ifstream(
+          filenames[i].c_str(), ifstream::in | ifstream::binary);
+      int32 magic_number = 0, file_version = 0;
+      ReadType(*streams_[i], &magic_number);
+      ReadType(*streams_[i], &file_version);
+      if (magic_number != kSTTableMagicNumber) {
+        FSTERROR() << "STTableReader::STTableReader: wrong file type: "
+                   << filenames[i];
+        error_ = true;
+        return;
+      }
+      if (file_version != kSTTableFileVersion) {
+        FSTERROR() << "STTableReader::STTableReader: wrong file version: "
+                   << filenames[i];
+        error_ = true;
+        return;
+      }
+      // The file ends with the position index followed by the entry count.
+      int64 num_entries = 0;
+      streams_[i]->seekg(-static_cast<int>(sizeof(int64)), ios_base::end);
+      ReadType(*streams_[i], &num_entries);
+      // Validate the count before it is used as a size and a seek offset.
+      if (!*streams_[i] || num_entries < 0) {
+        FSTERROR() << "STTableReader::STTableReader: error reading file: "
+                   << filenames[i];
+        error_ = true;
+        return;
+      }
+      // An empty table has no index to load and no first entry to seek to.
+      if (num_entries == 0) continue;
+      streams_[i]->seekg(-static_cast<int>(sizeof(int64)) *
+                         (num_entries + 1), ios_base::end);
+      positions_[i].resize(num_entries);
+      for (int64 j = 0; (j < num_entries) && (*streams_[i]); ++j)
+        ReadType(*streams_[i], &(positions_[i][j]));
+      streams_[i]->seekg(positions_[i][0]);
+      if (!*streams_[i]) {
+        FSTERROR() << "STTableReader::STTableReader: error reading file: "
+                   << filenames[i];
+        error_ = true;
+        return;
+      }
+    }
+    MakeHeap();
+  }
+
+  ~STTableReader() {
+    for (size_t i = 0; i < streams_.size(); ++i)
+      delete streams_[i];
+    delete compare_;
+    if (entry_)
+      delete entry_;
+  }
+
+  // Returns a new reader owned by the caller, or 0 for an empty file name
+  // since a table cannot be read from standard input.
+  static STTableReader<T, R> *Open(const string &filename) {
+    if (filename.empty()) {
+      LOG(ERROR) << "STTableReader: reading from standard in not supported";
+      return 0;
+    }
+    vector<string> filenames;
+    filenames.push_back(filename);
+    return new STTableReader<T, R>(filenames);
+  }
+
+  static STTableReader<T, R> *Open(const vector<string> &filenames) {
+    return new STTableReader<T, R>(filenames);
+  }
+
+  // Repositions every stream on its first entry and restarts the merge.
+  void Reset() {
+    if (error_) return;
+    for (size_t i = 0; i < streams_.size(); ++i) {
+      if (positions_[i].empty()) continue;  // Nothing to rewind to
+      streams_[i]->seekg(positions_[i].front());
+    }
+    MakeHeap();
+  }
+
+  // Repositions every stream with LowerBound() and restarts the merge.
+  // Returns true iff the reader then sits on an entry whose key is 'key'.
+  bool Find(const string &key) {
+    if (error_) return false;
+    for (size_t i = 0; i < streams_.size(); ++i)
+      LowerBound(i, key);
+    MakeHeap();
+    // Without a current entry there is no key to compare against.
+    if (error_ || heap_.empty()) return false;
+    return keys_[current_] == key;
+  }
+
+  bool Done() const { return error_ || heap_.empty(); }
+
+  // Advances to the entry with the next smallest key. Does nothing once the
+  // reader is done.
+  void Next() {
+    if (error_ || heap_.empty()) return;
+    // The current stream sits right after the entry just consumed; it has
+    // more entries iff that is not past the start of its last entry.
+    if (streams_[current_]->tellg() <= positions_[current_].back()) {
+      ReadType(*(streams_[current_]), &(keys_[current_]));
+      if (!*streams_[current_]) {
+        FSTERROR() << "STTableReader: error reading file: "
+                   << sources_[current_];
+        error_ = true;
+        return;
+      }
+      push_heap(heap_.begin(), heap_.end(), *compare_);
+    } else {
+      heap_.pop_back();
+    }
+    if (!heap_.empty())
+      PopHeap();
+  }
+
+  // Only valid while Done() is false.
+  const string &GetKey() const {
+    return keys_[current_];
+  }
+
+  // Only valid while Done() is false.
+  const EntryType &GetEntry() const {
+    return *entry_;
+  }
+
+  bool Error() const { return error_; }
+
+ private:
+  // Comparison functor used to compare stream IDs in the heap. It orders by
+  // "greater key" so that the standard heap functions keep the stream with
+  // the smallest key at the front.
+  struct Compare {
+    Compare(const vector<string> *keys) : keys_(keys) {}
+
+    bool operator()(size_t i, size_t j) const {
+      return (*keys_)[i] > (*keys_)[j];
+    }
+
+   private:
+    const vector<string> *keys_;
+  };
+
+  // Position the stream with ID 'id' at the position corresponding
+  // to the lower bound for key 'find_key'. Streams without entries are
+  // left untouched.
+  void LowerBound(size_t id, const string &find_key) {
+    ifstream *strm = streams_[id];
+    const vector<int64> &positions = positions_[id];
+    if (positions.empty()) return;  // 'size() - 1' below would wrap around
+    size_t low = 0, high = positions.size() - 1;
+
+    while (low < high) {
+      size_t mid = (low + high)/2;
+      strm->seekg(positions[mid]);
+      string key;
+      ReadType(*strm, &key);
+      if (key > find_key) {
+        high = mid;
+      } else if (key < find_key) {
+        low = mid + 1;
+      } else {
+        // Keys may repeat: walk back to the first entry with this key.
+        for (size_t i = mid; i > low; --i) {
+          strm->seekg(positions[i - 1]);
+          ReadType(*strm, &key);
+          if (key != find_key) {
+            strm->seekg(positions[i]);
+            return;
+          }
+        }
+        strm->seekg(positions[low]);
+        return;
+      }
+    }
+    strm->seekg(positions[low]);
+  }
+
+  // Add all streams that have entries to the heap, reading the key each of
+  // them is positioned on, and make the smallest one current.
+  void MakeHeap() {
+    heap_.clear();
+    for (size_t i = 0; i < streams_.size(); ++i) {
+      if (positions_[i].empty()) continue;  // No key to read from this one
+      ReadType(*streams_[i], &(keys_[i]));
+      if (!*streams_[i]) {
+        FSTERROR() << "STTableReader: error reading file: " << sources_[i];
+        error_ = true;
+        return;
+      }
+      heap_.push_back(i);
+    }
+    if (heap_.empty()) return;  // PopHeap() requires a non-empty heap
+    make_heap(heap_.begin(), heap_.end(), *compare_);
+    PopHeap();
+  }
+
+  // Position the stream with the lowest key at the top
+  // of the heap, set 'current_' to the ID of that stream
+  // and read the current entry from that stream. The heap must not be empty.
+  void PopHeap() {
+    pop_heap(heap_.begin(), heap_.end(), *compare_);
+    current_ = heap_.back();
+    if (entry_)
+      delete entry_;
+    entry_ = entry_reader_(*streams_[current_]);
+    if (!entry_)
+      error_ = true;
+    if (!*streams_[current_]) {
+      FSTERROR() << "STTableReader: error reading entry for key: "
+                 << keys_[current_] << ", file: " << sources_[current_];
+      error_ = true;
+    }
+  }
+
+
+  EntryReader entry_reader_;   // Read functor for 'EntryType'
+  vector<ifstream*> streams_;  // Input streams
+  vector<string> sources_;     // and corresponding file names
+  vector<vector<int64> > positions_;  // Index of positions for each stream
+  vector<string> keys_;  // Lowest unread key for each stream
+  vector<int64> heap_;   // Heap containing ID of streams with unread keys
+  int64 current_;        // Id of current stream to be read
+  Compare *compare_;     // Functor comparing stream IDs for the heap
+  mutable EntryType *entry_;  // Pointer to the currently read entry
+  bool error_;
+
+  DISALLOW_COPY_AND_ASSIGN(STTableReader);
+};
+
+
+// String-to-type table header reading function template on the entry header
+// type 'H' having a member function:
+//   Read(istream &strm, const string &filename);
+// Checks that 'filename' is an STTable and calls H::Read() on the last
+// entry in the STTable, passing "<filename>:<key>" as the source name.
+// Returns true without calling H::Read() when the table has no entries, and
+// false, after logging the reason, on any failure.
+template <class H>
+bool ReadSTTableHeader(const string &filename, H *header) {
+  ifstream strm(filename.c_str(), ifstream::in | ifstream::binary);
+  int32 magic_number = 0;
+  int32 file_version = 0;
+  ReadType(strm, &magic_number);
+  ReadType(strm, &file_version);
+  if (magic_number != kSTTableMagicNumber) {
+    LOG(ERROR) << "ReadSTTableHeader: wrong file type: " << filename;
+    return false;
+  }
+  if (file_version != kSTTableFileVersion) {
+    LOG(ERROR) << "ReadSTTableHeader: wrong file version: " << filename;
+    return false;
+  }
+
+  // The last word of the file is the number of entries; the word before it
+  // is the position of the last entry.
+  const int kWordSize = static_cast<int>(sizeof(int64));
+  int64 num_entries = -1;
+  strm.seekg(-kWordSize, ios_base::end);
+  ReadType(strm, &num_entries);
+  if (!strm) {
+    LOG(ERROR) << "ReadSTTableHeader: error reading file: " << filename;
+    return false;
+  }
+  if (num_entries == 0) return true;  // No entry header to read
+
+  int64 last_position = num_entries;
+  strm.seekg(-2 * kWordSize, ios_base::end);
+  ReadType(strm, &last_position);
+  strm.seekg(last_position);
+  string key;
+  ReadType(strm, &key);
+  header->Read(strm, filename + ":" + key);
+  if (strm) return true;
+  LOG(ERROR) << "ReadSTTableHeader: error reading file: " << filename;
+  return false;
+}
+
+bool IsSTTable(const string &filename);
+
+} // namespace fst
+
+#endif // FST_EXTENSIONS_FAR_STTABLE_H_