From dfd8b8327b93660601d016cdc6f29f433b45a8d8 Mon Sep 17 00:00:00 2001 From: Alexander Gutkin Date: Wed, 12 Sep 2012 18:11:43 +0100 Subject: Updated OpenFST version to openfst-1.3.2-CL32004048 from Greco3. Change-Id: I19b0db718256b35c0e3e5a7315f1ed6335e6dcac --- AUTHORS | 1 + Android.mk | 2 - LICENCE | 13 - LICENSE | 13 + NEWS | 21 +- README | 6 +- README.android | 5 +- configure | 87 ++- configure.ac | 12 +- fix_headers.sh | 21 + src/bin/fstarcsort.cc | 2 +- src/bin/fstclosure.cc | 2 +- src/bin/fstcompile.cc | 13 +- src/bin/fstcompose.cc | 2 +- src/bin/fstconcat.cc | 2 +- src/bin/fstconnect.cc | 2 +- src/bin/fstconvert.cc | 2 +- src/bin/fstdeterminize.cc | 2 +- src/bin/fstdifference.cc | 2 +- src/bin/fstdraw.cc | 23 +- src/bin/fstencode.cc | 2 +- src/bin/fstepsnormalize.cc | 2 +- src/bin/fstequal.cc | 2 +- src/bin/fstequivalent.cc | 2 +- src/bin/fstinfo.cc | 2 +- src/bin/fstintersect.cc | 2 +- src/bin/fstinvert.cc | 2 +- src/bin/fstmap.cc | 2 +- src/bin/fstminimize.cc | 2 +- src/bin/fstprint.cc | 13 +- src/bin/fstproject.cc | 2 +- src/bin/fstprune.cc | 2 +- src/bin/fstpush.cc | 2 +- src/bin/fstrandgen.cc | 2 +- src/bin/fstrelabel.cc | 15 +- src/bin/fstreplace.cc | 2 +- src/bin/fstreverse.cc | 3 +- src/bin/fstreweight.cc | 2 +- src/bin/fstrmepsilon.cc | 2 +- src/bin/fstshortestdistance.cc | 2 +- src/bin/fstshortestpath.cc | 2 +- src/bin/fstsymbols.cc | 11 +- src/bin/fstsynchronize.cc | 2 +- src/bin/fsttopsort.cc | 2 +- src/bin/fstunion.cc | 3 +- src/extensions/Makefile.am | 6 +- src/extensions/Makefile.in | 5 +- src/extensions/far/Makefile.am | 7 +- src/extensions/far/Makefile.in | 32 +- src/extensions/far/compile-strings.cc | 36 - src/extensions/far/farcompilestrings.cc | 34 +- src/extensions/far/farcreate.cc | 22 +- src/extensions/far/farequal.cc | 57 ++ src/extensions/far/farextract.cc | 12 +- src/extensions/far/farinfo.cc | 13 +- src/extensions/far/farprintstrings.cc | 21 +- src/extensions/far/farscript.cc | 28 +- src/extensions/far/main.cc | 18 +- src/extensions/far/strings.cc | 40 + src/extensions/ngram/Makefile.am | 10 + src/extensions/ngram/Makefile.in | 543 ++++++++++++++ src/extensions/ngram/bitmap-index.cc | 187 +++++ src/extensions/ngram/ngram-fst.cc | 28 + src/extensions/ngram/nthbit.cc | 107 +++ src/extensions/pdt/pdtcompose.cc | 2 +- src/extensions/pdt/pdtexpand.cc | 6 +- src/extensions/pdt/pdtinfo.cc | 2 +- src/extensions/pdt/pdtreplace.cc | 6 +- src/extensions/pdt/pdtreverse.cc | 2 +- src/extensions/pdt/pdtshortestpath.cc | 6 +- src/include/Makefile.am | 19 +- src/include/Makefile.in | 29 +- src/include/fst/accumulator.h | 22 +- src/include/fst/arc.h | 1 + src/include/fst/arcsort.h | 16 +- src/include/fst/bi-table.h | 81 +- src/include/fst/cache.h | 9 +- src/include/fst/compact-fst.h | 152 +++- src/include/fst/compat.h | 23 +- src/include/fst/compose.h | 68 +- src/include/fst/config.h | 2 +- src/include/fst/const-fst.h | 30 +- src/include/fst/dfs-visit.h | 3 +- src/include/fst/expanded-fst.h | 4 +- src/include/fst/extensions/far/compile-strings.h | 61 +- src/include/fst/extensions/far/equal.h | 99 +++ src/include/fst/extensions/far/extract.h | 2 +- src/include/fst/extensions/far/far.h | 184 ++++- src/include/fst/extensions/far/farscript.h | 51 +- src/include/fst/extensions/far/info.h | 2 +- src/include/fst/extensions/far/print-strings.h | 28 +- src/include/fst/extensions/far/stlist.h | 9 +- src/include/fst/extensions/far/sttable.h | 1 + src/include/fst/extensions/ngram/bitmap-index.h | 183 +++++ src/include/fst/extensions/ngram/ngram-fst.h | 912 +++++++++++++++++++++++ src/include/fst/extensions/ngram/nthbit.h | 46 ++ src/include/fst/extensions/pdt/collection.h | 33 +- src/include/fst/extensions/pdt/info.h | 2 +- src/include/fst/extensions/pdt/paren.h | 30 +- src/include/fst/extensions/pdt/shortest-path.h | 14 +- src/include/fst/flags.h | 70 +- src/include/fst/float-weight.h | 91 +-- src/include/fst/fst.h | 17 +- src/include/fst/fstlib.h | 2 + src/include/fst/icu.h | 135 ++-- src/include/fst/lock.h | 11 + src/include/fst/lookahead-matcher.h | 25 +- src/include/fst/matcher.h | 72 +- src/include/fst/mutable-fst.h | 2 +- src/include/fst/queue.h | 4 +- src/include/fst/register.h | 1 + src/include/fst/rmepsilon.h | 1 - src/include/fst/script/compile-impl.h | 1 + src/include/fst/script/draw-impl.h | 6 +- src/include/fst/script/draw.h | 1 + src/include/fst/script/fst-class.h | 43 +- src/include/fst/script/text-io.h | 1 + src/include/fst/script/weight-class.h | 6 +- src/include/fst/shortest-distance.h | 2 +- src/include/fst/signed-log-weight.h | 12 +- src/include/fst/slist.h | 2 +- src/include/fst/state-map.h | 8 +- src/include/fst/string.h | 3 +- src/include/fst/symbol-table.h | 66 +- src/include/fst/test-properties.h | 20 +- src/include/fst/util.h | 7 +- src/include/fst/vector-fst.h | 10 +- src/include/fst/visit.h | 3 +- src/include/fst/weight.h | 26 +- src/lib/flags.cc | 75 +- src/lib/properties.cc | 5 +- src/lib/symbol-table.cc | 53 +- src/lib/temp_Android.temp_mk | 20 - src/lib/util.cc | 4 +- src/script/fst-class.cc | 10 +- src/script/text-io.cc | 4 +- src/test/algo_test.cc | 2 +- src/test/fst_test.cc | 2 +- src/test/fst_test.h | 6 +- src/test/weight_test.cc | 2 +- 140 files changed, 3754 insertions(+), 741 deletions(-) delete mode 100644 LICENCE create mode 100644 LICENSE create mode 100755 fix_headers.sh delete mode 100644 src/extensions/far/compile-strings.cc create mode 100644 src/extensions/far/farequal.cc create mode 100644 src/extensions/far/strings.cc create mode 100644 src/extensions/ngram/Makefile.am create mode 100644 src/extensions/ngram/Makefile.in create mode 100644 src/extensions/ngram/bitmap-index.cc create mode 100644 src/extensions/ngram/ngram-fst.cc create mode 100644 src/extensions/ngram/nthbit.cc create mode 100644 src/include/fst/extensions/far/equal.h create mode 100644 src/include/fst/extensions/ngram/bitmap-index.h create mode 100644 src/include/fst/extensions/ngram/ngram-fst.h create mode 100644 src/include/fst/extensions/ngram/nthbit.h delete mode 100644 src/lib/temp_Android.temp_mk diff --git a/AUTHORS b/AUTHORS index ac92ec6..2a8b3e4 100644 --- a/AUTHORS +++ b/AUTHORS @@ -8,6 +8,7 @@ Contributors: Boulos Harb Dan Bikel +Jeff Sorensen Johan Schalkwyk Kasturi Raghavan Masha Shugrina diff --git a/Android.mk b/Android.mk index 0748c91..51989dc 100644 --- a/Android.mk +++ b/Android.mk @@ -1,9 +1,7 @@ -# # Copyright 2012 Google Inc. All Rights Reserved. # Author: npereira@google.com (Nicole Pereira) # # Android makefile for openfst library. -# OPENFST_DIR := $(call my-dir) diff --git a/LICENCE b/LICENCE deleted file mode 100644 index d216912..0000000 --- a/LICENCE +++ /dev/null @@ -1,13 +0,0 @@ -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use these files except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -Copyright 2005-2010 Google, Inc. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d216912 --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use these files except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Copyright 2005-2010 Google, Inc. diff --git a/NEWS b/NEWS index 0152c55..530d7b4 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,24 @@ -OpenFst - Release 1.2 +OpenFst - Release 1.3 + * Support for non-fatal exits on errors: (1.3.1) + * Added FLAGS_fst_error_fatal: FST errors are + fatal if true (default); o.w. return objects flagged as bad: + e.g., FSTs - kError + prop. true, FST weights - not a Member(). + * Added kError property bit signifying bad FST + * Added NoWeight() method to FST weight requirements that returns + weight that is not a Member(). + * Various improvements to the FAR extensions (1.3.1) + * a single FST is now a FAR type + * FLAGS_initial_symbols: Uses the symbol table from the + first FST in the archive for all entries" + * Input/output to standard input/output for some FAR and arc types + * --with-icu configuration option no longer needed (1.3.1) + * Improved flags usage esp. if use SET_FLAGS not SetFlags/InitFst (1.3.2) + * Added 'fst' as possible far writer type (1.3.2) + * phi matcher can now accept 0 as the phi label (1.3.2) + * Added ngram-fst extension (1.3.2) +OpenFst - Release 1.2 * Added lookahead matching and filtering for faster composition * Added EditFst for mutation of o.w. immutable FSTs * Added script sub-namespace defining type FstClass - a non-templated diff --git a/README b/README index a4643d7..aff9e2f 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -OpenFst - Release 1.2 +OpenFst - Release 1.3 OpenFst is a library for constructing, combining, optimizing, and searching weighted finite-state transducers (FSTs). @@ -11,7 +11,7 @@ REQUIREMENTS: INSTALLATION: Follow the generic GNU build system instructions in ./INSTALL. We - recommend configuring with =--enable-static=no= for faster + recommend configuring with --enable-static=no for faster compiles. Optional features: @@ -20,8 +20,8 @@ INSTALLATION: --enable-const-fsts Enable all ConstFst classes (def: no) --enable-far Enable FAR (FST Archive) extension (def: no) --enable-lookahead-fsts Enable LookAheadFst classes (def: no) + --enable-ngram-fsts Enable NGramFst classes (def: no) --enable-pdt Experimental push-down transducer extensions (def: no) - --with-icu Use ICU to provide unicode support (def: no) Configuring with --enable-bin=no gives very fast compiles, but excludes the command line utilities. diff --git a/README.android b/README.android index 6d822ec..1e90058 100644 --- a/README.android +++ b/README.android @@ -1,4 +1,3 @@ -Openfst openfst-1.2.10 -Updated to Google CL27738347 +Copy of openfst-1.3.2-CL32004048 from Greco3. -Modified to use stlport, Android.mk file added. \ No newline at end of file +Modified to use stlport, Android.mk file added. diff --git a/configure b/configure index 38c7241..0bd3a30 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.65 for OpenFst 1.2.10. +# Generated by GNU Autoconf 2.65 for OpenFst 1.3.2. # # Report bugs to . # @@ -701,8 +701,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='OpenFst' PACKAGE_TARNAME='openfst' -PACKAGE_VERSION='1.2.10' -PACKAGE_STRING='OpenFst 1.2.10' +PACKAGE_VERSION='1.3.2' +PACKAGE_STRING='OpenFst 1.3.2' PACKAGE_BUGREPORT='help@www.openfst.org' PACKAGE_URL='' @@ -757,6 +757,8 @@ HAVE_PDT_FALSE HAVE_PDT_TRUE HAVE_LOOKAHEAD_FALSE HAVE_LOOKAHEAD_TRUE +HAVE_NGRAM_FALSE +HAVE_NGRAM_TRUE HAVE_FAR_FALSE HAVE_FAR_TRUE HAVE_CONST_FALSE @@ -893,6 +895,7 @@ enable_bin enable_compact_fsts enable_const_fsts enable_far +enable_ngram_fsts enable_lookahead_fsts enable_pdt with_icu @@ -1451,7 +1454,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures OpenFst 1.2.10 to adapt to many kinds of systems. +\`configure' configures OpenFst 1.3.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1521,7 +1524,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of OpenFst 1.2.10:";; + short | recursive ) echo "Configuration of OpenFst 1.3.2:";; esac cat <<\_ACEOF @@ -1540,6 +1543,7 @@ Optional Features: --enable-compact-fsts enable all CompactFst extensions --enable-const-fsts enable all ConstFst extensions --enable-far enable FAR extensions + --enable-ngram-fsts enable NGramFst extension --enable-lookahead-fsts enable LookAheadFst extensions --enable-pdt enable PDT extensions @@ -1630,7 +1634,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -OpenFst configure 1.2.10 +OpenFst configure 1.3.2 generated by GNU Autoconf 2.65 Copyright (C) 2009 Free Software Foundation, Inc. @@ -2083,7 +2087,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by OpenFst $as_me 1.2.10, which was +It was created by OpenFst $as_me 1.3.2, which was generated by GNU Autoconf 2.65. Invocation command line was $ $0 $@ @@ -2891,7 +2895,7 @@ fi # Define the identity of the package. PACKAGE='openfst' - VERSION='1.2.10' + VERSION='1.3.2' cat >>confdefs.h <<_ACEOF @@ -5007,13 +5011,13 @@ if test "${lt_cv_nm_interface+set}" = set; then : else lt_cv_nm_interface="BSD nm" echo "int some_variable = 0;" > conftest.$ac_ext - (eval echo "\"\$as_me:5010: $ac_compile\"" >&5) + (eval echo "\"\$as_me:5014: $ac_compile\"" >&5) (eval "$ac_compile" 2>conftest.err) cat conftest.err >&5 - (eval echo "\"\$as_me:5013: $NM \\\"conftest.$ac_objext\\\"\"" >&5) + (eval echo "\"\$as_me:5017: $NM \\\"conftest.$ac_objext\\\"\"" >&5) (eval "$NM \"conftest.$ac_objext\"" 2>conftest.err > conftest.out) cat conftest.err >&5 - (eval echo "\"\$as_me:5016: output\"" >&5) + (eval echo "\"\$as_me:5020: output\"" >&5) cat conftest.out >&5 if $GREP 'External.*some_variable' conftest.out > /dev/null; then lt_cv_nm_interface="MS dumpbin" @@ -6219,7 +6223,7 @@ ia64-*-hpux*) ;; *-*-irix6*) # Find out which ABI we are using. - echo '#line 6222 "configure"' > conftest.$ac_ext + echo '#line 6226 "configure"' > conftest.$ac_ext if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5 (eval $ac_compile) 2>&5 ac_status=$? @@ -8246,11 +8250,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:8249: $lt_compile\"" >&5) + (eval echo "\"\$as_me:8253: $lt_compile\"" >&5) (eval "$lt_compile" 2>conftest.err) ac_status=$? cat conftest.err >&5 - echo "$as_me:8253: \$? = $ac_status" >&5 + echo "$as_me:8257: \$? = $ac_status" >&5 if (exit $ac_status) && test -s "$ac_outfile"; then # The compiler can only warn and ignore the option if not recognized # So say no if there are warnings other than the usual output. @@ -8585,11 +8589,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:8588: $lt_compile\"" >&5) + (eval echo "\"\$as_me:8592: $lt_compile\"" >&5) (eval "$lt_compile" 2>conftest.err) ac_status=$? cat conftest.err >&5 - echo "$as_me:8592: \$? = $ac_status" >&5 + echo "$as_me:8596: \$? = $ac_status" >&5 if (exit $ac_status) && test -s "$ac_outfile"; then # The compiler can only warn and ignore the option if not recognized # So say no if there are warnings other than the usual output. @@ -8690,11 +8694,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:8693: $lt_compile\"" >&5) + (eval echo "\"\$as_me:8697: $lt_compile\"" >&5) (eval "$lt_compile" 2>out/conftest.err) ac_status=$? cat out/conftest.err >&5 - echo "$as_me:8697: \$? = $ac_status" >&5 + echo "$as_me:8701: \$? = $ac_status" >&5 if (exit $ac_status) && test -s out/conftest2.$ac_objext then # The compiler can only warn and ignore the option if not recognized @@ -8745,11 +8749,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:8748: $lt_compile\"" >&5) + (eval echo "\"\$as_me:8752: $lt_compile\"" >&5) (eval "$lt_compile" 2>out/conftest.err) ac_status=$? cat out/conftest.err >&5 - echo "$as_me:8752: \$? = $ac_status" >&5 + echo "$as_me:8756: \$? = $ac_status" >&5 if (exit $ac_status) && test -s out/conftest2.$ac_objext then # The compiler can only warn and ignore the option if not recognized @@ -11129,7 +11133,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 11132 "configure" +#line 11136 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -11225,7 +11229,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 11228 "configure" +#line 11232 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -13181,11 +13185,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:13184: $lt_compile\"" >&5) + (eval echo "\"\$as_me:13188: $lt_compile\"" >&5) (eval "$lt_compile" 2>conftest.err) ac_status=$? cat conftest.err >&5 - echo "$as_me:13188: \$? = $ac_status" >&5 + echo "$as_me:13192: \$? = $ac_status" >&5 if (exit $ac_status) && test -s "$ac_outfile"; then # The compiler can only warn and ignore the option if not recognized # So say no if there are warnings other than the usual output. @@ -13280,11 +13284,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:13283: $lt_compile\"" >&5) + (eval echo "\"\$as_me:13287: $lt_compile\"" >&5) (eval "$lt_compile" 2>out/conftest.err) ac_status=$? cat out/conftest.err >&5 - echo "$as_me:13287: \$? = $ac_status" >&5 + echo "$as_me:13291: \$? = $ac_status" >&5 if (exit $ac_status) && test -s out/conftest2.$ac_objext then # The compiler can only warn and ignore the option if not recognized @@ -13332,11 +13336,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:13335: $lt_compile\"" >&5) + (eval echo "\"\$as_me:13339: $lt_compile\"" >&5) (eval "$lt_compile" 2>out/conftest.err) ac_status=$? cat out/conftest.err >&5 - echo "$as_me:13339: \$? = $ac_status" >&5 + echo "$as_me:13343: \$? = $ac_status" >&5 if (exit $ac_status) && test -s out/conftest2.$ac_objext then # The compiler can only warn and ignore the option if not recognized @@ -14303,7 +14307,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu ac_config_headers="$ac_config_headers config.h src/include/fst/config.h" -ac_config_files="$ac_config_files Makefile src/Makefile src/include/Makefile src/lib/Makefile src/bin/Makefile src/test/Makefile src/extensions/Makefile src/extensions/compact/Makefile src/extensions/const/Makefile src/extensions/far/Makefile src/extensions/lookahead/Makefile src/extensions/pdt/Makefile src/script/Makefile" +ac_config_files="$ac_config_files Makefile src/Makefile src/include/Makefile src/lib/Makefile src/bin/Makefile src/test/Makefile src/extensions/Makefile src/extensions/compact/Makefile src/extensions/const/Makefile src/extensions/far/Makefile src/extensions/lookahead/Makefile src/extensions/ngram/Makefile src/extensions/pdt/Makefile src/script/Makefile" ac_ext=cpp @@ -14408,6 +14412,22 @@ else fi +# Check whether --enable-ngram-fsts was given. +if test "${enable_ngram_fsts+set}" = set; then : + enableval=$enable_ngram_fsts; +else + enable_ngram_fsts=no +fi + + if test "x$enable_ngram_fsts" != xno; then + HAVE_NGRAM_TRUE= + HAVE_NGRAM_FALSE='#' +else + HAVE_NGRAM_TRUE='#' + HAVE_NGRAM_FALSE= +fi + + # Check whether --enable-lookahead-fsts was given. if test "${enable_lookahead_fsts+set}" = set; then : enableval=$enable_lookahead_fsts; @@ -14706,6 +14726,10 @@ if test -z "${HAVE_FAR_TRUE}" && test -z "${HAVE_FAR_FALSE}"; then as_fn_error "conditional \"HAVE_FAR\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${HAVE_NGRAM_TRUE}" && test -z "${HAVE_NGRAM_FALSE}"; then + as_fn_error "conditional \"HAVE_NGRAM\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${HAVE_LOOKAHEAD_TRUE}" && test -z "${HAVE_LOOKAHEAD_FALSE}"; then as_fn_error "conditional \"HAVE_LOOKAHEAD\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -15122,7 +15146,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by OpenFst $as_me 1.2.10, which was +This file was extended by OpenFst $as_me 1.3.2, which was generated by GNU Autoconf 2.65. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -15188,7 +15212,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -OpenFst config.status 1.2.10 +OpenFst config.status 1.3.2 configured by $0, generated by GNU Autoconf 2.65, with options \\"\$ac_cs_config\\" @@ -15677,6 +15701,7 @@ do "src/extensions/const/Makefile") CONFIG_FILES="$CONFIG_FILES src/extensions/const/Makefile" ;; "src/extensions/far/Makefile") CONFIG_FILES="$CONFIG_FILES src/extensions/far/Makefile" ;; "src/extensions/lookahead/Makefile") CONFIG_FILES="$CONFIG_FILES src/extensions/lookahead/Makefile" ;; + "src/extensions/ngram/Makefile") CONFIG_FILES="$CONFIG_FILES src/extensions/ngram/Makefile" ;; "src/extensions/pdt/Makefile") CONFIG_FILES="$CONFIG_FILES src/extensions/pdt/Makefile" ;; "src/script/Makefile") CONFIG_FILES="$CONFIG_FILES src/script/Makefile" ;; diff --git a/configure.ac b/configure.ac index 211dcd7..908a6d5 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([OpenFst], [1.2.10], [help@www.openfst.org]) +AC_INIT([OpenFst], [1.3.2], [help@www.openfst.org]) AM_INIT_AUTOMAKE([foreign nostdinc -Wall -Werror]) AC_PROG_CXX @@ -19,6 +19,7 @@ AC_CONFIG_FILES([ src/extensions/const/Makefile src/extensions/far/Makefile src/extensions/lookahead/Makefile + src/extensions/ngram/Makefile src/extensions/pdt/Makefile src/script/Makefile ]) @@ -62,6 +63,15 @@ AC_ARG_ENABLE([far], [AS_HELP_STRING([--enable-far], [enable FAR extensions])], [], [enable_far=no]) AM_CONDITIONAL([HAVE_FAR], [test "x$enable_far" != xno]) +AC_ARG_ENABLE( + [ngram-fsts], + [AS_HELP_STRING( + [--enable-ngram-fsts], + [enable NGramFst extension])], + [], + [enable_ngram_fsts=no]) +AM_CONDITIONAL([HAVE_NGRAM], [test "x$enable_ngram_fsts" != xno]) + AC_ARG_ENABLE( [lookahead-fsts], [AS_HELP_STRING( diff --git a/fix_headers.sh b/fix_headers.sh new file mode 100755 index 0000000..b1b0844 --- /dev/null +++ b/fix_headers.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +INPUT_FILE=$1 + +function fix_tr1() { + grep $1 $INPUT_FILE > /dev/null 2>&1 + if [ $? -eq 0 ] ; then + echo "Modifying $INPUT_FILE ..." + sed -e "s/$1/$2/g" $INPUT_FILE > .tmp + if [ $? -ne 0 ] ; then + echo "sed failed!" + return 1 + fi + mv -f .tmp $INPUT_FILE + fi + return 0 +} + +fix_tr1 "tr1\/unordered_map" "unordered_map" +fix_tr1 "tr1/unordered_set" "unordered_set" +fix_tr1 "tr1::hash" "hash" diff --git a/src/bin/fstarcsort.cc b/src/bin/fstarcsort.cc index 33e534d..9f1b420 100644 --- a/src/bin/fstarcsort.cc +++ b/src/bin/fstarcsort.cc @@ -38,7 +38,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); diff --git a/src/bin/fstclosure.cc b/src/bin/fstclosure.cc index 569c708..f6b3324 100644 --- a/src/bin/fstclosure.cc +++ b/src/bin/fstclosure.cc @@ -34,7 +34,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstcompile.cc b/src/bin/fstcompile.cc index db62f95..2e0bdd4 100644 --- a/src/bin/fstcompile.cc +++ b/src/bin/fstcompile.cc @@ -45,14 +45,14 @@ int main(int argc, char **argv) { usage += " [text.fst [binary.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; } const char *source = "standard input"; - istream *istrm = &std::cin; + istream *istrm = &cin; if (argc > 1 && strcmp(argv[1], "-") != 0) { source = argv[1]; istrm = new fst::ifstream(argv[1]); @@ -63,13 +63,16 @@ int main(int argc, char **argv) { } const SymbolTable *isyms = 0, *osyms = 0, *ssyms = 0; + fst::SymbolTableTextOptions opts; + opts.allow_negative = FLAGS_allow_negative_labels; + if (!FLAGS_isymbols.empty()) { - isyms = SymbolTable::ReadText(FLAGS_isymbols, FLAGS_allow_negative_labels); + isyms = SymbolTable::ReadText(FLAGS_isymbols, opts); if (!isyms) exit(1); } if (!FLAGS_osymbols.empty()) { - osyms = SymbolTable::ReadText(FLAGS_osymbols, FLAGS_allow_negative_labels); + osyms = SymbolTable::ReadText(FLAGS_osymbols, opts); if (!osyms) exit(1); } @@ -85,7 +88,7 @@ int main(int argc, char **argv) { FLAGS_acceptor, FLAGS_keep_isymbols, FLAGS_keep_osymbols, FLAGS_keep_state_numbering, FLAGS_allow_negative_labels); - if (istrm != &std::cin) + if (istrm != &cin) delete istrm; return 0; diff --git a/src/bin/fstcompose.cc b/src/bin/fstcompose.cc index 0ba8c47..996aca7 100644 --- a/src/bin/fstcompose.cc +++ b/src/bin/fstcompose.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { usage += " in1.fst in2.fst [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 3 || argc > 4) { ShowUsage(); return 1; diff --git a/src/bin/fstconcat.cc b/src/bin/fstconcat.cc index fbe9b68..4a02205 100644 --- a/src/bin/fstconcat.cc +++ b/src/bin/fstconcat.cc @@ -34,7 +34,7 @@ int main(int argc, char **argv) { usage += " in1.fst in2.fst [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 3 || argc > 4) { ShowUsage(); return 1; diff --git a/src/bin/fstconnect.cc b/src/bin/fstconnect.cc index f774767..2900d1b 100644 --- a/src/bin/fstconnect.cc +++ b/src/bin/fstconnect.cc @@ -33,7 +33,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstconvert.cc b/src/bin/fstconvert.cc index f7c4ad9..9fbb43e 100644 --- a/src/bin/fstconvert.cc +++ b/src/bin/fstconvert.cc @@ -33,7 +33,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstdeterminize.cc b/src/bin/fstdeterminize.cc index aa5064f..ca7840e 100644 --- a/src/bin/fstdeterminize.cc +++ b/src/bin/fstdeterminize.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstdifference.cc b/src/bin/fstdifference.cc index 5459b03..8bdf89e 100644 --- a/src/bin/fstdifference.cc +++ b/src/bin/fstdifference.cc @@ -39,7 +39,7 @@ int main(int argc, char **argv) { usage += " in1.fst in2.fst [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 3 || argc > 4) { ShowUsage(); return 1; diff --git a/src/bin/fstdraw.cc b/src/bin/fstdraw.cc index 51ebb2d..98f8ff4 100644 --- a/src/bin/fstdraw.cc +++ b/src/bin/fstdraw.cc @@ -26,8 +26,6 @@ DEFINE_string(isymbols, "", "Input label symbol table"); DEFINE_string(osymbols, "", "Output label symbol table"); DEFINE_string(ssymbols, "", "State label symbol table"); DEFINE_bool(numeric, false, "Print numeric labels"); -DEFINE_string(save_isymbols, "", "Save input symbol table to file"); -DEFINE_string(save_osymbols, "", "Save output symbol table to file"); DEFINE_int32(precision, 5, "Set precision (number of char/float)"); DEFINE_bool(show_weight_one, false, "Print/draw arc weights and final weights equal to Weight::One()"); @@ -51,10 +49,10 @@ int main(int argc, char **argv) { string usage = "Prints out binary FSTs in dot text format.\n\n Usage: "; usage += argv[0]; - usage += " [binary.fst [text.fst]]\n"; + usage += " [binary.fst [text.dot]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; @@ -65,7 +63,7 @@ int main(int argc, char **argv) { s::FstClass *fst = s::FstClass::Read(in_name); if (!fst) return 1; - ostream *ostrm = &std::cout; + ostream *ostrm = &cout; string dest = "stdout"; if (argc == 3) { dest = argv[2]; @@ -79,13 +77,16 @@ int main(int argc, char **argv) { const SymbolTable *isyms = 0, *osyms = 0, *ssyms = 0; + fst::SymbolTableTextOptions opts; + opts.allow_negative = FLAGS_allow_negative_labels; + if (!FLAGS_isymbols.empty() && !FLAGS_numeric) { - isyms = SymbolTable::ReadText(FLAGS_isymbols, FLAGS_allow_negative_labels); + isyms = SymbolTable::ReadText(FLAGS_isymbols, opts); if (!isyms) exit(1); } if (!FLAGS_osymbols.empty() && !FLAGS_numeric) { - osyms = SymbolTable::ReadText(FLAGS_osymbols, FLAGS_allow_negative_labels); + osyms = SymbolTable::ReadText(FLAGS_osymbols, opts); if (!osyms) exit(1); } @@ -106,13 +107,7 @@ int main(int argc, char **argv) { FLAGS_fontsize, FLAGS_precision, FLAGS_show_weight_one, ostrm, dest); - if (isyms && !FLAGS_save_isymbols.empty()) - isyms->WriteText(FLAGS_save_isymbols); - - if (osyms && !FLAGS_save_osymbols.empty()) - osyms->WriteText(FLAGS_save_osymbols); - - if (ostrm != &std::cout) + if (ostrm != &cout) delete ostrm; return 0; } diff --git a/src/bin/fstencode.cc b/src/bin/fstencode.cc index 95dd49c..003ca24 100644 --- a/src/bin/fstencode.cc +++ b/src/bin/fstencode.cc @@ -39,7 +39,7 @@ int main(int argc, char **argv) { usage += " in.fst codex [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 3 || argc > 4) { ShowUsage(); return 1; diff --git a/src/bin/fstepsnormalize.cc b/src/bin/fstepsnormalize.cc index c8813c6..d5c1a8d 100644 --- a/src/bin/fstepsnormalize.cc +++ b/src/bin/fstepsnormalize.cc @@ -35,7 +35,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstequal.cc b/src/bin/fstequal.cc index 885b330..c64f6e3 100644 --- a/src/bin/fstequal.cc +++ b/src/bin/fstequal.cc @@ -33,7 +33,7 @@ int main(int argc, char **argv) { usage += " in1.fst in2.fst\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc != 3) { ShowUsage(); return 1; diff --git a/src/bin/fstequivalent.cc b/src/bin/fstequivalent.cc index e7e09f2..70010d7 100644 --- a/src/bin/fstequivalent.cc +++ b/src/bin/fstequivalent.cc @@ -43,7 +43,7 @@ int main(int argc, char **argv) { usage += " in1.fst in2.fst\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc != 3) { ShowUsage(); return 1; diff --git a/src/bin/fstinfo.cc b/src/bin/fstinfo.cc index 23816be..f02e0b3 100644 --- a/src/bin/fstinfo.cc +++ b/src/bin/fstinfo.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { usage += " [in.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 2) { ShowUsage(); return 1; diff --git a/src/bin/fstintersect.cc b/src/bin/fstintersect.cc index b3558fc..3bdc26b 100644 --- a/src/bin/fstintersect.cc +++ b/src/bin/fstintersect.cc @@ -40,7 +40,7 @@ int main(int argc, char **argv) { usage += " Flags: connect\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 3 || argc > 4) { ShowUsage(); return 1; diff --git a/src/bin/fstinvert.cc b/src/bin/fstinvert.cc index f9b348b..00dadc5 100644 --- a/src/bin/fstinvert.cc +++ b/src/bin/fstinvert.cc @@ -32,7 +32,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstmap.cc b/src/bin/fstmap.cc index 231c725..81336b1 100644 --- a/src/bin/fstmap.cc +++ b/src/bin/fstmap.cc @@ -43,7 +43,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstminimize.cc b/src/bin/fstminimize.cc index 8d9d635..2d4e7a9 100644 --- a/src/bin/fstminimize.cc +++ b/src/bin/fstminimize.cc @@ -36,7 +36,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out1.fst [out2.fst]]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 4) { ShowUsage(); return 1; diff --git a/src/bin/fstprint.cc b/src/bin/fstprint.cc index b42bb7e..b5b23e5 100644 --- a/src/bin/fstprint.cc +++ b/src/bin/fstprint.cc @@ -44,7 +44,7 @@ int main(int argc, char **argv) { usage += " [binary.fst [text.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; @@ -56,7 +56,7 @@ int main(int argc, char **argv) { s::FstClass *fst = s::FstClass::Read(in_name); if (!fst) return 1; - ostream *ostrm = &std::cout; + ostream *ostrm = &cout; string dest = "standard output"; if (argc == 3) { dest = argv[2]; @@ -70,13 +70,16 @@ int main(int argc, char **argv) { const SymbolTable *isyms = 0, *osyms = 0, *ssyms = 0; + fst::SymbolTableTextOptions opts; + opts.allow_negative = FLAGS_allow_negative_labels; + if (!FLAGS_isymbols.empty() && !FLAGS_numeric) { - isyms = SymbolTable::ReadText(FLAGS_isymbols, FLAGS_allow_negative_labels); + isyms = SymbolTable::ReadText(FLAGS_isymbols, opts); if (!isyms) exit(1); } if (!FLAGS_osymbols.empty() && !FLAGS_numeric) { - osyms = SymbolTable::ReadText(FLAGS_osymbols, FLAGS_allow_negative_labels); + osyms = SymbolTable::ReadText(FLAGS_osymbols, opts); if (!osyms) exit(1); } @@ -99,7 +102,7 @@ int main(int argc, char **argv) { if (osyms && !FLAGS_save_osymbols.empty()) osyms->WriteText(FLAGS_save_osymbols); - if (ostrm != &std::cout) + if (ostrm != &cout) delete ostrm; return 0; } diff --git a/src/bin/fstproject.cc b/src/bin/fstproject.cc index a67f431..efd8dbf 100644 --- a/src/bin/fstproject.cc +++ b/src/bin/fstproject.cc @@ -35,7 +35,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstprune.cc b/src/bin/fstprune.cc index 2041b7c..dedf237 100644 --- a/src/bin/fstprune.cc +++ b/src/bin/fstprune.cc @@ -38,7 +38,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstpush.cc b/src/bin/fstpush.cc index 3b849b5..d40ff9f 100644 --- a/src/bin/fstpush.cc +++ b/src/bin/fstpush.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstrandgen.cc b/src/bin/fstrandgen.cc index f48d0cf..b3716e5 100644 --- a/src/bin/fstrandgen.cc +++ b/src/bin/fstrandgen.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstrelabel.cc b/src/bin/fstrelabel.cc index cc86450..e75db99 100644 --- a/src/bin/fstrelabel.cc +++ b/src/bin/fstrelabel.cc @@ -43,6 +43,7 @@ DEFINE_bool(allow_negative_labels, false, int main(int argc, char **argv) { namespace s = fst::script; using fst::SymbolTable; + using fst::SymbolTableTextOptions; using fst::script::FstClass; using fst::script::MutableFstClass; @@ -58,7 +59,7 @@ int main(int argc, char **argv) { usage += " -relabel_opairs opairs.txts\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; @@ -71,24 +72,24 @@ int main(int argc, char **argv) { if (!fst) return 1; // Relabel with symbol tables + SymbolTableTextOptions opts; + opts.allow_negative = FLAGS_allow_negative_labels; if (!FLAGS_relabel_isymbols.empty() || !FLAGS_relabel_osymbols.empty()) { bool attach_new_isymbols = (fst->InputSymbols() != 0); const SymbolTable* old_isymbols = FLAGS_isymbols.empty() ? fst->InputSymbols() - : SymbolTable::ReadText(FLAGS_isymbols, FLAGS_allow_negative_labels); + : SymbolTable::ReadText(FLAGS_isymbols, opts); const SymbolTable* relabel_isymbols = FLAGS_relabel_isymbols.empty() ? NULL - : SymbolTable::ReadText(FLAGS_relabel_isymbols, - FLAGS_allow_negative_labels); + : SymbolTable::ReadText(FLAGS_relabel_isymbols, opts); bool attach_new_osymbols = (fst->OutputSymbols() != 0); const SymbolTable* old_osymbols = FLAGS_osymbols.empty() ? fst->OutputSymbols() - : SymbolTable::ReadText(FLAGS_osymbols, FLAGS_allow_negative_labels); + : SymbolTable::ReadText(FLAGS_osymbols, opts); const SymbolTable* relabel_osymbols = FLAGS_relabel_osymbols.empty() ? NULL - : SymbolTable::ReadText(FLAGS_relabel_osymbols, - FLAGS_allow_negative_labels); + : SymbolTable::ReadText(FLAGS_relabel_osymbols, opts); s::Relabel(fst, old_isymbols, relabel_isymbols, attach_new_isymbols, diff --git a/src/bin/fstreplace.cc b/src/bin/fstreplace.cc index 5d1d770..d6522fc 100644 --- a/src/bin/fstreplace.cc +++ b/src/bin/fstreplace.cc @@ -31,7 +31,7 @@ int main(int argc, char **argv) { usage += " root.fst rootlabel [rule1.fst label1 ...] [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 4) { ShowUsage(); return 1; diff --git a/src/bin/fstreverse.cc b/src/bin/fstreverse.cc index 7507ffa..a919f58 100644 --- a/src/bin/fstreverse.cc +++ b/src/bin/fstreverse.cc @@ -26,6 +26,7 @@ #include #include #include +#include int main(int argc, char **argv) { using fst::script::FstClass; @@ -37,7 +38,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstreweight.cc b/src/bin/fstreweight.cc index 3913a6d..4e8333d 100644 --- a/src/bin/fstreweight.cc +++ b/src/bin/fstreweight.cc @@ -35,7 +35,7 @@ int main(int argc, char **argv) { usage += " in.fst potential.txt [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 3 || argc > 4) { ShowUsage(); return 1; diff --git a/src/bin/fstrmepsilon.cc b/src/bin/fstrmepsilon.cc index 63a80b8..da4e6d9 100644 --- a/src/bin/fstrmepsilon.cc +++ b/src/bin/fstrmepsilon.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstshortestdistance.cc b/src/bin/fstshortestdistance.cc index 01f5617..9103b72 100644 --- a/src/bin/fstshortestdistance.cc +++ b/src/bin/fstshortestdistance.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { usage += " [in.fst [distance.txt]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstshortestpath.cc b/src/bin/fstshortestpath.cc index 4a9e720..6e33db8 100644 --- a/src/bin/fstshortestpath.cc +++ b/src/bin/fstshortestpath.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstsymbols.cc b/src/bin/fstsymbols.cc index c07f1be..23a6447 100644 --- a/src/bin/fstsymbols.cc +++ b/src/bin/fstsymbols.cc @@ -48,7 +48,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; @@ -78,17 +78,18 @@ int main(int argc, char **argv) { } } + fst::SymbolTableTextOptions opts; + opts.allow_negative = FLAGS_allow_negative_labels; + if (FLAGS_clear_isymbols) fst->SetInputSymbols(0); else if (!FLAGS_isymbols.empty()) - fst->SetInputSymbols( - SymbolTable::ReadText(FLAGS_isymbols, FLAGS_allow_negative_labels)); + fst->SetInputSymbols(SymbolTable::ReadText(FLAGS_isymbols, opts)); if (FLAGS_clear_osymbols) fst->SetOutputSymbols(0); else if (!FLAGS_osymbols.empty()) - fst->SetOutputSymbols( - SymbolTable::ReadText(FLAGS_osymbols, FLAGS_allow_negative_labels)); + fst->SetOutputSymbols(SymbolTable::ReadText(FLAGS_osymbols, opts)); if (!FLAGS_relabel_ipairs.empty()) { typedef int64 Label; diff --git a/src/bin/fstsynchronize.cc b/src/bin/fstsynchronize.cc index 7137504..8c226ad 100644 --- a/src/bin/fstsynchronize.cc +++ b/src/bin/fstsynchronize.cc @@ -32,7 +32,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fsttopsort.cc b/src/bin/fsttopsort.cc index 6f04215..69896c7 100644 --- a/src/bin/fsttopsort.cc +++ b/src/bin/fsttopsort.cc @@ -32,7 +32,7 @@ int main(int argc, char **argv) { usage += " [in.fst [out.fst]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/bin/fstunion.cc b/src/bin/fstunion.cc index 41c4f83..ae7eb66 100644 --- a/src/bin/fstunion.cc +++ b/src/bin/fstunion.cc @@ -25,6 +25,7 @@ #include #include #include +#include int main(int argc, char **argv) { using fst::script::FstClass; @@ -36,7 +37,7 @@ int main(int argc, char **argv) { usage += " in1.fst in2.fst [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 3 || argc > 4) { ShowUsage(); return 1; diff --git a/src/extensions/Makefile.am b/src/extensions/Makefile.am index 64d5483..d982c15 100644 --- a/src/extensions/Makefile.am +++ b/src/extensions/Makefile.am @@ -14,8 +14,12 @@ if HAVE_LOOKAHEAD lookaheaddir = lookahead endif +if HAVE_NGRAM +ngramdir = ngram +endif + if HAVE_PDT pdtdir = pdt endif -SUBDIRS = $(compactdir) $(constdir) $(fardir) $(lookaheaddir) $(pdtdir) +SUBDIRS = $(compactdir) $(constdir) $(fardir) $(lookaheaddir) $(ngramdir) $(pdtdir) diff --git a/src/extensions/Makefile.in b/src/extensions/Makefile.in index 43a74c8..6bcf056 100644 --- a/src/extensions/Makefile.in +++ b/src/extensions/Makefile.in @@ -63,7 +63,7 @@ AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ distdir ETAGS = etags CTAGS = ctags -DIST_SUBDIRS = compact const far lookahead pdt +DIST_SUBDIRS = compact const far lookahead ngram pdt DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) am__relativize = \ dir0=`pwd`; \ @@ -216,8 +216,9 @@ top_srcdir = @top_srcdir@ @HAVE_CONST_TRUE@constdir = const @HAVE_FAR_TRUE@fardir = far @HAVE_LOOKAHEAD_TRUE@lookaheaddir = lookahead +@HAVE_NGRAM_TRUE@ngramdir = ngram @HAVE_PDT_TRUE@pdtdir = pdt -SUBDIRS = $(compactdir) $(constdir) $(fardir) $(lookaheaddir) $(pdtdir) +SUBDIRS = $(compactdir) $(constdir) $(fardir) $(lookaheaddir) $(ngramdir) $(pdtdir) all: all-recursive .SUFFIXES: diff --git a/src/extensions/far/Makefile.am b/src/extensions/far/Makefile.am index 61f7cc8..a7c2fff 100644 --- a/src/extensions/far/Makefile.am +++ b/src/extensions/far/Makefile.am @@ -12,12 +12,13 @@ libfstfar_la_SOURCES = sttable.cc stlist.cc libfstfar_la_LDFLAGS = -version-info 0:0:0 if HAVE_SCRIPT -libfstfarscript_la_SOURCES = farscript.cc compile-strings.cc main.cc +libfstfarscript_la_SOURCES = farscript.cc strings.cc main.cc libfstfarscript_la_LDFLAGS = -version-info 0:0:0 endif if HAVE_BIN -bin_PROGRAMS = farcompilestrings farcreate farextract farinfo farprintstrings +bin_PROGRAMS = farcompilestrings farcreate farequal farextract farinfo \ + farprintstrings LDADD = libfstfarscript.la libfstfar.la ../../script/libfstscript.la \ ../../lib/libfst.la -lm -ldl @@ -26,6 +27,8 @@ farcompilestrings_SOURCES = farcompilestrings.cc farcreate_SOURCES = farcreate.cc +farequal_SOURCES = farequal.cc + farextract_SOURCES = farextract.cc farinfo_SOURCES = farinfo.cc diff --git a/src/extensions/far/Makefile.in b/src/extensions/far/Makefile.in index 8e05cab..0635556 100644 --- a/src/extensions/far/Makefile.in +++ b/src/extensions/far/Makefile.in @@ -36,8 +36,9 @@ POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ @HAVE_BIN_TRUE@bin_PROGRAMS = farcompilestrings$(EXEEXT) \ -@HAVE_BIN_TRUE@ farcreate$(EXEEXT) farextract$(EXEEXT) \ -@HAVE_BIN_TRUE@ farinfo$(EXEEXT) farprintstrings$(EXEEXT) +@HAVE_BIN_TRUE@ farcreate$(EXEEXT) farequal$(EXEEXT) \ +@HAVE_BIN_TRUE@ farextract$(EXEEXT) farinfo$(EXEEXT) \ +@HAVE_BIN_TRUE@ farprintstrings$(EXEEXT) subdir = src/extensions/far DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 @@ -84,10 +85,9 @@ libfstfar_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \ @HAVE_SCRIPT_FALSE@am_libfstfar_la_rpath = -rpath $(libfstdir) @HAVE_SCRIPT_TRUE@am_libfstfar_la_rpath = -rpath $(libfstdir) libfstfarscript_la_LIBADD = -am__libfstfarscript_la_SOURCES_DIST = farscript.cc compile-strings.cc \ - main.cc +am__libfstfarscript_la_SOURCES_DIST = farscript.cc strings.cc main.cc @HAVE_SCRIPT_TRUE@am_libfstfarscript_la_OBJECTS = farscript.lo \ -@HAVE_SCRIPT_TRUE@ compile-strings.lo main.lo +@HAVE_SCRIPT_TRUE@ strings.lo main.lo libfstfarscript_la_OBJECTS = $(am_libfstfarscript_la_OBJECTS) libfstfarscript_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ @@ -109,6 +109,13 @@ farcreate_LDADD = $(LDADD) @HAVE_BIN_TRUE@farcreate_DEPENDENCIES = libfstfarscript.la \ @HAVE_BIN_TRUE@ libfstfar.la ../../script/libfstscript.la \ @HAVE_BIN_TRUE@ ../../lib/libfst.la +am__farequal_SOURCES_DIST = farequal.cc +@HAVE_BIN_TRUE@am_farequal_OBJECTS = farequal.$(OBJEXT) +farequal_OBJECTS = $(am_farequal_OBJECTS) +farequal_LDADD = $(LDADD) +@HAVE_BIN_TRUE@farequal_DEPENDENCIES = libfstfarscript.la libfstfar.la \ +@HAVE_BIN_TRUE@ ../../script/libfstscript.la \ +@HAVE_BIN_TRUE@ ../../lib/libfst.la am__farextract_SOURCES_DIST = farextract.cc @HAVE_BIN_TRUE@am_farextract_OBJECTS = farextract.$(OBJEXT) farextract_OBJECTS = $(am_farextract_OBJECTS) @@ -145,13 +152,13 @@ CXXLINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ $(LDFLAGS) -o $@ SOURCES = $(libfstfar_la_SOURCES) $(libfstfarscript_la_SOURCES) \ $(farcompilestrings_SOURCES) $(farcreate_SOURCES) \ - $(farextract_SOURCES) $(farinfo_SOURCES) \ + $(farequal_SOURCES) $(farextract_SOURCES) $(farinfo_SOURCES) \ $(farprintstrings_SOURCES) DIST_SOURCES = $(libfstfar_la_SOURCES) \ $(am__libfstfarscript_la_SOURCES_DIST) \ $(am__farcompilestrings_SOURCES_DIST) \ - $(am__farcreate_SOURCES_DIST) $(am__farextract_SOURCES_DIST) \ - $(am__farinfo_SOURCES_DIST) \ + $(am__farcreate_SOURCES_DIST) $(am__farequal_SOURCES_DIST) \ + $(am__farextract_SOURCES_DIST) $(am__farinfo_SOURCES_DIST) \ $(am__farprintstrings_SOURCES_DIST) ETAGS = etags CTAGS = ctags @@ -283,13 +290,14 @@ AM_CPPFLAGS = -I$(srcdir)/../../include $(ICU_CPPFLAGS) @HAVE_SCRIPT_TRUE@libfst_LTLIBRARIES = libfstfarscript.la libfstfar.la libfstfar_la_SOURCES = sttable.cc stlist.cc libfstfar_la_LDFLAGS = -version-info 0:0:0 -@HAVE_SCRIPT_TRUE@libfstfarscript_la_SOURCES = farscript.cc compile-strings.cc main.cc +@HAVE_SCRIPT_TRUE@libfstfarscript_la_SOURCES = farscript.cc strings.cc main.cc @HAVE_SCRIPT_TRUE@libfstfarscript_la_LDFLAGS = -version-info 0:0:0 @HAVE_BIN_TRUE@LDADD = libfstfarscript.la libfstfar.la ../../script/libfstscript.la \ @HAVE_BIN_TRUE@ ../../lib/libfst.la -lm -ldl @HAVE_BIN_TRUE@farcompilestrings_SOURCES = farcompilestrings.cc @HAVE_BIN_TRUE@farcreate_SOURCES = farcreate.cc +@HAVE_BIN_TRUE@farequal_SOURCES = farequal.cc @HAVE_BIN_TRUE@farextract_SOURCES = farextract.cc @HAVE_BIN_TRUE@farinfo_SOURCES = farinfo.cc @HAVE_BIN_TRUE@farprintstrings_SOURCES = farprintstrings.cc @@ -411,6 +419,9 @@ farcompilestrings$(EXEEXT): $(farcompilestrings_OBJECTS) $(farcompilestrings_DEP farcreate$(EXEEXT): $(farcreate_OBJECTS) $(farcreate_DEPENDENCIES) @rm -f farcreate$(EXEEXT) $(CXXLINK) $(farcreate_OBJECTS) $(farcreate_LDADD) $(LIBS) +farequal$(EXEEXT): $(farequal_OBJECTS) $(farequal_DEPENDENCIES) + @rm -f farequal$(EXEEXT) + $(CXXLINK) $(farequal_OBJECTS) $(farequal_LDADD) $(LIBS) farextract$(EXEEXT): $(farextract_OBJECTS) $(farextract_DEPENDENCIES) @rm -f farextract$(EXEEXT) $(CXXLINK) $(farextract_OBJECTS) $(farextract_LDADD) $(LIBS) @@ -427,15 +438,16 @@ mostlyclean-compile: distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/compile-strings.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/farcompilestrings.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/farcreate.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/farequal.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/farextract.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/farinfo.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/farprintstrings.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/farscript.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stlist.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strings.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sttable.Plo@am__quote@ .cc.o: diff --git a/src/extensions/far/compile-strings.cc b/src/extensions/far/compile-strings.cc deleted file mode 100644 index e8a99cc..0000000 --- a/src/extensions/far/compile-strings.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Copyright 2005-2010 Google, Inc. -// Author: jpr@google.com (Jake Ratkiewicz) - -#include -#include -#include - -namespace fst { - -// Compute the minimal length required to -// encode each line number as a decimal number -int KeySize(const char *filename) { - ifstream istrm(filename); - istrm.seekg(0); - string s; - int nline = 0; - while (getline(istrm, s)) - ++nline; - istrm.seekg(0); - return nline ? ceil(log10(nline + 1)) : 1; -} - -} // namespace fst diff --git a/src/extensions/far/farcompilestrings.cc b/src/extensions/far/farcompilestrings.cc index 0c9b352..765b908 100644 --- a/src/extensions/far/farcompilestrings.cc +++ b/src/extensions/far/farcompilestrings.cc @@ -25,12 +25,15 @@ #include #include #include +#include DEFINE_string(key_prefix, "", "Prefix to append to keys"); DEFINE_string(key_suffix, "", "Suffix to append to keys"); DEFINE_int32(generate_keys, 0, "Generate N digit numeric keys (def: use file basenames)"); -DEFINE_string(far_type, "default", "FAR file format type: one of: "); +DEFINE_string(far_type, "default", + "FAR file format type: one of: \"default\", \"fst\"" + "\"stlist\", \"sttable\""); DEFINE_bool(allow_negative_labels, false, "Allow negative labels (not recommended; may cause conflicts)"); DEFINE_string(arc_type, "standard", "Output arc type"); @@ -43,7 +46,11 @@ DEFINE_string(symbols, "", "Label symbol table"); DEFINE_string(unknown_symbol, "", ""); DEFINE_bool(file_list_input, false, "Each input files contains a list of files to be processed"); - +DEFINE_bool(keep_symbols, false, + "Store symbol table in Far file"); +DEFINE_bool(initial_symbols, true, + "When keep_symbols==true, stores symbol table only for the first" + " Fst in archive."); int main(int argc, char **argv) { namespace s = fst::script; @@ -51,23 +58,19 @@ int main(int argc, char **argv) { string usage = "Compiles a set of strings as FSTs and stores them in"; usage += " a finite-state archive.\n\n Usage:"; usage += argv[0]; - usage += " in1.txt [in2.txt ...] out.far\n"; + usage += " [in1.txt [[in2.txt ...] out.far]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); - - if (argc < 3) { - ShowUsage(); - return 1; - } - - vector in_fnames(argc - 2); + SET_FLAGS(usage.c_str(), &argc, &argv, true); - for (unsigned i = 1; i < argc - 1; ++i) { - in_fnames[i - 1] = argv[i]; - } + vector in_fnames; + for (unsigned i = 1; i < argc - 1; ++i) + in_fnames.push_back(strcmp(argv[i], "") != 0 ? argv[i] : ""); + if (in_fnames.empty()) + in_fnames.push_back(argc == 2 && strcmp(argv[1], "-") != 0 ? argv[1] : ""); - string out_fname = argv[argc - 1]; + string out_fname = + argc > 2 && strcmp(argv[argc - 1], "-") != 0 ? argv[argc - 1] : ""; fst::FarEntryType fet = fst::StringToFarEntryType(FLAGS_entry_type); fst::FarTokenType ftt = fst::StringToFarTokenType(FLAGS_token_type); @@ -76,6 +79,7 @@ int main(int argc, char **argv) { s::FarCompileStrings(in_fnames, out_fname, FLAGS_arc_type, FLAGS_fst_type, far_type, FLAGS_generate_keys, fet, ftt, FLAGS_symbols, FLAGS_unknown_symbol, + FLAGS_keep_symbols, FLAGS_initial_symbols, FLAGS_allow_negative_labels, FLAGS_file_list_input, FLAGS_key_prefix, FLAGS_key_suffix); diff --git a/src/extensions/far/farcreate.cc b/src/extensions/far/farcreate.cc index 009fc34..9ddbec3 100644 --- a/src/extensions/far/farcreate.cc +++ b/src/extensions/far/farcreate.cc @@ -29,7 +29,8 @@ DEFINE_string(key_suffix, "", "Suffix to append to keys"); DEFINE_int32(generate_keys, 0, "Generate N digit numeric keys (def: use file basenames)"); DEFINE_string(far_type, "default", - "FAR file format type: one of: \"default\", \"stlist\", \"sstable\", \"sttable\""); + "FAR file format type: one of: \"default\", " + "\"stlist\", \"sttable\""); DEFINE_bool(file_list_input, false, "Each input files contains a list of files to be processed"); @@ -38,21 +39,20 @@ int main(int argc, char **argv) { string usage = "Creates a finite-state archive from input FSTs.\n\n Usage:"; usage += argv[0]; - usage += " in1.fst [in2.fst ...] out.far\n"; + usage += " [in1.fst [[in2.fst ...] out.far]]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); - - if (argc < 3) { - ShowUsage(); - return 1; - } + SET_FLAGS(usage.c_str(), &argc, &argv, true); vector in_fnames; - for (int i = 1; i < argc - 1; ++i) - in_fnames.push_back(argv[i]); + for (unsigned i = 1; i < argc - 1; ++i) + in_fnames.push_back(strcmp(argv[i], "") != 0 ? argv[i] : ""); + if (in_fnames.empty()) + in_fnames.push_back(argc == 2 && strcmp(argv[1], "-") != 0 ? argv[1] : ""); + + string out_fname = + argc > 2 && strcmp(argv[argc - 1], "-") != 0 ? argv[argc - 1] : ""; - string out_fname = argv[argc - 1]; string arc_type = fst::LoadArcTypeFromFst(in_fnames[0]); fst::FarType far_type = fst::FarTypeFromString(FLAGS_far_type); diff --git a/src/extensions/far/farequal.cc b/src/extensions/far/farequal.cc new file mode 100644 index 0000000..94e607c --- /dev/null +++ b/src/extensions/far/farequal.cc @@ -0,0 +1,57 @@ +// farequal.cc + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: allauzen@google.com (Cyril Allauzen) +// +// \file +// Tests if two Far files contains the same (key,fst) pairs. + +#include +#include + +DEFINE_string(begin_key, "", + "First key to extract (def: first key in archive)"); +DEFINE_string(end_key, "", + "Last key to extract (def: last key in archive)"); +DEFINE_double(delta, fst::kDelta, "Comparison/quantization delta"); + +int main(int argc, char **argv) { + namespace s = fst::script; + + string usage = "Prints some basic information about the FSTs in an FST "; + usage += "archive.\n\n Usage:"; + usage += argv[0]; + usage += " in1.far in2.far\n"; + usage += " Flags: begin_key end_key"; + + std::set_new_handler(FailedNewHandler); + SET_FLAGS(usage.c_str(), &argc, &argv, true); + + if (argc != 3) { + ShowUsage(); + return 1; + } + + string filename1(argv[1]), filename2(argv[2]); + + bool result = s::FarEqual( + filename1, filename2, fst::LoadArcTypeFromFar(filename1), + FLAGS_delta, FLAGS_begin_key, FLAGS_end_key); + + if (!result) + VLOG(1) << "FARs are not equal."; + + return result ? 0 : 2; +} diff --git a/src/extensions/far/farextract.cc b/src/extensions/far/farextract.cc index 72b2d15..66b7840 100644 --- a/src/extensions/far/farextract.cc +++ b/src/extensions/far/farextract.cc @@ -38,19 +38,15 @@ int main(int argc, char **argv) { string usage = "Extracts FSTs from a finite-state archive.\n\n Usage:"; usage += argv[0]; - usage += " in1.far [in2.far...]\n"; + usage += " [in1.far in2.far...]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); - - if (argc < 2) { - ShowUsage(); - return 1; - } + SET_FLAGS(usage.c_str(), &argc, &argv, true); vector ifilenames; for (int i = 1; i < argc; ++i) - ifilenames.push_back(argv[i]); + ifilenames.push_back(strcmp(argv[i], "") != 0 ? argv[i] : ""); + if (ifilenames.empty()) ifilenames.push_back(""); const string &arc_type = fst::LoadArcTypeFromFar(ifilenames[0]); diff --git a/src/extensions/far/farinfo.cc b/src/extensions/far/farinfo.cc index e8036a9..2665141 100644 --- a/src/extensions/far/farinfo.cc +++ b/src/extensions/far/farinfo.cc @@ -36,20 +36,17 @@ int main(int argc, char **argv) { string usage = "Prints some basic information about the FSTs in an FST "; usage += "archive.\n\n Usage:"; usage += argv[0]; - usage += " in1.far [in2.far...]\n"; + usage += " [in1.far in2.far...]\n"; usage += " Flags: begin_key end_key list_fsts"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); - - if (argc < 2) { - ShowUsage(); - return 1; - } + SET_FLAGS(usage.c_str(), &argc, &argv, true); vector filenames; for (int i = 1; i < argc; ++i) - filenames.push_back(argv[i]); + filenames.push_back(strcmp(argv[i], "") != 0 ? argv[i] : ""); + if (filenames.empty()) + filenames.push_back(""); s::FarInfo(filenames, fst::LoadArcTypeFromFar(filenames[0]), FLAGS_begin_key, FLAGS_end_key, FLAGS_list_fsts); diff --git a/src/extensions/far/farprintstrings.cc b/src/extensions/far/farprintstrings.cc index ab7d52b..0204ab7 100644 --- a/src/extensions/far/farprintstrings.cc +++ b/src/extensions/far/farprintstrings.cc @@ -31,11 +31,14 @@ DEFINE_string(end_key, "", "Last key to extract (def: last key in archive)"); // PrintStringsMain specific flag definitions. DEFINE_bool(print_key, false, "Prefix each string by its key"); +DEFINE_bool(print_weight, false, "Suffix each string by its weight"); DEFINE_string(entry_type, "line", "Entry type: one of : " "\"file\" (one FST per file), \"line\" (one FST per line)"); DEFINE_string(token_type, "symbol", "Token type: one of : " "\"symbol\", \"byte\", \"utf8\""); DEFINE_string(symbols, "", "Label symbol table"); +DEFINE_bool(initial_symbols, true, + "Uses symbol table from the first Fst in archive for all entries."); int main(int argc, char **argv) { @@ -43,27 +46,25 @@ int main(int argc, char **argv) { string usage = "Print as string the string FSTs in an archive.\n\n Usage:"; usage += argv[0]; - usage += " in1.far [in2.far ...]\n"; + usage += " [in1.far in2.far ...]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); - - if (argc < 2) { - ShowUsage(); - return 1; - } + SET_FLAGS(usage.c_str(), &argc, &argv, true); vector ifilenames; for (int i = 1; i < argc; ++i) - ifilenames.push_back(argv[i]); + ifilenames.push_back(strcmp(argv[i], "") != 0 ? argv[i] : ""); + if (ifilenames.empty()) ifilenames.push_back(""); string arc_type = fst::LoadArcTypeFromFar(ifilenames[0]); s::FarPrintStrings(ifilenames, arc_type, fst::StringToFarEntryType(FLAGS_entry_type), fst::StringToFarTokenType(FLAGS_token_type), - FLAGS_begin_key, FLAGS_end_key, FLAGS_print_key, - FLAGS_symbols, FLAGS_generate_filenames, + FLAGS_begin_key, FLAGS_end_key, + FLAGS_print_key, FLAGS_print_weight, + FLAGS_symbols, FLAGS_initial_symbols, + FLAGS_generate_filenames, FLAGS_filename_prefix, FLAGS_filename_suffix); return 0; diff --git a/src/extensions/far/farscript.cc b/src/extensions/far/farscript.cc index a04645d..bc4ad0e 100644 --- a/src/extensions/far/farscript.cc +++ b/src/extensions/far/farscript.cc @@ -34,14 +34,17 @@ void FarCompileStrings(const vector &in_fnames, FarTokenType tt, const string &symbols_fname, const string &unknown_symbol, + bool keep_symbols, + bool initial_symbols, bool allow_negative_labels, bool file_list_input, const string &key_prefix, const string &key_suffix) { FarCompileStringsArgs args(in_fnames, out_fname, fst_type, far_type, generate_keys, fet, tt, symbols_fname, - unknown_symbol, allow_negative_labels, - file_list_input, key_prefix, key_suffix); + unknown_symbol, keep_symbols, initial_symbols, + allow_negative_labels, file_list_input, + key_prefix, key_suffix); Apply >("FarCompileStrings", arc_type, &args); @@ -61,6 +64,16 @@ void FarCreate(const vector &in_fnames, Apply >("FarCreate", arc_type, &args); } +bool FarEqual(const string &filename1, const string &filename2, + const string &arc_type, float delta, + const string &begin_key, const string &end_key) { + FarEqualInnerArgs args(filename1, filename2, delta, begin_key, end_key); + FarEqualArgs args_with_retval(args); + + Apply >("FarEqual", arc_type, &args_with_retval); + return args_with_retval.retval; +} + void FarExtract(const vector &ifilenames, const string &arc_type, int32 generate_filenames, const string &begin_key, @@ -89,15 +102,16 @@ void FarPrintStrings(const vector &ifilenames, const string &begin_key, const string &end_key, const bool print_key, + const bool print_weight, const string &symbols_fname, + const bool initial_symbols, const int32 generate_filenames, const string &filename_prefix, const string &filename_suffix) { - FarPrintStringsArgs args(ifilenames, entry_type, token_type, begin_key, - end_key, print_key, symbols_fname, - generate_filenames, - filename_prefix, - filename_suffix); + FarPrintStringsArgs args(ifilenames, entry_type, token_type, + begin_key, end_key, print_key, print_weight, + symbols_fname, initial_symbols, generate_filenames, + filename_prefix, filename_suffix); Apply >("FarPrintStrings", arc_type, &args); diff --git a/src/extensions/far/main.cc b/src/extensions/far/main.cc index b01d639..855def6 100644 --- a/src/extensions/far/main.cc +++ b/src/extensions/far/main.cc @@ -26,6 +26,7 @@ using std::vector; #include #include +#include #include namespace fst { @@ -33,7 +34,9 @@ namespace fst { // Return the 'FarType' value corresponding to a far type name. FarType FarTypeFromString(const string &str) { FarType type = FAR_DEFAULT; - if (str == "stlist") + if (str == "fst") + type = FAR_FST; + else if (str == "stlist") type = FAR_STLIST; else if (str == "sttable") type = FAR_STTABLE; @@ -46,6 +49,8 @@ FarType FarTypeFromString(const string &str) { // Return the textual name corresponding to a 'FarType;. string FarTypeToString(FarType type) { switch (type) { + case FAR_FST: + return "fst"; case FAR_STLIST: return "stlist"; case FAR_STTABLE: @@ -85,19 +90,14 @@ FarTokenType StringToFarTokenType(const string &s) { string LoadArcTypeFromFar(const string &far_fname) { FarHeader hdr; - if (far_fname.empty()) { - LOG(ERROR) << "Reading FAR from standard in not supported"; - return ""; - } - if (!hdr.Read(far_fname)) { - LOG(ERROR) << "Error reading FAR: " << far_fname; + FSTERROR() << "Error reading FAR: " << far_fname; return ""; } string atype = hdr.ArcType(); if (atype == "unknown") { - LOG(ERROR) << "Empty FST archive: " << far_fname; + FSTERROR() << "Empty FST archive: " << far_fname; return ""; } @@ -108,7 +108,7 @@ string LoadArcTypeFromFst(const string &fst_fname) { FstHeader hdr; ifstream in(fst_fname.c_str(), ifstream::in | ifstream::binary); if (!hdr.Read(in, fst_fname)) { - LOG(ERROR) << "Error reading FST: " << fst_fname; + FSTERROR() << "Error reading FST: " << fst_fname; return ""; } diff --git a/src/extensions/far/strings.cc b/src/extensions/far/strings.cc new file mode 100644 index 0000000..c62a45b --- /dev/null +++ b/src/extensions/far/strings.cc @@ -0,0 +1,40 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: jpr@google.com (Jake Ratkiewicz) + +#include +#include +#include +#include + +DEFINE_string(far_field_separator, "\t", + "Set of characters used as a separator between printed fields"); + +namespace fst { + +// Compute the minimal length required to +// encode each line number as a decimal number +int KeySize(const char *filename) { + ifstream istrm(filename); + istrm.seekg(0); + string s; + int nline = 0; + while (getline(istrm, s)) + ++nline; + istrm.seekg(0); + return nline ? ceil(log10(nline + 1)) : 1; +} + +} // namespace fst diff --git a/src/extensions/ngram/Makefile.am b/src/extensions/ngram/Makefile.am new file mode 100644 index 0000000..e909eaf --- /dev/null +++ b/src/extensions/ngram/Makefile.am @@ -0,0 +1,10 @@ +AM_CPPFLAGS = -I$(srcdir)/../../include $(ICU_CPPFLAGS) + +libfstdir = @libfstdir@ +libfst_LTLIBRARIES = libfstngram.la ngram-fst.la + +ngram_fst_la_SOURCES = bitmap-index.cc ngram-fst.cc nthbit.cc +ngram_fst_la_LDFLAGS = -module + +libfstngram_la_SOURCES = bitmap-index.cc ngram-fst.cc nthbit.cc +libfstngram_la_LDFLAGS = -version-info 0:0:0 diff --git a/src/extensions/ngram/Makefile.in b/src/extensions/ngram/Makefile.in new file mode 100644 index 0000000..cce1255 --- /dev/null +++ b/src/extensions/ngram/Makefile.in @@ -0,0 +1,543 @@ +# Makefile.in generated by automake 1.11.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, +# Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = src/extensions/ngram +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_icu.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h \ + $(top_builddir)/src/include/fst/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__installdirs = "$(DESTDIR)$(libfstdir)" +LTLIBRARIES = $(libfst_LTLIBRARIES) +libfstngram_la_LIBADD = +am_libfstngram_la_OBJECTS = bitmap-index.lo ngram-fst.lo nthbit.lo +libfstngram_la_OBJECTS = $(am_libfstngram_la_OBJECTS) +libfstngram_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(libfstngram_la_LDFLAGS) $(LDFLAGS) -o $@ +ngram_fst_la_LIBADD = +am_ngram_fst_la_OBJECTS = bitmap-index.lo ngram-fst.lo nthbit.lo +ngram_fst_la_OBJECTS = $(am_ngram_fst_la_OBJECTS) +ngram_fst_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(ngram_fst_la_LDFLAGS) $(LDFLAGS) -o $@ +DEFAULT_INCLUDES = +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +SOURCES = $(libfstngram_la_SOURCES) $(ngram_fst_la_SOURCES) +DIST_SOURCES = $(libfstngram_la_SOURCES) $(ngram_fst_la_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +ICU_CFLAGS = @ICU_CFLAGS@ +ICU_CONFIG = @ICU_CONFIG@ +ICU_CPPFLAGS = @ICU_CPPFLAGS@ +ICU_CXXFLAGS = @ICU_CXXFLAGS@ +ICU_LIBS = @ICU_LIBS@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +libfstdir = @libfstdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +lt_ECHO = @lt_ECHO@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +AM_CPPFLAGS = -I$(srcdir)/../../include $(ICU_CPPFLAGS) +libfst_LTLIBRARIES = libfstngram.la ngram-fst.la +ngram_fst_la_SOURCES = bitmap-index.cc ngram-fst.cc nthbit.cc +ngram_fst_la_LDFLAGS = -module +libfstngram_la_SOURCES = bitmap-index.cc ngram-fst.cc nthbit.cc +libfstngram_la_LDFLAGS = -version-info 0:0:0 +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/extensions/ngram/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign src/extensions/ngram/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): +install-libfstLTLIBRARIES: $(libfst_LTLIBRARIES) + @$(NORMAL_INSTALL) + test -z "$(libfstdir)" || $(MKDIR_P) "$(DESTDIR)$(libfstdir)" + @list='$(libfst_LTLIBRARIES)'; test -n "$(libfstdir)" || list=; \ + list2=; for p in $$list; do \ + if test -f $$p; then \ + list2="$$list2 $$p"; \ + else :; fi; \ + done; \ + test -z "$$list2" || { \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libfstdir)'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libfstdir)"; \ + } + +uninstall-libfstLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(libfst_LTLIBRARIES)'; test -n "$(libfstdir)" || list=; \ + for p in $$list; do \ + $(am__strip_dir) \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libfstdir)/$$f'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libfstdir)/$$f"; \ + done + +clean-libfstLTLIBRARIES: + -test -z "$(libfst_LTLIBRARIES)" || rm -f $(libfst_LTLIBRARIES) + @list='$(libfst_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" != "$$p" || dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +libfstngram.la: $(libfstngram_la_OBJECTS) $(libfstngram_la_DEPENDENCIES) + $(libfstngram_la_LINK) -rpath $(libfstdir) $(libfstngram_la_OBJECTS) $(libfstngram_la_LIBADD) $(LIBS) +ngram-fst.la: $(ngram_fst_la_OBJECTS) $(ngram_fst_la_DEPENDENCIES) + $(ngram_fst_la_LINK) -rpath $(libfstdir) $(ngram_fst_la_OBJECTS) $(ngram_fst_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitmap-index.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ngram-fst.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nthbit.Plo@am__quote@ + +.cc.o: +@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< + +.cc.obj: +@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: +@am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(LTCXXCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + set x; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: + for dir in "$(DESTDIR)$(libfstdir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libfstLTLIBRARIES clean-libtool \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: install-libfstLTLIBRARIES + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-libfstLTLIBRARIES + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libfstLTLIBRARIES clean-libtool ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am \ + install-libfstLTLIBRARIES install-man install-pdf \ + install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-libfstLTLIBRARIES + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/src/extensions/ngram/bitmap-index.cc b/src/extensions/ngram/bitmap-index.cc new file mode 100644 index 0000000..c6aaa14 --- /dev/null +++ b/src/extensions/ngram/bitmap-index.cc @@ -0,0 +1,187 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: Jeffrey Soresnen (sorenj@google.com) + +#include + +#include +#include + +#include + +namespace fst { + +// These two internal classes implemented inverted views of the +// primary and secondary indexes. That is, they provide iterators +// that have operator*'s that return the number zeros rather than +// the number of ones. + +class primary_index_inverted : public vector::const_iterator { + public: + primary_index_inverted() {} + primary_index_inverted(vector::const_iterator loc, + vector::const_iterator begin) : + vector::const_iterator(loc), begin_(begin) {} + uint32 operator*() { + return BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize * + (1 + std::distance::const_iterator>(begin_, *this)) - + vector::const_iterator::operator*(); + } + private: + vector::const_iterator begin_; +}; + +class secondary_index_inverted : public vector::const_iterator { + public: + secondary_index_inverted() : vector::const_iterator() {} + secondary_index_inverted(vector::const_iterator loc, + vector::const_iterator block_begin) : + vector::const_iterator(loc), block_begin_(block_begin) {} + uint16 operator*() { + return ((1 + std::distance::const_iterator>( + block_begin_, *this)) << BitmapIndex::kStorageLogBitSize) - + vector::const_iterator::operator*(); + } + private: + vector::const_iterator block_begin_; +}; + +size_t BitmapIndex::Rank1(size_t end) const { + if (end == 0) return 0; + CHECK_LE(end, Bits()); + const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize; + const uint32 sum = get_index_ones_count(end_word); + const uint64 zero = 0; + const uint64 ones = ~zero; + return sum + __builtin_popcountll(bits_[end_word] & + (ones >> (kStorageBitSize - (end & kStorageBlockMask)))); +} + +size_t BitmapIndex::Select1(size_t bit_index) const { + if (bit_index >= GetOnesCount()) return Bits(); + // search primary index for the relevant block + uint32 rembits = bit_index + 1; + const uint32 block = find_primary_block(bit_index + 1); + uint32 offset = 0; + if (block > 0) { + rembits -= primary_index_[block - 1]; + offset += block * kSecondaryBlockSize; + } + // search the secondary index + uint32 word = find_secondary_block(offset, rembits); + if (word > 0) { + rembits -= secondary_index_[offset + word - 1]; + offset += word; + } + int nth = nth_bit(bits_[offset], rembits); + return (offset << BitmapIndex::kStorageLogBitSize) + nth; +} + +size_t BitmapIndex::Select0(size_t bit_index) const { + if (bit_index >= Bits() - GetOnesCount()) return Bits(); + // search inverted primary index for relevant block + uint32 remzeros = bit_index + 1; + uint32 offset = 0; + const uint32 block = find_inverted_primary_block(bit_index + 1); + if (block > 0) { + remzeros -= *primary_index_inverted(primary_index_.begin() + block - 1, + primary_index_.begin()); + offset += block * kSecondaryBlockSize; + } + // search the inverted secondary index + uint32 word = find_inverted_secondary_block(offset, remzeros); + if (word > 0) { + vector::const_iterator block_begin = + secondary_index_.begin() + offset; + remzeros -= *secondary_index_inverted(block_begin + word - 1, block_begin); + offset += word; + } + int nth = nth_bit(~bits_[offset], remzeros); + return (offset << BitmapIndex::kStorageLogBitSize) + nth; +} + +size_t BitmapIndex::get_index_ones_count(size_t array_index) const { + uint32 sum = 0; + if (array_index > 0) { + sum += secondary_index_[array_index-1]; + uint32 end_block = (array_index - 1) / kSecondaryBlockSize; + if (end_block > 0) sum += primary_index_[end_block-1]; + } + return sum; +} + +void BitmapIndex::BuildIndex(const uint64 *bits, size_t size) { + bits_ = bits; + size_ = size; + secondary_index_.clear(); + secondary_index_.reserve(ArraySize()); + primary_index_.clear(); + primary_index_.reserve(primary_index_size()); + const uint64 zero = 0; + const uint64 ones = ~zero; + uint32 popcount = 0; + for (uint32 block_begin = 0; block_begin < ArraySize(); + block_begin += kSecondaryBlockSize) { + uint32 block_popcount = 0; + uint32 block_end = block_begin + kSecondaryBlockSize; + if (block_end > ArraySize()) block_end = ArraySize(); + for (uint32 j = block_begin; j < block_end; ++j) { + uint64 mask = ones; + if (j == ArraySize() - 1) { + mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask); + } + block_popcount += __builtin_popcountll(bits_[j] & mask); + secondary_index_.push_back(block_popcount); + } + popcount += block_popcount; + primary_index_.push_back(popcount); + } +} + +size_t BitmapIndex::find_secondary_block( + size_t block_begin, size_t rem_bit_index) const { + size_t block_end = block_begin + kSecondaryBlockSize; + if (block_end > secondary_index_.size()) block_end = secondary_index_.size(); + return std::distance(secondary_index_.begin() + block_begin, + std::lower_bound(secondary_index_.begin() + block_begin, + secondary_index_.begin() + block_end, + rem_bit_index)); +} + +size_t BitmapIndex::find_inverted_secondary_block( + size_t block_begin, size_t rem_bit_index) const { + size_t block_end = block_begin + kSecondaryBlockSize; + if (block_end > secondary_index_.size()) block_end = secondary_index_.size(); + secondary_index_inverted start(secondary_index_.begin() + block_begin, + secondary_index_.begin() + block_begin); + secondary_index_inverted end(secondary_index_.begin() + block_end, + secondary_index_.begin() + block_begin); + return std::distance(start, + std::lower_bound(start, end, rem_bit_index)); +} + +inline size_t BitmapIndex::find_primary_block(size_t bit_index) const { + return std::distance(primary_index_.begin(), + std::lower_bound(primary_index_.begin(), + primary_index_.end(), bit_index)); +} + +size_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const { + primary_index_inverted start(primary_index_.begin(), primary_index_.begin()); + primary_index_inverted end(primary_index_.end(), primary_index_.begin()); + return std::distance(start, std::lower_bound(start, end, bit_index)); +} + +} // end namespace fst diff --git a/src/extensions/ngram/ngram-fst.cc b/src/extensions/ngram/ngram-fst.cc new file mode 100644 index 0000000..18f88db --- /dev/null +++ b/src/extensions/ngram/ngram-fst.cc @@ -0,0 +1,28 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: sorenj@google.com (Jeffrey Sorensen) + +#include + +#include + +#include + +using fst::NGramFst; +using fst::StdArc; +using fst::LogArc; + +REGISTER_FST(NGramFst, StdArc); +REGISTER_FST(NGramFst, LogArc); diff --git a/src/extensions/ngram/nthbit.cc b/src/extensions/ngram/nthbit.cc new file mode 100644 index 0000000..7b006e7 --- /dev/null +++ b/src/extensions/ngram/nthbit.cc @@ -0,0 +1,107 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: sorenj@google.com (Jeffrey Sorensen + +#include + +// +// This table is generate using: +// +// unsigned int nth_bit_scan(uint64 v, unsigned int r) { +// int i=0; +// for (; i<64; i++) { +// if ((r -= v & 1) == 0) return i; +// v >>= 1; +// } +// return i; +// } +// +// for (size_t i = 0; i < 256; ++i) { +// uint32 offsets = 0; +// for (size_t b = 1; b <= 8; ++b) { +// uint32 offset = min(nth_bit_scan(i, b), 8); +// offsets |= (offset << ((b - 1) << 2)); +// } +// bit_offset = offsets; +// printf("0x%x, ", bit_offset); +// if (i % 4 == 3) printf("\n"); +// } +// +uint32 nth_bit_bit_offset[] = { +0x88888888, 0x88888880, 0x88888881, 0x88888810, +0x88888882, 0x88888820, 0x88888821, 0x88888210, +0x88888883, 0x88888830, 0x88888831, 0x88888310, +0x88888832, 0x88888320, 0x88888321, 0x88883210, +0x88888884, 0x88888840, 0x88888841, 0x88888410, +0x88888842, 0x88888420, 0x88888421, 0x88884210, +0x88888843, 0x88888430, 0x88888431, 0x88884310, +0x88888432, 0x88884320, 0x88884321, 0x88843210, +0x88888885, 0x88888850, 0x88888851, 0x88888510, +0x88888852, 0x88888520, 0x88888521, 0x88885210, +0x88888853, 0x88888530, 0x88888531, 0x88885310, +0x88888532, 0x88885320, 0x88885321, 0x88853210, +0x88888854, 0x88888540, 0x88888541, 0x88885410, +0x88888542, 0x88885420, 0x88885421, 0x88854210, +0x88888543, 0x88885430, 0x88885431, 0x88854310, +0x88885432, 0x88854320, 0x88854321, 0x88543210, +0x88888886, 0x88888860, 0x88888861, 0x88888610, +0x88888862, 0x88888620, 0x88888621, 0x88886210, +0x88888863, 0x88888630, 0x88888631, 0x88886310, +0x88888632, 0x88886320, 0x88886321, 0x88863210, +0x88888864, 0x88888640, 0x88888641, 0x88886410, +0x88888642, 0x88886420, 0x88886421, 0x88864210, +0x88888643, 0x88886430, 0x88886431, 0x88864310, +0x88886432, 0x88864320, 0x88864321, 0x88643210, +0x88888865, 0x88888650, 0x88888651, 0x88886510, +0x88888652, 0x88886520, 0x88886521, 0x88865210, +0x88888653, 0x88886530, 0x88886531, 0x88865310, +0x88886532, 0x88865320, 0x88865321, 0x88653210, +0x88888654, 0x88886540, 0x88886541, 0x88865410, +0x88886542, 0x88865420, 0x88865421, 0x88654210, +0x88886543, 0x88865430, 0x88865431, 0x88654310, +0x88865432, 0x88654320, 0x88654321, 0x86543210, +0x88888887, 0x88888870, 0x88888871, 0x88888710, +0x88888872, 0x88888720, 0x88888721, 0x88887210, +0x88888873, 0x88888730, 0x88888731, 0x88887310, +0x88888732, 0x88887320, 0x88887321, 0x88873210, +0x88888874, 0x88888740, 0x88888741, 0x88887410, +0x88888742, 0x88887420, 0x88887421, 0x88874210, +0x88888743, 0x88887430, 0x88887431, 0x88874310, +0x88887432, 0x88874320, 0x88874321, 0x88743210, +0x88888875, 0x88888750, 0x88888751, 0x88887510, +0x88888752, 0x88887520, 0x88887521, 0x88875210, +0x88888753, 0x88887530, 0x88887531, 0x88875310, +0x88887532, 0x88875320, 0x88875321, 0x88753210, +0x88888754, 0x88887540, 0x88887541, 0x88875410, +0x88887542, 0x88875420, 0x88875421, 0x88754210, +0x88887543, 0x88875430, 0x88875431, 0x88754310, +0x88875432, 0x88754320, 0x88754321, 0x87543210, +0x88888876, 0x88888760, 0x88888761, 0x88887610, +0x88888762, 0x88887620, 0x88887621, 0x88876210, +0x88888763, 0x88887630, 0x88887631, 0x88876310, +0x88887632, 0x88876320, 0x88876321, 0x88763210, +0x88888764, 0x88887640, 0x88887641, 0x88876410, +0x88887642, 0x88876420, 0x88876421, 0x88764210, +0x88887643, 0x88876430, 0x88876431, 0x88764310, +0x88876432, 0x88764320, 0x88764321, 0x87643210, +0x88888765, 0x88887650, 0x88887651, 0x88876510, +0x88887652, 0x88876520, 0x88876521, 0x88765210, +0x88887653, 0x88876530, 0x88876531, 0x88765310, +0x88876532, 0x88765320, 0x88765321, 0x87653210, +0x88887654, 0x88876540, 0x88876541, 0x88765410, +0x88876542, 0x88765420, 0x88765421, 0x87654210, +0x88876543, 0x88765430, 0x88765431, 0x87654310, +0x88765432, 0x87654320, 0x87654321, 0x76543210, +}; diff --git a/src/extensions/pdt/pdtcompose.cc b/src/extensions/pdt/pdtcompose.cc index 0bab405..0076e1d 100644 --- a/src/extensions/pdt/pdtcompose.cc +++ b/src/extensions/pdt/pdtcompose.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { usage += " in.fst in.pdt [out.pdt]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 3 || argc > 4) { ShowUsage(); return 1; diff --git a/src/extensions/pdt/pdtexpand.cc b/src/extensions/pdt/pdtexpand.cc index 2134932..18d849d 100644 --- a/src/extensions/pdt/pdtexpand.cc +++ b/src/extensions/pdt/pdtexpand.cc @@ -17,7 +17,7 @@ // Modified: jpr@google.com (Jake Ratkiewicz) to use FstClass // // \file -// Expands a PDT and an FST. +// Expands a (bounded-stack) PDT as an FST. // #include @@ -32,12 +32,12 @@ DEFINE_string(weight, "", "Weight threshold"); int main(int argc, char **argv) { namespace s = fst::script; - string usage = "Expand a PDT and an FST.\n\n Usage: "; + string usage = "Expand a (bounded-stack) PDT as an FST.\n\n Usage: "; usage += argv[0]; usage += " in.pdt [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/extensions/pdt/pdtinfo.cc b/src/extensions/pdt/pdtinfo.cc index 89227d3..71c0ed0 100644 --- a/src/extensions/pdt/pdtinfo.cc +++ b/src/extensions/pdt/pdtinfo.cc @@ -34,7 +34,7 @@ int main(int argc, char **argv) { std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 2) { ShowUsage(); return 1; diff --git a/src/extensions/pdt/pdtreplace.cc b/src/extensions/pdt/pdtreplace.cc index 543b6ea..9f8aa86 100644 --- a/src/extensions/pdt/pdtreplace.cc +++ b/src/extensions/pdt/pdtreplace.cc @@ -15,6 +15,7 @@ // Copyright 2005-2010 Google, Inc. // Author: riley@google.com (Michael Riley) // +// Converts an RTN represented by FSTs and non-terminal labels into a PDT . #include using std::pair; using std::make_pair; @@ -29,13 +30,14 @@ DEFINE_string(pdt_parentheses, "", "PDT parenthesis label pairs."); int main(int argc, char **argv) { namespace s = fst::script; - string usage = "Recursively replace Fst arcs with other Fst(s).\n"; + string usage = "Converts an RTN represented by FSTs"; + usage += " and non-terminal labels into PDT"; usage += " Usage: "; usage += argv[0]; usage += " root.fst rootlabel [rule1.fst label1 ...] [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc < 4) { ShowUsage(); return 1; diff --git a/src/extensions/pdt/pdtreverse.cc b/src/extensions/pdt/pdtreverse.cc index e79cd7f..48f412d 100644 --- a/src/extensions/pdt/pdtreverse.cc +++ b/src/extensions/pdt/pdtreverse.cc @@ -32,7 +32,7 @@ int main(int argc, char **argv) { usage += " in.pdt [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/extensions/pdt/pdtshortestpath.cc b/src/extensions/pdt/pdtshortestpath.cc index ad67969..5973db9 100644 --- a/src/extensions/pdt/pdtshortestpath.cc +++ b/src/extensions/pdt/pdtshortestpath.cc @@ -17,7 +17,7 @@ // Modified: jpr@google.com (Jake Ratkiewicz) to use FstClass // // \file -// Return the shortest path in a PDT. +// Return the shortest path in a (bounded-stack) PDT. // #include @@ -33,12 +33,12 @@ DEFINE_string(pdt_parentheses, "", "PDT parenthesis label pairs."); int main(int argc, char **argv) { namespace s = fst::script; - string usage = "Shortest path in a PDT.\n\n Usage: "; + string usage = "Shortest path in a (bounded-stack) PDT.\n\n Usage: "; usage += argv[0]; usage += " in.pdt [out.fst]\n"; std::set_new_handler(FailedNewHandler); - SetFlags(usage.c_str(), &argc, &argv, true); + SET_FLAGS(usage.c_str(), &argc, &argv, true); if (argc > 3) { ShowUsage(); return 1; diff --git a/src/include/Makefile.am b/src/include/Makefile.am index b4e6213..e9eb92f 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -1,10 +1,17 @@ if HAVE_FAR far_include_headers = fst/extensions/far/compile-strings.h \ -fst/extensions/far/create.h fst/extensions/far/extract.h \ -fst/extensions/far/far.h fst/extensions/far/farlib.h \ -fst/extensions/far/farscript.h fst/extensions/far/info.h \ -fst/extensions/far/main.h fst/extensions/far/print-strings.h \ -fst/extensions/far/stlist.h fst/extensions/far/sttable.h +fst/extensions/far/create.h fst/extensions/far/equal.h \ +fst/extensions/far/extract.h fst/extensions/far/far.h \ +fst/extensions/far/farlib.h fst/extensions/far/farscript.h \ +fst/extensions/far/info.h fst/extensions/far/main.h \ +fst/extensions/far/print-strings.h fst/extensions/far/stlist.h \ +fst/extensions/far/sttable.h +endif + +if HAVE_NGRAM +ngram_include_headers = fst/extensions/ngram/bitmap-index.h \ +fst/extensions/ngram/ngram-fst.h \ +fst/extensions/ngram/nthbit.h endif if HAVE_PDT @@ -62,6 +69,6 @@ fst/replace-util.h fst/icu.h fst/string.h fst/signed-log-weight.h \ fst/sparse-tuple-weight.h fst/sparse-power-weight.h fst/expectation-weight.h \ fst/symbol-table-ops.h fst/bi-table.h \ $(far_include_headers) \ +$(ngram_include_headers) \ $(pdt_include_headers) \ $(script_include_headers) - diff --git a/src/include/Makefile.in b/src/include/Makefile.in index ab6c28d..b4e3f86 100644 --- a/src/include/Makefile.in +++ b/src/include/Makefile.in @@ -82,11 +82,14 @@ am__nobase_include_HEADERS_DIST = fst/arc.h fst/determinize.h \ fst/sparse-power-weight.h fst/expectation-weight.h \ fst/symbol-table-ops.h fst/bi-table.h \ fst/extensions/far/compile-strings.h \ - fst/extensions/far/create.h fst/extensions/far/extract.h \ - fst/extensions/far/far.h fst/extensions/far/farlib.h \ - fst/extensions/far/farscript.h fst/extensions/far/info.h \ - fst/extensions/far/main.h fst/extensions/far/print-strings.h \ - fst/extensions/far/stlist.h fst/extensions/far/sttable.h \ + fst/extensions/far/create.h fst/extensions/far/equal.h \ + fst/extensions/far/extract.h fst/extensions/far/far.h \ + fst/extensions/far/farlib.h fst/extensions/far/farscript.h \ + fst/extensions/far/info.h fst/extensions/far/main.h \ + fst/extensions/far/print-strings.h fst/extensions/far/stlist.h \ + fst/extensions/far/sttable.h \ + fst/extensions/ngram/bitmap-index.h \ + fst/extensions/ngram/ngram-fst.h fst/extensions/ngram/nthbit.h \ fst/extensions/pdt/collection.h fst/extensions/pdt/compose.h \ fst/extensions/pdt/expand.h fst/extensions/pdt/info.h \ fst/extensions/pdt/paren.h fst/extensions/pdt/pdt.h \ @@ -264,11 +267,16 @@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ @HAVE_FAR_TRUE@far_include_headers = fst/extensions/far/compile-strings.h \ -@HAVE_FAR_TRUE@fst/extensions/far/create.h fst/extensions/far/extract.h \ -@HAVE_FAR_TRUE@fst/extensions/far/far.h fst/extensions/far/farlib.h \ -@HAVE_FAR_TRUE@fst/extensions/far/farscript.h fst/extensions/far/info.h \ -@HAVE_FAR_TRUE@fst/extensions/far/main.h fst/extensions/far/print-strings.h \ -@HAVE_FAR_TRUE@fst/extensions/far/stlist.h fst/extensions/far/sttable.h +@HAVE_FAR_TRUE@fst/extensions/far/create.h fst/extensions/far/equal.h \ +@HAVE_FAR_TRUE@fst/extensions/far/extract.h fst/extensions/far/far.h \ +@HAVE_FAR_TRUE@fst/extensions/far/farlib.h fst/extensions/far/farscript.h \ +@HAVE_FAR_TRUE@fst/extensions/far/info.h fst/extensions/far/main.h \ +@HAVE_FAR_TRUE@fst/extensions/far/print-strings.h fst/extensions/far/stlist.h \ +@HAVE_FAR_TRUE@fst/extensions/far/sttable.h + +@HAVE_NGRAM_TRUE@ngram_include_headers = fst/extensions/ngram/bitmap-index.h \ +@HAVE_NGRAM_TRUE@fst/extensions/ngram/ngram-fst.h \ +@HAVE_NGRAM_TRUE@fst/extensions/ngram/nthbit.h @HAVE_PDT_TRUE@pdt_include_headers = fst/extensions/pdt/collection.h \ @HAVE_PDT_TRUE@fst/extensions/pdt/compose.h fst/extensions/pdt/expand.h \ @@ -323,6 +331,7 @@ fst/replace-util.h fst/icu.h fst/string.h fst/signed-log-weight.h \ fst/sparse-tuple-weight.h fst/sparse-power-weight.h fst/expectation-weight.h \ fst/symbol-table-ops.h fst/bi-table.h \ $(far_include_headers) \ +$(ngram_include_headers) \ $(pdt_include_headers) \ $(script_include_headers) diff --git a/src/include/fst/accumulator.h b/src/include/fst/accumulator.h index fcb960c..9801b93 100644 --- a/src/include/fst/accumulator.h +++ b/src/include/fst/accumulator.h @@ -258,7 +258,7 @@ class FastLogAccumulator { for(StateIterator siter(fst); !siter.Done(); siter.Next()) { StateId s = siter.Value(); if (fst.NumArcs(s) >= arc_limit_) { - double sum = FloatLimits::kPosInfinity; + double sum = FloatLimits::PosInfinity(); weight_positions.push_back(weight_position); weights.push_back(sum); ++weight_position; @@ -282,12 +282,12 @@ class FastLogAccumulator { private: double LogPosExp(double x) { - return x == FloatLimits::kPosInfinity ? + return x == FloatLimits::PosInfinity() ? 0.0 : log(1.0F + exp(-x)); } double LogMinusExp(double x) { - return x == FloatLimits::kPosInfinity ? + return x == FloatLimits::PosInfinity() ? 0.0 : log(1.0F - exp(-x)); } @@ -302,7 +302,7 @@ class FastLogAccumulator { double LogPlus(double f1, Weight v) { double f2 = to_log_weight_(v).Value(); - if (f1 == FloatLimits::kPosInfinity) + if (f1 == FloatLimits::PosInfinity()) return f2; else if (f1 > f2) return f2 - LogPosExp(f1 - f2); @@ -317,7 +317,7 @@ class FastLogAccumulator { error_ = true; return Weight::NoWeight(); } - if (f2 == FloatLimits::kPosInfinity) + if (f2 == FloatLimits::PosInfinity()) return to_weight_(f1); else return to_weight_(f1 - LogMinusExp(f2 - f1)); @@ -485,7 +485,7 @@ class CacheLogAccumulator { if ((weights_ == 0) && (fst_->NumArcs(s) >= arc_limit_)) { weights_ = new vector; weights_->reserve(fst_->NumArcs(s) + 1); - weights_->push_back(FloatLimits::kPosInfinity); + weights_->push_back(FloatLimits::PosInfinity()); data_->AddWeights(s, weights_); } } @@ -524,7 +524,7 @@ class CacheLogAccumulator { - weights_->begin() - 1; } else { size_t n = 0; - double x = FloatLimits::kPosInfinity; + double x = FloatLimits::PosInfinity(); for(aiter->Reset(); !aiter->Done(); aiter->Next(), ++n) { x = LogPlus(x, aiter->Value().weight); if (x < w) break; @@ -537,12 +537,12 @@ class CacheLogAccumulator { private: double LogPosExp(double x) { - return x == FloatLimits::kPosInfinity ? + return x == FloatLimits::PosInfinity() ? 0.0 : log(1.0F + exp(-x)); } double LogMinusExp(double x) { - return x == FloatLimits::kPosInfinity ? + return x == FloatLimits::PosInfinity() ? 0.0 : log(1.0F - exp(-x)); } @@ -557,7 +557,7 @@ class CacheLogAccumulator { double LogPlus(double f1, Weight v) { double f2 = to_log_weight_(v).Value(); - if (f1 == FloatLimits::kPosInfinity) + if (f1 == FloatLimits::PosInfinity()) return f2; else if (f1 > f2) return f2 - LogPosExp(f1 - f2); @@ -572,7 +572,7 @@ class CacheLogAccumulator { error_ = true; return Weight::NoWeight(); } - if (f2 == FloatLimits::kPosInfinity) + if (f2 == FloatLimits::PosInfinity()) return to_weight_(f1); else return to_weight_(f1 - LogMinusExp(f2 - f1)); diff --git a/src/include/fst/arc.h b/src/include/fst/arc.h index 56086c9..5f4014b 100644 --- a/src/include/fst/arc.h +++ b/src/include/fst/arc.h @@ -34,6 +34,7 @@ #include #include #include +#include #include diff --git a/src/include/fst/arcsort.h b/src/include/fst/arcsort.h index 38f4f95..37a51dc 100644 --- a/src/include/fst/arcsort.h +++ b/src/include/fst/arcsort.h @@ -118,9 +118,11 @@ typedef CacheOptions ArcSortFstOptions; // and exclusive of caching. template class ArcSortFst : public StateMapFst > { + using StateMapFst >::GetImpl; public: typedef A Arc; - typedef ArcSortMapper M; + typedef typename Arc::StateId StateId; + typedef ArcSortMapper M; ArcSortFst(const Fst &fst, const C &comp) : StateMapFst(fst, ArcSortMapper(fst, comp)) {} @@ -136,6 +138,18 @@ class ArcSortFst : public StateMapFst > { virtual ArcSortFst *Copy(bool safe = false) const { return new ArcSortFst(*this, safe); } + + virtual size_t NumArcs(StateId s) const { + return GetImpl()->GetFst().NumArcs(s); + } + + virtual size_t NumInputEpsilons(StateId s) const { + return GetImpl()->GetFst().NumInputEpsilons(s); + } + + virtual size_t NumOutputEpsilons(StateId s) const { + return GetImpl()->GetFst().NumOutputEpsilons(s); + } }; diff --git a/src/include/fst/bi-table.h b/src/include/fst/bi-table.h index dbb436c..bd37781 100644 --- a/src/include/fst/bi-table.h +++ b/src/include/fst/bi-table.h @@ -39,8 +39,9 @@ namespace fst { // // Required constructors. // BiTable(); // -// // Lookup integer ID from entry. If it doesn't exist, then add it. -// I FindId(const T &entry); +// // Lookup integer ID from entry. If it doesn't exist and 'insert' +// / is true, then add it. Otherwise return -1. +// I FindId(const T &entry, bool insert = true); // // Lookup entry from integer ID. // const T &FindEntry(I) const; // // # of stored entries. @@ -58,11 +59,15 @@ class HashBiTable { T empty_entry; } - I FindId(const T &entry) { + I FindId(const T &entry, bool insert = true) { I &id_ref = entry2id_[entry]; - if (id_ref == 0) { // T not found; store and assign it a new ID. - id2entry_.push_back(entry); - id_ref = id2entry_.size(); + if (id_ref == 0) { // T not found + if (insert) { // store and assign it a new ID + id2entry_.push_back(entry); + id_ref = id2entry_.size(); + } else { + return -1; + } } return id_ref - 1; // NB: id_ref = ID + 1 } @@ -109,14 +114,18 @@ class CompactHashBiTable { id2entry_.reserve(table_size); } - I FindId(const T &entry) { + I FindId(const T &entry, bool insert = true) { current_entry_ = &entry; typename KeyHashSet::const_iterator it = keys_.find(kCurrentKey); - if (it == keys_.end()) { - I key = id2entry_.size(); - id2entry_.push_back(entry); - keys_.insert(key); - return key; + if (it == keys_.end()) { // T not found + if (insert) { // store and assign it a new ID + I key = id2entry_.size(); + id2entry_.push_back(entry); + keys_.insert(key); + return key; + } else { + return -1; + } } else { return *it; } @@ -191,14 +200,18 @@ class VectorBiTable { ~VectorBiTable() { delete fp_; } - I FindId(const T &entry) { + I FindId(const T &entry, bool insert = true) { ssize_t fp = (*fp_)(entry); if (fp >= fp2id_.size()) fp2id_.resize(fp + 1); I &id_ref = fp2id_[fp]; - if (id_ref == 0) { // T not found; store and assign it a new ID. - id2entry_.push_back(entry); - id_ref = id2entry_.size(); + if (id_ref == 0) { // T not found + if (insert) { // store and assign it a new ID + id2entry_.push_back(entry); + id_ref = id2entry_.size(); + } else { + return -1; + } } return id_ref - 1; // NB: id_ref = ID + 1 } @@ -251,24 +264,32 @@ class VectorHashBiTable { delete h_; } - I FindId(const T &entry) { + I FindId(const T &entry, bool insert = true) { if ((*selector_)(entry)) { // Use the vector if 'selector_(entry) == true' uint64 fp = (*fp_)(entry); if (fp2id_.size() <= fp) fp2id_.resize(fp + 1, 0); - if (fp2id_[fp] == 0) { - id2entry_.push_back(entry); - fp2id_[fp] = id2entry_.size(); + if (fp2id_[fp] == 0) { // T not found + if (insert) { // store and assign it a new ID + id2entry_.push_back(entry); + fp2id_[fp] = id2entry_.size(); + } else { + return -1; + } } return fp2id_[fp] - 1; // NB: assoc_value = ID + 1 } else { // Use the hash table otherwise. current_entry_ = &entry; typename KeyHashSet::const_iterator it = keys_.find(kCurrentKey); if (it == keys_.end()) { - I key = id2entry_.size(); - id2entry_.push_back(entry); - keys_.insert(key); - return key; + if (insert) { + I key = id2entry_.size(); + id2entry_.push_back(entry); + keys_.insert(key); + return key; + } else { + return -1; + } } else { return *it; } @@ -357,11 +378,15 @@ class ErasableBiTable { public: ErasableBiTable() : first_(0) {} - I FindId(const T &entry) { + I FindId(const T &entry, bool insert = true) { I &id_ref = entry2id_[entry]; - if (id_ref == 0) { // T not found; store and assign it a new ID. - id2entry_.push_back(entry); - id_ref = id2entry_.size() + first_; + if (id_ref == 0) { // T not found + if (insert) { // store and assign it a new ID + id2entry_.push_back(entry); + id_ref = id2entry_.size() + first_; + } else { + return -1; + } } return id_ref - 1; // NB: id_ref = ID + 1 } diff --git a/src/include/fst/cache.h b/src/include/fst/cache.h index a6a92d4..0177396 100644 --- a/src/include/fst/cache.h +++ b/src/include/fst/cache.h @@ -292,13 +292,13 @@ class CacheBaseImpl : public VectorFstBaseImpl { void DeleteArcs(StateId s, size_t n) { S *state = ExtendState(s); - const vector &arcs = GetState(s)->arcs; + const vector &arcs = state->arcs; for (size_t i = 0; i < n; ++i) { size_t j = arcs.size() - i - 1; if (arcs[j].ilabel == 0) - --GetState(s)->niepsilons; + --state->niepsilons; if (arcs[j].olabel == 0) - --GetState(s)->noepsilons; + --state->noepsilons; } state->arcs.resize(arcs.size() - n); SetProperties(DeleteArcsProperties(Properties())); @@ -503,9 +503,6 @@ struct CacheState { size_t noepsilons; // # of output epsilons mutable uint32 flags; mutable int ref_count; - - private: - DISALLOW_COPY_AND_ASSIGN(CacheState); }; // A CacheBaseImpl with a commonly used CacheState. diff --git a/src/include/fst/compact-fst.h b/src/include/fst/compact-fst.h index efa567a..57c927e 100644 --- a/src/include/fst/compact-fst.h +++ b/src/include/fst/compact-fst.h @@ -175,10 +175,10 @@ class CompactFstData { bool Error() const { return error_; } - private: // Byte alignment for states and arcs in file format (version 1 only) static const int kFileAlign = 16; + private: Unsigned *states_; CompactElement *compacts_; size_t nstates_; @@ -539,17 +539,16 @@ class CompactFstImpl : public CacheImpl { } Weight Final(StateId s) { - if (!HasFinal(s)) { - Arc arc(kNoLabel, kNoLabel, Weight::Zero(), kNoStateId); - if ((compactor_->Size() != -1) || - (data_->States(s) != data_->States(s + 1))) - arc = ComputeArc(s, - compactor_->Size() == -1 - ? data_->States(s) - : s * compactor_->Size()); - SetFinal(s, arc.ilabel == kNoLabel ? arc.weight : Weight::Zero()); - } - return CacheImpl::Final(s); + if (HasFinal(s)) + return CacheImpl::Final(s); + Arc arc(kNoLabel, kNoLabel, Weight::Zero(), kNoStateId); + if ((compactor_->Size() != -1) || + (data_->States(s) != data_->States(s + 1))) + arc = ComputeArc(s, + compactor_->Size() == -1 + ? data_->States(s) + : s * compactor_->Size()); + return arc.ilabel == kNoLabel ? arc.weight : Weight::Zero(); } StateId NumStates() const { @@ -650,7 +649,6 @@ class CompactFstImpl : public CacheImpl { // Ensures compatibility int file_version = opts.align ? kAlignedFileVersion : kFileVersion; WriteHeader(strm, opts, file_version, &hdr); - compactor_->Write(strm); return data_->Write(strm, opts); } @@ -678,9 +676,13 @@ class CompactFstImpl : public CacheImpl { data_->States(s + 1) : (s + 1) * compactor_->Size(); for (size_t i = begin; i < end; ++i) { const Arc &arc = ComputeArc(s, i); - if (arc.ilabel == kNoLabel) continue; - PushArc(s, arc); + if (arc.ilabel == kNoLabel) + SetFinal(s, arc.weight); + else + PushArc(s, arc); } + if (!HasFinal(s)) + SetFinal(s, Weight::Zero()); SetArcs(s); } @@ -694,6 +696,9 @@ class CompactFstImpl : public CacheImpl { C *GetCompactor() const { return compactor_; } CompactFstData *Data() const { return data_; } + // Properties always true of this Fst class + static const uint64 kStaticProperties = kExpanded; + protected: template explicit CompactFstImpl(const CompactFstImpl &impl) @@ -710,6 +715,8 @@ class CompactFstImpl : public CacheImpl { } private: + friend class CompactFst; // allow access during write. + void Init(const Fst &fst) { string type = "compact"; if (sizeof(U) != sizeof(uint32)) { @@ -751,8 +758,6 @@ class CompactFstImpl : public CacheImpl { SetProperties(kError, kError); } - // Properties always true of this Fst class - static const uint64 kStaticProperties = kExpanded; // Current unaligned file format version static const int kFileVersion = 2; // Current aligned file format version @@ -863,6 +868,10 @@ class CompactFst : public ImplToExpandedFst< CompactFstImpl > { return Fst::WriteFile(filename); } + template + static bool WriteFst(const F &fst, const C &compactor, ostream &strm, + const FstWriteOptions &opts); + virtual void InitStateIterator(StateIteratorData *data) const { GetImpl()->InitStateIterator(data); } @@ -893,6 +902,115 @@ class CompactFst : public ImplToExpandedFst< CompactFstImpl > { void operator=(const CompactFst &fst); // disallow }; +// Writes Fst in Compact format, potentially with a pass over the machine +// before writing to compute the number of states and arcs. +// +template +template +bool CompactFst::WriteFst(const F &fst, + const C &compactor, + ostream &strm, + const FstWriteOptions &opts) { + typedef U Unsigned; + typedef typename C::Element CompactElement; + typedef typename A::Weight Weight; + static const int kFileAlign = + CompactFstData::kFileAlign; + int file_version = opts.align ? + CompactFstImpl::kAlignedFileVersion : + CompactFstImpl::kFileVersion; + size_t num_arcs = -1, num_states = -1, num_compacts = -1; + C first_pass_compactor = compactor; + if (fst.Type() == CompactFst().Type()) { + const CompactFst *compact_fst = + reinterpret_cast *>(&fst); + num_arcs = compact_fst->GetImpl()->Data()->NumArcs(); + num_states = compact_fst->GetImpl()->Data()->NumStates(); + num_compacts = compact_fst->GetImpl()->Data()->NumCompacts(); + first_pass_compactor = *compact_fst->GetImpl()->GetCompactor(); + } else { + // A first pass is needed to compute the state of the compactor, which + // is saved ahead of the rest of the data structures. This unfortunately + // means forcing a complete double compaction when writing in this format. + // TODO(allauzen): eliminate mutable state from compactors. + num_arcs = 0; + num_states = 0; + for (StateIterator siter(fst); !siter.Done(); siter.Next()) { + const StateId s = siter.Value(); + ++num_states; + if (fst.Final(s) != Weight::Zero()) { + first_pass_compactor.Compact( + s, A(kNoLabel, kNoLabel, fst.Final(s), kNoStateId)); + } + for (ArcIterator aiter(fst, s); !aiter.Done(); aiter.Next()) { + ++num_arcs; + first_pass_compactor.Compact(s, aiter.Value()); + } + } + } + FstHeader hdr; + hdr.SetStart(fst.Start()); + hdr.SetNumStates(num_states); + hdr.SetNumArcs(num_arcs); + string type = "compact"; + if (sizeof(U) != sizeof(uint32)) { + string size; + Int64ToStr(8 * sizeof(U), &size); + type += size; + } + type += "_"; + type += C::Type(); + uint64 copy_properties = fst.Properties(kCopyProperties, true); + if ((copy_properties & kError) || !compactor.Compatible(fst)) { + LOG(ERROR) << "fst incompatible with compactor"; + return false; + } + uint64 properties = copy_properties | + CompactFstImpl::kStaticProperties; + FstImpl::WriteFstHeader(fst, strm, opts, file_version, type, properties, + &hdr); + first_pass_compactor.Write(strm); + if (first_pass_compactor.Size() == -1) { + if (opts.align && !AlignOutput(strm, kFileAlign)) { + LOG(ERROR) << "CompactFst::Write: Alignment failed: " << opts.source; + return false; + } + Unsigned compacts = 0; + for (StateIterator siter(fst); !siter.Done(); siter.Next()) { + const StateId s = siter.Value(); + strm.write(reinterpret_cast(&compacts), sizeof(compacts)); + if (fst.Final(s) != Weight::Zero()) { + ++compacts; + } + compacts += fst.NumArcs(s); + } + strm.write(reinterpret_cast(&compacts), sizeof(compacts)); + } + if (opts.align && !AlignOutput(strm, kFileAlign)) { + LOG(ERROR) << "Could not align file during write after writing states"; + } + C second_pass_compactor = compactor; + CompactElement element; + for (StateIterator siter(fst); !siter.Done(); siter.Next()) { + const StateId s = siter.Value(); + if (fst.Final(s) != Weight::Zero()) { + element = second_pass_compactor.Compact( + s, A(kNoLabel, kNoLabel, fst.Final(s), kNoStateId)); + strm.write(reinterpret_cast(&element), sizeof(element)); + } + for (ArcIterator aiter(fst, s); !aiter.Done(); aiter.Next()) { + element = second_pass_compactor.Compact(s, aiter.Value()); + strm.write(reinterpret_cast(&element), sizeof(element)); + } + } + strm.flush(); + if (!strm) { + LOG(ERROR) << "CompactFst write failed: " << opts.source; + return false; + } + return true; +} + // Specialization for CompactFst; see generic version in fst.h // for sample usage (but use the CompactFst type!). This version diff --git a/src/include/fst/compat.h b/src/include/fst/compat.h index 034b57e..3b5275d 100644 --- a/src/include/fst/compat.h +++ b/src/include/fst/compat.h @@ -39,10 +39,7 @@ #include #include #include - -#ifdef HAVE_ICU #include -#endif using std::cin; using std::cout; @@ -87,7 +84,7 @@ class CheckSummer { void Reset() { count_ = 0; for (int i = 0; i < kCheckSumLength; ++i) - check_sum_[0] = '\0'; + check_sum_[i] = '\0'; } void Update(void const *data, int size) { @@ -113,24 +110,6 @@ class CheckSummer { DISALLOW_COPY_AND_ASSIGN(CheckSummer); }; -// Define the UTF8 string conversion function to throw an error -// when the ICU Library is missing or disabled. -#ifndef HAVE_ICU - -template -bool UTF8StringToLabels(const string&, std::vector > { switch (LookAheadMatchType(fst1, fst2)) { // Check for lookahead matchers default: case MATCH_NONE: { // Default composition (no look-ahead) + VLOG(2) << "ComposeFst: Default composition (no look-ahead)"; ComposeFstOptions nopts(opts); return CreateBase1(fst1, fst2, nopts); } case MATCH_OUTPUT: { // Lookahead on fst1 + VLOG(2) << "ComposeFst: Lookahead on fst1"; typedef typename DefaultLookAhead::FstMatcher M; typedef typename DefaultLookAhead::ComposeFilter F; ComposeFstOptions nopts(opts); return CreateBase1(fst1, fst2, nopts); } case MATCH_INPUT: { // Lookahead on fst2 + VLOG(2) << "ComposeFst: Lookahead on fst2"; typedef typename DefaultLookAhead::FstMatcher M; typedef typename DefaultLookAhead::ComposeFilter F; ComposeFstOptions nopts(opts); diff --git a/src/include/fst/config.h b/src/include/fst/config.h index 046b49c..47e472e 100644 --- a/src/include/fst/config.h +++ b/src/include/fst/config.h @@ -4,7 +4,7 @@ /* Define to 1 if you have the ICU library. */ /* #undef HAVE_ICU */ -/* Define to 1 if the system has the type `std::tr1::hash'. */ #define HAVE_STD__TR1__HASH_LONG_LONG_UNSIGNED_ 1 diff --git a/src/include/fst/const-fst.h b/src/include/fst/const-fst.h index f68e8ed..80efc8d 100644 --- a/src/include/fst/const-fst.h +++ b/src/include/fst/const-fst.h @@ -87,8 +87,6 @@ class ConstFstImpl : public FstImpl { static ConstFstImpl *Read(istream &strm, const FstReadOptions &opts); - bool Write(ostream &strm, const FstWriteOptions &opts) const; - A *Arcs(StateId s) { return arcs_ + states_[s].pos; } // Provide information needed for generic state iterator @@ -330,23 +328,24 @@ template template bool ConstFst::WriteFst(const F &fst, ostream &strm, const FstWriteOptions &opts) { - static const int kFileVersion = 2; - static const int kAlignedFileVersion = 1; - static const int kFileAlign = 16; - int file_version = opts.align ? kAlignedFileVersion : kFileVersion; + int file_version = opts.align ? ConstFstImpl::kAlignedFileVersion : + ConstFstImpl::kFileVersion; size_t num_arcs = -1, num_states = -1; size_t start_offset = 0; bool update_header = true; if (fst.Type() == ConstFst().Type()) { - const ConstFst *const_fst = static_cast *>(&fst); + const ConstFst *const_fst = + reinterpret_cast *>(&fst); num_arcs = const_fst->GetImpl()->narcs_; num_states = const_fst->GetImpl()->nstates_; update_header = false; } else if ((start_offset = strm.tellp()) == -1) { // precompute values needed for header when we cannot seek to rewrite it. + num_arcs = 0; + num_states = 0; for (StateIterator siter(fst); !siter.Done(); siter.Next()) { num_arcs += fst.NumArcs(siter.Value()); - num_states++; + ++num_states; } update_header = false; } @@ -360,8 +359,11 @@ bool ConstFst::WriteFst(const F &fst, ostream &strm, Int64ToStr(8 * sizeof(U), &size); type += size; } - FstImpl::WriteFstHeader(fst, strm, opts, file_version, type, &hdr); - if (opts.align && !AlignOutput(strm, kFileAlign)) { + uint64 properties = fst.Properties(kCopyProperties, true) | + ConstFstImpl::kStaticProperties; + FstImpl::WriteFstHeader(fst, strm, opts, file_version, type, properties, + &hdr); + if (opts.align && !AlignOutput(strm, ConstFstImpl::kFileAlign)) { LOG(ERROR) << "Could not align file during write after header"; return false; } @@ -375,11 +377,11 @@ bool ConstFst::WriteFst(const F &fst, ostream &strm, state.noepsilons = fst.NumOutputEpsilons(siter.Value()); strm.write(reinterpret_cast(&state), sizeof(state)); pos += state.narcs; - states++; + ++states; } hdr.SetNumStates(states); hdr.SetNumArcs(pos); - if (opts.align && !AlignOutput(strm, kFileAlign)) { + if (opts.align && !AlignOutput(strm, ConstFstImpl::kFileAlign)) { LOG(ERROR) << "Could not align file during write after writing states"; } for (StateIterator siter(fst); !siter.Done(); siter.Next()) { @@ -391,12 +393,12 @@ bool ConstFst::WriteFst(const F &fst, ostream &strm, } strm.flush(); if (!strm) { - LOG(ERROR) << "WriteAsVectorFst write failed: " << opts.source; + LOG(ERROR) << "ConstFst Write write failed: " << opts.source; return false; } if (update_header) { return FstImpl::UpdateFstHeader(fst, strm, opts, file_version, type, - &hdr, start_offset); + properties, &hdr, start_offset); } else { if (hdr.NumStates() != num_states) { LOG(ERROR) << "Inconsistent number of states observed during write"; diff --git a/src/include/fst/dfs-visit.h b/src/include/fst/dfs-visit.h index b47c78d..4d93a39 100644 --- a/src/include/fst/dfs-visit.h +++ b/src/include/fst/dfs-visit.h @@ -177,7 +177,8 @@ void DfsVisit(const Fst &fst, V *visitor, ArcFilter filter) { // Find next tree root for (root = root == start ? 0 : root + 1; root < nstates && state_color[root] != kDfsWhite; - ++root); + ++root) { + } // Check for a state beyond the largest known state if (!expanded && root == nstates) { diff --git a/src/include/fst/expanded-fst.h b/src/include/fst/expanded-fst.h index b44b81c..676ceb3 100644 --- a/src/include/fst/expanded-fst.h +++ b/src/include/fst/expanded-fst.h @@ -82,7 +82,7 @@ class ExpandedFst : public Fst { } return Read(strm, FstReadOptions(filename)); } else { - return Read(std::cin, FstReadOptions("standard input")); + return Read(cin, FstReadOptions("standard input")); } } }; @@ -154,7 +154,7 @@ class ImplToExpandedFst : public ImplToFst { } return I::Read(strm, FstReadOptions(filename)); } else { - return I::Read(std::cin, FstReadOptions("standard input")); + return I::Read(cin, FstReadOptions("standard input")); } } diff --git a/src/include/fst/extensions/far/compile-strings.h b/src/include/fst/extensions/far/compile-strings.h index d7f4d6b..ca247db 100644 --- a/src/include/fst/extensions/far/compile-strings.h +++ b/src/include/fst/extensions/far/compile-strings.h @@ -56,7 +56,7 @@ class StringReader { const SymbolTable *syms = 0, Label unknown_label = kNoStateId) : nline_(0), strm_(istrm), source_(source), entry_type_(entry_type), - token_type_(token_type), done_(false), + token_type_(token_type), symbols_(syms), done_(false), compiler_(token_type, syms, unknown_label, allow_negative_labels) { Next(); // Initialize the reader to the first input. } @@ -87,8 +87,12 @@ class StringReader { done_ = true; // whitespace at the end of a file. } - VectorFst *GetVectorFst() { + VectorFst *GetVectorFst(bool keep_symbols = false) { VectorFst *fst = new VectorFst; + if (keep_symbols) { + fst->SetInputSymbols(symbols_); + fst->SetOutputSymbols(symbols_); + } if (compiler_(content_, fst)) { return fst; } else { @@ -97,9 +101,16 @@ class StringReader { } } - CompactFst > *GetCompactFst() { - CompactFst > *fst = - new CompactFst >; + CompactFst > *GetCompactFst(bool keep_symbols = false) { + CompactFst > *fst; + if (keep_symbols) { + VectorFst tmp; + tmp.SetInputSymbols(symbols_); + tmp.SetOutputSymbols(symbols_); + fst = new CompactFst >(tmp); + } else { + fst = new CompactFst >; + } if (compiler_(content_, fst)) { return fst; } else { @@ -114,6 +125,7 @@ class StringReader { string source_; EntryType entry_type_; TokenType token_type_; + const SymbolTable *symbols_; bool done_; StringCompiler compiler_; string content_; // The actual content of the input stream's next FST. @@ -135,6 +147,8 @@ void FarCompileStrings(const vector &in_fnames, FarTokenType tt, const string &symbols_fname, const string &unknown_symbol, + bool keep_symbols, + bool initial_symbols, bool allow_negative_labels, bool file_list_input, const string &key_prefix, @@ -175,8 +189,9 @@ void FarCompileStrings(const vector &in_fnames, const SymbolTable *syms = 0; typename Arc::Label unknown_label = kNoLabel; if (!symbols_fname.empty()) { - syms = SymbolTable::ReadText(symbols_fname, - allow_negative_labels); + SymbolTableTextOptions opts; + opts.allow_negative = allow_negative_labels; + syms = SymbolTable::ReadText(symbols_fname, opts); if (!syms) { FSTERROR() << "FarCompileStrings: error reading symbol table: " << symbols_fname; @@ -199,32 +214,47 @@ void FarCompileStrings(const vector &in_fnames, vector inputs; if (file_list_input) { for (int i = 1; i < in_fnames.size(); ++i) { - ifstream istrm(in_fnames[i].c_str()); + istream *istrm = in_fnames.empty() ? &cin : + new ifstream(in_fnames[i].c_str()); string str; - while (getline(istrm, str)) + while (getline(*istrm, str)) inputs.push_back(str); + if (!in_fnames.empty()) + delete istrm; } } else { inputs = in_fnames; } for (int i = 0, n = 0; i < inputs.size(); ++i) { + if (generate_keys == 0 && inputs[i].empty()) { + FSTERROR() << "FarCompileStrings: read from a file instead of stdin or" + << " set the --generate_keys flags."; + delete far_writer; + delete syms; + return; + } int key_size = generate_keys ? generate_keys : (entry_type == StringReader::FILE ? 1 : KeySize(inputs[i].c_str())); - ifstream istrm(inputs[i].c_str()); + istream *istrm = inputs[i].empty() ? &cin : + new ifstream(inputs[i].c_str()); + bool keep_syms = keep_symbols; for (StringReader reader( - istrm, inputs[i], entry_type, token_type, - allow_negative_labels, syms, unknown_label); + *istrm, inputs[i].empty() ? "stdin" : inputs[i], + entry_type, token_type, allow_negative_labels, + syms, unknown_label); !reader.Done(); reader.Next()) { ++n; const Fst *fst; if (compact) - fst = reader.GetCompactFst(); + fst = reader.GetCompactFst(keep_syms); else - fst = reader.GetVectorFst(); + fst = reader.GetVectorFst(keep_syms); + if (initial_symbols) + keep_syms = false; if (!fst) { FSTERROR() << "FarCompileStrings: compiling string number " << n << " in file " << inputs[i] << " failed with token_type = " @@ -236,6 +266,7 @@ void FarCompileStrings(const vector &in_fnames, (fet == FET_FILE ? "file" : "unknown")); delete far_writer; delete syms; + if (!inputs[i].empty()) delete istrm; return; } ostringstream keybuf; @@ -260,6 +291,8 @@ void FarCompileStrings(const vector &in_fnames, } if (generate_keys == 0) n = 0; + if (!inputs[i].empty()) + delete istrm; } delete far_writer; diff --git a/src/include/fst/extensions/far/equal.h b/src/include/fst/extensions/far/equal.h new file mode 100644 index 0000000..be82e2d --- /dev/null +++ b/src/include/fst/extensions/far/equal.h @@ -0,0 +1,99 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: allauzen@google.com (Cyril Allauzen) + +#ifndef FST_EXTENSIONS_FAR_EQUAL_H_ +#define FST_EXTENSIONS_FAR_EQUAL_H_ + +#include + +#include +#include + +namespace fst { + +template +bool FarEqual(const string &filename1, + const string &filename2, + float delta = kDelta, + const string &begin_key = string(), + const string &end_key = string()) { + + FarReader *reader1 = FarReader::Open(filename1); + FarReader *reader2 = FarReader::Open(filename2); + if (!reader1 || !reader2) { + delete reader1; + delete reader2; + VLOG(1) << "FarEqual: cannot open input Far file(s)"; + return false; + } + + if (!begin_key.empty()) { + bool find_begin1 = reader1->Find(begin_key); + bool find_begin2 = reader2->Find(begin_key); + if (!find_begin1 || !find_begin2) { + bool ret = !find_begin1 && !find_begin2; + if (!ret) { + VLOG(1) << "FarEqual: key \"" << begin_key << "\" missing from " + << (find_begin1 ? "second" : "first") << " archive."; + } + delete reader1; + delete reader2; + return ret; + } + } + + for(; !reader1->Done() && !reader2->Done(); + reader1->Next(), reader2->Next()) { + const string key1 = reader1->GetKey(); + const string key2 = reader2->GetKey(); + if (!end_key.empty() && end_key < key1 && end_key < key2) { + delete reader1; + delete reader2; + return true; + } + if (key1 != key2) { + VLOG(1) << "FarEqual: mismatched keys \"" + << key1 << "\" <> \"" << key2 << "\"."; + delete reader1; + delete reader2; + return false; + } + if (!Equal(reader1->GetFst(), reader2->GetFst(), delta)) { + VLOG(1) << "FarEqual: Fsts for key \"" << key1 << "\" are not equal."; + delete reader1; + delete reader2; + return false; + } + } + + if (!reader1->Done() || !reader2->Done()) { + VLOG(1) << "FarEqual: key \"" + << (reader1->Done() ? reader2->GetKey() : reader1->GetKey()) + << "\" missing form " << (reader2->Done() ? "first" : "second") + << " archive."; + delete reader1; + delete reader2; + return false; + } + + delete reader1; + delete reader2; + return true; +} + +} // namespace fst + +#endif // FST_EXTENSIONS_FAR_EQUAL_H_ diff --git a/src/include/fst/extensions/far/extract.h b/src/include/fst/extensions/far/extract.h index 022ca60..d6f92ff 100644 --- a/src/include/fst/extensions/far/extract.h +++ b/src/include/fst/extensions/far/extract.h @@ -70,7 +70,7 @@ void FarExtract(const vector &ifilenames, if (nrep > 0) { ostringstream tmp; tmp << '.' << nrep; - key += tmp.str(); + key.append(tmp.str().data(), tmp.str().size()); } ofilename = key; } diff --git a/src/include/fst/extensions/far/far.h b/src/include/fst/extensions/far/far.h index 82b9e5c..acce76e 100644 --- a/src/include/fst/extensions/far/far.h +++ b/src/include/fst/extensions/far/far.h @@ -32,6 +32,13 @@ namespace fst { enum FarEntryType { FET_LINE, FET_FILE }; enum FarTokenType { FTT_SYMBOL, FTT_BYTE, FTT_UTF8 }; +inline bool IsFst(const string &filename) { + ifstream strm(filename.c_str()); + if (!strm) + return false; + return IsFstHeader(strm, filename); +} + // FST archive header class class FarHeader { public: @@ -40,8 +47,11 @@ class FarHeader { bool Read(const string &filename) { FstHeader fsthdr; - if (filename.empty()) { // Header reading unsupported on stdin. - return false; + if (filename.empty()) { + // Header reading unsupported on stdin. Assumes STList and StdArc. + fartype_ = "stlist"; + arctype_ = "standard"; + return true; } else if (IsSTTable(filename)) { // Check if STTable ReadSTTableHeader(filename, &fsthdr); fartype_ = "sttable"; @@ -52,6 +62,12 @@ class FarHeader { fartype_ = "sttable"; arctype_ = fsthdr.ArcType().empty() ? "unknown" : fsthdr.ArcType(); return true; + } else if (IsFst(filename)) { // Check if Fst + ifstream istrm(filename.c_str()); + fsthdr.Read(istrm, filename); + fartype_ = "fst"; + arctype_ = fsthdr.ArcType().empty() ? "unknown" : fsthdr.ArcType(); + return true; } return false; } @@ -61,8 +77,12 @@ class FarHeader { string arctype_; }; -enum FarType { FAR_DEFAULT = 0, FAR_STTABLE = 1, FAR_STLIST = 2, - FAR_SSTABLE = 3 }; +enum FarType { + FAR_DEFAULT = 0, + FAR_STTABLE = 1, + FAR_STLIST = 2, + FAR_FST = 3, +}; // This class creates an archive of FSTs. template @@ -153,7 +173,7 @@ class STTableFarWriter : public FarWriter { public: typedef A Arc; - static STTableFarWriter *Create(const string filename) { + static STTableFarWriter *Create(const string &filename) { STTableWriter, FstWriter > *writer = STTableWriter, FstWriter >::Create(filename); return new STTableFarWriter(writer); @@ -183,7 +203,7 @@ class STListFarWriter : public FarWriter { public: typedef A Arc; - static STListFarWriter *Create(const string filename) { + static STListFarWriter *Create(const string &filename) { STListWriter, FstWriter > *writer = STListWriter, FstWriter >::Create(filename); return new STListFarWriter(writer); @@ -208,6 +228,43 @@ class STListFarWriter : public FarWriter { }; +template +class FstFarWriter : public FarWriter { + public: + typedef A Arc; + + explicit FstFarWriter(const string &filename) + : filename_(filename), error_(false), written_(false) {} + + static FstFarWriter *Create(const string &filename) { + return new FstFarWriter(filename); + } + + void Add(const string &key, const Fst &fst) { + if (written_) { + LOG(WARNING) << "FstFarWriter::Add: only one Fst supported," + << " subsequent entries discarded."; + } else { + error_ = !fst.Write(filename_); + written_ = true; + } + } + + FarType Type() const { return FAR_FST; } + + bool Error() const { return error_; } + + ~FstFarWriter() {} + + private: + string filename_; + bool error_; + bool written_; + + DISALLOW_COPY_AND_ASSIGN(FstFarWriter); +}; + + template FarWriter *FarWriter::Create(const string &filename, FarType type) { switch(type) { @@ -220,6 +277,9 @@ FarWriter *FarWriter::Create(const string &filename, FarType type) { case FAR_STLIST: return STListFarWriter::Create(filename); break; + case FAR_FST: + return FstFarWriter::Create(filename); + break; default: LOG(ERROR) << "FarWriter::Create: unknown far type"; return 0; @@ -331,6 +391,114 @@ class STListFarReader : public FarReader { DISALLOW_COPY_AND_ASSIGN(STListFarReader); }; +template +class FstFarReader : public FarReader { + public: + typedef A Arc; + + static FstFarReader *Open(const string &filename) { + vector filenames; + filenames.push_back(filename); + return new FstFarReader(filenames); + } + + static FstFarReader *Open(const vector &filenames) { + return new FstFarReader(filenames); + } + + FstFarReader(const vector &filenames) + : keys_(filenames), has_stdin_(false), pos_(0), fst_(0), error_(false) { + sort(keys_.begin(), keys_.end()); + streams_.resize(keys_.size(), 0); + for (size_t i = 0; i < keys_.size(); ++i) { + if (keys_[i].empty()) { + if (!has_stdin_) { + streams_[i] = &cin; + //sources_[i] = "stdin"; + has_stdin_ = true; + } else { + FSTERROR() << "FstFarReader::FstFarReader: stdin should only " + << "appear once in the input file list."; + error_ = true; + return; + } + } else { + streams_[i] = new ifstream( + keys_[i].c_str(), ifstream::in | ifstream::binary); + } + } + if (pos_ >= keys_.size()) return; + ReadFst(); + } + + void Reset() { + if (has_stdin_) { + FSTERROR() << "FstFarReader::Reset: operation not supported on stdin"; + error_ = true; + return; + } + pos_ = 0; + ReadFst(); + } + + bool Find(const string &key) { + if (has_stdin_) { + FSTERROR() << "FstFarReader::Find: operation not supported on stdin"; + error_ = true; + return false; + } + pos_ = 0;//TODO + ReadFst(); + return true; + } + + bool Done() const { return error_ || pos_ >= keys_.size(); } + + void Next() { + ++pos_; + ReadFst(); + } + + const string &GetKey() const { + return keys_[pos_]; + } + + const Fst &GetFst() const { + return *fst_; + } + + FarType Type() const { return FAR_FST; } + + bool Error() const { return error_; } + + ~FstFarReader() { + if (fst_) delete fst_; + for (size_t i = 0; i < keys_.size(); ++i) + delete streams_[i]; + } + + private: + void ReadFst() { + if (fst_) delete fst_; + if (pos_ >= keys_.size()) return; + streams_[pos_]->seekg(0); + fst_ = Fst::Read(*streams_[pos_], FstReadOptions()); + if (!fst_) { + FSTERROR() << "FstFarReader: error reading Fst from: " << keys_[pos_]; + error_ = true; + } + } + + private: + vector keys_; + vector streams_; + bool has_stdin_; + size_t pos_; + mutable Fst *fst_; + mutable bool error_; + + DISALLOW_COPY_AND_ASSIGN(FstFarReader); +}; template FarReader *FarReader::Open(const string &filename) { @@ -340,6 +508,8 @@ FarReader *FarReader::Open(const string &filename) { return STTableFarReader::Open(filename); else if (IsSTList(filename)) return STListFarReader::Open(filename); + else if (IsFst(filename)) + return FstFarReader::Open(filename); return 0; } @@ -352,6 +522,8 @@ FarReader *FarReader::Open(const vector &filenames) { return STTableFarReader::Open(filenames); else if (!filenames.empty() && IsSTList(filenames[0])) return STListFarReader::Open(filenames); + else if (!filenames.empty() && IsFst(filenames[0])) + return FstFarReader::Open(filenames); return 0; } diff --git a/src/include/fst/extensions/far/farscript.h b/src/include/fst/extensions/far/farscript.h index 9c3b1ca..3a9c145 100644 --- a/src/include/fst/extensions/far/farscript.h +++ b/src/include/fst/extensions/far/farscript.h @@ -27,6 +27,7 @@ using std::vector; #include #include #include +#include #include #include #include @@ -51,6 +52,8 @@ struct FarCompileStringsArgs { const FarTokenType tt; const string &symbols_fname; const string &unknown_symbol; + const bool keep_symbols; + const bool initial_symbols; const bool allow_negative_labels; const bool file_list_input; const string &key_prefix; @@ -65,6 +68,8 @@ struct FarCompileStringsArgs { FarTokenType tt, const string &symbols_fname, const string &unknown_symbol, + bool keep_symbols, + bool initial_symbols, bool allow_negative_labels, bool file_list_input, const string &key_prefix, @@ -72,6 +77,7 @@ struct FarCompileStringsArgs { in_fnames(in_fnames), out_fname(out_fname), fst_type(fst_type), far_type(far_type), generate_keys(generate_keys), fet(fet), tt(tt), symbols_fname(symbols_fname), unknown_symbol(unknown_symbol), + keep_symbols(keep_symbols), initial_symbols(initial_symbols), allow_negative_labels(allow_negative_labels), file_list_input(file_list_input), key_prefix(key_prefix), key_suffix(key_suffix) { } @@ -82,7 +88,8 @@ void FarCompileStrings(FarCompileStringsArgs *args) { fst::FarCompileStrings( args->in_fnames, args->out_fname, args->fst_type, args->far_type, args->generate_keys, args->fet, args->tt, args->symbols_fname, - args->unknown_symbol, args->allow_negative_labels, args->file_list_input, + args->unknown_symbol, args->keep_symbols, args->initial_symbols, + args->allow_negative_labels, args->file_list_input, args->key_prefix, args->key_suffix); } @@ -97,6 +104,8 @@ void FarCompileStrings( FarTokenType tt, const string &symbols_fname, const string &unknown_symbol, + bool keep_symbols, + bool initial_symbols, bool allow_negative_labels, bool file_list_input, const string &key_prefix, @@ -143,6 +152,25 @@ void FarCreate(const vector &in_fnames, const string &key_suffix); +typedef args::Package FarEqualInnerArgs; +typedef args::WithReturnValue FarEqualArgs; + +template +void FarEqual(FarEqualArgs *args) { + args->retval = fst::FarEqual( + args->args.arg1, args->args.arg2, args->args.arg3, + args->args.arg4, args->args.arg5); +} + +bool FarEqual(const string &filename1, + const string &filename2, + const string &arc_type, + float delta = kDelta, + const string &begin_key = string(), + const string &end_key = string()); + + typedef args::Package &, int32, const string&, const string&, const string&, const string&> FarExtractArgs; @@ -180,7 +208,9 @@ struct FarPrintStringsArgs { const string &begin_key; const string &end_key; const bool print_key; + const bool print_weight; const string &symbols_fname; + const bool initial_symbols; const int32 generate_filenames; const string &filename_prefix; const string &filename_suffix; @@ -188,12 +218,14 @@ struct FarPrintStringsArgs { FarPrintStringsArgs( const vector &ifilenames, const FarEntryType entry_type, const FarTokenType token_type, const string &begin_key, - const string &end_key, const bool print_key, - const string &symbols_fname, const int32 generate_filenames, + const string &end_key, const bool print_key, const bool print_weight, + const string &symbols_fname, const bool initial_symbols, + const int32 generate_filenames, const string &filename_prefix, const string &filename_suffix) : ifilenames(ifilenames), entry_type(entry_type), token_type(token_type), - begin_key(begin_key), end_key(end_key), print_key(print_key), - symbols_fname(symbols_fname), + begin_key(begin_key), end_key(end_key), + print_key(print_key), print_weight(print_weight), + symbols_fname(symbols_fname), initial_symbols(initial_symbols), generate_filenames(generate_filenames), filename_prefix(filename_prefix), filename_suffix(filename_suffix) { } }; @@ -202,9 +234,9 @@ template void FarPrintStrings(FarPrintStringsArgs *args) { fst::FarPrintStrings( args->ifilenames, args->entry_type, args->token_type, - args->begin_key, args->end_key, args->print_key, - args->symbols_fname, args->generate_filenames, args->filename_prefix, - args->filename_suffix); + args->begin_key, args->end_key, args->print_key, args->print_weight, + args->symbols_fname, args->initial_symbols, args->generate_filenames, + args->filename_prefix, args->filename_suffix); } @@ -215,7 +247,9 @@ void FarPrintStrings(const vector &ifilenames, const string &begin_key, const string &end_key, const bool print_key, + const bool print_weight, const string &symbols_fname, + const bool initial_symbols, const int32 generate_filenames, const string &filename_prefix, const string &filename_suffix); @@ -227,6 +261,7 @@ void FarPrintStrings(const vector &ifilenames, #define REGISTER_FST_FAR_OPERATIONS(ArcType) \ REGISTER_FST_OPERATION(FarCompileStrings, ArcType, FarCompileStringsArgs); \ REGISTER_FST_OPERATION(FarCreate, ArcType, FarCreateArgs); \ + REGISTER_FST_OPERATION(FarEqual, ArcType, FarEqualArgs); \ REGISTER_FST_OPERATION(FarExtract, ArcType, FarExtractArgs); \ REGISTER_FST_OPERATION(FarInfo, ArcType, FarInfoArgs); \ REGISTER_FST_OPERATION(FarPrintStrings, ArcType, FarPrintStringsArgs) diff --git a/src/include/fst/extensions/far/info.h b/src/include/fst/extensions/far/info.h index f010546..100fe68 100644 --- a/src/include/fst/extensions/far/info.h +++ b/src/include/fst/extensions/far/info.h @@ -34,7 +34,7 @@ void CountStatesAndArcs(const Fst &fst, size_t *nstate, size_t *narc) { StateIterator > siter(fst); for (; !siter.Done(); siter.Next(), ++(*nstate)) { ArcIterator > aiter(fst, siter.Value()); - for (; !aiter.Done(); aiter.Next(), ++(*narc)); + for (; !aiter.Done(); aiter.Next(), ++(*narc)) {} } } diff --git a/src/include/fst/extensions/far/print-strings.h b/src/include/fst/extensions/far/print-strings.h index aff1e51..dcc7351 100644 --- a/src/include/fst/extensions/far/print-strings.h +++ b/src/include/fst/extensions/far/print-strings.h @@ -27,17 +27,21 @@ using std::vector; #include +#include #include +DECLARE_string(far_field_separator); + namespace fst { template void FarPrintStrings( const vector &ifilenames, const FarEntryType entry_type, const FarTokenType far_token_type, const string &begin_key, - const string &end_key, const bool print_key, const string &symbols_fname, - const int32 generate_filenames, const string &filename_prefix, - const string &filename_suffix) { + const string &end_key, const bool print_key, const bool print_weight, + const string &symbols_fname, const bool initial_symbols, + const int32 generate_filenames, + const string &filename_prefix, const string &filename_suffix) { typename StringPrinter::TokenType token_type; if (far_token_type == FTT_SYMBOL) { @@ -54,7 +58,9 @@ void FarPrintStrings( const SymbolTable *syms = 0; if (!symbols_fname.empty()) { // allow negative flag? - syms = SymbolTable::ReadText(symbols_fname, true); + SymbolTableTextOptions opts; + opts.allow_negative = true; + syms = SymbolTable::ReadText(symbols_fname, opts); if (!syms) { FSTERROR() << "FarPrintStrings: error reading symbol table: " << symbols_fname; @@ -62,8 +68,6 @@ void FarPrintStrings( } } - StringPrinter string_printer(token_type, syms); - FarReader *far_reader = FarReader::Open(ifilenames); if (!far_reader) return; @@ -83,14 +87,21 @@ void FarPrintStrings( okey = key; const Fst &fst = far_reader->GetFst(); + if (i == 1 && initial_symbols && syms == 0 && fst.InputSymbols() != 0) + syms = fst.InputSymbols()->Copy(); string str; VLOG(2) << "Handling key: " << key; + StringPrinter string_printer( + token_type, syms ? syms : fst.InputSymbols()); string_printer(fst, &str); if (entry_type == FET_LINE) { if (print_key) - cout << key << "\t"; - cout << str << endl; + cout << key << FLAGS_far_field_separator[0]; + cout << str; + if (print_weight) + cout << FLAGS_far_field_separator[0] << ShortestDistance(fst); + cout << endl; } else if (entry_type == FET_FILE) { stringstream sstrm; if (generate_filenames) { @@ -117,6 +128,7 @@ void FarPrintStrings( ostrm << "\n"; } } + delete syms; } diff --git a/src/include/fst/extensions/far/stlist.h b/src/include/fst/extensions/far/stlist.h index 4738181..1cdc80c 100644 --- a/src/include/fst/extensions/far/stlist.h +++ b/src/include/fst/extensions/far/stlist.h @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -58,7 +59,7 @@ class STListWriter { explicit STListWriter(const string filename) : stream_( - filename.empty() ? &std::cout : + filename.empty() ? &cout : new ofstream(filename.c_str(), ofstream::out | ofstream::binary)), error_(false) { WriteType(*stream_, kSTListMagicNumber); @@ -92,7 +93,7 @@ class STListWriter { ~STListWriter() { WriteType(*stream_, string()); - if (stream_ != &std::cout) + if (stream_ != &cout) delete stream_; } @@ -127,7 +128,7 @@ class STListReader { for (size_t i = 0; i < filenames.size(); ++i) { if (filenames[i].empty()) { if (!has_stdin) { - streams_[i] = &std::cin; + streams_[i] = &cin; sources_[i] = "stdin"; has_stdin = true; } else { @@ -177,7 +178,7 @@ class STListReader { ~STListReader() { for (size_t i = 0; i < streams_.size(); ++i) { - if (streams_[i] != &std::cin) + if (streams_[i] != &cin) delete streams_[i]; } if (entry_) diff --git a/src/include/fst/extensions/far/sttable.h b/src/include/fst/extensions/far/sttable.h index 3a03133..3ce0a4b 100644 --- a/src/include/fst/extensions/far/sttable.h +++ b/src/include/fst/extensions/far/sttable.h @@ -29,6 +29,7 @@ #include #include #include +#include #include namespace fst { diff --git a/src/include/fst/extensions/ngram/bitmap-index.h b/src/include/fst/extensions/ngram/bitmap-index.h new file mode 100644 index 0000000..f5a5ba7 --- /dev/null +++ b/src/include/fst/extensions/ngram/bitmap-index.h @@ -0,0 +1,183 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: sorenj@google.com (Jeffrey Sorensen) + +#ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_ +#define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_ + +#include +using std::vector; + +#include + +// This class is a bitstring storage class with an index that allows +// seeking to the Nth set or clear bit in time O(Log(N)) where N is +// the length of the bit vector. In addition, it allows counting set or +// clear bits over ranges in constant time. +// +// This is accomplished by maintaining an "secondary" index of limited +// size in bits that maintains a running count of the number of bits set +// in each block of bitmap data. A block is defined as the number of +// uint64 values that can fit in the secondary index before an overflow +// occurs. +// +// To handle overflows, a "primary" index containing a running count of +// bits set in each block is created using the type uint64. + +namespace fst { + +class BitmapIndex { + public: + static size_t StorageSize(size_t size) { + return ((size + kStorageBlockMask) >> kStorageLogBitSize); + } + + BitmapIndex() : bits_(NULL), size_(0) { } + + bool Get(size_t index) const { + return (bits_[index >> kStorageLogBitSize] & + (kOne << (index & kStorageBlockMask))) != 0; + } + + static void Set(uint64* bits, size_t index) { + bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask)); + } + + static void Clear(uint64* bits, size_t index) { + bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask)); + } + + size_t Bits() const { + return size_; + } + + size_t ArraySize() const { + return StorageSize(size_); + } + + // Returns the number of one bits in the bitmap + size_t GetOnesCount() const { + return primary_index_[primary_index_size() - 1]; + } + + // Returns the number of one bits in positions 0 to limit - 1. + // REQUIRES: limit <= Bits() + size_t Rank1(size_t end) const; + + // Returns the number of one bits in the range start to end - 1. + // REQUIRES: limit <= Bits() + size_t GetOnesCountInRange(size_t start, size_t end) const { + return Rank1(end) - Rank1(start); + } + + // Returns the number of zero bits in positions 0 to limit - 1. + // REQUIRES: limit <= Bits() + size_t Rank0(size_t end) const { + return end - Rank1(end); + } + + // Returns the number of zero bits in the range start to end - 1. + // REQUIRES: limit <= Bits() + size_t GetZeroesCountInRange(size_t start, size_t end) const { + return end - start - GetOnesCountInRange(start, end); + } + + // Return true if any bit between begin inclusive and end exclusive + // is set. 0 <= begin <= end <= Bits() is required. + // + bool TestRange(size_t start, size_t end) const { + return Rank1(end) > Rank1(start); + } + + // Returns the offset to the nth set bit (zero based) + // or Bits() if index >= number of ones + size_t Select1(size_t bit_index) const; + + // Returns the offset to the nth clear bit (zero based) + // or Bits() if index > number of + size_t Select0(size_t bit_index) const; + + // Rebuilds from index for the associated Bitmap, should be called + // whenever changes have been made to the Bitmap or else behavior + // of the indexed bitmap methods will be undefined. + void BuildIndex(const uint64 *bits, size_t size); + + // the secondary index accumulates counts until it can possibly overflow + // this constant computes the number of uint64 units that can fit into + // units the size of uint16. + static const uint64 kOne = 1; + static const uint32 kStorageBitSize = 64; + static const uint32 kStorageLogBitSize = 6; + static const uint32 kSecondaryBlockSize = ((1 << 16) - 1) + >> kStorageLogBitSize; + + private: + static const uint32 kStorageBlockMask = kStorageBitSize - 1; + + // returns, from the index, the count of ones up to array_index + size_t get_index_ones_count(size_t array_index) const; + + // because the indexes, both primary and secondary, contain a running + // count of the population of one bits contained in [0,i), there is + // no reason to have an element in the zeroth position as this value would + // necessarily be zero. (The bits are indexed in a zero based way.) Thus + // we don't store the 0th element in either index. Both of the following + // functions, if greater than 0, must be decremented by one before retreiving + // the value from the corresponding array. + // returns the 1 + the block that contains the bitindex in question + // the inverted version works the same but looks for zeros using an inverted + // view of the index + size_t find_primary_block(size_t bit_index) const; + + size_t find_inverted_primary_block(size_t bit_index) const; + + // similarly, the secondary index (which resets its count to zero at + // the end of every kSecondaryBlockSize entries) does not store the element + // at 0. Note that the rem_bit_index parameter is the number of bits + // within the secondary block, after the bits accounted for by the primary + // block have been removed (i.e. the remaining bits) And, because we + // reset to zero with each new block, there is no need to store those + // actual zeros. + // returns 1 + the secondary block that contains the bitindex in question + size_t find_secondary_block(size_t block, size_t rem_bit_index) const; + + size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index) + const; + + // We create a primary index based upon the number of secondary index + // blocks. The primary index uses fields wide enough to accomodate any + // index of the bitarray so cannot overflow + // The primary index is the actual running + // count of one bits set for all blocks (and, thus, all uint64s). + size_t primary_index_size() const { + return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize; + } + + const uint64* bits_; + size_t size_; + + // The primary index contains the running popcount of all blocks + // which means the nth value contains the popcounts of + // [0,n*kSecondaryBlockSize], however, the 0th element is omitted. + vector primary_index_; + // The secondary index contains the running popcount of the associated + // bitmap. It is the same length (in units of uint16) as the + // bitmap's map is in units of uint64s. + vector secondary_index_; +}; + +} // end namespace fst + +#endif // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_ diff --git a/src/include/fst/extensions/ngram/ngram-fst.h b/src/include/fst/extensions/ngram/ngram-fst.h new file mode 100644 index 0000000..eee664a --- /dev/null +++ b/src/include/fst/extensions/ngram/ngram-fst.h @@ -0,0 +1,912 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: sorenj@google.com (Jeffrey Sorensen) +// +#ifndef FST_EXTENSIONS_NGRAM_NGRAM_FST_H_ +#define FST_EXTENSIONS_NGRAM_NGRAM_FST_H_ + +#include +#include +#include +#include +#include +using std::vector; + +#include +#include +#include + +// NgramFst implements a n-gram language model based upon the LOUDS data +// structure. Please refer to "Unary Data Strucutres for Language Models" +// http://research.google.com/pubs/archive/37218.pdf + +namespace fst { +template class NGramFst; +template class NGramFstMatcher; + +// Instance data containing mutable state for bookkeeping repeated access to +// the same state. +template +struct NGramFstInst { + typedef typename A::Label Label; + typedef typename A::StateId StateId; + typedef typename A::Weight Weight; + StateId state_; + size_t num_futures_; + size_t offset_; + size_t node_; + StateId node_state_; + vector { + using FstImpl::SetInputSymbols; + using FstImpl::SetOutputSymbols; + using FstImpl::SetType; + using FstImpl::WriteHeader; + + friend class ArcIterator >; + friend class NGramFstMatcher; + + public: + using FstImpl::InputSymbols; + using FstImpl::SetProperties; + using FstImpl::Properties; + + typedef A Arc; + typedef typename A::Label Label; + typedef typename A::StateId StateId; + typedef typename A::Weight Weight; + + NGramFstImpl() : data_(0), owned_(false) { + SetType("ngram"); + SetInputSymbols(NULL); + SetOutputSymbols(NULL); + SetProperties(kStaticProperties); + } + + NGramFstImpl(const Fst &fst, vector* order_out); + + ~NGramFstImpl() { + if (owned_) { + delete [] data_; + } + } + + static NGramFstImpl* Read(istream &strm, // NOLINT + const FstReadOptions &opts) { + NGramFstImpl* impl = new NGramFstImpl(); + FstHeader hdr; + if (!impl->ReadHeader(strm, opts, kMinFileVersion, &hdr)) return 0; + uint64 num_states, num_futures, num_final; + const size_t offset = sizeof(num_states) + sizeof(num_futures) + + sizeof(num_final); + // Peek at num_states and num_futures to see how much more needs to be read. + strm.read(reinterpret_cast(&num_states), sizeof(num_states)); + strm.read(reinterpret_cast(&num_futures), sizeof(num_futures)); + strm.read(reinterpret_cast(&num_final), sizeof(num_final)); + size_t size = Storage(num_states, num_futures, num_final); + char* data = new char[size]; + // Copy num_states, num_futures and num_final back into data. + memcpy(data, reinterpret_cast(&num_states), sizeof(num_states)); + memcpy(data + sizeof(num_states), reinterpret_cast(&num_futures), + sizeof(num_futures)); + memcpy(data + sizeof(num_states) + sizeof(num_futures), + reinterpret_cast(&num_final), sizeof(num_final)); + strm.read(data + offset, size - offset); + if (!strm) { + delete impl; + return NULL; + } + impl->Init(data, true /* owned */); + return impl; + } + + bool Write(ostream &strm, // NOLINT + const FstWriteOptions &opts) const { + FstHeader hdr; + hdr.SetStart(Start()); + hdr.SetNumStates(num_states_); + WriteHeader(strm, opts, kFileVersion, &hdr); + strm.write(data_, Storage(num_states_, num_futures_, num_final_)); + return strm; + } + + StateId Start() const { + return 1; + } + + Weight Final(StateId state) const { + if (final_index_.Get(state)) { + return final_probs_[final_index_.Rank1(state)]; + } else { + return Weight::Zero(); + } + } + + size_t NumArcs(StateId state, NGramFstInst *inst = NULL) const { + if (inst == NULL) { + const size_t next_zero = future_index_.Select0(state + 1); + const size_t this_zero = future_index_.Select0(state); + return next_zero - this_zero - 1; + } + SetInstFuture(state, inst); + return inst->num_futures_ + ((state == 0) ? 0 : 1); + } + + size_t NumInputEpsilons(StateId state) const { + // State 0 has no parent, thus no backoff. + if (state == 0) return 0; + return 1; + } + + size_t NumOutputEpsilons(StateId state) const { + return NumInputEpsilons(state); + } + + StateId NumStates() const { + return num_states_; + } + + void InitStateIterator(StateIteratorData* data) const { + data->base = 0; + data->nstates = num_states_; + } + + static size_t Storage(uint64 num_states, uint64 num_futures, + uint64 num_final) { + uint64 b64; + Weight weight; + Label label; + size_t offset = sizeof(num_states) + sizeof(num_futures) + + sizeof(num_final); + offset += sizeof(b64) * ( + BitmapIndex::StorageSize(num_states * 2 + 1) + + BitmapIndex::StorageSize(num_futures + num_states + 1) + + BitmapIndex::StorageSize(num_states)); + offset += (num_states + 1) * sizeof(label) + num_futures * sizeof(label); + // Pad for alignemnt, see + // http://en.wikipedia.org/wiki/Data_structure_alignment#Computing_padding + offset = (offset + sizeof(weight) - 1) & ~(sizeof(weight) - 1); + offset += (num_states + 1) * sizeof(weight) + num_final * sizeof(weight) + + (num_futures + 1) * sizeof(weight); + return offset; + } + + void SetInstFuture(StateId state, NGramFstInst *inst) const { + if (inst->state_ != state) { + inst->state_ = state; + const size_t next_zero = future_index_.Select0(state + 1); + const size_t this_zero = future_index_.Select0(state); + inst->num_futures_ = next_zero - this_zero - 1; + inst->offset_ = future_index_.Rank1(future_index_.Select0(state) + 1); + } + } + + void SetInstNode(NGramFstInst *inst) const { + if (inst->node_state_ != inst->state_) { + inst->node_state_ = inst->state_; + inst->node_ = context_index_.Select1(inst->state_); + } + } + + void SetInstContext(NGramFstInst *inst) const { + SetInstNode(inst); + if (inst->context_state_ != inst->state_) { + inst->context_state_ = inst->state_; + inst->context_.clear(); + size_t node = inst->node_; + while (node != 0) { + inst->context_.push_back(context_words_[context_index_.Rank1(node)]); + node = context_index_.Select1(context_index_.Rank0(node) - 1); + } + } + } + + // Access to the underlying representation + const char* GetData(size_t* data_size) const { + *data_size = Storage(num_states_, num_futures_, num_final_); + return data_; + } + + void Init(const char* data, bool owned); + + private: + StateId Transition(const vector &); // Disallow +}; + +template +NGramFstImpl::NGramFstImpl(const Fst &fst, vector* order_out) + : data_(0), owned_(false) { + typedef A Arc; + typedef typename Arc::Label Label; + typedef typename Arc::Weight Weight; + typedef typename Arc::StateId StateId; + SetType("ngram"); + SetInputSymbols(fst.InputSymbols()); + SetOutputSymbols(fst.OutputSymbols()); + SetProperties(kStaticProperties); + + // Check basic requirements for an OpenGRM language model Fst. + int64 props = kAcceptor | kIDeterministic | kIEpsilons | kILabelSorted; + if (fst.Properties(props, true) != props) { + FSTERROR() << "NGramFst only accepts OpenGRM langauge models as input"; + SetProperties(kError, kError); + return; + } + + int64 num_states = CountStates(fst); + Label* context = new Label[num_states]; + + // Find the unigram state by starting from the start state, following + // epsilons. + StateId unigram = fst.Start(); + while (1) { + ArcIterator > aiter(fst, unigram); + if (aiter.Done()) { + FSTERROR() << "Start state has no arcs"; + SetProperties(kError, kError); + return; + } + if (aiter.Value().ilabel != 0) break; + unigram = aiter.Value().nextstate; + } + + // Each state's context is determined by the subtree it is under from the + // unigram state. + queue > label_queue; + vector visited(num_states); + // Force an epsilon link to the start state. + label_queue.push(make_pair(fst.Start(), 0)); + for (ArcIterator > aiter(fst, unigram); + !aiter.Done(); aiter.Next()) { + label_queue.push(make_pair(aiter.Value().nextstate, aiter.Value().ilabel)); + } + // investigate states in breadth first fashion to assign context words. + while (!label_queue.empty()) { + pair &now = label_queue.front(); + if (!visited[now.first]) { + context[now.first] = now.second; + visited[now.first] = true; + for (ArcIterator > aiter(fst, now.first); + !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + if (arc.ilabel != 0) { + label_queue.push(make_pair(arc.nextstate, now.second)); + } + } + } + label_queue.pop(); + } + visited.clear(); + + // The arc from the start state should be assigned an epsilon to put it + // in front of the all other labels (which makes Start state 1 after + // unigram which is state 0). + context[fst.Start()] = 0; + + // Build the tree of contexts fst by reversing the epsilon arcs from fst. + VectorFst context_fst; + uint64 num_final = 0; + for (int i = 0; i < num_states; ++i) { + if (fst.Final(i) != Weight::Zero()) { + ++num_final; + } + context_fst.SetFinal(context_fst.AddState(), fst.Final(i)); + } + context_fst.SetStart(unigram); + context_fst.SetInputSymbols(fst.InputSymbols()); + context_fst.SetOutputSymbols(fst.OutputSymbols()); + int64 num_context_arcs = 0; + int64 num_futures = 0; + for (StateIterator > siter(fst); !siter.Done(); siter.Next()) { + const StateId &state = siter.Value(); + num_futures += fst.NumArcs(state) - fst.NumInputEpsilons(state); + ArcIterator > aiter(fst, state); + if (!aiter.Done()) { + const Arc &arc = aiter.Value(); + // this arc goes from state to arc.nextstate, so create an arc from + // arc.nextstate to state to reverse it. + if (arc.ilabel == 0) { + context_fst.AddArc(arc.nextstate, Arc(context[state], context[state], + arc.weight, state)); + num_context_arcs++; + } + } + } + if (num_context_arcs != context_fst.NumStates() - 1) { + FSTERROR() << "Number of contexts arcs != number of states - 1"; + SetProperties(kError, kError); + return; + } + if (context_fst.NumStates() != num_states) { + FSTERROR() << "Number of contexts != number of states"; + SetProperties(kError, kError); + return; + } + int64 context_props = context_fst.Properties(kIDeterministic | + kILabelSorted, true); + if (!(context_props & kIDeterministic)) { + FSTERROR() << "Input fst is not structured properly"; + SetProperties(kError, kError); + return; + } + if (!(context_props & kILabelSorted)) { + ArcSort(&context_fst, ILabelCompare()); + } + + delete [] context; + + uint64 b64; + Weight weight; + Label label = kNoLabel; + const size_t storage = Storage(num_states, num_futures, num_final); + char* data = new char[storage]; + memset(data, 0, storage); + size_t offset = 0; + memcpy(data + offset, reinterpret_cast(&num_states), + sizeof(num_states)); + offset += sizeof(num_states); + memcpy(data + offset, reinterpret_cast(&num_futures), + sizeof(num_futures)); + offset += sizeof(num_futures); + memcpy(data + offset, reinterpret_cast(&num_final), + sizeof(num_final)); + offset += sizeof(num_final); + uint64* context_bits = reinterpret_cast(data + offset); + offset += BitmapIndex::StorageSize(num_states * 2 + 1) * sizeof(b64); + uint64* future_bits = reinterpret_cast(data + offset); + offset += + BitmapIndex::StorageSize(num_futures + num_states + 1) * sizeof(b64); + uint64* final_bits = reinterpret_cast(data + offset); + offset += BitmapIndex::StorageSize(num_states) * sizeof(b64); + Label* context_words = reinterpret_cast(data + offset); + offset += (num_states + 1) * sizeof(label); + Label* future_words = reinterpret_cast(data + offset); + offset += num_futures * sizeof(label); + offset = (offset + sizeof(weight) - 1) & ~(sizeof(weight) - 1); + Weight* backoff = reinterpret_cast(data + offset); + offset += (num_states + 1) * sizeof(weight); + Weight* final_probs = reinterpret_cast(data + offset); + offset += num_final * sizeof(weight); + Weight* future_probs = reinterpret_cast(data + offset); + int64 context_arc = 0, future_arc = 0, context_bit = 0, future_bit = 0, + final_bit = 0; + + // pseudo-root bits + BitmapIndex::Set(context_bits, context_bit++); + ++context_bit; + context_words[context_arc] = label; + backoff[context_arc] = Weight::Zero(); + context_arc++; + + ++future_bit; + if (order_out) { + order_out->clear(); + order_out->resize(num_states); + } + + queue context_q; + context_q.push(context_fst.Start()); + StateId state_number = 0; + while (!context_q.empty()) { + const StateId &state = context_q.front(); + if (order_out) { + (*order_out)[state] = state_number; + } + + const Weight &final = context_fst.Final(state); + if (final != Weight::Zero()) { + BitmapIndex::Set(final_bits, state_number); + final_probs[final_bit] = final; + ++final_bit; + } + + for (ArcIterator > aiter(context_fst, state); + !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + context_words[context_arc] = arc.ilabel; + backoff[context_arc] = arc.weight; + ++context_arc; + BitmapIndex::Set(context_bits, context_bit++); + context_q.push(arc.nextstate); + } + ++context_bit; + + for (ArcIterator > aiter(fst, state); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + if (arc.ilabel != 0) { + future_words[future_arc] = arc.ilabel; + future_probs[future_arc] = arc.weight; + ++future_arc; + BitmapIndex::Set(future_bits, future_bit++); + } + } + ++future_bit; + ++state_number; + context_q.pop(); + } + + if ((state_number != num_states) || + (context_bit != num_states * 2 + 1) || + (context_arc != num_states) || + (future_arc != num_futures) || + (future_bit != num_futures + num_states + 1) || + (final_bit != num_final)) { + FSTERROR() << "Structure problems detected during construction"; + SetProperties(kError, kError); + return; + } + + Init(data, true /* owned */); +} + +template +inline void NGramFstImpl::Init(const char* data, bool owned) { + if (owned_) { + delete [] data_; + } + owned_ = owned; + data_ = data; + size_t offset = 0; + num_states_ = *(reinterpret_cast(data_ + offset)); + offset += sizeof(num_states_); + num_futures_ = *(reinterpret_cast(data_ + offset)); + offset += sizeof(num_futures_); + num_final_ = *(reinterpret_cast(data_ + offset)); + offset += sizeof(num_final_); + uint64 bits; + size_t context_bits = num_states_ * 2 + 1; + size_t future_bits = num_futures_ + num_states_ + 1; + context_ = reinterpret_cast(data_ + offset); + offset += BitmapIndex::StorageSize(context_bits) * sizeof(bits); + future_ = reinterpret_cast(data_ + offset); + offset += BitmapIndex::StorageSize(future_bits) * sizeof(bits); + final_ = reinterpret_cast(data_ + offset); + offset += BitmapIndex::StorageSize(num_states_ + 1) * sizeof(bits); + context_words_ = reinterpret_cast(data_ + offset); + offset += (num_states_ + 1) * sizeof(*context_words_); + future_words_ = reinterpret_cast(data_ + offset); + offset += num_futures_ * sizeof(*future_words_); + offset = (offset + sizeof(*backoff_) - 1) & ~(sizeof(*backoff_) - 1); + backoff_ = reinterpret_cast(data_ + offset); + offset += (num_states_ + 1) * sizeof(*backoff_); + final_probs_ = reinterpret_cast(data_ + offset); + offset += num_final_ * sizeof(*final_probs_); + future_probs_ = reinterpret_cast(data_ + offset); + + context_index_.BuildIndex(context_, context_bits); + future_index_.BuildIndex(future_, future_bits); + final_index_.BuildIndex(final_, num_states_); + + const size_t node_rank = context_index_.Rank1(0); + root_first_child_ = context_index_.Select0(node_rank) + 1; + if (context_index_.Get(root_first_child_) == false) { + FSTERROR() << "Missing unigrams"; + SetProperties(kError, kError); + return; + } + const size_t last_child = context_index_.Select0(node_rank + 1) - 1; + root_num_children_ = last_child - root_first_child_ + 1; + root_children_ = context_words_ + context_index_.Rank1(root_first_child_); +} + +template +inline typename A::StateId NGramFstImpl::Transition( + const vector; + + public: + typedef A Arc; + typedef typename A::StateId StateId; + typedef typename A::Label Label; + typedef typename A::Weight Weight; + typedef NGramFstImpl Impl; + + explicit NGramFst(const Fst &dst) + : ImplToExpandedFst(new Impl(dst, NULL)) {} + + NGramFst(const Fst &fst, vector* order_out) + : ImplToExpandedFst(new Impl(fst, order_out)) {} + + // Because the NGramFstImpl is a const stateless data structure, there + // is never a need to do anything beside copy the reference. + NGramFst(const NGramFst &fst, bool safe = false) + : ImplToExpandedFst(fst, false) {} + + NGramFst() : ImplToExpandedFst(new Impl()) {} + + // Non-standard constructor to initialize NGramFst directly from data. + NGramFst(const char* data, bool owned) : ImplToExpandedFst(new Impl()) { + GetImpl()->Init(data, owned); + } + + // Get method that gets the data associated with Init(). + const char* GetData(size_t* data_size) const { + return GetImpl()->GetData(data_size); + } + + virtual size_t NumArcs(StateId s) const { + return GetImpl()->NumArcs(s, &inst_); + } + + virtual NGramFst* Copy(bool safe = false) const { + return new NGramFst(*this, safe); + } + + static NGramFst* Read(istream &strm, const FstReadOptions &opts) { + Impl* impl = Impl::Read(strm, opts); + return impl ? new NGramFst(impl) : 0; + } + + static NGramFst* Read(const string &filename) { + if (!filename.empty()) { + ifstream strm(filename.c_str(), ifstream::in | ifstream::binary); + if (!strm) { + LOG(ERROR) << "NGramFst::Read: Can't open file: " << filename; + return 0; + } + return Read(strm, FstReadOptions(filename)); + } else { + return Read(cin, FstReadOptions("standard input")); + } + } + + virtual bool Write(ostream &strm, const FstWriteOptions &opts) const { + return GetImpl()->Write(strm, opts); + } + + virtual bool Write(const string &filename) const { + return Fst::WriteFile(filename); + } + + virtual inline void InitStateIterator(StateIteratorData* data) const { + GetImpl()->InitStateIterator(data); + } + + virtual inline void InitArcIterator( + StateId s, ArcIteratorData* data) const; + + virtual MatcherBase* InitMatcher(MatchType match_type) const { + return new NGramFstMatcher(*this, match_type); + } + + private: + explicit NGramFst(Impl* impl) : ImplToExpandedFst(impl) {} + + Impl* GetImpl() const { + return + ImplToExpandedFst >::GetImpl(); + } + + void SetImpl(Impl* impl, bool own_impl = true) { + ImplToExpandedFst >::SetImpl(impl, own_impl); + } + + mutable NGramFstInst inst_; +}; + +template inline void +NGramFst::InitArcIterator(StateId s, ArcIteratorData* data) const { + GetImpl()->SetInstFuture(s, &inst_); + GetImpl()->SetInstNode(&inst_); + data->base = new ArcIterator >(*this, s); +} + +/*****************************************************************************/ +template +class NGramFstMatcher : public MatcherBase { + public: + typedef A Arc; + typedef typename A::Label Label; + typedef typename A::StateId StateId; + typedef typename A::Weight Weight; + + NGramFstMatcher(const NGramFst &fst, MatchType match_type) + : fst_(fst), inst_(fst.inst_), match_type_(match_type), + current_loop_(false), + loop_(kNoLabel, 0, A::Weight::One(), kNoStateId) { + if (match_type_ == MATCH_OUTPUT) { + swap(loop_.ilabel, loop_.olabel); + } + } + + NGramFstMatcher(const NGramFstMatcher &matcher, bool safe = false) + : fst_(matcher.fst_), inst_(matcher.inst_), + match_type_(matcher.match_type_), current_loop_(false), + loop_(kNoLabel, 0, A::Weight::One(), kNoStateId) { + if (match_type_ == MATCH_OUTPUT) { + swap(loop_.ilabel, loop_.olabel); + } + } + + virtual NGramFstMatcher* Copy(bool safe = false) const { + return new NGramFstMatcher(*this, safe); + } + + virtual MatchType Type(bool test) const { + return match_type_; + } + + virtual const Fst &GetFst() const { + return fst_; + } + + virtual uint64 Properties(uint64 props) const { + return props; + } + + private: + virtual void SetState_(StateId s) { + fst_.GetImpl()->SetInstFuture(s, &inst_); + current_loop_ = false; + } + + virtual bool Find_(Label label) { + const Label nolabel = kNoLabel; + done_ = true; + if (label == 0 || label == nolabel) { + if (label == 0) { + current_loop_ = true; + loop_.nextstate = inst_.state_; + } + // The unigram state has no epsilon arc. + if (inst_.state_ != 0) { + arc_.ilabel = arc_.olabel = 0; + fst_.GetImpl()->SetInstNode(&inst_); + arc_.nextstate = fst_.GetImpl()->context_index_.Rank1( + fst_.GetImpl()->context_index_.Select1( + fst_.GetImpl()->context_index_.Rank0(inst_.node_) - 1)); + arc_.weight = fst_.GetImpl()->backoff_[inst_.state_]; + done_ = false; + } + } else { + const Label *start = fst_.GetImpl()->future_words_ + inst_.offset_; + const Label *end = start + inst_.num_futures_; + const Label* search = lower_bound(start, end, label); + if (search != end && *search == label) { + size_t state = search - start; + arc_.ilabel = arc_.olabel = label; + arc_.weight = fst_.GetImpl()->future_probs_[inst_.offset_ + state]; + fst_.GetImpl()->SetInstContext(&inst_); + arc_.nextstate = fst_.GetImpl()->Transition(inst_.context_, label); + done_ = false; + } + } + return !Done_(); + } + + virtual bool Done_() const { + return !current_loop_ && done_; + } + + virtual const Arc& Value_() const { + return (current_loop_) ? loop_ : arc_; + } + + virtual void Next_() { + if (current_loop_) { + current_loop_ = false; + } else { + done_ = true; + } + } + + const NGramFst& fst_; + NGramFstInst inst_; + MatchType match_type_; // Supplied by caller + bool done_; + Arc arc_; + bool current_loop_; // Current arc is the implicit loop + Arc loop_; +}; + +/*****************************************************************************/ +template +class ArcIterator > : public ArcIteratorBase { + public: + typedef A Arc; + typedef typename A::Label Label; + typedef typename A::StateId StateId; + typedef typename A::Weight Weight; + + ArcIterator(const NGramFst &fst, StateId state) + : lazy_(~0), impl_(fst.GetImpl()), i_(0), flags_(kArcValueFlags) { + inst_ = fst.inst_; + impl_->SetInstFuture(state, &inst_); + impl_->SetInstNode(&inst_); + } + + bool Done() const { + return i_ >= ((inst_.node_ == 0) ? inst_.num_futures_ : + inst_.num_futures_ + 1); + } + + const Arc &Value() const { + bool eps = (inst_.node_ != 0 && i_ == 0); + StateId state = (inst_.node_ == 0) ? i_ : i_ - 1; + if (flags_ & lazy_ & (kArcILabelValue | kArcOLabelValue)) { + arc_.ilabel = + arc_.olabel = eps ? 0 : impl_->future_words_[inst_.offset_ + state]; + lazy_ &= ~(kArcILabelValue | kArcOLabelValue); + } + if (flags_ & lazy_ & kArcNextStateValue) { + if (eps) { + arc_.nextstate = impl_->context_index_.Rank1( + impl_->context_index_.Select1( + impl_->context_index_.Rank0(inst_.node_) - 1)); + } else { + if (lazy_ & kArcNextStateValue) { + impl_->SetInstContext(&inst_); // first time only. + } + arc_.nextstate = + impl_->Transition(inst_.context_, + impl_->future_words_[inst_.offset_ + state]); + } + lazy_ &= ~kArcNextStateValue; + } + if (flags_ & lazy_ & kArcWeightValue) { + arc_.weight = eps ? impl_->backoff_[inst_.state_] : + impl_->future_probs_[inst_.offset_ + state]; + lazy_ &= ~kArcWeightValue; + } + return arc_; + } + + void Next() { + ++i_; + lazy_ = ~0; + } + + size_t Position() const { return i_; } + + void Reset() { + i_ = 0; + lazy_ = ~0; + } + + void Seek(size_t a) { + if (i_ != a) { + i_ = a; + lazy_ = ~0; + } + } + + uint32 Flags() const { + return flags_; + } + + void SetFlags(uint32 f, uint32 m) { + flags_ &= ~m; + flags_ |= (f & kArcValueFlags); + } + + private: + virtual bool Done_() const { return Done(); } + virtual const Arc& Value_() const { return Value(); } + virtual void Next_() { Next(); } + virtual size_t Position_() const { return Position(); } + virtual void Reset_() { Reset(); } + virtual void Seek_(size_t a) { Seek(a); } + uint32 Flags_() const { return Flags(); } + void SetFlags_(uint32 f, uint32 m) { SetFlags(f, m); } + + mutable Arc arc_; + mutable uint32 lazy_; + const NGramFstImpl *impl_; + mutable NGramFstInst inst_; + + size_t i_; + uint32 flags_; + + DISALLOW_COPY_AND_ASSIGN(ArcIterator); +}; + +/*****************************************************************************/ +// Specialization for NGramFst; see generic version in fst.h +// for sample usage (but use the ProdLmFst type!). This version +// should inline. +template +class StateIterator > : public StateIteratorBase { + public: + typedef typename A::StateId StateId; + + explicit StateIterator(const NGramFst &fst) + : s_(0), num_states_(fst.NumStates()) { } + + bool Done() const { return s_ >= num_states_; } + StateId Value() const { return s_; } + void Next() { ++s_; } + void Reset() { s_ = 0; } + + private: + virtual bool Done_() const { return Done(); } + virtual StateId Value_() const { return Value(); } + virtual void Next_() { Next(); } + virtual void Reset_() { Reset(); } + + StateId s_, num_states_; + + DISALLOW_COPY_AND_ASSIGN(StateIterator); +}; +} // namespace fst +#endif // FST_EXTENSIONS_NGRAM_NGRAM_FST_H_ diff --git a/src/include/fst/extensions/ngram/nthbit.h b/src/include/fst/extensions/ngram/nthbit.h new file mode 100644 index 0000000..d4a9a5a --- /dev/null +++ b/src/include/fst/extensions/ngram/nthbit.h @@ -0,0 +1,46 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: sorenj@google.com (Jeffrey Sorensen) +// dr@google.com (Doug Rohde) + +#ifndef FST_EXTENSIONS_NGRAM_NTHBIT_H_ +#define FST_EXTENSIONS_NGRAM_NTHBIT_H_ + +#include + +extern uint32 nth_bit_bit_offset[]; + +inline uint32 nth_bit(uint64 v, uint32 r) { + uint32 shift = 0; + uint32 c = __builtin_popcount(v & 0xffffffff); + uint32 mask = -(r > c); + r -= c & mask; + shift += (32 & mask); + + c = __builtin_popcount((v >> shift) & 0xffff); + mask = -(r > c); + r -= c & mask; + shift += (16 & mask); + + c = __builtin_popcount((v >> shift) & 0xff); + mask = -(r > c); + r -= c & mask; + shift += (8 & mask); + + return shift + ((nth_bit_bit_offset[(v >> shift) & 0xff] >> + ((r - 1) << 2)) & 0xf); +} + +#endif // FST_EXTENSIONS_NGRAM_NTHBIT_H_ diff --git a/src/include/fst/extensions/pdt/collection.h b/src/include/fst/extensions/pdt/collection.h index 26be504..24a443f 100644 --- a/src/include/fst/extensions/pdt/collection.h +++ b/src/include/fst/extensions/pdt/collection.h @@ -16,7 +16,7 @@ // Author: riley@google.com (Michael Riley) // // \file -// Class to store a collection of sets with elements of type T. +// Class to store a collection of ordered (multi-)sets with elements of type T. #ifndef FST_EXTENSIONS_PDT_COLLECTION_H__ #define FST_EXTENSIONS_PDT_COLLECTION_H__ @@ -29,11 +29,11 @@ using std::vector; namespace fst { -// Stores a collection of non-empty sets with elements of type T. A -// default constructor, equality ==, a total order <, and an STL-style -// hash class must be defined on the elements. Provides signed -// integer ID (of type I) of each unique set. The IDs are allocated -// starting from 0 in order. +// Stores a collection of non-empty, ordered (multi-)sets with elements +// of type T. A default constructor, equality ==, and an STL-style +// hash class must be defined on the elements. Provides signed integer +// ID (of type I) of each unique set. The IDs are allocated starting +// from 0 in order. template class Collection { public: @@ -80,31 +80,34 @@ class Collection { Collection() {} - // Lookups integer ID from set. If it doesn't exist, then adds it. - // Set elements should be in strict order (and therefore unique). - I FindId(const vector &set) { + // Lookups integer ID from ordered multi-set. If it doesn't exist + // and 'insert' is true, then adds it. Otherwise returns -1. + I FindId(const vector &set, bool insert = true) { I node_id = kNoNodeId; for (ssize_t i = set.size() - 1; i >= 0; --i) { Node node(node_id, set[i]); - node_id = node_table_.FindId(node); + node_id = node_table_.FindId(node, insert); + if (node_id == -1) break; } return node_id; } - // Finds set given integer ID. Returns true if ID corresponds - // to set. Use iterators below to traverse result. + // Finds ordered (multi-)set given integer ID. Returns set iterator + // to traverse result. SetIterator FindSet(I id) { - if (id < 0 && id >= node_table_.Size()) { + if (id < 0 || id >= node_table_.Size()) { return SetIterator(kNoNodeId, Node(kNoNodeId, T()), &node_table_); } else { return SetIterator(id, node_table_.FindEntry(id), &node_table_); } } + I Size() const { return node_table_.Size(); } + private: static const I kNoNodeId; static const size_t kPrime; - static std::tr1::hash hash_; + static std::hash hash_; NodeTable node_table_; @@ -115,7 +118,7 @@ template const I Collection::kNoNodeId = -1; template const size_t Collection::kPrime = 7853; -template std::tr1::hash Collection::hash_; +template std::hash Collection::hash_; } // namespace fst diff --git a/src/include/fst/extensions/pdt/info.h b/src/include/fst/extensions/pdt/info.h index ef9a860..55e76c4 100644 --- a/src/include/fst/extensions/pdt/info.h +++ b/src/include/fst/extensions/pdt/info.h @@ -24,7 +24,7 @@ #include using std::tr1::unordered_map; using std::tr1::unordered_multimap; -#include +#include using std::tr1::unordered_set; using std::tr1::unordered_multiset; #include diff --git a/src/include/fst/extensions/pdt/paren.h b/src/include/fst/extensions/pdt/paren.h index 7b9887f..a9d30c5 100644 --- a/src/include/fst/extensions/pdt/paren.h +++ b/src/include/fst/extensions/pdt/paren.h @@ -26,7 +26,7 @@ #include using std::tr1::unordered_map; using std::tr1::unordered_multimap; -#include +#include using std::tr1::unordered_set; using std::tr1::unordered_multiset; #include @@ -144,7 +144,8 @@ class PdtParenReachable { const vector > &parens, bool close) : fst_(fst), parens_(parens), - close_(close) { + close_(close), + error_(false) { for (Label i = 0; i < parens.size(); ++i) { const pair &p = parens[i]; paren_id_map_[p.first] = i; @@ -155,12 +156,18 @@ class PdtParenReachable { StateId start = fst.Start(); if (start == kNoStateId) return; - DFSearch(start, start); + if (!DFSearch(start)) { + FSTERROR() << "PdtReachable: Underlying cyclicity not supported"; + error_ = true; + } } else { FSTERROR() << "PdtParenReachable: open paren info not implemented"; + error_ = true; } } + bool const Error() { return error_; } + // Given a state ID, returns an iterator over paren IDs // for close (open) parens reachable from that state along balanced // paths. @@ -194,7 +201,7 @@ class PdtParenReachable { private: // DFS that gathers paren and state set information. // Bool returns false when cycle detected. - bool DFSearch(StateId s, StateId start); + bool DFSearch(StateId s); // Unions state sets together gathered by the DFS. void ComputeStateSet(StateId s); @@ -212,12 +219,13 @@ class PdtParenReachable { vector state_color_; // DFS state mutable Collection state_sets_; // Reachable states -> ID StateSetMap set_map_; // ID -> Reachable states + bool error_; DISALLOW_COPY_AND_ASSIGN(PdtParenReachable); }; // DFS that gathers paren and state set information. template -bool PdtParenReachable::DFSearch(StateId s, StateId start) { +bool PdtParenReachable::DFSearch(StateId s) { if (s >= state_color_.size()) state_color_.resize(s + 1, kDfsWhite); @@ -239,7 +247,8 @@ bool PdtParenReachable::DFSearch(StateId s, StateId start) { if (pit != paren_id_map_.end()) { // paren? Label paren_id = pit->second; if (arc.ilabel == parens_[paren_id].first) { // open paren - DFSearch(arc.nextstate, arc.nextstate); + if (!DFSearch(arc.nextstate)) + return false; for (SetIterator set_iter = FindStates(paren_id, arc.nextstate); !set_iter.Done(); set_iter.Next()) { for (ParenArcIterator paren_arc_iter = @@ -247,15 +256,14 @@ bool PdtParenReachable::DFSearch(StateId s, StateId start) { !paren_arc_iter.Done(); paren_arc_iter.Next()) { const A &cparc = paren_arc_iter.Value(); - DFSearch(cparc.nextstate, start); + if (!DFSearch(cparc.nextstate)) + return false; } } } } else { // non-paren - if(!DFSearch(arc.nextstate, start)) { - FSTERROR() << "PdtReachable: Underlying cyclicity not supported"; - return true; - } + if(!DFSearch(arc.nextstate)) + return false; } } ComputeStateSet(s); diff --git a/src/include/fst/extensions/pdt/shortest-path.h b/src/include/fst/extensions/pdt/shortest-path.h index e90471b..85f94b8 100644 --- a/src/include/fst/extensions/pdt/shortest-path.h +++ b/src/include/fst/extensions/pdt/shortest-path.h @@ -28,7 +28,7 @@ #include using std::tr1::unordered_map; using std::tr1::unordered_multimap; -#include +#include using std::tr1::unordered_set; using std::tr1::unordered_multiset; #include @@ -387,7 +387,6 @@ class PdtShortestPath { typedef typename SpData::SearchState SearchState; typedef typename SpData::ParenSpec ParenSpec; - typedef typename PdtParenReachable::SetIterator StateSetIterator; typedef typename PdtBalanceData::SetIterator CloseSourceIterator; PdtShortestPath(const Fst &ifst, @@ -403,7 +402,7 @@ class PdtShortestPath { if ((Weight::Properties() & (kPath | kRightSemiring)) != (kPath | kRightSemiring)) { - FSTERROR() << "SingleShortestPath: Weight needs to have the path" + FSTERROR() << "PdtShortestPath: Weight needs to have the path" << " property and be right distributive: " << Weight::Type(); error_ = true; } @@ -440,6 +439,7 @@ class PdtShortestPath { static const Arc kNoArc; static const uint8 kEnqueued; static const uint8 kExpanded; + static const uint8 kFinished; const uint8 kFinal; public: @@ -543,6 +543,7 @@ void PdtShortestPath::GetDistance(StateId start) { ProcArcs(s); sp_data_.SetFlags(s, kExpanded, kExpanded); } + sp_data_.SetFlags(q, kFinished, kFinished); balance_data_.FinishInsert(start); sp_data_.GC(start); } @@ -607,7 +608,11 @@ void PdtShortestPath::ProcOpenParen( Queue *state_queue = state_queue_; GetDistance(d.start); state_queue_ = state_queue; + } else if (!(sp_data_.Flags(d) & kFinished)) { + FSTERROR() << "PdtShortestPath: open parenthesis recursion: not bounded stack"; + error_ = true; } + for (CloseSourceIterator set_iter = balance_data_.Find(paren_id, arc.nextstate); !set_iter.Done(); set_iter.Next()) { @@ -764,6 +769,9 @@ const uint8 PdtShortestPath::kEnqueued = 0x10; template const uint8 PdtShortestPath::kExpanded = 0x20; +template +const uint8 PdtShortestPath::kFinished = 0x40; + template void ShortestPath(const Fst &ifst, const vector #include +#include +#include #include #include @@ -42,7 +44,7 @@ using std::string; // // DECLARE_int32(length); // -// SetFlags() can be used to set flags from the command line +// SET_FLAGS() can be used to set flags from the command line // using, for example, '--length=2'. // // ShowUsage() can be used to print out command and flag usage. @@ -56,12 +58,18 @@ using std::string; template struct FlagDescription { - FlagDescription(T *addr, const char *doc, const char *type, const T val) - : address(addr), doc_string(doc), type_name(type), default_value(val) {} + FlagDescription(T *addr, const char *doc, const char *type, + const char *file, const T val) + : address(addr), + doc_string(doc), + type_name(type), + file_name(file), + default_value(val) {} T *address; const char *doc_string; const char *type_name; + const char *file_name; const T default_value; }; @@ -118,8 +126,7 @@ class FlagRegister { } bool SetFlag(const string &arg, const string &val) const { - for (typename std::map< string, - FlagDescription >::const_iterator it = + for (typename std::map< string, FlagDescription >::const_iterator it = flag_table_.begin(); it != flag_table_.end(); ++it) { @@ -131,19 +138,7 @@ class FlagRegister { return false; } - void ShowDefault(bool default_value) const { - std::cout << ", default = "; - std::cout << (default_value ? "true" : "false"); - } - void ShowDefault(const string &default_value) const { - std::cout << ", default = "; - std::cout << "\"" << default_value << "\""; - } - template void ShowDefault(const V& default_value) const { - std::cout << ", default = "; - std::cout << default_value; - } - void ShowUsage() const { + void GetUsage(std::set< std::pair > *usage_set) const { for (typename std::map< string, FlagDescription >::const_iterator it = flag_table_.begin(); @@ -151,10 +146,13 @@ class FlagRegister { ++it) { const string &name = it->first; const FlagDescription &desc = it->second; - std::cout << " --" << name - << ": type = " << desc.type_name; - ShowDefault(desc.default_value); - std::cout << "\n " << desc.doc_string << "\n"; + string usage = " --" + name; + usage += ": type = "; + usage += desc.type_name; + usage += ", default = "; + usage += GetDefault(desc.default_value) + "\n "; + usage += desc.doc_string; + usage_set->insert(make_pair(desc.file_name, usage)); } } @@ -163,11 +161,26 @@ class FlagRegister { register_lock_ = new fst::Mutex; register_ = new FlagRegister; } + + std::map< string, FlagDescription > flag_table_; + + string GetDefault(bool default_value) const { + return default_value ? "true" : "false"; + } + + string GetDefault(const string &default_value) const { + return "\"" + default_value + "\""; + } + + template string GetDefault(const V& default_value) const { + std::ostringstream strm; + strm << default_value; + return strm.str(); + } + static fst::FstOnceType register_init_; // ensures only called once static fst::Mutex* register_lock_; // multithreading lock static FlagRegister *register_; - - std::map< string, FlagDescription > flag_table_; }; template @@ -199,6 +212,7 @@ class FlagRegisterer { name ## _flags_registerer(#name, FlagDescription(&FLAGS_ ## name, \ doc, \ #type, \ + __FILE__, \ value)) #define DEFINE_bool(name, value, doc) DEFINE_VAR(bool, name, value, doc) @@ -212,13 +226,17 @@ class FlagRegisterer { // Temporary directory DECLARE_string(tmpdir); -void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags); +void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags, + const char *src = ""); + +#define SET_FLAGS(usage, argc, argv, rmflags) \ +SetFlags(usage, argc, argv, rmflags, __FILE__) // Deprecated - for backward compatibility inline void InitFst(const char *usage, int *argc, char ***argv, bool rmflags) { return SetFlags(usage, argc, argv, rmflags); } -void ShowUsage(); +void ShowUsage(bool long_usage = true); #endif // FST_LIB_FLAGS_H__ diff --git a/src/include/fst/float-weight.h b/src/include/fst/float-weight.h index 530cbdd..eb22638 100644 --- a/src/include/fst/float-weight.h +++ b/src/include/fst/float-weight.h @@ -37,19 +37,22 @@ namespace fst { template class FloatLimits { public: - static const T kPosInfinity; - static const T kNegInfinity; - static const T kNumberBad; -}; + static const T PosInfinity() { + static const T pos_infinity = numeric_limits::infinity(); + return pos_infinity; + } -template -const T FloatLimits::kPosInfinity = numeric_limits::infinity(); + static const T NegInfinity() { + static const T neg_infinity = -PosInfinity(); + return neg_infinity; + } -template -const T FloatLimits::kNegInfinity = -FloatLimits::kPosInfinity; + static const T NumberBad() { + static const T number_bad = numeric_limits::quiet_NaN(); + return number_bad; + } -template -const T FloatLimits::kNumberBad = numeric_limits::quiet_NaN(); +}; // weight class to be templated on floating-points types template @@ -151,9 +154,9 @@ inline bool ApproxEqual(const FloatWeightTpl &w1, template inline ostream &operator<<(ostream &strm, const FloatWeightTpl &w) { - if (w.Value() == FloatLimits::kPosInfinity) + if (w.Value() == FloatLimits::PosInfinity()) return strm << "Infinity"; - else if (w.Value() == FloatLimits::kNegInfinity) + else if (w.Value() == FloatLimits::NegInfinity()) return strm << "-Infinity"; else if (w.Value() != w.Value()) // Fails for NaN return strm << "BadNumber"; @@ -166,9 +169,9 @@ inline istream &operator>>(istream &strm, FloatWeightTpl &w) { string s; strm >> s; if (s == "Infinity") { - w = FloatWeightTpl(FloatLimits::kPosInfinity); + w = FloatWeightTpl(FloatLimits::PosInfinity()); } else if (s == "-Infinity") { - w = FloatWeightTpl(FloatLimits::kNegInfinity); + w = FloatWeightTpl(FloatLimits::NegInfinity()); } else { char *p; T f = strtod(s.c_str(), &p); @@ -196,13 +199,13 @@ class TropicalWeightTpl : public FloatWeightTpl { TropicalWeightTpl(const TropicalWeightTpl &w) : FloatWeightTpl(w) {} static const TropicalWeightTpl Zero() { - return TropicalWeightTpl(FloatLimits::kPosInfinity); } + return TropicalWeightTpl(FloatLimits::PosInfinity()); } static const TropicalWeightTpl One() { return TropicalWeightTpl(0.0F); } static const TropicalWeightTpl NoWeight() { - return TropicalWeightTpl(FloatLimits::kNumberBad); } + return TropicalWeightTpl(FloatLimits::NumberBad()); } static const string &Type() { static const string type = "tropical" + @@ -212,12 +215,12 @@ class TropicalWeightTpl : public FloatWeightTpl { bool Member() const { // First part fails for IEEE NaN - return Value() == Value() && Value() != FloatLimits::kNegInfinity; + return Value() == Value() && Value() != FloatLimits::NegInfinity(); } TropicalWeightTpl Quantize(float delta = kDelta) const { - if (Value() == FloatLimits::kNegInfinity || - Value() == FloatLimits::kPosInfinity || + if (Value() == FloatLimits::NegInfinity() || + Value() == FloatLimits::PosInfinity() || Value() != Value()) return *this; else @@ -259,9 +262,9 @@ inline TropicalWeightTpl Times(const TropicalWeightTpl &w1, if (!w1.Member() || !w2.Member()) return TropicalWeightTpl::NoWeight(); T f1 = w1.Value(), f2 = w2.Value(); - if (f1 == FloatLimits::kPosInfinity) + if (f1 == FloatLimits::PosInfinity()) return w1; - else if (f2 == FloatLimits::kPosInfinity) + else if (f2 == FloatLimits::PosInfinity()) return w2; else return TropicalWeightTpl(f1 + f2); @@ -284,10 +287,10 @@ inline TropicalWeightTpl Divide(const TropicalWeightTpl &w1, if (!w1.Member() || !w2.Member()) return TropicalWeightTpl::NoWeight(); T f1 = w1.Value(), f2 = w2.Value(); - if (f2 == FloatLimits::kPosInfinity) - return FloatLimits::kNumberBad; - else if (f1 == FloatLimits::kPosInfinity) - return FloatLimits::kPosInfinity; + if (f2 == FloatLimits::PosInfinity()) + return FloatLimits::NumberBad(); + else if (f1 == FloatLimits::PosInfinity()) + return FloatLimits::PosInfinity(); else return TropicalWeightTpl(f1 - f2); } @@ -320,7 +323,7 @@ class LogWeightTpl : public FloatWeightTpl { LogWeightTpl(const LogWeightTpl &w) : FloatWeightTpl(w) {} static const LogWeightTpl Zero() { - return LogWeightTpl(FloatLimits::kPosInfinity); + return LogWeightTpl(FloatLimits::PosInfinity()); } static const LogWeightTpl One() { @@ -328,7 +331,7 @@ class LogWeightTpl : public FloatWeightTpl { } static const LogWeightTpl NoWeight() { - return LogWeightTpl(FloatLimits::kNumberBad); } + return LogWeightTpl(FloatLimits::NumberBad()); } static const string &Type() { static const string type = "log" + FloatWeightTpl::GetPrecisionString(); @@ -337,12 +340,12 @@ class LogWeightTpl : public FloatWeightTpl { bool Member() const { // First part fails for IEEE NaN - return Value() == Value() && Value() != FloatLimits::kNegInfinity; + return Value() == Value() && Value() != FloatLimits::NegInfinity(); } LogWeightTpl Quantize(float delta = kDelta) const { - if (Value() == FloatLimits::kNegInfinity || - Value() == FloatLimits::kPosInfinity || + if (Value() == FloatLimits::NegInfinity() || + Value() == FloatLimits::PosInfinity() || Value() != Value()) return *this; else @@ -368,9 +371,9 @@ template inline LogWeightTpl Plus(const LogWeightTpl &w1, const LogWeightTpl &w2) { T f1 = w1.Value(), f2 = w2.Value(); - if (f1 == FloatLimits::kPosInfinity) + if (f1 == FloatLimits::PosInfinity()) return w2; - else if (f2 == FloatLimits::kPosInfinity) + else if (f2 == FloatLimits::PosInfinity()) return w1; else if (f1 > f2) return LogWeightTpl(f2 - LogExp(f1 - f2)); @@ -394,9 +397,9 @@ inline LogWeightTpl Times(const LogWeightTpl &w1, if (!w1.Member() || !w2.Member()) return LogWeightTpl::NoWeight(); T f1 = w1.Value(), f2 = w2.Value(); - if (f1 == FloatLimits::kPosInfinity) + if (f1 == FloatLimits::PosInfinity()) return w1; - else if (f2 == FloatLimits::kPosInfinity) + else if (f2 == FloatLimits::PosInfinity()) return w2; else return LogWeightTpl(f1 + f2); @@ -419,10 +422,10 @@ inline LogWeightTpl Divide(const LogWeightTpl &w1, if (!w1.Member() || !w2.Member()) return LogWeightTpl::NoWeight(); T f1 = w1.Value(), f2 = w2.Value(); - if (f2 == FloatLimits::kPosInfinity) - return FloatLimits::kNumberBad; - else if (f1 == FloatLimits::kPosInfinity) - return FloatLimits::kPosInfinity; + if (f2 == FloatLimits::PosInfinity()) + return FloatLimits::NumberBad(); + else if (f1 == FloatLimits::PosInfinity()) + return FloatLimits::PosInfinity(); else return LogWeightTpl(f1 - f2); } @@ -454,15 +457,15 @@ class MinMaxWeightTpl : public FloatWeightTpl { MinMaxWeightTpl(const MinMaxWeightTpl &w) : FloatWeightTpl(w) {} static const MinMaxWeightTpl Zero() { - return MinMaxWeightTpl(FloatLimits::kPosInfinity); + return MinMaxWeightTpl(FloatLimits::PosInfinity()); } static const MinMaxWeightTpl One() { - return MinMaxWeightTpl(FloatLimits::kNegInfinity); + return MinMaxWeightTpl(FloatLimits::NegInfinity()); } static const MinMaxWeightTpl NoWeight() { - return MinMaxWeightTpl(FloatLimits::kNumberBad); } + return MinMaxWeightTpl(FloatLimits::NumberBad()); } static const string &Type() { static const string type = "minmax" + @@ -477,8 +480,8 @@ class MinMaxWeightTpl : public FloatWeightTpl { MinMaxWeightTpl Quantize(float delta = kDelta) const { // If one of infinities, or a NaN - if (Value() == FloatLimits::kNegInfinity || - Value() == FloatLimits::kPosInfinity || + if (Value() == FloatLimits::NegInfinity() || + Value() == FloatLimits::PosInfinity() || Value() != Value()) return *this; else @@ -541,7 +544,7 @@ inline MinMaxWeightTpl Divide(const MinMaxWeightTpl &w1, if (!w1.Member() || !w2.Member()) return MinMaxWeightTpl::NoWeight(); // min(w1, x) = w2, w1 >= w2 => min(w1, x) = w2, x = w2 - return w1.Value() >= w2.Value() ? w1 : FloatLimits::kNumberBad; + return w1.Value() >= w2.Value() ? w1 : FloatLimits::NumberBad(); } inline MinMaxWeightTpl Divide(const MinMaxWeightTpl &w1, diff --git a/src/include/fst/fst.h b/src/include/fst/fst.h index 9c4d0db..dd11e4f 100644 --- a/src/include/fst/fst.h +++ b/src/include/fst/fst.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -229,7 +230,7 @@ class Fst { } return Read(strm, FstReadOptions(filename)); } else { - return Read(std::cin, FstReadOptions("standard input")); + return Read(cin, FstReadOptions("standard input")); } } @@ -267,7 +268,6 @@ class Fst { virtual MatcherBase *InitMatcher(MatchType match_type) const; protected: - bool WriteFile(const string &filename) const { if (!filename.empty()) { ofstream strm(filename.c_str(), ofstream::out | ofstream::binary); @@ -277,7 +277,7 @@ class Fst { } return Write(strm, FstWriteOptions(filename)); } else { - return Write(std::cout, FstWriteOptions("standard output")); + return Write(cout, FstWriteOptions("standard output")); } } }; @@ -706,12 +706,13 @@ template class FstImpl { // This method is used in the cross-type serialization methods Fst::WriteFst. static void WriteFstHeader(const Fst &fst, ostream &strm, const FstWriteOptions& opts, int version, - const string &type, FstHeader *hdr) { + const string &type, uint64 properties, + FstHeader *hdr) { if (opts.write_header) { hdr->SetFstType(type); hdr->SetArcType(A::Type()); hdr->SetVersion(version); - hdr->SetProperties(fst.Properties(kFstProperties, false)); + hdr->SetProperties(properties); int32 file_flags = 0; if (fst.InputSymbols() && opts.write_isymbols) file_flags |= FstHeader::HAS_ISYMBOLS; @@ -737,14 +738,14 @@ template class FstImpl { // returns true on success, false on failure. static bool UpdateFstHeader(const Fst &fst, ostream &strm, const FstWriteOptions& opts, int version, - const string &type, FstHeader *hdr, - size_t header_offset) { + const string &type, uint64 properties, + FstHeader *hdr, size_t header_offset) { strm.seekp(header_offset); if (!strm) { LOG(ERROR) << "Fst::UpdateFstHeader: write failed: " << opts.source; return false; } - WriteFstHeader(fst, strm, opts, version, type, hdr); + WriteFstHeader(fst, strm, opts, version, type, properties, hdr); if (!strm) { LOG(ERROR) << "Fst::UpdateFstHeader: write failed: " << opts.source; return false; diff --git a/src/include/fst/fstlib.h b/src/include/fst/fstlib.h index c05c775..de5976d 100644 --- a/src/include/fst/fstlib.h +++ b/src/include/fst/fstlib.h @@ -142,6 +142,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/src/include/fst/icu.h b/src/include/fst/icu.h index 6b74c2e..3947716 100644 --- a/src/include/fst/icu.h +++ b/src/include/fst/icu.h @@ -13,88 +13,101 @@ // limitations under the License. // // Copyright 2005-2010 Google, Inc. -// Author: roubert@google.com (Fredrik Roubert) - -// Wrapper class for UErrorCode, with conversion operators for direct use in -// ICU C and C++ APIs. -// -// Features: -// - The constructor initializes the internal UErrorCode to U_ZERO_ERROR, -// removing one common source of errors. -// - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking -// UErrorCode& (reference), via conversion operators. -// - Automatic checking for success when it goes out of scope. On failure, -// the destructor will FSTERROR() an error message. -// -// Most of ICU will handle errors gracefully and provide sensible fallbacks. -// Using IcuErrorCode, it is therefore possible to write very compact code -// that does sensible things on failure and provides logging for debugging. +// Author: sorenj@google.com (Jeffrey Sorensen) +// roubert@google.com (Fredrik Roubert) // -// Example: -// -// IcuErrorCode icuerrorcode; -// return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL; +// This library implements an unrestricted Thompson/Pike UTF-8 parser and +// serializer. UTF-8 is a restricted subset of this byte stream encoding. See +// http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding +// details. #ifndef FST_LIB_ICU_H_ #define FST_LIB_ICU_H_ -#include -#include -#include -#include - -class IcuErrorCode : public icu::ErrorCode { - public: - IcuErrorCode() {} - virtual ~IcuErrorCode() { if (isFailure()) handleFailure(); } - - // Redefine 'errorName()' in order to be compatible with ICU version 4.2 - const char* errorName() const { - return u_errorName(errorCode); - } - - protected: - virtual void handleFailure() const { - FSTERROR() << errorName(); -} - - private: - DISALLOW_COPY_AND_ASSIGN(IcuErrorCode); -}; +#include +#include +#include namespace fst { template bool UTF8StringToLabels(const string &str, vector { } return Read(strm, FstReadOptions(filename)); } else { - return Read(std::cin, FstReadOptions("standard input")); + return Read(cin, FstReadOptions("standard input")); } } else { // Converts to 'convert_type' if not mutable. Fst *ifst = Fst::Read(filename); diff --git a/src/include/fst/queue.h b/src/include/fst/queue.h index 707dffc..e31f087 100644 --- a/src/include/fst/queue.h +++ b/src/include/fst/queue.h @@ -451,7 +451,7 @@ class SccQueue : public QueueBase { while ((front_ <= back_) && (((*queue_)[front_] && (*queue_)[front_]->Empty()) || (((*queue_)[front_] == 0) && - ((front_ > trivial_queue_.size()) + ((front_ >= trivial_queue_.size()) || (trivial_queue_[front_] == kNoStateId))))) ++front_; if ((*queue_)[front_]) @@ -493,7 +493,7 @@ class SccQueue : public QueueBase { else if ((*queue_)[front_]) return (*queue_)[front_]->Empty(); else - return (front_ > trivial_queue_.size()) + return (front_ >= trivial_queue_.size()) || (trivial_queue_[front_] == kNoStateId); } diff --git a/src/include/fst/register.h b/src/include/fst/register.h index 55651cd..ea3f4d8 100644 --- a/src/include/fst/register.h +++ b/src/include/fst/register.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/src/include/fst/rmepsilon.h b/src/include/fst/rmepsilon.h index ee9753e..32e64de 100644 --- a/src/include/fst/rmepsilon.h +++ b/src/include/fst/rmepsilon.h @@ -110,7 +110,6 @@ class RmEpsilonState { class ElementKey { public: size_t operator()(const Element& e) const { - return static_cast(e.nextstate); return static_cast(e.nextstate + e.ilabel * kPrime0 + e.olabel * kPrime1); diff --git a/src/include/fst/script/compile-impl.h b/src/include/fst/script/compile-impl.h index 4aab15b..1743452 100644 --- a/src/include/fst/script/compile-impl.h +++ b/src/include/fst/script/compile-impl.h @@ -31,6 +31,7 @@ using std::vector; #include #include +#include #include #include #include diff --git a/src/include/fst/script/draw-impl.h b/src/include/fst/script/draw-impl.h index e346649..893e258 100644 --- a/src/include/fst/script/draw-impl.h +++ b/src/include/fst/script/draw-impl.h @@ -139,9 +139,9 @@ template class FstDrawer { EscapeChars(symbol, &nsymbol); PrintString(nsymbol); } else { - ostringstream sid; - sid << id; - PrintString(sid.str()); + string idstr; + Int64ToStr(id, &idstr); + PrintString(idstr); } } diff --git a/src/include/fst/script/draw.h b/src/include/fst/script/draw.h index 1611ad1..2b66373 100644 --- a/src/include/fst/script/draw.h +++ b/src/include/fst/script/draw.h @@ -22,6 +22,7 @@ #include #include #include +#include namespace fst { namespace script { diff --git a/src/include/fst/script/fst-class.h b/src/include/fst/script/fst-class.h index 3eacab4..a820c1c 100644 --- a/src/include/fst/script/fst-class.h +++ b/src/include/fst/script/fst-class.h @@ -24,6 +24,7 @@ #include #include #include +#include // Classes to support "boxing" all existing types of FST arcs in a single // FstClass which hides the arc types. This allows clients to load @@ -52,6 +53,7 @@ class FstClassBase { virtual const SymbolTable *InputSymbols() const = 0; virtual const SymbolTable *OutputSymbols() const = 0; virtual void Write(const string& fname) const = 0; + virtual void Write(ostream &ostr, const FstWriteOptions &opts) const = 0; virtual uint64 Properties(uint64 mask, bool test) const = 0; virtual ~FstClassBase() { } }; @@ -114,12 +116,18 @@ class FstClassImpl : public FstClassImplBase { impl_->Write(fname); } + virtual void Write(ostream &ostr, const FstWriteOptions &opts) const { + impl_->Write(ostr, opts); + } + virtual uint64 Properties(uint64 mask, bool test) const { return impl_->Properties(mask, test); } virtual ~FstClassImpl() { delete impl_; } + Fst *GetImpl() const { return impl_; } + Fst *GetImpl() { return impl_; } virtual FstClassImpl *Copy() { @@ -154,13 +162,25 @@ class FstClass : public FstClassBase { } } + FstClass() : impl_(NULL) { + } + template - explicit FstClass(Fst *fst) : impl_(new FstClassImpl(fst)) { } + explicit FstClass(Fst *fst) : impl_(new FstClassImpl(fst)) { + } explicit FstClass(const FstClass &other) : impl_(other.impl_->Copy()) { } + FstClass &operator=(const FstClass &other) { + delete impl_; + impl_ = other.impl_->Copy(); + return *this; + } + static FstClass *Read(const string &fname); + static FstClass *Read(istream &istr, const string &source); + virtual const string &ArcType() const { return impl_->ArcType(); } @@ -185,6 +205,10 @@ class FstClass : public FstClassBase { impl_->Write(fname); } + virtual void Write(ostream &ostr, const FstWriteOptions &opts) const { + impl_->Write(ostr, opts); + } + virtual uint64 Properties(uint64 mask, bool test) const { return impl_->Properties(mask, test); } @@ -214,6 +238,8 @@ class FstClass : public FstClassBase { << "particular arc type."; return 0; } + + protected: explicit FstClass(FstClassImplBase *impl) : impl_(impl) { } @@ -233,7 +259,12 @@ class FstClass : public FstClassBase { } } + FstClassImplBase *GetImpl() const { return impl_; } + FstClassImplBase *GetImpl() { return impl_; } + +// friend ostream &operator<<(ostream&, const FstClass&); + private: FstClassImplBase *impl_; }; @@ -269,6 +300,14 @@ class MutableFstClass : public FstClass { } } + virtual void Write(const string &fname) const { + GetImpl()->Write(fname); + } + + virtual void Write(ostream &ostr, const FstWriteOptions &opts) const { + GetImpl()->Write(ostr, opts); + } + static MutableFstClass *Read(const string &fname, bool convert = false); virtual void SetInputSymbols(SymbolTable *is) { @@ -338,6 +377,4 @@ class VectorFstClass : public MutableFstClass { } // namespace script } // namespace fst - - #endif // FST_SCRIPT_FST_CLASS_H_ diff --git a/src/include/fst/script/text-io.h b/src/include/fst/script/text-io.h index 95cc182..d97a007 100644 --- a/src/include/fst/script/text-io.h +++ b/src/include/fst/script/text-io.h @@ -32,6 +32,7 @@ using std::vector; #include #include +#include #include namespace fst { diff --git a/src/include/fst/script/weight-class.h b/src/include/fst/script/weight-class.h index 5a4890f..228216d 100644 --- a/src/include/fst/script/weight-class.h +++ b/src/include/fst/script/weight-class.h @@ -56,9 +56,9 @@ struct WeightClassImpl : public WeightImplBase { } virtual string to_string() const { - ostringstream s; - s << weight; - return s.str(); + string str; + WeightToStr(weight, &str); + return str; } virtual bool operator == (const WeightImplBase &other) const { diff --git a/src/include/fst/shortest-distance.h b/src/include/fst/shortest-distance.h index 5d38409..9320c4c 100644 --- a/src/include/fst/shortest-distance.h +++ b/src/include/fst/shortest-distance.h @@ -178,7 +178,7 @@ void ShortestDistanceState::ShortestDistance( !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); - if (!arc_filter_(arc) || arc.weight == Weight::Zero()) + if (!arc_filter_(arc)) continue; while (distance_->size() <= arc.nextstate) { distance_->push_back(Weight::Zero()); diff --git a/src/include/fst/signed-log-weight.h b/src/include/fst/signed-log-weight.h index da96479..61adefb 100644 --- a/src/include/fst/signed-log-weight.h +++ b/src/include/fst/signed-log-weight.h @@ -113,9 +113,9 @@ inline SignedLogWeightTpl Plus(const SignedLogWeightTpl &w1, bool s2 = w2.Value1().Value() > 0.0; T f1 = w1.Value2().Value(); T f2 = w2.Value2().Value(); - if (f1 == FloatLimits::kPosInfinity) + if (f1 == FloatLimits::PosInfinity()) return w2; - else if (f2 == FloatLimits::kPosInfinity) + else if (f2 == FloatLimits::PosInfinity()) return w1; else if (f1 == f2) { if (s1 == s2) @@ -173,12 +173,12 @@ inline SignedLogWeightTpl Divide(const SignedLogWeightTpl &w1, bool s2 = w2.Value1().Value() > 0.0; T f1 = w1.Value2().Value(); T f2 = w2.Value2().Value(); - if (f2 == FloatLimits::kPosInfinity) + if (f2 == FloatLimits::PosInfinity()) return SignedLogWeightTpl(TropicalWeight(1.0), - FloatLimits::kNumberBad); - else if (f1 == FloatLimits::kPosInfinity) + FloatLimits::NumberBad()); + else if (f1 == FloatLimits::PosInfinity()) return SignedLogWeightTpl(TropicalWeight(1.0), - FloatLimits::kPosInfinity); + FloatLimits::PosInfinity()); else if (s1 == s2) return SignedLogWeightTpl(TropicalWeight(1.0), (f1 - f2)); else diff --git a/src/include/fst/slist.h b/src/include/fst/slist.h index 9f94027..d061ebe 100644 --- a/src/include/fst/slist.h +++ b/src/include/fst/slist.h @@ -22,7 +22,7 @@ #include -#if !defined(__ANDROID__) && defined(HAVE___GNU_CXX__SLIST_INT_) +#if !defined(__ANDROID__) && HAVE___GNU_CXX__SLIST_INT_ #include diff --git a/src/include/fst/state-map.h b/src/include/fst/state-map.h index ace4a3c..454db24 100644 --- a/src/include/fst/state-map.h +++ b/src/include/fst/state-map.h @@ -295,6 +295,10 @@ class StateMapFstImpl : public CacheImpl { SetArcs(s); } + const Fst &GetFst() const { + return *fst_; + } + private: void Init() { SetType("statemap"); @@ -364,10 +368,10 @@ class StateMapFst : public ImplToFst< StateMapFstImpl > { GetImpl()->InitArcIterator(s, data); } - private: - // Makes visible to friends. + protected: Impl *GetImpl() const { return ImplToFst::GetImpl(); } + private: void operator=(const StateMapFst &fst); // disallow }; diff --git a/src/include/fst/string.h b/src/include/fst/string.h index 3099b87..d51182e 100644 --- a/src/include/fst/string.h +++ b/src/include/fst/string.h @@ -24,6 +24,7 @@ #define FST_LIB_STRING_H_ #include +#include #include DECLARE_string(fst_field_separator); @@ -48,7 +49,7 @@ class StringCompiler { // Compile string 's' into FST 'fst'. template - bool operator()(const string &s, F *fst) { + bool operator()(const string &s, F *fst) const { vector > { SetProperties(DeleteArcsProperties(Properties())); } - private: // Properties always true of this Fst class static const uint64 kStaticProperties = kExpanded | kMutable; + + private: // Current file format version static const int kFileVersion = 2; // Minimum file format version supported @@ -542,7 +543,10 @@ bool VectorFst::WriteFst(const F &fst, ostream &strm, hdr.SetNumStates(CountStates(fst)); update_header = false; } - FstImpl::WriteFstHeader(fst, strm, opts, kFileVersion, "vector", &hdr); + uint64 properties = fst.Properties(kCopyProperties, false) | + VectorFstImpl::kStaticProperties; + FstImpl::WriteFstHeader(fst, strm, opts, kFileVersion, "vector", + properties, &hdr); StateId num_states = 0; for (StateIterator siter(fst); !siter.Done(); siter.Next()) { typename A::StateId s = siter.Value(); @@ -566,7 +570,7 @@ bool VectorFst::WriteFst(const F &fst, ostream &strm, if (update_header) { hdr.SetNumStates(num_states); return FstImpl::UpdateFstHeader(fst, strm, opts, kFileVersion, "vector", - &hdr, start_offset); + properties, &hdr, start_offset); } else { if (num_states != hdr.NumStates()) { LOG(ERROR) << "Inconsistent number of states observed during write"; diff --git a/src/include/fst/visit.h b/src/include/fst/visit.h index 31a00a8..a02d86a 100644 --- a/src/include/fst/visit.h +++ b/src/include/fst/visit.h @@ -166,7 +166,8 @@ void Visit(const Fst &fst, V *visitor, Q *queue, ArcFilter filter) { // Finds next tree root for (root = root == start ? 0 : root + 1; root < nstates && state_status[root] != kWhiteState; - ++root); + ++root) { + } // Check for a state beyond the largest known state if (!expanded && root == nstates) { diff --git a/src/include/fst/weight.h b/src/include/fst/weight.h index 72f5a22..7eb4bb1 100644 --- a/src/include/fst/weight.h +++ b/src/include/fst/weight.h @@ -28,16 +28,18 @@ // A left semiring distributes on the left; a right semiring is // similarly defined. // -// A Weight class is required to be (at least) a left or right semiring. +// A Weight class must have binary functions =Plus= and =Times= and +// static member functions =Zero()= and =One()= and these must form +// (at least) a left or right semiring. // // In addition, the following should be defined for a Weight: // Member: predicate on set membership. -// NoWeight: returns an element that is not a member, should only be -// used to signal an error. -// >>: reads weight. -// <<: prints weight. -// Read(istream &strm): reads from an input stream. -// Write(ostream &strm): writes to an output stream. +// NoWeight: static member function that returns an element that is +// not a set member; used to signal an error. +// >>: reads textual representation of a weight. +// <<: prints textual representation of a weight. +// Read(istream &strm): reads binary representation of a weight. +// Write(ostream &strm): writes binary representation of a weight. // Hash: maps weight to size_t. // ApproxEqual: approximate equality (for inexact weights) // Quantize: quantizes wrt delta (for inexact weights) @@ -46,11 +48,9 @@ // and Times(a, b') == c // --> a' = Divide(c, b, DIVIDE_RIGHT) if a right semiring, a'.Member() // and Times(a', b) == c -// --> b' = Divide(c, a) -// = Divide(c, a, DIVIDE_ANY) -// = Divide(c, a, DIVIDE_LEFT) -// = Divide(c, a, DIVIDE_RIGHT) if a commutative semiring, -// b'.Member() and Times(a, b') == Times(b', a) == c +// --> b' = Divide(c, a) = Divide(c, a, DIVIDE_ANY) = +// Divide(c, a, DIVIDE_LEFT) = Divide(c, a, DIVIDE_RIGHT) if a +// commutative semiring, b'.Member() and Times(a, b') = Times(b', a) = c // ReverseWeight: the type of the corresponding reverse weight. // Typically the same type as Weight for a (both left and right) semiring. // For the left string semiring, it is the right string semiring. @@ -66,7 +66,7 @@ // RightSemiring: indicates weights form a right semiring. // Commutative: for all a,b: Times(a,b) == Times(b,a) // Idempotent: for all a: Plus(a, a) == a. -// Path Property: for all a, b: Plus(a, b) == a or Plus(a, b) == b. +// Path: for all a, b: Plus(a, b) == a or Plus(a, b) == b. #ifndef FST_LIB_WEIGHT_H__ diff --git a/src/lib/flags.cc b/src/lib/flags.cc index d05fd73..222d66b 100644 --- a/src/lib/flags.cc +++ b/src/lib/flags.cc @@ -24,14 +24,18 @@ DEFINE_int32(v, 0, "verbose level"); DEFINE_bool(help, false, "show usage information"); +DEFINE_bool(helpshort, false, "show brief usage information"); DEFINE_string(tmpdir, "/tmp/", "temporary directory"); using namespace std; static string flag_usage; +static string prog_src; -void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags) { +void SetFlags(const char *usage, int *argc, char ***argv, + bool remove_flags, const char *src) { flag_usage = usage; + prog_src = src; int index = 1; for (; index < *argc; ++index) { string argval = (*argv)[index]; @@ -82,22 +86,75 @@ void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags) { } if (FLAGS_help) { - ShowUsage(); + ShowUsage(true); + exit(1); + } + + if (FLAGS_helpshort) { + ShowUsage(false); exit(1); } } -void ShowUsage() { +// If flag is defined in file 'src' and 'in_src' true or is not +// defined in file 'src' and 'in_src' is false, then print usage. +static void +ShowUsageRestrict(const std::set< pair > &usage_set, + const string &src, bool in_src, bool show_file) { + string old_file; + bool file_out = false; + bool usage_out = false; + for (std::set< pair >::const_iterator it = + usage_set.begin(); + it != usage_set.end(); + ++it) { + const string &file = it->first; + const string &usage = it->second; + + bool match = file == src; + if ((match && !in_src) || (!match && in_src)) + continue; + + if (file != old_file) { + if (show_file) { + if (file_out) cout << "\n"; + cout << "Flags from: " << file << "\n"; + file_out = true; + } + old_file = file; + } + cout << usage << "\n"; + usage_out = true; + } + if (usage_out) cout << "\n"; +} + +void ShowUsage(bool long_usage) { + std::set< pair > usage_set; + cout << flag_usage << "\n"; - cout << " Flags Description:\n"; + FlagRegister *bool_register = FlagRegister::GetRegister(); - bool_register->ShowUsage(); + bool_register->GetUsage(&usage_set); FlagRegister *string_register = FlagRegister::GetRegister(); - string_register->ShowUsage(); + string_register->GetUsage(&usage_set); FlagRegister *int32_register = FlagRegister::GetRegister(); - int32_register->ShowUsage(); + int32_register->GetUsage(&usage_set); FlagRegister *int64_register = FlagRegister::GetRegister(); - int64_register->ShowUsage(); + int64_register->GetUsage(&usage_set); FlagRegister *double_register = FlagRegister::GetRegister(); - double_register->ShowUsage(); + double_register->GetUsage(&usage_set); + + if (!prog_src.empty()) { + cout << "PROGRAM FLAGS:\n\n"; + ShowUsageRestrict(usage_set, prog_src, true, false); + } + + if (!long_usage) + return; + + if (!prog_src.empty()) + cout << "LIBRARY FLAGS:\n\n"; + + ShowUsageRestrict(usage_set, prog_src, false, true); } diff --git a/src/lib/properties.cc b/src/lib/properties.cc index db0e2c8..9e03d59 100644 --- a/src/lib/properties.cc +++ b/src/lib/properties.cc @@ -117,8 +117,10 @@ uint64 DeterminizeProperties(uint64 inprops, bool has_subsequential_label) { uint64 outprops = kAccessible; if (((kAcceptor | kNoIEpsilons) & inprops) || has_subsequential_label) outprops |= kIDeterministic; - outprops |= (kError | kAcceptor | kNoEpsilons | kAcyclic | + outprops |= (kError | kAcceptor | kAcyclic | kInitialAcyclic | kCoAccessible | kString) & inprops; + if (inprops & kNoIEpsilons) + outprops |= kNoEpsilons & inprops; if (inprops & kAccessible) outprops |= (kNotAcceptor | kEpsilons | kIEpsilons | kOEpsilons | kCyclic) & inprops; @@ -375,6 +377,7 @@ uint64 UnionProperties(uint64 inprops1, uint64 inprops2, bool delayed) { uint64 outprops = (kAcceptor | kUnweighted | kAcyclic | kAccessible) & inprops1 & inprops2; outprops |= kError & (inprops1 | inprops2); + outprops |= kInitialAcyclic; bool empty1 = delayed; // Can fst1 be the empty machine? bool empty2 = delayed; // Can fst2 be the empty machine? diff --git a/src/lib/symbol-table.cc b/src/lib/symbol-table.cc index 8b35cdf..a195a7c 100644 --- a/src/lib/symbol-table.cc +++ b/src/lib/symbol-table.cc @@ -20,6 +20,7 @@ // Classes to provide symbol-to-integer and integer-to-symbol mappings. #include + #include DEFINE_bool(fst_compat_symbols, true, @@ -35,9 +36,12 @@ const int kLineLen = 8096; // Identifies stream data as a symbol table (and its endianity) static const int32 kSymbolTableMagicNumber = 2125658996; +SymbolTableTextOptions::SymbolTableTextOptions() + : allow_negative(false), fst_field_separator(FLAGS_fst_field_separator) { } + SymbolTableImpl* SymbolTableImpl::ReadText(istream &strm, const string &filename, - bool allow_negative) { + const SymbolTableTextOptions &opts) { SymbolTableImpl* impl = new SymbolTableImpl(filename); int64 nline = 0; @@ -45,7 +49,7 @@ SymbolTableImpl* SymbolTableImpl::ReadText(istream &strm, while (strm.getline(line, kLineLen)) { ++nline; vector col; - string separator = FLAGS_fst_field_separator + "\n"; + string separator = opts.fst_field_separator + "\n"; SplitToVector(line, separator.c_str(), &col, true); if (col.size() == 0) // empty line continue; @@ -61,7 +65,7 @@ SymbolTableImpl* SymbolTableImpl::ReadText(istream &strm, char *p; int64 key = strtoll(value, &p, 10); if (p < value + strlen(value) || - (!allow_negative && key < 0) || key == -1) { + (!opts.allow_negative && key < 0) || key == -1) { LOG(ERROR) << "SymbolTable::ReadText: Bad non-negative integer \"" << value << "\" (skipping), " << "file = " << filename << ", line = " << nline; @@ -74,21 +78,30 @@ SymbolTableImpl* SymbolTableImpl::ReadText(istream &strm, } void SymbolTableImpl::MaybeRecomputeCheckSum() const { - if (check_sum_finalized_) - return; + { + ReaderMutexLock check_sum_lock(&check_sum_mutex_); + if (check_sum_finalized_) + return; + } + + // We'll aquire an exclusive lock to recompute the checksums. + MutexLock check_sum_lock(&check_sum_mutex_); + if (check_sum_finalized_) // Another thread (coming in around the same time + return; // might have done it already). So we recheck. // Calculate the original label-agnostic check sum. - check_sum_.Reset(); + CheckSummer check_sum; for (int64 i = 0; i < symbols_.size(); ++i) - check_sum_.Update(symbols_[i], strlen(symbols_[i]) + 1); - check_sum_string_ = check_sum_.Digest(); + check_sum.Update(symbols_[i], strlen(symbols_[i]) + 1); + check_sum_string_ = check_sum.Digest(); // Calculate the safer, label-dependent check sum. - labeled_check_sum_.Reset(); + CheckSummer labeled_check_sum; for (int64 key = 0; key < dense_key_limit_; ++key) { ostringstream line; line << symbols_[key] << '\t' << key; - labeled_check_sum_.Update(line.str()); } + labeled_check_sum.Update(line.str().data(), line.str().size()); + } for (map::const_iterator it = key_map_.begin(); it != key_map_.end(); @@ -96,10 +109,10 @@ void SymbolTableImpl::MaybeRecomputeCheckSum() const { if (it->first >= dense_key_limit_) { ostringstream line; line << it->second << '\t' << it->first; - labeled_check_sum_.Update(line.str()); + labeled_check_sum.Update(line.str().data(), line.str().size()); } } - labeled_check_sum_string_ = labeled_check_sum_.Digest(); + labeled_check_sum_string_ = labeled_check_sum.Digest(); check_sum_finalized_ = true; } @@ -231,12 +244,22 @@ void SymbolTable::AddTable(const SymbolTable& table) { impl_->AddSymbol(iter.Symbol()); } -bool SymbolTable::WriteText(ostream &strm) const { +bool SymbolTable::WriteText(ostream &strm, + const SymbolTableTextOptions &opts) const { + if (opts.fst_field_separator.empty()) { + LOG(ERROR) << "Missing required field separator"; + return false; + } + bool once_only = false; for (SymbolTableIterator iter(*this); !iter.Done(); iter.Next()) { ostringstream line; - line << iter.Symbol() << FLAGS_fst_field_separator[0] << iter.Value() + if (iter.Value() < 0 && !opts.allow_negative && !once_only) { + LOG(WARNING) << "Negative symbol table entry when not allowed"; + once_only = true; + } + line << iter.Symbol() << opts.fst_field_separator[0] << iter.Value() << '\n'; - strm.write(line.str().c_str(), line.str().length()); + strm.write(line.str().data(), line.str().length()); } return true; } diff --git a/src/lib/temp_Android.temp_mk b/src/lib/temp_Android.temp_mk deleted file mode 100644 index 7a6936e..0000000 --- a/src/lib/temp_Android.temp_mk +++ /dev/null @@ -1,20 +0,0 @@ -# -# Copyright 2012 Google Inc. All Rights Reserved. -# Author: npereira@google.com (Nicole Pereira) -# -# Android makefile for openfst library. -# - -OPENFST_DIR := $(call my-dir) - -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) - -LOCAL_MODULE := libopenfst_patts - -LOCAL_CPP_EXTENSION := .cc -LOCAL_C_INCLUDES += $(OPENFST_DIR)/src/include/ - -#LOCAL_SRC_FILES := $(call private-function-all-cpp-files-under, src) - -include $(BUILD_STATIC_LIBRARY) diff --git a/src/lib/util.cc b/src/lib/util.cc index eeba92b..f754da5 100644 --- a/src/lib/util.cc +++ b/src/lib/util.cc @@ -26,7 +26,7 @@ DEFINE_bool(fst_error_fatal, true, "FST errors are fatal; o.w. return objects flagged as bad: " - " e.g., FSTs - kError prop. true, FST weights - not a member()"); + " e.g., FSTs - kError prop. true, FST weights - not a Member()"); namespace fst { @@ -49,7 +49,7 @@ int64 StrToInt64(const string &s, const string &src, size_t nline, void Int64ToStr(int64 n, string *s) { ostringstream nstr; nstr << n; - *s = nstr.str(); + s->append(nstr.str().data(), nstr.str().size()); } void ConvertToLegalCSymbol(string *s) { diff --git a/src/script/fst-class.cc b/src/script/fst-class.cc index a784b08..bf5d963 100644 --- a/src/script/fst-class.cc +++ b/src/script/fst-class.cc @@ -74,10 +74,14 @@ FstClass *FstClass::Read(const string &fname) { ifstream in(fname.c_str(), ifstream::in | ifstream::binary); return ReadFst(in, fname); } else { - return ReadFst(std::cin, "standard input"); + return ReadFst(cin, "standard input"); } } +FstClass *FstClass::Read(istream &istr, const string &source) { + return ReadFst(istr, source); +} + // // MUTABLE FST CLASS METHODS // @@ -88,7 +92,7 @@ MutableFstClass *MutableFstClass::Read(const string &fname, bool convert) { ifstream in(fname.c_str(), ifstream::in | ifstream::binary); return ReadFst(in, fname); } else { - return ReadFst(std::cin, "standard input"); + return ReadFst(cin, "standard input"); } } else { // Converts to VectorFstClass if not mutable. FstClass *ifst = FstClass::Read(fname); @@ -133,7 +137,7 @@ VectorFstClass *VectorFstClass::Read(const string &fname) { ifstream in(fname.c_str(), ifstream::in | ifstream::binary); return ReadFst(in, fname); } else { - return ReadFst(std::cin, "standard input"); + return ReadFst(cin, "standard input"); } } diff --git a/src/script/text-io.cc b/src/script/text-io.cc index 2f5efab..65b4eed 100644 --- a/src/script/text-io.cc +++ b/src/script/text-io.cc @@ -67,7 +67,7 @@ bool ReadPotentials(const string &weight_type, // Writes vector of weights; returns true on success. bool WritePotentials(const string& filename, const vector& potential) { - ostream *strm = &std::cout; + ostream *strm = &cout; if (!filename.empty()) { strm = new ofstream(filename.c_str()); if (!*strm) { @@ -85,7 +85,7 @@ bool WritePotentials(const string& filename, LOG(ERROR) << "WritePotentials: Write failed: " << (filename.empty() ? "standard output" : filename); bool ret = *strm; - if (strm != &std::cout) + if (strm != &cout) delete strm; return ret; } diff --git a/src/test/algo_test.cc b/src/test/algo_test.cc index 9538d54..27e3e95 100644 --- a/src/test/algo_test.cc +++ b/src/test/algo_test.cc @@ -66,7 +66,7 @@ using fst::AlgoTester; int main(int argc, char **argv) { FLAGS_fst_verify_properties = true; std::set_new_handler(FailedNewHandler); - SetFlags(argv[0], &argc, &argv, true); + SET_FLAGS(argv[0], &argc, &argv, true); static const int kCacheGcLimit = 20; diff --git a/src/test/fst_test.cc b/src/test/fst_test.cc index 7d8d600..47899ba 100644 --- a/src/test/fst_test.cc +++ b/src/test/fst_test.cc @@ -123,7 +123,7 @@ using fst::EditFst; int main(int argc, char **argv) { FLAGS_fst_verify_properties = true; std::set_new_handler(FailedNewHandler); - SetFlags(argv[0], &argc, &argv, true); + SET_FLAGS(argv[0], &argc, &argv, true); // VectorFst tests { diff --git a/src/test/fst_test.h b/src/test/fst_test.h index da16476..8f9dca0 100644 --- a/src/test/fst_test.h +++ b/src/test/fst_test.h @@ -65,14 +65,14 @@ class FstTester { StateIterator siter(fst); Matcher matcher(fst, MATCH_INPUT); MatchType match_type = matcher.Type(true); - for (; !siter.Done(); siter.Next()); + for (; !siter.Done(); siter.Next()) {} for (siter.Reset(); !siter.Done(); siter.Next()) { StateId s = siter.Value(); matcher.SetState(s); CHECK_EQ(fst.Final(s), NthWeight(s)); size_t na = 0; ArcIterator aiter(fst, s); - for (; !aiter.Done(); aiter.Next()); + for (; !aiter.Done(); aiter.Next()) {} for (aiter.Reset(); !aiter.Done(); aiter.Next()) { ++na; const Arc &arc = aiter.Value(); @@ -129,7 +129,7 @@ class FstTester { size_t na = 0; size_t ni = fst->NumInputEpsilons(s); MutableArcIterator aiter(fst, s); - for (; !aiter.Done(); aiter.Next()); + for (; !aiter.Done(); aiter.Next()) {} for (aiter.Reset(); !aiter.Done(); aiter.Next()) { ++na; Arc arc = aiter.Value(); diff --git a/src/test/weight_test.cc b/src/test/weight_test.cc index 54ba85d..3554456 100644 --- a/src/test/weight_test.cc +++ b/src/test/weight_test.cc @@ -97,7 +97,7 @@ void TestTemplatedWeights(int repeat, int seed) { int main(int argc, char **argv) { std::set_new_handler(FailedNewHandler); - SetFlags(argv[0], &argc, &argv, true); + SET_FLAGS(argv[0], &argc, &argv, true); int seed = FLAGS_seed >= 0 ? FLAGS_seed : time(0); LOG(INFO) << "Seed = " << seed; -- cgit v1.2.3