path: root/fcp/tensorflow/dictionary_ops.py
diff options
Diffstat (limited to 'fcp/tensorflow/dictionary_ops.py')
1 files changed, 372 insertions, 0 deletions
diff --git a/fcp/tensorflow/dictionary_ops.py b/fcp/tensorflow/dictionary_ops.py
new file mode 100644
index 0000000..b168087
--- /dev/null
+++ b/fcp/tensorflow/dictionary_ops.py
@@ -0,0 +1,372 @@
+# Copyright 2022 Google LLC
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python and TensorFlow functions to work with dictionaries.
+Please see fcp/dictionary/dictionary.h for more on this type of dictionary.
+Python Classes:
+* `Dictionary`: A Python analogue to fcp/dictionary/dictionary.h
+ that includes additional helpers for dictionary construction.
+TensorFlow ops:
+* dictionary_size
+ Queries the size of a dictionary.
+* dictionary_lookup
+ Looks up ids for string tokens in the dictionary.
+* dictionary_reverse_lookup
+ Looks up string tokens from ids in the dictionary.
+Canonical use (note that the dictionary is known at graph construction time):
+ dictionary = Dictionary.from_tokens(
+ tokens=['some', 'token', 'list'], unk_id=0,
+ vocabulary_type=VocabularyType.TOKEN_INDEX)
+ with tf.Graph().as_default():
+ tokens = tf.compat.v1.placeholder(tf.string, ...) # Tokens to look up.
+ ids = dictionary_lookup(
+ tokens, dictionary.dictionary_description_proto)
+"""
+import collections
+import enum
+import tensorflow as tf
+from fcp.dictionary.dictionary_pb2 import DictionaryDescription # pylint: disable=g-importing-member
+from fcp.tensorflow.gen_dictionary_ops import dictionary_lookup
+from fcp.tensorflow.gen_dictionary_ops import dictionary_reverse_lookup
+from fcp.tensorflow.gen_dictionary_ops import dictionary_size
# Load the compiled op library that ships next to this module. The path is
# resolved relative to this file by the resource loader, and the returned
# module handle is kept alive in `_dictionary_ops`.
# NOTE(review): presumably this load is what registers the kernels behind the
# `gen_dictionary_ops` wrappers imported above -- confirm.
_dictionary_ops = tf.load_op_library(
    tf.compat.v1.resource_loader.get_path_to_datafile('./_dictionary_ops.so'))
def ignore_ids_mask(token_ids, ignore_ids, name=None):
  """Creates a bool mask with True everywhere token_ids is not in ignore_ids.

  Args:
    token_ids: A `Tensor` of ids to test.
    ignore_ids: The ids to mask out. Each element of `token_ids` is compared
      against all of them via broadcasting over a new trailing axis.
    name: Optional name scope for these ops.

  Returns:
    A bool `Tensor` with the same static shape as `token_ids`; an entry is True
    only when the corresponding id differs from every id in `ignore_ids`.
  """
  # `tf.op_scope(values, name, default_name)` is a deprecated v1-only symbol;
  # `name_scope` is the same scope with a different argument order.
  with tf.compat.v1.name_scope(name, 'ignore_ids_mask',
                               [token_ids, ignore_ids]):
    # Add a trailing axis so every token id is compared against every ignored
    # id in one broadcasted op.
    all_check = tf.not_equal(tf.expand_dims(token_ids, -1), ignore_ids)
    # Collapse the trailing comparison axis: a token is kept only if it differs
    # from all ignored ids. `axis` replaces the deprecated `reduction_indices`.
    check = tf.reduce_all(all_check, axis=-1)
    check.set_shape(token_ids.get_shape())
    return check
def mask_and_replace_padding(token_ids,
                             lengths,
                             eos_id=None,
                             special_tokens=(),
                             name=None):
  """Creates a mask of valid tokens and sets padded values in id space.

  This creates a mask the same shape as token_ids with a boolean indicating
  if the id was a valid token (i.e. not padding or a special token). If
  provided, this also remaps tokens after lengths to the eos_id. Since the
  dictionary doesn't map tokens to eos or bos ids, it would generally be the
  unknown token id which is not correct if you need to predict the eos.

  Args:
    token_ids: A matrix `Tensor` of integer ids.
    lengths: A vector `Tensor` of lengths for each row in token_ids.
    eos_id: The end of sequence id, if provided then all token ids after length
      in a row will be replaced with `eos_id`.
    special_tokens: An iterable of special tokens for ids that are not
      considered valid.
    name: Name scope for these ops.

  Returns:
    token_ids: `token_ids` with all tokens after a row's length replaced with
      eos if provided.
    mask: A bool `Tensor` the same shape as `token_ids` indicating which tokens
      are valid.
  """
  # `tf.op_scope(values, name, default_name)` is a deprecated v1-only symbol;
  # `name_scope` is the same scope with a different argument order.
  with tf.compat.v1.name_scope(
      name, 'mask_and_replace_padding',
      [token_ids, lengths, eos_id, special_tokens]):
    # Column indices [0, num_columns) of the id matrix.
    ranges = tf.range(0, tf.gather(tf.shape(token_ids), 1))
    # Broadcast the column indices against the per-row lengths: a position is
    # selected when it falls before its row's length.
    selected = tf.less(ranges, tf.expand_dims(lengths, -1))

    if eos_id is not None:
      # Overwrite everything past each row's length with eos_id.
      token_ids = tf.where(
          selected, token_ids,
          tf.fill(
              tf.shape(token_ids), tf.constant(eos_id, dtype=token_ids.dtype)))

    if special_tokens:
      # Valid means: inside the row's length AND not one of the special ids.
      mask = tf.logical_and(
          ignore_ids_mask(token_ids, special_tokens), selected)
    else:
      mask = selected
    return token_ids, mask
class VocabularyType(enum.Enum):
  """Valid vocabulary types for Dictionary construction.

  TOKEN_INDEX: dictionary.dictionary_description contains an embedded map of
    string names stored in order with ids assigned starting from the lowest
    non-special id. Preserves order but is not compact.
  """
  # The member documented above must exist: it is the default
  # `vocabulary_type` of `Dictionary.from_tokens` and the only type it accepts.
  # NOTE(review): the numeric value is not visible in this excerpt; 3 is
  # assumed -- confirm against the canonical source.
  TOKEN_INDEX = 3
class Dictionary(object):
  """Utility for working with fcp/dictionary/ via TensorFlow.

  Wraps a `DictionaryDescription` proto and exposes Python-level lookup,
  reverse lookup and size queries. The queries run the dictionary ops in a
  private graph that is built on first use.
  """

  def __init__(
      self,
      dictionary_description
  ):
    """Creates a dictionary from a dictionary_description.

    Use static from_* constructor methods for building dictionaries from
    common data types.

    Args:
      dictionary_description: A `dictionary_pb2.DictionaryDescription`
        describing the dictionary.

    Raises:
      ValueError: An invalid dictionary description.
    """
    if not isinstance(dictionary_description, DictionaryDescription):
      raise ValueError('Expected a DictionaryDescription')
    if not dictionary_description.HasField('vocabulary'):
      raise ValueError('dictionary_description has no vocabulary')

    self._dictionary_description = dictionary_description

    # Lazily constructed fields for lookup; populated together by
    # `_get_lookup_graph` on first use.
    self._lookup_graph = None
    self._lookup_placeholder = None
    self._lookup_result = None
    self._reverse_lookup_placeholder = None
    self._reverse_lookup_result = None
    self._size_result = None

  @classmethod
  def from_tokens(
      cls,
      tokens,
      bos_id=None,
      eos_id=None,
      unk_id=None,
      output_blocklist_tokens=None,
      output_size=None,
      vocabulary_type=VocabularyType.TOKEN_INDEX
  ):
    """Creates a dictionary from a provided list of tokens.

    The id mappings to token ids depend on the vocabulary_type requested.

    NB: the special tokens must be the first ids [0, num-specials)

    Args:
      tokens: An unordered iterable of tokens for the dictionary.
      bos_id: Token id for start of sequence.
      eos_id: Token id for end of sequence.
      unk_id: Token id for unknown words.
      output_blocklist_tokens: A list of vocabulary tokens that should be
        filtered from predictions (e.g., punctuation, bad words etc.).
      output_size: If a positive integer, tokens with ids greater than this are
        automatically added to the output blocklist.
      vocabulary_type: `VocabularyType` to use, defaults to TOKEN_INDEX.

    Returns:
      A `Dictionary` instance.

    Raises:
      ValueError: If the special tokens don't have the lowest ids.
      ValueError: If there are duplicates in tokens.
    """
    dictionary_description = DictionaryDescription()

    # Special ids.
    special_ids = []
    if unk_id is not None:
      dictionary_description.special_ids.unk = unk_id
      special_ids.append(unk_id)
    if bos_id is not None:
      dictionary_description.special_ids.bos = bos_id
      special_ids.append(bos_id)
    if eos_id is not None:
      dictionary_description.special_ids.eos = eos_id
      special_ids.append(eos_id)
    # The provided special ids must be exactly {0, ..., k-1}; this also
    # rejects duplicates.
    if sorted(special_ids) != list(range(len(special_ids))):
      raise ValueError(
          'Special ids must be the first items of the dictionary starting at 0 '
          'or None. eos: %s; bos %s; unk: %s' % (eos_id, bos_id, unk_id))

    # Vocabulary.
    # The set is reused below for O(1) membership checks on the blocklist.
    token_set = set(tokens)
    if len(tokens) != len(token_set):
      raise ValueError('Duplicate tokens provided')
    for token in tokens:
      if not isinstance(token, (str, bytes)):
        raise ValueError('Bad type in tokens %s' % token)
    if vocabulary_type == VocabularyType.TOKEN_INDEX:
      for token in tokens:
        dictionary_description.vocabulary.index.token.append(token)
    else:
      raise AssertionError('Unsupported vocabulary_type: %s' % vocabulary_type)

    # Output blocklist.
    output_blocklist_tokens = list(output_blocklist_tokens or [])
    if output_size:
      assert output_size >= len(special_ids), (
          'Cannot blocklist special tokens via output_size.')
      assert isinstance(tokens, list)  # Make sure order preserving pre-slice.
      # Token i gets id i + len(special_ids), so slicing at this offset selects
      # the tokens whose ids are >= output_size.
      output_blocklist_tokens.extend(tokens[output_size - len(special_ids):])
    for token in output_blocklist_tokens:
      assert token in token_set, "Unexpected blocklist token: '%s'" % token
    # Resolve the blocklisted tokens to ids with the dictionary op itself, in a
    # throwaway graph.
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
      output_blocklist_ids = sess.run(
          dictionary_lookup(output_blocklist_tokens,
                            dictionary_description.SerializeToString()))
    dictionary_description.output_blocklist_ids.id.extend(
        sorted(output_blocklist_ids))
    assert (len(set(dictionary_description.output_blocklist_ids.id)) == len(
        output_blocklist_tokens)), 'blocklist contains dups or unks?'

    # Return completed dictionary.
    return cls(
        dictionary_description=dictionary_description)

  @classmethod
  def from_dictionary_description(cls,
                                  dictionary_description):
    """Returns a Dictionary from a DictionaryDescription."""
    return cls(
        dictionary_description=dictionary_description)

  def _get_lookup_graph(self):
    """Returns a graph to use for lookup, reverse lookup, and size queries."""
    if self._lookup_graph is None:
      self._lookup_graph = tf.Graph()
      serialized_description_proto = (
          self._dictionary_description.SerializeToString())
      with self._lookup_graph.as_default():
        self._lookup_placeholder = tf.compat.v1.placeholder(
            tf.string, shape=None)
        self._reverse_lookup_placeholder = tf.compat.v1.placeholder(
            tf.int64, shape=None)

        # Use Dictionary(Op) (without blob) variants.
        self._lookup_result = dictionary_lookup(
            self._lookup_placeholder,
            dictionary_description_proto=serialized_description_proto)
        self._reverse_lookup_result = dictionary_reverse_lookup(
            self._reverse_lookup_placeholder,
            dictionary_description_proto=serialized_description_proto)
        self._size_result = dictionary_size(
            dictionary_description_proto=serialized_description_proto)

    return self._lookup_graph

  def lookup(self, tokens):
    """Maps a list of tokens to a list of ids.

    Args:
      tokens: A list of tokens to lookup.

    Returns:
      A list of token ids of the same size.

    Raises:
      ValueError: If tokens is not a list.
    """
    if not isinstance(tokens, list):
      raise ValueError('lookup expected a list of tokens.')

    with tf.compat.v1.Session(graph=self._get_lookup_graph()) as sess:
      return sess.run(self._lookup_result, {
          self._lookup_placeholder: tokens
      }).tolist()

  def reverse_lookup(self, ids):
    """Maps a list of ids to tokens.

    Args:
      ids: A list of ids to map back to tokens.

    Returns:
      A list of tokens corresponding to those ids.

    Raises:
      ValueError: If ids is not a list.
    """
    if not isinstance(ids, list):
      raise ValueError('reverse_lookup expected a list of ids.')

    with tf.compat.v1.Session(graph=self._get_lookup_graph()) as sess:
      return list(
          sess.run(self._reverse_lookup_result,
                   {self._reverse_lookup_placeholder: ids}))

  @property
  def special_ids(self):
    """Returns a list of special token ids."""
    return [t for t in [self.unk_id, self.bos_id, self.eos_id] if t is not None]

  @property
  def eos_id(self):
    """The end-of-sequence id, or None if the stored value is negative."""
    eos_id = self._dictionary_description.special_ids.eos
    return eos_id if eos_id >= 0 else None

  @property
  def bos_id(self):
    """The start-of-sequence id, or None if the stored value is negative."""
    bos_id = self._dictionary_description.special_ids.bos
    return bos_id if bos_id >= 0 else None

  @property
  def unk_id(self):
    """The unknown-token id, or None if the stored value is negative."""
    unk_id = self._dictionary_description.special_ids.unk
    return unk_id if unk_id >= 0 else None

  @property
  def size(self):
    """The dictionary size as reported by the `dictionary_size` op."""
    with tf.compat.v1.Session(graph=self._get_lookup_graph()) as sess:
      return sess.run(self._size_result)

  @property
  def output_blocklist_ids(self):
    """A list copy of the blocklisted output ids stored in the description."""
    return list(self._dictionary_description.output_blocklist_ids.id)

  @property
  def output_blocklist_tokens(self):
    """The tokens corresponding to `output_blocklist_ids`."""
    return self.reverse_lookup(self.output_blocklist_ids)

  @property
  def tokens(self):
    """The tokens for every id in [len(special_ids), size)."""
    return self.reverse_lookup(list(range(len(self.special_ids), self.size)))

  @property
  def dictionary_description_proto(self):
    """Serialized proto containing self.dictionary_description."""
    return self.dictionary_description.SerializeToString()

  @property
  def dictionary_description(self):
    """Returns the `DictionaryDescription` proto describing this dictionary."""
    desc = self._dictionary_description
    return desc

  def __len__(self):
    """Returns `self.size`."""
    return self.size