diff options
Diffstat (limited to 'Lib/fontTools/otlLib/optimize/gpos.py')
-rw-r--r-- | Lib/fontTools/otlLib/optimize/gpos.py | 439 |
1 files changed, 439 insertions, 0 deletions
diff --git a/Lib/fontTools/otlLib/optimize/gpos.py b/Lib/fontTools/otlLib/optimize/gpos.py new file mode 100644 index 00000000..79873fad --- /dev/null +++ b/Lib/fontTools/otlLib/optimize/gpos.py @@ -0,0 +1,439 @@ +import logging +from collections import defaultdict, namedtuple +from functools import reduce +from itertools import chain +from math import log2 +from typing import DefaultDict, Dict, Iterable, List, Sequence, Tuple + +from fontTools.misc.intTools import bit_count, bit_indices +from fontTools.ttLib import TTFont +from fontTools.ttLib.tables import otBase, otTables + +# NOTE: activating this optimization via the environment variable is +# experimental and may not be supported once an alternative mechanism +# is in place. See: https://github.com/fonttools/fonttools/issues/2349 +GPOS_COMPACT_MODE_ENV_KEY = "FONTTOOLS_GPOS_COMPACT_MODE" +GPOS_COMPACT_MODE_DEFAULT = "0" + +log = logging.getLogger("fontTools.otlLib.optimize.gpos") + + +def compact(font: TTFont, mode: str) -> TTFont: + # Ideal plan: + # 1. Find lookups of Lookup Type 2: Pair Adjustment Positioning Subtable + # https://docs.microsoft.com/en-us/typography/opentype/spec/gpos#lookup-type-2-pair-adjustment-positioning-subtable + # 2. Extract glyph-glyph kerning and class-kerning from all present subtables + # 3. Regroup into different subtable arrangements + # 4. Put back into the lookup + # + # Actual implementation: + # 2. Only class kerning is optimized currently + # 3. If the input kerning is already in several subtables, the subtables + # are not grouped together first; instead each subtable is treated + # independently, so currently this step is: + # Split existing subtables into more smaller subtables + gpos = font["GPOS"] + for lookup in gpos.table.LookupList.Lookup: + if lookup.LookupType == 2: + compact_lookup(font, mode, lookup) + elif lookup.LookupType == 9 and lookup.SubTable[0].ExtensionLookupType == 2: + compact_ext_lookup(font, mode, lookup) + return font + + +def compact_lookup(font: TTFont, mode: str, lookup: otTables.Lookup) -> None: + new_subtables = compact_pair_pos(font, mode, lookup.SubTable) + lookup.SubTable = new_subtables + lookup.SubTableCount = len(new_subtables) + + +def compact_ext_lookup(font: TTFont, mode: str, lookup: otTables.Lookup) -> None: + new_subtables = compact_pair_pos( + font, mode, [ext_subtable.ExtSubTable for ext_subtable in lookup.SubTable] + ) + new_ext_subtables = [] + for subtable in new_subtables: + ext_subtable = otTables.ExtensionPos() + ext_subtable.Format = 1 + ext_subtable.ExtSubTable = subtable + new_ext_subtables.append(ext_subtable) + lookup.SubTable = new_ext_subtables + lookup.SubTableCount = len(new_ext_subtables) + + +def compact_pair_pos( + font: TTFont, mode: str, subtables: Sequence[otTables.PairPos] +) -> Sequence[otTables.PairPos]: + new_subtables = [] + for subtable in subtables: + if subtable.Format == 1: + # Not doing anything to Format 1 (yet?) + new_subtables.append(subtable) + elif subtable.Format == 2: + new_subtables.extend(compact_class_pairs(font, mode, subtable)) + return new_subtables + + +def compact_class_pairs( + font: TTFont, mode: str, subtable: otTables.PairPos +) -> List[otTables.PairPos]: + from fontTools.otlLib.builder import buildPairPosClassesSubtable + + subtables = [] + classes1: DefaultDict[int, List[str]] = defaultdict(list) + for g in subtable.Coverage.glyphs: + classes1[subtable.ClassDef1.classDefs.get(g, 0)].append(g) + classes2: DefaultDict[int, List[str]] = defaultdict(list) + for g, i in subtable.ClassDef2.classDefs.items(): + classes2[i].append(g) + all_pairs = {} + for i, class1 in enumerate(subtable.Class1Record): + for j, class2 in enumerate(class1.Class2Record): + if is_really_zero(class2): + continue + all_pairs[(tuple(sorted(classes1[i])), tuple(sorted(classes2[j])))] = ( + getattr(class2, "Value1", None), + getattr(class2, "Value2", None), + ) + + if len(mode) == 1 and mode in "123456789": + grouped_pairs = cluster_pairs_by_class2_coverage_custom_cost( + font, all_pairs, int(mode) + ) + for pairs in grouped_pairs: + subtables.append( + buildPairPosClassesSubtable(pairs, font.getReverseGlyphMap()) + ) + else: + raise ValueError(f"Bad {GPOS_COMPACT_MODE_ENV_KEY}={mode}") + return subtables + + +def is_really_zero(class2: otTables.Class2Record) -> bool: + v1 = getattr(class2, "Value1", None) + v2 = getattr(class2, "Value2", None) + return (v1 is None or v1.getEffectiveFormat() == 0) and ( + v2 is None or v2.getEffectiveFormat() == 0 + ) + + +Pairs = Dict[ + Tuple[Tuple[str, ...], Tuple[str, ...]], + Tuple[otBase.ValueRecord, otBase.ValueRecord], +] + +# Adapted from https://github.com/fonttools/fonttools/blob/f64f0b42f2d1163b2d85194e0979def539f5dca3/Lib/fontTools/ttLib/tables/otTables.py#L935-L958 +def _getClassRanges(glyphIDs: Iterable[int]): + glyphIDs = sorted(glyphIDs) + last = glyphIDs[0] + ranges = [[last]] + for glyphID in glyphIDs[1:]: + if glyphID != last + 1: + ranges[-1].append(last) + ranges.append([glyphID]) + last = glyphID + ranges[-1].append(last) + return ranges, glyphIDs[0], glyphIDs[-1] + + +# Adapted from https://github.com/fonttools/fonttools/blob/f64f0b42f2d1163b2d85194e0979def539f5dca3/Lib/fontTools/ttLib/tables/otTables.py#L960-L989 +def _classDef_bytes( + class_data: List[Tuple[List[Tuple[int, int]], int, int]], + class_ids: List[int], + coverage=False, +): + if not class_ids: + return 0 + first_ranges, min_glyph_id, max_glyph_id = class_data[class_ids[0]] + range_count = len(first_ranges) + for i in class_ids[1:]: + data = class_data[i] + range_count += len(data[0]) + min_glyph_id = min(min_glyph_id, data[1]) + max_glyph_id = max(max_glyph_id, data[2]) + glyphCount = max_glyph_id - min_glyph_id + 1 + # https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#class-definition-table-format-1 + format1_bytes = 6 + glyphCount * 2 + # https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#class-definition-table-format-2 + format2_bytes = 4 + range_count * 6 + return min(format1_bytes, format2_bytes) + + +ClusteringContext = namedtuple( + "ClusteringContext", + [ + "lines", + "all_class1", + "all_class1_data", + "all_class2_data", + "valueFormat1_bytes", + "valueFormat2_bytes", + ], +) + + +class Cluster: + # TODO(Python 3.7): Turn this into a dataclass + # ctx: ClusteringContext + # indices: int + # Caches + # TODO(Python 3.8): use functools.cached_property instead of the + # manually cached properties, and remove the cache fields listed below. + # _indices: Optional[List[int]] = None + # _column_indices: Optional[List[int]] = None + # _cost: Optional[int] = None + + __slots__ = "ctx", "indices_bitmask", "_indices", "_column_indices", "_cost" + + def __init__(self, ctx: ClusteringContext, indices_bitmask: int): + self.ctx = ctx + self.indices_bitmask = indices_bitmask + self._indices = None + self._column_indices = None + self._cost = None + + @property + def indices(self): + if self._indices is None: + self._indices = bit_indices(self.indices_bitmask) + return self._indices + + @property + def column_indices(self): + if self._column_indices is None: + # Indices of columns that have a 1 in at least 1 line + # => binary OR all the lines + bitmask = reduce(int.__or__, (self.ctx.lines[i] for i in self.indices)) + self._column_indices = bit_indices(bitmask) + return self._column_indices + + @property + def width(self): + # Add 1 because Class2=0 cannot be used but needs to be encoded. + return len(self.column_indices) + 1 + + @property + def cost(self): + if self._cost is None: + self._cost = ( + # 2 bytes to store the offset to this subtable in the Lookup table above + 2 + # Contents of the subtable + # From: https://docs.microsoft.com/en-us/typography/opentype/spec/gpos#pair-adjustment-positioning-format-2-class-pair-adjustment + # uint16 posFormat Format identifier: format = 2 + + 2 + # Offset16 coverageOffset Offset to Coverage table, from beginning of PairPos subtable. + + 2 + + self.coverage_bytes + # uint16 valueFormat1 ValueRecord definition — for the first glyph of the pair (may be zero). + + 2 + # uint16 valueFormat2 ValueRecord definition — for the second glyph of the pair (may be zero). + + 2 + # Offset16 classDef1Offset Offset to ClassDef table, from beginning of PairPos subtable — for the first glyph of the pair. + + 2 + + self.classDef1_bytes + # Offset16 classDef2Offset Offset to ClassDef table, from beginning of PairPos subtable — for the second glyph of the pair. + + 2 + + self.classDef2_bytes + # uint16 class1Count Number of classes in classDef1 table — includes Class 0. + + 2 + # uint16 class2Count Number of classes in classDef2 table — includes Class 0. + + 2 + # Class1Record class1Records[class1Count] Array of Class1 records, ordered by classes in classDef1. + + (self.ctx.valueFormat1_bytes + self.ctx.valueFormat2_bytes) + * len(self.indices) + * self.width + ) + return self._cost + + @property + def coverage_bytes(self): + format1_bytes = ( + # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-1 + # uint16 coverageFormat Format identifier — format = 1 + # uint16 glyphCount Number of glyphs in the glyph array + 4 + # uint16 glyphArray[glyphCount] Array of glyph IDs — in numerical order + + sum(len(self.ctx.all_class1[i]) for i in self.indices) * 2 + ) + ranges = sorted( + chain.from_iterable(self.ctx.all_class1_data[i][0] for i in self.indices) + ) + merged_range_count = 0 + last = None + for (start, end) in ranges: + if last is not None and start != last + 1: + merged_range_count += 1 + last = end + format2_bytes = ( + # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-2 + # uint16 coverageFormat Format identifier — format = 2 + # uint16 rangeCount Number of RangeRecords + 4 + # RangeRecord rangeRecords[rangeCount] Array of glyph ranges — ordered by startGlyphID. + # uint16 startGlyphID First glyph ID in the range + # uint16 endGlyphID Last glyph ID in the range + # uint16 startCoverageIndex Coverage Index of first glyph ID in range + + merged_range_count * 6 + ) + return min(format1_bytes, format2_bytes) + + @property + def classDef1_bytes(self): + # We can skip encoding one of the Class1 definitions, and use + # Class1=0 to represent it instead, because Class1 is gated by the + # Coverage definition. Use Class1=0 for the highest byte savings. + # Going through all options takes too long, pick the biggest class + # = what happens in otlLib.builder.ClassDefBuilder.classes() + biggest_index = max(self.indices, key=lambda i: len(self.ctx.all_class1[i])) + return _classDef_bytes( + self.ctx.all_class1_data, [i for i in self.indices if i != biggest_index] + ) + + @property + def classDef2_bytes(self): + # All Class2 need to be encoded because we can't use Class2=0 + return _classDef_bytes(self.ctx.all_class2_data, self.column_indices) + + +def cluster_pairs_by_class2_coverage_custom_cost( + font: TTFont, + pairs: Pairs, + compression: int = 5, +) -> List[Pairs]: + if not pairs: + # The subtable was actually empty? + return [pairs] + + # Sorted for reproducibility/determinism + all_class1 = sorted(set(pair[0] for pair in pairs)) + all_class2 = sorted(set(pair[1] for pair in pairs)) + + # Use Python's big ints for binary vectors representing each line + lines = [ + sum( + 1 << i if (class1, class2) in pairs else 0 + for i, class2 in enumerate(all_class2) + ) + for class1 in all_class1 + ] + + # Map glyph names to ids and work with ints throughout for ClassDef formats + name_to_id = font.getReverseGlyphMap() + # Each entry in the arrays below is (range_count, min_glyph_id, max_glyph_id) + all_class1_data = [ + _getClassRanges(name_to_id[name] for name in cls) for cls in all_class1 + ] + all_class2_data = [ + _getClassRanges(name_to_id[name] for name in cls) for cls in all_class2 + ] + + format1 = 0 + format2 = 0 + for pair, value in pairs.items(): + format1 |= value[0].getEffectiveFormat() if value[0] else 0 + format2 |= value[1].getEffectiveFormat() if value[1] else 0 + valueFormat1_bytes = bit_count(format1) * 2 + valueFormat2_bytes = bit_count(format2) * 2 + + ctx = ClusteringContext( + lines, + all_class1, + all_class1_data, + all_class2_data, + valueFormat1_bytes, + valueFormat2_bytes, + ) + + cluster_cache: Dict[int, Cluster] = {} + + def make_cluster(indices: int) -> Cluster: + cluster = cluster_cache.get(indices, None) + if cluster is not None: + return cluster + cluster = Cluster(ctx, indices) + cluster_cache[indices] = cluster + return cluster + + def merge(cluster: Cluster, other: Cluster) -> Cluster: + return make_cluster(cluster.indices_bitmask | other.indices_bitmask) + + # Agglomerative clustering by hand, checking the cost gain of the new + # cluster against the previously separate clusters + # Start with 1 cluster per line + # cluster = set of lines = new subtable + clusters = [make_cluster(1 << i) for i in range(len(lines))] + + # Cost of 1 cluster with everything + # `(1 << len) - 1` gives a bitmask full of 1's of length `len` + cost_before_splitting = make_cluster((1 << len(lines)) - 1).cost + log.debug(f" len(clusters) = {len(clusters)}") + + while len(clusters) > 1: + lowest_cost_change = None + best_cluster_index = None + best_other_index = None + best_merged = None + for i, cluster in enumerate(clusters): + for j, other in enumerate(clusters[i + 1 :]): + merged = merge(cluster, other) + cost_change = merged.cost - cluster.cost - other.cost + if lowest_cost_change is None or cost_change < lowest_cost_change: + lowest_cost_change = cost_change + best_cluster_index = i + best_other_index = i + 1 + j + best_merged = merged + assert lowest_cost_change is not None + assert best_cluster_index is not None + assert best_other_index is not None + assert best_merged is not None + + # If the best merge we found is still taking down the file size, then + # there's no question: we must do it, because it's beneficial in both + # ways (lower file size and lower number of subtables). However, if the + # best merge we found is not reducing file size anymore, then we need to + # look at the other stop criteria = the compression factor. + if lowest_cost_change > 0: + # Stop critera: check whether we should keep merging. + # Compute size reduction brought by splitting + cost_after_splitting = sum(c.cost for c in clusters) + # size_reduction so that after = before * (1 - size_reduction) + # E.g. before = 1000, after = 800, 1 - 800/1000 = 0.2 + size_reduction = 1 - cost_after_splitting / cost_before_splitting + + # Force more merging by taking into account the compression number. + # Target behaviour: compression number = 1 to 9, default 5 like gzip + # - 1 = accept to add 1 subtable to reduce size by 50% + # - 5 = accept to add 5 subtables to reduce size by 50% + # See https://github.com/harfbuzz/packtab/blob/master/Lib/packTab/__init__.py#L690-L691 + # Given the size reduction we have achieved so far, compute how many + # new subtables are acceptable. + max_new_subtables = -log2(1 - size_reduction) * compression + log.debug( + f" len(clusters) = {len(clusters):3d} size_reduction={size_reduction:5.2f} max_new_subtables={max_new_subtables}", + ) + if compression == 9: + # Override level 9 to mean: create any number of subtables + max_new_subtables = len(clusters) + + # If we have managed to take the number of new subtables below the + # threshold, then we can stop. + if len(clusters) <= max_new_subtables + 1: + break + + # No reason to stop yet, do the merge and move on to the next. + del clusters[best_other_index] + clusters[best_cluster_index] = best_merged + + # All clusters are final; turn bitmasks back into the "Pairs" format + pairs_by_class1: Dict[Tuple[str, ...], Pairs] = defaultdict(dict) + for pair, values in pairs.items(): + pairs_by_class1[pair[0]][pair] = values + pairs_groups: List[Pairs] = [] + for cluster in clusters: + pairs_group: Pairs = dict() + for i in cluster.indices: + class1 = all_class1[i] + pairs_group.update(pairs_by_class1[class1]) + pairs_groups.append(pairs_group) + return pairs_groups |