diff options
author | Haibo Huang <hhb@google.com> | 2018-07-03 17:43:11 -0700 |
---|---|---|
committer | Haibo Huang <hhb@google.com> | 2018-07-04 06:00:25 +0000 |
commit | 8b3c57bdcbbbd9cb1dc98546b567a138207b7638 (patch) | |
tree | af539e10dc3b2b42edd3197b34f1c8b5ee9a82a0 /Lib/fontTools/ttLib/tables/_n_a_m_e.py | |
parent | cc9a86e36194f2ef4456e14d98b810e41e4fa52d (diff) | |
download | fonttools-8b3c57bdcbbbd9cb1dc98546b567a138207b7638.tar.gz |
Upgrade fonttools from 2.4 to 3.28.0 [android-p-preview-5] [android-o-mr1-iot-release-1.0.2] [android-n-iot-release-smart-display-r2]
1. Add METADATA.
2. Run tools/external_updater/updater.sh update fonttools
Test: m checkbuild
Change-Id: Iab9e8c5da04f4c06347a924b4cea04f743f274c3
Diffstat (limited to 'Lib/fontTools/ttLib/tables/_n_a_m_e.py')
-rw-r--r-- | Lib/fontTools/ttLib/tables/_n_a_m_e.py | 868 |
1 files changed, 832 insertions, 36 deletions
diff --git a/Lib/fontTools/ttLib/tables/_n_a_m_e.py b/Lib/fontTools/ttLib/tables/_n_a_m_e.py index 53fde4d7..a30291cc 100644 --- a/Lib/fontTools/ttLib/tables/_n_a_m_e.py +++ b/Lib/fontTools/ttLib/tables/_n_a_m_e.py @@ -1,9 +1,17 @@ +# -*- coding: utf-8 -*- from __future__ import print_function, division, absolute_import +from __future__ import unicode_literals from fontTools.misc.py23 import * from fontTools.misc import sstruct from fontTools.misc.textTools import safeEval +from fontTools.misc.encodingTools import getEncoding +from fontTools.ttLib import newTable from . import DefaultTable import struct +import logging + + +log = logging.getLogger(__name__) nameRecordFormat = """ > # big endian @@ -19,22 +27,27 @@ nameRecordSize = sstruct.calcsize(nameRecordFormat) class table__n_a_m_e(DefaultTable.DefaultTable): - + dependencies = ["ltag"] + def decompile(self, data, ttFont): - format, n, stringOffset = struct.unpack(">HHH", data[:6]) + format, n, stringOffset = struct.unpack(b">HHH", data[:6]) expectedStringOffset = 6 + n * nameRecordSize if stringOffset != expectedStringOffset: - # XXX we need a warn function - print("Warning: 'name' table stringOffset incorrect. Expected: %s; Actual: %s" % (expectedStringOffset, stringOffset)) + log.error( + "'name' table stringOffset incorrect. 
Expected: %s; Actual: %s", + expectedStringOffset, stringOffset) stringData = data[stringOffset:] data = data[6:] self.names = [] for i in range(n): if len(data) < 12: - # compensate for buggy font - break + log.error('skipping malformed name record #%d', i) + continue name, data = sstruct.unpack2(nameRecordFormat, data, NameRecord()) name.string = stringData[name.offset:name.offset+name.length] + if name.offset + name.length > len(stringData): + log.error('skipping malformed name record #%d', i) + continue assert len(name.string) == name.length #if (name.platEncID, name.platformID) in ((0, 0), (1, 3)): # if len(name.string) % 2: @@ -42,33 +55,35 @@ class table__n_a_m_e(DefaultTable.DefaultTable): # print name.__dict__ del name.offset, name.length self.names.append(name) - + def compile(self, ttFont): if not hasattr(self, "names"): # only happens when there are NO name table entries read # from the TTX file self.names = [] - self.names.sort() # sort according to the spec; see NameRecord.__lt__() + names = self.names + names.sort() # sort according to the spec; see NameRecord.__lt__() stringData = b"" format = 0 - n = len(self.names) + n = len(names) stringOffset = 6 + n * sstruct.calcsize(nameRecordFormat) - data = struct.pack(">HHH", format, n, stringOffset) + data = struct.pack(b">HHH", format, n, stringOffset) lastoffset = 0 done = {} # remember the data so we can reuse the "pointers" - for name in self.names: - if name.string in done: - name.offset, name.length = done[name.string] + for name in names: + string = name.toBytes() + if string in done: + name.offset, name.length = done[string] else: - name.offset, name.length = done[name.string] = len(stringData), len(name.string) - stringData = stringData + name.string + name.offset, name.length = done[string] = len(stringData), len(string) + stringData = bytesjoin([stringData, string]) data = data + sstruct.pack(nameRecordFormat, name) return data + stringData - + def toXML(self, writer, ttFont): for name in 
self.names: name.toXML(writer, ttFont) - + def fromXML(self, name, attrs, content, ttFont): if name != "namerecord": return # ignore unknown tags @@ -77,56 +92,347 @@ class table__n_a_m_e(DefaultTable.DefaultTable): name = NameRecord() self.names.append(name) name.fromXML(name, attrs, content, ttFont) - + def getName(self, nameID, platformID, platEncID, langID=None): for namerecord in self.names: - if ( namerecord.nameID == nameID and - namerecord.platformID == platformID and + if ( namerecord.nameID == nameID and + namerecord.platformID == platformID and namerecord.platEncID == platEncID): if langID is None or namerecord.langID == langID: return namerecord return None # not found + def getDebugName(self, nameID): + englishName = someName = None + for name in self.names: + if name.nameID != nameID: + continue + try: + unistr = name.toUnicode() + except UnicodeDecodeError: + continue + + someName = unistr + if (name.platformID, name.langID) in ((1, 0), (3, 0x409)): + englishName = unistr + break + if englishName: + return englishName + elif someName: + return someName + else: + return None + + def setName(self, string, nameID, platformID, platEncID, langID): + """ Set the 'string' for the name record identified by 'nameID', 'platformID', + 'platEncID' and 'langID'. If a record with that nameID doesn't exist, create it + and append to the name table. + + 'string' can be of type `str` (`unicode` in PY2) or `bytes`. In the latter case, + it is assumed to be already encoded with the correct plaform-specific encoding + identified by the (platformID, platEncID, langID) triplet. A warning is issued + to prevent unexpected results. 
+ """ + if not hasattr(self, 'names'): + self.names = [] + if not isinstance(string, unicode): + if isinstance(string, bytes): + log.warning( + "name string is bytes, ensure it's correctly encoded: %r", string) + else: + raise TypeError( + "expected unicode or bytes, found %s: %r" % ( + type(string).__name__, string)) + namerecord = self.getName(nameID, platformID, platEncID, langID) + if namerecord: + namerecord.string = string + else: + self.names.append(makeName(string, nameID, platformID, platEncID, langID)) + + def _findUnusedNameID(self, minNameID=256): + """Finds an unused name id. + + The nameID is assigned in the range between 'minNameID' and 32767 (inclusive), + following the last nameID in the name table. + """ + names = getattr(self, 'names', []) + nameID = 1 + max([n.nameID for n in names] + [minNameID - 1]) + if nameID > 32767: + raise ValueError("nameID must be less than 32768") + return nameID + + def addMultilingualName(self, names, ttFont=None, nameID=None): + """Add a multilingual name, returning its name ID + + 'names' is a dictionary with the name in multiple languages, + such as {'en': 'Pale', 'de': 'Blaß', 'de-CH': 'Blass'}. + The keys can be arbitrary IETF BCP 47 language codes; + the values are Unicode strings. + + 'ttFont' is the TTFont to which the names are added, or None. + If present, the font's 'ltag' table can get populated + to store exotic language codes, which allows encoding + names that otherwise cannot get encoded at all. + + 'nameID' is the name ID to be used, or None to let the library + pick an unused name ID. + """ + if not hasattr(self, 'names'): + self.names = [] + if nameID is None: + nameID = self._findUnusedNameID() + # TODO: Should minimize BCP 47 language codes. 
+ # https://github.com/fonttools/fonttools/issues/930 + for lang, name in sorted(names.items()): + # Apple platforms have been recognizing Windows names + # since early OSX (~2001), so we only add names + # for the Macintosh platform when we cannot not make + # a Windows name. This can happen for exotic BCP47 + # language tags that have no Windows language code. + windowsName = _makeWindowsName(name, nameID, lang) + if windowsName is not None: + self.names.append(windowsName) + else: + macName = _makeMacName(name, nameID, lang, ttFont) + if macName is not None: + self.names.append(macName) + return nameID + + def addName(self, string, platforms=((1, 0, 0), (3, 1, 0x409)), minNameID=255): + """ Add a new name record containing 'string' for each (platformID, platEncID, + langID) tuple specified in the 'platforms' list. + + The nameID is assigned in the range between 'minNameID'+1 and 32767 (inclusive), + following the last nameID in the name table. + If no 'platforms' are specified, two English name records are added, one for the + Macintosh (platformID=0), and one for the Windows platform (3). + + The 'string' must be a Unicode string, so it can be encoded with different, + platform-specific encodings. + + Return the new nameID. 
+ """ + assert len(platforms) > 0, \ + "'platforms' must contain at least one (platformID, platEncID, langID) tuple" + if not hasattr(self, 'names'): + self.names = [] + if not isinstance(string, unicode): + raise TypeError( + "expected %s, found %s: %r" % ( + unicode.__name__, type(string).__name__,string )) + nameID = self._findUnusedNameID(minNameID + 1) + for platformID, platEncID, langID in platforms: + self.names.append(makeName(string, nameID, platformID, platEncID, langID)) + return nameID + + +def makeName(string, nameID, platformID, platEncID, langID): + name = NameRecord() + name.string, name.nameID, name.platformID, name.platEncID, name.langID = ( + string, nameID, platformID, platEncID, langID) + return name + + +def _makeWindowsName(name, nameID, language): + """Create a NameRecord for the Microsoft Windows platform + + 'language' is an arbitrary IETF BCP 47 language identifier such + as 'en', 'de-CH', 'de-AT-1901', or 'fa-Latn'. If Microsoft Windows + does not support the desired language, the result will be None. + Future versions of fonttools might return a NameRecord for the + OpenType 'name' table format 1, but this is not implemented yet. + """ + langID = _WINDOWS_LANGUAGE_CODES.get(language.lower()) + if langID is not None: + return makeName(name, nameID, 3, 1, langID) + else: + log.warning("cannot add Windows name in language %s " + "because fonttools does not yet support " + "name table format 1" % language) + return None + + +def _makeMacName(name, nameID, language, font=None): + """Create a NameRecord for Apple platforms + + 'language' is an arbitrary IETF BCP 47 language identifier such + as 'en', 'de-CH', 'de-AT-1901', or 'fa-Latn'. When possible, we + create a Macintosh NameRecord that is understood by old applications + (platform ID 1 and an old-style Macintosh language enum). If this + is not possible, we create a Unicode NameRecord (platform ID 0) + whose language points to the font’s 'ltag' table. 
The latter + can encode any string in any language, but legacy applications + might not recognize the format (in which case they will ignore + those names). + + 'font' should be the TTFont for which you want to create a name. + If 'font' is None, we only return NameRecords for legacy Macintosh; + in that case, the result will be None for names that need to + be encoded with an 'ltag' table. + + See the section “The language identifier” in Apple’s specification: + https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html + """ + macLang = _MAC_LANGUAGE_CODES.get(language.lower()) + macScript = _MAC_LANGUAGE_TO_SCRIPT.get(macLang) + if macLang is not None and macScript is not None: + encoding = getEncoding(1, macScript, macLang, default="ascii") + # Check if we can actually encode this name. If we can't, + # for example because we have no support for the legacy + # encoding, or because the name string contains Unicode + # characters that the legacy encoding cannot represent, + # we fall back to encoding the name in Unicode and put + # the language tag into the ltag table. + try: + _ = tobytes(name, encoding, errors="strict") + return makeName(name, nameID, 1, macScript, macLang) + except UnicodeEncodeError: + pass + if font is not None: + ltag = font.tables.get("ltag") + if ltag is None: + ltag = font["ltag"] = newTable("ltag") + # 0 = Unicode; 4 = “Unicode 2.0 or later semantics (non-BMP characters allowed)” + # “The preferred platform-specific code for Unicode would be 3 or 4.” + # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html + return makeName(name, nameID, 0, 4, ltag.addTag(language)) + else: + log.warning("cannot store language %s into 'ltag' table " + "without having access to the TTFont object" % + language) + return None + class NameRecord(object): - + + def getEncoding(self, default='ascii'): + """Returns the Python encoding name for this name entry based on its platformID, + platEncID, and langID. 
If encoding for these values is not known, by default + 'ascii' is returned. That can be overriden by passing a value to the default + argument. + """ + return getEncoding(self.platformID, self.platEncID, self.langID, default) + + def encodingIsUnicodeCompatible(self): + return self.getEncoding(None) in ['utf_16_be', 'ucs2be', 'ascii', 'latin1'] + + def __str__(self): + return self.toStr(errors='backslashreplace') + def isUnicode(self): return (self.platformID == 0 or (self.platformID == 3 and self.platEncID in [0, 1, 10])) + def toUnicode(self, errors='strict'): + """ + If self.string is a Unicode string, return it; otherwise try decoding the + bytes in self.string to a Unicode string using the encoding of this + entry as returned by self.getEncoding(); Note that self.getEncoding() + returns 'ascii' if the encoding is unknown to the library. + + Certain heuristics are performed to recover data from bytes that are + ill-formed in the chosen encoding, or that otherwise look misencoded + (mostly around bad UTF-16BE encoded bytes, or bytes that look like UTF-16BE + but marked otherwise). If the bytes are ill-formed and the heuristics fail, + the error is handled according to the errors parameter to this function, which is + passed to the underlying decode() function; by default it throws a + UnicodeDecodeError exception. + + Note: The mentioned heuristics mean that roundtripping a font to XML and back + to binary might recover some misencoded data whereas just loading the font + and saving it back will not change them. + """ + def isascii(b): + return (b >= 0x20 and b <= 0x7E) or b in [0x09, 0x0A, 0x0D] + encoding = self.getEncoding() + string = self.string + + if encoding == 'utf_16_be' and len(string) % 2 == 1: + # Recover badly encoded UTF-16 strings that have an odd number of bytes: + # - If the last byte is zero, drop it. Otherwise, + # - If all the odd bytes are zero and all the even bytes are ASCII, + # prepend one zero byte. 
Otherwise, + # - If first byte is zero and all other bytes are ASCII, insert zero + # bytes between consecutive ASCII bytes. + # + # (Yes, I've seen all of these in the wild... sigh) + if byteord(string[-1]) == 0: + string = string[:-1] + elif all(byteord(b) == 0 if i % 2 else isascii(byteord(b)) for i,b in enumerate(string)): + string = b'\0' + string + elif byteord(string[0]) == 0 and all(isascii(byteord(b)) for b in string[1:]): + string = bytesjoin(b'\0'+bytechr(byteord(b)) for b in string[1:]) + + string = tounicode(string, encoding=encoding, errors=errors) + + # If decoded strings still looks like UTF-16BE, it suggests a double-encoding. + # Fix it up. + if all(ord(c) == 0 if i % 2 == 0 else isascii(ord(c)) for i,c in enumerate(string)): + # If string claims to be Mac encoding, but looks like UTF-16BE with ASCII text, + # narrow it down. + string = ''.join(c for c in string[1::2]) + + return string + + def toBytes(self, errors='strict'): + """ If self.string is a bytes object, return it; otherwise try encoding + the Unicode string in self.string to bytes using the encoding of this + entry as returned by self.getEncoding(); Note that self.getEncoding() + returns 'ascii' if the encoding is unknown to the library. + + If the Unicode string cannot be encoded to bytes in the chosen encoding, + the error is handled according to the errors parameter to this function, + which is passed to the underlying encode() function; by default it throws a + UnicodeEncodeError exception. 
+ """ + return tobytes(self.string, encoding=self.getEncoding(), errors=errors) + + def toStr(self, errors='strict'): + if str == bytes: + # python 2 + return self.toBytes(errors) + else: + # python 3 + return self.toUnicode(errors) + def toXML(self, writer, ttFont): - writer.begintag("namerecord", [ + try: + unistr = self.toUnicode() + except UnicodeDecodeError: + unistr = None + attrs = [ ("nameID", self.nameID), ("platformID", self.platformID), ("platEncID", self.platEncID), ("langID", hex(self.langID)), - ]) + ] + + if unistr is None or not self.encodingIsUnicodeCompatible(): + attrs.append(("unicode", unistr is not None)) + + writer.begintag("namerecord", attrs) writer.newline() - if self.isUnicode(): - if len(self.string) % 2: - # no, shouldn't happen, but some of the Apple - # tools cause this anyway :-( - writer.write16bit(self.string + b"\0", strip=True) - else: - writer.write16bit(self.string, strip=True) + if unistr is not None: + writer.write(unistr) else: - writer.write8bit(self.string, strip=True) + writer.write8bit(self.string) writer.newline() writer.endtag("namerecord") writer.newline() - + def fromXML(self, name, attrs, content, ttFont): self.nameID = safeEval(attrs["nameID"]) self.platformID = safeEval(attrs["platformID"]) self.platEncID = safeEval(attrs["platEncID"]) self.langID = safeEval(attrs["langID"]) s = strjoin(content).strip() - if self.isUnicode(): - self.string = s.encode("utf_16_be") + encoding = self.getEncoding() + if self.encodingIsUnicodeCompatible() or safeEval(attrs.get("unicode", "False")): + self.string = s.encode(encoding) else: # This is the inverse of write8bit... 
self.string = s.encode("latin1") - + def __lt__(self, other): if type(self) != type(other): return NotImplemented @@ -147,7 +453,497 @@ class NameRecord(object): getattr(other, "string", None), ) return selfTuple < otherTuple - + def __repr__(self): return "<NameRecord NameID=%d; PlatformID=%d; LanguageID=%d>" % ( self.nameID, self.platformID, self.langID) + + +# Windows language ID → IETF BCP-47 language tag +# +# While Microsoft indicates a region/country for all its language +# IDs, we follow Unicode practice by omitting “most likely subtags” +# as per Unicode CLDR. For example, English is simply “en” and not +# “en-Latn” because according to Unicode, the default script +# for English is Latin. +# +# http://www.unicode.org/cldr/charts/latest/supplemental/likely_subtags.html +# http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry +_WINDOWS_LANGUAGES = { + 0x0436: 'af', + 0x041C: 'sq', + 0x0484: 'gsw', + 0x045E: 'am', + 0x1401: 'ar-DZ', + 0x3C01: 'ar-BH', + 0x0C01: 'ar', + 0x0801: 'ar-IQ', + 0x2C01: 'ar-JO', + 0x3401: 'ar-KW', + 0x3001: 'ar-LB', + 0x1001: 'ar-LY', + 0x1801: 'ary', + 0x2001: 'ar-OM', + 0x4001: 'ar-QA', + 0x0401: 'ar-SA', + 0x2801: 'ar-SY', + 0x1C01: 'aeb', + 0x3801: 'ar-AE', + 0x2401: 'ar-YE', + 0x042B: 'hy', + 0x044D: 'as', + 0x082C: 'az-Cyrl', + 0x042C: 'az', + 0x046D: 'ba', + 0x042D: 'eu', + 0x0423: 'be', + 0x0845: 'bn', + 0x0445: 'bn-IN', + 0x201A: 'bs-Cyrl', + 0x141A: 'bs', + 0x047E: 'br', + 0x0402: 'bg', + 0x0403: 'ca', + 0x0C04: 'zh-HK', + 0x1404: 'zh-MO', + 0x0804: 'zh', + 0x1004: 'zh-SG', + 0x0404: 'zh-TW', + 0x0483: 'co', + 0x041A: 'hr', + 0x101A: 'hr-BA', + 0x0405: 'cs', + 0x0406: 'da', + 0x048C: 'prs', + 0x0465: 'dv', + 0x0813: 'nl-BE', + 0x0413: 'nl', + 0x0C09: 'en-AU', + 0x2809: 'en-BZ', + 0x1009: 'en-CA', + 0x2409: 'en-029', + 0x4009: 'en-IN', + 0x1809: 'en-IE', + 0x2009: 'en-JM', + 0x4409: 'en-MY', + 0x1409: 'en-NZ', + 0x3409: 'en-PH', + 0x4809: 'en-SG', + 0x1C09: 'en-ZA', + 0x2C09: 'en-TT', + 0x0809: 
'en-GB', + 0x0409: 'en', + 0x3009: 'en-ZW', + 0x0425: 'et', + 0x0438: 'fo', + 0x0464: 'fil', + 0x040B: 'fi', + 0x080C: 'fr-BE', + 0x0C0C: 'fr-CA', + 0x040C: 'fr', + 0x140C: 'fr-LU', + 0x180C: 'fr-MC', + 0x100C: 'fr-CH', + 0x0462: 'fy', + 0x0456: 'gl', + 0x0437: 'ka', + 0x0C07: 'de-AT', + 0x0407: 'de', + 0x1407: 'de-LI', + 0x1007: 'de-LU', + 0x0807: 'de-CH', + 0x0408: 'el', + 0x046F: 'kl', + 0x0447: 'gu', + 0x0468: 'ha', + 0x040D: 'he', + 0x0439: 'hi', + 0x040E: 'hu', + 0x040F: 'is', + 0x0470: 'ig', + 0x0421: 'id', + 0x045D: 'iu', + 0x085D: 'iu-Latn', + 0x083C: 'ga', + 0x0434: 'xh', + 0x0435: 'zu', + 0x0410: 'it', + 0x0810: 'it-CH', + 0x0411: 'ja', + 0x044B: 'kn', + 0x043F: 'kk', + 0x0453: 'km', + 0x0486: 'quc', + 0x0487: 'rw', + 0x0441: 'sw', + 0x0457: 'kok', + 0x0412: 'ko', + 0x0440: 'ky', + 0x0454: 'lo', + 0x0426: 'lv', + 0x0427: 'lt', + 0x082E: 'dsb', + 0x046E: 'lb', + 0x042F: 'mk', + 0x083E: 'ms-BN', + 0x043E: 'ms', + 0x044C: 'ml', + 0x043A: 'mt', + 0x0481: 'mi', + 0x047A: 'arn', + 0x044E: 'mr', + 0x047C: 'moh', + 0x0450: 'mn', + 0x0850: 'mn-CN', + 0x0461: 'ne', + 0x0414: 'nb', + 0x0814: 'nn', + 0x0482: 'oc', + 0x0448: 'or', + 0x0463: 'ps', + 0x0415: 'pl', + 0x0416: 'pt', + 0x0816: 'pt-PT', + 0x0446: 'pa', + 0x046B: 'qu-BO', + 0x086B: 'qu-EC', + 0x0C6B: 'qu', + 0x0418: 'ro', + 0x0417: 'rm', + 0x0419: 'ru', + 0x243B: 'smn', + 0x103B: 'smj-NO', + 0x143B: 'smj', + 0x0C3B: 'se-FI', + 0x043B: 'se', + 0x083B: 'se-SE', + 0x203B: 'sms', + 0x183B: 'sma-NO', + 0x1C3B: 'sms', + 0x044F: 'sa', + 0x1C1A: 'sr-Cyrl-BA', + 0x0C1A: 'sr', + 0x181A: 'sr-Latn-BA', + 0x081A: 'sr-Latn', + 0x046C: 'nso', + 0x0432: 'tn', + 0x045B: 'si', + 0x041B: 'sk', + 0x0424: 'sl', + 0x2C0A: 'es-AR', + 0x400A: 'es-BO', + 0x340A: 'es-CL', + 0x240A: 'es-CO', + 0x140A: 'es-CR', + 0x1C0A: 'es-DO', + 0x300A: 'es-EC', + 0x440A: 'es-SV', + 0x100A: 'es-GT', + 0x480A: 'es-HN', + 0x080A: 'es-MX', + 0x4C0A: 'es-NI', + 0x180A: 'es-PA', + 0x3C0A: 'es-PY', + 0x280A: 'es-PE', + 0x500A: 'es-PR', + + # Microsoft has 
defined two different language codes for + # “Spanish with modern sorting” and “Spanish with traditional + # sorting”. This makes sense for collation APIs, and it would be + # possible to express this in BCP 47 language tags via Unicode + # extensions (eg., “es-u-co-trad” is “Spanish with traditional + # sorting”). However, for storing names in fonts, this distinction + # does not make sense, so we use “es” in both cases. + 0x0C0A: 'es', + 0x040A: 'es', + + 0x540A: 'es-US', + 0x380A: 'es-UY', + 0x200A: 'es-VE', + 0x081D: 'sv-FI', + 0x041D: 'sv', + 0x045A: 'syr', + 0x0428: 'tg', + 0x085F: 'tzm', + 0x0449: 'ta', + 0x0444: 'tt', + 0x044A: 'te', + 0x041E: 'th', + 0x0451: 'bo', + 0x041F: 'tr', + 0x0442: 'tk', + 0x0480: 'ug', + 0x0422: 'uk', + 0x042E: 'hsb', + 0x0420: 'ur', + 0x0843: 'uz-Cyrl', + 0x0443: 'uz', + 0x042A: 'vi', + 0x0452: 'cy', + 0x0488: 'wo', + 0x0485: 'sah', + 0x0478: 'ii', + 0x046A: 'yo', +} + + +_MAC_LANGUAGES = { + 0: 'en', + 1: 'fr', + 2: 'de', + 3: 'it', + 4: 'nl', + 5: 'sv', + 6: 'es', + 7: 'da', + 8: 'pt', + 9: 'no', + 10: 'he', + 11: 'ja', + 12: 'ar', + 13: 'fi', + 14: 'el', + 15: 'is', + 16: 'mt', + 17: 'tr', + 18: 'hr', + 19: 'zh-Hant', + 20: 'ur', + 21: 'hi', + 22: 'th', + 23: 'ko', + 24: 'lt', + 25: 'pl', + 26: 'hu', + 27: 'es', + 28: 'lv', + 29: 'se', + 30: 'fo', + 31: 'fa', + 32: 'ru', + 33: 'zh', + 34: 'nl-BE', + 35: 'ga', + 36: 'sq', + 37: 'ro', + 38: 'cz', + 39: 'sk', + 40: 'sl', + 41: 'yi', + 42: 'sr', + 43: 'mk', + 44: 'bg', + 45: 'uk', + 46: 'be', + 47: 'uz', + 48: 'kk', + 49: 'az-Cyrl', + 50: 'az-Arab', + 51: 'hy', + 52: 'ka', + 53: 'mo', + 54: 'ky', + 55: 'tg', + 56: 'tk', + 57: 'mn-CN', + 58: 'mn', + 59: 'ps', + 60: 'ks', + 61: 'ku', + 62: 'sd', + 63: 'bo', + 64: 'ne', + 65: 'sa', + 66: 'mr', + 67: 'bn', + 68: 'as', + 69: 'gu', + 70: 'pa', + 71: 'or', + 72: 'ml', + 73: 'kn', + 74: 'ta', + 75: 'te', + 76: 'si', + 77: 'my', + 78: 'km', + 79: 'lo', + 80: 'vi', + 81: 'id', + 82: 'tl', + 83: 'ms', + 84: 'ms-Arab', + 85: 'am', + 86: 'ti', 
+ 87: 'om', + 88: 'so', + 89: 'sw', + 90: 'rw', + 91: 'rn', + 92: 'ny', + 93: 'mg', + 94: 'eo', + 128: 'cy', + 129: 'eu', + 130: 'ca', + 131: 'la', + 132: 'qu', + 133: 'gn', + 134: 'ay', + 135: 'tt', + 136: 'ug', + 137: 'dz', + 138: 'jv', + 139: 'su', + 140: 'gl', + 141: 'af', + 142: 'br', + 143: 'iu', + 144: 'gd', + 145: 'gv', + 146: 'ga', + 147: 'to', + 148: 'el-polyton', + 149: 'kl', + 150: 'az', + 151: 'nn', +} + + +_WINDOWS_LANGUAGE_CODES = {lang.lower(): code for code, lang in _WINDOWS_LANGUAGES.items()} +_MAC_LANGUAGE_CODES = {lang.lower(): code for code, lang in _MAC_LANGUAGES.items()} + + +# MacOS language ID → MacOS script ID +# +# Note that the script ID is not sufficient to determine what encoding +# to use in TrueType files. For some languages, MacOS used a modification +# of a mainstream script. For example, an Icelandic name would be stored +# with smRoman in the TrueType naming table, but the actual encoding +# is a special Icelandic version of the normal Macintosh Roman encoding. +# As another example, Inuktitut uses an 8-bit encoding for Canadian Aboriginal +# Syllables but MacOS had run out of available script codes, so this was +# done as a (pretty radical) “modification” of Ethiopic. 
+# +# http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/Readme.txt +_MAC_LANGUAGE_TO_SCRIPT = { + 0: 0, # langEnglish → smRoman + 1: 0, # langFrench → smRoman + 2: 0, # langGerman → smRoman + 3: 0, # langItalian → smRoman + 4: 0, # langDutch → smRoman + 5: 0, # langSwedish → smRoman + 6: 0, # langSpanish → smRoman + 7: 0, # langDanish → smRoman + 8: 0, # langPortuguese → smRoman + 9: 0, # langNorwegian → smRoman + 10: 5, # langHebrew → smHebrew + 11: 1, # langJapanese → smJapanese + 12: 4, # langArabic → smArabic + 13: 0, # langFinnish → smRoman + 14: 6, # langGreek → smGreek + 15: 0, # langIcelandic → smRoman (modified) + 16: 0, # langMaltese → smRoman + 17: 0, # langTurkish → smRoman (modified) + 18: 0, # langCroatian → smRoman (modified) + 19: 2, # langTradChinese → smTradChinese + 20: 4, # langUrdu → smArabic + 21: 9, # langHindi → smDevanagari + 22: 21, # langThai → smThai + 23: 3, # langKorean → smKorean + 24: 29, # langLithuanian → smCentralEuroRoman + 25: 29, # langPolish → smCentralEuroRoman + 26: 29, # langHungarian → smCentralEuroRoman + 27: 29, # langEstonian → smCentralEuroRoman + 28: 29, # langLatvian → smCentralEuroRoman + 29: 0, # langSami → smRoman + 30: 0, # langFaroese → smRoman (modified) + 31: 4, # langFarsi → smArabic (modified) + 32: 7, # langRussian → smCyrillic + 33: 25, # langSimpChinese → smSimpChinese + 34: 0, # langFlemish → smRoman + 35: 0, # langIrishGaelic → smRoman (modified) + 36: 0, # langAlbanian → smRoman + 37: 0, # langRomanian → smRoman (modified) + 38: 29, # langCzech → smCentralEuroRoman + 39: 29, # langSlovak → smCentralEuroRoman + 40: 0, # langSlovenian → smRoman (modified) + 41: 5, # langYiddish → smHebrew + 42: 7, # langSerbian → smCyrillic + 43: 7, # langMacedonian → smCyrillic + 44: 7, # langBulgarian → smCyrillic + 45: 7, # langUkrainian → smCyrillic (modified) + 46: 7, # langByelorussian → smCyrillic + 47: 7, # langUzbek → smCyrillic + 48: 7, # langKazakh → smCyrillic + 49: 7, # langAzerbaijani → smCyrillic + 50: 4, # 
langAzerbaijanAr → smArabic + 51: 24, # langArmenian → smArmenian + 52: 23, # langGeorgian → smGeorgian + 53: 7, # langMoldavian → smCyrillic + 54: 7, # langKirghiz → smCyrillic + 55: 7, # langTajiki → smCyrillic + 56: 7, # langTurkmen → smCyrillic + 57: 27, # langMongolian → smMongolian + 58: 7, # langMongolianCyr → smCyrillic + 59: 4, # langPashto → smArabic + 60: 4, # langKurdish → smArabic + 61: 4, # langKashmiri → smArabic + 62: 4, # langSindhi → smArabic + 63: 26, # langTibetan → smTibetan + 64: 9, # langNepali → smDevanagari + 65: 9, # langSanskrit → smDevanagari + 66: 9, # langMarathi → smDevanagari + 67: 13, # langBengali → smBengali + 68: 13, # langAssamese → smBengali + 69: 11, # langGujarati → smGujarati + 70: 10, # langPunjabi → smGurmukhi + 71: 12, # langOriya → smOriya + 72: 17, # langMalayalam → smMalayalam + 73: 16, # langKannada → smKannada + 74: 14, # langTamil → smTamil + 75: 15, # langTelugu → smTelugu + 76: 18, # langSinhalese → smSinhalese + 77: 19, # langBurmese → smBurmese + 78: 20, # langKhmer → smKhmer + 79: 22, # langLao → smLao + 80: 30, # langVietnamese → smVietnamese + 81: 0, # langIndonesian → smRoman + 82: 0, # langTagalog → smRoman + 83: 0, # langMalayRoman → smRoman + 84: 4, # langMalayArabic → smArabic + 85: 28, # langAmharic → smEthiopic + 86: 28, # langTigrinya → smEthiopic + 87: 28, # langOromo → smEthiopic + 88: 0, # langSomali → smRoman + 89: 0, # langSwahili → smRoman + 90: 0, # langKinyarwanda → smRoman + 91: 0, # langRundi → smRoman + 92: 0, # langNyanja → smRoman + 93: 0, # langMalagasy → smRoman + 94: 0, # langEsperanto → smRoman + 128: 0, # langWelsh → smRoman (modified) + 129: 0, # langBasque → smRoman + 130: 0, # langCatalan → smRoman + 131: 0, # langLatin → smRoman + 132: 0, # langQuechua → smRoman + 133: 0, # langGuarani → smRoman + 134: 0, # langAymara → smRoman + 135: 7, # langTatar → smCyrillic + 136: 4, # langUighur → smArabic + 137: 26, # langDzongkha → smTibetan + 138: 0, # langJavaneseRom → smRoman + 139: 0, 
# langSundaneseRom → smRoman + 140: 0, # langGalician → smRoman + 141: 0, # langAfrikaans → smRoman + 142: 0, # langBreton → smRoman (modified) + 143: 28, # langInuktitut → smEthiopic (modified) + 144: 0, # langScottishGaelic → smRoman (modified) + 145: 0, # langManxGaelic → smRoman (modified) + 146: 0, # langIrishGaelicScript → smRoman (modified) + 147: 0, # langTongan → smRoman + 148: 6, # langGreekAncient → smRoman + 149: 0, # langGreenlandic → smRoman + 150: 0, # langAzerbaijanRoman → smRoman + 151: 0, # langNynorsk → smRoman +} |