# This file contains an ordered list of rules used to create a mapping from code points to strings.
# Each rule adds mappings, possibly overriding those added by earlier rules.
# The rules have the format below:
#
#   <source> ; <target> ; <targetFilter> # comment
#
# Code points matching the source are mapped to the target IF the target matches the targetFilter.
#
# source
#   The source can be based on properties (UnicodeSet) or explicit code points:
#     UnicodeSet (like [:Lm:]),
#     a single code point (like 0130), or
#     a code point range (like 2170-217F)
#
# target
#   The target can be one or more hex codes, or special instructions:
#     XXXX      a specific code point (like 0130)
#     exclude   the code point maps to itself
#     caseonly  no normalization is done, but case folding is (plus NFC)
#     delete    the code point maps to the empty string (is removed)
#     nfc       do nfc only
#     bracket   use basic mapping, but add space before and after
#     ok        it is ok to use the basic mapping (NFKC+caseFolding)
#     <empty>   = delete
#
# targetFilter
#   The targetFilter is optional.
#     UnicodeSet  adds restriction on all code points of target
#     <empty>     = no restriction
#
# Commands (determine the characters listed)
#
#   @LIST=ALL      - (default) list all characters
#   @LIST=SHOW_AGE - list all characters, show age of new characters
#   @LIST=ONLY_OLD - only list Unicode 5.0 characters
#
# Running GenerateNormalizeForMatch produces a log file, with the following format:
#   - Each line is repeated, with the line number
#   - Any rule is followed by example characters where it makes a difference.
#   - The ↛ shows the old mapping, before the rule was added
#   - The → shows the new mapping, after the rule was added
# Example
#   59: [:decomposition_type=circle:] ; caseonly ;
#   24B6 «Ⓐ» CIRCLED LATIN CAPITAL LETTER A ; caseonly ; # «Ⓐ» ↛ «Ⓐ» → «ⓐ»
#
# If the log file name contains .htm, then an HTML file is generated.
# ====================================
# Fix the case (and NFC) of all characters
# We do this first, just to show progressive steps
# Order matters in this file: each rule may override mappings added by the rules above it.
# The third field, where present, is the targetFilter; [:^whitespace:] only accepts
# targets whose code points are all non-whitespace.

[:assigned:] ; caseonly ; [:^whitespace:]
[:decomposition_type=wide:] ; ok ;
[:decomposition_type=narrow:] ; ok ;
[:whitespace:] ; ok

# Change almost all characters and whitespace (by default) using NFKC+case

[:assigned:] ; ok ; [:^whitespace:]

# We adjust certain special cases, bracketing or excluding
# NOTE(review): "bracket_circle" is not among the target instructions described in the
# file header (only "bracket" is) - confirm its meaning against GenerateNormalizeForMatch.

[:decomposition_type=circle:] ; bracket_circle ;
[:decomposition_type=fraction:] ; bracket
# Superscripts and subscripts are only case-folded (plus NFC), not otherwise normalized.
[:decomposition_type=super:] ; caseonly
[:decomposition_type=sub:] ; caseonly

# Remove all default ignorables

[:default_ignorable_code_point:] ; delete ;

# Fix all decimal numbers
# Every decimal digit (General_Category Nd) is mapped to the ASCII digit 0030..0039
# that has the same numeric value.

[[:numeric_value=0:]&[:Nd:]] ; 0030
[[:numeric_value=1:]&[:Nd:]] ; 0031
[[:numeric_value=2:]&[:Nd:]] ; 0032
[[:numeric_value=3:]&[:Nd:]] ; 0033
[[:numeric_value=4:]&[:Nd:]] ; 0034
[[:numeric_value=5:]&[:Nd:]] ; 0035
[[:numeric_value=6:]&[:Nd:]] ; 0036
[[:numeric_value=7:]&[:Nd:]] ; 0037
[[:numeric_value=8:]&[:Nd:]] ; 0038
[[:numeric_value=9:]&[:Nd:]] ; 0039

# Detailed changes
# Many of these originally come from the UCA file.

# Special handling of special slashes, ZWSP, full-width macron, and dotted I.
# FRACTION SLASH => SOLIDUS
2044 ; 002F
# DIVISION SLASH (written as a literal character) => SOLIDUS
∕ ; 002F
# ZERO WIDTH SPACE => SPACE
200B ; 0020
FFE3 ; 00AF ; # FULLWIDTH MACRON => MACRON
0130 ; 0069 ; # LATIN CAPITAL LETTER I WITH DOT ABOVE => LATIN SMALL LETTER I

# Sharp S
# "exclude" makes the code point map to itself.
# NOTE(review): the trailing comment on an "exclude" row presumably shows the default
# mapping that is being overridden, not the result of the rule - confirm.
00DF ; exclude ; # LATIN SMALL LETTER SHARP S => LATIN SMALL LETTER S + LATIN SMALL LETTER S
1E9E ; 00DF ; # LATIN CAPITAL LETTER SHARP S => LATIN SMALL LETTER SHARP S

# Yiddish ligatures are expanded to their component letters.
05F0 ; 05D5 05D5 ; # HEBREW LIGATURE YIDDISH DOUBLE VAV => HEBREW LETTER VAV + HEBREW LETTER VAV
05F1 ; 05D5 05D9 ; # HEBREW LIGATURE YIDDISH VAV YOD => HEBREW LETTER VAV + HEBREW LETTER YOD
05F2 ; 05D9 05D9 ; # HEBREW LIGATURE YIDDISH DOUBLE YOD => HEBREW LETTER YOD + HEBREW LETTER YOD
FB1F ; 05D9 05D9 05B7 ; # HEBREW LIGATURE YIDDISH YOD YOD PATAH => HEBREW LETTER YOD + HEBREW LETTER YOD + HEBREW POINT PATAH

# Same "exclude" convention as for 00DF above: the code point maps to itself.
0E33 ; exclude ; # THAI CHARACTER SARA AM => THAI CHARACTER NIKHAHIT + THAI CHARACTER SARA AA

# Fix isolated forms, tatweel
# "delete" maps the code point to the empty string, hence nothing after "=>".
0640 ; delete ; # ARABIC TATWEEL =>
FE73 ; delete ; # ARABIC TAIL FRAGMENT =>
FC5E ; 064C 0651 ; # ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM => ARABIC DAMMATAN + ARABIC SHADDA
FC5F ; 064D 0651 ; # ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM => ARABIC KASRATAN + ARABIC SHADDA
FC60 ; 064E 0651 ; # ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM => ARABIC FATHA + ARABIC SHADDA
FC61 ; 064F 0651 ; # ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM => ARABIC DAMMA + ARABIC SHADDA
FC62 ; 0650 0651 ; # ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM => ARABIC KASRA + ARABIC SHADDA
FC63 ; 0651 0670 ; # ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM => ARABIC SHADDA + ARABIC LETTER SUPERSCRIPT ALEF
FCF2 ; 064E 0651 ; # ARABIC LIGATURE SHADDA WITH FATHA MEDIAL FORM => ARABIC FATHA + ARABIC SHADDA
FCF3 ; 064F 0651 ; # ARABIC LIGATURE SHADDA WITH DAMMA MEDIAL FORM => ARABIC DAMMA + ARABIC SHADDA
FCF4 ; 0650 0651 ; # ARABIC LIGATURE SHADDA WITH KASRA MEDIAL FORM => ARABIC KASRA + ARABIC SHADDA
FE70 ; 064B ; # ARABIC FATHATAN ISOLATED FORM => ARABIC FATHATAN
FE71 ; 064B ; # ARABIC TATWEEL WITH FATHATAN ABOVE => ARABIC FATHATAN
FE72 ; 064C ; # ARABIC DAMMATAN ISOLATED FORM => ARABIC DAMMATAN
FE74 ; 064D ; # ARABIC KASRATAN ISOLATED FORM => ARABIC KASRATAN
FE76 ; 064E ; # ARABIC FATHA ISOLATED FORM => ARABIC FATHA
FE77 ; 064E ; # ARABIC FATHA MEDIAL FORM => ARABIC FATHA
FE78 ; 064F ; # ARABIC DAMMA ISOLATED FORM => ARABIC DAMMA
FE79 ; 064F ; # ARABIC DAMMA MEDIAL FORM => ARABIC DAMMA
FE7A ; 0650 ; # ARABIC KASRA ISOLATED FORM => ARABIC KASRA
FE7B ; 0650 ; # ARABIC KASRA MEDIAL FORM => ARABIC KASRA
FE7C ; 0651 ; # ARABIC SHADDA ISOLATED FORM => ARABIC SHADDA
FE7D ; 0651 ; # ARABIC SHADDA MEDIAL FORM => ARABIC SHADDA
FE7E ; 0652 ; # ARABIC SUKUN ISOLATED FORM => ARABIC SUKUN
FE7F ; 0652 ; # ARABIC SUKUN MEDIAL FORM => ARABIC SUKUN

# Fix Hangul
[:block=Hangul_Compatibility_Jamo:]; exclude ; # a thorough job would require context dependence
# point to isolated characters, not combining jamo
# The halfwidth letters below map onto the compatibility jamo in 3131..3164, which the
# rule above has just excluded (left mapping to themselves).
FFA0 ; 3164 ; # HALFWIDTH HANGUL FILLER => HANGUL FILLER
FFA1 ; 3131 ; # HALFWIDTH HANGUL LETTER KIYEOK => HANGUL LETTER KIYEOK
FFA2 ; 3132 ; # HALFWIDTH HANGUL LETTER SSANGKIYEOK => HANGUL LETTER SSANGKIYEOK
FFA3 ; 3133 ; # HALFWIDTH HANGUL LETTER KIYEOK-SIOS => HANGUL LETTER KIYEOK-SIOS
FFA4 ; 3134 ; # HALFWIDTH HANGUL LETTER NIEUN => HANGUL LETTER NIEUN
FFA5 ; 3135 ; # HALFWIDTH HANGUL LETTER NIEUN-CIEUC => HANGUL LETTER NIEUN-CIEUC
FFA6 ; 3136 ; # HALFWIDTH HANGUL LETTER NIEUN-HIEUH => HANGUL LETTER NIEUN-HIEUH
FFA7 ; 3137 ; # HALFWIDTH HANGUL LETTER TIKEUT => HANGUL LETTER TIKEUT
FFA8 ; 3138 ; # HALFWIDTH HANGUL LETTER SSANGTIKEUT => HANGUL LETTER SSANGTIKEUT
FFA9 ; 3139 ; # HALFWIDTH HANGUL LETTER RIEUL => HANGUL LETTER RIEUL
FFAA ; 313A ; # HALFWIDTH HANGUL LETTER RIEUL-KIYEOK => HANGUL LETTER RIEUL-KIYEOK
FFAB ; 313B ; # HALFWIDTH HANGUL LETTER RIEUL-MIEUM => HANGUL LETTER RIEUL-MIEUM
FFAC ; 313C ; # HALFWIDTH HANGUL LETTER RIEUL-PIEUP => HANGUL LETTER RIEUL-PIEUP
FFAD ; 313D ; # HALFWIDTH HANGUL LETTER RIEUL-SIOS => HANGUL LETTER RIEUL-SIOS
FFAE ; 313E ; # HALFWIDTH HANGUL LETTER RIEUL-THIEUTH => HANGUL LETTER RIEUL-THIEUTH
FFAF ; 313F ; # HALFWIDTH HANGUL LETTER RIEUL-PHIEUPH => HANGUL LETTER RIEUL-PHIEUPH
FFB0 ; 3140 ; # HALFWIDTH HANGUL LETTER RIEUL-HIEUH => HANGUL LETTER RIEUL-HIEUH
FFB1 ; 3141 ; # HALFWIDTH HANGUL LETTER MIEUM => HANGUL LETTER MIEUM
FFB2 ; 3142 ; # HALFWIDTH HANGUL LETTER PIEUP => HANGUL LETTER PIEUP
FFB3 ; 3143 ; # HALFWIDTH HANGUL LETTER SSANGPIEUP => HANGUL LETTER SSANGPIEUP
FFB4 ; 3144 ; # HALFWIDTH HANGUL LETTER PIEUP-SIOS => HANGUL LETTER PIEUP-SIOS
FFB5 ; 3145 ; # HALFWIDTH HANGUL LETTER SIOS => HANGUL LETTER SIOS
FFB6 ; 3146 ; # HALFWIDTH HANGUL LETTER SSANGSIOS => HANGUL LETTER SSANGSIOS
FFB7 ; 3147 ; # HALFWIDTH HANGUL LETTER IEUNG => HANGUL LETTER IEUNG
FFB8 ; 3148 ; # HALFWIDTH HANGUL LETTER CIEUC => HANGUL LETTER CIEUC
FFB9 ; 3149 ; # HALFWIDTH HANGUL LETTER SSANGCIEUC => HANGUL LETTER SSANGCIEUC
FFBA ; 314A ; # HALFWIDTH HANGUL LETTER CHIEUCH => HANGUL LETTER CHIEUCH
FFBB ; 314B ; # HALFWIDTH HANGUL LETTER KHIEUKH => HANGUL LETTER KHIEUKH
FFBC ; 314C ; # HALFWIDTH HANGUL LETTER THIEUTH => HANGUL LETTER THIEUTH
FFBD ; 314D ; # HALFWIDTH HANGUL LETTER PHIEUPH => HANGUL LETTER PHIEUPH
FFBE ; 314E ; # HALFWIDTH HANGUL LETTER HIEUH => HANGUL LETTER HIEUH
FFC2 ; 314F ; # HALFWIDTH HANGUL LETTER A => HANGUL LETTER A
FFC3 ; 3150 ; # HALFWIDTH HANGUL LETTER AE => HANGUL LETTER AE
FFC4 ; 3151 ; # HALFWIDTH HANGUL LETTER YA => HANGUL LETTER YA
FFC5 ; 3152 ; # HALFWIDTH HANGUL LETTER YAE => HANGUL LETTER YAE
FFC6 ; 3153 ; # HALFWIDTH HANGUL LETTER EO => HANGUL LETTER EO
FFC7 ; 3154 ; # HALFWIDTH HANGUL LETTER E => HANGUL LETTER E
FFCA ; 3155 ; # HALFWIDTH HANGUL LETTER YEO => HANGUL LETTER YEO
FFCB ; 3156 ; # HALFWIDTH HANGUL LETTER YE => HANGUL LETTER YE
FFCC ; 3157 ; # HALFWIDTH HANGUL LETTER O => HANGUL LETTER O
FFCD ; 3158 ; # HALFWIDTH HANGUL LETTER WA => HANGUL LETTER WA
FFCE ; 3159 ; # HALFWIDTH HANGUL LETTER WAE => HANGUL LETTER WAE
FFCF ; 315A ; # HALFWIDTH HANGUL LETTER OE => HANGUL LETTER OE
FFD2 ; 315B ; # HALFWIDTH HANGUL LETTER YO => HANGUL LETTER YO
FFD3 ; 315C ; # HALFWIDTH HANGUL LETTER U => HANGUL LETTER U
FFD4 ; 315D ; # HALFWIDTH HANGUL LETTER WEO => HANGUL LETTER WEO
FFD5 ; 315E ; # HALFWIDTH HANGUL LETTER WE => HANGUL LETTER WE
FFD6 ; 315F ; # HALFWIDTH HANGUL LETTER WI => HANGUL LETTER WI
FFD7 ; 3160 ; # HALFWIDTH HANGUL LETTER YU => HANGUL LETTER YU
FFDA ; 3161 ; # HALFWIDTH HANGUL LETTER EU => HANGUL LETTER EU
FFDB ; 3162 ; # HALFWIDTH HANGUL LETTER YI => HANGUL LETTER YI
FFDC ; 3163 ; # HALFWIDTH HANGUL LETTER I => HANGUL LETTER I

# Fix sound marks, since they normally occur when the combining forms are really meant
# KATAKANA-HIRAGANA VOICED SOUND MARK => COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
309B ; 3099
# KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK => COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
309C ; 309A

# Fix squared/cubed units
# Source and target are written as literal characters here; the targets keep the
# superscript ² / ³ rather than a plain digit.
# NOTE(review): the m∕s² and rad∕s² targets contain the same non-ASCII slash "∕" that an
# earlier rule in this file maps to 002F - confirm whether targets are re-normalized.
㎡ ; m²
㎢ ; km²
㎠ ; cm²
㎟ ; mm²
㎨ ; m∕s²
㎯ ; rad∕s²
㍸ ; dm²
㎥ ; m³
㎤ ; cm³
㎦ ; km³
㎣ ; mm³
㍹ ; dm³