scripts/eucjp_gen.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

#!/bin/sh
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# References:
#   http://encoding.spec.whatwg.org/#euc-jp
#   http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
#   http://www.iana.org/assignments/charset-reg/CP51932
#   Table 3-64 in CJKV Information Processing 2/e.

# Download the following two files into the source/data/mappings directory,
# run this script from that directory, and save its output to
# euc-jp-html5.ucm. (euc-jp-2007.ucm must also be present there.)
#   http://encoding.spec.whatwg.org/index-jis0208.txt
#   http://encoding.spec.whatwg.org/index-jis0212.txt

# Print the fixed head of the generated .ucm file on stdout: the copyright
# banner, the converter metadata, the <icu:state> rows and the opening
# CHARMAP line.
# Takes no arguments. The here-document delimiter is quoted so that every
# character below is emitted verbatim, and the POSIX `name()` form is used
# because the script declares /bin/sh.
preamble() {
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# *   Copyright (C) 1995-2014, International Business Machines
# *   Corporation and others.  All Rights Reserved.
# *
# *   Generated per the algorithm for EUC-JP
# *   described at http://encoding.spec.whatwg.org/#euc-jp.
# *   Added the 34 decoding only (EUC-JP to Unicode) entries from euc-jp-2007.ucm
# *   for the backward compatibility.
# *
# ***************************************************************************
<code_set_name>               "euc-jp-html5"
<char_name_mask>              "AXXXX"
<mb_cur_max>                  3
<mb_cur_min>                  1
<uconv_class>                 "MBCS"
<subchar>                     \xF4\xFE
<subchar1>                    \x1A
<icu:charsetFamily>           "ASCII"

<icu:state>                   0-7f, 8e:2, 8f:3, a1-fe:1
<icu:state>                   a1-fe
<icu:state>                   a1-e2
<icu:state>                   a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
<icu:state>                   a1-fe.u

CHARMAP
PREAMBLE
}

# Emit the 128 single-byte ASCII round-trip rows on stdout, one per line:
#   <U0000> \x00 |0
#   ...
#   <U007F> \x7F |0
# Takes no arguments. Uses a plain counter loop and the POSIX `name()`
# definition form so it does not depend on `seq` or on the `function`
# keyword, neither of which a strict /bin/sh has to provide.
ascii() {
  i=0
  while [ "$i" -le 127 ]; do
    # The same value is printed twice: as the code point and as the byte.
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}


# Emit the 63 round-trip rows for U+FF61..U+FF9F on stdout, one per line:
#   <UFF61> \x8E\xA1 |0
#   ...
#   <UFF9F> \x8E\xDF |0
# Takes no arguments.
#
# Each row is a two-byte sequence led by 0x8E. In the state table printed
# by this script a bare a1-fe byte in the initial state is a lead byte, and
# the a1-e2 trail state is entered only through lead byte 8e, so a
# one-byte \xA1..\xDF row would not be a complete sequence. The WHATWG
# EUC-JP decoder likewise maps lead 0x8E + byte 0xA1..0xDF to
# 0xFF61 - 0xA1 + byte.
#
# NOTE(review): despite the name, this range is the half-width katakana
# block rather than full-width ASCII; the name is kept so existing callers
# keep working.
fullwidth_ascii() {
  i=$((0xA1))
  last=$((0xDF))
  while [ "$i" -le "$last" ]; do
    # 0xFF61 - 0xA1 is the constant distance from trail byte to code point.
    printf '<U%04X> \\x8E\\x%02X |0\n' "$((i + 0xFF61 - 0xA1))" "$i"
    i=$((i + 1))
  done
}


# index-jis0208.txt has index pointers larger than the size of
# the encoding space available in 2-byte Graphic plane of ISO-2022-based
# encoding (94 x 94 = 8836). We have to exclude them because they're for
# Shift-JIS.
# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
# All the bi-directional mapping entries come *before* the uni-directional
# (EUC-JP to Unicode) entries so that we put '|3' if we have seen
# the same Unicode code point earlier in the list. According to the definition
# of 'index pointer' in the W3C encoding spec, it's the first entry in the
# file for a given Unicode code point.

# Emit one CHARMAP row per usable entry of index-jis0208.txt (read from the
# current directory) on stdout. Takes no arguments.
#
# Input fields: $1 = index pointer (decimal), $2 = code point as 0xXXXX.
# Output row:   <UXXXX> \xLL\xTT |F
#   LL = pointer / 94 + 0xA1, TT = pointer % 94 + 0xA1,
#   F  = 0 for the first row of a code point, 3 for any later row of the
#        same code point.
#
# Only pointers 0..8835 fit the 94 x 94 plane; pointer 8836 would already
# produce lead byte 0xFF, so the bound is strict. The 0xA1 offset is spelled
# as decimal 161 because hexadecimal literals in awk program text are a gawk
# extension, and int() makes the truncating division explicit.
jis208() {
  awk '
    /^#/ || /^$/ { next }   # skip comment and blank lines

    $1 < 8836 {
      # Decide the flag before recording the code point as seen.
      flag = ($2 in seen) ? 3 : 0
      seen[$2] = 1
      printf "<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161, flag
    }
  ' index-jis0208.txt
}

# JIS X 0212 is for decoding only ('|3' denotes that).

# Emit one decode-only CHARMAP row per entry of index-jis0212.txt (read
# from the current directory) on stdout. Takes no arguments.
#
# Input fields: $1 = index pointer (decimal), $2 = code point as 0xXXXX.
# Output row:   <UXXXX> \x8F\xLL\xTT |3
#   LL = pointer / 94 + 0xA1, TT = pointer % 94 + 0xA1.
#
# The 0xA1 offset is spelled as decimal 161 because hexadecimal literals in
# awk program text are a gawk extension, and int() makes the truncating
# division explicit instead of relying on the %X conversion.
jis212() {
  awk '
    /^#/ || /^$/ { next }   # skip comment and blank lines

    {
      printf "<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161
    }
  ' index-jis0212.txt
}

# Add the uni-directional mapping entries (EUC-JP to Unicode) that
# are only present in euc-jp-2007.ucm. There are 34 of them. They're added
# for the backward compatibility with the old behavior of Chrome.
# See https://www.w3.org/Bugs/Public/show_bug.cgi?id=25266
# Here are the break-downs:
# 1. 0x8E0xE0 to 0x8E0xE2
#   00A2 00A3 00AC
# 2. JIS X 0212 extra (0x8F 0xF3 0xhh)
#   2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171
#   2172 2173 2174 2175 2176 2177 2178 2179 221A 2220 2229 222A 222B 2235 2252
#   2261 22A5 3231
# 3. JIS X 0208 extra : 0xFC 0xFB => FFE2

# Print, on stdout, every row of euc-jp-2007.ucm that is marked "|3" and
# whose code point does not occur in index-jis0212.txt. Both files are read
# from the current directory. Takes no arguments.
#
# Done in a single awk pass over the two files instead of one grep per
# entry. Code points are compared exactly (case-insensitively) rather than
# as substrings, and lines that do not start with a <U...> code point are
# ignored instead of being word-split.
decode_only_extra() {
  awk '
    # First file (index-jis0212.txt): remember each mapped code point.
    FILENAME == ARGV[1] {
      if ($0 !~ /^#/ && NF >= 2)
        in_jis0212[toupper(substr($2, 3))] = 1
      next
    }

    # Second file (euc-jp-2007.ucm): keep "|3" rows with an unseen code point.
    index($0, "|3") && match($0, /^<U[0-9A-Fa-f]+>/) {
      # RLENGTH covers "<U" + digits + ">", so the digits are RLENGTH - 3 long.
      cp = toupper(substr($0, 3, RLENGTH - 3))
      if (!(cp in in_jis0212))
        print
    }
  ' index-jis0212.txt euc-jp-2007.ucm
}

# Write every CHARMAP row to stdout in generation order: the output of
# ascii, jis208, jis212 and decode_only_extra, followed by two fixed "|1"
# rows (fallback rows in .ucm notation) for U+00A5 and U+203E.
# Takes no arguments; the caller is expected to sort and de-duplicate.
#
# NOTE(review): fullwidth_ascii is defined in this file but is not invoked
# here - confirm whether its rows are meant to be part of the table.
unsorted_table() {
  ascii
  jis208
  jis212
  decode_only_extra
  # printf rather than echo: how echo treats backslashes is
  # implementation-defined, and an XPG-style echo would turn \x5C and \x7E
  # into raw bytes instead of printing them literally.
  printf '%s\n' \
    '<U00A5> \x5C |1' \
    '<U203E> \x7E |1'
}

# Refuse to generate a truncated table: the generators read these three
# files from the current directory, and a missing one would otherwise only
# show up as absent rows in the output.
for input in index-jis0208.txt index-jis0212.txt euc-jp-2007.ucm; do
  if [ ! -r "$input" ]; then
    printf '%s: cannot read %s in the current directory\n' "$0" "$input" >&2
    exit 1
  fi
done

preamble
# Byte-wise sort so the row order does not depend on the caller locale;
# -u drops exact duplicate rows (it replaces the separate uniq stage).
unsorted_table | LC_ALL=C sort -u
printf '%s\n' 'END CHARMAP'