summaryrefslogtreecommitdiff
path: root/android_icu4j/src/main/java/android/icu/text/AlphabeticIndex.java
blob: cf5b3b101e2f8c2548d03e9e803c25883a6d64aa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
/* GENERATED SOURCE. DO NOT MODIFY. */
/*
 *******************************************************************************
 * Copyright (C) 2008-2016, Google Inc, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */
package android.icu.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import android.icu.lang.UCharacter;
import android.icu.text.AlphabeticIndex.Bucket;
import android.icu.text.AlphabeticIndex.Bucket.LabelType;
import android.icu.util.LocaleData;
import android.icu.util.ULocale;

/**
 * AlphabeticIndex supports the creation of a UI index appropriate for a given language.
 * It can support either direct use, or use with a client that doesn't support localized collation.
 * The following is an example of what an index might look like in a UI:
 *
 * <pre>
 *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  ...</b>
 *
 *  <b>A</b>
 *     Addison
 *     Albertson
 *     Azensky
 *  <b>B</b>
 *     Baecker
 *  ...
 * </pre>
 *
 * The class can generate a list of labels for use as a UI "index", that is, a list of
 * clickable characters (or character sequences) that allow the user to see a segment
 * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
 * the target list, where everything in the bucket is greater than or equal to the character
 * (according to the locale's collation). Strings can be added to the index;
 * they will be in sorted order in the right bucket.
 * <p>
 * The class also supports having buckets for strings before the first (underflow),
 * after the last (overflow), and between scripts (inflow). For example, if the index
 * is constructed with labels for Russian and English, Greek characters would fall
 * into an inflow bucket between the other two scripts.
 *
 * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
 * as well as characters from the user's language,
 * then it is a good idea to call addLabels(ULocale.English).
 *
 * <h2>Direct Use</h2>
 * <p>The following shows an example of building an index directly.
 *  The "show..." methods below are just to illustrate usage.
 * 
 * <pre>
 * // Create a simple index where the values for the strings are Integers, and add the strings
 * 
 * AlphabeticIndex&lt;Integer&gt; index = new AlphabeticIndex&lt;Integer&gt;(desiredLocale).addLabels(additionalLocale);
 * int counter = 0;
 * for (String item : test) {
 *     index.addRecord(item, counter++); 
 * }
 * ...
 * // Show index at top. We could skip or gray out empty buckets
 * 
 * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
 *     if (showAll || bucket.size() != 0) {
 *         showLabelAtTop(UI, bucket.getLabel());
 *     }
 * }
 *  ...
 * // Show the buckets with their contents, skipping empty buckets
 * 
 * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
 *     if (bucket.size() != 0) {
 *         showLabelInList(UI, bucket.getLabel());
 *         for (AlphabeticIndex.Record&lt;Integer&gt; item : bucket) {
 *             showIndexedItem(UI, item.getName(), item.getData());
 *         }
 * </pre>
 *
 * The caller can build different UIs using this class.
 * For example, an index character could be omitted or grayed-out
 * if its bucket is empty. Small buckets could also be combined based on size, such as:
 *
 * <pre>
 * <b>... A-F G-N O-Z ...</b>
 * </pre>
 *
 * <h2>Client Support</h2>
 * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself,
 * to support sorting on a client that doesn't support AlphabeticIndex functionality.
 *
 * <p>The ImmutableIndex is both immutable and thread-safe.
 * The corresponding AlphabeticIndex methods are not thread-safe because
 * they "lazily" build the index buckets.
 * <ul>
 * <li>ImmutableIndex.getBucket(index) provides random access to all
 *     buckets and their labels and label types.
 * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class
 *     can be used to get a list of the labels,
 *     such as "...", "A", "B",..., and send that list to the client.
 * <li>When the client has a new name, it sends that name to the server.
 * The server needs to call the following methods,
 * and communicate the bucketIndex and collationKey back to the client.
 *
 * <pre>
 * int bucketIndex = index.getBucketIndex(name);
 * String label = immutableIndex.getBucket(bucketIndex).getLabel();  // optional
 * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
 * </pre>
 *
 * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
 * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
 * </ul>
 *
 * @author Mark Davis
 */
public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
    /**
     * Prefix string for Chinese index buckets.
     * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
     */
    private static final String BASE = "\uFDD0";

    private static final char CGJ = '\u034F';

    private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0);

    private final RuleBasedCollator collatorOriginal;
    private final RuleBasedCollator collatorPrimaryOnly;
    private RuleBasedCollator collatorExternal;

    // Comparator for records, so that the Record class can be static.
    private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() {
        public int compare(Record<V> o1, Record<V> o2) {
            return collatorOriginal.compare(o1.name, o2.name);
        }
    };

    private final List<String> firstCharsInScripts;

    // We accumulate these as we build up the input parameters
    private final UnicodeSet initialLabels = new UnicodeSet();
    private List<Record<V>> inputList;

    // Lazy evaluated: null means that we have not built yet.
    private BucketList<V> buckets;

    private String overflowLabel = "\u2026";
    private String underflowLabel = "\u2026";
    private String inflowLabel = "\u2026";

    /**
     * Immutable, thread-safe version of {@link AlphabeticIndex}.
     * This class provides thread-safe methods for bucketing,
     * and random access to buckets and their properties,
     * but does not offer adding records to the index.
     *
     * @param <V> The Record value type is unused. It can be omitted for this class
     * if it was omitted for the AlphabeticIndex that built it.
     */
    public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> {
        private final BucketList<V> buckets;
        private final Collator collatorPrimaryOnly;

        private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) {
            this.buckets = bucketList;
            this.collatorPrimaryOnly = collatorPrimaryOnly;
        }

        /**
         * Returns the number of index buckets and labels, including underflow/inflow/overflow.
         *
         * @return the number of index buckets
         */
        public int getBucketCount() {
            return buckets.getBucketCount();
        }

        /**
         * Finds the index bucket for the given name and returns the number of that bucket.
         * Use {@link #getBucket(int)} to get the bucket's properties.
         *
         * @param name the string to be sorted into an index bucket
         * @return the bucket number for the name
         */
        public int getBucketIndex(CharSequence name) {
            return buckets.getBucketIndex(name, collatorPrimaryOnly);
        }

        /**
         * Returns the index-th bucket. Returns null if the index is out of range.
         *
         * @param index bucket number
         * @return the index-th bucket
         */
        public Bucket<V> getBucket(int index) {
            if (0 <= index && index < buckets.getBucketCount()) {
                return buckets.immutableVisibleList.get(index);
            } else {
                return null;
            }
        }

        /**
         * {@inheritDoc}
         */
        public Iterator<Bucket<V>> iterator() {
            return buckets.iterator();
        }
    }

    /**
     * Create the index object.
     * 
     * @param locale
     *            The locale for the index.
     */
    public AlphabeticIndex(ULocale locale) {
        this(locale, null);
    }

    /**
     * Create the index object.
     * 
     * @param locale
     *            The locale for the index.
     */
    public AlphabeticIndex(Locale locale) {
        this(ULocale.forLocale(locale), null);
    }

    /** 
     * Create an AlphabeticIndex that uses a specific collator.
     * 
     * <p>The index will be created with no labels; the addLabels() function must be called
     * after creation to add the desired labels to the index.
     * 
     * <p>The index will work directly with the supplied collator. If the caller will need to 
     * continue working with the collator it should be cloned first, so that the
     * collator provided to the AlphabeticIndex remains unchanged after creation of the index.
     * 
     * @param collator The collator to use to order the contents of this index.
     */
    public AlphabeticIndex(RuleBasedCollator collator) {
        this(null, collator);
    }

    /**
     * Internal constructor containing implementation used by public constructors.
     */
    private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
        collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
        try {
            collatorPrimaryOnly = collatorOriginal.cloneAsThawed();
        } catch (Exception e) {
            // should never happen
            throw new IllegalStateException("Collator cannot be cloned", e);
        }
        collatorPrimaryOnly.setStrength(Collator.PRIMARY);
        collatorPrimaryOnly.freeze();

        firstCharsInScripts = getFirstCharactersInScripts();
        Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
        // Guard against a degenerate collator where
        // some script boundary strings are primary ignorable.
        for (;;) {
            if (firstCharsInScripts.isEmpty()) {
                throw new IllegalArgumentException(
                        "AlphabeticIndex requires some non-ignorable script boundary strings");
            }
            if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) {
                firstCharsInScripts.remove(0);
            } else {
                break;
            }
        }

        // Chinese index characters, which are specific to each of the several Chinese tailorings,
        // take precedence over the single locale data exemplar set per language.
        if (!addChineseIndexCharacters() && locale != null) {
            addIndexExemplars(locale);
        }
    }

    /**
     * Add more index characters (aside from what are in the locale)
     * @param additions additional characters to add to the index, such as A-Z.
     * @return this, for chaining
     */
    public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
        initialLabels.addAll(additions);
        buckets = null;
        return this;
    }

    /**
     * Add more index characters (aside from what are in the locale)
     * @param additions additional characters to add to the index, such as those in Swedish.
     * @return this, for chaining
     */
    public AlphabeticIndex<V> addLabels(ULocale... additions) {
        for (ULocale addition : additions) {
            addIndexExemplars(addition);
        }
        buckets = null;
        return this;
    }

    /**
     * Add more index characters (aside from what are in the locale)
     * @param additions additional characters to add to the index, such as those in Swedish.
     * @return this, for chaining
     */
    public AlphabeticIndex<V> addLabels(Locale... additions) {
        for (Locale addition : additions) {
            addIndexExemplars(ULocale.forLocale(addition));
        }
        buckets = null;
        return this;
    }

    /**
     * Set the overflow label
     * @param overflowLabel see class description
     * @return this, for chaining
     */
    public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
        this.overflowLabel = overflowLabel;
        buckets = null;
        return this;
    }

    /**
     * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
     * 
     * @return underflow label
     */
    public String getUnderflowLabel() {
        return underflowLabel; // TODO get localized version
    }


    /**
     * Set the underflowLabel label
     * @param underflowLabel see class description
     * @return this, for chaining
     */
    public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
        this.underflowLabel = underflowLabel;
        buckets = null;
        return this;
    }

    /**
     * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
     * 
     * @return overflow label
     */
    public String getOverflowLabel() {
        return overflowLabel; // TODO get localized version
    }


    /**
     * Set the inflowLabel label
     * @param inflowLabel see class description
     * @return this, for chaining
     */
    public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
        this.inflowLabel = inflowLabel;
        buckets = null;
        return this;
    }

    /**
     * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
     * for Latin and Greek are used: X Y Z ... &#x0391; &#x0392; &#x0393;.
     * 
     * @return inflow label
     */
    public String getInflowLabel() {
        return inflowLabel; // TODO get localized version
    }


    /**
     * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
     * 
     * @return maxLabelCount maximum number of labels.
     */
    public int getMaxLabelCount() {
        return maxLabelCount;
    }

    /**
     * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
     * getBucketCount().
     *
     * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every
     *         nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
     *         future.
     * @return this, for chaining
     */
    public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
        this.maxLabelCount = maxLabelCount;
        buckets = null;
        return this;
    }

    /**
     * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
     * and sort differently, and that the overall list is small enough.
     */
    private List<String> initLabels() {
        Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance();
        List<String> indexCharacters = new ArrayList<String>();

        String firstScriptBoundary = firstCharsInScripts.get(0);
        String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1);

        // We make a sorted array of elements.
        // Some of the input may be redundant.
        // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
        // We filter out those cases.
        for (String item : initialLabels) {
            boolean checkDistinct;
            if (!UTF16.hasMoreCodePointsThan(item, 1)) {
                checkDistinct = false;
            } else if(item.charAt(item.length() - 1) == '*' &&
                    item.charAt(item.length() - 2) != '*') {
                // Use a label if it is marked with one trailing star,
                // even if the label string sorts the same when all contractions are suppressed.
                item = item.substring(0, item.length() - 1);
                checkDistinct = false;
            } else {
                checkDistinct = true;
            }
            if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
                // Ignore a primary-ignorable or non-alphabetic index character.
            } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
                // Ignore an index character that will land in the overflow bucket.
            } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
                // Ignore a multi-code point index character that does not sort distinctly
                // from the sequence of its separate characters.
            } else {
                int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly);
                if (insertionPoint < 0) {
                    indexCharacters.add(~insertionPoint, item);
                } else {
                    String itemAlreadyIn = indexCharacters.get(insertionPoint);
                    if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
                        indexCharacters.set(insertionPoint, item);
                    }
                }
            }
        }

        // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element

        final int size = indexCharacters.size() - 1;
        if (size > maxLabelCount) {
            int count = 0;
            int old = -1;
            for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
                ++count;
                it.next();
                final int bump = count * maxLabelCount / size;
                if (bump == old) {
                    it.remove();
                } else {
                    old = bump;
                }
            }
        }

        return indexCharacters;
    }

    private static String fixLabel(String current) {
        if (!current.startsWith(BASE)) {
            return current;
        }
        int rest = current.charAt(BASE.length());
        if (0x2800 < rest && rest <= 0x28FF) { // stroke count
            return (rest-0x2800) + "\u5283";
        }
        return current.substring(BASE.length());
    }

    /**
     * This method is called to get the index exemplars. Normally these come from the locale directly,
     * but if they aren't available, we have to synthesize them.
     */
    private void addIndexExemplars(ULocale locale) {
        UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
        if (exemplars != null) {
            initialLabels.addAll(exemplars);
            return;
        }

        // The locale data did not include explicit Index characters.
        // Synthesize a set of them from the locale's standard exemplar characters.
        exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);

        exemplars = exemplars.cloneAsThawed();
        // question: should we add auxiliary exemplars?
        if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) {
            exemplars.addAll('a', 'z');
        }
        if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
            // cut down to small list
            exemplars.remove(0xAC00, 0xD7A3).
                add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
                add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
                add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
                add(0xD30C).add(0xD558);
        }
        if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
            // cut down to small list
            // make use of the fact that Ethiopic is allocated in 8's, where
            // the base is 0 mod 8.
            UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
            UnicodeSetIterator it = new UnicodeSetIterator(ethiopic);
            while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) {
                if ((it.codepoint & 0x7) != 0) {
                    exemplars.remove(it.codepoint);
                }
            }
        }

        // Upper-case any that aren't already so.
        //   (We only do this for synthesized index characters.)
        for (String item : exemplars) {
            initialLabels.add(UCharacter.toUpperCase(locale, item));
        }
    }

    /**
     * Add Chinese index characters from the tailoring.
     */
    private boolean addChineseIndexCharacters() {
        UnicodeSet contractions = new UnicodeSet();
        try {
            collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions);
        } catch (Exception e) {
            return false;
        }
        if (contractions.isEmpty()) { return false; }
        initialLabels.addAll(contractions);
        for (String s : contractions) {
            assert(s.startsWith(BASE));
            char c = s.charAt(s.length() - 1);
            if (0x41 <= c && c <= 0x5A) {  // A-Z
                // There are Pinyin labels, add ASCII A-Z labels as well.
                initialLabels.add(0x41, 0x5A);  // A-Z
                break;
            }
        }
        return true;
    }

    /**
     * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
     * <p>This is used to test whether contractions sort differently from their components.
     */
    private String separated(String item) {
        StringBuilder result = new StringBuilder();
        // add a CGJ except within surrogates
        char last = item.charAt(0);
        result.append(last);
        for (int i = 1; i < item.length(); ++i) {
            char ch = item.charAt(i);
            if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
                result.append(CGJ);
            }
            result.append(ch);
            last = ch;
        }
        return result.toString();
    }

    /**
     * Builds an immutable, thread-safe version of this instance, without data records.
     *
     * @return an immutable index instance
     */
    public ImmutableIndex<V> buildImmutableIndex() {
        // The current AlphabeticIndex Java code never modifies the bucket list once built.
        // If it contains no records, we can use it.
        // addRecord() sets buckets=null rather than inserting the new record into it.
        BucketList<V> immutableBucketList;
        if (inputList != null && !inputList.isEmpty()) {
            // We need a bucket list with no records.
            immutableBucketList = createBucketList();
        } else {
            if (buckets == null) {
                buckets = createBucketList();
            }
            immutableBucketList = buckets;
        }
        return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly);
    }

    /**
     * Get the labels.
     * 
     * @return The list of bucket labels, after processing.
     */
    public List<String> getBucketLabels() {
        initBuckets();
        ArrayList<String> result = new ArrayList<String>();
        for (Bucket<V> bucket : buckets) {
            result.add(bucket.getLabel());
        }
        return result;
    }

    /**
     * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
     * then stored. The next time it is accessed, the same instance is returned.
     * <p>
     * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
     * synchronizing.</i></b>
     * 
     * @return a clone of the collator used internally
     */
    public RuleBasedCollator getCollator() {
        if (collatorExternal == null) {
            try {
                collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
            } catch (Exception e) {
                // should never happen
                throw new IllegalStateException("Collator cannot be cloned", e);
            }
        }
        return collatorExternal;
    }

    /**
     * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
     * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
     * the first added comes first.
     * 
     * @param name
     *            Name, such as a name
     * @param data
     *            Data, such as an address or link
     * @return this, for chaining
     */
    public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
        // TODO instead of invalidating, just add to unprocessed list.
        buckets = null; // invalidate old bucketlist
        if (inputList == null) {
            inputList = new ArrayList<Record<V>>();
        }
        inputList.add(new Record<V>(name, data));
        return this;
    }

    /**
     * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
     * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
     * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
     * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
     * the index or collator.
     * <p>
     * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
     * those are changed, then the bucket number and sort key must be regenerated.
     * 
     * @param name
     *            Name, such as a name
     * @return the bucket index for the name
     */
    public int getBucketIndex(CharSequence name) {
        initBuckets();
        return buckets.getBucketIndex(name, collatorPrimaryOnly);
    }

    /**
     * Clear the index.
     * 
     * @return this, for chaining
     */
    public AlphabeticIndex<V> clearRecords() {
        if (inputList != null && !inputList.isEmpty()) {
            inputList.clear();
            buckets = null;
        }
        return this;
    }

    /**
     * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
     * 
     * @return number of buckets
     */
    public int getBucketCount() {
        initBuckets();
        return buckets.getBucketCount();
    }

    /**
     * Return the number of records in the index: that is, the total number of distinct &lt;name,data&gt; pairs added with addRecord(...), over all the buckets.
     * 
     * @return total number of records in buckets
     */
    public int getRecordCount() {
        return inputList != null ? inputList.size() : 0;
    }

    /**
     * Return an iterator over the buckets.
     * 
     * @return iterator over buckets.
     */
    public Iterator<Bucket<V>> iterator() {
        initBuckets();
        return buckets.iterator();
    }

    /**
     * Creates an index, and buckets and sorts the list of records into the index.
     */
    private void initBuckets() {
        if (buckets != null) {
            return;
        }
        buckets = createBucketList();
        if (inputList == null || inputList.isEmpty()) {
            return;
        }

        // Sort the records by name.
        // Stable sort preserves input order of collation duplicates.
        Collections.sort(inputList, recordComparator);

        // Now, we traverse all of the input, which is now sorted.
        // If the item doesn't go in the current bucket, we find the next bucket that contains it.
        // This makes the process order n*log(n), since we just sort the list and then do a linear process.
        // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
        // we need to improve it for that case.

        Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
        Bucket<V> currentBucket = bucketIterator.next();
        Bucket<V> nextBucket;
        String upperBoundary;
        if (bucketIterator.hasNext()) {
            nextBucket = bucketIterator.next();
            upperBoundary = nextBucket.lowerBoundary;
        } else {
            nextBucket = null;
            upperBoundary = null;
        }
        for (Record<V> r : inputList) {
            // if the current bucket isn't the right one, find the one that is
            // We have a special flag for the last bucket so that we don't look any further
            while (upperBoundary != null &&
                    collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) {
                currentBucket = nextBucket;
                // now reset the boundary that we compare against
                if (bucketIterator.hasNext()) {
                    nextBucket = bucketIterator.next();
                    upperBoundary = nextBucket.lowerBoundary;
                } else {
                    upperBoundary = null;
                }
            }
            // now put the record into the bucket.
            Bucket<V> bucket = currentBucket;
            if (bucket.displayBucket != null) {
                bucket = bucket.displayBucket;
            }
            if (bucket.records == null) {
                bucket.records = new ArrayList<Record<V>>();
            }
            bucket.records.add(r);
        }
    }

    private int maxLabelCount = 99;

    /**
     * Returns true if one index character string is "better" than the other.
     * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
     * better, and otherwise binary-less-than is better.
     */
    private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) {
        // This is called with primary-equal strings, but never with one.equals(other).
        String n1 = nfkdNormalizer.normalize(one);
        String n2 = nfkdNormalizer.normalize(other);
        int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length());
        if (result != 0) {
            return result < 0;
        }
        result = binaryCmp.compare(n1, n2);
        if (result != 0) {
            return result < 0;
        }
        return binaryCmp.compare(one, other) < 0;
    }

    /**
     * A (name, data) pair, to be sorted by name into one of the index buckets.
     * The user data is not used by the index implementation.
     */
    public static class Record<V> {
        private final CharSequence name;
        private final V data;

        private Record(CharSequence name, V data) {
            this.name = name;
            this.data = data;
        }

        /**
         * Get the name
         * 
         * @return the name
         */
        public CharSequence getName() {
            return name;
        }

        /**
         * Get the data
         * 
         * @return the data
         */
        public V getData() {
            return data;
        }

        /**
         * Standard toString()
         */
        public String toString() {
            return name + "=" + data;
        }
    }

    /**
     * An index "bucket" with a label string and type.
     * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)}
     * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)},
     * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)},
     * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record
     * into a bucket according to the record's name.
     * 
     * @param <V>
     *            Data type
     */
    public static class Bucket<V> implements Iterable<Record<V>> {
        private final String label;
        private final String lowerBoundary;
        private final LabelType labelType;
        private Bucket<V> displayBucket;
        private int displayIndex;
        private List<Record<V>> records;

        /**
         * Type of the label
         */
        public enum LabelType {
            /**
             * Normal
             */
            NORMAL,
            /**
             * Underflow (before the first)
             */
            UNDERFLOW,
            /**
             * Inflow (between scripts)
             */
            INFLOW,
            /**
             * Overflow (after the last)
             */
            OVERFLOW
        }

        /**
         * Set up the bucket.
         * 
         * @param label
         *            label for the bucket
         * @param labelType
         *            is an underflow, overflow, or inflow bucket
         */
        private Bucket(String label, String lowerBoundary, LabelType labelType) {
            this.label = label;
            this.lowerBoundary = lowerBoundary;
            this.labelType = labelType;
        }

        /**
         * Get the label
         * 
         * @return label for the bucket
         */
        public String getLabel() {
            return label;
        }

        /**
         * Is a normal, underflow, overflow, or inflow bucket
         * 
         * @return is an underflow, overflow, or inflow bucket
         */
        public LabelType getLabelType() {
            return labelType;
        }

        /**
         * Get the number of records in the bucket.
         * 
         * @return number of records in bucket
         */
        public int size() {
            return records == null ? 0 : records.size();
        }

        /**
         * Iterator over the records in the bucket
         */
        public Iterator<Record<V>> iterator() {
            if (records == null) {
                return Collections.<Record<V>>emptyList().iterator();
            }
            return records.iterator();
        }

        /**
         * Standard toString()
         */
        @Override
        public String toString() {
            return "{" +
            "labelType=" + labelType
            + ", " +
            "lowerBoundary=" + lowerBoundary
            + ", " +
            "label=" + label
            + "}"
            ;
        }
    }

    private BucketList<V> createBucketList() {
        // Initialize indexCharacters.
        List<String> indexCharacters = initLabels();

        // Variables for hasMultiplePrimaryWeights().
        long variableTop;
        if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
            variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL;
        } else {
            variableTop = 0;
        }
        boolean hasInvisibleBuckets = false;

        // Helper arrays for Chinese Pinyin collation.
        @SuppressWarnings({ "rawtypes", "unchecked" })
        Bucket<V>[] asciiBuckets = new Bucket[26];
        @SuppressWarnings({ "rawtypes", "unchecked" })
        Bucket<V>[] pinyinBuckets = new Bucket[26];
        boolean hasPinyin = false;

        ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();

        // underflow bucket
        bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW));

        // fix up the list, adding underflow, additions, overflow
        // Insert inflow labels as needed.
        int scriptIndex = -1;
        String scriptUpperBoundary = "";
        for (String current : indexCharacters) {
            if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) {
                // We crossed the script boundary into a new script.
                String inflowBoundary = scriptUpperBoundary;
                boolean skippedScript = false;
                for (;;) {
                    scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex);
                    if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) {
                        break;
                    }
                    skippedScript = true;
                }
                if (skippedScript && bucketList.size() > 1) {
                    // We are skipping one or more scripts,
                    // and we are not just getting out of the underflow label.
                    bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary,
                            LabelType.INFLOW));
                }
            }
            // Add a bucket with the current label.
            Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL);
            bucketList.add(bucket);
            // Remember ASCII and Pinyin buckets for Pinyin redirects.
            char c;
            if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') {
                asciiBuckets[c - 'A'] = bucket;
            } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) &&
                    'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') {
                pinyinBuckets[c - 'A'] = bucket;
                hasPinyin = true;
            }
            // Check for multiple primary weights.
            if (!current.startsWith(BASE) &&
                    hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) &&
                    !current.endsWith("\uffff")) {
                // "Æ" or "Sch" etc.
                for (int i = bucketList.size() - 2;; --i) {
                    Bucket<V> singleBucket = bucketList.get(i);
                    if (singleBucket.labelType != LabelType.NORMAL) {
                        // There is no single-character bucket since the last
                        // underflow or inflow label.
                        break;
                    }
                    if (singleBucket.displayBucket == null &&
                            !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) {
                        // Add an invisible bucket that redirects strings greater than the expansion
                        // to the previous single-character bucket.
                        // For example, after ... Q R S Sch we add Sch\uFFFF->S
                        // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
                        bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL);
                        bucket.displayBucket = singleBucket;
                        bucketList.add(bucket);
                        hasInvisibleBuckets = true;
                        break;
                    }
                }
            }
        }
        if (bucketList.size() == 1) {
            // No real labels, show only the underflow label.
            return new BucketList<V>(bucketList, bucketList);
        }
        // overflow bucket
        bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final

        if (hasPinyin) {
            // Redirect Pinyin buckets.
            Bucket<V> asciiBucket = null;
            for (int i = 0; i < 26; ++i) {
                if (asciiBuckets[i] != null) {
                    asciiBucket = asciiBuckets[i];
                }
                if (pinyinBuckets[i] != null && asciiBucket != null) {
                    pinyinBuckets[i].displayBucket = asciiBucket;
                    hasInvisibleBuckets = true;
                }
            }
        }

        if (!hasInvisibleBuckets) {
            return new BucketList<V>(bucketList, bucketList);
        }
        // Merge inflow buckets that are visually adjacent.
        // Iterate backwards: Merge inflow into overflow rather than the other way around.
        int i = bucketList.size() - 1;
        Bucket<V> nextBucket = bucketList.get(i);
        while (--i > 0) {
            Bucket<V> bucket = bucketList.get(i);
            if (bucket.displayBucket != null) {
                continue;  // skip invisible buckets
            }
            if (bucket.labelType == LabelType.INFLOW) {
                if (nextBucket.labelType != LabelType.NORMAL) {
                    bucket.displayBucket = nextBucket;
                    continue;
                }
            }
            nextBucket = bucket;
        }

        ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>();
        for (Bucket<V> bucket : bucketList) {
            if (bucket.displayBucket == null) {
                publicBucketList.add(bucket);
            }
        }
        return new BucketList<V>(bucketList, publicBucketList);
    }

    private static class BucketList<V> implements Iterable<Bucket<V>> {
        private final ArrayList<Bucket<V>> bucketList;
        private final List<Bucket<V>> immutableVisibleList;

        private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) {
            this.bucketList = bucketList;

            int displayIndex = 0;
            for (Bucket<V> bucket : publicBucketList) {
                bucket.displayIndex = displayIndex++;
            }
            immutableVisibleList = Collections.unmodifiableList(publicBucketList);
        }

        private int getBucketCount() {
            return immutableVisibleList.size();
        }

        private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) {
            // binary search
            int start = 0;
            int limit = bucketList.size();
            while ((start + 1) < limit) {
                int i = (start + limit) / 2;
                Bucket<V> bucket = bucketList.get(i);
                int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
                if (nameVsBucket < 0) {
                    limit = i;
                } else {
                    start = i;
                }
            }
            Bucket<V> bucket = bucketList.get(start);
            if (bucket.displayBucket != null) {
                bucket = bucket.displayBucket;
            }
            return bucket.displayIndex;
        }

        /**
         * Private iterator over all the buckets, visible and invisible
         */
        private Iterator<Bucket<V>> fullIterator() {
            return bucketList.iterator();
        }

        /**
         * Iterator over just the visible buckets.
         */
        public Iterator<Bucket<V>> iterator() {
            return immutableVisibleList.iterator(); // use immutable list to prevent remove().
        }
    }

    private static boolean hasMultiplePrimaryWeights(
            RuleBasedCollator coll, long variableTop, String s) {
        long[] ces = coll.internalGetCEs(s);
        boolean seenPrimary = false;
        for (int i = 0; i < ces.length; ++i) {
            long ce = ces[i];
            long p = ce >>> 32;
            if (p > variableTop) {
                // not primary ignorable
                if (seenPrimary) {
                    return true;
                }
                seenPrimary = true;
            }
        }
        return false;
    }

    // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?!
    private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER;
    private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER;
    private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER;
    private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER;
    private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER;
    private static final int GC_L_MASK =
            GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK;
    private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES;

    /**
     * Return a list of the first character in each script. Only exposed for testing.
     *
     * @return list of first characters in each script
     * @deprecated This API is ICU internal, only for testing.
     * @hide original deprecated declaration
     * @hide draft / provisional / internal are hidden on Android
     */
    @Deprecated
    public List<String> getFirstCharactersInScripts() {
        List<String> dest = new ArrayList<String>(200);
        // Fetch the script-first-primary contractions which are defined in the root collator.
        // They all start with U+FDD1.
        UnicodeSet set = new UnicodeSet();
        collatorPrimaryOnly.internalAddContractions(0xFDD1, set);
        if (set.isEmpty()) {
            throw new UnsupportedOperationException(
                    "AlphabeticIndex requires script-first-primary contractions");
        }
        for (String boundary : set) {
            int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1));
            if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) {
                // Ignore boundaries for the special reordering groups.
                // Take only those for "real scripts" (where the sample character is a Letter,
                // and the one for unassigned implicit weights (Cn).
                continue;
            }
            dest.add(boundary);
        }
        return dest;
    }
}