package org.unicode.cldr.draft; import java.text.FieldPosition; import java.text.Format; import java.text.ParsePosition; import java.util.BitSet; import java.util.Set; import java.util.TreeSet; import org.unicode.cldr.draft.PatternFixer.Target; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; public class UnicodeSetFormat extends Format { public UnicodeSetFormat(Target target) { this.target = target; } public UnicodeSetFormat(Target target, int patternOptions) { this.target = target; this.options = patternOptions; } // main methods @Override // TODO clean up prototype public StringBuffer format(Object obj, StringBuffer toAppendTo, FieldPosition pos) { // API for Format calls for StringBuffer, but should update to StringBuilder int startPos = toAppendTo.length(); Set strings = null; toAppendTo.append('['); for (UnicodeSetIterator it = new UnicodeSetIterator((UnicodeSet) obj); it.nextRange();) { if (it.codepoint == UnicodeSetIterator.IS_STRING) { if (strings == null) { strings = new TreeSet(); } strings.add(it.string); continue; } appendQuoted(toAppendTo, it.codepoint); if (it.codepointEnd != it.codepoint) { appendQuoted(toAppendTo.append('-'), it.codepointEnd); } } toAppendTo.append(']'); if (strings != null) { // edge case StringBuffer extras = new StringBuffer("(?:"); for (String string : strings) { appendQuoted(extras, string).append('|'); } toAppendTo.insert(startPos, extras); toAppendTo.append(')'); } return toAppendTo; } // TODO optimize this to only quote what is needed for the particular target // and (possibly) the given location in the character class private StringBuffer appendQuoted(StringBuffer target, int codePoint) { switch (codePoint) { case '[': // SET_OPEN: case ']': // SET_CLOSE: case '-': // HYPHEN: case '^': // COMPLEMENT: case '&': // INTERSECTION: case '\\': // BACKSLASH: case '{': case '}': case '$': case ':': target.append('\\'); break; default: if (toQuote.contains(codePoint)) { if (codePoint > 0xFFFF) { target.append("\\u"); target.append(Utility.hex(UTF16.getLeadSurrogate(codePoint), 4)); codePoint = UTF16.getTrailSurrogate(codePoint); } target.append("\\u"); target.append(Utility.hex(codePoint, 4)); return target; } } UTF16.append(target, codePoint); return target; } private StringBuffer appendQuoted(StringBuffer target, String string) { for (int i = 0; i < string.length(); ++i) { appendQuoted(target, string.charAt(i)); // don't worry about surrogates; this works in Java // for other Targets we may have to fix. } return target; } @Override public final UnicodeSet parseObject(String pattern, ParsePosition pos) { return new UnicodeSet(pattern, pos, null); } // settings public Target getTarget() { return target; } public UnicodeSetFormat setTarget(Target target) { this.target = target; return this; } public int getOptions() { return options; } public UnicodeSetFormat setOptions(int options) { this.options = options; return this; } public Extension[] getExtensions() { return extensions; } public UnicodeSetFormat setExtensions(Extension... extensions) { this.extensions = extensions; return this; } public abstract class Extension { /** * Is called every time an unquoted $ is found. Should parse out variables as appropriate * and return how far we got, and the replacement string. Returns null if doesn't match a variable. * * @pos on input should be set to the position just before the dollar sign. * On output should be set to the end of the text to replace. */ public abstract String replaceVariable(String pattern, ParsePosition pos); /** * Resolves anything that looks like a property, eg:
* encountering \p{whitespace} or [:whitespace:] would call * getProperty("whitespace", "", false, result)
* while * \p{bidi_class=neutral} would call getProperty("bidi_class", "neutral", * false, result) and
* \p{name=/DOT/} would call * getProperty("bidi_class", "neutral", false, result)
* (for an example of the latter, see {@linkplain http * ://unicode.org/cldr/utility/list-unicodeset.jsp?a=\p name=/WITH%20DOT%20ABOVE/} * * @param regex * Set to true if the property value is a regex "find" expression. In that case, * the return value should be the set of Unicode characters that match the regex. */ public abstract boolean getProperty(String propertyName, String propertyValue, boolean regex, UnicodeSet result); } public String formatWithProperties(UnicodeSet original, boolean addOthers, UnicodeSet expandBlockIgnorables, int... properties) { UnicodeSet remainder = new UnicodeSet().addAll(original); Set propSet = new TreeSet(); BitSet props = new BitSet(); for (int i = 0; i < properties.length; ++i) { reduceByProperty(original, expandBlockIgnorables, properties[i], remainder, propSet); props.set(i); } if (addOthers) { for (int i = UProperty.INT_START; i < UProperty.INT_LIMIT; ++i) { if (props.get(i)) continue; reduceByProperty(original, expandBlockIgnorables, i, remainder, propSet); } } StringBuffer result = new StringBuffer("[ "); for (String prop : propSet) { result.append(prop).append(" "); } if (expandBlockIgnorables != null) { result.append("- ").append(expandBlockIgnorables.toPattern(true)); } if (remainder.size() > 0) { result.append(" ").append(remainder.toPattern(true)); } result.append("]"); return result.toString(); } static final int blockEnum = UCharacter.getPropertyEnum("block"); private void reduceByProperty(UnicodeSet original, UnicodeSet expandBlockIgnorables, int property, UnicodeSet remainder, Set result) { String propertyAlias = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT); UnicodeSet valueChars = new UnicodeSet(); for (int i = UCharacter.getIntPropertyMinValue(property); i <= UCharacter.getIntPropertyMaxValue(property); ++i) { String valueAlias = UCharacter.getPropertyValueName(property, i, UProperty.NameChoice.SHORT); if (valueAlias == null) { valueAlias = UCharacter.getPropertyValueName(property, i, UProperty.NameChoice.LONG); } if (valueAlias == null) continue; valueChars.clear(); valueChars.applyPropertyAlias(propertyAlias, valueAlias); if (remainder.containsSome(valueChars)) { if (original.containsAll(valueChars)) { result.add("[:" + propertyAlias + '=' + valueAlias + ":]"); remainder.removeAll(valueChars); } else if (property == blockEnum && expandBlockIgnorables != null) { UnicodeSet hasScript = new UnicodeSet(valueChars).removeAll(expandBlockIgnorables); if (hasScript.size() > 5 && original.containsAll(hasScript)) { System.out.println("Broadening to block: " + valueAlias); result.add("[:" + propertyAlias + '=' + valueAlias + ":]"); remainder.removeAll(valueChars); } } } } } // ===== PRIVATES ===== private static final long serialVersionUID = 1L; private Target target; private int options; private Extension[] extensions; private static final UnicodeSet toQuote = (UnicodeSet) new UnicodeSet( "[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze(); }