LEFT | RIGHT |
(no file at all) | |
| 1 // © 2017 and later: Unicode, Inc. and others. |
| 2 // License & terms of use: http://www.unicode.org/copyright.html#License |
| 3 package com.ibm.icu.impl.number.parse; |
| 4 |
| 5 import java.util.EnumMap; |
| 6 import java.util.Map; |
| 7 |
| 8 import com.ibm.icu.text.UnicodeSet; |
| 9 |
| 10 /** |
| 11 * This class statically initializes UnicodeSets useful for number parsing. Micr
obenchmarks show this to |
| 12 * bring a very sizeable performance boost. |
| 13 * |
| 14 * IMPORTANT ASSUMPTION: All of the sets contain code points (no strings) and th
ey are all case-folded. |
| 15 * If this assumption were ever broken, logic in classes such as SymbolMatcher w
ould need to be updated |
| 16 * in order to return well-formed sets upon calls to getLeadCodePoints(). |
| 17 * |
| 18 * @author sffc |
| 19 */ |
| 20 public class UnicodeSetStaticCache { |
| 21 public static enum Key { |
| 22 // Ignorables |
| 23 BIDI, |
| 24 WHITESPACE, |
| 25 DEFAULT_IGNORABLES, |
| 26 STRICT_IGNORABLES, |
| 27 |
| 28 // Separators |
| 29 COMMA, |
| 30 PERIOD, |
| 31 OTHER_GROUPING_SEPARATORS, |
| 32 COMMA_OR_OTHER, |
| 33 PERIOD_OR_OTHER, |
| 34 COMMA_OR_PERIOD_OR_OTHER, |
| 35 STRICT_COMMA, |
| 36 STRICT_PERIOD, |
| 37 STRICT_COMMA_OR_OTHER, |
| 38 STRICT_PERIOD_OR_OTHER, |
| 39 STRICT_COMMA_OR_PERIOD_OR_OTHER, |
| 40 |
| 41 // Symbols |
| 42 // TODO: NaN? |
| 43 MINUS_SIGN, |
| 44 PLUS_SIGN, |
| 45 PERCENT_SIGN, |
| 46 PERMILLE_SIGN, |
| 47 INFINITY, |
| 48 |
| 49 // Other |
| 50 DIGITS, |
| 51 CAPITAL_N, |
| 52 FOLDED_N, |
| 53 CAPITAL_E, |
| 54 FOLDED_E, |
| 55 |
| 56 // Combined Separators with Digits (for lead code points) |
| 57 DIGITS_OR_COMMA_OR_OTHER, |
| 58 DIGITS_OR_PERIOD_OR_OTHER, |
| 59 DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER, |
| 60 DIGITS_OR_STRICT_COMMA_OR_OTHER, |
| 61 DIGITS_OR_STRICT_PERIOD_OR_OTHER, |
| 62 DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER, |
| 63 }; |
| 64 |
| 65 private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, Uni
codeSet>(Key.class); |
| 66 |
| 67 public static UnicodeSet get(Key key) { |
| 68 return unicodeSets.get(key); |
| 69 } |
| 70 |
| 71 public static Key chooseFrom(String str, Key key1) { |
| 72 return get(key1).contains(str) ? key1 : null; |
| 73 } |
| 74 |
| 75 public static Key chooseFrom(String str, Key key1, Key key2) { |
| 76 return get(key1).contains(str) ? key1 : chooseFrom(str, key2); |
| 77 } |
| 78 |
| 79 public static Key chooseFrom(String str, Key key1, Key key2, Key key3) { |
| 80 return get(key1).contains(str) ? key1 : chooseFrom(str, key2, key3); |
| 81 } |
| 82 |
| 83 public static Key unionOf(Key key1, Key key2) { |
| 84 // Make sure key1 < key2 |
| 85 if (key2.ordinal() < key1.ordinal()) { |
| 86 Key temp = key1; |
| 87 key1 = key2; |
| 88 key2 = temp; |
| 89 } |
| 90 |
| 91 if (key1 == Key.COMMA && key2 == Key.PERIOD_OR_OTHER) { |
| 92 // 1.234,567 |
| 93 return Key.COMMA_OR_PERIOD_OR_OTHER; |
| 94 |
| 95 } else if (key1 == Key.COMMA && key2 == Key.OTHER_GROUPING_SEPARATORS) { |
| 96 // 1'234,567 |
| 97 return Key.COMMA_OR_OTHER; |
| 98 |
| 99 } else if (key1 == Key.PERIOD && key2 == Key.COMMA_OR_OTHER) { |
| 100 // 1,234.567 |
| 101 return Key.COMMA_OR_PERIOD_OR_OTHER; |
| 102 |
| 103 } else if (key1 == Key.PERIOD && key2 == Key.OTHER_GROUPING_SEPARATORS)
{ |
| 104 // 1'234.567 |
| 105 return Key.PERIOD_OR_OTHER; |
| 106 |
| 107 } else if (key1 == Key.STRICT_COMMA && key2 == Key.STRICT_PERIOD_OR_OTHE
R) { |
| 108 // Strict 1.234,567 |
| 109 return Key.STRICT_COMMA_OR_PERIOD_OR_OTHER; |
| 110 |
| 111 } else if (key1 == Key.STRICT_COMMA && key2 == Key.OTHER_GROUPING_SEPARA
TORS) { |
| 112 // Strict 1'234,567 |
| 113 return Key.STRICT_COMMA_OR_OTHER; |
| 114 |
| 115 } else if (key1 == Key.STRICT_PERIOD && key2 == Key.STRICT_COMMA_OR_OTHE
R) { |
| 116 // Strict 1,234.567 |
| 117 return Key.STRICT_COMMA_OR_PERIOD_OR_OTHER; |
| 118 |
| 119 } else if (key1 == Key.STRICT_PERIOD && key2 == Key.OTHER_GROUPING_SEPAR
ATORS) { |
| 120 // Strict 1'234.567 |
| 121 return Key.STRICT_PERIOD_OR_OTHER; |
| 122 |
| 123 } else if (key1 == Key.COMMA_OR_OTHER && key2 == Key.DIGITS) { |
| 124 return Key.DIGITS_OR_COMMA_OR_OTHER; |
| 125 |
| 126 } else if (key1 == Key.PERIOD_OR_OTHER && key2 == Key.DIGITS) { |
| 127 return Key.DIGITS_OR_PERIOD_OR_OTHER; |
| 128 |
| 129 } else if (key1 == Key.COMMA_OR_PERIOD_OR_OTHER && key2 == Key.DIGITS) { |
| 130 return Key.DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER; |
| 131 |
| 132 } else if (key1 == Key.STRICT_COMMA_OR_OTHER && key2 == Key.DIGITS) { |
| 133 return Key.DIGITS_OR_STRICT_COMMA_OR_OTHER; |
| 134 |
| 135 } else if (key1 == Key.STRICT_PERIOD_OR_OTHER && key2 == Key.DIGITS) { |
| 136 return Key.DIGITS_OR_STRICT_PERIOD_OR_OTHER; |
| 137 |
| 138 } else if (key1 == Key.STRICT_COMMA_OR_PERIOD_OR_OTHER && key2 == Key.DI
GITS) { |
| 139 return Key.DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER; |
| 140 } |
| 141 |
| 142 return null; |
| 143 } |
| 144 |
| 145 private static UnicodeSet computeUnion(Key k1, Key k2) { |
| 146 return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze(); |
| 147 } |
| 148 |
| 149 private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) { |
| 150 return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).
freeze(); |
| 151 } |
| 152 |
| 153 static { |
| 154 // BiDi characters are skipped over and ignored at any point in the stri
ng, even in strict mode. |
| 155 unicodeSets.put(Key.BIDI, new UnicodeSet("[[\\u200E\\u200F\\u061C]]").fr
eeze()); |
| 156 |
| 157 // This set was decided after discussion with icu-design@. See ticket #1
3309. |
| 158 // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank propert
y). |
| 159 unicodeSets.put(Key.WHITESPACE, new UnicodeSet("[[:Zs:][\\u0009]]").free
ze()); |
| 160 |
| 161 unicodeSets.put(Key.DEFAULT_IGNORABLES, computeUnion(Key.BIDI, Key.WHITE
SPACE)); |
| 162 unicodeSets.put(Key.STRICT_IGNORABLES, get(Key.BIDI)); |
| 163 |
| 164 // TODO: Re-generate these sets from the UCD. They probably haven't been
updated in a while. |
| 165 unicodeSets.put(Key.COMMA, new UnicodeSet("[,،٫、︐︑﹐﹑,、]").freeze()); |
| 166 unicodeSets.put(Key.STRICT_COMMA, new UnicodeSet("[,٫︐﹐,]").freeze()); |
| 167 unicodeSets.put(Key.PERIOD, new UnicodeSet("[.․。︒﹒.。]").freeze()); |
| 168 unicodeSets.put(Key.STRICT_PERIOD, new UnicodeSet("[.․﹒.。]").freeze()); |
| 169 unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, |
| 170 new UnicodeSet("['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205
F\\u3000]").freeze()); |
| 171 |
| 172 unicodeSets.put(Key.COMMA_OR_OTHER, computeUnion(Key.COMMA, Key.OTHER_GR
OUPING_SEPARATORS)); |
| 173 unicodeSets.put(Key.PERIOD_OR_OTHER, computeUnion(Key.PERIOD, Key.OTHER_
GROUPING_SEPARATORS)); |
| 174 unicodeSets.put(Key.COMMA_OR_PERIOD_OR_OTHER, |
| 175 computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATOR
S)); |
| 176 unicodeSets.put(Key.STRICT_COMMA_OR_OTHER, |
| 177 computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS)); |
| 178 unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER, |
| 179 computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS)); |
| 180 unicodeSets.put(Key.STRICT_COMMA_OR_PERIOD_OR_OTHER, |
| 181 computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROU
PING_SEPARATORS)); |
| 182 |
| 183 unicodeSets.put(Key.MINUS_SIGN, new UnicodeSet("[-⁻₋−➖﹣-]").freeze()); |
| 184 unicodeSets.put(Key.PLUS_SIGN, new UnicodeSet("[+⁺₊➕﬩﹢+]").freeze()); |
| 185 |
| 186 // TODO: Fill in the next three sets. |
| 187 unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze()); |
| 188 unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze()); |
| 189 unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze()); |
| 190 |
| 191 unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze()); |
| 192 unicodeSets.put(Key.CAPITAL_N, new UnicodeSet("[N]").freeze()); |
| 193 unicodeSets.put(Key.FOLDED_N, new UnicodeSet("[n]").freeze()); |
| 194 unicodeSets.put(Key.CAPITAL_E, new UnicodeSet("[E]").freeze()); |
| 195 unicodeSets.put(Key.FOLDED_E, new UnicodeSet("[e]").freeze()); |
| 196 |
| 197 unicodeSets.put(Key.DIGITS_OR_COMMA_OR_OTHER, computeUnion(Key.DIGITS, K
ey.COMMA_OR_OTHER)); |
| 198 unicodeSets.put(Key.DIGITS_OR_PERIOD_OR_OTHER, computeUnion(Key.DIGITS,
Key.PERIOD_OR_OTHER)); |
| 199 unicodeSets.put(Key.DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER, |
| 200 computeUnion(Key.DIGITS, Key.COMMA_OR_PERIOD_OR_OTHER)); |
| 201 unicodeSets.put(Key.DIGITS_OR_STRICT_COMMA_OR_OTHER, |
| 202 computeUnion(Key.DIGITS, Key.STRICT_COMMA_OR_OTHER)); |
| 203 unicodeSets.put(Key.DIGITS_OR_STRICT_PERIOD_OR_OTHER, |
| 204 computeUnion(Key.DIGITS, Key.STRICT_PERIOD_OR_OTHER)); |
| 205 unicodeSets.put(Key.DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER, |
| 206 computeUnion(Key.DIGITS, Key.STRICT_COMMA_OR_PERIOD_OR_OTHER)); |
| 207 } |
| 208 } |
LEFT | RIGHT |