Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(1390)

Unified Diff: icu4j/main/classes/core/src/com/ibm/icu/impl/number/AffixUtils.java

Issue 335150043: Refreshing Number Parsing: ICU4J Base URL: svn+icussh://source.icu-project.org/repos/icu/trunk/
Patch Set: Replying to Andy feedback round one. See commit message. Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: icu4j/main/classes/core/src/com/ibm/icu/impl/number/AffixUtils.java
===================================================================
--- icu4j/main/classes/core/src/com/ibm/icu/impl/number/AffixUtils.java (revision 40724)
+++ icu4j/main/classes/core/src/com/ibm/icu/impl/number/AffixUtils.java (working copy)
@@ -3,6 +3,7 @@
package com.ibm.icu.impl.number;
import com.ibm.icu.text.NumberFormat;
+import com.ibm.icu.text.UnicodeSet;
/**
* Performs manipulations on affix patterns: the prefix and suffix strings associated with a decimal
@@ -9,608 +10,689 @@
* format pattern. For example:
*
* <table>
- * <tr><th>Affix Pattern</th><th>Example Unescaped (Formatted) String</th></tr>
- * <tr><td>abc</td><td>abc</td></tr>
- * <tr><td>ab-</td><td>ab−</td></tr>
- * <tr><td>ab'-'</td><td>ab-</td></tr>
- * <tr><td>ab''</td><td>ab'</td></tr>
+ * <tr>
+ * <th>Affix Pattern</th>
+ * <th>Example Unescaped (Formatted) String</th>
+ * </tr>
+ * <tr>
+ * <td>abc</td>
+ * <td>abc</td>
+ * </tr>
+ * <tr>
+ * <td>ab-</td>
+ * <td>ab−</td>
+ * </tr>
+ * <tr>
+ * <td>ab'-'</td>
+ * <td>ab-</td>
+ * </tr>
+ * <tr>
+ * <td>ab''</td>
+ * <td>ab'</td>
+ * </tr>
* </table>
*
- * To manually iterate over tokens in a literal string, use the following pattern, which is designed
- * to be efficient.
+ * To manually iterate over tokens in a literal string, use the following pattern, which is designed to
+ * be efficient.
*
* <pre>
* long tag = 0L;
* while (AffixPatternUtils.hasNext(tag, patternString)) {
- * tag = AffixPatternUtils.nextToken(tag, patternString);
- * int typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
- * switch (typeOrCp) {
+ * tag = AffixPatternUtils.nextToken(tag, patternString);
+ * int typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
+ * switch (typeOrCp) {
* case AffixPatternUtils.TYPE_MINUS_SIGN:
- * // Current token is a minus sign.
- * break;
+ * // Current token is a minus sign.
+ * break;
* case AffixPatternUtils.TYPE_PLUS_SIGN:
- * // Current token is a plus sign.
- * break;
+ * // Current token is a plus sign.
+ * break;
* case AffixPatternUtils.TYPE_PERCENT:
- * // Current token is a percent sign.
- * break;
+ * // Current token is a percent sign.
+ * break;
* // ... other types ...
* default:
- * // Current token is an arbitrary code point.
- * // The variable typeOrCp is the code point.
- * break;
- * }
+ * // Current token is an arbitrary code point.
+ * // The variable typeOrCp is the code point.
+ * break;
+ * }
* }
* </pre>
*/
public class AffixUtils {
- private static final int STATE_BASE = 0;
- private static final int STATE_FIRST_QUOTE = 1;
- private static final int STATE_INSIDE_QUOTE = 2;
- private static final int STATE_AFTER_QUOTE = 3;
- private static final int STATE_FIRST_CURR = 4;
- private static final int STATE_SECOND_CURR = 5;
- private static final int STATE_THIRD_CURR = 6;
- private static final int STATE_FOURTH_CURR = 7;
- private static final int STATE_FIFTH_CURR = 8;
- private static final int STATE_OVERFLOW_CURR = 9;
+ private static final int STATE_BASE = 0;
+ private static final int STATE_FIRST_QUOTE = 1;
+ private static final int STATE_INSIDE_QUOTE = 2;
+ private static final int STATE_AFTER_QUOTE = 3;
+ private static final int STATE_FIRST_CURR = 4;
+ private static final int STATE_SECOND_CURR = 5;
+ private static final int STATE_THIRD_CURR = 6;
+ private static final int STATE_FOURTH_CURR = 7;
+ private static final int STATE_FIFTH_CURR = 8;
+ private static final int STATE_OVERFLOW_CURR = 9;
- /** Represents a literal character; the value is stored in the code point field. */
- private static final int TYPE_CODEPOINT = 0;
+ /** Represents a literal character; the value is stored in the code point field. */
+ private static final int TYPE_CODEPOINT = 0;
- /** Represents a minus sign symbol '-'. */
- public static final int TYPE_MINUS_SIGN = -1;
+ /** Represents a minus sign symbol '-'. */
+ public static final int TYPE_MINUS_SIGN = -1;
- /** Represents a plus sign symbol '+'. */
- public static final int TYPE_PLUS_SIGN = -2;
+ /** Represents a plus sign symbol '+'. */
+ public static final int TYPE_PLUS_SIGN = -2;
- /** Represents a percent sign symbol '%'. */
- public static final int TYPE_PERCENT = -3;
+ /** Represents a percent sign symbol '%'. */
+ public static final int TYPE_PERCENT = -3;
- /** Represents a permille sign symbol '‰'. */
- public static final int TYPE_PERMILLE = -4;
+ /** Represents a permille sign symbol '‰'. */
+ public static final int TYPE_PERMILLE = -4;
- /** Represents a single currency symbol '¤'. */
- public static final int TYPE_CURRENCY_SINGLE = -5;
+ /** Represents a single currency symbol '¤'. */
+ public static final int TYPE_CURRENCY_SINGLE = -5;
- /** Represents a double currency symbol '¤¤'. */
- public static final int TYPE_CURRENCY_DOUBLE = -6;
+ /** Represents a double currency symbol '¤¤'. */
+ public static final int TYPE_CURRENCY_DOUBLE = -6;
- /** Represents a triple currency symbol '¤¤¤'. */
- public static final int TYPE_CURRENCY_TRIPLE = -7;
+ /** Represents a triple currency symbol '¤¤¤'. */
+ public static final int TYPE_CURRENCY_TRIPLE = -7;
- /** Represents a quadruple currency symbol '¤¤¤¤'. */
- public static final int TYPE_CURRENCY_QUAD = -8;
+ /** Represents a quadruple currency symbol '¤¤¤¤'. */
+ public static final int TYPE_CURRENCY_QUAD = -8;
- /** Represents a quintuple currency symbol '¤¤¤¤¤'. */
- public static final int TYPE_CURRENCY_QUINT = -9;
+ /** Represents a quintuple currency symbol '¤¤¤¤¤'. */
+ public static final int TYPE_CURRENCY_QUINT = -9;
- /** Represents a sequence of six or more currency symbols. */
- public static final int TYPE_CURRENCY_OVERFLOW = -15;
+ /** Represents a sequence of six or more currency symbols. */
+ public static final int TYPE_CURRENCY_OVERFLOW = -15;
- public static interface SymbolProvider {
- public CharSequence getSymbol(int type);
- }
+ public static interface SymbolProvider {
+ public CharSequence getSymbol(int type);
+ }
- /**
- * Estimates the number of code points present in an unescaped version of the affix pattern string
- * (one that would be returned by {@link #unescape}), assuming that all interpolated symbols
- * consume one code point and that currencies consume as many code points as their symbol width.
- * Used for computing padding width.
- *
- * @param patternString The original string whose width will be estimated.
- * @return The length of the unescaped string.
- */
- public static int estimateLength(CharSequence patternString) {
- if (patternString == null) return 0;
- int state = STATE_BASE;
- int offset = 0;
- int length = 0;
- for (; offset < patternString.length(); ) {
- int cp = Character.codePointAt(patternString, offset);
+ /**
+ * Estimates the number of code points present in an unescaped version of the affix pattern string
+ * (one that would be returned by {@link #unescape}), assuming that all interpolated symbols consume
+ * one code point and that currencies consume as many code points as their symbol width. Used for
+ * computing padding width.
+ *
+ * @param patternString
+ * The original string whose width will be estimated.
+ * @return The length of the unescaped string.
+ */
+ public static int estimateLength(CharSequence patternString) {
+ if (patternString == null)
+ return 0;
+ int state = STATE_BASE;
+ int offset = 0;
+ int length = 0;
+ for (; offset < patternString.length();) {
+ int cp = Character.codePointAt(patternString, offset);
- switch (state) {
- case STATE_BASE:
- if (cp == '\'') {
- // First quote
- state = STATE_FIRST_QUOTE;
- } else {
- // Unquoted symbol
- length++;
- }
- break;
+ switch (state) {
+ case STATE_BASE:
+ if (cp == '\'') {
+ // First quote
+ state = STATE_FIRST_QUOTE;
+ } else {
+ // Unquoted symbol
+ length++;
+ }
+ break;
+ case STATE_FIRST_QUOTE:
+ if (cp == '\'') {
+ // Repeated quote
+ length++;
+ state = STATE_BASE;
+ } else {
+ // Quoted code point
+ length++;
+ state = STATE_INSIDE_QUOTE;
+ }
+ break;
+ case STATE_INSIDE_QUOTE:
+ if (cp == '\'') {
+ // End of quoted sequence
+ state = STATE_AFTER_QUOTE;
+ } else {
+ // Quoted code point
+ length++;
+ }
+ break;
+ case STATE_AFTER_QUOTE:
+ if (cp == '\'') {
+ // Double quote inside of quoted sequence
+ length++;
+ state = STATE_INSIDE_QUOTE;
+ } else {
+ // Unquoted symbol
+ length++;
+ }
+ break;
+ default:
+ throw new AssertionError();
+ }
+
+ offset += Character.charCount(cp);
+ }
+
+ switch (state) {
case STATE_FIRST_QUOTE:
- if (cp == '\'') {
- // Repeated quote
- length++;
- state = STATE_BASE;
- } else {
- // Quoted code point
- length++;
- state = STATE_INSIDE_QUOTE;
- }
- break;
case STATE_INSIDE_QUOTE:
- if (cp == '\'') {
- // End of quoted sequence
- state = STATE_AFTER_QUOTE;
- } else {
- // Quoted code point
- length++;
- }
- break;
- case STATE_AFTER_QUOTE:
- if (cp == '\'') {
- // Double quote inside of quoted sequence
- length++;
- state = STATE_INSIDE_QUOTE;
- } else {
- // Unquoted symbol
- length++;
- }
- break;
+ throw new IllegalArgumentException("Unterminated quote: \"" + patternString + "\"");
default:
- throw new AssertionError();
- }
+ break;
+ }
- offset += Character.charCount(cp);
+ return length;
}
- switch (state) {
- case STATE_FIRST_QUOTE:
- case STATE_INSIDE_QUOTE:
- throw new IllegalArgumentException("Unterminated quote: \"" + patternString + "\"");
- default:
- break;
- }
+ /**
+ * Takes a string and escapes (quotes) characters that have special meaning in the affix pattern
+ * syntax. This function does not reverse-lookup symbols.
+ *
+ * <p>
+ * Example input: "-$x"; example output: "'-'$x"
+ *
+ * @param input
+ * The string to be escaped.
+ * @param output
+ * The string builder to which to append the escaped string.
+ * @return The number of chars (UTF-16 code units) appended to the output.
+ */
+ public static int escape(CharSequence input, StringBuilder output) {
+ if (input == null)
+ return 0;
+ int state = STATE_BASE;
+ int offset = 0;
+ int startLength = output.length();
+ for (; offset < input.length();) {
+ int cp = Character.codePointAt(input, offset);
- return length;
- }
+ switch (cp) {
+ case '\'':
+ output.append("''");
+ break;
- /**
- * Takes a string and escapes (quotes) characters that have special meaning in the affix pattern
- * syntax. This function does not reverse-lookup symbols.
- *
- * <p>Example input: "-$x"; example output: "'-'$x"
- *
- * @param input The string to be escaped.
- * @param output The string builder to which to append the escaped string.
- * @return The number of chars (UTF-16 code units) appended to the output.
- */
- public static int escape(CharSequence input, StringBuilder output) {
- if (input == null) return 0;
- int state = STATE_BASE;
- int offset = 0;
- int startLength = output.length();
- for (; offset < input.length(); ) {
- int cp = Character.codePointAt(input, offset);
+ case '-':
+ case '+':
+ case '%':
+ case '‰':
+ case '¤':
+ if (state == STATE_BASE) {
+ output.append('\'');
+ output.appendCodePoint(cp);
+ state = STATE_INSIDE_QUOTE;
+ } else {
+ output.appendCodePoint(cp);
+ }
+ break;
- switch (cp) {
- case '\'':
- output.append("''");
- break;
+ default:
+ if (state == STATE_INSIDE_QUOTE) {
+ output.append('\'');
+ output.appendCodePoint(cp);
+ state = STATE_BASE;
+ } else {
+ output.appendCodePoint(cp);
+ }
+ break;
+ }
+ offset += Character.charCount(cp);
+ }
- case '-':
- case '+':
- case '%':
- case '‰':
- case '¤':
- if (state == STATE_BASE) {
+ if (state == STATE_INSIDE_QUOTE) {
output.append('\'');
- output.appendCodePoint(cp);
- state = STATE_INSIDE_QUOTE;
- } else {
- output.appendCodePoint(cp);
- }
- break;
+ }
- default:
- if (state == STATE_INSIDE_QUOTE) {
- output.append('\'');
- output.appendCodePoint(cp);
- state = STATE_BASE;
- } else {
- output.appendCodePoint(cp);
- }
- break;
- }
- offset += Character.charCount(cp);
+ return output.length() - startLength;
}
- if (state == STATE_INSIDE_QUOTE) {
- output.append('\'');
+ /** Version of {@link #escape} that returns a String, or null if input is null. */
+ public static String escape(CharSequence input) {
+ if (input == null)
+ return null;
+ StringBuilder sb = new StringBuilder();
+ escape(input, sb);
+ return sb.toString();
}
- return output.length() - startLength;
- }
+ public static final NumberFormat.Field getFieldForType(int type) {
+ switch (type) {
+ case TYPE_MINUS_SIGN:
+ return NumberFormat.Field.SIGN;
+ case TYPE_PLUS_SIGN:
+ return NumberFormat.Field.SIGN;
+ case TYPE_PERCENT:
+ return NumberFormat.Field.PERCENT;
+ case TYPE_PERMILLE:
+ return NumberFormat.Field.PERMILLE;
+ case TYPE_CURRENCY_SINGLE:
+ return NumberFormat.Field.CURRENCY;
+ case TYPE_CURRENCY_DOUBLE:
+ return NumberFormat.Field.CURRENCY;
+ case TYPE_CURRENCY_TRIPLE:
+ return NumberFormat.Field.CURRENCY;
+ case TYPE_CURRENCY_QUAD:
+ return NumberFormat.Field.CURRENCY;
+ case TYPE_CURRENCY_QUINT:
+ return NumberFormat.Field.CURRENCY;
+ case TYPE_CURRENCY_OVERFLOW:
+ return NumberFormat.Field.CURRENCY;
+ default:
+ throw new AssertionError();
+ }
+ }
- /** Version of {@link #escape} that returns a String, or null if input is null. */
- public static String escape(CharSequence input) {
- if (input == null) return null;
- StringBuilder sb = new StringBuilder();
- escape(input, sb);
- return sb.toString();
- }
-
- public static final NumberFormat.Field getFieldForType(int type) {
- switch (type) {
- case TYPE_MINUS_SIGN:
- return NumberFormat.Field.SIGN;
- case TYPE_PLUS_SIGN:
- return NumberFormat.Field.SIGN;
- case TYPE_PERCENT:
- return NumberFormat.Field.PERCENT;
- case TYPE_PERMILLE:
- return NumberFormat.Field.PERMILLE;
- case TYPE_CURRENCY_SINGLE:
- return NumberFormat.Field.CURRENCY;
- case TYPE_CURRENCY_DOUBLE:
- return NumberFormat.Field.CURRENCY;
- case TYPE_CURRENCY_TRIPLE:
- return NumberFormat.Field.CURRENCY;
- case TYPE_CURRENCY_QUAD:
- return NumberFormat.Field.CURRENCY;
- case TYPE_CURRENCY_QUINT:
- return NumberFormat.Field.CURRENCY;
- case TYPE_CURRENCY_OVERFLOW:
- return NumberFormat.Field.CURRENCY;
- default:
- throw new AssertionError();
+ /**
+ * Executes the unescape state machine. Replaces the unquoted characters "-", "+", "%", "‰", and "¤"
+ * with the corresponding symbols provided by the {@link SymbolProvider}, and inserts the result into
+ * the NumberStringBuilder at the requested location.
+ *
+ * <p>
+ * Example input: "'-'¤x"; example output: "-$x"
+ *
+ * @param affixPattern
+ * The original string to be unescaped.
+ * @param output
+ * The NumberStringBuilder to mutate with the result.
+ * @param position
+ * The index into the NumberStringBuilder at which to insert the string.
+ * @param provider
+ * An object to generate locale symbols.
+ * @return The length of the string added to output.
+ */
+ public static int unescape(
+ CharSequence affixPattern,
+ NumberStringBuilder output,
+ int position,
+ SymbolProvider provider) {
+ assert affixPattern != null;
+ int length = 0;
+ long tag = 0L;
+ while (hasNext(tag, affixPattern)) {
+ tag = nextToken(tag, affixPattern);
+ int typeOrCp = getTypeOrCp(tag);
+ if (typeOrCp == TYPE_CURRENCY_OVERFLOW) {
+ // Don't go to the provider for this special case
+ length += output.insertCodePoint(position + length, 0xFFFD, NumberFormat.Field.CURRENCY);
+ } else if (typeOrCp < 0) {
+ length += output.insert(position + length,
+ provider.getSymbol(typeOrCp),
+ getFieldForType(typeOrCp));
+ } else {
+ length += output.insertCodePoint(position + length, typeOrCp, null);
+ }
+ }
+ return length;
}
- }
- /**
- * Executes the unescape state machine. Replaces the unquoted characters "-", "+", "%", "‰", and
- * "¤" with the corresponding symbols provided by the {@link SymbolProvider}, and inserts the
- * result into the NumberStringBuilder at the requested location.
- *
- * <p>Example input: "'-'¤x"; example output: "-$x"
- *
- * @param affixPattern The original string to be unescaped.
- * @param output The NumberStringBuilder to mutate with the result.
- * @param position The index into the NumberStringBuilder to insert the the string.
- * @param provider An object to generate locale symbols.
- * @return The length of the string added to affixPattern.
- */
- public static int unescape(
- CharSequence affixPattern,
- NumberStringBuilder output,
- int position,
- SymbolProvider provider) {
- assert affixPattern != null;
- int length = 0;
- long tag = 0L;
- while (hasNext(tag, affixPattern)) {
- tag = nextToken(tag, affixPattern);
- int typeOrCp = getTypeOrCp(tag);
- if (typeOrCp == TYPE_CURRENCY_OVERFLOW) {
- // Don't go to the provider for this special case
- length += output.insertCodePoint(position + length, 0xFFFD, NumberFormat.Field.CURRENCY);
- } else if (typeOrCp < 0) {
- length += output.insert(position + length, provider.getSymbol(typeOrCp), getFieldForType(typeOrCp));
- } else {
- length += output.insertCodePoint(position + length, typeOrCp, null);
- }
+ /**
+ * Same as {@link #unescape}, but only calculates the code point count. More efficient than
+ * {@link #unescape} if you only need the length but not the string itself.
+ *
+ * @param affixPattern
+ * The original string to be unescaped.
+ * @param provider
+ * An object to generate locale symbols.
+ * @return The number of code points in the unescaped string.
+ */
+ public static int unescapedCodePointCount(CharSequence affixPattern, SymbolProvider provider) {
+ int length = 0;
+ long tag = 0L;
+ while (hasNext(tag, affixPattern)) {
+ tag = nextToken(tag, affixPattern);
+ int typeOrCp = getTypeOrCp(tag);
+ if (typeOrCp == TYPE_CURRENCY_OVERFLOW) {
+ length += 1;
+ } else if (typeOrCp < 0) {
+ CharSequence symbol = provider.getSymbol(typeOrCp);
+ length += Character.codePointCount(symbol, 0, symbol.length());
+ } else {
+ length += 1;
+ }
+ }
+ return length;
}
- return length;
- }
- /**
- * Sames as {@link #unescape}, but only calculates the code point count. More efficient than {@link #unescape}
- * if you only need the length but not the string itself.
- *
- * @param affixPattern The original string to be unescaped.
- * @param provider An object to generate locale symbols.
- * @return The number of code points in the unescaped string.
- */
- public static int unescapedCodePointCount(CharSequence affixPattern, SymbolProvider provider) {
- int length = 0;
- long tag = 0L;
- while (hasNext(tag, affixPattern)) {
- tag = nextToken(tag, affixPattern);
- int typeOrCp = getTypeOrCp(tag);
- if (typeOrCp == TYPE_CURRENCY_OVERFLOW) {
- length += 1;
- } else if (typeOrCp < 0) {
- CharSequence symbol = provider.getSymbol(typeOrCp);
- length += Character.codePointCount(symbol, 0, symbol.length());
- } else {
- length += 1;
- }
+ /**
+ * Checks whether the given affix pattern contains at least one token of the given type, which is one
+ * of the constants "TYPE_" in {@link AffixUtils}.
+ *
+ * @param affixPattern
+ * The affix pattern to check.
+ * @param type
+ * The token type.
+ * @return true if the affix pattern contains the given token type; false otherwise.
+ */
+ public static boolean containsType(CharSequence affixPattern, int type) {
+ if (affixPattern == null || affixPattern.length() == 0) {
+ return false;
+ }
+ long tag = 0L;
+ while (hasNext(tag, affixPattern)) {
+ tag = nextToken(tag, affixPattern);
+ if (getTypeOrCp(tag) == type) {
+ return true;
+ }
+ }
+ return false;
}
- return length;
- }
- /**
- * Checks whether the given affix pattern contains at least one token of the given type, which is
- * one of the constants "TYPE_" in {@link AffixUtils}.
- *
- * @param affixPattern The affix pattern to check.
- * @param type The token type.
- * @return true if the affix pattern contains the given token type; false otherwise.
- */
- public static boolean containsType(CharSequence affixPattern, int type) {
- if (affixPattern == null || affixPattern.length() == 0) {
+ /**
+ * Checks whether the specified affix pattern has any unquoted currency symbols ("¤").
+ *
+ * @param affixPattern
+ * The string to check for currency symbols.
+ * @return true if the literal has at least one unquoted currency symbol; false otherwise.
+ */
+ public static boolean hasCurrencySymbols(CharSequence affixPattern) {
+ if (affixPattern == null || affixPattern.length() == 0)
+ return false;
+ long tag = 0L;
+ while (hasNext(tag, affixPattern)) {
+ tag = nextToken(tag, affixPattern);
+ int typeOrCp = getTypeOrCp(tag);
+ if (typeOrCp < 0 && getFieldForType(typeOrCp) == NumberFormat.Field.CURRENCY) {
+ return true;
+ }
+ }
return false;
}
- long tag = 0L;
- while (hasNext(tag, affixPattern)) {
- tag = nextToken(tag, affixPattern);
- if (getTypeOrCp(tag) == type) {
- return true;
- }
+
+ /**
+ * Replaces all occurrences of tokens with the given type with the given replacement char.
+ *
+ * @param affixPattern
+ * The source affix pattern (does not get modified).
+ * @param type
+ * The token type.
+ * @param replacementChar
+ * The char to substitute in place of chars of the given token type.
+ * @return A string containing the new affix pattern.
+ */
+ public static String replaceType(CharSequence affixPattern, int type, char replacementChar) {
+ if (affixPattern == null || affixPattern.length() == 0)
+ return "";
+ char[] chars = affixPattern.toString().toCharArray();
+ long tag = 0L;
+ while (hasNext(tag, affixPattern)) {
+ tag = nextToken(tag, affixPattern);
+ if (getTypeOrCp(tag) == type) {
+ int offset = getOffset(tag);
+ chars[offset - 1] = replacementChar;
+ }
+ }
+ return new String(chars);
}
- return false;
- }
- /**
- * Checks whether the specified affix pattern has any unquoted currency symbols ("¤").
- *
- * @param affixPattern The string to check for currency symbols.
- * @return true if the literal has at least one unquoted currency symbol; false otherwise.
- */
- public static boolean hasCurrencySymbols(CharSequence affixPattern) {
- if (affixPattern == null || affixPattern.length() == 0) return false;
- long tag = 0L;
- while (hasNext(tag, affixPattern)) {
- tag = nextToken(tag, affixPattern);
- int typeOrCp = getTypeOrCp(tag);
- if (typeOrCp < 0 && getFieldForType(typeOrCp) == NumberFormat.Field.CURRENCY) {
- return true;
- }
+ /**
+ * Appends a new affix pattern with all symbol tokens removed and with code points in the given
+ * "ignorables" UnicodeSet trimmed from the beginning and end. Similar to calling unescape with a
+ * symbol provider that always returns the empty string.
+ *
+ * <p>
+ * Accepts and returns a StringBuilder, allocating it only if necessary.
+ */
+ public static StringBuilder trimSymbolsAndIgnorables(
+ CharSequence affixPattern,
+ UnicodeSet ignorables,
+ StringBuilder sb) {
+ assert affixPattern != null;
+ long tag = 0L;
+ int trailingIgnorables = 0;
+ while (hasNext(tag, affixPattern)) {
+ tag = nextToken(tag, affixPattern);
+ int typeOrCp = getTypeOrCp(tag);
+ if (typeOrCp >= 0) {
+ if (!ignorables.contains(typeOrCp)) {
+ if (sb == null) {
+ // Lazy-initialize the StringBuilder
+ sb = new StringBuilder();
+ }
+ sb.appendCodePoint(typeOrCp);
+ trailingIgnorables = 0;
+ } else if (sb != null && sb.length() > 0) {
+ sb.appendCodePoint(typeOrCp);
+ trailingIgnorables += Character.charCount(typeOrCp);
+ }
+ }
+ }
+ if (trailingIgnorables > 0) {
+ sb.setLength(sb.length() - trailingIgnorables);
+ }
+ return sb;
}
- return false;
- }
- /**
- * Replaces all occurrences of tokens with the given type with the given replacement char.
- *
- * @param affixPattern The source affix pattern (does not get modified).
- * @param type The token type.
- * @param replacementChar The char to substitute in place of chars of the given token type.
- * @return A string containing the new affix pattern.
- */
- public static String replaceType(CharSequence affixPattern, int type, char replacementChar) {
- if (affixPattern == null || affixPattern.length() == 0) return "";
- char[] chars = affixPattern.toString().toCharArray();
- long tag = 0L;
- while (hasNext(tag, affixPattern)) {
- tag = nextToken(tag, affixPattern);
- if (getTypeOrCp(tag) == type) {
+ /**
+ * Returns the next token from the affix pattern.
+ *
+ * @param tag
+ * A bitmask used for keeping track of state from token to token. The initial value should
+ * be 0L.
+ * @param patternString
+ * The affix pattern.
+ * @return The bitmask tag to pass to the next call of this method to retrieve the following token
+ * (never negative), or -1 if there were no more tokens in the affix pattern.
+ * @see #hasNext
+ */
+ public static long nextToken(long tag, CharSequence patternString) {
int offset = getOffset(tag);
- chars[offset - 1] = replacementChar;
- }
- }
- return new String(chars);
- }
+ int state = getState(tag);
+ for (; offset < patternString.length();) {
+ int cp = Character.codePointAt(patternString, offset);
+ int count = Character.charCount(cp);
- /**
- * Returns the next token from the affix pattern.
- *
- * @param tag A bitmask used for keeping track of state from token to token. The initial value
- * should be 0L.
- * @param patternString The affix pattern.
- * @return The bitmask tag to pass to the next call of this method to retrieve the following token
- * (never negative), or -1 if there were no more tokens in the affix pattern.
- * @see #hasNext
- */
- public static long nextToken(long tag, CharSequence patternString) {
- int offset = getOffset(tag);
- int state = getState(tag);
- for (; offset < patternString.length(); ) {
- int cp = Character.codePointAt(patternString, offset);
- int count = Character.charCount(cp);
-
- switch (state) {
+ switch (state) {
+ case STATE_BASE:
+ switch (cp) {
+ case '\'':
+ state = STATE_FIRST_QUOTE;
+ offset += count;
+ // continue to the next code point
+ break;
+ case '-':
+ return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
+ case '+':
+ return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
+ case '%':
+ return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
+ case '‰':
+ return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
+ case '¤':
+ state = STATE_FIRST_CURR;
+ offset += count;
+ // continue to the next code point
+ break;
+ default:
+ return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
+ }
+ break;
+ case STATE_FIRST_QUOTE:
+ if (cp == '\'') {
+ return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
+ } else {
+ return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
+ }
+ case STATE_INSIDE_QUOTE:
+ if (cp == '\'') {
+ state = STATE_AFTER_QUOTE;
+ offset += count;
+ // continue to the next code point
+ break;
+ } else {
+ return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
+ }
+ case STATE_AFTER_QUOTE:
+ if (cp == '\'') {
+ return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
+ } else {
+ state = STATE_BASE;
+ // re-evaluate this code point
+ break;
+ }
+ case STATE_FIRST_CURR:
+ if (cp == '¤') {
+ state = STATE_SECOND_CURR;
+ offset += count;
+ // continue to the next code point
+ break;
+ } else {
+ return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
+ }
+ case STATE_SECOND_CURR:
+ if (cp == '¤') {
+ state = STATE_THIRD_CURR;
+ offset += count;
+ // continue to the next code point
+ break;
+ } else {
+ return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
+ }
+ case STATE_THIRD_CURR:
+ if (cp == '¤') {
+ state = STATE_FOURTH_CURR;
+ offset += count;
+ // continue to the next code point
+ break;
+ } else {
+ return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
+ }
+ case STATE_FOURTH_CURR:
+ if (cp == '¤') {
+ state = STATE_FIFTH_CURR;
+ offset += count;
+ // continue to the next code point
+ break;
+ } else {
+ return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
+ }
+ case STATE_FIFTH_CURR:
+ if (cp == '¤') {
+ state = STATE_OVERFLOW_CURR;
+ offset += count;
+ // continue to the next code point
+ break;
+ } else {
+ return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
+ }
+ case STATE_OVERFLOW_CURR:
+ if (cp == '¤') {
+ offset += count;
+ // continue to the next code point and loop back to this state
+ break;
+ } else {
+ return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
+ }
+ default:
+ throw new AssertionError();
+ }
+ }
+ // End of string
+ switch (state) {
case STATE_BASE:
- switch (cp) {
- case '\'':
- state = STATE_FIRST_QUOTE;
- offset += count;
- // continue to the next code point
- break;
- case '-':
- return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
- case '+':
- return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
- case '%':
- return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
- case '‰':
- return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
- case '¤':
- state = STATE_FIRST_CURR;
- offset += count;
- // continue to the next code point
- break;
- default:
- return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
- }
- break;
+ // No more tokens in string.
+ return -1L;
case STATE_FIRST_QUOTE:
- if (cp == '\'') {
- return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
- } else {
- return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
- }
case STATE_INSIDE_QUOTE:
- if (cp == '\'') {
- state = STATE_AFTER_QUOTE;
- offset += count;
- // continue to the next code point
- break;
- } else {
- return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
- }
+ // For consistent behavior with the JDK and ICU 58, throw an exception here.
+ throw new IllegalArgumentException(
+ "Unterminated quote in pattern affix: \"" + patternString + "\"");
case STATE_AFTER_QUOTE:
- if (cp == '\'') {
- return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
- } else {
- state = STATE_BASE;
- // re-evaluate this code point
- break;
- }
+ // No more tokens in string.
+ return -1L;
case STATE_FIRST_CURR:
- if (cp == '¤') {
- state = STATE_SECOND_CURR;
- offset += count;
- // continue to the next code point
- break;
- } else {
return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
- }
case STATE_SECOND_CURR:
- if (cp == '¤') {
- state = STATE_THIRD_CURR;
- offset += count;
- // continue to the next code point
- break;
- } else {
return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
- }
case STATE_THIRD_CURR:
- if (cp == '¤') {
- state = STATE_FOURTH_CURR;
- offset += count;
- // continue to the next code point
- break;
- } else {
return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
- }
case STATE_FOURTH_CURR:
- if (cp == '¤') {
- state = STATE_FIFTH_CURR;
- offset += count;
- // continue to the next code point
- break;
- } else {
return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
- }
case STATE_FIFTH_CURR:
- if (cp == '¤') {
- state = STATE_OVERFLOW_CURR;
- offset += count;
- // continue to the next code point
- break;
- } else {
return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
- }
case STATE_OVERFLOW_CURR:
- if (cp == '¤') {
- offset += count;
- // continue to the next code point and loop back to this state
- break;
- } else {
return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
- }
default:
- throw new AssertionError();
- }
+ throw new AssertionError();
+ }
}
- // End of string
- switch (state) {
- case STATE_BASE:
- // No more tokens in string.
- return -1L;
- case STATE_FIRST_QUOTE:
- case STATE_INSIDE_QUOTE:
- // For consistent behavior with the JDK and ICU 58, throw an exception here.
- throw new IllegalArgumentException(
- "Unterminated quote in pattern affix: \"" + patternString + "\"");
- case STATE_AFTER_QUOTE:
- // No more tokens in string.
- return -1L;
- case STATE_FIRST_CURR:
- return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
- case STATE_SECOND_CURR:
- return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
- case STATE_THIRD_CURR:
- return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
- case STATE_FOURTH_CURR:
- return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
- case STATE_FIFTH_CURR:
- return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
- case STATE_OVERFLOW_CURR:
- return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
- default:
- throw new AssertionError();
+
+ /**
+ * Returns whether the affix pattern string has any more tokens to be retrieved from a call to
+ * {@link #nextToken}.
+ *
+ * <p>
+ * The decision is made from the state and offset stored in the tag. A tag in any state other than
+ * <code>STATE_BASE</code> is reported as having more tokens, except when it is in
+ * <code>STATE_INSIDE_QUOTE</code> and its offset is on a quote character that is the last
+ * character of the string. In <code>STATE_BASE</code>, there are more tokens exactly when the
+ * offset is before the end of the string.
+ *
+ * @param tag
+ * The bitmask tag of the previous token, as returned by {@link #nextToken}. Expected to be
+ * non-negative; this is checked by an assertion only.
+ * @param string
+ * The affix pattern.
+ * @return true if there are more tokens to consume; false otherwise.
+ */
+ public static boolean hasNext(long tag, CharSequence string) {
+ assert tag >= 0;
+ int state = getState(tag);
+ int offset = getOffset(tag);
+ // Special case: the last character in string is an end quote.
+ if (state == STATE_INSIDE_QUOTE
+ && offset == string.length() - 1
+ && string.charAt(offset) == '\'') {
+ return false;
+ } else if (state != STATE_BASE) {
+ return true;
+ } else {
+ return offset < string.length();
+ }
}
- }
- /**
- * Returns whether the affix pattern string has any more tokens to be retrieved from a call to
- * {@link #nextToken}.
- *
- * @param tag The bitmask tag of the previous token, as returned by {@link #nextToken}.
- * @param string The affix pattern.
- * @return true if there are more tokens to consume; false otherwise.
- */
- public static boolean hasNext(long tag, CharSequence string) {
- assert tag >= 0;
- int state = getState(tag);
- int offset = getOffset(tag);
- // Special case: the last character in string is an end quote.
- if (state == STATE_INSIDE_QUOTE
- && offset == string.length() - 1
- && string.charAt(offset) == '\'') {
- return false;
- } else if (state != STATE_BASE) {
- return true;
- } else {
- return offset < string.length();
+ /**
+ * This function helps determine the identity of the token consumed by {@link #nextToken}. Converts
+ * from a bitmask tag, based on a call to {@link #nextToken}, to its corresponding symbol type or
+ * code point.
+ *
+ * <p>
+ * The tag holds the type as a non-negative 4-bit field. When that field equals
+ * <code>TYPE_CODEPOINT</code>, the code point stored in the tag is returned; otherwise the field is
+ * negated to produce the return value.
+ *
+ * @param tag
+ * The bitmask tag of the current token, as returned by {@link #nextToken}.
+ * @return If less than zero, a symbol type corresponding to one of the <code>TYPE_</code> constants,
+ * such as {@link #TYPE_MINUS_SIGN}. If greater than or equal to zero, a literal code point.
+ */
+ public static int getTypeOrCp(long tag) {
+ assert tag >= 0;
+ int type = getType(tag);
+ return (type == TYPE_CODEPOINT) ? getCodePoint(tag) : -type;
}
- }
- /**
- * This function helps determine the identity of the token consumed by {@link #nextToken}.
- * Converts from a bitmask tag, based on a call to {@link #nextToken}, to its corresponding symbol
- * type or code point.
- *
- * @param tag The bitmask tag of the current token, as returned by {@link #nextToken}.
- * @return If less than zero, a symbol type corresponding to one of the <code>TYPE_</code>
- * constants, such as {@link #TYPE_MINUS_SIGN}. If greater than or equal to zero, a literal
- * code point.
- */
- public static int getTypeOrCp(long tag) {
- assert tag >= 0;
- int type = getType(tag);
- return (type == TYPE_CODEPOINT) ? getCodePoint(tag) : -type;
- }
+ /**
+ * Encodes the given values into a 64-bit tag.
+ *
+ * <ul>
+ * <li>Bits 0-31 => offset (int32)
+ * <li>Bits 32-35 => type (uint4)
+ * <li>Bits 36-39 => state (uint4)
+ * <li>Bits 40-60 => code point (uint21)
+ * <li>Bits 61-63 => unused
+ * </ul>
+ *
+ * <p>
+ * The type is negated before it is stored, so the type field of the tag holds <code>-type</code>.
+ * None of the arguments are masked to their field width; the only check is the assertion that the
+ * finished tag is non-negative.
+ *
+ * @param offset
+ * The value for the offset field.
+ * @param type
+ * The value whose negation goes into the type field. Presumably one of the
+ * <code>TYPE_</code> constants, which would need to be zero or negative to fit the field;
+ * confirm against their definitions.
+ * @param state
+ * The value for the state field.
+ * @param cp
+ * The value for the code point field.
+ * @return The assembled tag.
+ */
+ private static long makeTag(int offset, int type, int state, int cp) {
+ long tag = 0L;
+ tag |= offset;
+ tag |= (-(long) type) << 32;
+ tag |= ((long) state) << 36;
+ tag |= ((long) cp) << 40;
+ assert tag >= 0;
+ return tag;
+ }
- /**
- * Encodes the given values into a 64-bit tag.
- *
- * <ul>
- * <li>Bits 0-31 => offset (int32)
- * <li>Bits 32-35 => type (uint4)
- * <li>Bits 36-39 => state (uint4)
- * <li>Bits 40-60 => code point (uint21)
- * <li>Bits 61-63 => unused
- * </ul>
- */
- private static long makeTag(int offset, int type, int state, int cp) {
- long tag = 0L;
- tag |= offset;
- tag |= (-(long) type) << 32;
- tag |= ((long) state) << 36;
- tag |= ((long) cp) << 40;
- assert tag >= 0;
- return tag;
- }
+ static int getOffset(long tag) { // extracts the offset field (bits 0-31) of the tag
+ return (int) (tag & 0xffffffff); // NOTE(review): the int literal widens to -1L, so the (int) cast does the truncation
+ }
- static int getOffset(long tag) {
- return (int) (tag & 0xffffffff);
- }
+ static int getType(long tag) { // extracts the type field (bits 32-35) of the tag
+ return (int) ((tag >>> 32) & 0xf); // value in 0..15, exactly as stored in the tag
+ }
- static int getType(long tag) {
- return (int) ((tag >>> 32) & 0xf);
- }
+ static int getState(long tag) { // extracts the state field (bits 36-39) of the tag
+ return (int) ((tag >>> 36) & 0xf); // value in 0..15, exactly as stored in the tag
+ }
- static int getState(long tag) {
- return (int) ((tag >>> 36) & 0xf);
- }
-
- static int getCodePoint(long tag) {
- return (int) (tag >>> 40);
- }
+ static int getCodePoint(long tag) { // extracts everything from bit 40 upward as the code point
+ return (int) (tag >>> 40); // no mask applied: any bits set above the code point field would be included
+ }
}

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b