LEFT | RIGHT |
1 // © 2016 and later: Unicode, Inc. and others. | 1 // © 2016 and later: Unicode, Inc. and others. |
2 // License & terms of use: http://www.unicode.org/copyright.html#License | 2 // License & terms of use: http://www.unicode.org/copyright.html#License |
3 /* | 3 /* |
4 *************************************************************************** | 4 *************************************************************************** |
5 * Copyright (C) 2008-2016 International Business Machines Corporation | 5 * Copyright (C) 2008-2016 International Business Machines Corporation |
6 * and others. All Rights Reserved. | 6 * and others. All Rights Reserved. |
7 *************************************************************************** | 7 *************************************************************************** |
8 * | 8 * |
9 * Unicode Spoof Detection | 9 * Unicode Spoof Detection |
10 */ | 10 */ |
(...skipping 424 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
435 | 435 |
436 /** | 436 /** |
437 * Check that an identifier does not mix numbers from different numbering sy
stems. For more information, see UTS 39 | 437 * Check that an identifier does not mix numbers from different numbering sy
stems. For more information, see UTS 39 |
438 * section 5.3. | 438 * section 5.3. |
439 * | 439 * |
440 * @stable ICU 58 | 440 * @stable ICU 58 |
441 */ | 441 */ |
442 public static final int MIXED_NUMBERS = 128; | 442 public static final int MIXED_NUMBERS = 128; |
443 | 443 |
444 /** | 444 /** |
445 * Check that an identifier does not have a combining dot following a charac
ter that already has a | 445 * Check that an identifier does not have a combining character following a
character in which that |
446 * dot (or hides the dot). For example, "i\u0307" is forbidden with this che
ck. | 446 * combining character would be hidden; for example 'i' followed by a U+0307
combining dot. |
447 * | 447 * <p> |
448 * More specifically, the set of characters that are forbidden from precedin
g a U+0307 are those | 448 * More specifically, the following characters are forbidden from preceding
a U+0307: |
449 * having the Unicode character property Soft_Dotted (which includes i and j
), the Latin lowercase | 449 * <ul> |
450 * letter L, or any character confusable with such a character, including do
tless variants of i and | 450 * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and '
j')</li> |
451 * j. | 451 * <li>Latin lowercase letter 'l'</li> |
| 452 * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li> |
| 453 * <li>Any character whose confusable prototype ends with such a character |
| 454 * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li> |
| 455 * </ul> |
| 456 * In addition, combining characters are allowed between the above character
s and U+0307 except those |
| 457 * with combining class 0 or combining class "Above" (230, same class as U+0
307). |
| 458 * <p> |
| 459 * This list and the number of combining characters considered by this check m
ay grow over time. |
452 * | 460 * |
453 * @draft ICU 62 | 461 * @draft ICU 62 |
454 * @provisional This API might change or be removed in a future release. | 462 * @provisional This API might change or be removed in a future release. |
455 */ | 463 */ |
456 public static final int COMBINING_DOT = 256; | 464 public static final int HIDDEN_OVERLAY = 256; |
457 | 465 |
458 // Update CheckResult.toString() when a new check is added. | 466 // Update CheckResult.toString() when a new check is added. |
459 | 467 |
460 /** | 468 /** |
461 * Enable all spoof checks. | 469 * Enable all spoof checks. |
462 * | 470 * |
463 * @stable ICU 4.6 | 471 * @stable ICU 4.6 |
464 */ | 472 */ |
465 public static final int ALL_CHECKS = 0xFFFFFFFF; | 473 public static final int ALL_CHECKS = 0xFFFFFFFF; |
466 | 474 |
(...skipping 840 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1307 UnicodeSet numerics = new UnicodeSet(); | 1315 UnicodeSet numerics = new UnicodeSet(); |
1308 getNumerics(text, numerics); | 1316 getNumerics(text, numerics); |
1309 if (numerics.size() > 1) { | 1317 if (numerics.size() > 1) { |
1310 result |= MIXED_NUMBERS; | 1318 result |= MIXED_NUMBERS; |
1311 } | 1319 } |
1312 if (checkResult != null) { | 1320 if (checkResult != null) { |
1313 checkResult.numerics = numerics; | 1321 checkResult.numerics = numerics; |
1314 } | 1322 } |
1315 } | 1323 } |
1316 | 1324 |
| 1325 if (0 != (this.fChecks & HIDDEN_OVERLAY)) { |
| 1326 int index = findHiddenOverlay(text); |
| 1327 if (index != -1) { |
| 1328 result |= HIDDEN_OVERLAY; |
| 1329 } |
| 1330 } |
| 1331 |
1317 if (0 != (this.fChecks & CHAR_LIMIT)) { | 1332 if (0 != (this.fChecks & CHAR_LIMIT)) { |
1318 int i; | 1333 int i; |
1319 int c; | 1334 int c; |
1320 for (i = 0; i < length;) { | 1335 for (i = 0; i < length;) { |
1321 // U16_NEXT(text, i, length, c); | 1336 // U16_NEXT(text, i, length, c); |
1322 c = Character.codePointAt(text, i); | 1337 c = Character.codePointAt(text, i); |
1323 i = Character.offsetByCodePoints(text, i, 1); | 1338 i = Character.offsetByCodePoints(text, i, 1); |
1324 if (!this.fAllowedCharsSet.contains(c)) { | 1339 if (!this.fAllowedCharsSet.contains(c)) { |
1325 result |= CHAR_LIMIT; | 1340 result |= CHAR_LIMIT; |
1326 break; | 1341 break; |
(...skipping 335 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1662 } | 1677 } |
1663 | 1678 |
1664 // Section 5.2 step 7: | 1679 // Section 5.2 step 7: |
1665 if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) &
& !resolvedNoLatn.get(UScript.GREEK) | 1680 if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) &
& !resolvedNoLatn.get(UScript.GREEK) |
1666 && !resolvedNoLatn.get(UScript.CHEROKEE)) { | 1681 && !resolvedNoLatn.get(UScript.CHEROKEE)) { |
1667 return RestrictionLevel.MODERATELY_RESTRICTIVE; | 1682 return RestrictionLevel.MODERATELY_RESTRICTIVE; |
1668 } | 1683 } |
1669 | 1684 |
1670 // Section 5.2 step 8: | 1685 // Section 5.2 step 8: |
1671 return RestrictionLevel.MINIMALLY_RESTRICTIVE; | 1686 return RestrictionLevel.MINIMALLY_RESTRICTIVE; |
| 1687 } |
| 1688 |
| 1689 int findHiddenOverlay(String input) { |
| 1690 boolean sawLeadCharacter = false; |
| 1691 StringBuilder sb = new StringBuilder(); |
| 1692 for (int i=0; i<input.length();) { |
| 1693 int cp = input.codePointAt(i); |
| 1694 if (sawLeadCharacter && cp == 0x0307) { |
| 1695 return i; |
| 1696 } |
| 1697 int combiningClass = UCharacter.getCombiningClass(cp); |
| 1698 // Skip over characters except for those with combining class 0 (non
-combining characters) or with |
| 1699 // combining class 230 (same class as U+0307) |
| 1700 assert UCharacter.getCombiningClass(0x0307) == 230; |
| 1701 if (combiningClass == 0 || combiningClass == 230) { |
| 1702 sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp, sb); |
| 1703 } |
| 1704 i += UCharacter.charCount(cp); |
| 1705 } |
| 1706 return -1; |
| 1707 } |
| 1708 |
| 1709 boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) { |
| 1710 return cp == 'i' || cp == 'j' || cp == 'ı' || cp == 'ȷ' || cp == 'l' || |
| 1711 UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED); |
| 1712 } |
| 1713 |
| 1714 boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) { |
| 1715 if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) { |
| 1716 return true; |
| 1717 } |
| 1718 sb.setLength(0); |
| 1719 fSpoofData.confusableLookup(cp, sb); |
| 1720 int finalCp = UCharacter.codePointBefore(sb, sb.length()); |
| 1721 if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)
) { |
| 1722 return true; |
| 1723 } |
| 1724 return false; |
1672 } | 1725 } |
1673 | 1726 |
1674 // Data Members | 1727 // Data Members |
1675 private int fChecks; // Bit vector of checks to perform. | 1728 private int fChecks; // Bit vector of checks to perform. |
1676 private SpoofData fSpoofData; | 1729 private SpoofData fSpoofData; |
1677 private Set<ULocale> fAllowedLocales; // The Set of allowed locales. | 1730 private Set<ULocale> fAllowedLocales; // The Set of allowed locales. |
1678 private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters
. | 1731 private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters
. |
1679 private RestrictionLevel fRestrictionLevel; | 1732 private RestrictionLevel fRestrictionLevel; |
1680 | 1733 |
1681 private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance(); | 1734 private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance(); |
(...skipping 347 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2029 @Override | 2082 @Override |
2030 public String toString() { | 2083 public String toString() { |
2031 StringBuilder sb = new StringBuilder(); | 2084 StringBuilder sb = new StringBuilder(); |
2032 sb.append("<ScriptSet "); | 2085 sb.append("<ScriptSet "); |
2033 appendStringTo(sb); | 2086 appendStringTo(sb); |
2034 sb.append(">"); | 2087 sb.append(">"); |
2035 return sb.toString(); | 2088 return sb.toString(); |
2036 } | 2089 } |
2037 } | 2090 } |
2038 } | 2091 } |
LEFT | RIGHT |