Index: icu4c/source/i18n/unicode/uspoof.h
===================================================================
--- icu4c/source/i18n/unicode/uspoof.h (revision 41366)
+++ icu4c/source/i18n/unicode/uspoof.h (working copy)
@@ -477,7 +477,7 @@
*/
USPOOF_CHAR_LIMIT = 64,
- /**
+ /**
* Check that an identifier does not mix numbers from different numbering systems.
* For more information, see UTS 39 section 5.3.
*
@@ -485,6 +485,27 @@
*/
USPOOF_MIXED_NUMBERS = 128,
+ /**
+ * Check that an identifier does not have a combining character following a character in which that
+ * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
+ *
+ * More specifically, the following characters are forbidden from preceding a U+0307:
+ *
+ * - Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')
+ * - Latin lowercase letter 'l'
+ * - Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)
+ * - Any character whose confusable prototype ends with such a character
+ * (Soft_Dotted, 'l', 'ı', or 'ȷ')
+ *
+ * In addition, other combining characters may appear between the above characters and U+0307 without
+ * defeating this check, except those with combining class 0 or combining class "Above" (230, same class as U+0307).
+ *
+ * This list and the number of combining characters considered by this check may grow over time.
+ *
+ * @draft ICU 62
+ */
+ USPOOF_HIDDEN_OVERLAY = 256,
+
/**
* Enable all spoof checks.
*
Index: icu4c/source/i18n/uspoof.cpp
===================================================================
--- icu4c/source/i18n/uspoof.cpp (revision 41366)
+++ icu4c/source/i18n/uspoof.cpp (working copy)
@@ -524,7 +524,14 @@
checkResult->fNumerics = numerics; // UnicodeSet::operator=
}
+ if (0 != (This->fChecks & USPOOF_HIDDEN_OVERLAY)) {
+ int32_t index = This->findHiddenOverlay(id, *status);
+ if (index != -1) {
+ result |= USPOOF_HIDDEN_OVERLAY;
+ }
+ }
+
if (0 != (This->fChecks & USPOOF_CHAR_LIMIT)) {
int32_t i;
UChar32 c;
Index: icu4c/source/i18n/uspoof_impl.cpp
===================================================================
--- icu4c/source/i18n/uspoof_impl.cpp (revision 41366)
+++ icu4c/source/i18n/uspoof_impl.cpp (working copy)
@@ -377,8 +377,45 @@
return USPOOF_MINIMALLY_RESTRICTIVE;
}
+int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
+ bool sawLeadCharacter = false;
+ for (int32_t i=0; iconfusableLookup(cp, skelStr);
+ UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
+ if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
+ return true;
+ }
+ return false;
+}
+
+
+
// Convert a text format hex number. Utility function used by builder code. Static.
// Input: UChar *string text. Output: a UChar32
// Input has been pre-checked, and will have no non-hex chars.
@@ -532,7 +569,7 @@
if (gDefaultSpoofData) {
// Will delete, assuming all user-level spoof checkers were closed.
gDefaultSpoofData->removeReference();
- gDefaultSpoofData = NULL;
+ gDefaultSpoofData = nullptr;
gSpoofInitDefaultOnce.reset();
}
return TRUE;
@@ -539,17 +576,18 @@
}
static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
- UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
+ UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
spoofDataIsAcceptable,
- NULL, // context, would receive dataVersion if supplied.
+ nullptr, // context, would receive dataVersion if supplied.
&status);
if (U_FAILURE(status)) { return; }
gDefaultSpoofData = new SpoofData(udm, status);
if (U_FAILURE(status)) {
delete gDefaultSpoofData;
+ gDefaultSpoofData = nullptr;
return;
}
- if (gDefaultSpoofData == NULL) {
+ if (gDefaultSpoofData == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
Index: icu4c/source/i18n/uspoof_impl.h
===================================================================
--- icu4c/source/i18n/uspoof_impl.h (revision 41366)
+++ icu4c/source/i18n/uspoof_impl.h (working copy)
@@ -83,6 +83,9 @@
void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const;
URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const;
+ int32_t findHiddenOverlay(const UnicodeString& input, UErrorCode& status) const;
+ bool isIllegalCombiningDotLeadCharacter(UChar32 cp) const;
+
/** parse a hex number. Untility used by the builders. */
static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);
Index: icu4c/source/test/intltest/itspoof.cpp
===================================================================
--- icu4c/source/test/intltest/itspoof.cpp (revision 41366)
+++ icu4c/source/test/intltest/itspoof.cpp (working copy)
@@ -92,6 +92,7 @@
TESTCASE_AUTO(testBug12815);
TESTCASE_AUTO(testBug13314_MixedNumbers);
TESTCASE_AUTO(testBug13328_MixedCombiningMarks);
+ TESTCASE_AUTO(testCombiningDot);
TESTCASE_AUTO_END;
}
@@ -710,4 +711,45 @@
failedChecks);
}
+void IntlTestSpoof::testCombiningDot() { // Exercises the USPOOF_HIDDEN_OVERLAY check on sequences containing U+0307.
+ UErrorCode status = U_ZERO_ERROR;
+ LocalUSpoofCheckerPointer sc(uspoof_open(&status));
+ TEST_ASSERT_SUCCESS(status);
+ uspoof_setChecks(sc.getAlias(), USPOOF_HIDDEN_OVERLAY, &status); // enable only the hidden-overlay check
+ TEST_ASSERT_SUCCESS(status);
+
+ static const struct TestCase {
+ bool shouldFail; // true when the checker is expected to report USPOOF_HIDDEN_OVERLAY
+ const char16_t* input; // identifier passed to uspoof_check2
+ } cases[] = {
+ {false, u"i"}, // lead characters on their own, with no combining dot, must pass
+ {false, u"j"},
+ {false, u"l"},
+ {true, u"i\u0307"}, // lead character immediately followed by U+0307
+ {true, u"j\u0307"},
+ {true, u"l\u0307"},
+ {true, u"ı\u0307"},
+ {true, u"ȷ\u0307"},
+ {true, u"𝚤\u0307"},
+ {true, u"𝑗\u0307"},
+ {false, u"m\u0307"}, // 'm' is not expected to be treated as a forbidden lead character
+ {true, u"1\u0307"}, // presumably flagged via the confusable prototype of '1' ending in 'l' — TODO confirm
+ {true, u"ij\u0307"},
+ {true, u"i\u0307\u0307"},
+ {true, u"abci\u0307def"}, // offending sequence in the middle of a longer identifier
+ {false, u"i\u0301\u0307"}, // U+0301 has combining class ABOVE (230)
+ {true, u"i\u0320\u0307"}, // U+0320 has combining class BELOW
+ {true, u"i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW
+ {false, u"i\u0320\u0301\u0307"}, // U+0301 (class 230) still intervenes before the dot
+ {false, u"iz\u0307"}, // non-combining 'z' sits between 'i' and the dot
+ };
+
+ for (auto& cas : cases) {
+ int32_t failedChecks = uspoof_check2(sc.getAlias(), cas.input, -1, nullptr, &status);
+ TEST_ASSERT_SUCCESS(status);
+ int32_t expected = cas.shouldFail ? USPOOF_HIDDEN_OVERLAY : 0; // only one check is enabled, so the result is that flag or 0
+ assertEquals(cas.input, expected, failedChecks);
+ }
+}
+
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */
Index: icu4c/source/test/intltest/itspoof.h
===================================================================
--- icu4c/source/test/intltest/itspoof.h (revision 41366)
+++ icu4c/source/test/intltest/itspoof.h (working copy)
@@ -54,6 +54,8 @@
void testBug13328_MixedCombiningMarks();
+ void testCombiningDot();
+
// Internal function to run a single skeleton test case.
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
const char *input, const char *expected, int32_t lineNum);
Index: icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java
===================================================================
--- icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java (revision 41366)
+++ icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java (working copy)
@@ -441,6 +441,28 @@
*/
public static final int MIXED_NUMBERS = 128;
+ /**
+ * Check that an identifier does not have a combining character following a character in which that
+ * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
+ *
+ * More specifically, the following characters are forbidden from preceding a U+0307:
+ *
+ * - Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')
+ * - Latin lowercase letter 'l'
+ * - Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)
+ * - Any character whose confusable prototype ends with such a character
+ * (Soft_Dotted, 'l', 'ı', or 'ȷ')
+ *
+ * In addition, other combining characters may appear between the above characters and U+0307 without
+ * defeating this check, except those with combining class 0 or combining class "Above" (230, same class as U+0307).
+ *
+ * This list and the number of combining characters considered by this check may grow over time.
+ *
+ * @draft ICU 62
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static final int HIDDEN_OVERLAY = 256;
+
// Update CheckResult.toString() when a new check is added.
/**
@@ -1300,6 +1322,13 @@
}
}
+ if (0 != (this.fChecks & HIDDEN_OVERLAY)) {
+ int index = findHiddenOverlay(text);
+ if (index != -1) {
+ result |= HIDDEN_OVERLAY;
+ }
+ }
+
if (0 != (this.fChecks & CHAR_LIMIT)) {
int i;
int c;
@@ -1657,6 +1686,44 @@
return RestrictionLevel.MINIMALLY_RESTRICTIVE;
}
+ int findHiddenOverlay(String input) {
+ boolean sawLeadCharacter = false;
+ StringBuilder sb = new StringBuilder();
+ for (int i=0; i