OLD | NEW |
1 # Copyright (C) 2016 and later: Unicode, Inc. and others. | 1 # Copyright (C) 2016 and later: Unicode, Inc. and others. |
2 # License & terms of use: http://www.unicode.org/copyright.html | 2 # License & terms of use: http://www.unicode.org/copyright.html |
3 # | 3 # |
4 # Copyright (C) 2002-2015, International Business Machines Corporation and others. | 4 # Copyright (C) 2002-2015, International Business Machines Corporation and others. |
5 # All Rights Reserved. | 5 # All Rights Reserved. |
6 # | 6 # |
7 # file: sent.txt | 7 # file: sent.txt |
8 # | 8 # |
9 # ICU Sentence Break Rules | 9 # ICU Sentence Break Rules |
10 # See Unicode Standard Annex #29. | 10 # See Unicode Standard Annex #29. |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
43 $NumericEx = $Numeric ($Extend | $Format)*; | 43 $NumericEx = $Numeric ($Extend | $Format)*; |
44 $ATermEx = $ATerm ($Extend | $Format)*; | 44 $ATermEx = $ATerm ($Extend | $Format)*; |
45 $SContinueEx= $SContinue ($Extend | $Format)*; | 45 $SContinueEx= $SContinue ($Extend | $Format)*; |
46 $STermEx = $STerm ($Extend | $Format)*; | 46 $STermEx = $STerm ($Extend | $Format)*; |
47 $CloseEx = $Close ($Extend | $Format)*; | 47 $CloseEx = $Close ($Extend | $Format)*; |
48 | 48 |
49 | 49 |
50 ## ------------------------------------------------- | 50 ## ------------------------------------------------- |
51 | 51 |
52 !!chain; | 52 !!chain; |
53 !!forward; | |
54 | 53 |
55 # Rule 3 - break after separators. Keep CR/LF together. | 54 # Rule 3 - break after separators. Keep CR/LF together. |
56 # | 55 # |
57 $CR $LF; | 56 $CR $LF; |
58 | 57 |
59 | 58 |
60 # Rule 4 - Break after $Sep. | 59 # Rule 4 - Break after $Sep. |
61 # Rule 5 - Ignore $Format and $Extend | 60 # Rule 5 - Ignore $Format and $Extend |
62 # | 61 # |
63 [^$Sep $CR $LF]? ($Extend | $Format)*; | 62 [^$Sep $CR $LF]? ($Extend | $Format)*; |
(...skipping 11 matching lines...) Expand all Loading... |
75 | 74 |
76 # Rule 8a | 75 # Rule 8a |
77 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); | 76 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); |
78 | 77 |
79 #Rule 9, 10, 11 | 78 #Rule 9, 10, 11 |
80 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; | 79 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; |
81 | 80 |
82 #Rule 12 | 81 #Rule 12 |
83 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; | 82 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; |
84 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; | 83 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; |
85 | |
86 ## ------------------------------------------------- | |
87 | |
88 !!safe_reverse; | |
89 | |
90 $SpEx_R = ($Extend | $Format)* $Sp; | |
91 $ATermEx_R = ($Extend | $Format)* $ATerm; | |
92 $STermEx_R = ($Extend | $Format)* $STerm; | |
93 $CloseEx_R = ($Extend | $Format)* $Close; | |
94 | |
95 [{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; | |
96 #.*; | |
97 | |
98 # Explanation for this rule: | |
99 # | |
100 # It needs to back over | |
101 # The $Sep at which we probably begin | |
102 # All of the non $Sep chars leading to the preceding $Sep | |
103 # The preceding $Sep, which will be the second one that the rule matches. | |
104 # Any immediately preceding STerm or ATerm sequences. We need to see these | |
105 # to get the correct rule status when moving forwards again. | |
106 # | |
107 # [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match | |
108 # the entire string. TODO: can bof be replaced with ^ | |
109 # | |
110 # (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be | |
111 # at the beginning of the string at this point, and we don't want to fail. | |
112 # Can only use {eof} once, and it is used later. | |
113 # | |
OLD | NEW |