Left: | ||
Right: |
OLD | NEW |
---|---|
1 // Copyright 2009 The Go Authors. All rights reserved. | 1 // Copyright 2009 The Go Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // This package provides data and functions to test some properties of Unicode code points. | 5 // This package provides data and functions to test some properties of Unicode code points. |
6 package unicode | 6 package unicode |
7 | 7 |
8 const ( | 8 const ( |
9 MaxRune = 0x10FFFF // Maximum valid Unicode code point. | 9 MaxRune = 0x10FFFF // Maximum valid Unicode code point. |
10 ReplacementChar = 0xFFFD // Represents invalid code points. | 10 ReplacementChar = 0xFFFD // Represents invalid code points. |
11 ) | 11 ) |
12 | 12 |
13 | 13 |
14 // The representation of a range of Unicode code points. The range runs from Lo to Hi | 14 // The representation of a range of Unicode code points. The range runs from Lo to Hi |
15 // inclusive and has the specified stride. | 15 // inclusive and has the specified stride. |
16 type Range struct { | 16 type Range struct { |
17 Lo int | 17 Lo int |
18 Hi int | 18 Hi int |
19 Stride int | 19 Stride int |
20 } | 20 } |
21 | 21 |
22 // The representation of a range of Unicode code points for case conversion. | 22 // CaseRange represents a range of Unicode code points for simple (one |
23 // code point to one code point) case conversion. | |
23 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas | 24 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas |
24 // are the number to add to the code point to reach the code point for a | 25 // are the number to add to the code point to reach the code point for a |
25 // different case for that character. They may be negative. If zero, it | 26 // different case for that character. They may be negative. If zero, it |
26 // means the character is in the corresponding case. There is a special | 27 // means the character is in the corresponding case. There is a special |
27 // case representing sequences of alternating corresponding Upper and Lower | 28 // case representing sequences of alternating corresponding Upper and Lower |
28 // pairs. It appears with a fixed Delta of | 29 // pairs. It appears with a fixed Delta of |
29 // {UpperLower, UpperLower, UpperLower} | 30 // {UpperLower, UpperLower, UpperLower} |
30 // The constant UpperLower has an otherwise impossible delta value. | 31 // The constant UpperLower has an otherwise impossible delta value. |
32 // TODO: Provide a mechanism for full case folding (those that involve | |
rsc1
2010/03/31 00:39:31
If you move this out of the doc comment you can sa
| |
33 // multiple runes in the input or output). | |
31 type CaseRange struct { | 34 type CaseRange struct { |
32 Lo int | 35 Lo int |
33 Hi int | 36 Hi int |
34 Delta d | 37 Delta d |
35 } | 38 } |
36 | 39 |
40 // SpecialCase represents language-specific case mappings such as Turkish. | |
41 // TODO: Provide a mechanism for full case folding (those that involve | |
rsc1
2010/03/31 00:39:31
Then you can drop this one.
(But keep Methods of S
| |
42 // multiple runes in the input or output). Methods of SpecialCase | |
43 // customize (by overriding) the standard mappings. | |
44 type SpecialCase []CaseRange | |
45 | |
37 // Indices into the Delta arrays inside CaseRanges for case mapping. | 46 // Indices into the Delta arrays inside CaseRanges for case mapping. |
38 const ( | 47 const ( |
39 UpperCase = iota | 48 UpperCase = iota |
40 LowerCase | 49 LowerCase |
41 TitleCase | 50 TitleCase |
42 MaxCase | 51 MaxCase |
43 ) | 52 ) |
44 | 53 |
45 type d [MaxCase]int32 // to make the CaseRanges text shorter | 54 type d [MaxCase]int32 // to make the CaseRanges text shorter |
46 | 55 |
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
123 if rune <= 0xFF { // quick Latin-1 check | 132 if rune <= 0xFF { // quick Latin-1 check |
124 switch rune { | 133 switch rune { |
125 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: | 134 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: |
126 return true | 135 return true |
127 } | 136 } |
128 return false | 137 return false |
129 } | 138 } |
130 return Is(White_Space, rune) | 139 return Is(White_Space, rune) |
131 } | 140 } |
132 | 141 |
133 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase. | 142 // to maps the rune using the specified case mapping. |
134 func To(_case int, rune int) int { | 143 func to(_case int, rune int, caseRange []CaseRange) int { |
135 if _case < 0 || MaxCase <= _case { | 144 if _case < 0 || MaxCase <= _case { |
136 return ReplacementChar // as reasonable an error as any | 145 return ReplacementChar // as reasonable an error as any |
137 } | 146 } |
138 // binary search over ranges | 147 // binary search over ranges |
139 lo := 0 | 148 lo := 0 |
140 » hi := len(CaseRanges) | 149 » hi := len(caseRange) |
141 for lo < hi { | 150 for lo < hi { |
142 m := lo + (hi-lo)/2 | 151 m := lo + (hi-lo)/2 |
143 » » r := CaseRanges[m] | 152 » » r := caseRange[m] |
144 if r.Lo <= rune && rune <= r.Hi { | 153 if r.Lo <= rune && rune <= r.Hi { |
145 delta := int(r.Delta[_case]) | 154 delta := int(r.Delta[_case]) |
146 if delta > MaxRune { | 155 if delta > MaxRune { |
147 // In an Upper-Lower sequence, which always starts with | 156 // In an Upper-Lower sequence, which always starts with |
148 // an UpperCase letter, the real deltas always look like: | 157 // an UpperCase letter, the real deltas always look like: |
149 // {0, 1, 0} UpperCase (Lower is next) | 158 // {0, 1, 0} UpperCase (Lower is next) |
150 // {-1, 0, -1} LowerCase (Upper, Title are previous) | 159 // {-1, 0, -1} LowerCase (Upper, Title are previous) |
151 // The characters at even offsets from the beginning of the | 160 // The characters at even offsets from the beginning of the |
152 // sequence are upper case; the ones at odd offsets are lower. | 161 // sequence are upper case; the ones at odd offsets are lower. |
153 // The correct mapping can be done by clearing or setting the low | 162 // The correct mapping can be done by clearing or setting the low |
154 // bit in the sequence offset. | 163 // bit in the sequence offset. |
155 // The constants UpperCase and TitleCase are even while LowerCase | 164 // The constants UpperCase and TitleCase are even while LowerCase |
156 // is odd so we take the low bit from _case. | 165 // is odd so we take the low bit from _case. |
157 return r.Lo + ((rune-r.Lo)&^1 | _case&1) | 166 return r.Lo + ((rune-r.Lo)&^1 | _case&1) |
158 } | 167 } |
159 return rune + delta | 168 return rune + delta |
160 } | 169 } |
161 if rune < r.Lo { | 170 if rune < r.Lo { |
162 hi = m | 171 hi = m |
163 } else { | 172 } else { |
164 lo = m + 1 | 173 lo = m + 1 |
165 } | 174 } |
166 } | 175 } |
167 return rune | 176 return rune |
168 } | 177 } |
169 | 178 |
179 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase. | |
180 func To(_case int, rune int) int { | |
181 return to(_case, rune, CaseRanges) | |
182 } | |
183 | |
170 // ToUpper maps the rune to upper case. | 184 // ToUpper maps the rune to upper case. |
171 func ToUpper(rune int) int { | 185 func ToUpper(rune int) int { |
172 if rune < 0x80 { // quick ASCII check | 186 if rune < 0x80 { // quick ASCII check |
173 if 'a' <= rune && rune <= 'z' { | 187 if 'a' <= rune && rune <= 'z' { |
174 rune -= 'a' - 'A' | 188 rune -= 'a' - 'A' |
175 } | 189 } |
176 return rune | 190 return rune |
177 } | 191 } |
178 return To(UpperCase, rune) | 192 return To(UpperCase, rune) |
179 } | 193 } |
(...skipping 12 matching lines...) Expand all Loading... | |
192 // ToTitle maps the rune to title case. | 206 // ToTitle maps the rune to title case. |
193 func ToTitle(rune int) int { | 207 func ToTitle(rune int) int { |
194 if rune < 0x80 { // quick ASCII check | 208 if rune < 0x80 { // quick ASCII check |
195 if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII | 209 if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII |
196 rune -= 'a' - 'A' | 210 rune -= 'a' - 'A' |
197 } | 211 } |
198 return rune | 212 return rune |
199 } | 213 } |
200 return To(TitleCase, rune) | 214 return To(TitleCase, rune) |
201 } | 215 } |
216 | |
217 // ToUpper maps the rune to upper case giving priority to the special mapping. | |
218 func (special SpecialCase) ToUpper(rune int) int { | |
219 r := to(UpperCase, rune, []CaseRange(special)) | |
220 if r == rune { | |
221 r = ToUpper(rune) | |
222 } | |
223 return r | |
224 } | |
225 | |
226 // ToTitle maps the rune to title case giving priority to the special mapping. | |
227 func (special SpecialCase) ToTitle(rune int) int { | |
228 r := to(TitleCase, rune, []CaseRange(special)) | |
229 if r == rune { | |
230 r = ToTitle(rune) | |
231 } | |
232 return r | |
233 } | |
234 | |
235 // ToLower maps the rune to lower case giving priority to the special mapping. | |
236 func (special SpecialCase) ToLower(rune int) int { | |
237 r := to(LowerCase, rune, []CaseRange(special)) | |
238 if r == rune { | |
239 r = ToLower(rune) | |
240 } | |
241 return r | |
242 } | |
OLD | NEW |