OLD | NEW |
1 // Copyright 2009 The Go Authors. All rights reserved. | 1 // Copyright 2009 The Go Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // This package provides data and functions to test some properties of Unicode c
ode points. | 5 // This package provides data and functions to test some properties of Unicode c
ode points. |
6 package unicode | 6 package unicode |
7 | 7 |
8 const ( | 8 const ( |
9 MaxRune = 0x10FFFF // Maximum valid Unicode code point. | 9 MaxRune = 0x10FFFF // Maximum valid Unicode code point. |
10 ReplacementChar = 0xFFFD // Represents invalid code points. | 10 ReplacementChar = 0xFFFD // Represents invalid code points. |
11 ) | 11 ) |
12 | 12 |
13 | 13 |
14 // The representation of a range of Unicode code points. The range runs from Lo
to Hi | 14 // The representation of a range of Unicode code points. The range runs from Lo
to Hi |
15 // inclusive and has the specified stride. | 15 // inclusive and has the specified stride. |
16 type Range struct { | 16 type Range struct { |
17 Lo int | 17 Lo int |
18 Hi int | 18 Hi int |
19 Stride int | 19 Stride int |
20 } | 20 } |
21 | 21 |
22 // The representation of a range of Unicode code points for case conversion. | 22 // CaseRange represents a range of Unicode code points for simple (one |
| 23 // code point to one code point) case conversion. |
23 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas | 24 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas |
24 // are the number to add to the code point to reach the code point for a | 25 // are the number to add to the code point to reach the code point for a |
25 // different case for that character. They may be negative. If zero, it | 26 // different case for that character. They may be negative. If zero, it |
26 // means the character is in the corresponding case. There is a special | 27 // means the character is in the corresponding case. There is a special |
27 // case representing sequences of alternating corresponding Upper and Lower | 28 // case representing sequences of alternating corresponding Upper and Lower |
28 // pairs. It appears with a fixed Delta of | 29 // pairs. It appears with a fixed Delta of |
29 // {UpperLower, UpperLower, UpperLower} | 30 // {UpperLower, UpperLower, UpperLower} |
30 // The constant UpperLower has an otherwise impossible delta value. | 31 // The constant UpperLower has an otherwise impossible delta value. |
31 type CaseRange struct { | 32 type CaseRange struct { |
32 Lo int | 33 Lo int |
33 Hi int | 34 Hi int |
34 Delta d | 35 Delta d |
35 } | 36 } |
36 | 37 |
| 38 // SpecialCase represents language-specific case mappings such as Turkish. |
| 39 // Methods of SpecialCase customize (by overriding) the standard mappings. |
| 40 type SpecialCase []CaseRange |
| 41 |
| 42 // BUG(r): Provide a mechanism for full case folding (mappings that involve |
| 43 // multiple runes in the input or output). |
| 44 |
37 // Indices into the Delta arrays inside CaseRanges for case mapping. | 45 // Indices into the Delta arrays inside CaseRanges for case mapping. |
38 const ( | 46 const ( |
39 UpperCase = iota | 47 UpperCase = iota |
40 LowerCase | 48 LowerCase |
41 TitleCase | 49 TitleCase |
42 MaxCase | 50 MaxCase |
43 ) | 51 ) |
44 | 52 |
45 type d [MaxCase]int32 // to make the CaseRanges text shorter | 53 type d [MaxCase]int32 // to make the CaseRanges text shorter |
46 | 54 |
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
123 if rune <= 0xFF { // quick Latin-1 check | 131 if rune <= 0xFF { // quick Latin-1 check |
124 switch rune { | 132 switch rune { |
125 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: | 133 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: |
126 return true | 134 return true |
127 } | 135 } |
128 return false | 136 return false |
129 } | 137 } |
130 return Is(White_Space, rune) | 138 return Is(White_Space, rune) |
131 } | 139 } |
132 | 140 |
133 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase. | 141 // to maps the rune using the specified case mapping. |
134 func To(_case int, rune int) int { | 142 func to(_case int, rune int, caseRange []CaseRange) int { |
135 if _case < 0 || MaxCase <= _case { | 143 if _case < 0 || MaxCase <= _case { |
136 return ReplacementChar // as reasonable an error as any | 144 return ReplacementChar // as reasonable an error as any |
137 } | 145 } |
138 // binary search over ranges | 146 // binary search over ranges |
139 lo := 0 | 147 lo := 0 |
140 » hi := len(CaseRanges) | 148 » hi := len(caseRange) |
141 for lo < hi { | 149 for lo < hi { |
142 m := lo + (hi-lo)/2 | 150 m := lo + (hi-lo)/2 |
143 » » r := CaseRanges[m] | 151 » » r := caseRange[m] |
144 if r.Lo <= rune && rune <= r.Hi { | 152 if r.Lo <= rune && rune <= r.Hi { |
145 delta := int(r.Delta[_case]) | 153 delta := int(r.Delta[_case]) |
146 if delta > MaxRune { | 154 if delta > MaxRune { |
147 // In an Upper-Lower sequence, which always star
ts with | 155 // In an Upper-Lower sequence, which always star
ts with |
148 // an UpperCase letter, the real deltas always l
ook like: | 156 // an UpperCase letter, the real deltas always l
ook like: |
149 // {0, 1, 0} UpperCase (Lower is next) | 157 // {0, 1, 0} UpperCase (Lower is next) |
150 // {-1, 0, -1} LowerCase (Upper, Title are
previous) | 158 // {-1, 0, -1} LowerCase (Upper, Title are
previous) |
151 // The characters at even offsets from the begin
ning of the | 159 // The characters at even offsets from the begin
ning of the |
152 // sequence are upper case; the ones at odd offs
ets are lower. | 160 // sequence are upper case; the ones at odd offs
ets are lower. |
153 // The correct mapping can be done by clearing o
r setting the low | 161 // The correct mapping can be done by clearing o
r setting the low |
154 // bit in the sequence offset. | 162 // bit in the sequence offset. |
155 // The constants UpperCase and TitleCase are eve
n while LowerCase | 163 // The constants UpperCase and TitleCase are eve
n while LowerCase |
156 // is odd so we take the low bit from _case. | 164 // is odd so we take the low bit from _case. |
157 return r.Lo + ((rune-r.Lo)&^1 | _case&1) | 165 return r.Lo + ((rune-r.Lo)&^1 | _case&1) |
158 } | 166 } |
159 return rune + delta | 167 return rune + delta |
160 } | 168 } |
161 if rune < r.Lo { | 169 if rune < r.Lo { |
162 hi = m | 170 hi = m |
163 } else { | 171 } else { |
164 lo = m + 1 | 172 lo = m + 1 |
165 } | 173 } |
166 } | 174 } |
167 return rune | 175 return rune |
168 } | 176 } |
169 | 177 |
| 178 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase. |
| 179 func To(_case int, rune int) int { |
| 180 return to(_case, rune, CaseRanges) |
| 181 } |
| 182 |
170 // ToUpper maps the rune to upper case. | 183 // ToUpper maps the rune to upper case. |
171 func ToUpper(rune int) int { | 184 func ToUpper(rune int) int { |
172 if rune < 0x80 { // quick ASCII check | 185 if rune < 0x80 { // quick ASCII check |
173 if 'a' <= rune && rune <= 'z' { | 186 if 'a' <= rune && rune <= 'z' { |
174 rune -= 'a' - 'A' | 187 rune -= 'a' - 'A' |
175 } | 188 } |
176 return rune | 189 return rune |
177 } | 190 } |
178 return To(UpperCase, rune) | 191 return To(UpperCase, rune) |
179 } | 192 } |
(...skipping 12 matching lines...) Expand all Loading... |
192 // ToTitle maps the rune to title case. | 205 // ToTitle maps the rune to title case. |
193 func ToTitle(rune int) int { | 206 func ToTitle(rune int) int { |
194 if rune < 0x80 { // quick ASCII check | 207 if rune < 0x80 { // quick ASCII check |
195 if 'a' <= rune && rune <= 'z' { // title case is upper case for
ASCII | 208 if 'a' <= rune && rune <= 'z' { // title case is upper case for
ASCII |
196 rune -= 'a' - 'A' | 209 rune -= 'a' - 'A' |
197 } | 210 } |
198 return rune | 211 return rune |
199 } | 212 } |
200 return To(TitleCase, rune) | 213 return To(TitleCase, rune) |
201 } | 214 } |
| 215 |
| 216 // ToUpper maps the rune to upper case giving priority to the special mapping. |
| 217 func (special SpecialCase) ToUpper(rune int) int { |
| 218 r := to(UpperCase, rune, []CaseRange(special)) |
| 219 if r == rune { |
| 220 r = ToUpper(rune) |
| 221 } |
| 222 return r |
| 223 } |
| 224 |
| 225 // ToTitle maps the rune to title case giving priority to the special mapping. |
| 226 func (special SpecialCase) ToTitle(rune int) int { |
| 227 r := to(TitleCase, rune, []CaseRange(special)) |
| 228 if r == rune { |
| 229 r = ToTitle(rune) |
| 230 } |
| 231 return r |
| 232 } |
| 233 |
| 234 // ToLower maps the rune to lower case giving priority to the special mapping. |
| 235 func (special SpecialCase) ToLower(rune int) int { |
| 236 r := to(LowerCase, rune, []CaseRange(special)) |
| 237 if r == rune { |
| 238 r = ToLower(rune) |
| 239 } |
| 240 return r |
| 241 } |
OLD | NEW |