Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(3)

Side by Side Diff: src/pkg/unicode/letter.go

Issue 824043: code review 824043: Unicode: provide an ability to supplement the case-mapp... (Closed)
Patch Set: Created 15 years ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
OLD NEW
1 // Copyright 2009 The Go Authors. All rights reserved. 1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 // This package provides data and functions to test some properties of Unicode code points. 5 // This package provides data and functions to test some properties of Unicode code points.
6 package unicode 6 package unicode
7 7
8 const ( 8 const (
9 MaxRune = 0x10FFFF // Maximum valid Unicode code point. 9 MaxRune = 0x10FFFF // Maximum valid Unicode code point.
10 ReplacementChar = 0xFFFD // Represents invalid code points. 10 ReplacementChar = 0xFFFD // Represents invalid code points.
11 ) 11 )
12 12
13 13
14 // The representation of a range of Unicode code points. The range runs from Lo to Hi 14 // The representation of a range of Unicode code points. The range runs from Lo to Hi
15 // inclusive and has the specified stride. 15 // inclusive and has the specified stride.
16 type Range struct { 16 type Range struct {
17 Lo int 17 Lo int
18 Hi int 18 Hi int
19 Stride int 19 Stride int
20 } 20 }
21 21
22 // The representation of a range of Unicode code points for case conversion. 22 // CaseRange represents a range of Unicode code points for simple (one
23 // code point to one code point) case conversion.
23 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas 24 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
24 // are the number to add to the code point to reach the code point for a 25 // are the number to add to the code point to reach the code point for a
25 // different case for that character. They may be negative. If zero, it 26 // different case for that character. They may be negative. If zero, it
26 // means the character is in the corresponding case. There is a special 27 // means the character is in the corresponding case. There is a special
27 // case representing sequences of alternating corresponding Upper and Lower 28 // case representing sequences of alternating corresponding Upper and Lower
28 // pairs. It appears with a fixed Delta of 29 // pairs. It appears with a fixed Delta of
29 // {UpperLower, UpperLower, UpperLower} 30 // {UpperLower, UpperLower, UpperLower}
30 // The constant UpperLower has an otherwise impossible delta value. 31 // The constant UpperLower has an otherwise impossible delta value.
32 // TODO: Provide a mechanism for full case folding (those that involve
rsc1 2010/03/31 00:39:31 If you move this out of the doc comment you can sa
33 // multiple runes in the input or output).
31 type CaseRange struct { 34 type CaseRange struct {
32 Lo int 35 Lo int
33 Hi int 36 Hi int
34 Delta d 37 Delta d
35 } 38 }
36 39
40 // SpecialCase represents language-specific case mappings such as Turkish.
41 // TODO: Provide a mechanism for full case folding (those that involve
rsc1 2010/03/31 00:39:31 Then you can drop this one. (But keep Methods of S
42 // multiple runes in the input or output). Methods of SpecialCase
43 // customize (by overriding) the standard mappings.
44 type SpecialCase []CaseRange
45
37 // Indices into the Delta arrays inside CaseRanges for case mapping. 46 // Indices into the Delta arrays inside CaseRanges for case mapping.
38 const ( 47 const (
39 UpperCase = iota 48 UpperCase = iota
40 LowerCase 49 LowerCase
41 TitleCase 50 TitleCase
42 MaxCase 51 MaxCase
43 ) 52 )
44 53
45 type d [MaxCase]int32 // to make the CaseRanges text shorter 54 type d [MaxCase]int32 // to make the CaseRanges text shorter
46 55
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
123 if rune <= 0xFF { // quick Latin-1 check 132 if rune <= 0xFF { // quick Latin-1 check
124 switch rune { 133 switch rune {
125 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: 134 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
126 return true 135 return true
127 } 136 }
128 return false 137 return false
129 } 138 }
130 return Is(White_Space, rune) 139 return Is(White_Space, rune)
131 } 140 }
132 141
133 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase. 142 // to maps the rune using the specified case mapping.
134 func To(_case int, rune int) int { 143 func to(_case int, rune int, caseRange []CaseRange) int {
135 if _case < 0 || MaxCase <= _case { 144 if _case < 0 || MaxCase <= _case {
136 return ReplacementChar // as reasonable an error as any 145 return ReplacementChar // as reasonable an error as any
137 } 146 }
138 // binary search over ranges 147 // binary search over ranges
139 lo := 0 148 lo := 0
140 » hi := len(CaseRanges) 149 » hi := len(caseRange)
141 for lo < hi { 150 for lo < hi {
142 m := lo + (hi-lo)/2 151 m := lo + (hi-lo)/2
143 » » r := CaseRanges[m] 152 » » r := caseRange[m]
144 if r.Lo <= rune && rune <= r.Hi { 153 if r.Lo <= rune && rune <= r.Hi {
145 delta := int(r.Delta[_case]) 154 delta := int(r.Delta[_case])
146 if delta > MaxRune { 155 if delta > MaxRune {
147 // In an Upper-Lower sequence, which always starts with 156 // In an Upper-Lower sequence, which always starts with
148 // an UpperCase letter, the real deltas always look like: 157 // an UpperCase letter, the real deltas always look like:
149 // {0, 1, 0} UpperCase (Lower is next) 158 // {0, 1, 0} UpperCase (Lower is next)
150 // {-1, 0, -1} LowerCase (Upper, Title are previous) 159 // {-1, 0, -1} LowerCase (Upper, Title are previous)
151 // The characters at even offsets from the beginning of the 160 // The characters at even offsets from the beginning of the
152 // sequence are upper case; the ones at odd offsets are lower. 161 // sequence are upper case; the ones at odd offsets are lower.
153 // The correct mapping can be done by clearing or setting the low 162 // The correct mapping can be done by clearing or setting the low
154 // bit in the sequence offset. 163 // bit in the sequence offset.
155 // The constants UpperCase and TitleCase are even while LowerCase 164 // The constants UpperCase and TitleCase are even while LowerCase
156 // is odd so we take the low bit from _case. 165 // is odd so we take the low bit from _case.
157 return r.Lo + ((rune-r.Lo)&^1 | _case&1) 166 return r.Lo + ((rune-r.Lo)&^1 | _case&1)
158 } 167 }
159 return rune + delta 168 return rune + delta
160 } 169 }
161 if rune < r.Lo { 170 if rune < r.Lo {
162 hi = m 171 hi = m
163 } else { 172 } else {
164 lo = m + 1 173 lo = m + 1
165 } 174 }
166 } 175 }
167 return rune 176 return rune
168 } 177 }
169 178
179 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
180 func To(_case int, rune int) int {
181 return to(_case, rune, CaseRanges)
182 }
183
170 // ToUpper maps the rune to upper case. 184 // ToUpper maps the rune to upper case.
171 func ToUpper(rune int) int { 185 func ToUpper(rune int) int {
172 if rune < 0x80 { // quick ASCII check 186 if rune < 0x80 { // quick ASCII check
173 if 'a' <= rune && rune <= 'z' { 187 if 'a' <= rune && rune <= 'z' {
174 rune -= 'a' - 'A' 188 rune -= 'a' - 'A'
175 } 189 }
176 return rune 190 return rune
177 } 191 }
178 return To(UpperCase, rune) 192 return To(UpperCase, rune)
179 } 193 }
(...skipping 12 matching lines...) Expand all
192 // ToTitle maps the rune to title case. 206 // ToTitle maps the rune to title case.
193 func ToTitle(rune int) int { 207 func ToTitle(rune int) int {
194 if rune < 0x80 { // quick ASCII check 208 if rune < 0x80 { // quick ASCII check
195 if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII 209 if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII
196 rune -= 'a' - 'A' 210 rune -= 'a' - 'A'
197 } 211 }
198 return rune 212 return rune
199 } 213 }
200 return To(TitleCase, rune) 214 return To(TitleCase, rune)
201 } 215 }
216
217 // ToUpper maps the rune to upper case giving priority to the special mapping.
218 func (special SpecialCase) ToUpper(rune int) int {
219 r := to(UpperCase, rune, []CaseRange(special))
220 if r == rune {
221 r = ToUpper(rune)
222 }
223 return r
224 }
225
226 // ToTitle maps the rune to title case giving priority to the special mapping.
227 func (special SpecialCase) ToTitle(rune int) int {
228 r := to(TitleCase, rune, []CaseRange(special))
229 if r == rune {
230 r = ToTitle(rune)
231 }
232 return r
233 }
234
235 // ToLower maps the rune to lower case giving priority to the special mapping.
236 func (special SpecialCase) ToLower(rune int) int {
237 r := to(LowerCase, rune, []CaseRange(special))
238 if r == rune {
239 r = ToLower(rune)
240 }
241 return r
242 }
OLD NEW

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b