Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(347)

Side by Side Diff: src/pkg/unicode/letter.go

Issue 824043: code review 824043: Unicode: provide an ability to supplement the case-mapp... (Closed)
Patch Set: code review 824043: Unicode: provide an ability to supplement the case-mapp... Created 15 years ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/pkg/unicode/casetables.go ('k') | src/pkg/unicode/letter_test.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2009 The Go Authors. All rights reserved. 1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 // This package provides data and functions to test some properties of Unicode code points. 5 // This package provides data and functions to test some properties of Unicode code points.
6 package unicode 6 package unicode
7 7
8 const ( 8 const (
9 MaxRune = 0x10FFFF // Maximum valid Unicode code point. 9 MaxRune = 0x10FFFF // Maximum valid Unicode code point.
10 ReplacementChar = 0xFFFD // Represents invalid code points. 10 ReplacementChar = 0xFFFD // Represents invalid code points.
11 ) 11 )
12 12
13 13
14 // The representation of a range of Unicode code points. The range runs from Lo to Hi 14 // The representation of a range of Unicode code points. The range runs from Lo to Hi
15 // inclusive and has the specified stride. 15 // inclusive and has the specified stride.
16 type Range struct { 16 type Range struct {
17 Lo int 17 Lo int
18 Hi int 18 Hi int
19 Stride int 19 Stride int
20 } 20 }
21 21
22 // The representation of a range of Unicode code points for case conversion. 22 // CaseRange represents a range of Unicode code points for simple (one
23 // code point to one code point) case conversion.
23 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas 24 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
24 // are the number to add to the code point to reach the code point for a 25 // are the number to add to the code point to reach the code point for a
25 // different case for that character. They may be negative. If zero, it 26 // different case for that character. They may be negative. If zero, it
26 // means the character is in the corresponding case. There is a special 27 // means the character is in the corresponding case. There is a special
27 // case representing sequences of alternating corresponding Upper and Lower 28 // case representing sequences of alternating corresponding Upper and Lower
28 // pairs. It appears with a fixed Delta of 29 // pairs. It appears with a fixed Delta of
29 // {UpperLower, UpperLower, UpperLower} 30 // {UpperLower, UpperLower, UpperLower}
30 // The constant UpperLower has an otherwise impossible delta value. 31 // The constant UpperLower has an otherwise impossible delta value.
31 type CaseRange struct { 32 type CaseRange struct {
32 Lo int 33 Lo int
33 Hi int 34 Hi int
34 Delta d 35 Delta d
35 } 36 }
36 37
38 // SpecialCase represents language-specific case mappings such as Turkish.
39 // Methods of SpecialCase customize (by overriding) the standard mappings.
40 type SpecialCase []CaseRange
41
42 //BUG(r): Provide a mechanism for full case folding (those that involve
43 // multiple runes in the input or output).
44
37 // Indices into the Delta arrays inside CaseRanges for case mapping. 45 // Indices into the Delta arrays inside CaseRanges for case mapping.
38 const ( 46 const (
39 UpperCase = iota 47 UpperCase = iota
40 LowerCase 48 LowerCase
41 TitleCase 49 TitleCase
42 MaxCase 50 MaxCase
43 ) 51 )
44 52
45 type d [MaxCase]int32 // to make the CaseRanges text shorter 53 type d [MaxCase]int32 // to make the CaseRanges text shorter
46 54
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
123 if rune <= 0xFF { // quick Latin-1 check 131 if rune <= 0xFF { // quick Latin-1 check
124 switch rune { 132 switch rune {
125 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: 133 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
126 return true 134 return true
127 } 135 }
128 return false 136 return false
129 } 137 }
130 return Is(White_Space, rune) 138 return Is(White_Space, rune)
131 } 139 }
132 140
133 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase. 141 // to maps the rune using the specified case mapping.
134 func To(_case int, rune int) int { 142 func to(_case int, rune int, caseRange []CaseRange) int {
135 if _case < 0 || MaxCase <= _case { 143 if _case < 0 || MaxCase <= _case {
136 return ReplacementChar // as reasonable an error as any 144 return ReplacementChar // as reasonable an error as any
137 } 145 }
138 // binary search over ranges 146 // binary search over ranges
139 lo := 0 147 lo := 0
140 » hi := len(CaseRanges) 148 » hi := len(caseRange)
141 for lo < hi { 149 for lo < hi {
142 m := lo + (hi-lo)/2 150 m := lo + (hi-lo)/2
143 » » r := CaseRanges[m] 151 » » r := caseRange[m]
144 if r.Lo <= rune && rune <= r.Hi { 152 if r.Lo <= rune && rune <= r.Hi {
145 delta := int(r.Delta[_case]) 153 delta := int(r.Delta[_case])
146 if delta > MaxRune { 154 if delta > MaxRune {
147 // In an Upper-Lower sequence, which always starts with 155 // In an Upper-Lower sequence, which always starts with
148 // an UpperCase letter, the real deltas always look like: 156 // an UpperCase letter, the real deltas always look like:
149 // {0, 1, 0} UpperCase (Lower is next) 157 // {0, 1, 0} UpperCase (Lower is next)
150 // {-1, 0, -1} LowerCase (Upper, Title are previous) 158 // {-1, 0, -1} LowerCase (Upper, Title are previous)
151 // The characters at even offsets from the beginning of the 159 // The characters at even offsets from the beginning of the
152 // sequence are upper case; the ones at odd offsets are lower. 160 // sequence are upper case; the ones at odd offsets are lower.
153 // The correct mapping can be done by clearing or setting the low 161 // The correct mapping can be done by clearing or setting the low
154 // bit in the sequence offset. 162 // bit in the sequence offset.
155 // The constants UpperCase and TitleCase are even while LowerCase 163 // The constants UpperCase and TitleCase are even while LowerCase
156 // is odd so we take the low bit from _case. 164 // is odd so we take the low bit from _case.
157 return r.Lo + ((rune-r.Lo)&^1 | _case&1) 165 return r.Lo + ((rune-r.Lo)&^1 | _case&1)
158 } 166 }
159 return rune + delta 167 return rune + delta
160 } 168 }
161 if rune < r.Lo { 169 if rune < r.Lo {
162 hi = m 170 hi = m
163 } else { 171 } else {
164 lo = m + 1 172 lo = m + 1
165 } 173 }
166 } 174 }
167 return rune 175 return rune
168 } 176 }
169 177
178 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
179 func To(_case int, rune int) int {
180 return to(_case, rune, CaseRanges)
181 }
182
170 // ToUpper maps the rune to upper case. 183 // ToUpper maps the rune to upper case.
171 func ToUpper(rune int) int { 184 func ToUpper(rune int) int {
172 if rune < 0x80 { // quick ASCII check 185 if rune < 0x80 { // quick ASCII check
173 if 'a' <= rune && rune <= 'z' { 186 if 'a' <= rune && rune <= 'z' {
174 rune -= 'a' - 'A' 187 rune -= 'a' - 'A'
175 } 188 }
176 return rune 189 return rune
177 } 190 }
178 return To(UpperCase, rune) 191 return To(UpperCase, rune)
179 } 192 }
(...skipping 12 matching lines...) Expand all
192 // ToTitle maps the rune to title case. 205 // ToTitle maps the rune to title case.
193 func ToTitle(rune int) int { 206 func ToTitle(rune int) int {
194 if rune < 0x80 { // quick ASCII check 207 if rune < 0x80 { // quick ASCII check
195 if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII 208 if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII
196 rune -= 'a' - 'A' 209 rune -= 'a' - 'A'
197 } 210 }
198 return rune 211 return rune
199 } 212 }
200 return To(TitleCase, rune) 213 return To(TitleCase, rune)
201 } 214 }
215
216 // ToUpper maps the rune to upper case giving priority to the special mapping.
217 func (special SpecialCase) ToUpper(rune int) int {
218 r := to(UpperCase, rune, []CaseRange(special))
219 if r == rune {
220 r = ToUpper(rune)
221 }
222 return r
223 }
224
225 // ToTitle maps the rune to title case giving priority to the special mapping.
226 func (special SpecialCase) ToTitle(rune int) int {
227 r := to(TitleCase, rune, []CaseRange(special))
228 if r == rune {
229 r = ToTitle(rune)
230 }
231 return r
232 }
233
234 // ToLower maps the rune to lower case giving priority to the special mapping.
235 func (special SpecialCase) ToLower(rune int) int {
236 r := to(LowerCase, rune, []CaseRange(special))
237 if r == rune {
238 r = ToLower(rune)
239 }
240 return r
241 }
OLDNEW
« no previous file with comments | « src/pkg/unicode/casetables.go ('k') | src/pkg/unicode/letter_test.go » ('j') | no next file with comments »

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b