src/pkg/unicode/letter.go - Issue 824043: code review 824043: Unicode: provide an ability to supplement the case-mapp...

Side by Side Diff: src/pkg/unicode/letter.go

Issue 824043: code review 824043: Unicode: provide an ability to supplement the case-mapp... (Closed)

Patch Set: Created 15 years ago

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2009 The Go Authors. All rights reserved.	1 // Copyright 2009 The Go Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style	2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.	3 // license that can be found in the LICENSE file.

4	4

5 // This package provides data and functions to test some properties of Unicode code points.	5 // This package provides data and functions to test some properties of Unicode code points.

6 package unicode	6 package unicode

7	7

8 const (	8 const (

9 MaxRune = 0x10FFFF // Maximum valid Unicode code point.	9 MaxRune = 0x10FFFF // Maximum valid Unicode code point.

10 ReplacementChar = 0xFFFD // Represents invalid code points.	10 ReplacementChar = 0xFFFD // Represents invalid code points.

11 )	11 )

12	12

13	13

14 // The representation of a range of Unicode code points. The range runs from Lo to Hi	14 // The representation of a range of Unicode code points. The range runs from Lo to Hi

15 // inclusive and has the specified stride.	15 // inclusive and has the specified stride.

16 type Range struct {	16 type Range struct {

17 Lo int	17 Lo int

18 Hi int	18 Hi int

19 Stride int	19 Stride int

20 }	20 }

21	21

22 // The representation of a range of Unicode code points for case conversion.	22 // CaseRange represents a range of Unicode code points for simple (one

	23 // code point to one code point) case conversion.

23 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas	24 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas

24 // are the number to add to the code point to reach the code point for a	25 // are the number to add to the code point to reach the code point for a

25 // different case for that character. They may be negative. If zero, it	26 // different case for that character. They may be negative. If zero, it

26 // means the character is in the corresponding case. There is a special	27 // means the character is in the corresponding case. There is a special

27 // case representing sequences of alternating corresponding Upper and Lower	28 // case representing sequences of alternating corresponding Upper and Lower

28 // pairs. It appears with a fixed Delta of	29 // pairs. It appears with a fixed Delta of

29 // {UpperLower, UpperLower, UpperLower}	30 // {UpperLower, UpperLower, UpperLower}

30 // The constant UpperLower has an otherwise impossible delta value.	31 // The constant UpperLower has an otherwise impossible delta value.

	32 // TODO: Provide a mechanism for full case folding (those that involve
	rsc1 2010/03/31 00:39:31 If you move this out of the doc comment you can say // BUG(r): There should be a mechanism for full case folding // (those that involve multiple runes in the input or output). at top level as its own comment, and it appears in the BUGS section of the godoc output.
	33 // multiple runes in the input or output).

31 type CaseRange struct {	34 type CaseRange struct {

32 Lo int	35 Lo int

33 Hi int	36 Hi int

34 Delta d	37 Delta d

35 }	38 }

36	39

	40 // SpecialCase represents language-specific case mappings such as Turkish.

	41 // TODO: Provide a mechanism for full case folding (those that involve
	rsc1 2010/03/31 00:39:31 Then you can drop this one. (But keep Methods of SpecialCase...)
	42 // multiple runes in the input or output). Methods of SpecialCase

	43 // customize (by overriding) the standard mappings.

	44 type SpecialCase []CaseRange

	45

37 // Indices into the Delta arrays inside CaseRanges for case mapping.	46 // Indices into the Delta arrays inside CaseRanges for case mapping.

38 const (	47 const (

39 UpperCase = iota	48 UpperCase = iota

40 LowerCase	49 LowerCase

41 TitleCase	50 TitleCase

42 MaxCase	51 MaxCase

43 )	52 )

44	53

45 type d [MaxCase]int32 // to make the CaseRanges text shorter	54 type d [MaxCase]int32 // to make the CaseRanges text shorter

46	55

(...skipping 76 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
123 if rune <= 0xFF { // quick Latin-1 check	132 if rune <= 0xFF { // quick Latin-1 check

124 switch rune {	133 switch rune {

125 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:	134 case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:

126 return true	135 return true

127 }	136 }

128 return false	137 return false

129 }	138 }

130 return Is(White_Space, rune)	139 return Is(White_Space, rune)

131 }	140 }

132	141

133 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.	142 // to maps the rune using the specified case mapping.

134 func To(_case int, rune int) int {	143 func to(_case int, rune int, caseRange []CaseRange) int {

135 if _case < 0 || MaxCase <= _case {	144 if _case < 0 || MaxCase <= _case {

136 return ReplacementChar // as reasonable an error as any	145 return ReplacementChar // as reasonable an error as any

137 }	146 }

138 // binary search over ranges	147 // binary search over ranges

139 lo := 0	148 lo := 0

140 » hi := len(CaseRanges)	149 » hi := len(caseRange)

141 for lo < hi {	150 for lo < hi {

142 m := lo + (hi-lo)/2	151 m := lo + (hi-lo)/2

143 » » r := CaseRanges[m]	152 » » r := caseRange[m]

144 if r.Lo <= rune && rune <= r.Hi {	153 if r.Lo <= rune && rune <= r.Hi {

145 delta := int(r.Delta[_case])	154 delta := int(r.Delta[_case])

146 if delta > MaxRune {	155 if delta > MaxRune {

147 // In an Upper-Lower sequence, which always starts with	156 // In an Upper-Lower sequence, which always starts with

148 // an UpperCase letter, the real deltas always look like:	157 // an UpperCase letter, the real deltas always look like:

149 // {0, 1, 0} UpperCase (Lower is next)	158 // {0, 1, 0} UpperCase (Lower is next)

150 // {-1, 0, -1} LowerCase (Upper, Title are previous)	159 // {-1, 0, -1} LowerCase (Upper, Title are previous)

151 // The characters at even offsets from the beginning of the	160 // The characters at even offsets from the beginning of the

152 // sequence are upper case; the ones at odd offsets are lower.	161 // sequence are upper case; the ones at odd offsets are lower.

153 // The correct mapping can be done by clearing or setting the low	162 // The correct mapping can be done by clearing or setting the low

154 // bit in the sequence offset.	163 // bit in the sequence offset.

155 // The constants UpperCase and TitleCase are even while LowerCase	164 // The constants UpperCase and TitleCase are even while LowerCase

156 // is odd so we take the low bit from _case.	165 // is odd so we take the low bit from _case.

157 return r.Lo + ((rune-r.Lo)&^1 | _case&1)	166 return r.Lo + ((rune-r.Lo)&^1 | _case&1)

158 }	167 }

159 return rune + delta	168 return rune + delta

160 }	169 }

161 if rune < r.Lo {	170 if rune < r.Lo {

162 hi = m	171 hi = m

163 } else {	172 } else {

164 lo = m + 1	173 lo = m + 1

165 }	174 }

166 }	175 }

167 return rune	176 return rune

168 }	177 }

169	178

	179 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.

	180 func To(_case int, rune int) int {

	181 return to(_case, rune, CaseRanges)

	182 }

	183

170 // ToUpper maps the rune to upper case.	184 // ToUpper maps the rune to upper case.

171 func ToUpper(rune int) int {	185 func ToUpper(rune int) int {

172 if rune < 0x80 { // quick ASCII check	186 if rune < 0x80 { // quick ASCII check

173 if 'a' <= rune && rune <= 'z' {	187 if 'a' <= rune && rune <= 'z' {

174 rune -= 'a' - 'A'	188 rune -= 'a' - 'A'

175 }	189 }

176 return rune	190 return rune

177 }	191 }

178 return To(UpperCase, rune)	192 return To(UpperCase, rune)

179 }	193 }

(...skipping 12 matching lines...) Expand all Loading...
192 // ToTitle maps the rune to title case.	206 // ToTitle maps the rune to title case.

193 func ToTitle(rune int) int {	207 func ToTitle(rune int) int {

194 if rune < 0x80 { // quick ASCII check	208 if rune < 0x80 { // quick ASCII check

195 if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII	209 if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII

196 rune -= 'a' - 'A'	210 rune -= 'a' - 'A'

197 }	211 }

198 return rune	212 return rune

199 }	213 }

200 return To(TitleCase, rune)	214 return To(TitleCase, rune)

201 }	215 }

	216

	217 // ToUpper maps the rune to upper case giving priority to the special mapping.

	218 func (special SpecialCase) ToUpper(rune int) int {

	219 r := to(UpperCase, rune, []CaseRange(special))

	220 if r == rune {

	221 r = ToUpper(rune)

	222 }

	223 return r

	224 }

	225

	226 // ToTitle maps the rune to title case giving priority to the special mapping.

	227 func (special SpecialCase) ToTitle(rune int) int {

	228 r := to(TitleCase, rune, []CaseRange(special))

	229 if r == rune {

	230 r = ToTitle(rune)

	231 }

	232 return r

	233 }

	234

	235 // ToLower maps the rune to lower case giving priority to the special mapping.

	236 func (special SpecialCase) ToLower(rune int) int {

	237 r := to(LowerCase, rune, []CaseRange(special))

	238 if r == rune {

	239 r = ToLower(rune)

	240 }

	241 return r

	242 }

OLD	NEW

« src/pkg/strings/strings.go ('K') | « src/pkg/unicode/casetables.go ('k') | src/pkg/unicode/letter_test.go » ('j') | no next file with comments »