libgo/go/html/escape.go - Issue 4035044: code review 4035044: Update to current version of Go library.

Side by Side Diff: libgo/go/html/escape.go

Issue 4035044: code review 4035044: Update to current version of Go library. (Closed)

Patch Set: Created 14 years, 2 months ago

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2010 The Go Authors. All rights reserved.	1 // Copyright 2010 The Go Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style	2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.	3 // license that can be found in the LICENSE file.

4	4

5 package html	5 package html

6	6

7 import (	7 import (

8 "bytes"	8 "bytes"

9 "strings"	9 "strings"

10 "utf8"	10 "utf8"

11 )	11 )

12	12

	13 // These replacements permit compatibility with old numeric entities that·

	14 // assumed Windows-1252 encoding.

	15 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference

	16 var replacementTable = [...]int{

	17 '\u20AC', // First entry is what 0x80 should be replaced with.

	18 '\u0081',

	19 '\u201A',

	20 '\u0192',

	21 '\u201E',

	22 '\u2026',

	23 '\u2020',

	24 '\u2021',

	25 '\u02C6',

	26 '\u2030',

	27 '\u0160',

	28 '\u2039',

	29 '\u0152',

	30 '\u008D',

	31 '\u017D',

	32 '\u008F',

	33 '\u0090',

	34 '\u2018',

	35 '\u2019',

	36 '\u201C',

	37 '\u201D',

	38 '\u2022',

	39 '\u2013',

	40 '\u2014',

	41 '\u02DC',

	42 '\u2122',

	43 '\u0161',

	44 '\u203A',

	45 '\u0153',

	46 '\u009D',

	47 '\u017E',

	48 '\u0178', // Last entry is 0x9F.

	49 // 0x00->'\uFFFD' is handled programmatically.·

	50 // 0x0D->'\u000D' is a no-op.

	51 }

	52

13 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the	53 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the

14 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.	54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.

15 // Precondition: src[0] == '&' && dst <= src.	55 // Precondition: b[src] == '&' && dst <= src.

16 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {	56 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {

17 // TODO(nigeltao): Check that this entity substitution algorithm matches the spec:

18 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference	57 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference

19 // TODO(nigeltao): Handle things like "&#20013;" or "&#x4e2d;".

20	58

21 // i starts at 1 because we already know that s[0] == '&'.	59 // i starts at 1 because we already know that s[0] == '&'.

22 i, s := 1, b[src:]	60 i, s := 1, b[src:]

	61

	62 if len(s) <= 1 {

	63 b[dst] = b[src]

	64 return dst + 1, src + 1

	65 }

	66

	67 if s[i] == '#' {

	68 if len(s) <= 3 { // We need to have at least "&#.".

	69 b[dst] = b[src]

	70 return dst + 1, src + 1

	71 }

	72 i++

	73 c := s[i]

	74 hex := false

	75 if c == 'x' || c == 'X' {

	76 hex = true

	77 i++

	78 }

	79

	80 x := 0

	81 for i < len(s) {

	82 c = s[i]

	83 i++

	84 if hex {

	85 if '0' <= c && c <= '9' {

	86 x = 16*x + int(c) - '0'

	87 continue

	88 } else if 'a' <= c && c <= 'f' {

	89 x = 16*x + int(c) - 'a' + 10

	90 continue

	91 } else if 'A' <= c && c <= 'F' {

	92 x = 16*x + int(c) - 'A' + 10

	93 continue

	94 }

	95 } else if '0' <= c && c <= '9' {

	96 x = 10*x + int(c) - '0'

	97 continue

	98 }

	99 if c != ';' {

	100 i--

	101 }

	102 break

	103 }

	104

	105 if i <= 3 { // No characters matched.

	106 b[dst] = b[src]

	107 return dst + 1, src + 1

	108 }

	109

	110 if 0x80 <= x && x <= 0x9F {

	111 // Replace characters from Windows-1252 with UTF-8 equivalents.

	112 x = replacementTable[x-0x80]

	113 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {

	114 // Replace invalid characters with the replacement character.

	115 x = '\uFFFD'

	116 }

	117

	118 return dst + utf8.EncodeRune(b[dst:], x), src + i

	119 }

	120

	121 // Consume the maximum number of characters possible, with the

	122 // consumed characters matching one of the named references.

	123

	124 // TODO(nigeltao): unescape("&notit;") should be "¬it;"

23 for i < len(s) {	125 for i < len(s) {

24 c := s[i]	126 c := s[i]

25 i++	127 i++

26 // Lower-cased characters are more common in entities, so we check for them first.	128 // Lower-cased characters are more common in entities, so we check for them first.

27 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {	129 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {

28 continue	130 continue

29 }	131 }

30 if c != ';' {	132 if c != ';' {

31 i--	133 i--

32 }	134 }

33 x := entity[string(s[1:i])]

34 if x != 0 {

35 return dst + utf8.EncodeRune(x, b[dst:]), src + i

36 }

37 break	135 break

38 }	136 }

	137

	138 entityName := string(s[1:i])

	139 if x := entity[entityName]; x != 0 {

	140 return dst + utf8.EncodeRune(b[dst:], x), src + i

	141 } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.

	142 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])

	143 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i

	144 }

	145

39 dst1, src1 = dst+i, src+i	146 dst1, src1 = dst+i, src+i

40 copy(b[dst:dst1], b[src:src1])	147 copy(b[dst:dst1], b[src:src1])

41 return dst1, src1	148 return dst1, src1

42 }	149 }

43	150

44 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".	151 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".

45 func unescape(b []byte) []byte {	152 func unescape(b []byte) []byte {

46 for i, c := range b {	153 for i, c := range b {

47 if c == '&' {	154 if c == '&' {

48 dst, src := unescapeEntity(b, i, i)	155 dst, src := unescapeEntity(b, i, i)

(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
108 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't	215 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't

109 // always true.	216 // always true.

110 func UnescapeString(s string) string {	217 func UnescapeString(s string) string {

111 for _, c := range s {	218 for _, c := range s {

112 if c == '&' {	219 if c == '&' {

113 return string(unescape([]byte(s)))	220 return string(unescape([]byte(s)))

114 }	221 }

115 }	222 }

116 return s	223 return s

117 }	224 }

OLD	NEW

« no previous file with comments | « libgo/go/html/entity_test.go ('k') | libgo/go/html/parse.go » ('j') | no next file with comments »