Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(1427)

Side by Side Diff: libgo/go/html/escape.go

Issue 4035044: code review 4035044: Update to current version of Go library. (Closed)
Patch Set: Created 14 years, 2 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « libgo/go/html/entity_test.go ('k') | libgo/go/html/parse.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2010 The Go Authors. All rights reserved. 1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 package html 5 package html
6 6
7 import ( 7 import (
8 "bytes" 8 "bytes"
9 "strings" 9 "strings"
10 "utf8" 10 "utf8"
11 ) 11 )
12 12
13 // These replacements permit compatibility with old numeric entities that·
14 // assumed Windows-1252 encoding.
15 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
16 var replacementTable = [...]int{
17 '\u20AC', // First entry is what 0x80 should be replaced with.
18 '\u0081',
19 '\u201A',
20 '\u0192',
21 '\u201E',
22 '\u2026',
23 '\u2020',
24 '\u2021',
25 '\u02C6',
26 '\u2030',
27 '\u0160',
28 '\u2039',
29 '\u0152',
30 '\u008D',
31 '\u017D',
32 '\u008F',
33 '\u0090',
34 '\u2018',
35 '\u2019',
36 '\u201C',
37 '\u201D',
38 '\u2022',
39 '\u2013',
40 '\u2014',
41 '\u02DC',
42 '\u2122',
43 '\u0161',
44 '\u203A',
45 '\u0153',
46 '\u009D',
47 '\u017E',
48 '\u0178', // Last entry is 0x9F.
49 // 0x00->'\uFFFD' is handled programmatically.·
50 // 0x0D->'\u000D' is a no-op.
51 }
52
13 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the 53 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
14 // corresponding "<" to b[dst:], returning the incremented dst and src cursors. 54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
15 // Precondition: src[0] == '&' && dst <= src. 55 // Precondition: b[src] == '&' && dst <= src.
16 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { 56 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
17 // TODO(nigeltao): Check that this entity substitution algorithm matches the spec:
18 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference 57 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
19 // TODO(nigeltao): Handle things like "&#20013;" or "&#x4e2d;".
20 58
21 // i starts at 1 because we already know that s[0] == '&'. 59 // i starts at 1 because we already know that s[0] == '&'.
22 i, s := 1, b[src:] 60 i, s := 1, b[src:]
61
62 if len(s) <= 1 {
63 b[dst] = b[src]
64 return dst + 1, src + 1
65 }
66
67 if s[i] == '#' {
68 if len(s) <= 3 { // We need to have at least "&#.".
69 b[dst] = b[src]
70 return dst + 1, src + 1
71 }
72 i++
73 c := s[i]
74 hex := false
75 if c == 'x' || c == 'X' {
76 hex = true
77 i++
78 }
79
80 x := 0
81 for i < len(s) {
82 c = s[i]
83 i++
84 if hex {
85 if '0' <= c && c <= '9' {
86 x = 16*x + int(c) - '0'
87 continue
88 } else if 'a' <= c && c <= 'f' {
89 x = 16*x + int(c) - 'a' + 10
90 continue
91 } else if 'A' <= c && c <= 'F' {
92 x = 16*x + int(c) - 'A' + 10
93 continue
94 }
95 } else if '0' <= c && c <= '9' {
96 x = 10*x + int(c) - '0'
97 continue
98 }
99 if c != ';' {
100 i--
101 }
102 break
103 }
104
105 if i <= 3 { // No characters matched.
106 b[dst] = b[src]
107 return dst + 1, src + 1
108 }
109
110 if 0x80 <= x && x <= 0x9F {
111 // Replace characters from Windows-1252 with UTF-8 equivalents.
112 x = replacementTable[x-0x80]
113 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
114 // Replace invalid characters with the replacement character.
115 x = '\uFFFD'
116 }
117
118 return dst + utf8.EncodeRune(b[dst:], x), src + i
119 }
120
121 // Consume the maximum number of characters possible, with the
122 // consumed characters matching one of the named references.
123
124 // TODO(nigeltao): unescape("&notit;") should be "¬it;"
23 for i < len(s) { 125 for i < len(s) {
24 c := s[i] 126 c := s[i]
25 i++ 127 i++
26 // Lower-cased characters are more common in entities, so we check for them first. 128 // Lower-cased characters are more common in entities, so we check for them first.
27 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { 129 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
28 continue 130 continue
29 } 131 }
30 if c != ';' { 132 if c != ';' {
31 i-- 133 i--
32 } 134 }
33 x := entity[string(s[1:i])]
34 if x != 0 {
35 return dst + utf8.EncodeRune(x, b[dst:]), src + i
36 }
37 break 135 break
38 } 136 }
137
138 entityName := string(s[1:i])
139 if x := entity[entityName]; x != 0 {
140 return dst + utf8.EncodeRune(b[dst:], x), src + i
141 } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
142 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
143 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
144 }
145
39 dst1, src1 = dst+i, src+i 146 dst1, src1 = dst+i, src+i
40 copy(b[dst:dst1], b[src:src1]) 147 copy(b[dst:dst1], b[src:src1])
41 return dst1, src1 148 return dst1, src1
42 } 149 }
43 150
44 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b". 151 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
45 func unescape(b []byte) []byte { 152 func unescape(b []byte) []byte {
46 for i, c := range b { 153 for i, c := range b {
47 if c == '&' { 154 if c == '&' {
48 dst, src := unescapeEntity(b, i, i) 155 dst, src := unescapeEntity(b, i, i)
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
108 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't 215 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
109 // always true. 216 // always true.
110 func UnescapeString(s string) string { 217 func UnescapeString(s string) string {
111 for _, c := range s { 218 for _, c := range s {
112 if c == '&' { 219 if c == '&' {
113 return string(unescape([]byte(s))) 220 return string(unescape([]byte(s)))
114 } 221 }
115 } 222 }
116 return s 223 return s
117 } 224 }
OLDNEW
« no previous file with comments | « libgo/go/html/entity_test.go ('k') | libgo/go/html/parse.go » ('j') | no next file with comments »

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b