OLD | NEW |
1 // Copyright 2010 The Go Authors. All rights reserved. | 1 // Copyright 2010 The Go Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 package html | 5 package html |
6 | 6 |
7 import ( | 7 import ( |
8 "bytes" | 8 "bytes" |
9 "strings" | 9 "strings" |
10 "utf8" | 10 "utf8" |
11 ) | 11 ) |
12 | 12 |
// replacementTable holds the substitutes for the numeric character references
// 0x80 through 0x9F. Old documents wrote these assuming the Windows-1252
// encoding, so each entry is the code point the author actually meant.
// The table is indexed by (value - 0x80).
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
var replacementTable = [...]int{
	'\u20AC', // 0x80 (first entry)
	'\u0081', // 0x81 (maps to itself)
	'\u201A', // 0x82
	'\u0192', // 0x83
	'\u201E', // 0x84
	'\u2026', // 0x85
	'\u2020', // 0x86
	'\u2021', // 0x87
	'\u02C6', // 0x88
	'\u2030', // 0x89
	'\u0160', // 0x8A
	'\u2039', // 0x8B
	'\u0152', // 0x8C
	'\u008D', // 0x8D (maps to itself)
	'\u017D', // 0x8E
	'\u008F', // 0x8F (maps to itself)
	'\u0090', // 0x90 (maps to itself)
	'\u2018', // 0x91
	'\u2019', // 0x92
	'\u201C', // 0x93
	'\u201D', // 0x94
	'\u2022', // 0x95
	'\u2013', // 0x96
	'\u2014', // 0x97
	'\u02DC', // 0x98
	'\u2122', // 0x99
	'\u0161', // 0x9A
	'\u203A', // 0x9B
	'\u0153', // 0x9C
	'\u009D', // 0x9D (maps to itself)
	'\u017E', // 0x9E
	'\u0178', // 0x9F (last entry)
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
| 52 |
13 // unescapeEntity reads an entity like "<" from b[src:] and writes the | 53 // unescapeEntity reads an entity like "<" from b[src:] and writes the |
14 // corresponding "<" to b[dst:], returning the incremented dst and src cursors. | 54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors. |
15 // Precondition: src[0] == '&' && dst <= src. | 55 // Precondition: b[src] == '&' && dst <= src. |
16 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { | 56 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { |
17 // TODO(nigeltao): Check that this entity substitution algorithm matches
the spec: | |
18 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenizat
ion.html#consume-a-character-reference | 57 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenizat
ion.html#consume-a-character-reference |
19 // TODO(nigeltao): Handle things like "中" or "中". | |
20 | 58 |
21 // i starts at 1 because we already know that s[0] == '&'. | 59 // i starts at 1 because we already know that s[0] == '&'. |
22 i, s := 1, b[src:] | 60 i, s := 1, b[src:] |
| 61 |
| 62 if len(s) <= 1 { |
| 63 b[dst] = b[src] |
| 64 return dst + 1, src + 1 |
| 65 } |
| 66 |
| 67 if s[i] == '#' { |
| 68 if len(s) <= 3 { // We need to have at least "&#.". |
| 69 b[dst] = b[src] |
| 70 return dst + 1, src + 1 |
| 71 } |
| 72 i++ |
| 73 c := s[i] |
| 74 hex := false |
| 75 if c == 'x' || c == 'X' { |
| 76 hex = true |
| 77 i++ |
| 78 } |
| 79 |
| 80 x := 0 |
| 81 for i < len(s) { |
| 82 c = s[i] |
| 83 i++ |
| 84 if hex { |
| 85 if '0' <= c && c <= '9' { |
| 86 x = 16*x + int(c) - '0' |
| 87 continue |
| 88 } else if 'a' <= c && c <= 'f' { |
| 89 x = 16*x + int(c) - 'a' + 10 |
| 90 continue |
| 91 } else if 'A' <= c && c <= 'F' { |
| 92 x = 16*x + int(c) - 'A' + 10 |
| 93 continue |
| 94 } |
| 95 } else if '0' <= c && c <= '9' { |
| 96 x = 10*x + int(c) - '0' |
| 97 continue |
| 98 } |
| 99 if c != ';' { |
| 100 i-- |
| 101 } |
| 102 break |
| 103 } |
| 104 |
| 105 if i <= 3 { // No characters matched. |
| 106 b[dst] = b[src] |
| 107 return dst + 1, src + 1 |
| 108 } |
| 109 |
| 110 if 0x80 <= x && x <= 0x9F { |
| 111 // Replace characters from Windows-1252 with UTF-8 equiv
alents. |
| 112 x = replacementTable[x-0x80] |
| 113 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF
{ |
| 114 // Replace invalid characters with the replacement chara
cter. |
| 115 x = '\uFFFD' |
| 116 } |
| 117 |
| 118 return dst + utf8.EncodeRune(b[dst:], x), src + i |
| 119 } |
| 120 |
| 121 // Consume the maximum number of characters possible, with the |
| 122 // consumed characters matching one of the named references. |
| 123 |
| 124 // TODO(nigeltao): unescape("¬it;") should be "¬it;" |
23 for i < len(s) { | 125 for i < len(s) { |
24 c := s[i] | 126 c := s[i] |
25 i++ | 127 i++ |
26 // Lower-cased characters are more common in entities, so we che
ck for them first. | 128 // Lower-cased characters are more common in entities, so we che
ck for them first. |
27 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { | 129 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { |
28 continue | 130 continue |
29 } | 131 } |
30 if c != ';' { | 132 if c != ';' { |
31 i-- | 133 i-- |
32 } | 134 } |
33 x := entity[string(s[1:i])] | |
34 if x != 0 { | |
35 return dst + utf8.EncodeRune(x, b[dst:]), src + i | |
36 } | |
37 break | 135 break |
38 } | 136 } |
| 137 |
| 138 entityName := string(s[1:i]) |
| 139 if x := entity[entityName]; x != 0 { |
| 140 return dst + utf8.EncodeRune(b[dst:], x), src + i |
| 141 } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-c
haracter entity. |
| 142 dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) |
| 143 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i |
| 144 } |
| 145 |
39 dst1, src1 = dst+i, src+i | 146 dst1, src1 = dst+i, src+i |
40 copy(b[dst:dst1], b[src:src1]) | 147 copy(b[dst:dst1], b[src:src1]) |
41 return dst1, src1 | 148 return dst1, src1 |
42 } | 149 } |
43 | 150 |
44 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b". | 151 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b". |
45 func unescape(b []byte) []byte { | 152 func unescape(b []byte) []byte { |
46 for i, c := range b { | 153 for i, c := range b { |
47 if c == '&' { | 154 if c == '&' { |
48 dst, src := unescapeEntity(b, i, i) | 155 dst, src := unescapeEntity(b, i, i) |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
108 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't | 215 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't |
109 // always true. | 216 // always true. |
110 func UnescapeString(s string) string { | 217 func UnescapeString(s string) string { |
111 for _, c := range s { | 218 for _, c := range s { |
112 if c == '&' { | 219 if c == '&' { |
113 return string(unescape([]byte(s))) | 220 return string(unescape([]byte(s))) |
114 } | 221 } |
115 } | 222 } |
116 return s | 223 return s |
117 } | 224 } |
OLD | NEW |