LEFT | RIGHT |
1 // Copyright (C) 2007 Google Inc. | 1 // Copyright (C) 2007 Google Inc. |
2 // | 2 // |
3 // Licensed under the Apache License, Version 2.0 (the "License"); | 3 // Licensed under the Apache License, Version 2.0 (the "License"); |
4 // you may not use this file except in compliance with the License. | 4 // you may not use this file except in compliance with the License. |
5 // You may obtain a copy of the License at | 5 // You may obtain a copy of the License at |
6 // | 6 // |
7 // http://www.apache.org/licenses/LICENSE-2.0 | 7 // http://www.apache.org/licenses/LICENSE-2.0 |
8 // | 8 // |
9 // Unless required by applicable law or agreed to in writing, software | 9 // Unless required by applicable law or agreed to in writing, software |
10 // distributed under the License is distributed on an "AS IS" BASIS, | 10 // distributed under the License is distributed on an "AS IS" BASIS, |
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 // See the License for the specific language governing permissions and | 12 // See the License for the specific language governing permissions and |
13 // limitations under the License. | 13 // limitations under the License. |
14 | 14 |
15 package com.google.caja.lexer.escaping; | 15 package com.google.caja.lexer.escaping; |
16 | 16 |
17 import com.google.caja.util.SparseBitSet; | 17 import com.google.caja.util.SparseBitSet; |
18 import java.io.IOException; | 18 import java.io.IOException; |
19 import java.nio.ByteBuffer; | |
20 import java.nio.CharBuffer; | |
21 import java.nio.charset.Charset; | |
22 | 19 |
23 import java.util.ArrayList; | 20 import java.util.ArrayList; |
24 import java.util.List; | 21 import java.util.List; |
25 | 22 |
26 /** | 23 /** |
27 * Escaping of strings and regular expressions. | 24 * Escaping of strings and regular expressions. |
28 * | 25 * |
29 * @author mikesamuel@gmail.com (Mike Samuel) | 26 * @author mikesamuel@gmail.com (Mike Samuel) |
30 */ | 27 */ |
31 public class Escaping { | 28 public class Escaping { |
32 | 29 |
33 /** | 30 /** |
34 * Given a plain text string writes an unquoted javascript string literal. | 31 * Given a plain text string writes an unquoted javascript string literal. |
35 * | 32 * |
36 * @param s the plain text string to escape. | 33 * @param s the plain text string to escape. |
37 * @param asciiOnly Makes sure that only ASCII characters are written to out. | 34 * @param asciiOnly Makes sure that only ASCII characters are written to out. |
38 * This is a good idea if you don't have control over the charset that | 35 * This is a good idea if you don't have control over the charset that |
39 * the javascript will be served with. | 36 * the javascript will be served with. |
40 * @param embeddable True to make sure that nothing is written to out that | 37 * @param embeddable True to make sure that nothing is written to out that |
41 * could interfere with embedding inside a script tag or CDATA section, or | 38 * could interfere with embedding inside a script tag or CDATA section, or |
42 * other tag that typically contains markup. | 39 * other tag that typically contains markup. |
43 * This does not make it safe to embed in an HTML attribute without | 40 * This does not make it safe to embed in an HTML attribute without |
44 * further escaping. | 41 * further escaping. |
45 * @param out written to. | 42 * @param out written to. |
46 */ | 43 */ |
47 public static void escapeJsString( | 44 public static void escapeJsString( |
48 CharSequence s, boolean asciiOnly, boolean embeddable, Appendable out) | 45 CharSequence s, boolean asciiOnly, boolean embeddable, Appendable out) |
49 throws IOException { | 46 throws IOException { |
50 new Escaper(s, embeddable ? STRING_EMBEDDABLE_ESCAPES : STRING_MINIMAL_ESCAP
ES, | 47 new Escaper( |
51 asciiOnly ? NO_NON_ASCII : ALLOW_NON_ASCII, JS_ENCODER, out) | 48 s, embeddable ? STRING_EMBEDDABLE_ESCAPES : STRING_MINIMAL_ESCAPES, |
| 49 asciiOnly ? NO_NON_ASCII : ALLOW_NON_ASCII, JS_ENCODER, out) |
52 .escape(); | 50 .escape(); |
53 } | 51 } |
54 | 52 |
55 /** @see #escapeJsString(CharSequence, boolean, boolean, Appendable) */ | 53 /** @see #escapeJsString(CharSequence, boolean, boolean, Appendable) */ |
56 public static void escapeJsString( | 54 public static void escapeJsString( |
57 CharSequence s, boolean asciiOnly, boolean embeddable, | 55 CharSequence s, boolean asciiOnly, boolean embeddable, |
58 StringBuilder out) { | 56 StringBuilder out) { |
59 try { | 57 try { |
60 escapeJsString(s, asciiOnly, embeddable, (Appendable) out); | 58 escapeJsString(s, asciiOnly, embeddable, (Appendable) out); |
61 } catch (IOException ex) { | 59 } catch (IOException ex) { |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
95 } | 93 } |
96 | 94 |
97 /** | 95 /** |
98 * Given a plain text string, write to out unquoted regular expression text | 96 * Given a plain text string, write to out unquoted regular expression text |
99 * that would match that substring and only that substring. | 97 * that would match that substring and only that substring. |
100 * | 98 * |
101 * @param s the plain text string to escape. | 99 * @param s the plain text string to escape. |
102 * @param asciiOnly Makes sure that only ASCII characters are written to out. | 100 * @param asciiOnly Makes sure that only ASCII characters are written to out. |
103 * This is a good idea if you don't have control over the charset that | 101 * This is a good idea if you don't have control over the charset that |
104 * the javascript will be served with. | 102 * the javascript will be served with. |
105 * @param embeddable True to make sure that nothing is written to out that cou
ld | 103 * @param embeddable True to make sure that nothing is written to out that |
106 * interfere with embedding inside a script tag or CDATA section, or | 104 * could interfere with embedding inside a script tag or CDATA section, or |
107 * other tag that typically contains markup. | 105 * other tag that typically contains markup. |
108 * This does not make it safe to embed in an HTML attribute without | 106 * This does not make it safe to embed in an HTML attribute without |
109 * further escaping. | 107 * further escaping. |
110 * @param out written to. | 108 * @param out written to. |
111 */ | 109 */ |
112 public static void escapeRegex( | 110 public static void escapeRegex( |
113 CharSequence s, boolean asciiOnly, boolean embeddable, Appendable out) | 111 CharSequence s, boolean asciiOnly, boolean embeddable, Appendable out) |
114 throws IOException { | 112 throws IOException { |
115 new Escaper( | 113 new Escaper( |
116 s, | 114 s, |
(...skipping 16 matching lines...) Expand all Loading... |
133 | 131 |
134 /** | 132 /** |
135 * Given a regular expression pattern, write a version to out that has the | 133 * Given a regular expression pattern, write a version to out that has the |
136 * same meaning, but with enough characters escaped to satisfy the conditions | 134 * same meaning, but with enough characters escaped to satisfy the conditions |
137 * imposed by the flags passed to this method. | 135 * imposed by the flags passed to this method. |
138 * | 136 * |
139 * @param s the plain text string to escape. | 137 * @param s the plain text string to escape. |
140 * @param asciiOnly Makes sure that only ASCII characters are written to out. | 138 * @param asciiOnly Makes sure that only ASCII characters are written to out. |
141 * This is a good idea if you don't have control over the charset that | 139 * This is a good idea if you don't have control over the charset that |
142 * the javascript will be served with. | 140 * the javascript will be served with. |
143 * @param embeddable True to make sure that nothing is written to out that cou
ld | 141 * @param embeddable True to make sure that nothing is written to out that |
144 * interfere with embedding inside a script tag or CDATA section, or | 142 * could interfere with embedding inside a script tag or CDATA section, or |
145 * other tag that typically contains markup. | 143 * other tag that typically contains markup. |
146 * This does not make it safe to embed in an HTML attribute without | 144 * This does not make it safe to embed in an HTML attribute without |
147 * further escaping. | 145 * further escaping. |
148 * @param out written to. | 146 * @param out written to. |
149 */ | 147 */ |
150 public static void normalizeRegex( | 148 public static void normalizeRegex( |
151 CharSequence s, boolean asciiOnly, boolean embeddable, Appendable out) | 149 CharSequence s, boolean asciiOnly, boolean embeddable, Appendable out) |
152 throws IOException { | 150 throws IOException { |
153 new Escaper(requireEndUnescaped(rebalance(s, '[', ']')), | 151 new Escaper(requireEndUnescaped(rebalance(s, '[', ']')), |
154 embeddable ? REGEX_EMBEDDABLE_ESCAPES : REGEX_MINIMAL_ESCAPES, | 152 embeddable ? REGEX_EMBEDDABLE_ESCAPES : REGEX_MINIMAL_ESCAPES, |
155 asciiOnly ? NO_NON_ASCII : ALLOW_NON_ASCII, JS_ENCODER, out) | 153 asciiOnly ? NO_NON_ASCII : ALLOW_NON_ASCII, JS_ENCODER, out) |
156 .normalize(); | 154 .normalize(); |
157 } | 155 } |
158 | 156 |
159 /** @see #normalizeRegex(CharSequence, boolean, boolean, Appendable) */ | 157 /** @see #normalizeRegex(CharSequence, boolean, boolean, Appendable) */ |
160 public static void normalizeRegex( | 158 public static void normalizeRegex( |
161 CharSequence s, boolean asciiOnly, boolean embeddable, | 159 CharSequence s, boolean asciiOnly, boolean embeddable, |
162 StringBuilder out) { | 160 StringBuilder out) { |
163 try { | 161 try { |
164 normalizeRegex(s, asciiOnly, embeddable, (Appendable) out); | 162 normalizeRegex(s, asciiOnly, embeddable, (Appendable) out); |
165 } catch (IOException ex) { | 163 } catch (IOException ex) { |
166 // StringBuilders don't throw IOException | 164 // StringBuilders don't throw IOException |
167 throw new RuntimeException(ex); | 165 throw new RuntimeException(ex); |
168 } | 166 } |
169 } | 167 } |
170 | 168 |
171 private static final Charset UTF8 = Charset.forName("UTF-8"); | 169 /** |
172 | 170 * Given plain text, output HTML/XML with the same meaning. |
173 /** | |
174 * Convert a URI to a string %xx escaping some codepoints that are in the | |
175 * RFC3986 reserved set, but only used in obsolete productions. | |
176 * This works around problems with inconsistencies in escaping conventions | |
177 * in CSS URIs, but still allows us to make sure that URIs don't look like | |
178 * code. | |
179 */ | |
180 public static String normalizeUri(String uri) { | |
181 StringBuilder sb = new StringBuilder(uri.length()); | |
182 boolean sawQmark = false; | |
183 for (int i = 0, n = uri.length(); i < n; ++i) { | |
184 char ch = uri.charAt(i); | |
185 boolean esc = false; | |
186 switch (ch) { | |
187 // Special in URIs, but only used in the obsolete "mark" production. | |
188 // Square brackets are used in IPv6 addresses so are not changed. | |
189 case '(': case ')': case '\'': case '*': esc = true; break; | |
190 case ':': esc = sawQmark; break; | |
191 case '=': esc = !sawQmark; break; | |
192 case '?': | |
193 if (sawQmark) { | |
194 esc = true; | |
195 } else { | |
196 sawQmark = true; | |
197 } | |
198 break; | |
199 default: | |
200 if (ch >= 0x7f) { | |
201 esc = true; | |
202 } | |
203 break; | |
204 } | |
205 if (esc) { | |
206 pctEncode(ch, sb); | |
207 } else { | |
208 sb.append(ch); | |
209 } | |
210 } | |
211 return sb.toString(); | |
212 } | |
213 | |
214 /** | |
215 * Given plain text, output html/XML with the same meaning. | |
216 * | 171 * |
217 * @param s the plain text string to escape. | 172 * @param s the plain text string to escape. |
218 * @param asciiOnly Makes sure that only ASCII characters are written to out. | 173 * @param asciiOnly Makes sure that only ASCII characters are written to out. |
219 * This is a good idea if you don't have control over the charset that | 174 * This is a good idea if you don't have control over the charset that |
220 * the javascript will be served with. | 175 * the javascript will be served with. |
221 * @param out written to. | 176 * @param out written to. |
222 */ | 177 */ |
223 public static void escapeXml( | 178 public static void escapeXml( |
224 CharSequence s, boolean asciiOnly, Appendable out) | 179 CharSequence s, boolean asciiOnly, Appendable out) |
225 throws IOException { | 180 throws IOException { |
(...skipping 330 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
556 } else { | 511 } else { |
557 if (ch >= 0x1000) { | 512 if (ch >= 0x1000) { |
558 out.append("0123456789ABCDEF".charAt((ch >> 12) & 0xf)); | 513 out.append("0123456789ABCDEF".charAt((ch >> 12) & 0xf)); |
559 } | 514 } |
560 out.append("0123456789ABCDEF".charAt((ch >> 8) & 0xf)) | 515 out.append("0123456789ABCDEF".charAt((ch >> 8) & 0xf)) |
561 .append("0123456789ABCDEF".charAt((ch >> 4) & 0xf)) | 516 .append("0123456789ABCDEF".charAt((ch >> 4) & 0xf)) |
562 .append("0123456789ABCDEF".charAt(ch & 0xf)); | 517 .append("0123456789ABCDEF".charAt(ch & 0xf)); |
563 } | 518 } |
564 } | 519 } |
565 | 520 |
566 static void pctEncode(char ch, StringBuilder out) { | |
567 if (ch < 0x80) { | |
568 pctEncode((byte) ch, out); | |
569 } else { | |
570 // UTF-8 encode | |
571 ByteBuffer bb = UTF8.encode(CharBuffer.wrap(new char[] { ch })); | |
572 while (bb.position() < bb.limit()) { | |
573 pctEncode(bb.get(), out); | |
574 } | |
575 } | |
576 } | |
577 static void pctEncode(byte b, StringBuilder out) { | |
578 assert (b & 0x80) == 0; // One byte form in UTF-8. | |
579 out.append('%') | |
580 .append("0123456789ABCDEF".charAt((b >> 4) & 0xf)) | |
581 .append("0123456789ABCDEF".charAt(b & 0xf)); | |
582 } | |
583 | |
584 /** Produces hex escape for all characters in the given inclusive range. */ | 521 /** Produces hex escape for all characters in the given inclusive range. */ |
585 private static Escape[] hex2Escapes(char min, char max) { | 522 private static Escape[] hex2Escapes(char min, char max) { |
586 Escape[] out = new Escape[max - min + 1]; | 523 Escape[] out = new Escape[max - min + 1]; |
587 for (int i = 0; i < out.length; ++i) { | 524 for (int i = 0; i < out.length; ++i) { |
588 StringBuilder sb = new StringBuilder(4); | 525 StringBuilder sb = new StringBuilder(4); |
589 char ch = (char) (min + i); | 526 char ch = (char) (min + i); |
590 try { | 527 try { |
591 hex2Escape(ch, sb); | 528 hex2Escape(ch, sb); |
592 } catch (IOException ex) { | 529 } catch (IOException ex) { |
593 // StringBuilders do not throw IOException | 530 // StringBuilders do not throw IOException |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
676 ++nBackslashes; | 613 ++nBackslashes; |
677 } | 614 } |
678 if ((nBackslashes & 1) == 1) { | 615 if ((nBackslashes & 1) == 1) { |
679 return s + "\\"; | 616 return s + "\\"; |
680 } | 617 } |
681 return s; | 618 return s; |
682 } | 619 } |
683 | 620 |
/** Private so that the class cannot be instantiated from outside. */
private Escaping() {
  // non instantiable
}
685 } | 622 } |
LEFT | RIGHT |