Left: | ||
Right: |
OLD | NEW |
---|---|
1 // Copyright 2009 The Go Authors. All rights reserved. | 1 // Copyright 2009 The Go Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // Package xml implements a simple XML 1.0 parser that | 5 // Package xml implements a simple XML 1.0 parser that |
6 // understands XML name spaces. | 6 // understands XML name spaces. |
7 package xml | 7 package xml |
8 | 8 |
9 // References: | 9 // References: |
10 // Annotated XML spec: http://www.xml.com/axml/testaxml.htm | 10 // Annotated XML spec: http://www.xml.com/axml/testaxml.htm |
11 // XML name spaces: http://www.w3.org/TR/REC-xml-names/ | 11 // XML name spaces: http://www.w3.org/TR/REC-xml-names/ |
12 | 12 |
13 // TODO(rsc): | 13 // TODO(rsc): |
14 // Test error handling. | 14 // Test error handling. |
15 | 15 |
16 import ( | 16 import ( |
17 "bufio" | 17 "bufio" |
18 "bytes" | 18 "bytes" |
19 "errors" | |
19 "fmt" | 20 "fmt" |
20 "io" | 21 "io" |
21 "strconv" | 22 "strconv" |
22 "strings" | 23 "strings" |
23 "unicode" | 24 "unicode" |
24 "unicode/utf8" | 25 "unicode/utf8" |
25 ) | 26 ) |
26 | 27 |
27 // A SyntaxError represents a syntax error in the XML input stream. | 28 // A SyntaxError represents a syntax error in the XML input stream. |
28 type SyntaxError struct { | 29 type SyntaxError struct { |
(...skipping 138 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
167 // non-UTF-8 charset into UTF-8. If CharsetReader is nil or | 168 // non-UTF-8 charset into UTF-8. If CharsetReader is nil or |
168 // returns an error, parsing stops with an error. One of the | 169 // returns an error, parsing stops with an error. One of the |
169 // CharsetReader's result values must be non-nil. | 170 // CharsetReader's result values must be non-nil. |
170 CharsetReader func(charset string, input io.Reader) (io.Reader, error) | 171 CharsetReader func(charset string, input io.Reader) (io.Reader, error) |
171 | 172 |
172 // DefaultSpace sets the default name space used for unadorned tags, | 173 // DefaultSpace sets the default name space used for unadorned tags, |
173 // as if the entire XML stream were wrapped in an element containing | 174 // as if the entire XML stream were wrapped in an element containing |
174 // the attribute xmlns="DefaultSpace". | 175 // the attribute xmlns="DefaultSpace". |
175 DefaultSpace string | 176 DefaultSpace string |
176 | 177 |
177 » r io.ByteReader | 178 » r io.ByteReader |
178 » buf bytes.Buffer | 179 » buf bytes.Buffer |
179 » saved *bytes.Buffer | 180 » saved *bytes.Buffer |
180 » stk *stack | 181 » stk *stack |
181 » free *stack | 182 » free *stack |
182 » needClose bool | 183 » needClose bool |
183 » toClose Name | 184 » toClose Name |
184 » nextToken Token | 185 » nextToken Token |
185 » nextByte int | 186 » nextByte int |
186 » ns map[string]string | 187 » ns map[string]string |
187 » err error | 188 » err error |
188 » line int | 189 » line int |
190 » unmarshalDepth int | |
189 } | 191 } |
190 | 192 |
191 // NewDecoder creates a new XML parser reading from r. | 193 // NewDecoder creates a new XML parser reading from r. |
192 func NewDecoder(r io.Reader) *Decoder { | 194 func NewDecoder(r io.Reader) *Decoder { |
193 d := &Decoder{ | 195 d := &Decoder{ |
194 ns: make(map[string]string), | 196 ns: make(map[string]string), |
195 nextByte: -1, | 197 nextByte: -1, |
196 line: 1, | 198 line: 1, |
197 Strict: true, | 199 Strict: true, |
198 } | 200 } |
(...skipping 17 matching lines...) Expand all Loading... | |
216 // if Token encounters an unexpected end element, | 218 // if Token encounters an unexpected end element, |
217 // it will return an error. | 219 // it will return an error. |
218 // | 220 // |
219 // Token implements XML name spaces as described by | 221 // Token implements XML name spaces as described by |
220 // http://www.w3.org/TR/REC-xml-names/. Each of the | 222 // http://www.w3.org/TR/REC-xml-names/. Each of the |
221 // Name structures contained in the Token has the Space | 223 // Name structures contained in the Token has the Space |
222 // set to the URL identifying its name space when known. | 224 // set to the URL identifying its name space when known. |
223 // If Token encounters an unrecognized name space prefix, | 225 // If Token encounters an unrecognized name space prefix, |
224 // it uses the prefix as the Space rather than report an error. | 226 // it uses the prefix as the Space rather than report an error. |
225 func (d *Decoder) Token() (t Token, err error) { | 227 func (d *Decoder) Token() (t Token, err error) { |
228 if d.stk != nil && d.stk.kind == stkEOF { | |
229 err = io.EOF | |
230 return | |
231 } | |
226 if d.nextToken != nil { | 232 if d.nextToken != nil { |
227 t = d.nextToken | 233 t = d.nextToken |
228 d.nextToken = nil | 234 d.nextToken = nil |
229 » } else if t, err = d.RawToken(); err != nil { | 235 » } else if t, err = d.rawToken(); err != nil { |
230 return | 236 return |
231 } | 237 } |
232 | 238 |
233 if !d.Strict { | 239 if !d.Strict { |
234 if t1, ok := d.autoClose(t); ok { | 240 if t1, ok := d.autoClose(t); ok { |
235 d.nextToken = t | 241 d.nextToken = t |
236 t = t1 | 242 t = t1 |
237 } | 243 } |
238 } | 244 } |
239 switch t1 := t.(type) { | 245 switch t1 := t.(type) { |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
315 type stack struct { | 321 type stack struct { |
316 next *stack | 322 next *stack |
317 kind int | 323 kind int |
318 name Name | 324 name Name |
319 ok bool | 325 ok bool |
320 } | 326 } |
321 | 327 |
322 const ( | 328 const ( |
323 stkStart = iota | 329 stkStart = iota |
324 stkNs | 330 stkNs |
331 stkEOF | |
325 ) | 332 ) |
326 | 333 |
327 func (d *Decoder) push(kind int) *stack { | 334 func (d *Decoder) push(kind int) *stack { |
328 s := d.free | 335 s := d.free |
329 if s != nil { | 336 if s != nil { |
330 d.free = s.next | 337 d.free = s.next |
331 } else { | 338 } else { |
332 s = new(stack) | 339 s = new(stack) |
333 } | 340 } |
334 s.next = d.stk | 341 s.next = d.stk |
335 s.kind = kind | 342 s.kind = kind |
336 d.stk = s | 343 d.stk = s |
337 return s | 344 return s |
338 } | 345 } |
339 | 346 |
340 func (d *Decoder) pop() *stack { | 347 func (d *Decoder) pop() *stack { |
341 s := d.stk | 348 s := d.stk |
342 if s != nil { | 349 if s != nil { |
343 d.stk = s.next | 350 d.stk = s.next |
344 s.next = d.free | 351 s.next = d.free |
345 d.free = s | 352 d.free = s |
346 } | 353 } |
347 return s | 354 return s |
348 } | 355 } |
349 | 356 |
357 // Record that after the current element is finished | |
358 // (that element is already pushed on the stack) | |
359 // Token should return EOF until popEOF is called. | |
360 func (d *Decoder) pushEOF() { | |
361 // Walk down stack to find Start. | |
362 // It might not be the top, because there might be stkNs | |
363 // entries above it. | |
364 start := d.stk | |
365 for start.kind != stkStart { | |
366 start = start.next | |
367 } | |
368 // The stkNs entries below a start are associated with that | |
369 // element too; skip over them. | |
370 for start.next != nil && start.next.kind == stkNs { | |
371 start = start.next | |
372 } | |
373 s := d.free | |
374 if s != nil { | |
375 d.free = s.next | |
376 } else { | |
377 s = new(stack) | |
378 } | |
379 s.kind = stkEOF | |
380 s.next = start.next | |
381 start.next = s | |
382 } | |
383 | |
384 // Undo a pushEOF. | |
385 // The element must have been finished, so the EOF should be at the top of the stack. | |
386 func (d *Decoder) popEOF() bool { | |
387 if d.stk == nil || d.stk.kind != stkEOF { | |
388 return false | |
389 } | |
390 d.pop() | |
391 return true | |
392 } | |
393 | |
350 // Record that we are starting an element with the given name. | 394 // Record that we are starting an element with the given name. |
351 func (d *Decoder) pushElement(name Name) { | 395 func (d *Decoder) pushElement(name Name) { |
352 s := d.push(stkStart) | 396 s := d.push(stkStart) |
353 s.name = name | 397 s.name = name |
354 } | 398 } |
355 | 399 |
356 // Record that we are changing the value of ns[local]. | 400 // Record that we are changing the value of ns[local]. |
357 // The old value is url, ok. | 401 // The old value is url, ok. |
358 func (d *Decoder) pushNs(local string, url string, ok bool) { | 402 func (d *Decoder) pushNs(local string, url string, ok bool) { |
359 s := d.push(stkNs) | 403 s := d.push(stkNs) |
(...skipping 28 matching lines...) Expand all Loading... | |
388 return true | 432 return true |
389 } | 433 } |
390 d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">") | 434 d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">") |
391 return false | 435 return false |
392 case s.name.Space != name.Space: | 436 case s.name.Space != name.Space: |
393 d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space + | 437 d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space + |
394 "closed by </" + name.Local + "> in space " + name.Space) | 438 "closed by </" + name.Local + "> in space " + name.Space) |
395 return false | 439 return false |
396 } | 440 } |
397 | 441 |
398 » // Pop stack until a Start is on the top, undoing the | 442 » // Pop stack until a Start or EOF is on the top, undoing the |
399 // translations that were associated with the element we just closed. | 443 // translations that were associated with the element we just closed. |
400 » for d.stk != nil && d.stk.kind != stkStart { | 444 » for d.stk != nil && d.stk.kind != stkStart && d.stk.kind != stkEOF { |
401 s := d.pop() | 445 s := d.pop() |
402 if s.ok { | 446 if s.ok { |
403 d.ns[s.name.Local] = s.name.Space | 447 d.ns[s.name.Local] = s.name.Space |
404 } else { | 448 } else { |
405 delete(d.ns, s.name.Local) | 449 delete(d.ns, s.name.Local) |
406 } | 450 } |
407 } | 451 } |
408 | 452 |
409 return true | 453 return true |
410 } | 454 } |
(...skipping 11 matching lines...) Expand all Loading... | |
422 et, ok := t.(EndElement) | 466 et, ok := t.(EndElement) |
423 if !ok || et.Name.Local != name { | 467 if !ok || et.Name.Local != name { |
424 return EndElement{d.stk.name}, true | 468 return EndElement{d.stk.name}, true |
425 } | 469 } |
426 break | 470 break |
427 } | 471 } |
428 } | 472 } |
429 return nil, false | 473 return nil, false |
430 } | 474 } |
431 | 475 |
476 var errRawToken = errors.New("xml: cannot use RawToken from UnmarshalXML method") | |
477 | |
432 // RawToken is like Token but does not verify that | 478 // RawToken is like Token but does not verify that |
433 // start and end elements match and does not translate | 479 // start and end elements match and does not translate |
434 // name space prefixes to their corresponding URLs. | 480 // name space prefixes to their corresponding URLs. |
435 func (d *Decoder) RawToken() (Token, error) { | 481 func (d *Decoder) RawToken() (Token, error) { |
482 if d.unmarshalDepth > 0 { | |
483 return nil, errRawToken | |
Dominik Honnef
2013/08/09 16:21:45
Should this error condition be documented?
rsc
2013/08/13 16:24:34
Done.
| |
484 } | |
485 return d.rawToken() | |
486 } | |
487 | |
488 func (d *Decoder) rawToken() (Token, error) { | |
436 if d.err != nil { | 489 if d.err != nil { |
437 return nil, d.err | 490 return nil, d.err |
438 } | 491 } |
439 if d.needClose { | 492 if d.needClose { |
440 // The last element we read was self-closing and | 493 // The last element we read was self-closing and |
441 // we returned just the StartElement half. | 494 // we returned just the StartElement half. |
442 // Return the EndElement half now. | 495 // Return the EndElement half now. |
443 d.needClose = false | 496 d.needClose = false |
444 return EndElement{d.toClose}, nil | 497 return EndElement{d.toClose}, nil |
445 } | 498 } |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
477 return nil, d.err | 530 return nil, d.err |
478 } | 531 } |
479 if b != '>' { | 532 if b != '>' { |
480 d.err = d.syntaxError("invalid characters between </" + name.Local + " and >") | 533 d.err = d.syntaxError("invalid characters between </" + name.Local + " and >") |
481 return nil, d.err | 534 return nil, d.err |
482 } | 535 } |
483 return EndElement{name}, nil | 536 return EndElement{name}, nil |
484 | 537 |
485 case '?': | 538 case '?': |
486 // <?: Processing instruction. | 539 // <?: Processing instruction. |
487 » » // TODO(rsc): Should parse the <?xml declaration to make sure | 540 » » // TODO(rsc): Should parse the <?xml declaration to make sure the version is 1.0. |
488 » » // the version is 1.0 and the encoding is UTF-8. | |
489 var target string | 541 var target string |
490 if target, ok = d.name(); !ok { | 542 if target, ok = d.name(); !ok { |
491 if d.err == nil { | 543 if d.err == nil { |
492 d.err = d.syntaxError("expected target name after <?") | 544 d.err = d.syntaxError("expected target name after <?") |
493 } | 545 } |
494 return nil, d.err | 546 return nil, d.err |
495 } | 547 } |
496 d.space() | 548 d.space() |
497 d.buf.Reset() | 549 d.buf.Reset() |
498 var b0 byte | 550 var b0 byte |
(...skipping 606 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1105 if c == utf8.RuneError && n == 1 { | 1157 if c == utf8.RuneError && n == 1 { |
1106 return false | 1158 return false |
1107 } | 1159 } |
1108 if !unicode.Is(first, c) && !unicode.Is(second, c) { | 1160 if !unicode.Is(first, c) && !unicode.Is(second, c) { |
1109 return false | 1161 return false |
1110 } | 1162 } |
1111 } | 1163 } |
1112 return true | 1164 return true |
1113 } | 1165 } |
1114 | 1166 |
1167 func isNameString(s string) bool { | |
1168 if len(s) == 0 { | |
1169 return false | |
1170 } | |
1171 c, n := utf8.DecodeRuneInString(s) | |
1172 if c == utf8.RuneError && n == 1 { | |
1173 return false | |
1174 } | |
1175 if !unicode.Is(first, c) { | |
1176 return false | |
1177 } | |
1178 for n < len(s) { | |
1179 s = s[n:] | |
1180 c, n = utf8.DecodeRuneInString(s) | |
1181 if c == utf8.RuneError && n == 1 { | |
1182 return false | |
1183 } | |
1184 if !unicode.Is(first, c) && !unicode.Is(second, c) { | |
1185 return false | |
1186 } | |
1187 } | |
1188 return true | |
1189 } | |
1190 | |
1115 // These tables were generated by cut and paste from Appendix B of | 1191 // These tables were generated by cut and paste from Appendix B of |
1116 // the XML spec at http://www.xml.com/axml/testaxml.htm | 1192 // the XML spec at http://www.xml.com/axml/testaxml.htm |
1117 // and then reformatting. First corresponds to (Letter | '_' | ':') | 1193 // and then reformatting. First corresponds to (Letter | '_' | ':') |
1118 // and second corresponds to NameChar. | 1194 // and second corresponds to NameChar. |
1119 | 1195 |
1120 var first = &unicode.RangeTable{ | 1196 var first = &unicode.RangeTable{ |
1121 R16: []unicode.Range16{ | 1197 R16: []unicode.Range16{ |
1122 {0x003A, 0x003A, 1}, | 1198 {0x003A, 0x003A, 1}, |
1123 {0x0041, 0x005A, 1}, | 1199 {0x0041, 0x005A, 1}, |
1124 {0x005F, 0x005F, 1}, | 1200 {0x005F, 0x005F, 1}, |
(...skipping 646 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1771 return err | 1847 return err |
1772 } | 1848 } |
1773 last = i | 1849 last = i |
1774 } | 1850 } |
1775 if _, err := w.Write(s[last:]); err != nil { | 1851 if _, err := w.Write(s[last:]); err != nil { |
1776 return err | 1852 return err |
1777 } | 1853 } |
1778 return nil | 1854 return nil |
1779 } | 1855 } |
1780 | 1856 |
1857 // EscapeString writes to p the properly escaped XML equivalent | |
1858 // of the plain text data s. | |
1859 func (p *printer) EscapeString(s string) { | |
1860 var esc []byte | |
1861 last := 0 | |
1862 for i := 0; i < len(s); { | |
1863 r, width := utf8.DecodeRuneInString(s[i:]) | |
1864 i += width | |
1865 switch r { | |
1866 case '"': | |
1867 esc = esc_quot | |
1868 case '\'': | |
1869 esc = esc_apos | |
1870 case '&': | |
1871 esc = esc_amp | |
1872 case '<': | |
1873 esc = esc_lt | |
1874 case '>': | |
1875 esc = esc_gt | |
1876 case '\t': | |
1877 esc = esc_tab | |
1878 case '\n': | |
1879 esc = esc_nl | |
1880 case '\r': | |
1881 esc = esc_cr | |
1882 default: | |
1883 if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) { | |
1884 esc = esc_fffd | |
1885 break | |
1886 } | |
1887 continue | |
1888 } | |
1889 p.WriteString(s[last : i-width]) | |
1890 p.Write(esc) | |
1891 last = i | |
1892 } | |
1893 p.WriteString(s[last:]) | |
1894 } | |
1895 | |
1781 // Escape is like EscapeText but omits the error return value. | 1896 // Escape is like EscapeText but omits the error return value. |
1782 // It is provided for backwards compatibility with Go 1.0. | 1897 // It is provided for backwards compatibility with Go 1.0. |
1783 // Code targeting Go 1.1 or later should use EscapeText. | 1898 // Code targeting Go 1.1 or later should use EscapeText. |
1784 func Escape(w io.Writer, s []byte) { | 1899 func Escape(w io.Writer, s []byte) { |
1785 EscapeText(w, s) | 1900 EscapeText(w, s) |
1786 } | 1901 } |
1787 | 1902 |
1788 // procInstEncoding parses the `encoding="..."` or `encoding='...'` | 1903 // procInstEncoding parses the `encoding="..."` or `encoding='...'` |
1789 // value out of the provided string, returning "" if not found. | 1904 // value out of the provided string, returning "" if not found. |
1790 func procInstEncoding(s string) string { | 1905 func procInstEncoding(s string) string { |
1791 // TODO: this parsing is somewhat lame and not exact. | 1906 // TODO: this parsing is somewhat lame and not exact. |
1792 // It works for all actual cases, though. | 1907 // It works for all actual cases, though. |
1793 idx := strings.Index(s, "encoding=") | 1908 idx := strings.Index(s, "encoding=") |
1794 if idx == -1 { | 1909 if idx == -1 { |
1795 return "" | 1910 return "" |
1796 } | 1911 } |
1797 v := s[idx+len("encoding="):] | 1912 v := s[idx+len("encoding="):] |
1798 if v == "" { | 1913 if v == "" { |
1799 return "" | 1914 return "" |
1800 } | 1915 } |
1801 if v[0] != '\'' && v[0] != '"' { | 1916 if v[0] != '\'' && v[0] != '"' { |
1802 return "" | 1917 return "" |
1803 } | 1918 } |
1804 idx = strings.IndexRune(v[1:], rune(v[0])) | 1919 idx = strings.IndexRune(v[1:], rune(v[0])) |
1805 if idx == -1 { | 1920 if idx == -1 { |
1806 return "" | 1921 return "" |
1807 } | 1922 } |
1808 return v[1 : idx+1] | 1923 return v[1 : idx+1] |
1809 } | 1924 } |
OLD | NEW |