OLD | NEW |
1 """A parser for HTML and XHTML.""" | 1 """A parser for HTML and XHTML.""" |
2 | 2 |
3 # This file is based on sgmllib.py, but the API is slightly different. | 3 # This file is based on sgmllib.py, but the API is slightly different. |
4 | 4 |
5 # XXX There should be a way to distinguish between PCDATA (parsed | 5 # XXX There should be a way to distinguish between PCDATA (parsed |
6 # character data -- the normal case), RCDATA (replaceable character | 6 # character data -- the normal case), RCDATA (replaceable character |
7 # data -- only char and entity references and end tags are special) | 7 # data -- only char and entity references and end tags are special) |
8 # and CDATA (character data -- only end tags are special). | 8 # and CDATA (character data -- only end tags are special). |
9 | 9 |
10 | 10 |
(...skipping 367 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
378 if HTMLParser.entitydefs is None: | 378 if HTMLParser.entitydefs is None: |
379 entitydefs = HTMLParser.entitydefs = {'apos':"'"} | 379 entitydefs = HTMLParser.entitydefs = {'apos':"'"} |
380 for k, v in html.entities.name2codepoint.items(): | 380 for k, v in html.entities.name2codepoint.items(): |
381 entitydefs[k] = chr(v) | 381 entitydefs[k] = chr(v) |
382 try: | 382 try: |
383 return self.entitydefs[s] | 383 return self.entitydefs[s] |
384 except KeyError: | 384 except KeyError: |
385 return '&'+s+';' | 385 return '&'+s+';' |
386 | 386 |
387 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", | 387 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", |
388 replaceEntities, s) | 388 replaceEntities, s, re.ASCII) |
OLD | NEW |