Left: | ||
Right: |
OLD | NEW |
---|---|
1 """Parse (absolute and relative) URLs. | 1 """Parse (absolute and relative) URLs. |
2 | 2 |
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, | 3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, |
4 UC Irvine, June 1995. | 4 UC Irvine, June 1995. |
5 """ | 5 """ |
6 | 6 |
7 import sys | 7 import sys |
8 | 8 |
9 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", | 9 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", |
10 "urlsplit", "urlunsplit"] | 10 "urlsplit", "urlunsplit", |
11 "quote", "quote_plus", "quote_from_bytes", | |
12 "unquote", "unquote_plus", "unquote_to_bytes"] | |
11 | 13 |
12 # A classification of schemes ('' means apply by default) | 14 # A classification of schemes ('' means apply by default) |
13 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', | 15 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', |
14 'wais', 'file', 'https', 'shttp', 'mms', | 16 'wais', 'file', 'https', 'shttp', 'mms', |
15 'prospero', 'rtsp', 'rtspu', '', 'sftp'] | 17 'prospero', 'rtsp', 'rtspu', '', 'sftp'] |
16 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', | 18 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', |
17 'imap', 'wais', 'file', 'mms', 'https', 'shttp', | 19 'imap', 'wais', 'file', 'mms', 'https', 'shttp', |
18 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', | 20 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', |
19 'svn', 'svn+ssh', 'sftp'] | 21 'svn', 'svn+ssh', 'sftp'] |
20 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', | 22 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', |
(...skipping 232 matching lines...) | |
253 the URL contained no fragments, the second element is the | 255 the URL contained no fragments, the second element is the |
254 empty string. | 256 empty string. |
255 """ | 257 """ |
256 if '#' in url: | 258 if '#' in url: |
257 s, n, p, a, q, frag = urlparse(url) | 259 s, n, p, a, q, frag = urlparse(url) |
258 defrag = urlunparse((s, n, p, a, q, '')) | 260 defrag = urlunparse((s, n, p, a, q, '')) |
259 return defrag, frag | 261 return defrag, frag |
260 else: | 262 else: |
261 return url, '' | 263 return url, '' |
262 | 264 |
265 def unquote_to_bytes(s): | |
266 """unquote_to_bytes('abc%20def') -> b'abc def'.""" | |
267 # Note: strings are encoded as UTF-8. This is only an issue if it contains | |
268 # unescaped non-ASCII characters, which URIs should not. | |
269 res = s.split('%') | |
270 res[0] = res[0].encode('utf-8') | |
271 for i in range(1, len(res)): | |
272 item = res[i] | |
273 try: | |
274 res[i] = bytes.fromhex(item[:2]) + item[2:].encode('utf-8') | |
275 except KeyError: | |
Antoine Pitrou
2008/08/07 17:29:40
If this is supposed to catch non-hex characters, i
GvR
2008/08/07 17:38:01
Right, the KeyError was from the previous incarnat
mgiuca
2008/08/10 05:53:52
Good catch. No test case caught it either! Will am
mgiuca
2008/08/10 07:45:01
Done.
| |
276 res[i] = b'%' + item.encode('utf-8') | |
277 return b"".join(res) | |
263 | 278 |
264 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) | 279 def unquote(s, encoding='utf-8', errors='strict'): |
265 _hextochr.update(('%02X' % i, chr(i)) for i in range(256)) | 280 """Replace %xx escapes by their single-character equivalent. The optional |
281 encoding and errors parameters specify how to decode percent-encoded | |
282 sequences into Unicode characters, as accepted by the bytes.decode() | |
283 method. | |
284 By default, percent-encoded sequences are decoded with UTF-8, and invalid | |
285 sequences raise a UnicodeDecodeError. | |
266 | 286 |
267 def unquote(s): | 287 unquote('abc%20def') -> 'abc def'. |
268 """unquote('abc%20def') -> 'abc def'.""" | 288 """ |
289 if encoding is None: encoding = 'utf-8' | |
290 if errors is None: errors = 'strict' | |
291 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded | |
292 # (list of single-byte bytes objects) | |
293 pct_sequence = [] | |
269 res = s.split('%') | 294 res = s.split('%') |
270 for i in range(1, len(res)): | 295 for i in range(1, len(res)): |
271 item = res[i] | 296 item = res[i] |
272 try: | 297 try: |
273 res[i] = _hextochr[item[:2]] + item[2:] | 298 pct_sequence.append(bytes.fromhex(item[:2])) |
299 rest = item[2:] | |
274 except KeyError: | 300 except KeyError: |
275 res[i] = '%' + item | 301 rest = '%' + item |
276 except UnicodeDecodeError: | 302 if len(rest) == 0: |
277 res[i] = chr(int(item[:2], 16)) + item[2:] | 303 # This segment was just a single percent-encoded character. |
278 return "".join(res) | 304 # May be part of a sequence of code units, so delay decoding. |
305 # (Stored in pct_sequence). | |
306 res[i] = '' | |
307 else: | |
308 # Encountered non-percent-encoded characters. Flush the current | |
309 # pct_sequence. | |
310 res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest | |
311 pct_sequence = [] | |
312 if len(pct_sequence) > 0: | |
313 # Flush the final pct_sequence | |
314 # res[-1] will always be empty if pct_sequence != [] | |
315 res[-1] = b''.join(pct_sequence).decode(encoding, errors) | |
316 return ''.join(res) | |
279 | 317 |
280 def unquote_plus(s): | 318 def unquote_plus(s, encoding='utf-8', errors='strict'): |
281 """unquote('%7e/abc+def') -> '~/abc def'""" | 319 """Like unquote(), but also replace plus signs by spaces, as required for |
320 unquoting HTML form values. | |
321 | |
322 unquote_plus('%7e/abc+def') -> '~/abc def' | |
323 """ | |
282 s = s.replace('+', ' ') | 324 s = s.replace('+', ' ') |
283 return unquote(s) | 325 return unquote(s, encoding, errors) |
284 | 326 |
285 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' | 327 always_safe = frozenset( |
286 'abcdefghijklmnopqrstuvwxyz' | 328 b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' |
287 '0123456789' '_.-') | 329 b'abcdefghijklmnopqrstuvwxyz' |
330 b'0123456789' b'_.-') | |
288 _safe_quoters= {} | 331 _safe_quoters= {} |
289 | 332 |
290 class Quoter: | 333 class Quoter: |
291 def __init__(self, safe): | 334 def __init__(self, safe): |
335 """safe: May be either a string or bytes object.""" | |
Antoine Pitrou
2008/08/07 17:29:40
This is contradicted by the fact that you expect i
mgiuca
2008/08/10 07:45:01
Done.
| |
292 self.cache = {} | 336 self.cache = {} |
293 self.safe = safe + always_safe | 337 # safe is a bytes object |
338 self.safe = always_safe.union(c for c in safe if c < 128) | |
294 | 339 |
295 def __call__(self, c): | 340 def __call__(self, c): |
341 """ | |
342 c: An int, representing a byte to be encoded. Must have range(0,256). | |
343 Returns a str. | |
344 """ | |
296 try: | 345 try: |
297 return self.cache[c] | 346 return self.cache[c] |
298 except KeyError: | 347 except KeyError: |
299 if ord(c) < 256: | 348 res = c in self.safe and chr(c) or ('%%%02X' % c) |
300 res = (c in self.safe) and c or ('%%%02X' % ord(c)) | 349 self.cache[c] = res |
301 self.cache[c] = res | 350 return res |
302 return res | |
303 else: | |
304 return "".join(['%%%02X' % i for i in c.encode("utf-8")]) | |
305 | 351 |
306 def quote(s, safe = '/'): | 352 def quote(s, safe='/', encoding='utf-8', errors='strict'): |
307 """quote('abc def') -> 'abc%20def' | 353 """quote('abc def') -> 'abc%20def' |
308 | 354 |
309 Each part of a URL, e.g. the path info, the query, etc., has a | 355 Each part of a URL, e.g. the path info, the query, etc., has a |
310 different set of reserved characters that must be quoted. | 356 different set of reserved characters that must be quoted. |
311 | 357 |
312 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists | 358 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists |
313 the following reserved characters. | 359 the following reserved characters. |
314 | 360 |
315 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | | 361 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | |
316 "$" | "," | 362 "$" | "," |
317 | 363 |
318 Each of these characters is reserved in some component of a URL, | 364 Each of these characters is reserved in some component of a URL, |
319 but not necessarily in all of them. | 365 but not necessarily in all of them. |
320 | 366 |
321 By default, the quote function is intended for quoting the path | 367 By default, the quote function is intended for quoting the path |
322 section of a URL. Thus, it will not encode '/'. This character | 368 section of a URL. Thus, it will not encode '/'. This character |
323 is reserved, but in typical usage the quote function is being | 369 is reserved, but in typical usage the quote function is being |
324 called on a path where the existing slash characters are used as | 370 called on a path where the existing slash characters are used as |
325 reserved characters. | 371 reserved characters. |
372 | |
373 The optional encoding and errors parameters specify how to deal with | |
374 non-ASCII characters, as accepted by the str.encode method. | |
375 By default, characters are encoded with UTF-8, and unsupported characters | |
376 raise a UnicodeEncodeError. | |
326 """ | 377 """ |
378 if encoding is None: encoding = 'utf-8' | |
379 if errors is None: errors = 'strict' | |
380 if isinstance(safe, str): | |
381 # Normalize 'safe' by converting to bytes and removing non-ASCII chars | |
382 safe = safe.encode('ascii', 'ignore') | |
327 cachekey = (safe, always_safe) | 383 cachekey = (safe, always_safe) |
384 if isinstance(s, str): | |
385 s = s.encode(encoding, errors) | |
328 try: | 386 try: |
329 quoter = _safe_quoters[cachekey] | 387 quoter = _safe_quoters[cachekey] |
330 except KeyError: | 388 except KeyError: |
331 quoter = Quoter(safe) | 389 quoter = Quoter(safe) |
332 _safe_quoters[cachekey] = quoter | 390 _safe_quoters[cachekey] = quoter |
333 res = map(quoter, s) | 391 res = map(quoter, s) |
334 return ''.join(res) | 392 return ''.join(res) |
335 | 393 |
336 def quote_plus(s, safe = ''): | 394 def quote_plus(s, safe='', encoding='utf-8', errors='strict'): |
337 """Quote the query fragment of a URL; replacing ' ' with '+'""" | 395 """Like quote(), but also replace ' ' with '+', as required for quoting |
338 if ' ' in s: | 396 HTML form values. Plus signs in the original string are escaped unless |
339 s = quote(s, safe + ' ') | 397 they are included in safe. It also does not have safe default to '/'. |
398 """ | |
399 # Check if ' ' in s, where s may either be a str or bytes | |
400 if ' ' in s if isinstance(s, str) else b' ' in s: | |
401 s = quote(s, safe + ' ' if isinstance(safe, str) else safe + b' ') | |
340 return s.replace(' ', '+') | 402 return s.replace(' ', '+') |
341 return quote(s, safe) | 403 return quote(s, safe, encoding, errors) |
404 | |
405 def quote_from_bytes(s, safe='/'): | |
406 if isinstance(safe, str): | |
407 # Normalize 'safe' by converting to bytes and removing non-ASCII chars | |
408 safe = safe.encode('ascii', 'ignore') | |
409 cachekey = (safe, always_safe) | |
410 if not isinstance(s, bytes) or isinstance(s, bytearray): | |
411 raise TypeError("quote_from_bytes() expected a bytes") | |
412 try: | |
413 quoter = _safe_quoters[cachekey] | |
414 except KeyError: | |
415 quoter = Quoter(safe) | |
416 _safe_quoters[cachekey] = quoter | |
417 res = map(quoter, s) | |
418 return ''.join(res) | |
342 | 419 |
343 def urlencode(query,doseq=0): | 420 def urlencode(query,doseq=0): |
344 """Encode a sequence of two-element tuples or dictionary into a URL query string. | 421 """Encode a sequence of two-element tuples or dictionary into a URL query string. |
345 | 422 |
346 If any values in the query arg are sequences and doseq is true, each | 423 If any values in the query arg are sequences and doseq is true, each |
347 sequence element is converted to a separate parameter. | 424 sequence element is converted to a separate parameter. |
348 | 425 |
349 If the query arg is a sequence of two-element tuples, the order of the | 426 If the query arg is a sequence of two-element tuples, the order of the |
350 parameters in the output will match the order of parameters in the | 427 parameters in the output will match the order of parameters in the |
351 input. | 428 input. |
(...skipping 270 matching lines...) | |
622 if not base: | 699 if not base: |
623 base = abs | 700 base = abs |
624 wrapped = '<URL:%s>' % abs | 701 wrapped = '<URL:%s>' % abs |
625 print('%-10s = %s' % (url, wrapped)) | 702 print('%-10s = %s' % (url, wrapped)) |
626 if len(words) == 3 and words[1] == '=': | 703 if len(words) == 3 and words[1] == '=': |
627 if wrapped != words[2]: | 704 if wrapped != words[2]: |
628 print('EXPECTED', words[2], '!!!!!!!!!!') | 705 print('EXPECTED', words[2], '!!!!!!!!!!') |
629 | 706 |
630 if __name__ == '__main__': | 707 if __name__ == '__main__': |
631 test() | 708 test() |
OLD | NEW |