Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(1679)

Side by Side Diff: Lib/urllib/parse.py

Issue 2827: [issue3300] urllib.quote and unquote - Unicode issues (Closed) Base URL: http://svn.python.org/view/*checkout*/python/branches/py3k/
Patch Set: parse.py.patch8. Created 15 years, 7 months ago. Downloaded from: http://bugs.python.org/file11069/parse.py.patch8
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« Lib/email/utils.py ('K') | « Lib/test/test_wsgiref.py ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLDNEW
1 """Parse (absolute and relative) URLs. 1 """Parse (absolute and relative) URLs.
2 2
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, 3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4 UC Irvine, June 1995. 4 UC Irvine, June 1995.
5 """ 5 """
6 6
7 import sys 7 import sys
8 8
9 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", 9 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
10 "urlsplit", "urlunsplit"] 10 "urlsplit", "urlunsplit",
11 "quote", "quote_plus", "quote_from_bytes",
12 "unquote", "unquote_plus", "unquote_to_bytes"]
11 13
12 # A classification of schemes ('' means apply by default) 14 # A classification of schemes ('' means apply by default)
13 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', 15 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
14 'wais', 'file', 'https', 'shttp', 'mms', 16 'wais', 'file', 'https', 'shttp', 'mms',
15 'prospero', 'rtsp', 'rtspu', '', 'sftp'] 17 'prospero', 'rtsp', 'rtspu', '', 'sftp']
16 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 18 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
17 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 19 'imap', 'wais', 'file', 'mms', 'https', 'shttp',
18 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', 20 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
19 'svn', 'svn+ssh', 'sftp'] 21 'svn', 'svn+ssh', 'sftp']
20 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 22 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
(...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after
253 the URL contained no fragments, the second element is the 255 the URL contained no fragments, the second element is the
254 empty string. 256 empty string.
255 """ 257 """
256 if '#' in url: 258 if '#' in url:
257 s, n, p, a, q, frag = urlparse(url) 259 s, n, p, a, q, frag = urlparse(url)
258 defrag = urlunparse((s, n, p, a, q, '')) 260 defrag = urlunparse((s, n, p, a, q, ''))
259 return defrag, frag 261 return defrag, frag
260 else: 262 else:
261 return url, '' 263 return url, ''
262 264
265 def unquote_to_bytes(s):
266 """unquote_to_bytes('abc%20def') -> b'abc def'."""
267 # Note: strings are encoded as UTF-8. This is only an issue if it contains
268 # unescaped non-ASCII characters, which URIs should not.
269 res = s.split('%')
270 res[0] = res[0].encode('utf-8')
271 for i in range(1, len(res)):
272 item = res[i]
273 try:
274 res[i] = bytes.fromhex(item[:2]) + item[2:].encode('utf-8')
275 except KeyError:
Antoine Pitrou 2008/08/07 17:29:40 If this is supposed to catch non-hex characters, i
GvR 2008/08/07 17:38:01 Right, the KeyError was from the previous incarnat
mgiuca 2008/08/10 05:53:52 Good catch. No test case caught it either! Will am
mgiuca 2008/08/10 07:45:01 Done.
276 res[i] = b'%' + item.encode('utf-8')
277 return b"".join(res)
263 278
264 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) 279 def unquote(s, encoding='utf-8', errors='strict'):
265 _hextochr.update(('%02X' % i, chr(i)) for i in range(256)) 280 """Replace %xx escapes by their single-character equivalent. The optional
281 encoding and errors parameters specify how to decode percent-encoded
282 sequences into Unicode characters, as accepted by the bytes.decode()
283 method.
284 By default, percent-encoded sequences are decoded with UTF-8, and invalid
285 sequences raise a UnicodeDecodeError.
266 286
267 def unquote(s): 287 unquote('abc%20def') -> 'abc def'.
268 """unquote('abc%20def') -> 'abc def'.""" 288 """
289 if encoding is None: encoding = 'utf-8'
290 if errors is None: errors = 'strict'
291 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
292 # (list of single-byte bytes objects)
293 pct_sequence = []
269 res = s.split('%') 294 res = s.split('%')
270 for i in range(1, len(res)): 295 for i in range(1, len(res)):
271 item = res[i] 296 item = res[i]
272 try: 297 try:
273 res[i] = _hextochr[item[:2]] + item[2:] 298 pct_sequence.append(bytes.fromhex(item[:2]))
299 rest = item[2:]
274 except KeyError: 300 except KeyError:
275 res[i] = '%' + item 301 rest = '%' + item
276 except UnicodeDecodeError: 302 if len(rest) == 0:
277 res[i] = chr(int(item[:2], 16)) + item[2:] 303 # This segment was just a single percent-encoded character.
278 return "".join(res) 304 # May be part of a sequence of code units, so delay decoding.
305 # (Stored in pct_sequence).
306 res[i] = ''
307 else:
308 # Encountered non-percent-encoded characters. Flush the current
309 # pct_sequence.
310 res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
311 pct_sequence = []
312 if len(pct_sequence) > 0:
313 # Flush the final pct_sequence
314 # res[-1] will always be empty if pct_sequence != []
315 res[-1] = b''.join(pct_sequence).decode(encoding, errors)
316 return ''.join(res)
279 317
280 def unquote_plus(s): 318 def unquote_plus(s, encoding='utf-8', errors='strict'):
281 """unquote('%7e/abc+def') -> '~/abc def'""" 319 """Like unquote(), but also replace plus signs by spaces, as required for
320 unquoting HTML form values.
321
322 unquote_plus('%7e/abc+def') -> '~/abc def'
323 """
282 s = s.replace('+', ' ') 324 s = s.replace('+', ' ')
283 return unquote(s) 325 return unquote(s, encoding, errors)
284 326
285 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 327 always_safe = frozenset(
286 'abcdefghijklmnopqrstuvwxyz' 328 b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
287 '0123456789' '_.-') 329 b'abcdefghijklmnopqrstuvwxyz'
330 b'0123456789' b'_.-')
288 _safe_quoters= {} 331 _safe_quoters= {}
289 332
290 class Quoter: 333 class Quoter:
291 def __init__(self, safe): 334 def __init__(self, safe):
335 """safe: May be either a string or bytes object."""
Antoine Pitrou 2008/08/07 17:29:40 This is contradicted by the fact that you expect i
mgiuca 2008/08/10 07:45:01 Done.
292 self.cache = {} 336 self.cache = {}
293 self.safe = safe + always_safe 337 # safe is a bytes object
338 self.safe = always_safe.union(c for c in safe if c < 128)
294 339
295 def __call__(self, c): 340 def __call__(self, c):
341 """
342 c: An int, representing a byte to be encoded. Must have range(0,256).
343 Returns a str.
344 """
296 try: 345 try:
297 return self.cache[c] 346 return self.cache[c]
298 except KeyError: 347 except KeyError:
299 if ord(c) < 256: 348 res = c in self.safe and chr(c) or ('%%%02X' % c)
300 res = (c in self.safe) and c or ('%%%02X' % ord(c)) 349 self.cache[c] = res
301 self.cache[c] = res 350 return res
302 return res
303 else:
304 return "".join(['%%%02X' % i for i in c.encode("utf-8")])
305 351
306 def quote(s, safe = '/'): 352 def quote(s, safe='/', encoding='utf-8', errors='strict'):
307 """quote('abc def') -> 'abc%20def' 353 """quote('abc def') -> 'abc%20def'
308 354
309 Each part of a URL, e.g. the path info, the query, etc., has a 355 Each part of a URL, e.g. the path info, the query, etc., has a
310 different set of reserved characters that must be quoted. 356 different set of reserved characters that must be quoted.
311 357
312 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists 358 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
313 the following reserved characters. 359 the following reserved characters.
314 360
315 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 361 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
316 "$" | "," 362 "$" | ","
317 363
318 Each of these characters is reserved in some component of a URL, 364 Each of these characters is reserved in some component of a URL,
319 but not necessarily in all of them. 365 but not necessarily in all of them.
320 366
321 By default, the quote function is intended for quoting the path 367 By default, the quote function is intended for quoting the path
322 section of a URL. Thus, it will not encode '/'. This character 368 section of a URL. Thus, it will not encode '/'. This character
323 is reserved, but in typical usage the quote function is being 369 is reserved, but in typical usage the quote function is being
324 called on a path where the existing slash characters are used as 370 called on a path where the existing slash characters are used as
325 reserved characters. 371 reserved characters.
372
373 The optional encoding and errors parameters specify how to deal with
374 non-ASCII characters, as accepted by the str.encode method.
375 By default, characters are encoded with UTF-8, and unsupported characters
376 raise a UnicodeEncodeError.
326 """ 377 """
378 if encoding is None: encoding = 'utf-8'
379 if errors is None: errors = 'strict'
380 if isinstance(safe, str):
381 # Normalize 'safe' by converting to bytes and removing non-ASCII chars
382 safe = safe.encode('ascii', 'ignore')
327 cachekey = (safe, always_safe) 383 cachekey = (safe, always_safe)
384 if isinstance(s, str):
385 s = s.encode(encoding, errors)
328 try: 386 try:
329 quoter = _safe_quoters[cachekey] 387 quoter = _safe_quoters[cachekey]
330 except KeyError: 388 except KeyError:
331 quoter = Quoter(safe) 389 quoter = Quoter(safe)
332 _safe_quoters[cachekey] = quoter 390 _safe_quoters[cachekey] = quoter
333 res = map(quoter, s) 391 res = map(quoter, s)
334 return ''.join(res) 392 return ''.join(res)
335 393
336 def quote_plus(s, safe = ''): 394 def quote_plus(s, safe='', encoding='utf-8', errors='strict'):
337 """Quote the query fragment of a URL; replacing ' ' with '+'""" 395 """Like quote(), but also replace ' ' with '+', as required for quoting
338 if ' ' in s: 396 HTML form values. Plus signs in the original string are escaped unless
339 s = quote(s, safe + ' ') 397 they are included in safe. It also does not have safe default to '/'.
398 """
399 # Check if ' ' in s, where s may either be a str or bytes
400 if ' ' in s if isinstance(s, str) else b' ' in s:
401 s = quote(s, safe + ' ' if isinstance(safe, str) else safe + b' ')
340 return s.replace(' ', '+') 402 return s.replace(' ', '+')
341 return quote(s, safe) 403 return quote(s, safe, encoding, errors)
404
405 def quote_from_bytes(s, safe='/'):
406 if isinstance(safe, str):
407 # Normalize 'safe' by converting to bytes and removing non-ASCII chars
408 safe = safe.encode('ascii', 'ignore')
409 cachekey = (safe, always_safe)
410 if not isinstance(s, bytes) or isinstance(s, bytearray):
411 raise TypeError("quote_from_bytes() expected a bytes")
412 try:
413 quoter = _safe_quoters[cachekey]
414 except KeyError:
415 quoter = Quoter(safe)
416 _safe_quoters[cachekey] = quoter
417 res = map(quoter, s)
418 return ''.join(res)
342 419
343 def urlencode(query,doseq=0): 420 def urlencode(query,doseq=0):
344 """Encode a sequence of two-element tuples or dictionary into a URL query string. 421 """Encode a sequence of two-element tuples or dictionary into a URL query string.
345 422
346 If any values in the query arg are sequences and doseq is true, each 423 If any values in the query arg are sequences and doseq is true, each
347 sequence element is converted to a separate parameter. 424 sequence element is converted to a separate parameter.
348 425
349 If the query arg is a sequence of two-element tuples, the order of the 426 If the query arg is a sequence of two-element tuples, the order of the
350 parameters in the output will match the order of parameters in the 427 parameters in the output will match the order of parameters in the
351 input. 428 input.
(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after
622 if not base: 699 if not base:
623 base = abs 700 base = abs
624 wrapped = '<URL:%s>' % abs 701 wrapped = '<URL:%s>' % abs
625 print('%-10s = %s' % (url, wrapped)) 702 print('%-10s = %s' % (url, wrapped))
626 if len(words) == 3 and words[1] == '=': 703 if len(words) == 3 and words[1] == '=':
627 if wrapped != words[2]: 704 if wrapped != words[2]:
628 print('EXPECTED', words[2], '!!!!!!!!!!') 705 print('EXPECTED', words[2], '!!!!!!!!!!')
629 706
630 if __name__ == '__main__': 707 if __name__ == '__main__':
631 test() 708 test()
OLDNEW
« Lib/email/utils.py ('K') | « Lib/test/test_wsgiref.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b