Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(91)

Side by Side Diff: Objects/unicodeobject.c

Issue 767: [issue2630] repr() should not escape non-ASCII characters (Closed) SVN Base: http://svn.python.org/view/*checkout*/python/branches/py3k/
Patch Set: Created 1 year, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 2
3 Unicode implementation based on original code by Fredrik Lundh, 3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt). 5 Unicode Integration Proposal (see file Misc/unicode.txt).
6 6
7 Major speed upgrades to the method implementations at the Reykjavik 7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9 9
10 Copyright (c) Corporation for National Research Initiatives. 10 Copyright (c) Corporation for National Research Initiatives.
(...skipping 2395 matching lines...) Expand 10 before | Expand all | Expand 10 after
2406 else if (byteorder == 1) { 2406 else if (byteorder == 1) {
2407 /* force BE */ 2407 /* force BE */
2408 iorder[0] = 3; 2408 iorder[0] = 3;
2409 iorder[1] = 2; 2409 iorder[1] = 2;
2410 iorder[2] = 1; 2410 iorder[2] = 1;
2411 iorder[3] = 0; 2411 iorder[3] = 0;
2412 } 2412 }
2413 2413
2414 while (size-- > 0) { 2414 while (size-- > 0) {
2415 Py_UCS4 ch = *s++; 2415 Py_UCS4 ch = *s++;
2416
2416 #ifndef Py_UNICODE_WIDE 2417 #ifndef Py_UNICODE_WIDE
2417 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2418 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2418 Py_UCS4 ch2 = *s; 2419 Py_UCS4 ch2 = *s;
2419 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2420 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2420 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2421 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2421 s++; 2422 s++;
2422 size--; 2423 size--;
2423 } 2424 }
2424 } 2425 }
2425 #endif 2426 #endif
(...skipping 5084 matching lines...) Expand 10 before | Expand all | Expand 10 after
7510 if (repr == NULL) 7511 if (repr == NULL)
7511 return NULL; 7512 return NULL;
7512 7513
7513 p = PyUnicode_AS_UNICODE(repr); 7514 p = PyUnicode_AS_UNICODE(repr);
7514 7515
7515 /* Add quote */ 7516 /* Add quote */
7516 *p++ = (findchar(s, size, '\'') && 7517 *p++ = (findchar(s, size, '\'') &&
7517 !findchar(s, size, '"')) ? '"' : '\''; 7518 !findchar(s, size, '"')) ? '"' : '\'';
7518 while (size-- > 0) { 7519 while (size-- > 0) {
7519 Py_UNICODE ch = *s++; 7520 Py_UNICODE ch = *s++;
7520
7521 /* Escape quotes and backslashes */ 7521 /* Escape quotes and backslashes */
7522 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7522 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7523 *p++ = '\\'; 7523 *p++ = '\\';
7524 *p++ = ch; 7524 *p++ = ch;
7525 continue; 7525 continue;
7526 } 7526 }
7527 7527
7528 #ifdef Py_UNICODE_WIDE 7528 » /* Map special whitespace to '\t', '\n', '\r' */
7529 /* Map 21-bit characters to '\U00xxxxxx' */ 7529 if (ch == '\t') {
7530 else if (ch >= 0x10000) {
7531 *p++ = '\\';
7532 *p++ = 'U';
7533 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7534 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7535 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7536 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7537 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7538 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7539 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7540 *p++ = hexdigits[ch & 0x0000000F];
7541 » continue;
7542 }
7543 #else
7544 » /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7545 » else if (ch >= 0xD800 && ch < 0xDC00) {
7546 » Py_UNICODE ch2;
7547 » Py_UCS4 ucs;
7548
7549 » ch2 = *s++;
7550 » size--;
7551 » if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7552 » » ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7553 » » *p++ = '\\';
7554 » » *p++ = 'U';
7555 » » *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7556 » » *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7557 » » *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7558 » » *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7559 » » *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7560 » » *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7561 » » *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7562 » » *p++ = hexdigits[ucs & 0x0000000F];
7563 » » continue;
7564 » }
7565 » /* Fall through: isolated surrogates are copied as-is */
7566 » s--;
7567 » size++;
7568 » }
7569 #endif
7570
7571 /* Map 16-bit characters to '\uxxxx' */
7572 if (ch >= 256) {
7573 *p++ = '\\';
7574 *p++ = 'u';
7575 *p++ = hexdigits[(ch >> 12) & 0x000F];
7576 *p++ = hexdigits[(ch >> 8) & 0x000F];
7577 *p++ = hexdigits[(ch >> 4) & 0x000F];
7578 *p++ = hexdigits[ch & 0x000F];
7579 }
7580
7581 /* Map special whitespace to '\t', '\n', '\r' */
7582 else if (ch == '\t') {
7583 *p++ = '\\'; 7530 *p++ = '\\';
7584 *p++ = 't'; 7531 *p++ = 't';
7585 } 7532 }
7586 else if (ch == '\n') { 7533 else if (ch == '\n') {
7587 *p++ = '\\'; 7534 *p++ = '\\';
7588 *p++ = 'n'; 7535 *p++ = 'n';
7589 } 7536 }
7590 else if (ch == '\r') { 7537 else if (ch == '\r') {
7591 *p++ = '\\'; 7538 *p++ = '\\';
7592 *p++ = 'r'; 7539 *p++ = 'r';
7593 } 7540 }
7594 7541
7595 /* Map non-printable US ASCII to '\xhh' */ 7542 /* Map non-printable US ASCII to '\xhh' */
7596 else if (ch < ' ' || ch >= 0x7F) { 7543 else if (ch < ' ' || ch == 0x7F) {
7597 *p++ = '\\'; 7544 *p++ = '\\';
7598 *p++ = 'x'; 7545 *p++ = 'x';
7599 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7546 *p++ = hexdigits[(ch >> 4) & 0x000F];
7600 *p++ = hexdigits[ch & 0x000F]; 7547 *p++ = hexdigits[ch & 0x000F];
7601 } 7548 }
7602 7549
7603 /* Copy everything else as-is */ 7550 /* Copy ASCII characters as-is */
7604 else 7551 else if (ch < 0x7F) {
7605 *p++ = (char) ch; 7552 *p++ = ch;
7553 }
7554
7555 » /* Non-ASCII characters */
7556 else {
7557 Py_UCS4 ucs = ch;
7558
7559 #ifndef Py_UNICODE_WIDE
7560 Py_UNICODE ch2 = 0;
7561 /* Get code point from surrogate pair */
7562 if (size > 0) {
7563 ch2 = *s;
7564 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7565 && ch2 <= 0xDFFF) {
7566 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7567 + 0x00010000;
7568 s++;
7569 size--;
7570 }
7571 }
7572 #endif
7573 /* Map Unicode whitespace and control characters
7574 (categories Z* and C* except ASCII space)
7575 */
7576 if (Py_UNICODE_ISHEXESCAPED(ucs)) {
7577 » /* Map 8-bit characters to '\xhh' */
7578 » » if (ucs <= 0xff) {
7579 » » *p++ = '\\';
7580 » » *p++ = 'x';
7581 » » *p++ = hexdigits[(ch >> 4) & 0x000F];
7582 » » *p++ = hexdigits[ch & 0x000F];
7583 » » }
7584 /* Map 21-bit characters to '\U00xxxxxx' */
7585 else if (ucs >= 0x10000) {
7586 *p++ = '\\';
7587 *p++ = 'U';
7588 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7589 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7590 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7591 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7592 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7593 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7594 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7595 *p++ = hexdigits[ucs & 0x0000000F];
7596 }
7597 /* Map 16-bit characters to '\uxxxx' */
7598 else {
7599 *p++ = '\\';
7600 *p++ = 'u';
7601 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7602 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7603 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7604 *p++ = hexdigits[ucs & 0x000F];
7605 }
7606 }
7607 /* Copy characters as-is */
7608 else {
7609 *p++ = ch;
7610 #ifndef Py_UNICODE_WIDE
7611 if (ucs >= 0x10000)
7612 *p++ = ch2;
7613 #endif
7614 }
7615 }
7606 } 7616 }
7607 /* Add quote */ 7617 /* Add quote */
7608 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7618 *p++ = PyUnicode_AS_UNICODE(repr)[0];
7609 7619
7610 *p = '\0'; 7620 *p = '\0';
7611 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7621 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7612 return repr; 7622 return repr;
7613 } 7623 }
7614 7624
7615 PyDoc_STRVAR(rfind__doc__, 7625 PyDoc_STRVAR(rfind__doc__,
(...skipping 1910 matching lines...) Expand 10 before | Expand all | Expand 10 after
9526 } 9536 }
9527 #endif 9537 #endif
9528 9538
9529 9539
9530 /* 9540 /*
9531 Local variables: 9541 Local variables:
9532 c-basic-offset: 4 9542 c-basic-offset: 4
9533 indent-tabs-mode: nil 9543 indent-tabs-mode: nil
9534 End: 9544 End:
9535 */ 9545 */
OLDNEW

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld r497