Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(16)

Side by Side Diff: Objects/unicodectype.c

Issue 767: [issue2630] repr() should not escape non-ASCII characters (Closed) SVN Base: http://svn.python.org/view/*checkout*/python/branches/py3k/
Patch Set: Created 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
OLD | NEW
1 /* 1 /*
2 Unicode character type helpers. 2 Unicode character type helpers.
3 3
4 Written by Marc-Andre Lemburg (mal@lemburg.com). 4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) 5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
6 6
7 Copyright (c) Corporation for National Research Initiatives. 7 Copyright (c) Corporation for National Research Initiatives.
8 8
9 */ 9 */
10 10
11 #include "Python.h" 11 #include "Python.h"
12 #include "unicodeobject.h" 12 #include "unicodeobject.h"
13 13
14 #define ALPHA_MASK 0x01 14 #define ALPHA_MASK 0x01
15 #define DECIMAL_MASK 0x02 15 #define DECIMAL_MASK 0x02
16 #define DIGIT_MASK 0x04 16 #define DIGIT_MASK 0x04
17 #define LOWER_MASK 0x08 17 #define LOWER_MASK 0x08
18 #define LINEBREAK_MASK 0x10 18 #define LINEBREAK_MASK 0x10
19 #define SPACE_MASK 0x20 19 #define SPACE_MASK 0x20
20 #define TITLE_MASK 0x40 20 #define TITLE_MASK 0x40
21 #define UPPER_MASK 0x80 21 #define UPPER_MASK 0x80
22 #define XID_START_MASK 0x100 22 #define XID_START_MASK 0x100
23 #define XID_CONTINUE_MASK 0x200 23 #define XID_CONTINUE_MASK 0x200
24 #define HEX_ESCAPE_MASK 0x400
24 25
25 typedef struct { 26 typedef struct {
26 const Py_UNICODE upper; 27 const Py_UNICODE upper;
27 const Py_UNICODE lower; 28 const Py_UNICODE lower;
28 const Py_UNICODE title; 29 const Py_UNICODE title;
29 const unsigned char decimal; 30 const unsigned char decimal;
30 const unsigned char digit; 31 const unsigned char digit;
31 const unsigned short flags; 32 const unsigned short flags;
32 } _PyUnicode_TypeRecord; 33 } _PyUnicode_TypeRecord;
33 34
34 #include "unicodetype_db.h" 35 #include "unicodetype_db.h"
35 36
36 static const _PyUnicode_TypeRecord * 37 static const _PyUnicode_TypeRecord *
37 gettyperecord(Py_UNICODE code) 38 gettyperecord(Py_UNICODE code)
38 { 39 {
39 int index; 40 int index;
40 41
41 #ifdef Py_UNICODE_WIDE 42 #ifdef Py_UNICODE_WIDE
42 if (code >= 0x110000) 43 if (code >= 0x110000)
43 index = 0; 44 index = 0;
44 else 45 else
45 #endif 46 #endif
46 { 47 {
47 index = index1[(code>>SHIFT)]; 48 index = index1[(code>>SHIFT)];
48 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; 49 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
49 } 50 }
50 51
51 return &_PyUnicode_TypeRecords[index]; 52 return &_PyUnicode_TypeRecords[index];
52 } 53 }
53 54
54 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or 55 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
55 type 'B', 0 otherwise. */ 56 type 'B', 0 otherwise. */
56 57
57 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) 58 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
58 { 59 {
59 switch (ch) { 60 switch (ch) {
60 case 0x000A: /* LINE FEED */ 61 case 0x000A: /* LINE FEED */
61 case 0x000D: /* CARRIAGE RETURN */ 62 case 0x000D: /* CARRIAGE RETURN */
62 case 0x001C: /* FILE SEPARATOR */ 63 case 0x001C: /* FILE SEPARATOR */
63 case 0x001D: /* GROUP SEPARATOR */ 64 case 0x001D: /* GROUP SEPARATOR */
64 case 0x001E: /* RECORD SEPARATOR */ 65 case 0x001E: /* RECORD SEPARATOR */
65 case 0x0085: /* NEXT LINE */ 66 case 0x0085: /* NEXT LINE */
66 case 0x2028: /* LINE SEPARATOR */ 67 case 0x2028: /* LINE SEPARATOR */
67 case 0x2029: /* PARAGRAPH SEPARATOR */ 68 case 0x2029: /* PARAGRAPH SEPARATOR */
68 return 1; 69 return 1;
69 default: 70 default:
70 return 0; 71 return 0;
71 } 72 }
72 } 73 }
73 74
(...skipping 553 matching lines...) | Show 10 above | Show 10 below
627 #endif 628 #endif
628 return (double) 8; 629 return (double) 8;
629 case 0x1379: 630 case 0x1379:
630 #ifdef Py_UNICODE_WIDE 631 #ifdef Py_UNICODE_WIDE
631 case 0x10117: 632 case 0x10117:
632 #endif 633 #endif
633 return (double) 80; 634 return (double) 80;
634 #ifdef Py_UNICODE_WIDE 635 #ifdef Py_UNICODE_WIDE
635 case 0x10120: 636 case 0x10120:
636 return (double) 800; 637 return (double) 800;
637 case 0x10129: 638 case 0x10129:
638 return (double) 8000; 639 return (double) 8000;
639 case 0x10132: 640 case 0x10132:
640 return (double) 80000; 641 return (double) 80000;
641 #endif 642 #endif
642 case 0x17F9: 643 case 0x17F9:
643 case 0x2168: 644 case 0x2168:
644 case 0x2178: 645 case 0x2178:
645 case 0x3029: 646 case 0x3029:
646 case 0x3228: 647 case 0x3228:
647 case 0x3288: 648 case 0x3288:
648 #ifdef Py_UNICODE_WIDE 649 #ifdef Py_UNICODE_WIDE
649 case 0x1010F: 650 case 0x1010F:
650 #endif 651 #endif
651 return (double) 9; 652 return (double) 9;
652 case 0x0F2E: 653 case 0x0F2E:
653 return (double) 9 / 2; 654 return (double) 9 / 2;
654 case 0x137A: 655 case 0x137A:
655 #ifdef Py_UNICODE_WIDE 656 #ifdef Py_UNICODE_WIDE
656 case 0x10118: 657 case 0x10118:
657 #endif 658 #endif
658 return (double) 90; 659 return (double) 90;
659 #ifdef Py_UNICODE_WIDE 660 #ifdef Py_UNICODE_WIDE
660 case 0x10121: 661 case 0x10121:
661 case 0x1034A: 662 case 0x1034A:
662 return (double) 900; 663 return (double) 900;
663 case 0x1012A: 664 case 0x1012A:
664 return (double) 9000; 665 return (double) 9000;
665 case 0x10133: 666 case 0x10133:
666 return (double) 90000; 667 return (double) 90000;
667 #endif 668 #endif
668 default: 669 default:
669 return (double) _PyUnicode_ToDigit(ch); 670 return (double) _PyUnicode_ToDigit(ch);
670 } 671 }
671 } 672 }
672 673
673 int _PyUnicode_IsNumeric(Py_UNICODE ch) 674 int _PyUnicode_IsNumeric(Py_UNICODE ch)
674 { 675 {
675 return _PyUnicode_ToNumeric(ch) != -1.0; 676 return _PyUnicode_ToNumeric(ch) != -1.0;
676 } 677 }
678
679 /* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
680 0 otherwise. */
681
682 int _PyUnicode_IsHexEscaped(Py_UNICODE ch)
683 {
684 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
685
686 return (ctype->flags & HEX_ESCAPE_MASK) != 0;
687 }
688
677 689
678 #ifndef WANT_WCTYPE_FUNCTIONS 690 #ifndef WANT_WCTYPE_FUNCTIONS
679 691
680 /* Returns 1 for Unicode characters having the bidirectional type 692 /* Returns 1 for Unicode characters having the bidirectional type
681 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */ 693 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
682 694
683 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch) 695 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
684 { 696 {
685 switch (ch) { 697 switch (ch) {
686 case 0x0009: /* HORIZONTAL TABULATION */ 698 case 0x0009: /* HORIZONTAL TABULATION */
687 case 0x000A: /* LINE FEED */ 699 case 0x000A: /* LINE FEED */
688 case 0x000B: /* VERTICAL TABULATION */ 700 case 0x000B: /* VERTICAL TABULATION */
689 case 0x000C: /* FORM FEED */ 701 case 0x000C: /* FORM FEED */
690 case 0x000D: /* CARRIAGE RETURN */ 702 case 0x000D: /* CARRIAGE RETURN */
691 case 0x001C: /* FILE SEPARATOR */ 703 case 0x001C: /* FILE SEPARATOR */
692 case 0x001D: /* GROUP SEPARATOR */ 704 case 0x001D: /* GROUP SEPARATOR */
693 case 0x001E: /* RECORD SEPARATOR */ 705 case 0x001E: /* RECORD SEPARATOR */
694 case 0x001F: /* UNIT SEPARATOR */ 706 case 0x001F: /* UNIT SEPARATOR */
695 case 0x0020: /* SPACE */ 707 case 0x0020: /* SPACE */
696 case 0x0085: /* NEXT LINE */ 708 case 0x0085: /* NEXT LINE */
697 case 0x00A0: /* NO-BREAK SPACE */ 709 case 0x00A0: /* NO-BREAK SPACE */
698 case 0x1680: /* OGHAM SPACE MARK */ 710 case 0x1680: /* OGHAM SPACE MARK */
699 case 0x2000: /* EN QUAD */ 711 case 0x2000: /* EN QUAD */
700 case 0x2001: /* EM QUAD */ 712 case 0x2001: /* EM QUAD */
701 case 0x2002: /* EN SPACE */ 713 case 0x2002: /* EN SPACE */
702 case 0x2003: /* EM SPACE */ 714 case 0x2003: /* EM SPACE */
703 case 0x2004: /* THREE-PER-EM SPACE */ 715 case 0x2004: /* THREE-PER-EM SPACE */
704 case 0x2005: /* FOUR-PER-EM SPACE */ 716 case 0x2005: /* FOUR-PER-EM SPACE */
705 case 0x2006: /* SIX-PER-EM SPACE */ 717 case 0x2006: /* SIX-PER-EM SPACE */
706 case 0x2007: /* FIGURE SPACE */ 718 case 0x2007: /* FIGURE SPACE */
707 case 0x2008: /* PUNCTUATION SPACE */ 719 case 0x2008: /* PUNCTUATION SPACE */
708 case 0x2009: /* THIN SPACE */ 720 case 0x2009: /* THIN SPACE */
709 case 0x200A: /* HAIR SPACE */ 721 case 0x200A: /* HAIR SPACE */
710 case 0x200B: /* ZERO WIDTH SPACE */ 722 case 0x200B: /* ZERO WIDTH SPACE */
711 case 0x2028: /* LINE SEPARATOR */ 723 case 0x2028: /* LINE SEPARATOR */
712 case 0x2029: /* PARAGRAPH SEPARATOR */ 724 case 0x2029: /* PARAGRAPH SEPARATOR */
713 case 0x202F: /* NARROW NO-BREAK SPACE */ 725 case 0x202F: /* NARROW NO-BREAK SPACE */
714 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */ 726 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
715 case 0x3000: /* IDEOGRAPHIC SPACE */ 727 case 0x3000: /* IDEOGRAPHIC SPACE */
716 return 1; 728 return 1;
717 default: 729 default:
718 return 0; 730 return 0;
719 } 731 }
720 } 732 }
721 733
722 /* Returns 1 for Unicode characters having the category 'Ll', 0 734 /* Returns 1 for Unicode characters having the category 'Ll', 0
723 otherwise. */ 735 otherwise. */
724 736
725 int _PyUnicode_IsLowercase(Py_UNICODE ch) 737 int _PyUnicode_IsLowercase(Py_UNICODE ch)
726 { 738 {
727 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 739 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
728 740
729 return (ctype->flags & LOWER_MASK) != 0; 741 return (ctype->flags & LOWER_MASK) != 0;
730 } 742 }
731 743
732 /* Returns 1 for Unicode characters having the category 'Lu', 0 744 /* Returns 1 for Unicode characters having the category 'Lu', 0
733 otherwise. */ 745 otherwise. */
734 746
735 int _PyUnicode_IsUppercase(Py_UNICODE ch) 747 int _PyUnicode_IsUppercase(Py_UNICODE ch)
736 { 748 {
737 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 749 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
738 750
739 return (ctype->flags & UPPER_MASK) != 0; 751 return (ctype->flags & UPPER_MASK) != 0;
740 } 752 }
741 753
742 /* Returns the uppercase Unicode characters corresponding to ch or just 754 /* Returns the uppercase Unicode characters corresponding to ch or just
743 ch if no uppercase mapping is known. */ 755 ch if no uppercase mapping is known. */
744 756
745 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) 757 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
746 { 758 {
747 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 759 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
748 int delta = ctype->upper; 760 int delta = ctype->upper;
749 if (delta >= 32768) 761 if (delta >= 32768)
750 delta -= 65536; 762 delta -= 65536;
751 return ch + delta; 763 return ch + delta;
752 } 764 }
753 765
754 /* Returns the lowercase Unicode characters corresponding to ch or just 766 /* Returns the lowercase Unicode characters corresponding to ch or just
755 ch if no lowercase mapping is known. */ 767 ch if no lowercase mapping is known. */
756 768
757 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) 769 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
758 { 770 {
759 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 771 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
760 int delta = ctype->lower; 772 int delta = ctype->lower;
761 if (delta >= 32768) 773 if (delta >= 32768)
762 delta -= 65536; 774 delta -= 65536;
763 return ch + delta; 775 return ch + delta;
764 } 776 }
765 777
766 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', 778 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
767 'Lo' or 'Lm', 0 otherwise. */ 779 'Lo' or 'Lm', 0 otherwise. */
768 780
769 int _PyUnicode_IsAlpha(Py_UNICODE ch) 781 int _PyUnicode_IsAlpha(Py_UNICODE ch)
770 { 782 {
771 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); 783 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
772 784
773 return (ctype->flags & ALPHA_MASK) != 0; 785 return (ctype->flags & ALPHA_MASK) != 0;
774 } 786 }
775 787
776 #else 788 #else
777 789
778 /* Export the interfaces using the wchar_t type for portability 790 /* Export the interfaces using the wchar_t type for portability
779 reasons: */ 791 reasons: */
780 792
781 int _PyUnicode_IsWhitespace(Py_UNICODE ch) 793 int _PyUnicode_IsWhitespace(Py_UNICODE ch)
782 { 794 {
783 return iswspace(ch); 795 return iswspace(ch);
784 } 796 }
785 797
786 int _PyUnicode_IsLowercase(Py_UNICODE ch) 798 int _PyUnicode_IsLowercase(Py_UNICODE ch)
787 { 799 {
788 return iswlower(ch); 800 return iswlower(ch);
789 } 801 }
790 802
791 int _PyUnicode_IsUppercase(Py_UNICODE ch) 803 int _PyUnicode_IsUppercase(Py_UNICODE ch)
792 { 804 {
793 return iswupper(ch); 805 return iswupper(ch);
794 } 806 }
795 807
796 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) 808 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
797 { 809 {
798 return towlower(ch); 810 return towlower(ch);
799 } 811 }
800 812
801 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) 813 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
802 { 814 {
803 return towupper(ch); 815 return towupper(ch);
804 } 816 }
805 817
806 int _PyUnicode_IsAlpha(Py_UNICODE ch) 818 int _PyUnicode_IsAlpha(Py_UNICODE ch)
807 { 819 {
808 return iswalpha(ch); 820 return iswalpha(ch);
809 } 821 }
810 822
811 #endif 823 #endif
OLD | NEW

Powered by Google App Engine
This is Rietveld r305