| OLD | NEW |
| 1 /* | 1 /* |
| 2 Unicode character type helpers. | 2 Unicode character type helpers. |
| 3 | 3 |
| 4 Written by Marc-Andre Lemburg (mal@lemburg.com). | 4 Written by Marc-Andre Lemburg (mal@lemburg.com). |
| 5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) | 5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) |
| 6 | 6 |
| 7 Copyright (c) Corporation for National Research Initiatives. | 7 Copyright (c) Corporation for National Research Initiatives. |
| 8 | 8 |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "Python.h" | 11 #include "Python.h" |
| 12 #include "unicodeobject.h" | 12 #include "unicodeobject.h" |
| 13 | 13 |
| 14 #define ALPHA_MASK 0x01 | 14 #define ALPHA_MASK 0x01 |
| 15 #define DECIMAL_MASK 0x02 | 15 #define DECIMAL_MASK 0x02 |
| 16 #define DIGIT_MASK 0x04 | 16 #define DIGIT_MASK 0x04 |
| 17 #define LOWER_MASK 0x08 | 17 #define LOWER_MASK 0x08 |
| 18 #define LINEBREAK_MASK 0x10 | 18 #define LINEBREAK_MASK 0x10 |
| 19 #define SPACE_MASK 0x20 | 19 #define SPACE_MASK 0x20 |
| 20 #define TITLE_MASK 0x40 | 20 #define TITLE_MASK 0x40 |
| 21 #define UPPER_MASK 0x80 | 21 #define UPPER_MASK 0x80 |
| 22 #define XID_START_MASK 0x100 | 22 #define XID_START_MASK 0x100 |
| 23 #define XID_CONTINUE_MASK 0x200 | 23 #define XID_CONTINUE_MASK 0x200 |
| 24 #define HEX_ESCAPE_MASK 0x400 |
| 24 | 25 |
| 25 typedef struct { | 26 typedef struct { |
| 26 const Py_UNICODE upper; | 27 const Py_UNICODE upper; |
| 27 const Py_UNICODE lower; | 28 const Py_UNICODE lower; |
| 28 const Py_UNICODE title; | 29 const Py_UNICODE title; |
| 29 const unsigned char decimal; | 30 const unsigned char decimal; |
| 30 const unsigned char digit; | 31 const unsigned char digit; |
| 31 const unsigned short flags; | 32 const unsigned short flags; |
| 32 } _PyUnicode_TypeRecord; | 33 } _PyUnicode_TypeRecord; |
| 33 | 34 |
| 34 #include "unicodetype_db.h" | 35 #include "unicodetype_db.h" |
| 35 | 36 |
| 36 static const _PyUnicode_TypeRecord * | 37 static const _PyUnicode_TypeRecord * |
| 37 gettyperecord(Py_UNICODE code) | 38 gettyperecord(Py_UNICODE code) |
| 38 { | 39 { |
| 39 int index; | 40 int index; |
| 40 | 41 |
| 41 #ifdef Py_UNICODE_WIDE | 42 #ifdef Py_UNICODE_WIDE |
| 42 if (code >= 0x110000) | 43 if (code >= 0x110000) |
| 43 index = 0; | 44 index = 0; |
| 44 else | 45 else |
| 45 #endif | 46 #endif |
| 46 { | 47 { |
| 47 index = index1[(code>>SHIFT)]; | 48 index = index1[(code>>SHIFT)]; |
| 48 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; | 49 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; |
| 49 } | 50 } |
| 50 | 51 |
| 51 return &_PyUnicode_TypeRecords[index]; | 52 return &_PyUnicode_TypeRecords[index]; |
| 52 } | 53 } |
| 53 | 54 |
| 54 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or | 55 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or |
| 55 bidirectional type 'B', 0 otherwise. */ | 56 bidirectional type 'B', 0 otherwise. */ |
| 56 | 57 |
| 57 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) | 58 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) |
| 58 { | 59 { |
| 59 switch (ch) { | 60 switch (ch) { |
| 60 case 0x000A: /* LINE FEED */ | 61 case 0x000A: /* LINE FEED */ |
| 61 case 0x000D: /* CARRIAGE RETURN */ | 62 case 0x000D: /* CARRIAGE RETURN */ |
| 62 case 0x001C: /* FILE SEPARATOR */ | 63 case 0x001C: /* FILE SEPARATOR */ |
| 63 case 0x001D: /* GROUP SEPARATOR */ | 64 case 0x001D: /* GROUP SEPARATOR */ |
| 64 case 0x001E: /* RECORD SEPARATOR */ | 65 case 0x001E: /* RECORD SEPARATOR */ |
| 65 case 0x0085: /* NEXT LINE */ | 66 case 0x0085: /* NEXT LINE */ |
| 66 case 0x2028: /* LINE SEPARATOR */ | 67 case 0x2028: /* LINE SEPARATOR */ |
| 67 case 0x2029: /* PARAGRAPH SEPARATOR */ | 68 case 0x2029: /* PARAGRAPH SEPARATOR */ |
| 68 return 1; | 69 return 1; |
| 69 default: | 70 default: |
| 70 return 0; | 71 return 0; |
| 71 } | 72 } |
| 72 } | 73 } |
| 73 | 74 |
| (...skipping 553 matching lines...) |
| 627 #endif | 628 #endif |
| 628 return (double) 8; | 629 return (double) 8; |
| 629 case 0x1379: | 630 case 0x1379: |
| 630 #ifdef Py_UNICODE_WIDE | 631 #ifdef Py_UNICODE_WIDE |
| 631 case 0x10117: | 632 case 0x10117: |
| 632 #endif | 633 #endif |
| 633 return (double) 80; | 634 return (double) 80; |
| 634 #ifdef Py_UNICODE_WIDE | 635 #ifdef Py_UNICODE_WIDE |
| 635 case 0x10120: | 636 case 0x10120: |
| 636 return (double) 800; | 637 return (double) 800; |
| 637 case 0x10129: | 638 case 0x10129: |
| 638 return (double) 8000; | 639 return (double) 8000; |
| 639 case 0x10132: | 640 case 0x10132: |
| 640 return (double) 80000; | 641 return (double) 80000; |
| 641 #endif | 642 #endif |
| 642 case 0x17F9: | 643 case 0x17F9: |
| 643 case 0x2168: | 644 case 0x2168: |
| 644 case 0x2178: | 645 case 0x2178: |
| 645 case 0x3029: | 646 case 0x3029: |
| 646 case 0x3228: | 647 case 0x3228: |
| 647 case 0x3288: | 648 case 0x3288: |
| 648 #ifdef Py_UNICODE_WIDE | 649 #ifdef Py_UNICODE_WIDE |
| 649 case 0x1010F: | 650 case 0x1010F: |
| 650 #endif | 651 #endif |
| 651 return (double) 9; | 652 return (double) 9; |
| 652 case 0x0F2E: | 653 case 0x0F2E: |
| 653 return (double) 9 / 2; | 654 return (double) 9 / 2; |
| 654 case 0x137A: | 655 case 0x137A: |
| 655 #ifdef Py_UNICODE_WIDE | 656 #ifdef Py_UNICODE_WIDE |
| 656 case 0x10118: | 657 case 0x10118: |
| 657 #endif | 658 #endif |
| 658 return (double) 90; | 659 return (double) 90; |
| 659 #ifdef Py_UNICODE_WIDE | 660 #ifdef Py_UNICODE_WIDE |
| 660 case 0x10121: | 661 case 0x10121: |
| 661 case 0x1034A: | 662 case 0x1034A: |
| 662 return (double) 900; | 663 return (double) 900; |
| 663 case 0x1012A: | 664 case 0x1012A: |
| 664 return (double) 9000; | 665 return (double) 9000; |
| 665 case 0x10133: | 666 case 0x10133: |
| 666 return (double) 90000; | 667 return (double) 90000; |
| 667 #endif | 668 #endif |
| 668 default: | 669 default: |
| 669 return (double) _PyUnicode_ToDigit(ch); | 670 return (double) _PyUnicode_ToDigit(ch); |
| 670 } | 671 } |
| 671 } | 672 } |
| 672 | 673 |
| 673 int _PyUnicode_IsNumeric(Py_UNICODE ch) | 674 int _PyUnicode_IsNumeric(Py_UNICODE ch) |
| 674 { | 675 { |
| 675 return _PyUnicode_ToNumeric(ch) != -1.0; | 676 return _PyUnicode_ToNumeric(ch) != -1.0; |
| 676 } | 677 } |
| 678 |
| 679 /* Returns 1 for Unicode characters to be hex-escaped when repr()ed, |
| 680 0 otherwise. */ |
| 681 |
| 682 int _PyUnicode_IsHexEscaped(Py_UNICODE ch) |
| 683 { |
| 684 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 685 |
| 686 return (ctype->flags & HEX_ESCAPE_MASK) != 0; |
| 687 } |
| 688 |
| 677 | 689 |
| 678 #ifndef WANT_WCTYPE_FUNCTIONS | 690 #ifndef WANT_WCTYPE_FUNCTIONS |
| 679 | 691 |
| 680 /* Returns 1 for Unicode characters having the bidirectional type | 692 /* Returns 1 for Unicode characters having the bidirectional type |
| 681 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */ | 693 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */ |
| 682 | 694 |
| 683 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch) | 695 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch) |
| 684 { | 696 { |
| 685 switch (ch) { | 697 switch (ch) { |
| 686 case 0x0009: /* HORIZONTAL TABULATION */ | 698 case 0x0009: /* HORIZONTAL TABULATION */ |
| 687 case 0x000A: /* LINE FEED */ | 699 case 0x000A: /* LINE FEED */ |
| 688 case 0x000B: /* VERTICAL TABULATION */ | 700 case 0x000B: /* VERTICAL TABULATION */ |
| 689 case 0x000C: /* FORM FEED */ | 701 case 0x000C: /* FORM FEED */ |
| 690 case 0x000D: /* CARRIAGE RETURN */ | 702 case 0x000D: /* CARRIAGE RETURN */ |
| 691 case 0x001C: /* FILE SEPARATOR */ | 703 case 0x001C: /* FILE SEPARATOR */ |
| 692 case 0x001D: /* GROUP SEPARATOR */ | 704 case 0x001D: /* GROUP SEPARATOR */ |
| 693 case 0x001E: /* RECORD SEPARATOR */ | 705 case 0x001E: /* RECORD SEPARATOR */ |
| 694 case 0x001F: /* UNIT SEPARATOR */ | 706 case 0x001F: /* UNIT SEPARATOR */ |
| 695 case 0x0020: /* SPACE */ | 707 case 0x0020: /* SPACE */ |
| 696 case 0x0085: /* NEXT LINE */ | 708 case 0x0085: /* NEXT LINE */ |
| 697 case 0x00A0: /* NO-BREAK SPACE */ | 709 case 0x00A0: /* NO-BREAK SPACE */ |
| 698 case 0x1680: /* OGHAM SPACE MARK */ | 710 case 0x1680: /* OGHAM SPACE MARK */ |
| 699 case 0x2000: /* EN QUAD */ | 711 case 0x2000: /* EN QUAD */ |
| 700 case 0x2001: /* EM QUAD */ | 712 case 0x2001: /* EM QUAD */ |
| 701 case 0x2002: /* EN SPACE */ | 713 case 0x2002: /* EN SPACE */ |
| 702 case 0x2003: /* EM SPACE */ | 714 case 0x2003: /* EM SPACE */ |
| 703 case 0x2004: /* THREE-PER-EM SPACE */ | 715 case 0x2004: /* THREE-PER-EM SPACE */ |
| 704 case 0x2005: /* FOUR-PER-EM SPACE */ | 716 case 0x2005: /* FOUR-PER-EM SPACE */ |
| 705 case 0x2006: /* SIX-PER-EM SPACE */ | 717 case 0x2006: /* SIX-PER-EM SPACE */ |
| 706 case 0x2007: /* FIGURE SPACE */ | 718 case 0x2007: /* FIGURE SPACE */ |
| 707 case 0x2008: /* PUNCTUATION SPACE */ | 719 case 0x2008: /* PUNCTUATION SPACE */ |
| 708 case 0x2009: /* THIN SPACE */ | 720 case 0x2009: /* THIN SPACE */ |
| 709 case 0x200A: /* HAIR SPACE */ | 721 case 0x200A: /* HAIR SPACE */ |
| 710 case 0x200B: /* ZERO WIDTH SPACE */ | 722 case 0x200B: /* ZERO WIDTH SPACE */ |
| 711 case 0x2028: /* LINE SEPARATOR */ | 723 case 0x2028: /* LINE SEPARATOR */ |
| 712 case 0x2029: /* PARAGRAPH SEPARATOR */ | 724 case 0x2029: /* PARAGRAPH SEPARATOR */ |
| 713 case 0x202F: /* NARROW NO-BREAK SPACE */ | 725 case 0x202F: /* NARROW NO-BREAK SPACE */ |
| 714 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */ | 726 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */ |
| 715 case 0x3000: /* IDEOGRAPHIC SPACE */ | 727 case 0x3000: /* IDEOGRAPHIC SPACE */ |
| 716 return 1; | 728 return 1; |
| 717 default: | 729 default: |
| 718 return 0; | 730 return 0; |
| 719 } | 731 } |
| 720 } | 732 } |
| 721 | 733 |
| 722 /* Returns 1 for Unicode characters having the category 'Ll', 0 | 734 /* Returns 1 for Unicode characters having the category 'Ll', 0 |
| 723 otherwise. */ | 735 otherwise. */ |
| 724 | 736 |
| 725 int _PyUnicode_IsLowercase(Py_UNICODE ch) | 737 int _PyUnicode_IsLowercase(Py_UNICODE ch) |
| 726 { | 738 { |
| 727 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 739 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 728 | 740 |
| 729 return (ctype->flags & LOWER_MASK) != 0; | 741 return (ctype->flags & LOWER_MASK) != 0; |
| 730 } | 742 } |
| 731 | 743 |
| 732 /* Returns 1 for Unicode characters having the category 'Lu', 0 | 744 /* Returns 1 for Unicode characters having the category 'Lu', 0 |
| 733 otherwise. */ | 745 otherwise. */ |
| 734 | 746 |
| 735 int _PyUnicode_IsUppercase(Py_UNICODE ch) | 747 int _PyUnicode_IsUppercase(Py_UNICODE ch) |
| 736 { | 748 { |
| 737 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 749 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 738 | 750 |
| 739 return (ctype->flags & UPPER_MASK) != 0; | 751 return (ctype->flags & UPPER_MASK) != 0; |
| 740 } | 752 } |
| 741 | 753 |
| 742 /* Returns the uppercase Unicode character corresponding to ch or just | 754 /* Returns the uppercase Unicode character corresponding to ch or just |
| 743 ch if no uppercase mapping is known. */ | 755 ch if no uppercase mapping is known. */ |
| 744 | 756 |
| 745 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) | 757 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) |
| 746 { | 758 { |
| 747 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 759 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 748 int delta = ctype->upper; | 760 int delta = ctype->upper; |
| 749 if (delta >= 32768) | 761 if (delta >= 32768) |
| 750 delta -= 65536; | 762 delta -= 65536; |
| 751 return ch + delta; | 763 return ch + delta; |
| 752 } | 764 } |
| 753 | 765 |
| 754 /* Returns the lowercase Unicode character corresponding to ch or just | 766 /* Returns the lowercase Unicode character corresponding to ch or just |
| 755 ch if no lowercase mapping is known. */ | 767 ch if no lowercase mapping is known. */ |
| 756 | 768 |
| 757 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) | 769 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) |
| 758 { | 770 { |
| 759 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 771 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 760 int delta = ctype->lower; | 772 int delta = ctype->lower; |
| 761 if (delta >= 32768) | 773 if (delta >= 32768) |
| 762 delta -= 65536; | 774 delta -= 65536; |
| 763 return ch + delta; | 775 return ch + delta; |
| 764 } | 776 } |
| 765 | 777 |
| 766 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', | 778 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', |
| 767 'Lo' or 'Lm', 0 otherwise. */ | 779 'Lo' or 'Lm', 0 otherwise. */ |
| 768 | 780 |
| 769 int _PyUnicode_IsAlpha(Py_UNICODE ch) | 781 int _PyUnicode_IsAlpha(Py_UNICODE ch) |
| 770 { | 782 { |
| 771 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 783 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 772 | 784 |
| 773 return (ctype->flags & ALPHA_MASK) != 0; | 785 return (ctype->flags & ALPHA_MASK) != 0; |
| 774 } | 786 } |
| 775 | 787 |
| 776 #else | 788 #else |
| 777 | 789 |
| 778 /* Export the interfaces using the wchar_t type for portability | 790 /* Export the interfaces using the wchar_t type for portability |
| 779 reasons: */ | 791 reasons: */ |
| 780 | 792 |
| 781 int _PyUnicode_IsWhitespace(Py_UNICODE ch) | 793 int _PyUnicode_IsWhitespace(Py_UNICODE ch) |
| 782 { | 794 { |
| 783 return iswspace(ch); | 795 return iswspace(ch); |
| 784 } | 796 } |
| 785 | 797 |
| 786 int _PyUnicode_IsLowercase(Py_UNICODE ch) | 798 int _PyUnicode_IsLowercase(Py_UNICODE ch) |
| 787 { | 799 { |
| 788 return iswlower(ch); | 800 return iswlower(ch); |
| 789 } | 801 } |
| 790 | 802 |
| 791 int _PyUnicode_IsUppercase(Py_UNICODE ch) | 803 int _PyUnicode_IsUppercase(Py_UNICODE ch) |
| 792 { | 804 { |
| 793 return iswupper(ch); | 805 return iswupper(ch); |
| 794 } | 806 } |
| 795 | 807 |
| 796 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) | 808 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch) |
| 797 { | 809 { |
| 798 return towlower(ch); | 810 return towlower(ch); |
| 799 } | 811 } |
| 800 | 812 |
| 801 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) | 813 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch) |
| 802 { | 814 { |
| 803 return towupper(ch); | 815 return towupper(ch); |
| 804 } | 816 } |
| 805 | 817 |
| 806 int _PyUnicode_IsAlpha(Py_UNICODE ch) | 818 int _PyUnicode_IsAlpha(Py_UNICODE ch) |
| 807 { | 819 { |
| 808 return iswalpha(ch); | 820 return iswalpha(ch); |
| 809 } | 821 } |
| 810 | 822 |
| 811 #endif | 823 #endif |
| OLD | NEW |