OLD | NEW |
1 doctests = """ | 1 doctests = """ |
2 Tests for the tokenize module. | 2 Tests for the tokenize module. |
3 | 3 |
4 The tests can be really simple. Given a small fragment of source | 4 The tests can be really simple. Given a small fragment of source |
5 code, print out a table with tokens. The ENDMARKER is omitted for | 5 code, print out a table with tokens. The ENDMARKER is omitted for |
6 brevity. | 6 brevity. |
7 | 7 |
8 >>> dump_tokens("1 + 1") | 8 >>> dump_tokens("1 + 1") |
9 ENCODING 'utf-8' (0, 0) (0, 0) | 9 ENCODING 'utf-8' (0, 0) (0, 0) |
10 NUMBER '1' (1, 0) (1, 1) | 10 NUMBER '1' (1, 0) (1, 1) |
(...skipping 671 matching lines...)
682 nonlocal first | 682 nonlocal first |
683 if not first: | 683 if not first: |
684 first = True | 684 first = True |
685 return line | 685 return line |
686 else: | 686 else: |
687 return b'' | 687 return b'' |
688 | 688 |
689 # skip the initial encoding token and the end token | 689 # skip the initial encoding token and the end token |
690 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1] | 690 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1] |
691 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] | 691 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] |
692 self.assertEquals(tokens, expected_tokens, | 692 self.assertEqual(tokens, expected_tokens, |
693 "bytes not decoded with encoding") | 693 "bytes not decoded with encoding") |
694 | 694 |
695 def test__tokenize_does_not_decode_with_encoding_none(self): | 695 def test__tokenize_does_not_decode_with_encoding_none(self): |
696 literal = '"ЉЊЈЁЂ"' | 696 literal = '"ЉЊЈЁЂ"' |
697 first = False | 697 first = False |
698 def readline(): | 698 def readline(): |
699 nonlocal first | 699 nonlocal first |
700 if not first: | 700 if not first: |
701 first = True | 701 first = True |
702 return literal | 702 return literal |
703 else: | 703 else: |
704 return b'' | 704 return b'' |
705 | 705 |
706 # skip the end token | 706 # skip the end token |
707 tokens = list(_tokenize(readline, encoding=None))[:-1] | 707 tokens = list(_tokenize(readline, encoding=None))[:-1] |
708 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] | 708 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] |
709 self.assertEquals(tokens, expected_tokens, | 709 self.assertEqual(tokens, expected_tokens, |
710 "string not tokenized when encoding is None") | 710 "string not tokenized when encoding is None") |
711 | 711 |
712 | 712 |
713 class TestDetectEncoding(TestCase): | 713 class TestDetectEncoding(TestCase): |
714 | 714 |
715 def get_readline(self, lines): | 715 def get_readline(self, lines): |
716 index = 0 | 716 index = 0 |
717 def readline(): | 717 def readline(): |
718 nonlocal index | 718 nonlocal index |
719 if index == len(lines): | 719 if index == len(lines): |
720 raise StopIteration | 720 raise StopIteration |
721 line = lines[index] | 721 line = lines[index] |
722 index += 1 | 722 index += 1 |
723 return line | 723 return line |
724 return readline | 724 return readline |
725 | 725 |
726 def test_no_bom_no_encoding_cookie(self): | 726 def test_no_bom_no_encoding_cookie(self): |
727 lines = ( | 727 lines = ( |
728 b'# something\n', | 728 b'# something\n', |
729 b'print(something)\n', | 729 b'print(something)\n', |
730 b'do_something(else)\n' | 730 b'do_something(else)\n' |
731 ) | 731 ) |
732 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) | 732 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) |
733 self.assertEquals(encoding, 'utf-8') | 733 self.assertEqual(encoding, 'utf-8') |
734 self.assertEquals(consumed_lines, list(lines[:2])) | 734 self.assertEqual(consumed_lines, list(lines[:2])) |
735 | 735 |
736 def test_bom_no_cookie(self): | 736 def test_bom_no_cookie(self): |
737 lines = ( | 737 lines = ( |
738 b'\xef\xbb\xbf# something\n', | 738 b'\xef\xbb\xbf# something\n', |
739 b'print(something)\n', | 739 b'print(something)\n', |
740 b'do_something(else)\n' | 740 b'do_something(else)\n' |
741 ) | 741 ) |
742 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) | 742 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) |
743 self.assertEquals(encoding, 'utf-8-sig') | 743 self.assertEqual(encoding, 'utf-8-sig') |
744 self.assertEquals(consumed_lines, | 744 self.assertEqual(consumed_lines, |
745 [b'# something\n', b'print(something)\n']) | 745 [b'# something\n', b'print(something)\n']) |
746 | 746 |
747 def test_cookie_first_line_no_bom(self): | 747 def test_cookie_first_line_no_bom(self): |
748 lines = ( | 748 lines = ( |
749 b'# -*- coding: latin-1 -*-\n', | 749 b'# -*- coding: latin-1 -*-\n', |
750 b'print(something)\n', | 750 b'print(something)\n', |
751 b'do_something(else)\n' | 751 b'do_something(else)\n' |
752 ) | 752 ) |
753 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) | 753 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) |
754 self.assertEquals(encoding, 'iso-8859-1') | 754 self.assertEqual(encoding, 'iso-8859-1') |
755 self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) | 755 self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) |
756 | 756 |
757 def test_matched_bom_and_cookie_first_line(self): | 757 def test_matched_bom_and_cookie_first_line(self): |
758 lines = ( | 758 lines = ( |
759 b'\xef\xbb\xbf# coding=utf-8\n', | 759 b'\xef\xbb\xbf# coding=utf-8\n', |
760 b'print(something)\n', | 760 b'print(something)\n', |
761 b'do_something(else)\n' | 761 b'do_something(else)\n' |
762 ) | 762 ) |
763 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) | 763 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) |
764 self.assertEquals(encoding, 'utf-8-sig') | 764 self.assertEqual(encoding, 'utf-8-sig') |
765 self.assertEquals(consumed_lines, [b'# coding=utf-8\n']) | 765 self.assertEqual(consumed_lines, [b'# coding=utf-8\n']) |
766 | 766 |
767 def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): | 767 def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): |
768 lines = ( | 768 lines = ( |
769 b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n', | 769 b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n', |
770 b'print(something)\n', | 770 b'print(something)\n', |
771 b'do_something(else)\n' | 771 b'do_something(else)\n' |
772 ) | 772 ) |
773 readline = self.get_readline(lines) | 773 readline = self.get_readline(lines) |
774 self.assertRaises(SyntaxError, detect_encoding, readline) | 774 self.assertRaises(SyntaxError, detect_encoding, readline) |
775 | 775 |
776 def test_cookie_second_line_no_bom(self): | 776 def test_cookie_second_line_no_bom(self): |
777 lines = ( | 777 lines = ( |
778 b'#! something\n', | 778 b'#! something\n', |
779 b'# vim: set fileencoding=ascii :\n', | 779 b'# vim: set fileencoding=ascii :\n', |
780 b'print(something)\n', | 780 b'print(something)\n', |
781 b'do_something(else)\n' | 781 b'do_something(else)\n' |
782 ) | 782 ) |
783 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) | 783 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) |
784 self.assertEquals(encoding, 'ascii') | 784 self.assertEqual(encoding, 'ascii') |
785 expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] | 785 expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] |
786 self.assertEquals(consumed_lines, expected) | 786 self.assertEqual(consumed_lines, expected) |
787 | 787 |
788 def test_matched_bom_and_cookie_second_line(self): | 788 def test_matched_bom_and_cookie_second_line(self): |
789 lines = ( | 789 lines = ( |
790 b'\xef\xbb\xbf#! something\n', | 790 b'\xef\xbb\xbf#! something\n', |
791 b'f# coding=utf-8\n', | 791 b'f# coding=utf-8\n', |
792 b'print(something)\n', | 792 b'print(something)\n', |
793 b'do_something(else)\n' | 793 b'do_something(else)\n' |
794 ) | 794 ) |
795 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) | 795 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) |
796 self.assertEquals(encoding, 'utf-8-sig') | 796 self.assertEqual(encoding, 'utf-8-sig') |
797 self.assertEquals(consumed_lines, | 797 self.assertEqual(consumed_lines, |
798 [b'#! something\n', b'f# coding=utf-8\n']) | 798 [b'#! something\n', b'f# coding=utf-8\n']) |
799 | 799 |
800 def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): | 800 def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): |
801 lines = ( | 801 lines = ( |
802 b'\xef\xbb\xbf#! something\n', | 802 b'\xef\xbb\xbf#! something\n', |
803 b'# vim: set fileencoding=ascii :\n', | 803 b'# vim: set fileencoding=ascii :\n', |
804 b'print(something)\n', | 804 b'print(something)\n', |
805 b'do_something(else)\n' | 805 b'do_something(else)\n' |
806 ) | 806 ) |
807 readline = self.get_readline(lines) | 807 readline = self.get_readline(lines) |
808 self.assertRaises(SyntaxError, detect_encoding, readline) | 808 self.assertRaises(SyntaxError, detect_encoding, readline) |
809 | 809 |
810 def test_latin1_normalization(self): | 810 def test_latin1_normalization(self): |
811 # See get_normal_name() in tokenizer.c. | 811 # See get_normal_name() in tokenizer.c. |
812 encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", | 812 encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", |
813 "iso-8859-1-unix", "iso-latin-1-mac") | 813 "iso-8859-1-unix", "iso-latin-1-mac") |
814 for encoding in encodings: | 814 for encoding in encodings: |
815 for rep in ("-", "_"): | 815 for rep in ("-", "_"): |
816 enc = encoding.replace("-", rep) | 816 enc = encoding.replace("-", rep) |
817 lines = (b"#!/usr/bin/python\n", | 817 lines = (b"#!/usr/bin/python\n", |
818 b"# coding: " + enc.encode("ascii") + b"\n", | 818 b"# coding: " + enc.encode("ascii") + b"\n", |
819 b"print(things)\n", | 819 b"print(things)\n", |
820 b"do_something += 4\n") | 820 b"do_something += 4\n") |
821 rl = self.get_readline(lines) | 821 rl = self.get_readline(lines) |
822 found, consumed_lines = detect_encoding(rl) | 822 found, consumed_lines = detect_encoding(rl) |
823 self.assertEquals(found, "iso-8859-1") | 823 self.assertEqual(found, "iso-8859-1") |
824 | 824 |
825 def test_utf8_normalization(self): | 825 def test_utf8_normalization(self): |
826 # See get_normal_name() in tokenizer.c. | 826 # See get_normal_name() in tokenizer.c. |
827 encodings = ("utf-8", "utf-8-mac", "utf-8-unix") | 827 encodings = ("utf-8", "utf-8-mac", "utf-8-unix") |
828 for encoding in encodings: | 828 for encoding in encodings: |
829 for rep in ("-", "_"): | 829 for rep in ("-", "_"): |
830 enc = encoding.replace("-", rep) | 830 enc = encoding.replace("-", rep) |
831 lines = (b"#!/usr/bin/python\n", | 831 lines = (b"#!/usr/bin/python\n", |
832 b"# coding: " + enc.encode("ascii") + b"\n", | 832 b"# coding: " + enc.encode("ascii") + b"\n", |
833 b"1 + 3\n") | 833 b"1 + 3\n") |
834 rl = self.get_readline(lines) | 834 rl = self.get_readline(lines) |
835 found, consumed_lines = detect_encoding(rl) | 835 found, consumed_lines = detect_encoding(rl) |
836 self.assertEquals(found, "utf-8") | 836 self.assertEqual(found, "utf-8") |
837 | 837 |
838 def test_short_files(self): | 838 def test_short_files(self): |
839 readline = self.get_readline((b'print(something)\n',)) | 839 readline = self.get_readline((b'print(something)\n',)) |
840 encoding, consumed_lines = detect_encoding(readline) | 840 encoding, consumed_lines = detect_encoding(readline) |
841 self.assertEquals(encoding, 'utf-8') | 841 self.assertEqual(encoding, 'utf-8') |
842 self.assertEquals(consumed_lines, [b'print(something)\n']) | 842 self.assertEqual(consumed_lines, [b'print(something)\n']) |
843 | 843 |
844 encoding, consumed_lines = detect_encoding(self.get_readline(())) | 844 encoding, consumed_lines = detect_encoding(self.get_readline(())) |
845 self.assertEquals(encoding, 'utf-8') | 845 self.assertEqual(encoding, 'utf-8') |
846 self.assertEquals(consumed_lines, []) | 846 self.assertEqual(consumed_lines, []) |
847 | 847 |
848 readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) | 848 readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) |
849 encoding, consumed_lines = detect_encoding(readline) | 849 encoding, consumed_lines = detect_encoding(readline) |
850 self.assertEquals(encoding, 'utf-8-sig') | 850 self.assertEqual(encoding, 'utf-8-sig') |
851 self.assertEquals(consumed_lines, [b'print(something)\n']) | 851 self.assertEqual(consumed_lines, [b'print(something)\n']) |
852 | 852 |
853 readline = self.get_readline((b'\xef\xbb\xbf',)) | 853 readline = self.get_readline((b'\xef\xbb\xbf',)) |
854 encoding, consumed_lines = detect_encoding(readline) | 854 encoding, consumed_lines = detect_encoding(readline) |
855 self.assertEquals(encoding, 'utf-8-sig') | 855 self.assertEqual(encoding, 'utf-8-sig') |
856 self.assertEquals(consumed_lines, []) | 856 self.assertEqual(consumed_lines, []) |
857 | 857 |
858 readline = self.get_readline((b'# coding: bad\n',)) | 858 readline = self.get_readline((b'# coding: bad\n',)) |
859 self.assertRaises(SyntaxError, detect_encoding, readline) | 859 self.assertRaises(SyntaxError, detect_encoding, readline) |
860 | 860 |
861 def test_open(self): | 861 def test_open(self): |
862 filename = support.TESTFN + '.py' | 862 filename = support.TESTFN + '.py' |
863 self.addCleanup(support.unlink, filename) | 863 self.addCleanup(support.unlink, filename) |
864 | 864 |
865 # test coding cookie | 865 # test coding cookie |
866 for encoding in ('iso-8859-15', 'utf-8'): | 866 for encoding in ('iso-8859-15', 'utf-8'): |
(...skipping 38 matching lines...)
905 if counter == 5: | 905 if counter == 5: |
906 return b'' | 906 return b'' |
907 return counter | 907 return counter |
908 | 908 |
909 orig_detect_encoding = tokenize_module.detect_encoding | 909 orig_detect_encoding = tokenize_module.detect_encoding |
910 orig__tokenize = tokenize_module._tokenize | 910 orig__tokenize = tokenize_module._tokenize |
911 tokenize_module.detect_encoding = mock_detect_encoding | 911 tokenize_module.detect_encoding = mock_detect_encoding |
912 tokenize_module._tokenize = mock__tokenize | 912 tokenize_module._tokenize = mock__tokenize |
913 try: | 913 try: |
914 results = tokenize(mock_readline) | 914 results = tokenize(mock_readline) |
915 self.assertEquals(list(results), ['first', 'second', 1, 2, 3, 4]) | 915 self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4]) |
916 finally: | 916 finally: |
917 tokenize_module.detect_encoding = orig_detect_encoding | 917 tokenize_module.detect_encoding = orig_detect_encoding |
918 tokenize_module._tokenize = orig__tokenize | 918 tokenize_module._tokenize = orig__tokenize |
919 | 919 |
920 self.assertTrue(encoding_used, encoding) | 920 self.assertEqual(encoding_used, encoding) |
921 | 921 |
922 | 922 |
923 __test__ = {"doctests" : doctests, 'decistmt': decistmt} | 923 __test__ = {"doctests" : doctests, 'decistmt': decistmt} |
924 | 924 |
925 def test_main(): | 925 def test_main(): |
926 from test import test_tokenize | 926 from test import test_tokenize |
927 support.run_doctest(test_tokenize, True) | 927 support.run_doctest(test_tokenize, True) |
928 support.run_unittest(TestTokenizerAdheresToPep0263) | 928 support.run_unittest(TestTokenizerAdheresToPep0263) |
929 support.run_unittest(Test_Tokenize) | 929 support.run_unittest(Test_Tokenize) |
930 support.run_unittest(TestDetectEncoding) | 930 support.run_unittest(TestDetectEncoding) |
931 support.run_unittest(TestTokenize) | 931 support.run_unittest(TestTokenize) |
932 | 932 |
933 if __name__ == "__main__": | 933 if __name__ == "__main__": |
934 test_main() | 934 test_main() |