Mailing List Archive

[3.11] gh-94823: Improve coverage in tokenizer.c:valid_utf8 (GH-94856) (#96029)
https://github.com/python/cpython/commit/2bb363cfcd7563fdd29ac93563f95b8a5205b008
commit: 2bb363cfcd7563fdd29ac93563f95b8a5205b008
branch: 3.11
author: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
committer: pablogsal <Pablogsal@gmail.com>
date: 2022-08-16T17:26:40+01:00
summary:

[3.11] gh-94823: Improve coverage in tokenizer.c:valid_utf8 (GH-94856) (#96029)

Co-authored-by: Michael Droettboom <mdboom@gmail.com>

files:
M Lib/test/test_source_encoding.py

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index a0375fda0d36..e1b0de2adef6 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -224,6 +224,67 @@ def test_crcrcrlf2(self):
out = self.check_script_output(src, br"'\n\n\n'")


+class UTF8ValidatorTest(unittest.TestCase):
+ @unittest.skipIf(not sys.platform.startswith("linux"),
+ "Too slow to run on non-Linux platforms")
+ def test_invalid_utf8(self):
+ # This is a port of test_utf8_decode_invalid_sequences in
+ # test_unicode.py to exercise the separate utf8 validator in
+ # Parser/tokenizer.c used when reading source files.
+
+ # The tokenizer reads source files using low-level C file I/O, so the
+ # only way to test it is to write actual files to disk.
+
+ # Each example is put inside a string at the top of the file so
+ # it's an otherwise valid Python source file.
+ template = b'"%s"\n'
+
+ with tempfile.TemporaryDirectory() as tmpd:
+ fn = os.path.join(tmpd, 'test.py')
+
+ def check(content):
+ with open(fn, 'wb') as fp:
+ fp.write(template % content)
+ script_helper.assert_python_failure(fn)
+
+ # continuation bytes in a sequence of 2, 3, or 4 bytes
+ continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
+ # start bytes of a 2-byte sequence equivalent to code points <= 0x7F
+ invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
+ # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
+ invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
+ invalid_start_bytes = (
+ continuation_bytes + invalid_2B_seq_start_bytes +
+ invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
+ )
+
+ for byte in invalid_start_bytes:
+ check(byte)
+
+ for sb in invalid_2B_seq_start_bytes:
+ for cb in continuation_bytes:
+ check(sb + cb)
+
+ for sb in invalid_4B_seq_start_bytes:
+ for cb1 in continuation_bytes[:3]:
+ for cb3 in continuation_bytes[:3]:
+ check(sb+cb1+b'\x80'+cb3)
+
+ for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
+ check(b'\xE0'+cb+b'\x80')
+ check(b'\xE0'+cb+b'\xBF')
+ # surrogates
+ for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
+ check(b'\xED'+cb+b'\x80')
+ check(b'\xED'+cb+b'\xBF')
+ for cb in [bytes([x]) for x in range(0x80, 0x90)]:
+ check(b'\xF0'+cb+b'\x80\x80')
+ check(b'\xF0'+cb+b'\xBF\xBF')
+ for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
+ check(b'\xF4'+cb+b'\x80\x80')
+ check(b'\xF4'+cb+b'\xBF\xBF')
+
+
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):

def check_script_output(self, src, expected):

_______________________________________________
Python-checkins mailing list
Python-checkins@python.org
https://mail.python.org/mailman/listinfo/python-checkins