Mailing List Archive: [3.10] Ensure the str member of the tokenizer is always initialised (GH-29681). (GH-29683)

https://github.com/python/cpython/commit/07cf66fd03e161c09279346da4e76705cf42d535
commit: 07cf66fd03e161c09279346da4e76705cf42d535
branch: 3.10
author: Pablo Galindo Salgado <Pablogsal@gmail.com>
committer: pablogsal <Pablogsal@gmail.com>
date: 2021-11-21T04:15:22Z
summary:

[3.10] Ensure the str member of the tokenizer is always initialised (GH-29681). (GH-29683)

(cherry picked from commit 4f006a789a35f5d1a7ef142bd1304ce167392457)

Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>

files:
A Parser/pegen_errors.c
M Parser/pegen.c
M Parser/tokenizer.c
M Parser/tokenizer.h

diff --git a/Parser/pegen.c b/Parser/pegen.c
index 464a902173dfb..8946aa33145b8 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -432,7 +432,7 @@ get_error_line(Parser *p, Py_ssize_t lineno)
* (multi-line) statement are stored in p->tok->interactive_src_start.
* If not, we're parsing from a string, which means that the whole source
* is stored in p->tok->str. */
- assert(p->tok->fp == NULL || p->tok->fp == stdin);
+ assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);

char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
assert(cur_line != NULL);
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
new file mode 100644
index 0000000000000..694184a03b075
--- /dev/null
+++ b/Parser/pegen_errors.c
@@ -0,0 +1,425 @@
+#include <Python.h>
+#include <errcode.h>
+
+#include "tokenizer.h"
+#include "pegen.h"
+
+// TOKENIZER ERRORS
+
+// Convert a pending LookupError, SyntaxError, ValueError or
+// UnicodeDecodeError into a SyntaxError whose args are
+// (message, (filename, 0, -1, None)). Any other pending exception is left
+// untouched. If building the new exception fails, the exception raised by
+// the failing call is the one left pending.
+void
+_PyPegen_raise_tokenizer_init_error(PyObject *filename)
+{
+ if (!(PyErr_ExceptionMatches(PyExc_LookupError)
+ || PyErr_ExceptionMatches(PyExc_SyntaxError)
+ || PyErr_ExceptionMatches(PyExc_ValueError)
+ || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
+ return;
+ }
+ PyObject *errstr = NULL;
+ PyObject *tuple = NULL;
+ PyObject *type;
+ PyObject *value;
+ PyObject *tback;
+ // Take ownership of the pending exception; this also clears it.
+ PyErr_Fetch(&type, &value, &tback);
+ errstr = PyObject_Str(value);
+ if (!errstr) {
+ goto error;
+ }
+
+ // Location details: (filename, lineno=0, offset=-1, text=None).
+ PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
+ if (!tmp) {
+ goto error;
+ }
+
+ // PyTuple_Pack takes its own references, so tmp can be released here.
+ tuple = PyTuple_Pack(2, errstr, tmp);
+ Py_DECREF(tmp);
+ // Check the object that was just created. Testing `value` here would let
+ // a NULL tuple reach PyErr_SetObject when PyTuple_Pack fails.
+ if (!tuple) {
+ goto error;
+ }
+ PyErr_SetObject(PyExc_SyntaxError, tuple);
+
+ // Shared cleanup for both the success and the failure paths.
+error:
+ Py_XDECREF(type);
+ Py_XDECREF(value);
+ Py_XDECREF(tback);
+ Py_XDECREF(errstr);
+ Py_XDECREF(tuple);
+}
+
+// Raise SyntaxError "'<bracket>' was never closed", positioned at the line
+// and column stored in the top entry of the tokenizer's paren stacks.
+static inline void
+raise_unclosed_parentheses_error(Parser *p) {
+ // Index of the top entry shared by the three parallel paren stacks.
+ const int top = p->tok->level - 1;
+ int open_lineno = p->tok->parenlinenostack[top];
+ int open_col = p->tok->parencolstack[top];
+ RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
+ open_lineno, open_col, open_lineno, -1,
+ "'%c' was never closed",
+ p->tok->parenstack[top]);
+}
+
+// Translate the tokenizer status stored in p->tok->done into a Python
+// exception. Always returns -1. If an exception is already pending it is
+// left as it is.
+int
+_Pypegen_tokenizer_error(Parser *p)
+{
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+
+ const char *msg = NULL;
+ PyObject* errtype = PyExc_SyntaxError;
+ Py_ssize_t col_offset = -1;
+ // Cases that `break` only pick errtype/msg/col_offset and rely on the
+ // generic raise after the switch; cases that `return` have already set
+ // their own exception.
+ switch (p->tok->done) {
+ case E_TOKEN:
+ msg = "invalid token";
+ break;
+ case E_EOF:
+ // With a non-zero paren level, report the unclosed bracket instead of
+ // the generic EOF message.
+ if (p->tok->level) {
+ raise_unclosed_parentheses_error(p);
+ } else {
+ RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+ }
+ return -1;
+ case E_DEDENT:
+ RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
+ return -1;
+ case E_INTR:
+ // NOTE(review): the early return at the top of this function already
+ // handled a pending exception, so this check looks redundant.
+ if (!PyErr_Occurred()) {
+ PyErr_SetNone(PyExc_KeyboardInterrupt);
+ }
+ return -1;
+ case E_NOMEM:
+ PyErr_NoMemory();
+ return -1;
+ case E_TABSPACE:
+ errtype = PyExc_TabError;
+ msg = "inconsistent use of tabs and spaces in indentation";
+ break;
+ case E_TOODEEP:
+ errtype = PyExc_IndentationError;
+ msg = "too many levels of indentation";
+ break;
+ case E_LINECONT: {
+ // Offset of the byte just before the tokenizer cursor, measured from
+ // the start of tok->buf.
+ col_offset = p->tok->cur - p->tok->buf - 1;
+ msg = "unexpected character after line continuation character";
+ break;
+ }
+ default:
+ msg = "unknown parsing error";
+ }
+
+ // col_offset is still -1 unless a case above set it; negative values are
+ // reported as column 0.
+ RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
+ col_offset >= 0 ? col_offset : 0,
+ p->tok->lineno, -1, msg);
+ return -1;
+}
+
+// Convert a pending UnicodeError or ValueError into a SyntaxError whose
+// message is "(<kind>) <original message>". Any other pending exception is
+// left in place. Always returns -1.
+int
+_Pypegen_raise_decode_error(Parser *p)
+{
+ assert(PyErr_Occurred());
+
+ // UnicodeError is tested first, so an exception matching both checks is
+ // labelled "unicode error".
+ const char *errtype;
+ if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
+ errtype = "unicode error";
+ }
+ else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
+ errtype = "value error";
+ }
+ else {
+ return -1;
+ }
+
+ // Take ownership of the pending exception; this also clears it.
+ PyObject *exc_type;
+ PyObject *exc_value;
+ PyObject *exc_tb;
+ PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+
+ PyObject *text = PyObject_Str(exc_value);
+ if (text == NULL) {
+ // str() itself failed: drop that failure and raise a generic message.
+ PyErr_Clear();
+ RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
+ }
+ else {
+ RAISE_SYNTAX_ERROR("(%s) %U", errtype, text);
+ Py_DECREF(text);
+ }
+
+ Py_XDECREF(exc_type);
+ Py_XDECREF(exc_value);
+ Py_XDECREF(exc_tb);
+ return -1;
+}
+
+// Second-pass check used while setting a syntax error: keeps tokenizing the
+// rest of the input, looking for an unclosed bracket that was opened on an
+// earlier line than the error already recorded. Returns -1 if the
+// unclosed-bracket error was raised here, 0 otherwise.
+static int
+_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
+ // Tokenize the whole input to see if there are any tokenization
+ // errors such as mismatching parentheses. These will get priority
+ // over generic syntax errors only if the line number of the error is
+ // before the one that we had for the generic error.
+
+ // We don't want to tokenize to the end for interactive input
+ if (p->tok->prompt != NULL) {
+ return 0;
+ }
+
+ // Stash the pending exception (this also clears it) so that it can be
+ // restored at `exit` if nothing takes its place.
+ PyObject *type, *value, *traceback;
+ PyErr_Fetch(&type, &value, &traceback);
+
+ // NOTE(review): this indexes p->tokens[p->fill - 1], which assumes
+ // p->fill > 0 whenever p->known_err_token is NULL; confirm with callers.
+ Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
+ Py_ssize_t current_err_line = current_token->lineno;
+
+ int ret = 0;
+
+ // ERRORTOKEN and ENDMARKER end the scan; every other token is skipped.
+ for (;;) {
+ const char *start;
+ const char *end;
+ switch (_PyTokenizer_Get(p->tok, &start, &end)) {
+ case ERRORTOKEN:
+ if (p->tok->level != 0) {
+ int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
+ // Only take over when the recorded bracket line precedes the line of
+ // the error that is already known.
+ if (current_err_line > error_lineno) {
+ raise_unclosed_parentheses_error(p);
+ ret = -1;
+ goto exit;
+ }
+ }
+ break;
+ case ENDMARKER:
+ break;
+ default:
+ continue;
+ }
+ break;
+ }
+
+
+exit:
+ // An exception pending here was set during the scan (by the raise above,
+ // or presumably inside _PyTokenizer_Get; TODO confirm). It replaces the
+ // stashed one, whose references are dropped. Otherwise the stashed
+ // exception is put back.
+ if (PyErr_Occurred()) {
+ Py_XDECREF(value);
+ Py_XDECREF(type);
+ Py_XDECREF(traceback);
+ } else {
+ PyErr_Restore(type, value, traceback);
+ }
+ return ret;
+}
+
+// PARSER ERRORS
+
+// Raise `errtype` with a printf-style message, taking the location from
+// p->known_err_token if it is set and otherwise from the last entry of
+// p->tokens. Always returns NULL.
+void *
+_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
+{
+ // p->fill == 0 (presumably no token has been read yet): there is no token
+ // to take a position from, so report line 0, column 0.
+ if (p->fill == 0) {
+ va_list va;
+ va_start(va, errmsg);
+ _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
+ va_end(va);
+ return NULL;
+ }
+
+ Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
+ Py_ssize_t col_offset;
+ Py_ssize_t end_col_offset = -1;
+ // A token column of -1 means the column has to be derived from the
+ // tokenizer cursor instead.
+ if (t->col_offset == -1) {
+ if (p->tok->cur == p->tok->buf) {
+ col_offset = 0;
+ } else {
+ // NOTE(review): when p->tok->buf is NULL this selects p->tok->buf (that
+ // is, NULL) as the start pointer; confirm the condition is intended.
+ const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
+ col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
+ }
+ } else {
+ // Presumably converts a 0-based token column to the 1-based column used
+ // in the exception; TODO confirm.
+ col_offset = t->col_offset + 1;
+ }
+
+ // end_col_offset stays -1 when the token carries no end column.
+ if (t->end_col_offset != -1) {
+ end_col_offset = t->end_col_offset + 1;
+ }
+
+ va_list va;
+ va_start(va, errmsg);
+ _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
+ va_end(va);
+
+ return NULL;
+}
+
+// Return line `lineno` (1-based) of the source held in the tokenizer
+// buffers as a new str object, without its trailing newline. Returns NULL
+// with an exception set if decoding fails. If the buffer holds fewer than
+// `lineno` lines, the last available line is returned.
+static PyObject *
+get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
+{
+ /* If the file descriptor is interactive, the source lines of the current
+ * (multi-line) statement are stored in p->tok->interactive_src_start.
+ * If not, we're parsing from a string, which means that the whole source
+ * is stored in p->tok->str. */
+ assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
+
+ char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
+ assert(cur_line != NULL);
+
+ // Skip lineno - 1 newlines. Stop early when the buffer runs out of lines,
+ // so that a failed strchr() never turns into the invalid pointer NULL + 1.
+ for (Py_ssize_t i = 0; i < lineno - 1; i++) {
+ char *new_line = strchr(cur_line, '\n');
+ if (new_line == NULL) {
+ break;
+ }
+ cur_line = new_line + 1;
+ }
+
+ char *next_newline;
+ if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
+ next_newline = cur_line + strlen(cur_line);
+ }
+ // Undecodable bytes are replaced rather than raising.
+ return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
+}
+
+// Set an exception of type `errtype` whose args are
+// (message, (filename, lineno, col, error_line, end_lineno, end_col)).
+// The message is built from `errmsg` and `va` with PyUnicode_FromFormatV
+// and is prefixed with "f-string: " when p->start_rule is Py_fstring_input.
+// CURRENT_POS for end_lineno / end_col_offset is replaced by the current
+// tokenizer position. Sets p->error_indicator. Always returns NULL; if an
+// intermediate step fails, the exception from that step is left pending.
+void *
+_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
+ Py_ssize_t lineno, Py_ssize_t col_offset,
+ Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
+ const char *errmsg, va_list va)
+{
+ PyObject *value = NULL;
+ PyObject *errstr = NULL;
+ PyObject *error_line = NULL;
+ PyObject *tmp = NULL;
+ p->error_indicator = 1;
+
+ if (end_lineno == CURRENT_POS) {
+ end_lineno = p->tok->lineno;
+ }
+ if (end_col_offset == CURRENT_POS) {
+ end_col_offset = p->tok->cur - p->tok->line_start;
+ }
+
+ if (p->start_rule == Py_fstring_input) {
+ const char *fstring_msg = "f-string: ";
+ Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
+
+ char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
+ if (!new_errmsg) {
+ return (void *) PyErr_NoMemory();
+ }
+
+ // Copy both strings into new buffer
+ memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
+ memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
+ new_errmsg[len] = 0;
+ // From here on errmsg points at heap memory that every exit path frees.
+ errmsg = new_errmsg;
+ }
+ errstr = PyUnicode_FromFormatV(errmsg, va);
+ if (!errstr) {
+ goto error;
+ }
+
+ if (p->tok->fp_interactive) {
+ error_line = get_error_line_from_tokenizer_buffers(p, lineno);
+ }
+ else if (p->start_rule == Py_file_input) {
+ error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
+ (int) lineno, p->tok->encoding);
+ }
+
+ if (!error_line) {
+ /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
+ then we need to find the error line from some other source, because
+ p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
+ failed or we're parsing from a string or the REPL. There's a third edge case where
+ we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
+ `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
+ does not physically exist */
+ assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
+
+ if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
+ Py_ssize_t size = p->tok->inp - p->tok->buf;
+ error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
+ }
+ else if (p->tok->fp == NULL || p->tok->fp == stdin) {
+ error_line = get_error_line_from_tokenizer_buffers(p, lineno);
+ }
+ else {
+ error_line = PyUnicode_FromStringAndSize("", 0);
+ }
+ if (!error_line) {
+ goto error;
+ }
+ }
+
+ if (p->start_rule == Py_fstring_input) {
+ col_offset -= p->starting_col_offset;
+ end_col_offset -= p->starting_col_offset;
+ }
+
+ Py_ssize_t col_number = col_offset;
+ Py_ssize_t end_col_number = end_col_offset;
+
+ // With a known encoding, convert the byte offsets into character offsets
+ // within error_line.
+ if (p->tok->encoding != NULL) {
+ col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
+ if (col_number < 0) {
+ goto error;
+ }
+ if (end_col_number > 0) {
+ // Named differently from the end_col_offset parameter to avoid
+ // shadowing it.
+ Py_ssize_t end_col_chars = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
+ if (end_col_chars < 0) {
+ goto error;
+ } else {
+ end_col_number = end_col_chars;
+ }
+ }
+ }
+ // The four integers are Py_ssize_t, so they use the `n` format unit; `i`
+ // would read them as C int. `N` hands the reference to error_line over to
+ // the new tuple.
+ tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
+ if (!tmp) {
+ goto error;
+ }
+ // tmp now owns error_line. Forget the local pointer so that the error
+ // path below cannot release it a second time after Py_DECREF(tmp).
+ error_line = NULL;
+ value = PyTuple_Pack(2, errstr, tmp);
+ Py_DECREF(tmp);
+ if (!value) {
+ goto error;
+ }
+ PyErr_SetObject(errtype, value);
+
+ Py_DECREF(errstr);
+ Py_DECREF(value);
+ if (p->start_rule == Py_fstring_input) {
+ PyMem_Free((void *)errmsg);
+ }
+ return NULL;
+
+error:
+ Py_XDECREF(errstr);
+ Py_XDECREF(error_line);
+ if (p->start_rule == Py_fstring_input) {
+ PyMem_Free((void *)errmsg);
+ }
+ return NULL;
+}
+
+// Make sure an exception describing the parse failure is set. The choice
+// is made in this order: an already pending exception, a start-of-input
+// error, EOF / unclosed bracket, unexpected indent or unindent, and finally
+// a generic "invalid syntax" located at last_token.
+void
+_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
+ // Existing syntax error
+ if (PyErr_Occurred()) {
+ // Prioritize tokenizer errors to custom syntax errors raised
+ // on the second phase only if the errors come from the parser.
+ if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
+ _PyPegen_tokenize_full_source_to_check_for_errors(p);
+ }
+ // Propagate the existing syntax error.
+ return;
+ }
+ // Initialization error
+ // NOTE(review): there is no return after this raise, so control continues
+ // into the checks below; confirm that this is intended.
+ if (p->fill == 0) {
+ RAISE_SYNTAX_ERROR("error at start before reading any input");
+ }
+ // Parser encountered EOF (End of File) unexpectedly
+ if (p->tok->done == E_EOF) {
+ if (p->tok->level) {
+ raise_unclosed_parentheses_error(p);
+ } else {
+ RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+ }
+ return;
+ }
+ // Indentation error in the tokenizer
+ if (last_token->type == INDENT || last_token->type == DEDENT) {
+ RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
+ return;
+ }
+ // Unknown error (generic case)
+
+ // Use the last token we found on the first pass to avoid reporting
+ // incorrect locations for generic syntax errors just because we reached
+ // further away when trying to find specific syntax errors in the second
+ // pass.
+ RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
+ // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
+ // generic SyntaxError we just raised if errors are found.
+ _PyPegen_tokenize_full_source_to_check_for_errors(p);
+}
\ No newline at end of file
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 76a22dab65994..672fdb92ec86f 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -86,7 +86,7 @@ tok_new(void)
tok->async_def_indent = 0;
tok->async_def_nl = 0;
tok->interactive_underflow = IUNDERFLOW_NORMAL;
-
+ tok->str = NULL;
return tok;
}

diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 677f9dba490be..61f0a6138f676 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -71,7 +71,7 @@ struct tok_state {
PyObject *decoding_readline; /* open(...).readline */
PyObject *decoding_buffer;
const char* enc; /* Encoding for the current str. */
- char* str;
+ char* str; /* Source string being tokenized (if tokenizing from a string)*/
char* input; /* Tokenizer's newline translated copy of the string. */

int type_comments; /* Whether to look for type comments */

_______________________________________________
Python-checkins mailing list
Python-checkins@python.org
https://mail.python.org/mailman/listinfo/python-checkins