Refactor parser compilation units into specific components (GH-29676)
https://github.com/python/cpython/commit/c9c4444d9f11ae80c2c4cc7d40b6718419d81a97
commit: c9c4444d9f11ae80c2c4cc7d40b6718419d81a97
branch: main
author: Pablo Galindo Salgado <Pablogsal@gmail.com>
committer: pablogsal <Pablogsal@gmail.com>
date: 2021-11-21T01:08:50Z
summary:

Refactor parser compilation units into specific components (GH-29676)

files:
A Parser/action_helpers.c
A Parser/pegen_errors.c
M Makefile.pre.in
M PCbuild/_freeze_module.vcxproj
M PCbuild/pythoncore.vcxproj
M PCbuild/pythoncore.vcxproj.filters
M Parser/pegen.c
M Parser/pegen.h
M Tools/peg_generator/Makefile
M Tools/peg_generator/pegen/build.py
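
For orientation before the diff: the refactor splits the former Parser/pegen.c into three compilation units. Parser/pegen.c keeps the tokenizer driver and the parser entry points, the new Parser/pegen_errors.c takes the tokenizer and syntax-error machinery, and the new Parser/action_helpers.c takes the helpers that grammar actions call to build AST pieces. The C sketch below is illustrative only: the two _PyPegen_* helper signatures are taken from the diff, while the action function itself and the rule it stands in for are hypothetical.

#include "pegen.h"

/* Hypothetical grammar action showing the kind of call sites that now
 * resolve into Parser/action_helpers.c. Only the two _PyPegen_* helpers
 * are real; the function name and parameters are made up for illustration. */
static asdl_seq *
example_collect_items(Parser *p, void *item, asdl_seq *rest)
{
    if (rest == NULL) {
        /* Allocate a one-element sequence in the parser's arena. */
        return _PyPegen_singleton_seq(p, item);
    }
    /* Otherwise copy rest and prepend item to the copy. */
    return _PyPegen_seq_insert_in_front(p, item, rest);
}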

diff --git a/Makefile.pre.in b/Makefile.pre.in
index 11ffdaabc617b..fc4def8f5d12f 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -331,6 +331,8 @@ LIBFFI_INCLUDEDIR= @LIBFFI_INCLUDEDIR@

PEGEN_OBJS= \
Parser/pegen.o \
+ Parser/pegen_errors.o \
+ Parser/action_helpers.o \
Parser/parser.o \
Parser/string_parser.o \
Parser/peg_api.o
diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj
index d33e07c54b8c9..6a91776b9d8e9 100644
--- a/PCbuild/_freeze_module.vcxproj
+++ b/PCbuild/_freeze_module.vcxproj
@@ -163,6 +163,8 @@
<ClCompile Include="..\Parser\parser.c" />
<ClCompile Include="..\Parser\peg_api.c" />
<ClCompile Include="..\Parser\pegen.c" />
+ <ClCompile Include="..\Parser\pegen_errors.c" />
+ <ClCompile Include="..\Parser\action_helpers.c" />
<ClCompile Include="..\Parser\string_parser.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index b65998186927b..70f05563fa391 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -433,6 +433,8 @@
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\Parser\pegen.c" />
+ <ClCompile Include="..\Parser\pegen_errors.c" />
+ <ClCompile Include="..\Parser\action_helpers.c" />
<ClCompile Include="..\Parser\parser.c" />
<ClCompile Include="..\Parser\string_parser.c" />
<ClCompile Include="..\Parser\peg_api.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 62aab5bccf9ef..b19f0279ec311 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -1205,6 +1205,12 @@
<ClCompile Include="..\Parser\pegen.c">
<Filter>Parser</Filter>
</ClCompile>
+ <ClCompile Include="..\Parser\pegen_errors.c">
+ <Filter>Parser</Filter>
+ </ClCompile>
+ <ClCompile Include="..\Parser\action_helpers.c">
+ <Filter>Parser</Filter>
+ </ClCompile>
<ClCompile Include="..\Parser\peg_api.c">
<Filter>Parser</Filter>
</ClCompile>
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
new file mode 100644
index 0000000000000..e5d7b667f7f5e
--- /dev/null
+++ b/Parser/action_helpers.c
@@ -0,0 +1,1289 @@
+#include <Python.h>
+
+#include "pegen.h"
+#include "string_parser.h"
+
+static PyObject *
+_create_dummy_identifier(Parser *p)
+{
+ return _PyPegen_new_identifier(p, "");
+}
+
+void *
+_PyPegen_dummy_name(Parser *p, ...)
+{
+ static void *cache = NULL;
+
+ if (cache != NULL) {
+ return cache;
+ }
+
+ PyObject *id = _create_dummy_identifier(p);
+ if (!id) {
+ return NULL;
+ }
+ cache = _PyAST_Name(id, Load, 1, 0, 1, 0, p->arena);
+ return cache;
+}
+
+/* Creates a single-element asdl_seq* that contains a */
+asdl_seq *
+_PyPegen_singleton_seq(Parser *p, void *a)
+{
+ assert(a != NULL);
+ asdl_seq *seq = (asdl_seq*)_Py_asdl_generic_seq_new(1, p->arena);
+ if (!seq) {
+ return NULL;
+ }
+ asdl_seq_SET_UNTYPED(seq, 0, a);
+ return seq;
+}
+
+/* Creates a copy of seq and prepends a to it */
+asdl_seq *
+_PyPegen_seq_insert_in_front(Parser *p, void *a, asdl_seq *seq)
+{
+ assert(a != NULL);
+ if (!seq) {
+ return _PyPegen_singleton_seq(p, a);
+ }
+
+ asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(asdl_seq_LEN(seq) + 1, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+
+ asdl_seq_SET_UNTYPED(new_seq, 0, a);
+ for (Py_ssize_t i = 1, l = asdl_seq_LEN(new_seq); i < l; i++) {
+ asdl_seq_SET_UNTYPED(new_seq, i, asdl_seq_GET_UNTYPED(seq, i - 1));
+ }
+ return new_seq;
+}
+
+/* Creates a copy of seq and appends a to it */
+asdl_seq *
+_PyPegen_seq_append_to_end(Parser *p, asdl_seq *seq, void *a)
+{
+ assert(a != NULL);
+ if (!seq) {
+ return _PyPegen_singleton_seq(p, a);
+ }
+
+ asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(asdl_seq_LEN(seq) + 1, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+
+ for (Py_ssize_t i = 0, l = asdl_seq_LEN(new_seq); i + 1 < l; i++) {
+ asdl_seq_SET_UNTYPED(new_seq, i, asdl_seq_GET_UNTYPED(seq, i));
+ }
+ asdl_seq_SET_UNTYPED(new_seq, asdl_seq_LEN(new_seq) - 1, a);
+ return new_seq;
+}
+
+static Py_ssize_t
+_get_flattened_seq_size(asdl_seq *seqs)
+{
+ Py_ssize_t size = 0;
+ for (Py_ssize_t i = 0, l = asdl_seq_LEN(seqs); i < l; i++) {
+ asdl_seq *inner_seq = asdl_seq_GET_UNTYPED(seqs, i);
+ size += asdl_seq_LEN(inner_seq);
+ }
+ return size;
+}
+
+/* Flattens an asdl_seq* of asdl_seq*s */
+asdl_seq *
+_PyPegen_seq_flatten(Parser *p, asdl_seq *seqs)
+{
+ Py_ssize_t flattened_seq_size = _get_flattened_seq_size(seqs);
+ assert(flattened_seq_size > 0);
+
+ asdl_seq *flattened_seq = (asdl_seq*)_Py_asdl_generic_seq_new(flattened_seq_size, p->arena);
+ if (!flattened_seq) {
+ return NULL;
+ }
+
+ int flattened_seq_idx = 0;
+ for (Py_ssize_t i = 0, l = asdl_seq_LEN(seqs); i < l; i++) {
+ asdl_seq *inner_seq = asdl_seq_GET_UNTYPED(seqs, i);
+ for (Py_ssize_t j = 0, li = asdl_seq_LEN(inner_seq); j < li; j++) {
+ asdl_seq_SET_UNTYPED(flattened_seq, flattened_seq_idx++, asdl_seq_GET_UNTYPED(inner_seq, j));
+ }
+ }
+ assert(flattened_seq_idx == flattened_seq_size);
+
+ return flattened_seq;
+}
+
+void *
+_PyPegen_seq_last_item(asdl_seq *seq)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ return asdl_seq_GET_UNTYPED(seq, len - 1);
+}
+
+void *
+_PyPegen_seq_first_item(asdl_seq *seq)
+{
+ return asdl_seq_GET_UNTYPED(seq, 0);
+}
+
+/* Creates a new name of the form <first_name>.<second_name> */
+expr_ty
+_PyPegen_join_names_with_dot(Parser *p, expr_ty first_name, expr_ty second_name)
+{
+ assert(first_name != NULL && second_name != NULL);
+ PyObject *first_identifier = first_name->v.Name.id;
+ PyObject *second_identifier = second_name->v.Name.id;
+
+ if (PyUnicode_READY(first_identifier) == -1) {
+ return NULL;
+ }
+ if (PyUnicode_READY(second_identifier) == -1) {
+ return NULL;
+ }
+ const char *first_str = PyUnicode_AsUTF8(first_identifier);
+ if (!first_str) {
+ return NULL;
+ }
+ const char *second_str = PyUnicode_AsUTF8(second_identifier);
+ if (!second_str) {
+ return NULL;
+ }
+ Py_ssize_t len = strlen(first_str) + strlen(second_str) + 1; // +1 for the dot
+
+ PyObject *str = PyBytes_FromStringAndSize(NULL, len);
+ if (!str) {
+ return NULL;
+ }
+
+ char *s = PyBytes_AS_STRING(str);
+ if (!s) {
+ return NULL;
+ }
+
+ strcpy(s, first_str);
+ s += strlen(first_str);
+ *s++ = '.';
+ strcpy(s, second_str);
+ s += strlen(second_str);
+ *s = '\0';
+
+ PyObject *uni = PyUnicode_DecodeUTF8(PyBytes_AS_STRING(str), PyBytes_GET_SIZE(str), NULL);
+ Py_DECREF(str);
+ if (!uni) {
+ return NULL;
+ }
+ PyUnicode_InternInPlace(&uni);
+ if (_PyArena_AddPyObject(p->arena, uni) < 0) {
+ Py_DECREF(uni);
+ return NULL;
+ }
+
+ return _PyAST_Name(uni, Load, EXTRA_EXPR(first_name, second_name));
+}
+
+/* Counts the total number of dots in seq's tokens */
+int
+_PyPegen_seq_count_dots(asdl_seq *seq)
+{
+ int number_of_dots = 0;
+ for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
+ Token *current_expr = asdl_seq_GET_UNTYPED(seq, i);
+ switch (current_expr->type) {
+ case ELLIPSIS:
+ number_of_dots += 3;
+ break;
+ case DOT:
+ number_of_dots += 1;
+ break;
+ default:
+ Py_UNREACHABLE();
+ }
+ }
+
+ return number_of_dots;
+}
+
+/* Creates an alias with '*' as the identifier name */
+alias_ty
+_PyPegen_alias_for_star(Parser *p, int lineno, int col_offset, int end_lineno,
+ int end_col_offset, PyArena *arena) {
+ PyObject *str = PyUnicode_InternFromString("*");
+ if (!str) {
+ return NULL;
+ }
+ if (_PyArena_AddPyObject(p->arena, str) < 0) {
+ Py_DECREF(str);
+ return NULL;
+ }
+ return _PyAST_alias(str, NULL, lineno, col_offset, end_lineno, end_col_offset, arena);
+}
+
+/* Creates a new asdl_seq* with the identifiers of all the names in seq */
+asdl_identifier_seq *
+_PyPegen_map_names_to_ids(Parser *p, asdl_expr_seq *seq)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ assert(len > 0);
+
+ asdl_identifier_seq *new_seq = _Py_asdl_identifier_seq_new(len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ expr_ty e = asdl_seq_GET(seq, i);
+ asdl_seq_SET(new_seq, i, e->v.Name.id);
+ }
+ return new_seq;
+}
+
+/* Constructs a CmpopExprPair */
+CmpopExprPair *
+_PyPegen_cmpop_expr_pair(Parser *p, cmpop_ty cmpop, expr_ty expr)
+{
+ assert(expr != NULL);
+ CmpopExprPair *a = _PyArena_Malloc(p->arena, sizeof(CmpopExprPair));
+ if (!a) {
+ return NULL;
+ }
+ a->cmpop = cmpop;
+ a->expr = expr;
+ return a;
+}
+
+asdl_int_seq *
+_PyPegen_get_cmpops(Parser *p, asdl_seq *seq)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ assert(len > 0);
+
+ asdl_int_seq *new_seq = _Py_asdl_int_seq_new(len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ CmpopExprPair *pair = asdl_seq_GET_UNTYPED(seq, i);
+ asdl_seq_SET(new_seq, i, pair->cmpop);
+ }
+ return new_seq;
+}
+
+asdl_expr_seq *
+_PyPegen_get_exprs(Parser *p, asdl_seq *seq)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ assert(len > 0);
+
+ asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ CmpopExprPair *pair = asdl_seq_GET_UNTYPED(seq, i);
+ asdl_seq_SET(new_seq, i, pair->expr);
+ }
+ return new_seq;
+}
+
+/* Creates an asdl_seq* where all the elements have been changed to have ctx as context */
+static asdl_expr_seq *
+_set_seq_context(Parser *p, asdl_expr_seq *seq, expr_context_ty ctx)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ if (len == 0) {
+ return NULL;
+ }
+
+ asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ expr_ty e = asdl_seq_GET(seq, i);
+ asdl_seq_SET(new_seq, i, _PyPegen_set_expr_context(p, e, ctx));
+ }
+ return new_seq;
+}
+
+static expr_ty
+_set_name_context(Parser *p, expr_ty e, expr_context_ty ctx)
+{
+ return _PyAST_Name(e->v.Name.id, ctx, EXTRA_EXPR(e, e));
+}
+
+static expr_ty
+_set_tuple_context(Parser *p, expr_ty e, expr_context_ty ctx)
+{
+ return _PyAST_Tuple(
+ _set_seq_context(p, e->v.Tuple.elts, ctx),
+ ctx,
+ EXTRA_EXPR(e, e));
+}
+
+static expr_ty
+_set_list_context(Parser *p, expr_ty e, expr_context_ty ctx)
+{
+ return _PyAST_List(
+ _set_seq_context(p, e->v.List.elts, ctx),
+ ctx,
+ EXTRA_EXPR(e, e));
+}
+
+static expr_ty
+_set_subscript_context(Parser *p, expr_ty e, expr_context_ty ctx)
+{
+ return _PyAST_Subscript(e->v.Subscript.value, e->v.Subscript.slice,
+ ctx, EXTRA_EXPR(e, e));
+}
+
+static expr_ty
+_set_attribute_context(Parser *p, expr_ty e, expr_context_ty ctx)
+{
+ return _PyAST_Attribute(e->v.Attribute.value, e->v.Attribute.attr,
+ ctx, EXTRA_EXPR(e, e));
+}
+
+static expr_ty
+_set_starred_context(Parser *p, expr_ty e, expr_context_ty ctx)
+{
+ return _PyAST_Starred(_PyPegen_set_expr_context(p, e->v.Starred.value, ctx),
+ ctx, EXTRA_EXPR(e, e));
+}
+
+/* Creates an `expr_ty` equivalent to `expr` but with `ctx` as context */
+expr_ty
+_PyPegen_set_expr_context(Parser *p, expr_ty expr, expr_context_ty ctx)
+{
+ assert(expr != NULL);
+
+ expr_ty new = NULL;
+ switch (expr->kind) {
+ case Name_kind:
+ new = _set_name_context(p, expr, ctx);
+ break;
+ case Tuple_kind:
+ new = _set_tuple_context(p, expr, ctx);
+ break;
+ case List_kind:
+ new = _set_list_context(p, expr, ctx);
+ break;
+ case Subscript_kind:
+ new = _set_subscript_context(p, expr, ctx);
+ break;
+ case Attribute_kind:
+ new = _set_attribute_context(p, expr, ctx);
+ break;
+ case Starred_kind:
+ new = _set_starred_context(p, expr, ctx);
+ break;
+ default:
+ new = expr;
+ }
+ return new;
+}
+
+/* Constructs a KeyValuePair that is used when parsing a dict's key value pairs */
+KeyValuePair *
+_PyPegen_key_value_pair(Parser *p, expr_ty key, expr_ty value)
+{
+ KeyValuePair *a = _PyArena_Malloc(p->arena, sizeof(KeyValuePair));
+ if (!a) {
+ return NULL;
+ }
+ a->key = key;
+ a->value = value;
+ return a;
+}
+
+/* Extracts all keys from an asdl_seq* of KeyValuePair*'s */
+asdl_expr_seq *
+_PyPegen_get_keys(Parser *p, asdl_seq *seq)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ KeyValuePair *pair = asdl_seq_GET_UNTYPED(seq, i);
+ asdl_seq_SET(new_seq, i, pair->key);
+ }
+ return new_seq;
+}
+
+/* Extracts all values from an asdl_seq* of KeyValuePair*'s */
+asdl_expr_seq *
+_PyPegen_get_values(Parser *p, asdl_seq *seq)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ KeyValuePair *pair = asdl_seq_GET_UNTYPED(seq, i);
+ asdl_seq_SET(new_seq, i, pair->value);
+ }
+ return new_seq;
+}
+
+/* Constructs a KeyPatternPair that is used when parsing mapping & class patterns */
+KeyPatternPair *
+_PyPegen_key_pattern_pair(Parser *p, expr_ty key, pattern_ty pattern)
+{
+ KeyPatternPair *a = _PyArena_Malloc(p->arena, sizeof(KeyPatternPair));
+ if (!a) {
+ return NULL;
+ }
+ a->key = key;
+ a->pattern = pattern;
+ return a;
+}
+
+/* Extracts all keys from an asdl_seq* of KeyPatternPair*'s */
+asdl_expr_seq *
+_PyPegen_get_pattern_keys(Parser *p, asdl_seq *seq)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ KeyPatternPair *pair = asdl_seq_GET_UNTYPED(seq, i);
+ asdl_seq_SET(new_seq, i, pair->key);
+ }
+ return new_seq;
+}
+
+/* Extracts all patterns from an asdl_seq* of KeyPatternPair*'s */
+asdl_pattern_seq *
+_PyPegen_get_patterns(Parser *p, asdl_seq *seq)
+{
+ Py_ssize_t len = asdl_seq_LEN(seq);
+ asdl_pattern_seq *new_seq = _Py_asdl_pattern_seq_new(len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ KeyPatternPair *pair = asdl_seq_GET_UNTYPED(seq, i);
+ asdl_seq_SET(new_seq, i, pair->pattern);
+ }
+ return new_seq;
+}
+
+/* Constructs a NameDefaultPair */
+NameDefaultPair *
+_PyPegen_name_default_pair(Parser *p, arg_ty arg, expr_ty value, Token *tc)
+{
+ NameDefaultPair *a = _PyArena_Malloc(p->arena, sizeof(NameDefaultPair));
+ if (!a) {
+ return NULL;
+ }
+ a->arg = _PyPegen_add_type_comment_to_arg(p, arg, tc);
+ a->value = value;
+ return a;
+}
+
+/* Constructs a SlashWithDefault */
+SlashWithDefault *
+_PyPegen_slash_with_default(Parser *p, asdl_arg_seq *plain_names, asdl_seq *names_with_defaults)
+{
+ SlashWithDefault *a = _PyArena_Malloc(p->arena, sizeof(SlashWithDefault));
+ if (!a) {
+ return NULL;
+ }
+ a->plain_names = plain_names;
+ a->names_with_defaults = names_with_defaults;
+ return a;
+}
+
+/* Constructs a StarEtc */
+StarEtc *
+_PyPegen_star_etc(Parser *p, arg_ty vararg, asdl_seq *kwonlyargs, arg_ty kwarg)
+{
+ StarEtc *a = _PyArena_Malloc(p->arena, sizeof(StarEtc));
+ if (!a) {
+ return NULL;
+ }
+ a->vararg = vararg;
+ a->kwonlyargs = kwonlyargs;
+ a->kwarg = kwarg;
+ return a;
+}
+
+asdl_seq *
+_PyPegen_join_sequences(Parser *p, asdl_seq *a, asdl_seq *b)
+{
+ Py_ssize_t first_len = asdl_seq_LEN(a);
+ Py_ssize_t second_len = asdl_seq_LEN(b);
+ asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(first_len + second_len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+
+ int k = 0;
+ for (Py_ssize_t i = 0; i < first_len; i++) {
+ asdl_seq_SET_UNTYPED(new_seq, k++, asdl_seq_GET_UNTYPED(a, i));
+ }
+ for (Py_ssize_t i = 0; i < second_len; i++) {
+ asdl_seq_SET_UNTYPED(new_seq, k++, asdl_seq_GET_UNTYPED(b, i));
+ }
+
+ return new_seq;
+}
+
+static asdl_arg_seq*
+_get_names(Parser *p, asdl_seq *names_with_defaults)
+{
+ Py_ssize_t len = asdl_seq_LEN(names_with_defaults);
+ asdl_arg_seq *seq = _Py_asdl_arg_seq_new(len, p->arena);
+ if (!seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ NameDefaultPair *pair = asdl_seq_GET_UNTYPED(names_with_defaults, i);
+ asdl_seq_SET(seq, i, pair->arg);
+ }
+ return seq;
+}
+
+static asdl_expr_seq *
+_get_defaults(Parser *p, asdl_seq *names_with_defaults)
+{
+ Py_ssize_t len = asdl_seq_LEN(names_with_defaults);
+ asdl_expr_seq *seq = _Py_asdl_expr_seq_new(len, p->arena);
+ if (!seq) {
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < len; i++) {
+ NameDefaultPair *pair = asdl_seq_GET_UNTYPED(names_with_defaults, i);
+ asdl_seq_SET(seq, i, pair->value);
+ }
+ return seq;
+}
+
+static int
+_make_posonlyargs(Parser *p,
+ asdl_arg_seq *slash_without_default,
+ SlashWithDefault *slash_with_default,
+ asdl_arg_seq **posonlyargs) {
+ if (slash_without_default != NULL) {
+ *posonlyargs = slash_without_default;
+ }
+ else if (slash_with_default != NULL) {
+ asdl_arg_seq *slash_with_default_names =
+ _get_names(p, slash_with_default->names_with_defaults);
+ if (!slash_with_default_names) {
+ return -1;
+ }
+ *posonlyargs = (asdl_arg_seq*)_PyPegen_join_sequences(
+ p,
+ (asdl_seq*)slash_with_default->plain_names,
+ (asdl_seq*)slash_with_default_names);
+ }
+ else {
+ *posonlyargs = _Py_asdl_arg_seq_new(0, p->arena);
+ }
+ return *posonlyargs == NULL ? -1 : 0;
+}
+
+static int
+_make_posargs(Parser *p,
+ asdl_arg_seq *plain_names,
+ asdl_seq *names_with_default,
+ asdl_arg_seq **posargs) {
+ if (plain_names != NULL && names_with_default != NULL) {
+ asdl_arg_seq *names_with_default_names = _get_names(p, names_with_default);
+ if (!names_with_default_names) {
+ return -1;
+ }
+ *posargs = (asdl_arg_seq*)_PyPegen_join_sequences(
+ p,(asdl_seq*)plain_names, (asdl_seq*)names_with_default_names);
+ }
+ else if (plain_names == NULL && names_with_default != NULL) {
+ *posargs = _get_names(p, names_with_default);
+ }
+ else if (plain_names != NULL && names_with_default == NULL) {
+ *posargs = plain_names;
+ }
+ else {
+ *posargs = _Py_asdl_arg_seq_new(0, p->arena);
+ }
+ return *posargs == NULL ? -1 : 0;
+}
+
+static int
+_make_posdefaults(Parser *p,
+ SlashWithDefault *slash_with_default,
+ asdl_seq *names_with_default,
+ asdl_expr_seq **posdefaults) {
+ if (slash_with_default != NULL && names_with_default != NULL) {
+ asdl_expr_seq *slash_with_default_values =
+ _get_defaults(p, slash_with_default->names_with_defaults);
+ if (!slash_with_default_values) {
+ return -1;
+ }
+ asdl_expr_seq *names_with_default_values = _get_defaults(p, names_with_default);
+ if (!names_with_default_values) {
+ return -1;
+ }
+ *posdefaults = (asdl_expr_seq*)_PyPegen_join_sequences(
+ p,
+ (asdl_seq*)slash_with_default_values,
+ (asdl_seq*)names_with_default_values);
+ }
+ else if (slash_with_default == NULL && names_with_default != NULL) {
+ *posdefaults = _get_defaults(p, names_with_default);
+ }
+ else if (slash_with_default != NULL && names_with_default == NULL) {
+ *posdefaults = _get_defaults(p, slash_with_default->names_with_defaults);
+ }
+ else {
+ *posdefaults = _Py_asdl_expr_seq_new(0, p->arena);
+ }
+ return *posdefaults == NULL ? -1 : 0;
+}
+
+static int
+_make_kwargs(Parser *p, StarEtc *star_etc,
+ asdl_arg_seq **kwonlyargs,
+ asdl_expr_seq **kwdefaults) {
+ if (star_etc != NULL && star_etc->kwonlyargs != NULL) {
+ *kwonlyargs = _get_names(p, star_etc->kwonlyargs);
+ }
+ else {
+ *kwonlyargs = _Py_asdl_arg_seq_new(0, p->arena);
+ }
+
+ if (*kwonlyargs == NULL) {
+ return -1;
+ }
+
+ if (star_etc != NULL && star_etc->kwonlyargs != NULL) {
+ *kwdefaults = _get_defaults(p, star_etc->kwonlyargs);
+ }
+ else {
+ *kwdefaults = _Py_asdl_expr_seq_new(0, p->arena);
+ }
+
+ if (*kwdefaults == NULL) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Constructs an arguments_ty object out of all the parsed constructs in the parameters rule */
+arguments_ty
+_PyPegen_make_arguments(Parser *p, asdl_arg_seq *slash_without_default,
+ SlashWithDefault *slash_with_default, asdl_arg_seq *plain_names,
+ asdl_seq *names_with_default, StarEtc *star_etc)
+{
+ asdl_arg_seq *posonlyargs;
+ if (_make_posonlyargs(p, slash_without_default, slash_with_default, &posonlyargs) == -1) {
+ return NULL;
+ }
+
+ asdl_arg_seq *posargs;
+ if (_make_posargs(p, plain_names, names_with_default, &posargs) == -1) {
+ return NULL;
+ }
+
+ asdl_expr_seq *posdefaults;
+ if (_make_posdefaults(p,slash_with_default, names_with_default, &posdefaults) == -1) {
+ return NULL;
+ }
+
+ arg_ty vararg = NULL;
+ if (star_etc != NULL && star_etc->vararg != NULL) {
+ vararg = star_etc->vararg;
+ }
+
+ asdl_arg_seq *kwonlyargs;
+ asdl_expr_seq *kwdefaults;
+ if (_make_kwargs(p, star_etc, &kwonlyargs, &kwdefaults) == -1) {
+ return NULL;
+ }
+
+ arg_ty kwarg = NULL;
+ if (star_etc != NULL && star_etc->kwarg != NULL) {
+ kwarg = star_etc->kwarg;
+ }
+
+ return _PyAST_arguments(posonlyargs, posargs, vararg, kwonlyargs,
+ kwdefaults, kwarg, posdefaults, p->arena);
+}
+
+
+/* Constructs an empty arguments_ty object, that gets used when a function accepts no
+ * arguments. */
+arguments_ty
+_PyPegen_empty_arguments(Parser *p)
+{
+ asdl_arg_seq *posonlyargs = _Py_asdl_arg_seq_new(0, p->arena);
+ if (!posonlyargs) {
+ return NULL;
+ }
+ asdl_arg_seq *posargs = _Py_asdl_arg_seq_new(0, p->arena);
+ if (!posargs) {
+ return NULL;
+ }
+ asdl_expr_seq *posdefaults = _Py_asdl_expr_seq_new(0, p->arena);
+ if (!posdefaults) {
+ return NULL;
+ }
+ asdl_arg_seq *kwonlyargs = _Py_asdl_arg_seq_new(0, p->arena);
+ if (!kwonlyargs) {
+ return NULL;
+ }
+ asdl_expr_seq *kwdefaults = _Py_asdl_expr_seq_new(0, p->arena);
+ if (!kwdefaults) {
+ return NULL;
+ }
+
+ return _PyAST_arguments(posonlyargs, posargs, NULL, kwonlyargs,
+ kwdefaults, NULL, posdefaults, p->arena);
+}
+
+/* Encapsulates the value of an operator_ty into an AugOperator struct */
+AugOperator *
+_PyPegen_augoperator(Parser *p, operator_ty kind)
+{
+ AugOperator *a = _PyArena_Malloc(p->arena, sizeof(AugOperator));
+ if (!a) {
+ return NULL;
+ }
+ a->kind = kind;
+ return a;
+}
+
+/* Construct a FunctionDef equivalent to function_def, but with decorators */
+stmt_ty
+_PyPegen_function_def_decorators(Parser *p, asdl_expr_seq *decorators, stmt_ty function_def)
+{
+ assert(function_def != NULL);
+ if (function_def->kind == AsyncFunctionDef_kind) {
+ return _PyAST_AsyncFunctionDef(
+ function_def->v.FunctionDef.name, function_def->v.FunctionDef.args,
+ function_def->v.FunctionDef.body, decorators, function_def->v.FunctionDef.returns,
+ function_def->v.FunctionDef.type_comment, function_def->lineno,
+ function_def->col_offset, function_def->end_lineno, function_def->end_col_offset,
+ p->arena);
+ }
+
+ return _PyAST_FunctionDef(
+ function_def->v.FunctionDef.name, function_def->v.FunctionDef.args,
+ function_def->v.FunctionDef.body, decorators,
+ function_def->v.FunctionDef.returns,
+ function_def->v.FunctionDef.type_comment, function_def->lineno,
+ function_def->col_offset, function_def->end_lineno,
+ function_def->end_col_offset, p->arena);
+}
+
+/* Construct a ClassDef equivalent to class_def, but with decorators */
+stmt_ty
+_PyPegen_class_def_decorators(Parser *p, asdl_expr_seq *decorators, stmt_ty class_def)
+{
+ assert(class_def != NULL);
+ return _PyAST_ClassDef(
+ class_def->v.ClassDef.name, class_def->v.ClassDef.bases,
+ class_def->v.ClassDef.keywords, class_def->v.ClassDef.body, decorators,
+ class_def->lineno, class_def->col_offset, class_def->end_lineno,
+ class_def->end_col_offset, p->arena);
+}
+
+/* Construct a KeywordOrStarred */
+KeywordOrStarred *
+_PyPegen_keyword_or_starred(Parser *p, void *element, int is_keyword)
+{
+ KeywordOrStarred *a = _PyArena_Malloc(p->arena, sizeof(KeywordOrStarred));
+ if (!a) {
+ return NULL;
+ }
+ a->element = element;
+ a->is_keyword = is_keyword;
+ return a;
+}
+
+/* Get the number of starred expressions in an asdl_seq* of KeywordOrStarred*s */
+static int
+_seq_number_of_starred_exprs(asdl_seq *seq)
+{
+ int n = 0;
+ for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
+ KeywordOrStarred *k = asdl_seq_GET_UNTYPED(seq, i);
+ if (!k->is_keyword) {
+ n++;
+ }
+ }
+ return n;
+}
+
+/* Extract the starred expressions of an asdl_seq* of KeywordOrStarred*s */
+asdl_expr_seq *
+_PyPegen_seq_extract_starred_exprs(Parser *p, asdl_seq *kwargs)
+{
+ int new_len = _seq_number_of_starred_exprs(kwargs);
+ if (new_len == 0) {
+ return NULL;
+ }
+ asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(new_len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+
+ int idx = 0;
+ for (Py_ssize_t i = 0, len = asdl_seq_LEN(kwargs); i < len; i++) {
+ KeywordOrStarred *k = asdl_seq_GET_UNTYPED(kwargs, i);
+ if (!k->is_keyword) {
+ asdl_seq_SET(new_seq, idx++, k->element);
+ }
+ }
+ return new_seq;
+}
+
+/* Return a new asdl_seq* with only the keywords in kwargs */
+asdl_keyword_seq*
+_PyPegen_seq_delete_starred_exprs(Parser *p, asdl_seq *kwargs)
+{
+ Py_ssize_t len = asdl_seq_LEN(kwargs);
+ Py_ssize_t new_len = len - _seq_number_of_starred_exprs(kwargs);
+ if (new_len == 0) {
+ return NULL;
+ }
+ asdl_keyword_seq *new_seq = _Py_asdl_keyword_seq_new(new_len, p->arena);
+ if (!new_seq) {
+ return NULL;
+ }
+
+ int idx = 0;
+ for (Py_ssize_t i = 0; i < len; i++) {
+ KeywordOrStarred *k = asdl_seq_GET_UNTYPED(kwargs, i);
+ if (k->is_keyword) {
+ asdl_seq_SET(new_seq, idx++, k->element);
+ }
+ }
+ return new_seq;
+}
+
+expr_ty
+_PyPegen_concatenate_strings(Parser *p, asdl_seq *strings)
+{
+ Py_ssize_t len = asdl_seq_LEN(strings);
+ assert(len > 0);
+
+ Token *first = asdl_seq_GET_UNTYPED(strings, 0);
+ Token *last = asdl_seq_GET_UNTYPED(strings, len - 1);
+
+ int bytesmode = 0;
+ PyObject *bytes_str = NULL;
+
+ FstringParser state;
+ _PyPegen_FstringParser_Init(&state);
+
+ for (Py_ssize_t i = 0; i < len; i++) {
+ Token *t = asdl_seq_GET_UNTYPED(strings, i);
+
+ int this_bytesmode;
+ int this_rawmode;
+ PyObject *s;
+ const char *fstr;
+ Py_ssize_t fstrlen = -1;
+
+ if (_PyPegen_parsestr(p, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen, t) != 0) {
+ goto error;
+ }
+
+ /* Check that we are not mixing bytes with unicode. */
+ if (i != 0 && bytesmode != this_bytesmode) {
+ RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals");
+ Py_XDECREF(s);
+ goto error;
+ }
+ bytesmode = this_bytesmode;
+
+ if (fstr != NULL) {
+ assert(s == NULL && !bytesmode);
+
+ int result = _PyPegen_FstringParser_ConcatFstring(p, &state, &fstr, fstr + fstrlen,
+ this_rawmode, 0, first, t, last);
+ if (result < 0) {
+ goto error;
+ }
+ }
+ else {
+ /* String or byte string. */
+ assert(s != NULL && fstr == NULL);
+ assert(bytesmode ? PyBytes_CheckExact(s) : PyUnicode_CheckExact(s));
+
+ if (bytesmode) {
+ if (i == 0) {
+ bytes_str = s;
+ }
+ else {
+ PyBytes_ConcatAndDel(&bytes_str, s);
+ if (!bytes_str) {
+ goto error;
+ }
+ }
+ }
+ else {
+ /* This is a regular string. Concatenate it. */
+ if (_PyPegen_FstringParser_ConcatAndDel(&state, s) < 0) {
+ goto error;
+ }
+ }
+ }
+ }
+
+ if (bytesmode) {
+ if (_PyArena_AddPyObject(p->arena, bytes_str) < 0) {
+ goto error;
+ }
+ return _PyAST_Constant(bytes_str, NULL, first->lineno,
+ first->col_offset, last->end_lineno,
+ last->end_col_offset, p->arena);
+ }
+
+ return _PyPegen_FstringParser_Finish(p, &state, first, last);
+
+error:
+ Py_XDECREF(bytes_str);
+ _PyPegen_FstringParser_Dealloc(&state);
+ if (PyErr_Occurred()) {
+ _Pypegen_raise_decode_error(p);
+ }
+ return NULL;
+}
+
+expr_ty
+_PyPegen_ensure_imaginary(Parser *p, expr_ty exp)
+{
+ if (exp->kind != Constant_kind || !PyComplex_CheckExact(exp->v.Constant.value)) {
+ RAISE_SYNTAX_ERROR_KNOWN_LOCATION(exp, "imaginary number required in complex literal");
+ return NULL;
+ }
+ return exp;
+}
+
+expr_ty
+_PyPegen_ensure_real(Parser *p, expr_ty exp)
+{
+ if (exp->kind != Constant_kind || PyComplex_CheckExact(exp->v.Constant.value)) {
+ RAISE_SYNTAX_ERROR_KNOWN_LOCATION(exp, "real number required in complex literal");
+ return NULL;
+ }
+ return exp;
+}
+
+mod_ty
+_PyPegen_make_module(Parser *p, asdl_stmt_seq *a) {
+ asdl_type_ignore_seq *type_ignores = NULL;
+ Py_ssize_t num = p->type_ignore_comments.num_items;
+ if (num > 0) {
+ // Turn the raw (comment, lineno) pairs into TypeIgnore objects in the arena
+ type_ignores = _Py_asdl_type_ignore_seq_new(num, p->arena);
+ if (type_ignores == NULL) {
+ return NULL;
+ }
+ for (int i = 0; i < num; i++) {
+ PyObject *tag = _PyPegen_new_type_comment(p, p->type_ignore_comments.items[i].comment);
+ if (tag == NULL) {
+ return NULL;
+ }
+ type_ignore_ty ti = _PyAST_TypeIgnore(p->type_ignore_comments.items[i].lineno,
+ tag, p->arena);
+ if (ti == NULL) {
+ return NULL;
+ }
+ asdl_seq_SET(type_ignores, i, ti);
+ }
+ }
+ return _PyAST_Module(a, type_ignores, p->arena);
+}
+
+PyObject *
+_PyPegen_new_type_comment(Parser *p, const char *s)
+{
+ PyObject *res = PyUnicode_DecodeUTF8(s, strlen(s), NULL);
+ if (res == NULL) {
+ return NULL;
+ }
+ if (_PyArena_AddPyObject(p->arena, res) < 0) {
+ Py_DECREF(res);
+ return NULL;
+ }
+ return res;
+}
+
+arg_ty
+_PyPegen_add_type_comment_to_arg(Parser *p, arg_ty a, Token *tc)
+{
+ if (tc == NULL) {
+ return a;
+ }
+ const char *bytes = PyBytes_AsString(tc->bytes);
+ if (bytes == NULL) {
+ return NULL;
+ }
+ PyObject *tco = _PyPegen_new_type_comment(p, bytes);
+ if (tco == NULL) {
+ return NULL;
+ }
+ return _PyAST_arg(a->arg, a->annotation, tco,
+ a->lineno, a->col_offset, a->end_lineno, a->end_col_offset,
+ p->arena);
+}
+
+/* Checks if the NOTEQUAL token is valid given the current parser flags
+0 indicates success and nonzero indicates failure (an exception may be set) */
+int
+_PyPegen_check_barry_as_flufl(Parser *p, Token* t) {
+ assert(t->bytes != NULL);
+ assert(t->type == NOTEQUAL);
+
+ const char* tok_str = PyBytes_AS_STRING(t->bytes);
+ if (p->flags & PyPARSE_BARRY_AS_BDFL && strcmp(tok_str, "<>") != 0) {
+ RAISE_SYNTAX_ERROR("with Barry as BDFL, use '<>' instead of '!='");
+ return -1;
+ }
+ if (!(p->flags & PyPARSE_BARRY_AS_BDFL)) {
+ return strcmp(tok_str, "!=");
+ }
+ return 0;
+}
+
+int
+_PyPegen_check_legacy_stmt(Parser *p, expr_ty name) {
+ if (name->kind != Name_kind) {
+ return 0;
+ }
+ const char* candidates[2] = {"print", "exec"};
+ for (int i=0; i<2; i++) {
+ if (PyUnicode_CompareWithASCIIString(name->v.Name.id, candidates[i]) == 0) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+const char *
+_PyPegen_get_expr_name(expr_ty e)
+{
+ assert(e != NULL);
+ switch (e->kind) {
+ case Attribute_kind:
+ return "attribute";
+ case Subscript_kind:
+ return "subscript";
+ case Starred_kind:
+ return "starred";
+ case Name_kind:
+ return "name";
+ case List_kind:
+ return "list";
+ case Tuple_kind:
+ return "tuple";
+ case Lambda_kind:
+ return "lambda";
+ case Call_kind:
+ return "function call";
+ case BoolOp_kind:
+ case BinOp_kind:
+ case UnaryOp_kind:
+ return "expression";
+ case GeneratorExp_kind:
+ return "generator expression";
+ case Yield_kind:
+ case YieldFrom_kind:
+ return "yield expression";
+ case Await_kind:
+ return "await expression";
+ case ListComp_kind:
+ return "list comprehension";
+ case SetComp_kind:
+ return "set comprehension";
+ case DictComp_kind:
+ return "dict comprehension";
+ case Dict_kind:
+ return "dict literal";
+ case Set_kind:
+ return "set display";
+ case JoinedStr_kind:
+ case FormattedValue_kind:
+ return "f-string expression";
+ case Constant_kind: {
+ PyObject *value = e->v.Constant.value;
+ if (value == Py_None) {
+ return "None";
+ }
+ if (value == Py_False) {
+ return "False";
+ }
+ if (value == Py_True) {
+ return "True";
+ }
+ if (value == Py_Ellipsis) {
+ return "ellipsis";
+ }
+ return "literal";
+ }
+ case Compare_kind:
+ return "comparison";
+ case IfExp_kind:
+ return "conditional expression";
+ case NamedExpr_kind:
+ return "named expression";
+ default:
+ PyErr_Format(PyExc_SystemError,
+ "unexpected expression in assignment %d (line %d)",
+ e->kind, e->lineno);
+ return NULL;
+ }
+}
+
+static inline expr_ty
+_PyPegen_get_last_comprehension_item(comprehension_ty comprehension) {
+ if (comprehension->ifs == NULL || asdl_seq_LEN(comprehension->ifs) == 0) {
+ return comprehension->iter;
+ }
+ return PyPegen_last_item(comprehension->ifs, expr_ty);
+}
+
+expr_ty _PyPegen_collect_call_seqs(Parser *p, asdl_expr_seq *a, asdl_seq *b,
+ int lineno, int col_offset, int end_lineno,
+ int end_col_offset, PyArena *arena) {
+ Py_ssize_t args_len = asdl_seq_LEN(a);
+ Py_ssize_t total_len = args_len;
+
+ if (b == NULL) {
+ return _PyAST_Call(_PyPegen_dummy_name(p), a, NULL, lineno, col_offset,
+ end_lineno, end_col_offset, arena);
+
+ }
+
+ asdl_expr_seq *starreds = _PyPegen_seq_extract_starred_exprs(p, b);
+ asdl_keyword_seq *keywords = _PyPegen_seq_delete_starred_exprs(p, b);
+
+ if (starreds) {
+ total_len += asdl_seq_LEN(starreds);
+ }
+
+ asdl_expr_seq *args = _Py_asdl_expr_seq_new(total_len, arena);
+
+ Py_ssize_t i = 0;
+ for (i = 0; i < args_len; i++) {
+ asdl_seq_SET(args, i, asdl_seq_GET(a, i));
+ }
+ for (; i < total_len; i++) {
+ asdl_seq_SET(args, i, asdl_seq_GET(starreds, i - args_len));
+ }
+
+ return _PyAST_Call(_PyPegen_dummy_name(p), args, keywords, lineno,
+ col_offset, end_lineno, end_col_offset, arena);
+}
+
+// AST Error reporting helpers
+
+expr_ty
+_PyPegen_get_invalid_target(expr_ty e, TARGETS_TYPE targets_type)
+{
+ if (e == NULL) {
+ return NULL;
+ }
+
+#define VISIT_CONTAINER(CONTAINER, TYPE) do { \
+ Py_ssize_t len = asdl_seq_LEN((CONTAINER)->v.TYPE.elts);\
+ for (Py_ssize_t i = 0; i < len; i++) {\
+ expr_ty other = asdl_seq_GET((CONTAINER)->v.TYPE.elts, i);\
+ expr_ty child = _PyPegen_get_invalid_target(other, targets_type);\
+ if (child != NULL) {\
+ return child;\
+ }\
+ }\
+ } while (0)
+
+ // We only need to visit List and Tuple nodes recursively as those
+ // are the only ones that can contain valid names in targets when
+ // they are parsed as expressions. Any other kind of expression
+ // that is a container (like Sets or Dicts) is directly invalid and
+ // we don't need to visit it recursively.
+
+ switch (e->kind) {
+ case List_kind:
+ VISIT_CONTAINER(e, List);
+ return NULL;
+ case Tuple_kind:
+ VISIT_CONTAINER(e, Tuple);
+ return NULL;
+ case Starred_kind:
+ if (targets_type == DEL_TARGETS) {
+ return e;
+ }
+ return _PyPegen_get_invalid_target(e->v.Starred.value, targets_type);
+ case Compare_kind:
+ // This is needed, because the `a in b` in `for a in b` gets parsed
+ // as a comparison, and so we need to search the left side of the comparison
+ // for invalid targets.
+ if (targets_type == FOR_TARGETS) {
+ cmpop_ty cmpop = (cmpop_ty) asdl_seq_GET(e->v.Compare.ops, 0);
+ if (cmpop == In) {
+ return _PyPegen_get_invalid_target(e->v.Compare.left, targets_type);
+ }
+ return NULL;
+ }
+ return e;
+ case Name_kind:
+ case Subscript_kind:
+ case Attribute_kind:
+ return NULL;
+ default:
+ return e;
+ }
+}
+
+void *_PyPegen_arguments_parsing_error(Parser *p, expr_ty e) {
+ int kwarg_unpacking = 0;
+ for (Py_ssize_t i = 0, l = asdl_seq_LEN(e->v.Call.keywords); i < l; i++) {
+ keyword_ty keyword = asdl_seq_GET(e->v.Call.keywords, i);
+ if (!keyword->arg) {
+ kwarg_unpacking = 1;
+ }
+ }
+
+ const char *msg = NULL;
+ if (kwarg_unpacking) {
+ msg = "positional argument follows keyword argument unpacking";
+ } else {
+ msg = "positional argument follows keyword argument";
+ }
+
+ return RAISE_SYNTAX_ERROR(msg);
+}
+
+void *
+_PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq *comprehensions)
+{
+ /* The rule that calls this function is 'args for_if_clauses'.
+ For the input f(L, x for x in y), L and x are in args and
+ the for is parsed as a for_if_clause. We have to check if
+ len <= 1, so that input like dict((a, b) for a, b in x)
+ gets successfully parsed and then we pass the last
+ argument (x in the above example) as the location of the
+ error */
+ Py_ssize_t len = asdl_seq_LEN(args->v.Call.args);
+ if (len <= 1) {
+ return NULL;
+ }
+
+ comprehension_ty last_comprehension = PyPegen_last_item(comprehensions, comprehension_ty);
+
+ return RAISE_SYNTAX_ERROR_KNOWN_RANGE(
+ (expr_ty) asdl_seq_GET(args->v.Call.args, len - 1),
+ _PyPegen_get_last_comprehension_item(last_comprehension),
+ "Generator expression must be parenthesized"
+ );
+}
\ No newline at end of file
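
The Parser/pegen.c hunks that follow mostly delete code that moved to the new files and redirect the remaining call sites to the relocated error helpers (_Pypegen_raise_decode_error, _Pypegen_tokenizer_error, _PyPegen_raise_tokenizer_init_error, _Pypegen_set_syntax_error). Because the hunks interleave deletions and re-additions, here is a condensed sketch of the error path in _PyPegen_run_parser after the change, using only names visible in the diff; it is a reading aid for the hunks below, not code meant to compile outside pegen.c (reset_parser_state_for_error_pass stays file-local there).

/* Condensed, illustrative view of the post-refactor error path in
 * _PyPegen_run_parser; helper names come from the diff, the body is abridged. */
void *res = _PyPegen_parse(p);                    /* first (fast) pass */
if (res == NULL) {
    if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
        return NULL;                              /* not a syntax problem */
    }
    Token *last_token = p->tokens[p->fill - 1];
    reset_parser_state_for_error_pass(p);         /* activates the invalid_* rules */
    _PyPegen_parse(p);                            /* second (diagnostic) pass */
    _Pypegen_set_syntax_error(p, last_token);     /* now lives in Parser/pegen_errors.c */
    return NULL;
}
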
diff --git a/Parser/pegen.c b/Parser/pegen.c
index b760730189073..4f51c63c44353 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -1,432 +1,21 @@
#include <Python.h>
#include "pycore_ast.h" // _PyAST_Validate(),
#include <errcode.h>
-#include "tokenizer.h"

+#include "tokenizer.h"
#include "pegen.h"
-#include "string_parser.h"
-
-PyObject *
-_PyPegen_new_type_comment(Parser *p, const char *s)
-{
- PyObject *res = PyUnicode_DecodeUTF8(s, strlen(s), NULL);
- if (res == NULL) {
- return NULL;
- }
- if (_PyArena_AddPyObject(p->arena, res) < 0) {
- Py_DECREF(res);
- return NULL;
- }
- return res;
-}
-
-arg_ty
-_PyPegen_add_type_comment_to_arg(Parser *p, arg_ty a, Token *tc)
-{
- if (tc == NULL) {
- return a;
- }
- const char *bytes = PyBytes_AsString(tc->bytes);
- if (bytes == NULL) {
- return NULL;
- }
- PyObject *tco = _PyPegen_new_type_comment(p, bytes);
- if (tco == NULL) {
- return NULL;
- }
- return _PyAST_arg(a->arg, a->annotation, tco,
- a->lineno, a->col_offset, a->end_lineno, a->end_col_offset,
- p->arena);
-}
-
-static int
-init_normalization(Parser *p)
-{
- if (p->normalize) {
- return 1;
- }
- PyObject *m = PyImport_ImportModuleNoBlock("unicodedata");
- if (!m)
- {
- return 0;
- }
- p->normalize = PyObject_GetAttrString(m, "normalize");
- Py_DECREF(m);
- if (!p->normalize)
- {
- return 0;
- }
- return 1;
-}
-
-/* Checks if the NOTEQUAL token is valid given the current parser flags
-0 indicates success and nonzero indicates failure (an exception may be set) */
-int
-_PyPegen_check_barry_as_flufl(Parser *p, Token* t) {
- assert(t->bytes != NULL);
- assert(t->type == NOTEQUAL);
-
- const char* tok_str = PyBytes_AS_STRING(t->bytes);
- if (p->flags & PyPARSE_BARRY_AS_BDFL && strcmp(tok_str, "<>") != 0) {
- RAISE_SYNTAX_ERROR("with Barry as BDFL, use '<>' instead of '!='");
- return -1;
- }
- if (!(p->flags & PyPARSE_BARRY_AS_BDFL)) {
- return strcmp(tok_str, "!=");
- }
- return 0;
-}
-
-int
-_PyPegen_check_legacy_stmt(Parser *p, expr_ty name) {
- if (name->kind != Name_kind) {
- return 0;
- }
- const char* candidates[2] = {"print", "exec"};
- for (int i=0; i<2; i++) {
- if (PyUnicode_CompareWithASCIIString(name->v.Name.id, candidates[i]) == 0) {
- return 1;
- }
- }
- return 0;
-}
-
-PyObject *
-_PyPegen_new_identifier(Parser *p, const char *n)
-{
- PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
- if (!id) {
- goto error;
- }
- /* PyUnicode_DecodeUTF8 should always return a ready string. */
- assert(PyUnicode_IS_READY(id));
- /* Check whether there are non-ASCII characters in the
- identifier; if so, normalize to NFKC. */
- if (!PyUnicode_IS_ASCII(id))
- {
- PyObject *id2;
- if (!init_normalization(p))
- {
- Py_DECREF(id);
- goto error;
- }
- PyObject *form = PyUnicode_InternFromString("NFKC");
- if (form == NULL)
- {
- Py_DECREF(id);
- goto error;
- }
- PyObject *args[2] = {form, id};
- id2 = _PyObject_FastCall(p->normalize, args, 2);
- Py_DECREF(id);
- Py_DECREF(form);
- if (!id2) {
- goto error;
- }
- if (!PyUnicode_Check(id2))
- {
- PyErr_Format(PyExc_TypeError,
- "unicodedata.normalize() must return a string, not "
- "%.200s",
- _PyType_Name(Py_TYPE(id2)));
- Py_DECREF(id2);
- goto error;
- }
- id = id2;
- }
- PyUnicode_InternInPlace(&id);
- if (_PyArena_AddPyObject(p->arena, id) < 0)
- {
- Py_DECREF(id);
- goto error;
- }
- return id;
-
-error:
- p->error_indicator = 1;
- return NULL;
-}
-
-static PyObject *
-_create_dummy_identifier(Parser *p)
-{
- return _PyPegen_new_identifier(p, "");
-}
-
-const char *
-_PyPegen_get_expr_name(expr_ty e)
-{
- assert(e != NULL);
- switch (e->kind) {
- case Attribute_kind:
- return "attribute";
- case Subscript_kind:
- return "subscript";
- case Starred_kind:
- return "starred";
- case Name_kind:
- return "name";
- case List_kind:
- return "list";
- case Tuple_kind:
- return "tuple";
- case Lambda_kind:
- return "lambda";
- case Call_kind:
- return "function call";
- case BoolOp_kind:
- case BinOp_kind:
- case UnaryOp_kind:
- return "expression";
- case GeneratorExp_kind:
- return "generator expression";
- case Yield_kind:
- case YieldFrom_kind:
- return "yield expression";
- case Await_kind:
- return "await expression";
- case ListComp_kind:
- return "list comprehension";
- case SetComp_kind:
- return "set comprehension";
- case DictComp_kind:
- return "dict comprehension";
- case Dict_kind:
- return "dict literal";
- case Set_kind:
- return "set display";
- case JoinedStr_kind:
- case FormattedValue_kind:
- return "f-string expression";
- case Constant_kind: {
- PyObject *value = e->v.Constant.value;
- if (value == Py_None) {
- return "None";
- }
- if (value == Py_False) {
- return "False";
- }
- if (value == Py_True) {
- return "True";
- }
- if (value == Py_Ellipsis) {
- return "ellipsis";
- }
- return "literal";
- }
- case Compare_kind:
- return "comparison";
- case IfExp_kind:
- return "conditional expression";
- case NamedExpr_kind:
- return "named expression";
- default:
- PyErr_Format(PyExc_SystemError,
- "unexpected expression in assignment %d (line %d)",
- e->kind, e->lineno);
- return NULL;
- }
-}
-
-static int
-raise_decode_error(Parser *p)
-{
- assert(PyErr_Occurred());
- const char *errtype = NULL;
- if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
- errtype = "unicode error";
- }
- else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
- errtype = "value error";
- }
- if (errtype) {
- PyObject *type;
- PyObject *value;
- PyObject *tback;
- PyObject *errstr;
- PyErr_Fetch(&type, &value, &tback);
- errstr = PyObject_Str(value);
- if (errstr) {
- RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
- Py_DECREF(errstr);
- }
- else {
- PyErr_Clear();
- RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
- }
- Py_XDECREF(type);
- Py_XDECREF(value);
- Py_XDECREF(tback);
- }
-
- return -1;
-}
-
-static inline void
-raise_unclosed_parentheses_error(Parser *p) {
- int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
- int error_col = p->tok->parencolstack[p->tok->level-1];
- RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
- error_lineno, error_col, error_lineno, -1,
- "'%c' was never closed",
- p->tok->parenstack[p->tok->level-1]);
-}
-
-static void
-raise_tokenizer_init_error(PyObject *filename)
-{
- if (!(PyErr_ExceptionMatches(PyExc_LookupError)
- || PyErr_ExceptionMatches(PyExc_SyntaxError)
- || PyErr_ExceptionMatches(PyExc_ValueError)
- || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
- return;
- }
- PyObject *errstr = NULL;
- PyObject *tuple = NULL;
- PyObject *type;
- PyObject *value;
- PyObject *tback;
- PyErr_Fetch(&type, &value, &tback);
- errstr = PyObject_Str(value);
- if (!errstr) {
- goto error;
- }
-
- PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
- if (!tmp) {
- goto error;
- }
-
- tuple = PyTuple_Pack(2, errstr, tmp);
- Py_DECREF(tmp);
- if (!value) {
- goto error;
- }
- PyErr_SetObject(PyExc_SyntaxError, tuple);
-
-error:
- Py_XDECREF(type);
- Py_XDECREF(value);
- Py_XDECREF(tback);
- Py_XDECREF(errstr);
- Py_XDECREF(tuple);
-}
-
-static int
-tokenizer_error(Parser *p)
-{
- if (PyErr_Occurred()) {
- return -1;
- }
-
- const char *msg = NULL;
- PyObject* errtype = PyExc_SyntaxError;
- Py_ssize_t col_offset = -1;
- switch (p->tok->done) {
- case E_TOKEN:
- msg = "invalid token";
- break;
- case E_EOF:
- if (p->tok->level) {
- raise_unclosed_parentheses_error(p);
- } else {
- RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
- }
- return -1;
- case E_DEDENT:
- RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
- return -1;
- case E_INTR:
- if (!PyErr_Occurred()) {
- PyErr_SetNone(PyExc_KeyboardInterrupt);
- }
- return -1;
- case E_NOMEM:
- PyErr_NoMemory();
- return -1;
- case E_TABSPACE:
- errtype = PyExc_TabError;
- msg = "inconsistent use of tabs and spaces in indentation";
- break;
- case E_TOODEEP:
- errtype = PyExc_IndentationError;
- msg = "too many levels of indentation";
- break;
- case E_LINECONT: {
- col_offset = p->tok->cur - p->tok->buf - 1;
- msg = "unexpected character after line continuation character";
- break;
- }
- default:
- msg = "unknown parsing error";
- }

- RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
- col_offset >= 0 ? col_offset : 0,
- p->tok->lineno, -1, msg);
- return -1;
-}
+// Internal parser functions

-void *
-_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
+asdl_stmt_seq*
+_PyPegen_interactive_exit(Parser *p)
{
- if (p->fill == 0) {
- va_list va;
- va_start(va, errmsg);
- _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
- va_end(va);
- return NULL;
- }
-
- Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
- Py_ssize_t col_offset;
- Py_ssize_t end_col_offset = -1;
- if (t->col_offset == -1) {
- if (p->tok->cur == p->tok->buf) {
- col_offset = 0;
- } else {
- const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
- col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
- }
- } else {
- col_offset = t->col_offset + 1;
- }
-
- if (t->end_col_offset != -1) {
- end_col_offset = t->end_col_offset + 1;
+ if (p->errcode) {
+ *(p->errcode) = E_EOF;
}
-
- va_list va;
- va_start(va, errmsg);
- _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
- va_end(va);
-
return NULL;
}

-static PyObject *
-get_error_line(Parser *p, Py_ssize_t lineno)
-{
- /* If the file descriptor is interactive, the source lines of the current
- * (multi-line) statement are stored in p->tok->interactive_src_start.
- * If not, we're parsing from a string, which means that the whole source
- * is stored in p->tok->str. */
- assert(p->tok->fp == NULL || p->tok->fp == stdin);
-
- char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
- assert(cur_line != NULL);
-
- for (int i = 0; i < lineno - 1; i++) {
- cur_line = strchr(cur_line, '\n') + 1;
- }
-
- char *next_newline;
- if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
- next_newline = cur_line + strlen(cur_line);
- }
- return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
-}
-
Py_ssize_t
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
{
@@ -448,127 +37,6 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
return size;
}

-void *
-_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
- Py_ssize_t lineno, Py_ssize_t col_offset,
- Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
- const char *errmsg, va_list va)
-{
- PyObject *value = NULL;
- PyObject *errstr = NULL;
- PyObject *error_line = NULL;
- PyObject *tmp = NULL;
- p->error_indicator = 1;
-
- if (end_lineno == CURRENT_POS) {
- end_lineno = p->tok->lineno;
- }
- if (end_col_offset == CURRENT_POS) {
- end_col_offset = p->tok->cur - p->tok->line_start;
- }
-
- if (p->start_rule == Py_fstring_input) {
- const char *fstring_msg = "f-string: ";
- Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
-
- char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
- if (!new_errmsg) {
- return (void *) PyErr_NoMemory();
- }
-
- // Copy both strings into new buffer
- memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
- memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
- new_errmsg[len] = 0;
- errmsg = new_errmsg;
- }
- errstr = PyUnicode_FromFormatV(errmsg, va);
- if (!errstr) {
- goto error;
- }
-
- if (p->tok->fp_interactive) {
- error_line = get_error_line(p, lineno);
- }
- else if (p->start_rule == Py_file_input) {
- error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
- (int) lineno, p->tok->encoding);
- }
-
- if (!error_line) {
- /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
- then we need to find the error line from some other source, because
- p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
- failed or we're parsing from a string or the REPL. There's a third edge case where
- we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
- `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
- does not physically exist */
- assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
-
- if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
- Py_ssize_t size = p->tok->inp - p->tok->buf;
- error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
- }
- else if (p->tok->fp == NULL || p->tok->fp == stdin) {
- error_line = get_error_line(p, lineno);
- }
- else {
- error_line = PyUnicode_FromStringAndSize("", 0);
- }
- if (!error_line) {
- goto error;
- }
- }
-
- if (p->start_rule == Py_fstring_input) {
- col_offset -= p->starting_col_offset;
- end_col_offset -= p->starting_col_offset;
- }
-
- Py_ssize_t col_number = col_offset;
- Py_ssize_t end_col_number = end_col_offset;
-
- if (p->tok->encoding != NULL) {
- col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
- if (col_number < 0) {
- goto error;
- }
- if (end_col_number > 0) {
- Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
- if (end_col_offset < 0) {
- goto error;
- } else {
- end_col_number = end_col_offset;
- }
- }
- }
- tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
- if (!tmp) {
- goto error;
- }
- value = PyTuple_Pack(2, errstr, tmp);
- Py_DECREF(tmp);
- if (!value) {
- goto error;
- }
- PyErr_SetObject(errtype, value);
-
- Py_DECREF(errstr);
- Py_DECREF(value);
- if (p->start_rule == Py_fstring_input) {
- PyMem_Free((void *)errmsg);
- }
- return NULL;
-
-error:
- Py_XDECREF(errstr);
- Py_XDECREF(error_line);
- if (p->start_rule == Py_fstring_input) {
- PyMem_Free((void *)errmsg);
- }
- return NULL;
-}
-
#if 0
static const char *
token_name(int type)
@@ -614,39 +82,24 @@ _PyPegen_update_memo(Parser *p, int mark, int type, void *node)
return _PyPegen_insert_memo(p, mark, type, node);
}

-// Return dummy NAME.
-void *
-_PyPegen_dummy_name(Parser *p, ...)
+static int
+init_normalization(Parser *p)
{
- static void *cache = NULL;
-
- if (cache != NULL) {
- return cache;
+ if (p->normalize) {
+ return 1;
}
-
- PyObject *id = _create_dummy_identifier(p);
- if (!id) {
- return NULL;
+ PyObject *m = PyImport_ImportModuleNoBlock("unicodedata");
+ if (!m)
+ {
+ return 0;
}
- cache = _PyAST_Name(id, Load, 1, 0, 1, 0, p->arena);
- return cache;
-}
-
-static int
-_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
-{
- assert(name_len > 0);
- if (name_len >= p->n_keyword_lists ||
- p->keywords[name_len] == NULL ||
- p->keywords[name_len]->type == -1) {
- return NAME;
- }
- for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
- if (strncmp(k->str, name, name_len) == 0) {
- return k->type;
- }
+ p->normalize = PyObject_GetAttrString(m, "normalize");
+ Py_DECREF(m);
+ if (!p->normalize)
+ {
+ return 0;
}
- return NAME;
+ return 1;
}

static int
@@ -685,6 +138,23 @@ growable_comment_array_deallocate(growable_comment_array *arr) {
PyMem_Free(arr->items);
}

+static int
+_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
+{
+ assert(name_len > 0);
+ if (name_len >= p->n_keyword_lists ||
+ p->keywords[name_len] == NULL ||
+ p->keywords[name_len]->type == -1) {
+ return NAME;
+ }
+ for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
+ if (strncmp(k->str, name, name_len) == 0) {
+ return k->type;
+ }
+ }
+ return NAME;
+}
+
static int
initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
assert(token != NULL);
@@ -715,10 +185,10 @@ initialize_token(Parser *p, Token *token, const char *start, const char *end, in
p->fill += 1;

if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
- return raise_decode_error(p);
+ return _Pypegen_raise_decode_error(p);
}

- return (token_type == ERRORTOKEN ? tokenizer_error(p) : 0);
+ return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
}

static int
@@ -791,7 +261,6 @@ _PyPegen_fill_token(Parser *p)
return initialize_token(p, t, start, end, type);
}

-
#if defined(Py_DEBUG)
// Instrumentation to count the effectiveness of memoization.
// The array counts the number of tokens skipped by memoization,
@@ -989,6 +458,62 @@ _PyPegen_get_last_nonnwhitespace_token(Parser *p)
return token;
}

+PyObject *
+_PyPegen_new_identifier(Parser *p, const char *n)
+{
+ PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
+ if (!id) {
+ goto error;
+ }
+ /* PyUnicode_DecodeUTF8 should always return a ready string. */
+ assert(PyUnicode_IS_READY(id));
+ /* Check whether there are non-ASCII characters in the
+ identifier; if so, normalize to NFKC. */
+ if (!PyUnicode_IS_ASCII(id))
+ {
+ PyObject *id2;
+ if (!init_normalization(p))
+ {
+ Py_DECREF(id);
+ goto error;
+ }
+ PyObject *form = PyUnicode_InternFromString("NFKC");
+ if (form == NULL)
+ {
+ Py_DECREF(id);
+ goto error;
+ }
+ PyObject *args[2] = {form, id};
+ id2 = _PyObject_FastCall(p->normalize, args, 2);
+ Py_DECREF(id);
+ Py_DECREF(form);
+ if (!id2) {
+ goto error;
+ }
+ if (!PyUnicode_Check(id2))
+ {
+ PyErr_Format(PyExc_TypeError,
+ "unicodedata.normalize() must return a string, not "
+ "%.200s",
+ _PyType_Name(Py_TYPE(id2)));
+ Py_DECREF(id2);
+ goto error;
+ }
+ id = id2;
+ }
+ PyUnicode_InternInPlace(&id);
+ if (_PyArena_AddPyObject(p->arena, id) < 0)
+ {
+ Py_DECREF(id);
+ goto error;
+ }
+ return id;
+
+error:
+ p->error_indicator = 1;
+ return NULL;
+}
+
static expr_ty
_PyPegen_name_from_token(Parser *p, Token* t)
{
@@ -1009,7 +534,6 @@ _PyPegen_name_from_token(Parser *p, Token* t)
t->end_col_offset, p->arena);
}

-
expr_ty
_PyPegen_name_token(Parser *p)
{
@@ -1023,7 +547,6 @@ _PyPegen_string_token(Parser *p)
return _PyPegen_expect_token(p, STRING);
}

-
expr_ty _PyPegen_soft_keyword_token(Parser *p) {
Token *t = _PyPegen_expect_token(p, NAME);
if (t == NULL) {
@@ -1197,18 +720,6 @@ bad_single_statement(Parser *p)
}
}

-void
-_PyPegen_Parser_Free(Parser *p)
-{
- Py_XDECREF(p->normalize);
- for (int i = 0; i < p->size; i++) {
- PyMem_Free(p->tokens[i]);
- }
- PyMem_Free(p->tokens);
- growable_comment_array_deallocate(&p->type_ignore_comments);
- PyMem_Free(p);
-}
-
static int
compute_parser_flags(PyCompilerFlags *flags)
{
@@ -1234,6 +745,8 @@ compute_parser_flags(PyCompilerFlags *flags)
return parser_flags;
}

+// Parser API
+
Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
int feature_version, int *errcode, PyArena *arena)
@@ -1289,8 +802,20 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
return p;
}

+void
+_PyPegen_Parser_Free(Parser *p)
+{
+ Py_XDECREF(p->normalize);
+ for (int i = 0; i < p->size; i++) {
+ PyMem_Free(p->tokens[i]);
+ }
+ PyMem_Free(p->tokens);
+ growable_comment_array_deallocate(&p->type_ignore_comments);
+ PyMem_Free(p);
+}
+
static void
-reset_parser_state(Parser *p)
+reset_parser_state_for_error_pass(Parser *p)
{
for (int i = 0; i < p->fill; i++) {
p->tokens[i]->memo = NULL;
@@ -1302,60 +827,6 @@ reset_parser_state(Parser *p)
p->tok->interactive_underflow = IUNDERFLOW_STOP;
}

-static int
-_PyPegen_check_tokenizer_errors(Parser *p) {
- // Tokenize the whole input to see if there are any tokenization
- // errors such as mistmatching parentheses. These will get priority
- // over generic syntax errors only if the line number of the error is
- // before the one that we had for the generic error.
-
- // We don't want to tokenize to the end for interactive input
- if (p->tok->prompt != NULL) {
- return 0;
- }
-
- PyObject *type, *value, *traceback;
- PyErr_Fetch(&type, &value, &traceback);
-
- Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
- Py_ssize_t current_err_line = current_token->lineno;
-
- int ret = 0;
-
- for (;;) {
- const char *start;
- const char *end;
- switch (_PyTokenizer_Get(p->tok, &start, &end)) {
- case ERRORTOKEN:
- if (p->tok->level != 0) {
- int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
- if (current_err_line > error_lineno) {
- raise_unclosed_parentheses_error(p);
- ret = -1;
- goto exit;
- }
- }
- break;
- case ENDMARKER:
- break;
- default:
- continue;
- }
- break;
- }
-
-
-exit:
- if (PyErr_Occurred()) {
- Py_XDECREF(value);
- Py_XDECREF(type);
- Py_XDECREF(traceback);
- } else {
- PyErr_Restore(type, value, traceback);
- }
- return ret;
-}
-
void *
_PyPegen_run_parser(Parser *p)
{
@@ -1364,46 +835,17 @@ _PyPegen_run_parser(Parser *p)
if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
return NULL;
}
+ // Make a second parser pass. In this pass we activate heavier and slower checks
+ // to produce better error messages and more complete diagnostics. Extra "invalid_*"
+ // rules will be active during parsing.
Token *last_token = p->tokens[p->fill - 1];
- reset_parser_state(p);
+ reset_parser_state_for_error_pass(p);
_PyPegen_parse(p);
- if (PyErr_Occurred()) {
- // Prioritize tokenizer errors to custom syntax errors raised
- // on the second phase only if the errors come from the parser.
- if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
- _PyPegen_check_tokenizer_errors(p);
- }
- return NULL;
- }
- if (p->fill == 0) {
- RAISE_SYNTAX_ERROR("error at start before reading any input");
- }
- else if (p->tok->done == E_EOF) {
- if (p->tok->level) {
- raise_unclosed_parentheses_error(p);
- } else {
- RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
- }
- }
- else {
- if (p->tokens[p->fill-1]->type == INDENT) {
- RAISE_INDENTATION_ERROR("unexpected indent");
- }
- else if (p->tokens[p->fill-1]->type == DEDENT) {
- RAISE_INDENTATION_ERROR("unexpected unindent");
- }
- else {
- // Use the last token we found on the first pass to avoid reporting
- // incorrect locations for generic syntax errors just because we reached
- // further away when trying to find specific syntax errors in the second
- // pass.
- RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
- // _PyPegen_check_tokenizer_errors will override the existing
- // generic SyntaxError we just raised if errors are found.
- _PyPegen_check_tokenizer_errors(p);
- }
- }
- return NULL;
+
+ // Set a SyntaxError according to the parser/tokenizer status at the failure
+ // point.
+ _Pypegen_set_syntax_error(p, last_token);
+ return NULL;
}

if (p->start_rule == Py_single_input && bad_single_statement(p)) {
@@ -1433,7 +875,7 @@ _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filena
struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
if (tok == NULL) {
if (PyErr_Occurred()) {
- raise_tokenizer_init_error(filename_ob);
+ _PyPegen_raise_tokenizer_init_error(filename_ob);
return NULL;
}
return NULL;
@@ -1478,7 +920,7 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
}
if (tok == NULL) {
if (PyErr_Occurred()) {
- raise_tokenizer_init_error(filename_ob);
+ _PyPegen_raise_tokenizer_init_error(filename_ob);
}
return NULL;
}
@@ -1504,1138 +946,4 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
error:
_PyTokenizer_Free(tok);
return result;
-}
-
-asdl_stmt_seq*
-_PyPegen_interactive_exit(Parser *p)
-{
- if (p->errcode) {
- *(p->errcode) = E_EOF;
- }
- return NULL;
-}
-
-/* Creates a single-element asdl_seq* that contains a */
-asdl_seq *
-_PyPegen_singleton_seq(Parser *p, void *a)
-{
- assert(a != NULL);
- asdl_seq *seq = (asdl_seq*)_Py_asdl_generic_seq_new(1, p->arena);
- if (!seq) {
- return NULL;
- }
- asdl_seq_SET_UNTYPED(seq, 0, a);
- return seq;
-}
-
-/* Creates a copy of seq and prepends a to it */
-asdl_seq *
-_PyPegen_seq_insert_in_front(Parser *p, void *a, asdl_seq *seq)
-{
- assert(a != NULL);
- if (!seq) {
- return _PyPegen_singleton_seq(p, a);
- }
-
- asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(asdl_seq_LEN(seq) + 1, p->arena);
- if (!new_seq) {
- return NULL;
- }
-
- asdl_seq_SET_UNTYPED(new_seq, 0, a);
- for (Py_ssize_t i = 1, l = asdl_seq_LEN(new_seq); i < l; i++) {
- asdl_seq_SET_UNTYPED(new_seq, i, asdl_seq_GET_UNTYPED(seq, i - 1));
- }
- return new_seq;
-}
-
-/* Creates a copy of seq and appends a to it */
-asdl_seq *
-_PyPegen_seq_append_to_end(Parser *p, asdl_seq *seq, void *a)
-{
- assert(a != NULL);
- if (!seq) {
- return _PyPegen_singleton_seq(p, a);
- }
-
- asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(asdl_seq_LEN(seq) + 1, p->arena);
- if (!new_seq) {
- return NULL;
- }
-
- for (Py_ssize_t i = 0, l = asdl_seq_LEN(new_seq); i + 1 < l; i++) {
- asdl_seq_SET_UNTYPED(new_seq, i, asdl_seq_GET_UNTYPED(seq, i));
- }
- asdl_seq_SET_UNTYPED(new_seq, asdl_seq_LEN(new_seq) - 1, a);
- return new_seq;
-}
-
-static Py_ssize_t
-_get_flattened_seq_size(asdl_seq *seqs)
-{
- Py_ssize_t size = 0;
- for (Py_ssize_t i = 0, l = asdl_seq_LEN(seqs); i < l; i++) {
- asdl_seq *inner_seq = asdl_seq_GET_UNTYPED(seqs, i);
- size += asdl_seq_LEN(inner_seq);
- }
- return size;
-}
-
-/* Flattens an asdl_seq* of asdl_seq*s */
-asdl_seq *
-_PyPegen_seq_flatten(Parser *p, asdl_seq *seqs)
-{
- Py_ssize_t flattened_seq_size = _get_flattened_seq_size(seqs);
- assert(flattened_seq_size > 0);
-
- asdl_seq *flattened_seq = (asdl_seq*)_Py_asdl_generic_seq_new(flattened_seq_size, p->arena);
- if (!flattened_seq) {
- return NULL;
- }
-
- int flattened_seq_idx = 0;
- for (Py_ssize_t i = 0, l = asdl_seq_LEN(seqs); i < l; i++) {
- asdl_seq *inner_seq = asdl_seq_GET_UNTYPED(seqs, i);
- for (Py_ssize_t j = 0, li = asdl_seq_LEN(inner_seq); j < li; j++) {
- asdl_seq_SET_UNTYPED(flattened_seq, flattened_seq_idx++, asdl_seq_GET_UNTYPED(inner_seq, j));
- }
- }
- assert(flattened_seq_idx == flattened_seq_size);
-
- return flattened_seq;
-}
-
-void *
-_PyPegen_seq_last_item(asdl_seq *seq)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- return asdl_seq_GET_UNTYPED(seq, len - 1);
-}
-
-void *
-_PyPegen_seq_first_item(asdl_seq *seq)
-{
- return asdl_seq_GET_UNTYPED(seq, 0);
-}
-
-
-/* Creates a new name of the form <first_name>.<second_name> */
-expr_ty
-_PyPegen_join_names_with_dot(Parser *p, expr_ty first_name, expr_ty second_name)
-{
- assert(first_name != NULL && second_name != NULL);
- PyObject *first_identifier = first_name->v.Name.id;
- PyObject *second_identifier = second_name->v.Name.id;
-
- if (PyUnicode_READY(first_identifier) == -1) {
- return NULL;
- }
- if (PyUnicode_READY(second_identifier) == -1) {
- return NULL;
- }
- const char *first_str = PyUnicode_AsUTF8(first_identifier);
- if (!first_str) {
- return NULL;
- }
- const char *second_str = PyUnicode_AsUTF8(second_identifier);
- if (!second_str) {
- return NULL;
- }
- Py_ssize_t len = strlen(first_str) + strlen(second_str) + 1; // +1 for the dot
-
- PyObject *str = PyBytes_FromStringAndSize(NULL, len);
- if (!str) {
- return NULL;
- }
-
- char *s = PyBytes_AS_STRING(str);
- if (!s) {
- return NULL;
- }
-
- strcpy(s, first_str);
- s += strlen(first_str);
- *s++ = '.';
- strcpy(s, second_str);
- s += strlen(second_str);
- *s = '\0';
-
- PyObject *uni = PyUnicode_DecodeUTF8(PyBytes_AS_STRING(str), PyBytes_GET_SIZE(str), NULL);
- Py_DECREF(str);
- if (!uni) {
- return NULL;
- }
- PyUnicode_InternInPlace(&uni);
- if (_PyArena_AddPyObject(p->arena, uni) < 0) {
- Py_DECREF(uni);
- return NULL;
- }
-
- return _PyAST_Name(uni, Load, EXTRA_EXPR(first_name, second_name));
-}
-
-/* Counts the total number of dots in seq's tokens */
-int
-_PyPegen_seq_count_dots(asdl_seq *seq)
-{
- int number_of_dots = 0;
- for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
- Token *current_expr = asdl_seq_GET_UNTYPED(seq, i);
- switch (current_expr->type) {
- case ELLIPSIS:
- number_of_dots += 3;
- break;
- case DOT:
- number_of_dots += 1;
- break;
- default:
- Py_UNREACHABLE();
- }
- }
-
- return number_of_dots;
-}
-
-/* Creates an alias with '*' as the identifier name */
-alias_ty
-_PyPegen_alias_for_star(Parser *p, int lineno, int col_offset, int end_lineno,
- int end_col_offset, PyArena *arena) {
- PyObject *str = PyUnicode_InternFromString("*");
- if (!str) {
- return NULL;
- }
- if (_PyArena_AddPyObject(p->arena, str) < 0) {
- Py_DECREF(str);
- return NULL;
- }
- return _PyAST_alias(str, NULL, lineno, col_offset, end_lineno, end_col_offset, arena);
-}
-
-/* Creates a new asdl_seq* with the identifiers of all the names in seq */
-asdl_identifier_seq *
-_PyPegen_map_names_to_ids(Parser *p, asdl_expr_seq *seq)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- assert(len > 0);
-
- asdl_identifier_seq *new_seq = _Py_asdl_identifier_seq_new(len, p->arena);
- if (!new_seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- expr_ty e = asdl_seq_GET(seq, i);
- asdl_seq_SET(new_seq, i, e->v.Name.id);
- }
- return new_seq;
-}
-
-/* Constructs a CmpopExprPair */
-CmpopExprPair *
-_PyPegen_cmpop_expr_pair(Parser *p, cmpop_ty cmpop, expr_ty expr)
-{
- assert(expr != NULL);
- CmpopExprPair *a = _PyArena_Malloc(p->arena, sizeof(CmpopExprPair));
- if (!a) {
- return NULL;
- }
- a->cmpop = cmpop;
- a->expr = expr;
- return a;
-}
-
-asdl_int_seq *
-_PyPegen_get_cmpops(Parser *p, asdl_seq *seq)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- assert(len > 0);
-
- asdl_int_seq *new_seq = _Py_asdl_int_seq_new(len, p->arena);
- if (!new_seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- CmpopExprPair *pair = asdl_seq_GET_UNTYPED(seq, i);
- asdl_seq_SET(new_seq, i, pair->cmpop);
- }
- return new_seq;
-}
-
-asdl_expr_seq *
-_PyPegen_get_exprs(Parser *p, asdl_seq *seq)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- assert(len > 0);
-
- asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
- if (!new_seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- CmpopExprPair *pair = asdl_seq_GET_UNTYPED(seq, i);
- asdl_seq_SET(new_seq, i, pair->expr);
- }
- return new_seq;
-}
-
-/* Creates an asdl_seq* where all the elements have been changed to have ctx as context */
-static asdl_expr_seq *
-_set_seq_context(Parser *p, asdl_expr_seq *seq, expr_context_ty ctx)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- if (len == 0) {
- return NULL;
- }
-
- asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
- if (!new_seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- expr_ty e = asdl_seq_GET(seq, i);
- asdl_seq_SET(new_seq, i, _PyPegen_set_expr_context(p, e, ctx));
- }
- return new_seq;
-}
-
-static expr_ty
-_set_name_context(Parser *p, expr_ty e, expr_context_ty ctx)
-{
- return _PyAST_Name(e->v.Name.id, ctx, EXTRA_EXPR(e, e));
-}
-
-static expr_ty
-_set_tuple_context(Parser *p, expr_ty e, expr_context_ty ctx)
-{
- return _PyAST_Tuple(
- _set_seq_context(p, e->v.Tuple.elts, ctx),
- ctx,
- EXTRA_EXPR(e, e));
-}
-
-static expr_ty
-_set_list_context(Parser *p, expr_ty e, expr_context_ty ctx)
-{
- return _PyAST_List(
- _set_seq_context(p, e->v.List.elts, ctx),
- ctx,
- EXTRA_EXPR(e, e));
-}
-
-static expr_ty
-_set_subscript_context(Parser *p, expr_ty e, expr_context_ty ctx)
-{
- return _PyAST_Subscript(e->v.Subscript.value, e->v.Subscript.slice,
- ctx, EXTRA_EXPR(e, e));
-}
-
-static expr_ty
-_set_attribute_context(Parser *p, expr_ty e, expr_context_ty ctx)
-{
- return _PyAST_Attribute(e->v.Attribute.value, e->v.Attribute.attr,
- ctx, EXTRA_EXPR(e, e));
-}
-
-static expr_ty
-_set_starred_context(Parser *p, expr_ty e, expr_context_ty ctx)
-{
- return _PyAST_Starred(_PyPegen_set_expr_context(p, e->v.Starred.value, ctx),
- ctx, EXTRA_EXPR(e, e));
-}
-
-/* Creates an `expr_ty` equivalent to `expr` but with `ctx` as context */
-expr_ty
-_PyPegen_set_expr_context(Parser *p, expr_ty expr, expr_context_ty ctx)
-{
- assert(expr != NULL);
-
- expr_ty new = NULL;
- switch (expr->kind) {
- case Name_kind:
- new = _set_name_context(p, expr, ctx);
- break;
- case Tuple_kind:
- new = _set_tuple_context(p, expr, ctx);
- break;
- case List_kind:
- new = _set_list_context(p, expr, ctx);
- break;
- case Subscript_kind:
- new = _set_subscript_context(p, expr, ctx);
- break;
- case Attribute_kind:
- new = _set_attribute_context(p, expr, ctx);
- break;
- case Starred_kind:
- new = _set_starred_context(p, expr, ctx);
- break;
- default:
- new = expr;
- }
- return new;
-}
-
-/* Constructs a KeyValuePair that is used when parsing a dict's key value pairs */
-KeyValuePair *
-_PyPegen_key_value_pair(Parser *p, expr_ty key, expr_ty value)
-{
- KeyValuePair *a = _PyArena_Malloc(p->arena, sizeof(KeyValuePair));
- if (!a) {
- return NULL;
- }
- a->key = key;
- a->value = value;
- return a;
-}
-
-/* Extracts all keys from an asdl_seq* of KeyValuePair*'s */
-asdl_expr_seq *
-_PyPegen_get_keys(Parser *p, asdl_seq *seq)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
- if (!new_seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- KeyValuePair *pair = asdl_seq_GET_UNTYPED(seq, i);
- asdl_seq_SET(new_seq, i, pair->key);
- }
- return new_seq;
-}
-
-/* Extracts all values from an asdl_seq* of KeyValuePair*'s */
-asdl_expr_seq *
-_PyPegen_get_values(Parser *p, asdl_seq *seq)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
- if (!new_seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- KeyValuePair *pair = asdl_seq_GET_UNTYPED(seq, i);
- asdl_seq_SET(new_seq, i, pair->value);
- }
- return new_seq;
-}
-
-/* Constructs a KeyPatternPair that is used when parsing mapping & class patterns */
-KeyPatternPair *
-_PyPegen_key_pattern_pair(Parser *p, expr_ty key, pattern_ty pattern)
-{
- KeyPatternPair *a = _PyArena_Malloc(p->arena, sizeof(KeyPatternPair));
- if (!a) {
- return NULL;
- }
- a->key = key;
- a->pattern = pattern;
- return a;
-}
-
-/* Extracts all keys from an asdl_seq* of KeyPatternPair*'s */
-asdl_expr_seq *
-_PyPegen_get_pattern_keys(Parser *p, asdl_seq *seq)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena);
- if (!new_seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- KeyPatternPair *pair = asdl_seq_GET_UNTYPED(seq, i);
- asdl_seq_SET(new_seq, i, pair->key);
- }
- return new_seq;
-}
-
-/* Extracts all patterns from an asdl_seq* of KeyPatternPair*'s */
-asdl_pattern_seq *
-_PyPegen_get_patterns(Parser *p, asdl_seq *seq)
-{
- Py_ssize_t len = asdl_seq_LEN(seq);
- asdl_pattern_seq *new_seq = _Py_asdl_pattern_seq_new(len, p->arena);
- if (!new_seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- KeyPatternPair *pair = asdl_seq_GET_UNTYPED(seq, i);
- asdl_seq_SET(new_seq, i, pair->pattern);
- }
- return new_seq;
-}
-
-/* Constructs a NameDefaultPair */
-NameDefaultPair *
-_PyPegen_name_default_pair(Parser *p, arg_ty arg, expr_ty value, Token *tc)
-{
- NameDefaultPair *a = _PyArena_Malloc(p->arena, sizeof(NameDefaultPair));
- if (!a) {
- return NULL;
- }
- a->arg = _PyPegen_add_type_comment_to_arg(p, arg, tc);
- a->value = value;
- return a;
-}
-
-/* Constructs a SlashWithDefault */
-SlashWithDefault *
-_PyPegen_slash_with_default(Parser *p, asdl_arg_seq *plain_names, asdl_seq *names_with_defaults)
-{
- SlashWithDefault *a = _PyArena_Malloc(p->arena, sizeof(SlashWithDefault));
- if (!a) {
- return NULL;
- }
- a->plain_names = plain_names;
- a->names_with_defaults = names_with_defaults;
- return a;
-}
-
-/* Constructs a StarEtc */
-StarEtc *
-_PyPegen_star_etc(Parser *p, arg_ty vararg, asdl_seq *kwonlyargs, arg_ty kwarg)
-{
- StarEtc *a = _PyArena_Malloc(p->arena, sizeof(StarEtc));
- if (!a) {
- return NULL;
- }
- a->vararg = vararg;
- a->kwonlyargs = kwonlyargs;
- a->kwarg = kwarg;
- return a;
-}
-
-asdl_seq *
-_PyPegen_join_sequences(Parser *p, asdl_seq *a, asdl_seq *b)
-{
- Py_ssize_t first_len = asdl_seq_LEN(a);
- Py_ssize_t second_len = asdl_seq_LEN(b);
- asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(first_len + second_len, p->arena);
- if (!new_seq) {
- return NULL;
- }
-
- int k = 0;
- for (Py_ssize_t i = 0; i < first_len; i++) {
- asdl_seq_SET_UNTYPED(new_seq, k++, asdl_seq_GET_UNTYPED(a, i));
- }
- for (Py_ssize_t i = 0; i < second_len; i++) {
- asdl_seq_SET_UNTYPED(new_seq, k++, asdl_seq_GET_UNTYPED(b, i));
- }
-
- return new_seq;
-}
-
-static asdl_arg_seq*
-_get_names(Parser *p, asdl_seq *names_with_defaults)
-{
- Py_ssize_t len = asdl_seq_LEN(names_with_defaults);
- asdl_arg_seq *seq = _Py_asdl_arg_seq_new(len, p->arena);
- if (!seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- NameDefaultPair *pair = asdl_seq_GET_UNTYPED(names_with_defaults, i);
- asdl_seq_SET(seq, i, pair->arg);
- }
- return seq;
-}
-
-static asdl_expr_seq *
-_get_defaults(Parser *p, asdl_seq *names_with_defaults)
-{
- Py_ssize_t len = asdl_seq_LEN(names_with_defaults);
- asdl_expr_seq *seq = _Py_asdl_expr_seq_new(len, p->arena);
- if (!seq) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len; i++) {
- NameDefaultPair *pair = asdl_seq_GET_UNTYPED(names_with_defaults, i);
- asdl_seq_SET(seq, i, pair->value);
- }
- return seq;
-}
-
-static int
-_make_posonlyargs(Parser *p,
- asdl_arg_seq *slash_without_default,
- SlashWithDefault *slash_with_default,
- asdl_arg_seq **posonlyargs) {
- if (slash_without_default != NULL) {
- *posonlyargs = slash_without_default;
- }
- else if (slash_with_default != NULL) {
- asdl_arg_seq *slash_with_default_names =
- _get_names(p, slash_with_default->names_with_defaults);
- if (!slash_with_default_names) {
- return -1;
- }
- *posonlyargs = (asdl_arg_seq*)_PyPegen_join_sequences(
- p,
- (asdl_seq*)slash_with_default->plain_names,
- (asdl_seq*)slash_with_default_names);
- }
- else {
- *posonlyargs = _Py_asdl_arg_seq_new(0, p->arena);
- }
- return *posonlyargs == NULL ? -1 : 0;
-}
-
-static int
-_make_posargs(Parser *p,
- asdl_arg_seq *plain_names,
- asdl_seq *names_with_default,
- asdl_arg_seq **posargs) {
- if (plain_names != NULL && names_with_default != NULL) {
- asdl_arg_seq *names_with_default_names = _get_names(p, names_with_default);
- if (!names_with_default_names) {
- return -1;
- }
- *posargs = (asdl_arg_seq*)_PyPegen_join_sequences(
- p,(asdl_seq*)plain_names, (asdl_seq*)names_with_default_names);
- }
- else if (plain_names == NULL && names_with_default != NULL) {
- *posargs = _get_names(p, names_with_default);
- }
- else if (plain_names != NULL && names_with_default == NULL) {
- *posargs = plain_names;
- }
- else {
- *posargs = _Py_asdl_arg_seq_new(0, p->arena);
- }
- return *posargs == NULL ? -1 : 0;
-}
-
-static int
-_make_posdefaults(Parser *p,
- SlashWithDefault *slash_with_default,
- asdl_seq *names_with_default,
- asdl_expr_seq **posdefaults) {
- if (slash_with_default != NULL && names_with_default != NULL) {
- asdl_expr_seq *slash_with_default_values =
- _get_defaults(p, slash_with_default->names_with_defaults);
- if (!slash_with_default_values) {
- return -1;
- }
- asdl_expr_seq *names_with_default_values = _get_defaults(p, names_with_default);
- if (!names_with_default_values) {
- return -1;
- }
- *posdefaults = (asdl_expr_seq*)_PyPegen_join_sequences(
- p,
- (asdl_seq*)slash_with_default_values,
- (asdl_seq*)names_with_default_values);
- }
- else if (slash_with_default == NULL && names_with_default != NULL) {
- *posdefaults = _get_defaults(p, names_with_default);
- }
- else if (slash_with_default != NULL && names_with_default == NULL) {
- *posdefaults = _get_defaults(p, slash_with_default->names_with_defaults);
- }
- else {
- *posdefaults = _Py_asdl_expr_seq_new(0, p->arena);
- }
- return *posdefaults == NULL ? -1 : 0;
-}
-
-static int
-_make_kwargs(Parser *p, StarEtc *star_etc,
- asdl_arg_seq **kwonlyargs,
- asdl_expr_seq **kwdefaults) {
- if (star_etc != NULL && star_etc->kwonlyargs != NULL) {
- *kwonlyargs = _get_names(p, star_etc->kwonlyargs);
- }
- else {
- *kwonlyargs = _Py_asdl_arg_seq_new(0, p->arena);
- }
-
- if (*kwonlyargs == NULL) {
- return -1;
- }
-
- if (star_etc != NULL && star_etc->kwonlyargs != NULL) {
- *kwdefaults = _get_defaults(p, star_etc->kwonlyargs);
- }
- else {
- *kwdefaults = _Py_asdl_expr_seq_new(0, p->arena);
- }
-
- if (*kwdefaults == NULL) {
- return -1;
- }
-
- return 0;
-}
-
-/* Constructs an arguments_ty object out of all the parsed constructs in the parameters rule */
-arguments_ty
-_PyPegen_make_arguments(Parser *p, asdl_arg_seq *slash_without_default,
- SlashWithDefault *slash_with_default, asdl_arg_seq *plain_names,
- asdl_seq *names_with_default, StarEtc *star_etc)
-{
- asdl_arg_seq *posonlyargs;
- if (_make_posonlyargs(p, slash_without_default, slash_with_default, &posonlyargs) == -1) {
- return NULL;
- }
-
- asdl_arg_seq *posargs;
- if (_make_posargs(p, plain_names, names_with_default, &posargs) == -1) {
- return NULL;
- }
-
- asdl_expr_seq *posdefaults;
- if (_make_posdefaults(p,slash_with_default, names_with_default, &posdefaults) == -1) {
- return NULL;
- }
-
- arg_ty vararg = NULL;
- if (star_etc != NULL && star_etc->vararg != NULL) {
- vararg = star_etc->vararg;
- }
-
- asdl_arg_seq *kwonlyargs;
- asdl_expr_seq *kwdefaults;
- if (_make_kwargs(p, star_etc, &kwonlyargs, &kwdefaults) == -1) {
- return NULL;
- }
-
- arg_ty kwarg = NULL;
- if (star_etc != NULL && star_etc->kwarg != NULL) {
- kwarg = star_etc->kwarg;
- }
-
- return _PyAST_arguments(posonlyargs, posargs, vararg, kwonlyargs,
- kwdefaults, kwarg, posdefaults, p->arena);
-}
-
-
-/* Constructs an empty arguments_ty object, that gets used when a function accepts no
- * arguments. */
-arguments_ty
-_PyPegen_empty_arguments(Parser *p)
-{
- asdl_arg_seq *posonlyargs = _Py_asdl_arg_seq_new(0, p->arena);
- if (!posonlyargs) {
- return NULL;
- }
- asdl_arg_seq *posargs = _Py_asdl_arg_seq_new(0, p->arena);
- if (!posargs) {
- return NULL;
- }
- asdl_expr_seq *posdefaults = _Py_asdl_expr_seq_new(0, p->arena);
- if (!posdefaults) {
- return NULL;
- }
- asdl_arg_seq *kwonlyargs = _Py_asdl_arg_seq_new(0, p->arena);
- if (!kwonlyargs) {
- return NULL;
- }
- asdl_expr_seq *kwdefaults = _Py_asdl_expr_seq_new(0, p->arena);
- if (!kwdefaults) {
- return NULL;
- }
-
- return _PyAST_arguments(posonlyargs, posargs, NULL, kwonlyargs,
- kwdefaults, NULL, posdefaults, p->arena);
-}
-
-/* Encapsulates the value of an operator_ty into an AugOperator struct */
-AugOperator *
-_PyPegen_augoperator(Parser *p, operator_ty kind)
-{
- AugOperator *a = _PyArena_Malloc(p->arena, sizeof(AugOperator));
- if (!a) {
- return NULL;
- }
- a->kind = kind;
- return a;
-}
-
-/* Construct a FunctionDef equivalent to function_def, but with decorators */
-stmt_ty
-_PyPegen_function_def_decorators(Parser *p, asdl_expr_seq *decorators, stmt_ty function_def)
-{
- assert(function_def != NULL);
- if (function_def->kind == AsyncFunctionDef_kind) {
- return _PyAST_AsyncFunctionDef(
- function_def->v.FunctionDef.name, function_def->v.FunctionDef.args,
- function_def->v.FunctionDef.body, decorators, function_def->v.FunctionDef.returns,
- function_def->v.FunctionDef.type_comment, function_def->lineno,
- function_def->col_offset, function_def->end_lineno, function_def->end_col_offset,
- p->arena);
- }
-
- return _PyAST_FunctionDef(
- function_def->v.FunctionDef.name, function_def->v.FunctionDef.args,
- function_def->v.FunctionDef.body, decorators,
- function_def->v.FunctionDef.returns,
- function_def->v.FunctionDef.type_comment, function_def->lineno,
- function_def->col_offset, function_def->end_lineno,
- function_def->end_col_offset, p->arena);
-}
-
-/* Construct a ClassDef equivalent to class_def, but with decorators */
-stmt_ty
-_PyPegen_class_def_decorators(Parser *p, asdl_expr_seq *decorators, stmt_ty class_def)
-{
- assert(class_def != NULL);
- return _PyAST_ClassDef(
- class_def->v.ClassDef.name, class_def->v.ClassDef.bases,
- class_def->v.ClassDef.keywords, class_def->v.ClassDef.body, decorators,
- class_def->lineno, class_def->col_offset, class_def->end_lineno,
- class_def->end_col_offset, p->arena);
-}
-
-/* Construct a KeywordOrStarred */
-KeywordOrStarred *
-_PyPegen_keyword_or_starred(Parser *p, void *element, int is_keyword)
-{
- KeywordOrStarred *a = _PyArena_Malloc(p->arena, sizeof(KeywordOrStarred));
- if (!a) {
- return NULL;
- }
- a->element = element;
- a->is_keyword = is_keyword;
- return a;
-}
-
-/* Get the number of starred expressions in an asdl_seq* of KeywordOrStarred*s */
-static int
-_seq_number_of_starred_exprs(asdl_seq *seq)
-{
- int n = 0;
- for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) {
- KeywordOrStarred *k = asdl_seq_GET_UNTYPED(seq, i);
- if (!k->is_keyword) {
- n++;
- }
- }
- return n;
-}
-
-/* Extract the starred expressions of an asdl_seq* of KeywordOrStarred*s */
-asdl_expr_seq *
-_PyPegen_seq_extract_starred_exprs(Parser *p, asdl_seq *kwargs)
-{
- int new_len = _seq_number_of_starred_exprs(kwargs);
- if (new_len == 0) {
- return NULL;
- }
- asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(new_len, p->arena);
- if (!new_seq) {
- return NULL;
- }
-
- int idx = 0;
- for (Py_ssize_t i = 0, len = asdl_seq_LEN(kwargs); i < len; i++) {
- KeywordOrStarred *k = asdl_seq_GET_UNTYPED(kwargs, i);
- if (!k->is_keyword) {
- asdl_seq_SET(new_seq, idx++, k->element);
- }
- }
- return new_seq;
-}
-
-/* Return a new asdl_seq* with only the keywords in kwargs */
-asdl_keyword_seq*
-_PyPegen_seq_delete_starred_exprs(Parser *p, asdl_seq *kwargs)
-{
- Py_ssize_t len = asdl_seq_LEN(kwargs);
- Py_ssize_t new_len = len - _seq_number_of_starred_exprs(kwargs);
- if (new_len == 0) {
- return NULL;
- }
- asdl_keyword_seq *new_seq = _Py_asdl_keyword_seq_new(new_len, p->arena);
- if (!new_seq) {
- return NULL;
- }
-
- int idx = 0;
- for (Py_ssize_t i = 0; i < len; i++) {
- KeywordOrStarred *k = asdl_seq_GET_UNTYPED(kwargs, i);
- if (k->is_keyword) {
- asdl_seq_SET(new_seq, idx++, k->element);
- }
- }
- return new_seq;
-}
-
-expr_ty
-_PyPegen_concatenate_strings(Parser *p, asdl_seq *strings)
-{
- Py_ssize_t len = asdl_seq_LEN(strings);
- assert(len > 0);
-
- Token *first = asdl_seq_GET_UNTYPED(strings, 0);
- Token *last = asdl_seq_GET_UNTYPED(strings, len - 1);
-
- int bytesmode = 0;
- PyObject *bytes_str = NULL;
-
- FstringParser state;
- _PyPegen_FstringParser_Init(&state);
-
- for (Py_ssize_t i = 0; i < len; i++) {
- Token *t = asdl_seq_GET_UNTYPED(strings, i);
-
- int this_bytesmode;
- int this_rawmode;
- PyObject *s;
- const char *fstr;
- Py_ssize_t fstrlen = -1;
-
- if (_PyPegen_parsestr(p, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen, t) != 0) {
- goto error;
- }
-
- /* Check that we are not mixing bytes with unicode. */
- if (i != 0 && bytesmode != this_bytesmode) {
- RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals");
- Py_XDECREF(s);
- goto error;
- }
- bytesmode = this_bytesmode;
-
- if (fstr != NULL) {
- assert(s == NULL && !bytesmode);
-
- int result = _PyPegen_FstringParser_ConcatFstring(p, &state, &fstr, fstr + fstrlen,
- this_rawmode, 0, first, t, last);
- if (result < 0) {
- goto error;
- }
- }
- else {
- /* String or byte string. */
- assert(s != NULL && fstr == NULL);
- assert(bytesmode ? PyBytes_CheckExact(s) : PyUnicode_CheckExact(s));
-
- if (bytesmode) {
- if (i == 0) {
- bytes_str = s;
- }
- else {
- PyBytes_ConcatAndDel(&bytes_str, s);
- if (!bytes_str) {
- goto error;
- }
- }
- }
- else {
- /* This is a regular string. Concatenate it. */
- if (_PyPegen_FstringParser_ConcatAndDel(&state, s) < 0) {
- goto error;
- }
- }
- }
- }
-
- if (bytesmode) {
- if (_PyArena_AddPyObject(p->arena, bytes_str) < 0) {
- goto error;
- }
- return _PyAST_Constant(bytes_str, NULL, first->lineno,
- first->col_offset, last->end_lineno,
- last->end_col_offset, p->arena);
- }
-
- return _PyPegen_FstringParser_Finish(p, &state, first, last);
-
-error:
- Py_XDECREF(bytes_str);
- _PyPegen_FstringParser_Dealloc(&state);
- if (PyErr_Occurred()) {
- raise_decode_error(p);
- }
- return NULL;
-}
-
-expr_ty
-_PyPegen_ensure_imaginary(Parser *p, expr_ty exp)
-{
- if (exp->kind != Constant_kind || !PyComplex_CheckExact(exp->v.Constant.value)) {
- RAISE_SYNTAX_ERROR_KNOWN_LOCATION(exp, "imaginary number required in complex literal");
- return NULL;
- }
- return exp;
-}
-
-expr_ty
-_PyPegen_ensure_real(Parser *p, expr_ty exp)
-{
- if (exp->kind != Constant_kind || PyComplex_CheckExact(exp->v.Constant.value)) {
- RAISE_SYNTAX_ERROR_KNOWN_LOCATION(exp, "real number required in complex literal");
- return NULL;
- }
- return exp;
-}
-
-mod_ty
-_PyPegen_make_module(Parser *p, asdl_stmt_seq *a) {
- asdl_type_ignore_seq *type_ignores = NULL;
- Py_ssize_t num = p->type_ignore_comments.num_items;
- if (num > 0) {
- // Turn the raw (comment, lineno) pairs into TypeIgnore objects in the arena
- type_ignores = _Py_asdl_type_ignore_seq_new(num, p->arena);
- if (type_ignores == NULL) {
- return NULL;
- }
- for (int i = 0; i < num; i++) {
- PyObject *tag = _PyPegen_new_type_comment(p, p->type_ignore_comments.items[i].comment);
- if (tag == NULL) {
- return NULL;
- }
- type_ignore_ty ti = _PyAST_TypeIgnore(p->type_ignore_comments.items[i].lineno,
- tag, p->arena);
- if (ti == NULL) {
- return NULL;
- }
- asdl_seq_SET(type_ignores, i, ti);
- }
- }
- return _PyAST_Module(a, type_ignores, p->arena);
-}
-
-// Error reporting helpers
-
-expr_ty
-_PyPegen_get_invalid_target(expr_ty e, TARGETS_TYPE targets_type)
-{
- if (e == NULL) {
- return NULL;
- }
-
-#define VISIT_CONTAINER(CONTAINER, TYPE) do { \
- Py_ssize_t len = asdl_seq_LEN((CONTAINER)->v.TYPE.elts);\
- for (Py_ssize_t i = 0; i < len; i++) {\
- expr_ty other = asdl_seq_GET((CONTAINER)->v.TYPE.elts, i);\
- expr_ty child = _PyPegen_get_invalid_target(other, targets_type);\
- if (child != NULL) {\
- return child;\
- }\
- }\
- } while (0)
-
- // We only need to visit List and Tuple nodes recursively as those
- // are the only ones that can contain valid names in targets when
- // they are parsed as expressions. Any other kind of expression
- // that is a container (like Sets or Dicts) is directly invalid and
- // we don't need to visit it recursively.
-
- switch (e->kind) {
- case List_kind:
- VISIT_CONTAINER(e, List);
- return NULL;
- case Tuple_kind:
- VISIT_CONTAINER(e, Tuple);
- return NULL;
- case Starred_kind:
- if (targets_type == DEL_TARGETS) {
- return e;
- }
- return _PyPegen_get_invalid_target(e->v.Starred.value, targets_type);
- case Compare_kind:
- // This is needed, because the `a in b` in `for a in b` gets parsed
- // as a comparison, and so we need to search the left side of the comparison
- // for invalid targets.
- if (targets_type == FOR_TARGETS) {
- cmpop_ty cmpop = (cmpop_ty) asdl_seq_GET(e->v.Compare.ops, 0);
- if (cmpop == In) {
- return _PyPegen_get_invalid_target(e->v.Compare.left, targets_type);
- }
- return NULL;
- }
- return e;
- case Name_kind:
- case Subscript_kind:
- case Attribute_kind:
- return NULL;
- default:
- return e;
- }
-}
-
-void *_PyPegen_arguments_parsing_error(Parser *p, expr_ty e) {
- int kwarg_unpacking = 0;
- for (Py_ssize_t i = 0, l = asdl_seq_LEN(e->v.Call.keywords); i < l; i++) {
- keyword_ty keyword = asdl_seq_GET(e->v.Call.keywords, i);
- if (!keyword->arg) {
- kwarg_unpacking = 1;
- }
- }
-
- const char *msg = NULL;
- if (kwarg_unpacking) {
- msg = "positional argument follows keyword argument unpacking";
- } else {
- msg = "positional argument follows keyword argument";
- }
-
- return RAISE_SYNTAX_ERROR(msg);
-}
-
-
-static inline expr_ty
-_PyPegen_get_last_comprehension_item(comprehension_ty comprehension) {
- if (comprehension->ifs == NULL || asdl_seq_LEN(comprehension->ifs) == 0) {
- return comprehension->iter;
- }
- return PyPegen_last_item(comprehension->ifs, expr_ty);
-}
-
-void *
-_PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq *comprehensions)
-{
- /* The rule that calls this function is 'args for_if_clauses'.
- For the input f(L, x for x in y), L and x are in args and
- the for is parsed as a for_if_clause. We have to check if
- len <= 1, so that input like dict((a, b) for a, b in x)
- gets successfully parsed and then we pass the last
- argument (x in the above example) as the location of the
- error */
- Py_ssize_t len = asdl_seq_LEN(args->v.Call.args);
- if (len <= 1) {
- return NULL;
- }
-
- comprehension_ty last_comprehension = PyPegen_last_item(comprehensions, comprehension_ty);
-
- return RAISE_SYNTAX_ERROR_KNOWN_RANGE(
- (expr_ty) asdl_seq_GET(args->v.Call.args, len - 1),
- _PyPegen_get_last_comprehension_item(last_comprehension),
- "Generator expression must be parenthesized"
- );
-}
-
-
-expr_ty _PyPegen_collect_call_seqs(Parser *p, asdl_expr_seq *a, asdl_seq *b,
- int lineno, int col_offset, int end_lineno,
- int end_col_offset, PyArena *arena) {
- Py_ssize_t args_len = asdl_seq_LEN(a);
- Py_ssize_t total_len = args_len;
-
- if (b == NULL) {
- return _PyAST_Call(_PyPegen_dummy_name(p), a, NULL, lineno, col_offset,
- end_lineno, end_col_offset, arena);
-
- }
-
- asdl_expr_seq *starreds = _PyPegen_seq_extract_starred_exprs(p, b);
- asdl_keyword_seq *keywords = _PyPegen_seq_delete_starred_exprs(p, b);
-
- if (starreds) {
- total_len += asdl_seq_LEN(starreds);
- }
-
- asdl_expr_seq *args = _Py_asdl_expr_seq_new(total_len, arena);
-
- Py_ssize_t i = 0;
- for (i = 0; i < args_len; i++) {
- asdl_seq_SET(args, i, asdl_seq_GET(a, i));
- }
- for (; i < total_len; i++) {
- asdl_seq_SET(args, i, asdl_seq_GET(starreds, i - args_len));
- }
-
- return _PyAST_Call(_PyPegen_dummy_name(p), args, keywords, lineno,
- col_offset, end_lineno, end_col_offset, arena);
-}
+}
\ No newline at end of file
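
For context, here is a minimal, hypothetical driver showing how the refactored string entry point can be exercised in-tree. It assumes access to CPython's internal headers (none of this is public API), and the parse_and_report() helper name and the "<example>" filename are illustrative only; on a failed parse, the second pass and the new _Pypegen_set_syntax_error() decide which SyntaxError/IndentationError gets reported.

    #include <Python.h>
    #include "pycore_pyarena.h"   /* _PyArena_New(), _PyArena_Free() */
    #include "pegen.h"            /* _PyPegen_run_parser_from_string() */

    /* Hypothetical sketch; requires an initialized interpreter and an
     * in-tree build so the private headers resolve. */
    static int
    parse_and_report(const char *source)
    {
        PyCompilerFlags flags = _PyCompilerFlags_INIT;
        PyArena *arena = _PyArena_New();
        if (arena == NULL) {
            return -1;
        }
        PyObject *filename = PyUnicode_FromString("<example>");
        if (filename == NULL) {
            _PyArena_Free(arena);
            return -1;
        }
        /* The first pass runs without the expensive "invalid_*" rules; if it
         * fails, the parser re-runs with them enabled and
         * _Pypegen_set_syntax_error() raises the most precise error it can. */
        mod_ty module = _PyPegen_run_parser_from_string(source, Py_file_input,
                                                        filename, &flags, arena);
        if (module == NULL) {
            PyErr_Print();        /* e.g. SyntaxError: '(' was never closed */
        }
        int ok = (module != NULL) ? 0 : -1;
        Py_DECREF(filename);
        _PyArena_Free(arena);     /* the returned AST lives in the arena */
        return ok;
    }
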
diff --git a/Parser/pegen.h b/Parser/pegen.h
index 8721d7e891005..e5e712ab26b87 100644
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -23,6 +23,8 @@
#define PyPARSE_TYPE_COMMENTS 0x0040
#define PyPARSE_ASYNC_HACKS 0x0080

+#define CURRENT_POS (-5)
+
typedef struct _memo {
int type;
void *node;
@@ -114,6 +116,7 @@ typedef struct {
int is_keyword;
} KeywordOrStarred;

+// Internal parser functions
#if defined(Py_DEBUG)
void _PyPegen_clear_memo_statistics(void);
PyObject *_PyPegen_get_memo_statistics(void);
@@ -123,7 +126,6 @@ int _PyPegen_insert_memo(Parser *p, int mark, int type, void *node);
int _PyPegen_update_memo(Parser *p, int mark, int type, void *node);
int _PyPegen_is_memoized(Parser *p, int type, void *pres);

-
int _PyPegen_lookahead_with_name(int, expr_ty (func)(Parser *), Parser *);
int _PyPegen_lookahead_with_int(int, Token *(func)(Parser *, int), Parser *, int);
int _PyPegen_lookahead_with_string(int , expr_ty (func)(Parser *, const char*), Parser *, const char*);
@@ -139,23 +141,24 @@ int _PyPegen_fill_token(Parser *p);
expr_ty _PyPegen_name_token(Parser *p);
expr_ty _PyPegen_number_token(Parser *p);
void *_PyPegen_string_token(Parser *p);
-const char *_PyPegen_get_expr_name(expr_ty);
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+
+// Error handling functions and APIs
+typedef enum {
+ STAR_TARGETS,
+ DEL_TARGETS,
+ FOR_TARGETS
+} TARGETS_TYPE;
+
+int _Pypegen_raise_decode_error(Parser *p);
+void _PyPegen_raise_tokenizer_init_error(PyObject *filename);
+int _Pypegen_tokenizer_error(Parser *p);
void *_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...);
void *_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
Py_ssize_t lineno, Py_ssize_t col_offset,
Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
const char *errmsg, va_list va);
-void *_PyPegen_dummy_name(Parser *p, ...);
-
-void * _PyPegen_seq_last_item(asdl_seq *seq);
-#define PyPegen_last_item(seq, type) ((type)_PyPegen_seq_last_item((asdl_seq*)seq))
-
-void * _PyPegen_seq_first_item(asdl_seq *seq);
-#define PyPegen_first_item(seq, type) ((type)_PyPegen_seq_first_item((asdl_seq*)seq))
-
-#define CURRENT_POS (-5)
-
+void _Pypegen_set_syntax_error(Parser* p, Token* last_token);
Py_LOCAL_INLINE(void *)
RAISE_ERROR_KNOWN_LOCATION(Parser *p, PyObject *errtype,
Py_ssize_t lineno, Py_ssize_t col_offset,
@@ -170,10 +173,6 @@ RAISE_ERROR_KNOWN_LOCATION(Parser *p, PyObject *errtype,
va_end(va);
return NULL;
}
-
-#define UNUSED(expr) do { (void)(expr); } while (0)
-#define EXTRA_EXPR(head, tail) head->lineno, (head)->col_offset, (tail)->end_lineno, (tail)->end_col_offset, p->arena
-#define EXTRA _start_lineno, _start_col_offset, _end_lineno, _end_col_offset, p->arena
#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, msg, ##__VA_ARGS__)
#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, msg, ##__VA_ARGS__)
#define RAISE_SYNTAX_ERROR_KNOWN_RANGE(a, b, msg, ...) \
@@ -182,6 +181,7 @@ RAISE_ERROR_KNOWN_LOCATION(Parser *p, PyObject *errtype,
RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, (a)->lineno, (a)->col_offset, (a)->end_lineno, (a)->end_col_offset, msg, ##__VA_ARGS__)
#define RAISE_SYNTAX_ERROR_STARTING_FROM(a, msg, ...) \
RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, (a)->lineno, (a)->col_offset, CURRENT_POS, CURRENT_POS, msg, ##__VA_ARGS__)
+#define RAISE_SYNTAX_ERROR_INVALID_TARGET(type, e) _RAISE_SYNTAX_ERROR_INVALID_TARGET(p, type, e)

Py_LOCAL_INLINE(void *)
CHECK_CALL(Parser *p, void *result)
@@ -207,6 +207,39 @@ CHECK_CALL_NULL_ALLOWED(Parser *p, void *result)
#define CHECK(type, result) ((type) CHECK_CALL(p, result))
#define CHECK_NULL_ALLOWED(type, result) ((type) CHECK_CALL_NULL_ALLOWED(p, result))

+expr_ty _PyPegen_get_invalid_target(expr_ty e, TARGETS_TYPE targets_type);
+const char *_PyPegen_get_expr_name(expr_ty);
+Py_LOCAL_INLINE(void *)
+_RAISE_SYNTAX_ERROR_INVALID_TARGET(Parser *p, TARGETS_TYPE type, void *e)
+{
+ expr_ty invalid_target = CHECK_NULL_ALLOWED(expr_ty, _PyPegen_get_invalid_target(e, type));
+ if (invalid_target != NULL) {
+ const char *msg;
+ if (type == STAR_TARGETS || type == FOR_TARGETS) {
+ msg = "cannot assign to %s";
+ }
+ else {
+ msg = "cannot delete %s";
+ }
+ return RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+ invalid_target,
+ msg,
+ _PyPegen_get_expr_name(invalid_target)
+ );
+ }
+ return RAISE_SYNTAX_ERROR("invalid syntax");
+}
+
+// Action utility functions
+
+void *_PyPegen_dummy_name(Parser *p, ...);
+void * _PyPegen_seq_last_item(asdl_seq *seq);
+#define PyPegen_last_item(seq, type) ((type)_PyPegen_seq_last_item((asdl_seq*)seq))
+void * _PyPegen_seq_first_item(asdl_seq *seq);
+#define PyPegen_first_item(seq, type) ((type)_PyPegen_seq_first_item((asdl_seq*)seq))
+#define UNUSED(expr) do { (void)(expr); } while (0)
+#define EXTRA_EXPR(head, tail) head->lineno, (head)->col_offset, (tail)->end_lineno, (tail)->end_col_offset, p->arena
+#define EXTRA _start_lineno, _start_col_offset, _end_lineno, _end_col_offset, p->arena
PyObject *_PyPegen_new_type_comment(Parser *, const char *);

Py_LOCAL_INLINE(PyObject *)
@@ -248,13 +281,6 @@ INVALID_VERSION_CHECK(Parser *p, int version, char *msg, void *node)

arg_ty _PyPegen_add_type_comment_to_arg(Parser *, arg_ty, Token *);
PyObject *_PyPegen_new_identifier(Parser *, const char *);
-Parser *_PyPegen_Parser_New(struct tok_state *, int, int, int, int *, PyArena *);
-void _PyPegen_Parser_Free(Parser *);
-mod_ty _PyPegen_run_parser_from_file_pointer(FILE *, int, PyObject *, const char *,
- const char *, const char *, PyCompilerFlags *, int *, PyArena *);
-void *_PyPegen_run_parser(Parser *);
-mod_ty _PyPegen_run_parser_from_string(const char *, int, PyObject *, PyCompilerFlags *, PyArena *);
-asdl_stmt_seq *_PyPegen_interactive_exit(Parser *);
asdl_seq *_PyPegen_singleton_seq(Parser *, void *);
asdl_seq *_PyPegen_seq_insert_in_front(Parser *, void *, asdl_seq *);
asdl_seq *_PyPegen_seq_append_to_end(Parser *, asdl_seq *, void *);
@@ -295,40 +321,18 @@ asdl_seq *_PyPegen_join_sequences(Parser *, asdl_seq *, asdl_seq *);
int _PyPegen_check_barry_as_flufl(Parser *, Token *);
int _PyPegen_check_legacy_stmt(Parser *p, expr_ty t);
mod_ty _PyPegen_make_module(Parser *, asdl_stmt_seq *);
-
-// Error reporting helpers
-typedef enum {
- STAR_TARGETS,
- DEL_TARGETS,
- FOR_TARGETS
-} TARGETS_TYPE;
-expr_ty _PyPegen_get_invalid_target(expr_ty e, TARGETS_TYPE targets_type);
-#define RAISE_SYNTAX_ERROR_INVALID_TARGET(type, e) _RAISE_SYNTAX_ERROR_INVALID_TARGET(p, type, e)
-
-Py_LOCAL_INLINE(void *)
-_RAISE_SYNTAX_ERROR_INVALID_TARGET(Parser *p, TARGETS_TYPE type, void *e)
-{
- expr_ty invalid_target = CHECK_NULL_ALLOWED(expr_ty, _PyPegen_get_invalid_target(e, type));
- if (invalid_target != NULL) {
- const char *msg;
- if (type == STAR_TARGETS || type == FOR_TARGETS) {
- msg = "cannot assign to %s";
- }
- else {
- msg = "cannot delete %s";
- }
- return RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
- invalid_target,
- msg,
- _PyPegen_get_expr_name(invalid_target)
- );
- }
- return RAISE_SYNTAX_ERROR("invalid syntax");
-}
-
void *_PyPegen_arguments_parsing_error(Parser *, expr_ty);
void *_PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq *comprehensions);

+// Parser API
+
+Parser *_PyPegen_Parser_New(struct tok_state *, int, int, int, int *, PyArena *);
+void _PyPegen_Parser_Free(Parser *);
+mod_ty _PyPegen_run_parser_from_file_pointer(FILE *, int, PyObject *, const char *,
+ const char *, const char *, PyCompilerFlags *, int *, PyArena *);
+void *_PyPegen_run_parser(Parser *);
+mod_ty _PyPegen_run_parser_from_string(const char *, int, PyObject *, PyCompilerFlags *, PyArena *);
+asdl_stmt_seq *_PyPegen_interactive_exit(Parser *);

// Generated function in parse.c - function definition in python.gram
void *_PyPegen_parse(Parser *);
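
The error-reporting macros that pegen.h now groups under "Error handling functions and APIs" are what the generated rules in parser.c invoke from their actions. Below is a hedged sketch of two hypothetical action helpers; both function names and the second error message are illustrative rather than taken from the grammar, and the macros assume a local Parser *p is in scope, as shown.

    /* Hypothetical action helpers; RAISE_SYNTAX_ERROR_INVALID_TARGET and
     * RAISE_SYNTAX_ERROR_KNOWN_LOCATION expand against the local `p`. */
    static void *
    reject_bad_del_target(Parser *p, expr_ty e)
    {
        /* Recurses through Tuple/List elements and reports "cannot delete %s"
         * at the offending node, or falls back to a generic "invalid syntax". */
        return RAISE_SYNTAX_ERROR_INVALID_TARGET(DEL_TARGETS, e);
    }

    static void *
    reject_with_known_location(Parser *p, expr_ty e)
    {
        /* Anchors the caret range on the node's stored positions instead of
         * the tokenizer's current position (message is illustrative). */
        return RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
            e, "cannot use this expression here");
    }
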
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
new file mode 100644
index 0000000000000..6eeab0a97226f
--- /dev/null
+++ b/Parser/pegen_errors.c
@@ -0,0 +1,425 @@
+#include <Python.h>
+#include <errcode.h>
+
+#include "tokenizer.h"
+#include "pegen.h"
+
+// TOKENIZER ERRORS
+
+void
+_PyPegen_raise_tokenizer_init_error(PyObject *filename)
+{
+ if (!(PyErr_ExceptionMatches(PyExc_LookupError)
+ || PyErr_ExceptionMatches(PyExc_SyntaxError)
+ || PyErr_ExceptionMatches(PyExc_ValueError)
+ || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
+ return;
+ }
+ PyObject *errstr = NULL;
+ PyObject *tuple = NULL;
+ PyObject *type;
+ PyObject *value;
+ PyObject *tback;
+ PyErr_Fetch(&type, &value, &tback);
+ errstr = PyObject_Str(value);
+ if (!errstr) {
+ goto error;
+ }
+
+ PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
+ if (!tmp) {
+ goto error;
+ }
+
+ tuple = PyTuple_Pack(2, errstr, tmp);
+ Py_DECREF(tmp);
+ if (!value) {
+ goto error;
+ }
+ PyErr_SetObject(PyExc_SyntaxError, tuple);
+
+error:
+ Py_XDECREF(type);
+ Py_XDECREF(value);
+ Py_XDECREF(tback);
+ Py_XDECREF(errstr);
+ Py_XDECREF(tuple);
+}
+
+static inline void
+raise_unclosed_parentheses_error(Parser *p) {
+ int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
+ int error_col = p->tok->parencolstack[p->tok->level-1];
+ RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
+ error_lineno, error_col, error_lineno, -1,
+ "'%c' was never closed",
+ p->tok->parenstack[p->tok->level-1]);
+}
+
+int
+_Pypegen_tokenizer_error(Parser *p)
+{
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+
+ const char *msg = NULL;
+ PyObject* errtype = PyExc_SyntaxError;
+ Py_ssize_t col_offset = -1;
+ switch (p->tok->done) {
+ case E_TOKEN:
+ msg = "invalid token";
+ break;
+ case E_EOF:
+ if (p->tok->level) {
+ raise_unclosed_parentheses_error(p);
+ } else {
+ RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+ }
+ return -1;
+ case E_DEDENT:
+ RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
+ return -1;
+ case E_INTR:
+ if (!PyErr_Occurred()) {
+ PyErr_SetNone(PyExc_KeyboardInterrupt);
+ }
+ return -1;
+ case E_NOMEM:
+ PyErr_NoMemory();
+ return -1;
+ case E_TABSPACE:
+ errtype = PyExc_TabError;
+ msg = "inconsistent use of tabs and spaces in indentation";
+ break;
+ case E_TOODEEP:
+ errtype = PyExc_IndentationError;
+ msg = "too many levels of indentation";
+ break;
+ case E_LINECONT: {
+ col_offset = p->tok->cur - p->tok->buf - 1;
+ msg = "unexpected character after line continuation character";
+ break;
+ }
+ default:
+ msg = "unknown parsing error";
+ }
+
+ RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
+ col_offset >= 0 ? col_offset : 0,
+ p->tok->lineno, -1, msg);
+ return -1;
+}
+
+int
+_Pypegen_raise_decode_error(Parser *p)
+{
+ assert(PyErr_Occurred());
+ const char *errtype = NULL;
+ if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
+ errtype = "unicode error";
+ }
+ else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
+ errtype = "value error";
+ }
+ if (errtype) {
+ PyObject *type;
+ PyObject *value;
+ PyObject *tback;
+ PyObject *errstr;
+ PyErr_Fetch(&type, &value, &tback);
+ errstr = PyObject_Str(value);
+ if (errstr) {
+ RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
+ Py_DECREF(errstr);
+ }
+ else {
+ PyErr_Clear();
+ RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
+ }
+ Py_XDECREF(type);
+ Py_XDECREF(value);
+ Py_XDECREF(tback);
+ }
+
+ return -1;
+}
+
+static int
+_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
+ // Tokenize the whole input to see if there are any tokenization
+ // errors such as mismatching parentheses. These will get priority
+ // over generic syntax errors only if the line number of the error is
+ // before the one that we had for the generic error.
+
+ // We don't want to tokenize to the end for interactive input
+ if (p->tok->prompt != NULL) {
+ return 0;
+ }
+
+ PyObject *type, *value, *traceback;
+ PyErr_Fetch(&type, &value, &traceback);
+
+ Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
+ Py_ssize_t current_err_line = current_token->lineno;
+
+ int ret = 0;
+
+ for (;;) {
+ const char *start;
+ const char *end;
+ switch (_PyTokenizer_Get(p->tok, &start, &end)) {
+ case ERRORTOKEN:
+ if (p->tok->level != 0) {
+ int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
+ if (current_err_line > error_lineno) {
+ raise_unclosed_parentheses_error(p);
+ ret = -1;
+ goto exit;
+ }
+ }
+ break;
+ case ENDMARKER:
+ break;
+ default:
+ continue;
+ }
+ break;
+ }
+
+
+exit:
+ if (PyErr_Occurred()) {
+ Py_XDECREF(value);
+ Py_XDECREF(type);
+ Py_XDECREF(traceback);
+ } else {
+ PyErr_Restore(type, value, traceback);
+ }
+ return ret;
+}
+
+// PARSER ERRORS
+
+void *
+_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
+{
+ if (p->fill == 0) {
+ va_list va;
+ va_start(va, errmsg);
+ _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
+ va_end(va);
+ return NULL;
+ }
+
+ Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
+ Py_ssize_t col_offset;
+ Py_ssize_t end_col_offset = -1;
+ if (t->col_offset == -1) {
+ if (p->tok->cur == p->tok->buf) {
+ col_offset = 0;
+ } else {
+ const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
+ col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
+ }
+ } else {
+ col_offset = t->col_offset + 1;
+ }
+
+ if (t->end_col_offset != -1) {
+ end_col_offset = t->end_col_offset + 1;
+ }
+
+ va_list va;
+ va_start(va, errmsg);
+ _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
+ va_end(va);
+
+ return NULL;
+}
+
+static PyObject *
+get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
+{
+ /* If the file descriptor is interactive, the source lines of the current
+ * (multi-line) statement are stored in p->tok->interactive_src_start.
+ * If not, we're parsing from a string, which means that the whole source
+ * is stored in p->tok->str. */
+ assert(p->tok->fp == NULL || p->tok->fp == stdin);
+
+ char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
+ assert(cur_line != NULL);
+
+ for (int i = 0; i < lineno - 1; i++) {
+ cur_line = strchr(cur_line, '\n') + 1;
+ }
+
+ char *next_newline;
+ if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
+ next_newline = cur_line + strlen(cur_line);
+ }
+ return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
+}
+
+void *
+_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
+ Py_ssize_t lineno, Py_ssize_t col_offset,
+ Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
+ const char *errmsg, va_list va)
+{
+ PyObject *value = NULL;
+ PyObject *errstr = NULL;
+ PyObject *error_line = NULL;
+ PyObject *tmp = NULL;
+ p->error_indicator = 1;
+
+ if (end_lineno == CURRENT_POS) {
+ end_lineno = p->tok->lineno;
+ }
+ if (end_col_offset == CURRENT_POS) {
+ end_col_offset = p->tok->cur - p->tok->line_start;
+ }
+
+ if (p->start_rule == Py_fstring_input) {
+ const char *fstring_msg = "f-string: ";
+ Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
+
+ char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
+ if (!new_errmsg) {
+ return (void *) PyErr_NoMemory();
+ }
+
+ // Copy both strings into new buffer
+ memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
+ memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
+ new_errmsg[len] = 0;
+ errmsg = new_errmsg;
+ }
+ errstr = PyUnicode_FromFormatV(errmsg, va);
+ if (!errstr) {
+ goto error;
+ }
+
+ if (p->tok->fp_interactive) {
+ error_line = get_error_line_from_tokenizer_buffers(p, lineno);
+ }
+ else if (p->start_rule == Py_file_input) {
+ error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
+ (int) lineno, p->tok->encoding);
+ }
+
+ if (!error_line) {
+ /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
+ then we need to find the error line from some other source, because
+ p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
+ failed or we're parsing from a string or the REPL. There's a third edge case where
+ we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
+ `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
+ does not physically exist */
+ assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
+
+ if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
+ Py_ssize_t size = p->tok->inp - p->tok->buf;
+ error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
+ }
+ else if (p->tok->fp == NULL || p->tok->fp == stdin) {
+ error_line = get_error_line_from_tokenizer_buffers(p, lineno);
+ }
+ else {
+ error_line = PyUnicode_FromStringAndSize("", 0);
+ }
+ if (!error_line) {
+ goto error;
+ }
+ }
+
+ if (p->start_rule == Py_fstring_input) {
+ col_offset -= p->starting_col_offset;
+ end_col_offset -= p->starting_col_offset;
+ }
+
+ Py_ssize_t col_number = col_offset;
+ Py_ssize_t end_col_number = end_col_offset;
+
+ if (p->tok->encoding != NULL) {
+ col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
+ if (col_number < 0) {
+ goto error;
+ }
+ if (end_col_number > 0) {
+ Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
+ if (end_col_offset < 0) {
+ goto error;
+ } else {
+ end_col_number = end_col_offset;
+ }
+ }
+ }
+ tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
+ if (!tmp) {
+ goto error;
+ }
+ value = PyTuple_Pack(2, errstr, tmp);
+ Py_DECREF(tmp);
+ if (!value) {
+ goto error;
+ }
+ PyErr_SetObject(errtype, value);
+
+ Py_DECREF(errstr);
+ Py_DECREF(value);
+ if (p->start_rule == Py_fstring_input) {
+ PyMem_Free((void *)errmsg);
+ }
+ return NULL;
+
+error:
+ Py_XDECREF(errstr);
+ Py_XDECREF(error_line);
+ if (p->start_rule == Py_fstring_input) {
+ PyMem_Free((void *)errmsg);
+ }
+ return NULL;
+}
+
+void
+_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
+ // Existing syntax error
+ if (PyErr_Occurred()) {
+ // Prioritize tokenizer errors to custom syntax errors raised
+ // on the second phase only if the errors come from the parser.
+ if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
+ _PyPegen_tokenize_full_source_to_check_for_errors(p);
+ }
+ // Propagate the existing syntax error.
+ return;
+ }
+ // Initialization error
+ if (p->fill == 0) {
+ RAISE_SYNTAX_ERROR("error at start before reading any input");
+ }
+ // Parser encountered EOF (End of File) unexpectedly
+ if (p->tok->done == E_EOF) {
+ if (p->tok->level) {
+ raise_unclosed_parentheses_error(p);
+ } else {
+ RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+ }
+ return;
+ }
+ // Indentation error in the tokenizer
+ if (last_token->type == INDENT || last_token->type == DEDENT) {
+ RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
+ return;
+ }
+ // Unknown error (generic case)
+
+ // Use the last token we found on the first pass to avoid reporting
+ // incorrect locations for generic syntax errors just because we reached
+ // further away when trying to find specific syntax errors in the second
+ // pass.
+ RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
+ // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
+ // generic SyntaxError we just raised if errors are found.
+ _PyPegen_tokenize_full_source_to_check_for_errors(p);
+}
\ No newline at end of file
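
One detail worth illustrating is the CURRENT_POS sentinel that the header change above moves next to the other error-handling declarations: when passed as an end position, _PyPegen_raise_error_known_location() substitutes the tokenizer's current line and column, so the reported range runs from a known token up to wherever the tokenizer stopped. A hypothetical helper (its name and the reuse of the "never closed" message are illustrative), assuming pegen.h is included:

    /* Hypothetical sketch of the CURRENT_POS sentinel in use. */
    static void *
    report_unterminated_group(Parser *p, Token *opening)
    {
        return RAISE_ERROR_KNOWN_LOCATION(
            p, PyExc_SyntaxError,
            opening->lineno, opening->col_offset,   /* start at the token */
            CURRENT_POS, CURRENT_POS,               /* end at the current pos */
            "'%c' was never closed", '(');
    }
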
diff --git a/Tools/peg_generator/Makefile b/Tools/peg_generator/Makefile
index 6ad9c91b985cb..d010f19d58892 100644
--- a/Tools/peg_generator/Makefile
+++ b/Tools/peg_generator/Makefile
@@ -22,7 +22,7 @@ data/xxl.py:

build: peg_extension/parse.c

-peg_extension/parse.c: $(GRAMMAR) $(TOKENS) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen.c ../../Parser/string_parser.c ../../Parser/*.h pegen/grammar_parser.py
+peg_extension/parse.c: $(GRAMMAR) $(TOKENS) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen.c ../../Parser/pegen_errors.c ../../Parser/string_parser.c ../../Parser/action_helpers.c ../../Parser/*.h pegen/grammar_parser.py
$(PYTHON) -m pegen -q c $(GRAMMAR) $(TOKENS) -o peg_extension/parse.c --compile-extension

clean:
diff --git a/Tools/peg_generator/pegen/build.py b/Tools/peg_generator/pegen/build.py
index bf01078ff0b4a..c69e5c9a5f26a 100644
--- a/Tools/peg_generator/pegen/build.py
+++ b/Tools/peg_generator/pegen/build.py
@@ -69,6 +69,8 @@ def compile_c_extension(
str(MOD_DIR.parent.parent.parent / "Python" / "asdl.c"),
str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer.c"),
str(MOD_DIR.parent.parent.parent / "Parser" / "pegen.c"),
+ str(MOD_DIR.parent.parent.parent / "Parser" / "pegen_errors.c"),
+ str(MOD_DIR.parent.parent.parent / "Parser" / "action_helpers.c"),
str(MOD_DIR.parent.parent.parent / "Parser" / "string_parser.c"),
str(MOD_DIR.parent / "peg_extension" / "peg_extension.c"),
generated_source_path,

_______________________________________________
Python-checkins mailing list
Python-checkins@python.org
https://mail.python.org/mailman/listinfo/python-checkins