diff -Nru parso-0.2.1/CHANGELOG.rst parso-0.3.1/CHANGELOG.rst --- parso-0.2.1/CHANGELOG.rst 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/CHANGELOG.rst 2018-07-09 18:53:54.000000000 +0000 @@ -3,6 +3,16 @@ Changelog --------- +0.3.1 (2018-07-09) ++++++++++++++++++++ + +- Bugfixes in the diff parser and keyword-only arguments + +0.3.0 (2018-06-30) ++++++++++++++++++++ + +- Rewrote the pgen2 parser generator. + 0.2.1 (2018-05-21) +++++++++++++++++++ diff -Nru parso-0.2.1/debian/changelog parso-0.3.1/debian/changelog --- parso-0.2.1/debian/changelog 2018-06-13 21:45:23.000000000 +0000 +++ parso-0.3.1/debian/changelog 2018-12-31 10:23:51.000000000 +0000 @@ -1,3 +1,9 @@ +parso (0.3.1-1) unstable; urgency=medium + + * New upstream release + + -- Piotr Ożarowski Mon, 31 Dec 2018 11:23:51 +0100 + parso (0.2.1-1) unstable; urgency=medium * New upstream release diff -Nru parso-0.2.1/debian/control parso-0.3.1/debian/control --- parso-0.2.1/debian/control 2017-10-25 11:06:28.000000000 +0000 +++ parso-0.3.1/debian/control 2018-06-13 21:45:23.000000000 +0000 @@ -2,7 +2,6 @@ Section: python Priority: optional Maintainer: Piotr Ożarowski -Uploaders: Debian Python Modules Team Build-Depends: debhelper (>= 10), dh-python, python-all, python-setuptools, @@ -11,10 +10,8 @@ python3-setuptools, python3-sphinx, python3-pytest, -Standards-Version: 4.1.1 +Standards-Version: 4.3.0 Homepage: https://github.com/davidhalter/parso -Vcs-Git: https://anonscm.debian.org/git/python-modules/packages/parso.git -Vcs-Browser: https://anonscm.debian.org/git/python-modules/packages/parso.git Package: python-parso Architecture: all diff -Nru parso-0.2.1/debian/patches/0001-remove-forkme-logo-to-avoid-privacy-breach.patch parso-0.3.1/debian/patches/0001-remove-forkme-logo-to-avoid-privacy-breach.patch --- parso-0.2.1/debian/patches/0001-remove-forkme-logo-to-avoid-privacy-breach.patch 2017-12-20 10:04:20.000000000 +0000 +++ parso-0.3.1/debian/patches/0001-remove-forkme-logo-to-avoid-privacy-breach.patch 2018-12-31 10:23:51.000000000 +0000 @@ -8,10 +8,10 @@ docs/conf.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) -diff --git a/docs/_themes/flask/layout.html b/docs/_themes/flask/layout.html -index 988d3de..5caa4e2 100644 ---- a/docs/_themes/flask/layout.html -+++ b/docs/_themes/flask/layout.html +Index: parso-0.3.1/docs/_themes/flask/layout.html +=================================================================== +--- parso-0.3.1.orig/docs/_themes/flask/layout.html ++++ parso-0.3.1/docs/_themes/flask/layout.html @@ -6,9 +6,6 @@ {% endif %} ' % (self.__class__.__name__, txt) class PythonGrammar(Grammar): _error_normalizer_config = ErrorFinderConfig() - _token_namespace = token - _start_symbol = 'file_input' + _token_namespace = PythonTokenTypes + _start_nonterminal = 'file_input' def __init__(self, version_info, bnf_text): super(PythonGrammar, self).__init__( diff -Nru parso-0.2.1/parso/__init__.py parso-0.3.1/parso/__init__.py --- parso-0.2.1/parso/__init__.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/__init__.py 2018-07-09 18:53:54.000000000 +0000 @@ -43,7 +43,7 @@ from parso.utils import split_lines, python_bytes_to_unicode -__version__ = '0.2.1' +__version__ = '0.3.1' def parse(code=None, **kwargs): diff -Nru parso-0.2.1/parso/parser.py parso-0.3.1/parso/parser.py --- parso-0.2.1/parso/parser.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/parser.py 2018-07-09 18:53:54.000000000 +0000 @@ -1,3 +1,11 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 
+# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright David Halter and Contributors +# Modifications are dual-licensed: MIT and PSF. +# 99% of the code is different from pgen2, now. + """ The ``Parser`` tries to convert the available Python code in an easy to read format, something like an abstract syntax tree. The classes who represent this @@ -16,7 +24,7 @@ ``Statement``, which produces ``Array`` and ``Call``). """ from parso import tree -from parso.pgen2.parse import PgenParser +from parso.pgen2.generator import ReservedString class ParserSyntaxError(Exception): @@ -30,7 +38,76 @@ self.error_leaf = error_leaf +class InternalParseError(Exception): + """ + Exception to signal the parser is stuck and error recovery didn't help. + Basically this shouldn't happen. It's a sign that something is really + wrong. + """ + + def __init__(self, msg, type_, value, start_pos): + Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" % + (msg, type_.name, value, start_pos)) + self.msg = msg + self.type = type + self.value = value + self.start_pos = start_pos + + +class Stack(list): + def _allowed_transition_names_and_token_types(self): + def iterate(): + # An API just for Jedi. + for stack_node in reversed(self): + for transition in stack_node.dfa.transitions: + if isinstance(transition, ReservedString): + yield transition.value + else: + yield transition # A token type + + if not stack_node.dfa.is_final: + break + + return list(iterate()) + + +class StackNode(object): + def __init__(self, dfa): + self.dfa = dfa + self.nodes = [] + + @property + def nonterminal(self): + return self.dfa.from_rule + + def __repr__(self): + return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes) + + +def _token_to_transition(grammar, type_, value): + # Map from token to label + if type_.contains_syntax: + # Check for reserved words (keywords) + try: + return grammar.reserved_syntax_strings[value] + except KeyError: + pass + + return type_ + + class BaseParser(object): + """Parser engine. + + A Parser instance contains state pertaining to the current token + sequence, and should not be used concurrently by different threads + to parse separate token sequences. + + See python/tokenize.py for how to get input tokens by a string. + + When a syntax error occurs, error_recovery() is called. + """ + node_map = {} default_node = tree.Node @@ -38,41 +115,94 @@ } default_leaf = tree.Leaf - def __init__(self, pgen_grammar, start_symbol='file_input', error_recovery=False): + def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False): self._pgen_grammar = pgen_grammar - self._start_symbol = start_symbol + self._start_nonterminal = start_nonterminal self._error_recovery = error_recovery def parse(self, tokens): - start_number = self._pgen_grammar.symbol2number[self._start_symbol] - self.pgen_parser = PgenParser( - self._pgen_grammar, self.convert_node, self.convert_leaf, - self.error_recovery, start_number - ) - - node = self.pgen_parser.parse(tokens) - # The stack is empty now, we don't need it anymore. - del self.pgen_parser - return node + first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0] + self.stack = Stack([StackNode(first_dfa)]) - def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix, - add_token_callback): + for token in tokens: + self._add_token(token) + + while True: + tos = self.stack[-1] + if not tos.dfa.is_final: + # We never broke out -- EOF is too soon -- Unfinished statement. 
+ # However, the error recovery might have added the token again, if + # the stack is empty, we're fine. + raise InternalParseError( + "incomplete input", token.type, token.value, token.start_pos + ) + + if len(self.stack) > 1: + self._pop() + else: + return self.convert_node(tos.nonterminal, tos.nodes) + + def error_recovery(self, token): if self._error_recovery: raise NotImplementedError("Error Recovery is not implemented") else: - error_leaf = tree.ErrorLeaf('TODO %s' % typ, value, start_pos, prefix) + type_, value, start_pos, prefix = token + error_leaf = tree.ErrorLeaf(type_, value, start_pos, prefix) raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf) - def convert_node(self, pgen_grammar, type_, children): - # TODO REMOVE symbol, we don't want type here. - symbol = pgen_grammar.number2symbol[type_] + def convert_node(self, nonterminal, children): try: - return self.node_map[symbol](children) + return self.node_map[nonterminal](children) except KeyError: - return self.default_node(symbol, children) + return self.default_node(nonterminal, children) - def convert_leaf(self, pgen_grammar, type_, value, prefix, start_pos): + def convert_leaf(self, type_, value, prefix, start_pos): try: return self.leaf_map[type_](value, start_pos, prefix) except KeyError: return self.default_leaf(value, start_pos, prefix) + + def _add_token(self, token): + """ + This is the only core function for parsing. Here happens basically + everything. Everything is well prepared by the parser generator and we + only apply the necessary steps here. + """ + grammar = self._pgen_grammar + stack = self.stack + type_, value, start_pos, prefix = token + transition = _token_to_transition(grammar, type_, value) + + while True: + try: + plan = stack[-1].dfa.transitions[transition] + break + except KeyError: + if stack[-1].dfa.is_final: + self._pop() + else: + self.error_recovery(token) + return + except IndexError: + raise InternalParseError("too much input", type_, value, start_pos) + + stack[-1].dfa = plan.next_dfa + + for push in plan.dfa_pushes: + stack.append(StackNode(push)) + + leaf = self.convert_leaf(type_, value, prefix, start_pos) + stack[-1].nodes.append(leaf) + + def _pop(self): + tos = self.stack.pop() + # If there's exactly one child, return that child instead of + # creating a new node. We still create expr_stmt and + # file_input though, because a lot of Jedi depends on its + # logic. + if len(tos.nodes) == 1: + new_node = tos.nodes[0] + else: + new_node = self.convert_node(tos.dfa.from_rule, tos.nodes) + + self.stack[-1].nodes.append(new_node) diff -Nru parso-0.2.1/parso/pgen2/generator.py parso-0.3.1/parso/pgen2/generator.py --- parso-0.2.1/parso/pgen2/generator.py 1970-01-01 00:00:00.000000000 +0000 +++ parso-0.3.1/parso/pgen2/generator.py 2018-07-09 18:53:54.000000000 +0000 @@ -0,0 +1,358 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright David Halter and Contributors +# Modifications are dual-licensed: MIT and PSF. + +""" +This module defines the data structures used to represent a grammar. + +Specifying grammars in pgen is possible with this grammar:: + + grammar: (NEWLINE | rule)* ENDMARKER + rule: NAME ':' rhs NEWLINE + rhs: items ('|' items)* + items: item+ + item: '[' rhs ']' | atom ['+' | '*'] + atom: '(' rhs ')' | NAME | STRING + +This grammar is self-referencing. + +This parser generator (pgen2) was created by Guido Rossum and used for lib2to3. 
+Most of the code has been refactored to make it more Pythonic. Since this was a +"copy" of the CPython Parser parser "pgen", there was some work needed to make +it more readable. It should also be slightly faster than the original pgen2, +because we made some optimizations. +""" + +from ast import literal_eval + +from parso.pgen2.grammar_parser import GrammarParser, NFAState + + +class Grammar(object): + """ + Once initialized, this class supplies the grammar tables for the + parsing engine implemented by parse.py. The parsing engine + accesses the instance variables directly. + + The only important part in this parsers are dfas and transitions between + dfas. + """ + + def __init__(self, start_nonterminal, rule_to_dfas, reserved_syntax_strings): + self.nonterminal_to_dfas = rule_to_dfas # Dict[str, List[DFAState]] + self.reserved_syntax_strings = reserved_syntax_strings + self.start_nonterminal = start_nonterminal + + +class DFAPlan(object): + """ + Plans are used for the parser to create stack nodes and do the proper + DFA state transitions. + """ + def __init__(self, next_dfa, dfa_pushes=[]): + self.next_dfa = next_dfa + self.dfa_pushes = dfa_pushes + + def __repr__(self): + return '%s(%s, %s)' % (self.__class__.__name__, self.next_dfa, self.dfa_pushes) + + +class DFAState(object): + """ + The DFAState object is the core class for pretty much anything. DFAState + are the vertices of an ordered graph while arcs and transitions are the + edges. + + Arcs are the initial edges, where most DFAStates are not connected and + transitions are then calculated to connect the DFA state machines that have + different nonterminals. + """ + def __init__(self, from_rule, nfa_set, final): + assert isinstance(nfa_set, set) + assert isinstance(next(iter(nfa_set)), NFAState) + assert isinstance(final, NFAState) + self.from_rule = from_rule + self.nfa_set = nfa_set + self.arcs = {} # map from terminals/nonterminals to DFAState + # In an intermediary step we set these nonterminal arcs (which has the + # same structure as arcs). These don't contain terminals anymore. + self.nonterminal_arcs = {} + + # Transitions are basically the only thing that the parser is using + # with is_final. Everyting else is purely here to create a parser. + self.transitions = {} #: Dict[Union[TokenType, ReservedString], DFAPlan] + self.is_final = final in nfa_set + + def add_arc(self, next_, label): + assert isinstance(label, str) + assert label not in self.arcs + assert isinstance(next_, DFAState) + self.arcs[label] = next_ + + def unifystate(self, old, new): + for label, next_ in self.arcs.items(): + if next_ is old: + self.arcs[label] = new + + def __eq__(self, other): + # Equality test -- ignore the nfa_set instance variable + assert isinstance(other, DFAState) + if self.is_final != other.is_final: + return False + # Can't just return self.arcs == other.arcs, because that + # would invoke this method recursively, with cycles... + if len(self.arcs) != len(other.arcs): + return False + for label, next_ in self.arcs.items(): + if next_ is not other.arcs.get(label): + return False + return True + + __hash__ = None # For Py3 compatibility. + + def __repr__(self): + return '<%s: %s is_final=%s>' % ( + self.__class__.__name__, self.from_rule, self.is_final + ) + + +class ReservedString(object): + """ + Most grammars will have certain keywords and operators that are mentioned + in the grammar as strings (e.g. "if") and not token types (e.g. NUMBER). + This class basically is the former. 
+ """ + + def __init__(self, value): + self.value = value + + def __repr__(self): + return '%s(%s)' % (self.__class__.__name__, self.value) + + +def _simplify_dfas(dfas): + """ + This is not theoretically optimal, but works well enough. + Algorithm: repeatedly look for two states that have the same + set of arcs (same labels pointing to the same nodes) and + unify them, until things stop changing. + + dfas is a list of DFAState instances + """ + changes = True + while changes: + changes = False + for i, state_i in enumerate(dfas): + for j in range(i + 1, len(dfas)): + state_j = dfas[j] + if state_i == state_j: + #print " unify", i, j + del dfas[j] + for state in dfas: + state.unifystate(state_j, state_i) + changes = True + break + + +def _make_dfas(start, finish): + """ + Uses the powerset construction algorithm to create DFA states from sets of + NFA states. + + Also does state reduction if some states are not needed. + """ + # To turn an NFA into a DFA, we define the states of the DFA + # to correspond to *sets* of states of the NFA. Then do some + # state reduction. + assert isinstance(start, NFAState) + assert isinstance(finish, NFAState) + + def addclosure(nfa_state, base_nfa_set): + assert isinstance(nfa_state, NFAState) + if nfa_state in base_nfa_set: + return + base_nfa_set.add(nfa_state) + for nfa_arc in nfa_state.arcs: + if nfa_arc.nonterminal_or_string is None: + addclosure(nfa_arc.next, base_nfa_set) + + base_nfa_set = set() + addclosure(start, base_nfa_set) + states = [DFAState(start.from_rule, base_nfa_set, finish)] + for state in states: # NB states grows while we're iterating + arcs = {} + # Find state transitions and store them in arcs. + for nfa_state in state.nfa_set: + for nfa_arc in nfa_state.arcs: + if nfa_arc.nonterminal_or_string is not None: + nfa_set = arcs.setdefault(nfa_arc.nonterminal_or_string, set()) + addclosure(nfa_arc.next, nfa_set) + + # Now create the dfa's with no None's in arcs anymore. All Nones have + # been eliminated and state transitions (arcs) are properly defined, we + # just need to create the dfa's. + for nonterminal_or_string, nfa_set in arcs.items(): + for nested_state in states: + if nested_state.nfa_set == nfa_set: + # The DFA state already exists for this rule. + break + else: + nested_state = DFAState(start.from_rule, nfa_set, finish) + states.append(nested_state) + + state.add_arc(nested_state, nonterminal_or_string) + return states # List of DFAState instances; first one is start + + +def _dump_nfa(start, finish): + print("Dump of NFA for", start.from_rule) + todo = [start] + for i, state in enumerate(todo): + print(" State", i, state is finish and "(final)" or "") + for label, next_ in state.arcs: + if next_ in todo: + j = todo.index(next_) + else: + j = len(todo) + todo.append(next_) + if label is None: + print(" -> %d" % j) + else: + print(" %s -> %d" % (label, j)) + + +def _dump_dfas(dfas): + print("Dump of DFA for", dfas[0].from_rule) + for i, state in enumerate(dfas): + print(" State", i, state.is_final and "(final)" or "") + for nonterminal, next_ in state.arcs.items(): + print(" %s -> %d" % (nonterminal, dfas.index(next_))) + + +def generate_grammar(bnf_grammar, token_namespace): + """ + ``bnf_text`` is a grammar in extended BNF (using * for repetition, + for + at-least-once repetition, [] for optional parts, | for alternatives and () + for grouping). + + It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its + own parser. 
+ """ + rule_to_dfas = {} + start_nonterminal = None + for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse(): + #_dump_nfa(a, z) + dfas = _make_dfas(nfa_a, nfa_z) + #_dump_dfas(dfas) + # oldlen = len(dfas) + _simplify_dfas(dfas) + # newlen = len(dfas) + rule_to_dfas[nfa_a.from_rule] = dfas + #print(nfa_a.from_rule, oldlen, newlen) + + if start_nonterminal is None: + start_nonterminal = nfa_a.from_rule + + reserved_strings = {} + for nonterminal, dfas in rule_to_dfas.items(): + for dfa_state in dfas: + for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items(): + if terminal_or_nonterminal in rule_to_dfas: + dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa + else: + transition = _make_transition( + token_namespace, + reserved_strings, + terminal_or_nonterminal + ) + dfa_state.transitions[transition] = DFAPlan(next_dfa) + + _calculate_tree_traversal(rule_to_dfas) + return Grammar(start_nonterminal, rule_to_dfas, reserved_strings) + + +def _make_transition(token_namespace, reserved_syntax_strings, label): + """ + Creates a reserved string ("if", "for", "*", ...) or returns the token type + (NUMBER, STRING, ...) for a given grammar terminal. + """ + if label[0].isalpha(): + # A named token (e.g. NAME, NUMBER, STRING) + return getattr(token_namespace, label) + else: + # Either a keyword or an operator + assert label[0] in ('"', "'"), label + assert not label.startswith('"""') and not label.startswith("'''") + value = literal_eval(label) + try: + return reserved_syntax_strings[value] + except KeyError: + r = reserved_syntax_strings[value] = ReservedString(value) + return r + + +def _calculate_tree_traversal(nonterminal_to_dfas): + """ + By this point we know how dfas can move around within a stack node, but we + don't know how we can add a new stack node (nonterminal transitions). + """ + # Map from grammar rule (nonterminal) name to a set of tokens. + first_plans = {} + + nonterminals = list(nonterminal_to_dfas.keys()) + nonterminals.sort() + for nonterminal in nonterminals: + if nonterminal not in first_plans: + _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal) + + # Now that we have calculated the first terminals, we are sure that + # there is no left recursion or ambiguities. + + for dfas in nonterminal_to_dfas.values(): + for dfa_state in dfas: + for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items(): + for transition, pushes in first_plans[nonterminal].items(): + dfa_state.transitions[transition] = DFAPlan(next_dfa, pushes) + + +def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal): + """ + Calculates the first plan in the first_plans dictionary for every given + nonterminal. This is going to be used to know when to create stack nodes. + """ + dfas = nonterminal_to_dfas[nonterminal] + new_first_plans = {} + first_plans[nonterminal] = None # dummy to detect left recursion + # We only need to check the first dfa. All the following ones are not + # interesting to find first terminals. + state = dfas[0] + for transition, next_ in state.transitions.items(): + # It's a string. We have finally found a possible first token. + new_first_plans[transition] = [next_.next_dfa] + + for nonterminal2, next_ in state.nonterminal_arcs.items(): + # It's a nonterminal and we have either a left recursion issue + # in the grammar or we have to recurse. 
+ try: + first_plans2 = first_plans[nonterminal2] + except KeyError: + first_plans2 = _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal2) + else: + if first_plans2 is None: + raise ValueError("left recursion for rule %r" % nonterminal) + + for t, pushes in first_plans2.items(): + check = new_first_plans.get(t) + if check is not None: + raise ValueError( + "Rule %s is ambiguous; %s is the" + " start of the rule %s as well as %s." + % (nonterminal, t, nonterminal2, check[-1].from_rule) + ) + new_first_plans[t] = [next_] + pushes + + first_plans[nonterminal] = new_first_plans + return new_first_plans diff -Nru parso-0.2.1/parso/pgen2/grammar_parser.py parso-0.3.1/parso/pgen2/grammar_parser.py --- parso-0.2.1/parso/pgen2/grammar_parser.py 1970-01-01 00:00:00.000000000 +0000 +++ parso-0.3.1/parso/pgen2/grammar_parser.py 2018-07-09 18:53:54.000000000 +0000 @@ -0,0 +1,156 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright David Halter and Contributors +# Modifications are dual-licensed: MIT and PSF. + +from parso.python.tokenize import tokenize +from parso.utils import parse_version_string +from parso.python.token import PythonTokenTypes + + +class GrammarParser(): + """ + The parser for Python grammar files. + """ + def __init__(self, bnf_grammar): + self._bnf_grammar = bnf_grammar + self.generator = tokenize( + bnf_grammar, + version_info=parse_version_string('3.6') + ) + self._gettoken() # Initialize lookahead + + def parse(self): + # grammar: (NEWLINE | rule)* ENDMARKER + while self.type != PythonTokenTypes.ENDMARKER: + while self.type == PythonTokenTypes.NEWLINE: + self._gettoken() + + # rule: NAME ':' rhs NEWLINE + self._current_rule_name = self._expect(PythonTokenTypes.NAME) + self._expect(PythonTokenTypes.OP, ':') + + a, z = self._parse_rhs() + self._expect(PythonTokenTypes.NEWLINE) + + yield a, z + + def _parse_rhs(self): + # rhs: items ('|' items)* + a, z = self._parse_items() + if self.value != "|": + return a, z + else: + aa = NFAState(self._current_rule_name) + zz = NFAState(self._current_rule_name) + while True: + # Add the possibility to go into the state of a and come back + # to finish. + aa.add_arc(a) + z.add_arc(zz) + if self.value != "|": + break + + self._gettoken() + a, z = self._parse_items() + return aa, zz + + def _parse_items(self): + # items: item+ + a, b = self._parse_item() + while self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING) \ + or self.value in ('(', '['): + c, d = self._parse_item() + # Need to end on the next item. + b.add_arc(c) + b = d + return a, b + + def _parse_item(self): + # item: '[' rhs ']' | atom ['+' | '*'] + if self.value == "[": + self._gettoken() + a, z = self._parse_rhs() + self._expect(PythonTokenTypes.OP, ']') + # Make it also possible that there is no token and change the + # state. + a.add_arc(z) + return a, z + else: + a, z = self._parse_atom() + value = self.value + if value not in ("+", "*"): + return a, z + self._gettoken() + # Make it clear that we can go back to the old state and repeat. + z.add_arc(a) + if value == "+": + return a, z + else: + # The end state is the same as the beginning, nothing must + # change. 
+ return a, a + + def _parse_atom(self): + # atom: '(' rhs ')' | NAME | STRING + if self.value == "(": + self._gettoken() + a, z = self._parse_rhs() + self._expect(PythonTokenTypes.OP, ')') + return a, z + elif self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING): + a = NFAState(self._current_rule_name) + z = NFAState(self._current_rule_name) + # Make it clear that the state transition requires that value. + a.add_arc(z, self.value) + self._gettoken() + return a, z + else: + self._raise_error("expected (...) or NAME or STRING, got %s/%s", + self.type, self.value) + + def _expect(self, type_, value=None): + if self.type != type_: + self._raise_error("expected %s, got %s [%s]", + type_, self.type, self.value) + if value is not None and self.value != value: + self._raise_error("expected %s, got %s", value, self.value) + value = self.value + self._gettoken() + return value + + def _gettoken(self): + tup = next(self.generator) + self.type, self.value, self.begin, prefix = tup + + def _raise_error(self, msg, *args): + if args: + try: + msg = msg % args + except: + msg = " ".join([msg] + list(map(str, args))) + line = self._bnf_grammar.splitlines()[self.begin[0] - 1] + raise SyntaxError(msg, ('', self.begin[0], + self.begin[1], line)) + + +class NFAArc(object): + def __init__(self, next_, nonterminal_or_string): + self.next = next_ + self.nonterminal_or_string = nonterminal_or_string + + +class NFAState(object): + def __init__(self, from_rule): + self.from_rule = from_rule + self.arcs = [] # List[nonterminal (str), NFAState] + + def add_arc(self, next_, nonterminal_or_string=None): + assert nonterminal_or_string is None or isinstance(nonterminal_or_string, str) + assert isinstance(next_, NFAState) + self.arcs.append(NFAArc(next_, nonterminal_or_string)) + + def __repr__(self): + return '<%s: from %s>' % (self.__class__.__name__, self.from_rule) diff -Nru parso-0.2.1/parso/pgen2/grammar.py parso-0.3.1/parso/pgen2/grammar.py --- parso-0.2.1/parso/pgen2/grammar.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/pgen2/grammar.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,128 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -# Modifications: -# Copyright 2014 David Halter. Integration into Jedi. -# Modifications are dual-licensed: MIT and PSF. - -"""This module defines the data structures used to represent a grammar. - -These are a bit arcane because they are derived from the data -structures used by Python's 'pgen' parser generator. - -There's also a table here mapping operators to their names in the -token module; the Python tokenize module reports all operators as the -fallback token code OP, but the parser needs the actual token code. - -""" - -try: - import cPickle as pickle -except: - import pickle - - -class Grammar(object): - """Pgen parsing tables conversion class. - - Once initialized, this class supplies the grammar tables for the - parsing engine implemented by parse.py. The parsing engine - accesses the instance variables directly. The class here does not - provide initialization of the tables; several subclasses exist to - do this (see the conv and pgen modules). - - The load() method reads the tables from a pickle file, which is - much faster than the other ways offered by subclasses. The pickle - file is written by calling dump() (after loading the grammar - tables using a subclass). The report() method prints a readable - representation of the tables to stdout, for debugging. 
- - The instance variables are as follows: - - symbol2number -- a dict mapping symbol names to numbers. Symbol - numbers are always 256 or higher, to distinguish - them from token numbers, which are between 0 and - 255 (inclusive). - - number2symbol -- a dict mapping numbers to symbol names; - these two are each other's inverse. - - states -- a list of DFAs, where each DFA is a list of - states, each state is a list of arcs, and each - arc is a (i, j) pair where i is a label and j is - a state number. The DFA number is the index into - this list. (This name is slightly confusing.) - Final states are represented by a special arc of - the form (0, j) where j is its own state number. - - dfas -- a dict mapping symbol numbers to (DFA, first) - pairs, where DFA is an item from the states list - above, and first is a set of tokens that can - begin this grammar rule (represented by a dict - whose values are always 1). - - labels -- a list of (x, y) pairs where x is either a token - number or a symbol number, and y is either None - or a string; the strings are keywords. The label - number is the index in this list; label numbers - are used to mark state transitions (arcs) in the - DFAs. - - start -- the number of the grammar's start symbol. - - keywords -- a dict mapping keyword strings to arc labels. - - tokens -- a dict mapping token numbers to arc labels. - - """ - - def __init__(self, bnf_text): - self.symbol2number = {} - self.number2symbol = {} - self.states = [] - self.dfas = {} - self.labels = [(0, "EMPTY")] - self.keywords = {} - self.tokens = {} - self.symbol2label = {} - self.label2symbol = {} - self.start = 256 - - def dump(self, filename): - """Dump the grammar tables to a pickle file.""" - with open(filename, "wb") as f: - pickle.dump(self.__dict__, f, 2) - - def load(self, filename): - """Load the grammar tables from a pickle file.""" - with open(filename, "rb") as f: - d = pickle.load(f) - self.__dict__.update(d) - - def copy(self): - """ - Copy the grammar. - """ - new = self.__class__() - for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords", - "tokens", "symbol2label"): - setattr(new, dict_attr, getattr(self, dict_attr).copy()) - new.labels = self.labels[:] - new.states = self.states[:] - new.start = self.start - return new - - def report(self): - """Dump the grammar tables to standard output, for debugging.""" - from pprint import pprint - print("s2n") - pprint(self.symbol2number) - print("n2s") - pprint(self.number2symbol) - print("states") - pprint(self.states) - print("dfas") - pprint(self.dfas) - print("labels") - pprint(self.labels) - print("start", self.start) diff -Nru parso-0.2.1/parso/pgen2/__init__.py parso-0.3.1/parso/pgen2/__init__.py --- parso-0.2.1/parso/pgen2/__init__.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/pgen2/__init__.py 2018-07-09 18:53:54.000000000 +0000 @@ -4,5 +4,7 @@ # Modifications: # Copyright 2006 Google, Inc. All Rights Reserved. # Licensed to PSF under a Contributor Agreement. -# Copyright 2014 David Halter. Integration into Jedi. +# Copyright 2014 David Halter and Contributors # Modifications are dual-licensed: MIT and PSF. + +from parso.pgen2.generator import generate_grammar diff -Nru parso-0.2.1/parso/pgen2/parse.py parso-0.3.1/parso/pgen2/parse.py --- parso-0.2.1/parso/pgen2/parse.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/pgen2/parse.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,223 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 
-# Licensed to PSF under a Contributor Agreement. - -# Modifications: -# Copyright 2014 David Halter. Integration into Jedi. -# Modifications are dual-licensed: MIT and PSF. - -""" -Parser engine for the grammar tables generated by pgen. - -The grammar table must be loaded first. - -See Parser/parser.c in the Python distribution for additional info on -how this parsing engine works. -""" - -from parso.python import tokenize - - -class InternalParseError(Exception): - """ - Exception to signal the parser is stuck and error recovery didn't help. - Basically this shouldn't happen. It's a sign that something is really - wrong. - """ - - def __init__(self, msg, type, value, start_pos): - Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" % - (msg, tokenize.tok_name[type], value, start_pos)) - self.msg = msg - self.type = type - self.value = value - self.start_pos = start_pos - - -class Stack(list): - def get_tos_nodes(self): - tos = self[-1] - return tos[2][1] - - -def token_to_ilabel(grammar, type_, value): - # Map from token to label - if type_ == tokenize.NAME: - # Check for reserved words (keywords) - try: - return grammar.keywords[value] - except KeyError: - pass - - try: - return grammar.tokens[type_] - except KeyError: - return None - - -class PgenParser(object): - """Parser engine. - - The proper usage sequence is: - - p = Parser(grammar, [converter]) # create instance - p.setup([start]) # prepare for parsing - : - if p.add_token(...): # parse a token - break - root = p.rootnode # root of abstract syntax tree - - A Parser instance may be reused by calling setup() repeatedly. - - A Parser instance contains state pertaining to the current token - sequence, and should not be used concurrently by different threads - to parse separate token sequences. - - See driver.py for how to get input tokens by tokenizing a file or - string. - - Parsing is complete when add_token() returns True; the root of the - abstract syntax tree can then be retrieved from the rootnode - instance variable. When a syntax error occurs, error_recovery() - is called. There is no error recovery; the parser cannot be used - after a syntax error was reported (but it can be reinitialized by - calling setup()). - - """ - - def __init__(self, grammar, convert_node, convert_leaf, error_recovery, start): - """Constructor. - - The grammar argument is a grammar.Grammar instance; see the - grammar module for more information. - - The parser is not ready yet for parsing; you must call the - setup() method to get it started. - - The optional convert argument is a function mapping concrete - syntax tree nodes to abstract syntax tree nodes. If not - given, no conversion is done and the syntax tree produced is - the concrete syntax tree. If given, it must be a function of - two arguments, the first being the grammar (a grammar.Grammar - instance), and the second being the concrete syntax tree node - to be converted. The syntax tree is converted from the bottom - up. - - A concrete syntax tree node is a (type, nodes) tuple, where - type is the node type (a token or symbol number) and nodes - is a list of children for symbols, and None for tokens. - - An abstract syntax tree node may be anything; this is entirely - up to the converter function. - - """ - self.grammar = grammar - self.convert_node = convert_node - self.convert_leaf = convert_leaf - - # Each stack entry is a tuple: (dfa, state, node). 
- # A node is a tuple: (type, children), - # where children is a list of nodes or None - newnode = (start, []) - stackentry = (self.grammar.dfas[start], 0, newnode) - self.stack = Stack([stackentry]) - self.rootnode = None - self.error_recovery = error_recovery - - def parse(self, tokens): - for type_, value, start_pos, prefix in tokens: - if self.add_token(type_, value, start_pos, prefix): - break - else: - # We never broke out -- EOF is too soon -- Unfinished statement. - # However, the error recovery might have added the token again, if - # the stack is empty, we're fine. - if self.stack: - raise InternalParseError("incomplete input", type_, value, start_pos) - return self.rootnode - - def add_token(self, type_, value, start_pos, prefix): - """Add a token; return True if this is the end of the program.""" - ilabel = token_to_ilabel(self.grammar, type_, value) - - # Loop until the token is shifted; may raise exceptions - _gram = self.grammar - _labels = _gram.labels - _push = self._push - _pop = self._pop - _shift = self._shift - while True: - dfa, state, node = self.stack[-1] - states, first = dfa - arcs = states[state] - # Look for a state with this label - for i, newstate in arcs: - t, v = _labels[i] - if ilabel == i: - # Look it up in the list of labels - assert t < 256 - # Shift a token; we're done with it - _shift(type_, value, newstate, prefix, start_pos) - # Pop while we are in an accept-only state - state = newstate - while states[state] == [(0, state)]: - _pop() - if not self.stack: - # Done parsing! - return True - dfa, state, node = self.stack[-1] - states, first = dfa - # Done with this token - return False - elif t >= 256: - # See if it's a symbol and if we're in its first set - itsdfa = _gram.dfas[t] - itsstates, itsfirst = itsdfa - if ilabel in itsfirst: - # Push a symbol - _push(t, itsdfa, newstate) - break # To continue the outer while loop - else: - if (0, state) in arcs: - # An accepting state, pop it and try something else - _pop() - if not self.stack: - # Done parsing, but another token is input - raise InternalParseError("too much input", type_, value, start_pos) - else: - self.error_recovery(self.grammar, self.stack, arcs, type_, - value, start_pos, prefix, self.add_token) - break - - def _shift(self, type_, value, newstate, prefix, start_pos): - """Shift a token. (Internal)""" - dfa, state, node = self.stack[-1] - newnode = self.convert_leaf(self.grammar, type_, value, prefix, start_pos) - node[-1].append(newnode) - self.stack[-1] = (dfa, newstate, node) - - def _push(self, type_, newdfa, newstate): - """Push a nonterminal. (Internal)""" - dfa, state, node = self.stack[-1] - newnode = (type_, []) - self.stack[-1] = (dfa, newstate, node) - self.stack.append((newdfa, 0, newnode)) - - def _pop(self): - """Pop a nonterminal. (Internal)""" - popdfa, popstate, (type_, children) = self.stack.pop() - # If there's exactly one child, return that child instead of creating a - # new node. We still create expr_stmt and file_input though, because a - # lot of Jedi depends on its logic. - if len(children) == 1: - newnode = children[0] - else: - newnode = self.convert_node(self.grammar, type_, children) - - try: - # Equal to: - # dfa, state, node = self.stack[-1] - # symbol, children = node - self.stack[-1][2][1].append(newnode) - except IndexError: - # Stack is empty, set the rootnode. 
- self.rootnode = newnode diff -Nru parso-0.2.1/parso/pgen2/pgen.py parso-0.3.1/parso/pgen2/pgen.py --- parso-0.2.1/parso/pgen2/pgen.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/pgen2/pgen.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,400 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -# Modifications: -# Copyright 2014 David Halter. Integration into Jedi. -# Modifications are dual-licensed: MIT and PSF. - -from parso.pgen2 import grammar -from parso.python import token -from parso.python import tokenize -from parso.utils import parse_version_string - - -class ParserGenerator(object): - def __init__(self, bnf_text, token_namespace): - self._bnf_text = bnf_text - self.generator = tokenize.tokenize( - bnf_text, - version_info=parse_version_string('3.6') - ) - self._gettoken() # Initialize lookahead - self.dfas, self.startsymbol = self._parse() - self.first = {} # map from symbol name to set of tokens - self._addfirstsets() - self._token_namespace = token_namespace - - def make_grammar(self): - c = grammar.Grammar(self._bnf_text) - names = list(self.dfas.keys()) - names.sort() - # TODO do we still need this? - names.remove(self.startsymbol) - names.insert(0, self.startsymbol) - for name in names: - i = 256 + len(c.symbol2number) - c.symbol2number[name] = i - c.number2symbol[i] = name - for name in names: - dfa = self.dfas[name] - states = [] - for state in dfa: - arcs = [] - for label, next in state.arcs.items(): - arcs.append((self._make_label(c, label), dfa.index(next))) - if state.isfinal: - arcs.append((0, dfa.index(state))) - states.append(arcs) - c.states.append(states) - c.dfas[c.symbol2number[name]] = (states, self._make_first(c, name)) - c.start = c.symbol2number[self.startsymbol] - return c - - def _make_first(self, c, name): - rawfirst = self.first[name] - first = {} - for label in rawfirst: - ilabel = self._make_label(c, label) - ##assert ilabel not in first # XXX failed on <> ... != - first[ilabel] = 1 - return first - - def _make_label(self, c, label): - # XXX Maybe this should be a method on a subclass of converter? - ilabel = len(c.labels) - if label[0].isalpha(): - # Either a symbol name or a named token - if label in c.symbol2number: - # A symbol name (a non-terminal) - if label in c.symbol2label: - return c.symbol2label[label] - else: - c.labels.append((c.symbol2number[label], None)) - c.symbol2label[label] = ilabel - c.label2symbol[ilabel] = label - return ilabel - else: - # A named token (NAME, NUMBER, STRING) - itoken = getattr(self._token_namespace, label, None) - assert isinstance(itoken, int), label - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - else: - # Either a keyword or an operator - assert label[0] in ('"', "'"), label - value = eval(label) - if value[0].isalpha(): - # A keyword - if value in c.keywords: - return c.keywords[value] - else: - # TODO this might be an issue?! Using token.NAME here? 
- c.labels.append((token.NAME, value)) - c.keywords[value] = ilabel - return ilabel - else: - # An operator (any non-numeric token) - itoken = self._token_namespace.generate_token_id(value) - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - - def _addfirstsets(self): - names = list(self.dfas.keys()) - names.sort() - for name in names: - if name not in self.first: - self._calcfirst(name) - #print name, self.first[name].keys() - - def _calcfirst(self, name): - dfa = self.dfas[name] - self.first[name] = None # dummy to detect left recursion - state = dfa[0] - totalset = {} - overlapcheck = {} - for label, next in state.arcs.items(): - if label in self.dfas: - if label in self.first: - fset = self.first[label] - if fset is None: - raise ValueError("recursion for rule %r" % name) - else: - self._calcfirst(label) - fset = self.first[label] - totalset.update(fset) - overlapcheck[label] = fset - else: - totalset[label] = 1 - overlapcheck[label] = {label: 1} - inverse = {} - for label, itsfirst in overlapcheck.items(): - for symbol in itsfirst: - if symbol in inverse: - raise ValueError("rule %s is ambiguous; %s is in the" - " first sets of %s as well as %s" % - (name, symbol, label, inverse[symbol])) - inverse[symbol] = label - self.first[name] = totalset - - def _parse(self): - dfas = {} - startsymbol = None - # MSTART: (NEWLINE | RULE)* ENDMARKER - while self.type != token.ENDMARKER: - while self.type == token.NEWLINE: - self._gettoken() - # RULE: NAME ':' RHS NEWLINE - name = self._expect(token.NAME) - self._expect(token.COLON) - a, z = self._parse_rhs() - self._expect(token.NEWLINE) - #self._dump_nfa(name, a, z) - dfa = self._make_dfa(a, z) - #self._dump_dfa(name, dfa) - # oldlen = len(dfa) - self._simplify_dfa(dfa) - # newlen = len(dfa) - dfas[name] = dfa - #print name, oldlen, newlen - if startsymbol is None: - startsymbol = name - return dfas, startsymbol - - def _make_dfa(self, start, finish): - # To turn an NFA into a DFA, we define the states of the DFA - # to correspond to *sets* of states of the NFA. Then do some - # state reduction. Let's represent sets as dicts with 1 for - # values. 
- assert isinstance(start, NFAState) - assert isinstance(finish, NFAState) - - def closure(state): - base = {} - addclosure(state, base) - return base - - def addclosure(state, base): - assert isinstance(state, NFAState) - if state in base: - return - base[state] = 1 - for label, next in state.arcs: - if label is None: - addclosure(next, base) - - states = [DFAState(closure(start), finish)] - for state in states: # NB states grows while we're iterating - arcs = {} - for nfastate in state.nfaset: - for label, next in nfastate.arcs: - if label is not None: - addclosure(next, arcs.setdefault(label, {})) - for label, nfaset in arcs.items(): - for st in states: - if st.nfaset == nfaset: - break - else: - st = DFAState(nfaset, finish) - states.append(st) - state.addarc(st, label) - return states # List of DFAState instances; first one is start - - def _dump_nfa(self, name, start, finish): - print("Dump of NFA for", name) - todo = [start] - for i, state in enumerate(todo): - print(" State", i, state is finish and "(final)" or "") - for label, next in state.arcs: - if next in todo: - j = todo.index(next) - else: - j = len(todo) - todo.append(next) - if label is None: - print(" -> %d" % j) - else: - print(" %s -> %d" % (label, j)) - - def _dump_dfa(self, name, dfa): - print("Dump of DFA for", name) - for i, state in enumerate(dfa): - print(" State", i, state.isfinal and "(final)" or "") - for label, next in state.arcs.items(): - print(" %s -> %d" % (label, dfa.index(next))) - - def _simplify_dfa(self, dfa): - # This is not theoretically optimal, but works well enough. - # Algorithm: repeatedly look for two states that have the same - # set of arcs (same labels pointing to the same nodes) and - # unify them, until things stop changing. - - # dfa is a list of DFAState instances - changes = True - while changes: - changes = False - for i, state_i in enumerate(dfa): - for j in range(i + 1, len(dfa)): - state_j = dfa[j] - if state_i == state_j: - #print " unify", i, j - del dfa[j] - for state in dfa: - state.unifystate(state_j, state_i) - changes = True - break - - def _parse_rhs(self): - # RHS: ALT ('|' ALT)* - a, z = self._parse_alt() - if self.value != "|": - return a, z - else: - aa = NFAState() - zz = NFAState() - aa.addarc(a) - z.addarc(zz) - while self.value == "|": - self._gettoken() - a, z = self._parse_alt() - aa.addarc(a) - z.addarc(zz) - return aa, zz - - def _parse_alt(self): - # ALT: ITEM+ - a, b = self._parse_item() - while (self.value in ("(", "[") or - self.type in (token.NAME, token.STRING)): - c, d = self._parse_item() - b.addarc(c) - b = d - return a, b - - def _parse_item(self): - # ITEM: '[' RHS ']' | ATOM ['+' | '*'] - if self.value == "[": - self._gettoken() - a, z = self._parse_rhs() - self._expect(token.RSQB) - a.addarc(z) - return a, z - else: - a, z = self._parse_atom() - value = self.value - if value not in ("+", "*"): - return a, z - self._gettoken() - z.addarc(a) - if value == "+": - return a, z - else: - return a, a - - def _parse_atom(self): - # ATOM: '(' RHS ')' | NAME | STRING - if self.value == "(": - self._gettoken() - a, z = self._parse_rhs() - self._expect(token.RPAR) - return a, z - elif self.type in (token.NAME, token.STRING): - a = NFAState() - z = NFAState() - a.addarc(z, self.value) - self._gettoken() - return a, z - else: - self._raise_error("expected (...) 
or NAME or STRING, got %s/%s", - self.type, self.value) - - def _expect(self, type): - if self.type != type: - self._raise_error("expected %s(%s), got %s(%s)", - type, token.tok_name[type], self.type, self.value) - value = self.value - self._gettoken() - return value - - def _gettoken(self): - tup = next(self.generator) - while tup[0] in (token.COMMENT, token.NL): - tup = next(self.generator) - self.type, self.value, self.begin, prefix = tup - - def _raise_error(self, msg, *args): - if args: - try: - msg = msg % args - except: - msg = " ".join([msg] + list(map(str, args))) - line = self._bnf_text.splitlines()[self.begin[0] - 1] - raise SyntaxError(msg, ('', self.begin[0], - self.begin[1], line)) - - -class NFAState(object): - def __init__(self): - self.arcs = [] # list of (label, NFAState) pairs - - def addarc(self, next, label=None): - assert label is None or isinstance(label, str) - assert isinstance(next, NFAState) - self.arcs.append((label, next)) - - -class DFAState(object): - def __init__(self, nfaset, final): - assert isinstance(nfaset, dict) - assert isinstance(next(iter(nfaset)), NFAState) - assert isinstance(final, NFAState) - self.nfaset = nfaset - self.isfinal = final in nfaset - self.arcs = {} # map from label to DFAState - - def addarc(self, next, label): - assert isinstance(label, str) - assert label not in self.arcs - assert isinstance(next, DFAState) - self.arcs[label] = next - - def unifystate(self, old, new): - for label, next in self.arcs.items(): - if next is old: - self.arcs[label] = new - - def __eq__(self, other): - # Equality test -- ignore the nfaset instance variable - assert isinstance(other, DFAState) - if self.isfinal != other.isfinal: - return False - # Can't just return self.arcs == other.arcs, because that - # would invoke this method recursively, with cycles... - if len(self.arcs) != len(other.arcs): - return False - for label, next in self.arcs.items(): - if next is not other.arcs.get(label): - return False - return True - - __hash__ = None # For Py3 compatibility. - - -def generate_grammar(bnf_text, token_namespace): - """ - ``bnf_text`` is a grammar in extended BNF (using * for repetition, + for - at-least-once repetition, [] for optional parts, | for alternatives and () - for grouping). - - It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its - own parser. - """ - p = ParserGenerator(bnf_text, token_namespace) - return p.make_grammar() diff -Nru parso-0.2.1/parso/python/diff.py parso-0.3.1/parso/python/diff.py --- parso-0.2.1/parso/python/diff.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/python/diff.py 2018-07-09 18:53:54.000000000 +0000 @@ -13,8 +13,8 @@ from parso.utils import split_lines from parso.python.parser import Parser from parso.python.tree import EndMarker -from parso.python.tokenize import (NEWLINE, PythonToken, ERROR_DEDENT, - ENDMARKER, INDENT, DEDENT) +from parso.python.tokenize import PythonToken +from parso.python.token import PythonTokenTypes LOG = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def _ends_with_newline(leaf, suffix=''): if leaf.type == 'error_leaf': - typ = leaf.original_type + typ = leaf.token_type.lower() else: typ = leaf.type @@ -41,21 +41,28 @@ if, while, for and try might not be finished, because another part might still be parsed. 
""" - for dfa, newstate, (symbol_number, nodes) in stack: - if pgen_grammar.number2symbol[symbol_number] in ('if_stmt', 'while_stmt', - 'for_stmt', 'try_stmt'): + for stack_node in stack: + if stack_node.nonterminal in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'): return False return True +def _func_or_class_has_suite(node): + if node.type == 'decorated': + node = node.children[-1] + if node.type in ('async_funcdef', 'async_stmt'): + node = node.children[-1] + return node.type in ('classdef', 'funcdef') and node.children[-1].type == 'suite' + + def suite_or_file_input_is_valid(pgen_grammar, stack): if not _flows_finished(pgen_grammar, stack): return False - for dfa, newstate, (symbol_number, nodes) in reversed(stack): - if pgen_grammar.number2symbol[symbol_number] == 'suite': + for stack_node in reversed(stack): + if stack_node.nonterminal == 'suite': # If only newline is in the suite, the suite is not valid, yet. - return len(nodes) > 1 + return len(stack_node.nodes) > 1 # Not reaching a suite means that we're dealing with file_input levels # where there's no need for a valid statement in it. It can also be empty. return True @@ -168,8 +175,7 @@ def _enabled_debugging(self, old_lines, lines_new): if self._module.get_code() != ''.join(lines_new): - LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), - ''.join(lines_new)) + LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), ''.join(lines_new)) def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new): copied_nodes = [None] @@ -273,7 +279,6 @@ # memoryview? parsed_until_line = self._nodes_stack.parsed_until_line lines_after = self._parser_lines_new[parsed_until_line:] - #print('parse_content', parsed_until_line, lines_after, until_line) tokens = self._diff_tokenize( lines_after, until_line, @@ -290,10 +295,10 @@ omitted_first_indent = False indents = [] tokens = self._tokenizer(lines, (1, 0)) - stack = self._active_parser.pgen_parser.stack + stack = self._active_parser.stack for typ, string, start_pos, prefix in tokens: start_pos = start_pos[0] + line_offset, start_pos[1] - if typ == INDENT: + if typ == PythonTokenTypes.INDENT: indents.append(start_pos[1]) if is_first_token: omitted_first_indent = True @@ -306,8 +311,9 @@ # In case of omitted_first_indent, it might not be dedented fully. # However this is a sign for us that a dedent happened. - if typ == DEDENT \ - or typ == ERROR_DEDENT and omitted_first_indent and len(indents) == 1: + if typ == PythonTokenTypes.DEDENT \ + or typ == PythonTokenTypes.ERROR_DEDENT \ + and omitted_first_indent and len(indents) == 1: indents.pop() if omitted_first_indent and not indents: # We are done here, only thing that can come now is an @@ -317,18 +323,22 @@ prefix = re.sub(r'(<=\n)[^\n]+$', '', prefix) else: prefix = '' - yield PythonToken(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix) + yield PythonToken( + PythonTokenTypes.ENDMARKER, '', + (start_pos[0] + line_offset, 0), + prefix + ) break - elif typ == NEWLINE and start_pos[0] >= until_line: + elif typ == PythonTokenTypes.NEWLINE and start_pos[0] >= until_line: yield PythonToken(typ, string, start_pos, prefix) # Check if the parser is actually in a valid suite state. 
if suite_or_file_input_is_valid(self._pgen_grammar, stack): start_pos = start_pos[0] + 1, 0 while len(indents) > int(omitted_first_indent): indents.pop() - yield PythonToken(DEDENT, '', start_pos, '') + yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '') - yield PythonToken(ENDMARKER, '', start_pos, '') + yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '') break else: continue @@ -509,7 +519,7 @@ # binary search. if _get_last_line(node) > until_line: # We can split up functions and classes later. - if node.type in ('classdef', 'funcdef') and node.children[-1].type == 'suite': + if _func_or_class_has_suite(node): new_nodes.append(node) break @@ -520,24 +530,26 @@ last_node = new_nodes[-1] line_offset_index = -1 - if last_node.type in ('classdef', 'funcdef'): - suite = last_node.children[-1] - if suite.type == 'suite': - suite_tos = _NodesStackNode(suite) - # Don't need to pass line_offset here, it's already done by the - # parent. - suite_nodes, recursive_tos = self._copy_nodes( - suite_tos, suite.children, until_line, line_offset) - if len(suite_nodes) < 2: - # A suite only with newline is not valid. - new_nodes.pop() - else: - suite_tos.parent = tos - new_tos = recursive_tos - line_offset_index = -2 + if _func_or_class_has_suite(last_node): + suite = last_node + while suite.type != 'suite': + suite = suite.children[-1] + + suite_tos = _NodesStackNode(suite) + # Don't need to pass line_offset here, it's already done by the + # parent. + suite_nodes, recursive_tos = self._copy_nodes( + suite_tos, suite.children, until_line, line_offset) + if len(suite_nodes) < 2: + # A suite only with newline is not valid. + new_nodes.pop() + else: + suite_tos.parent = tos + new_tos = recursive_tos + line_offset_index = -2 - elif (new_nodes[-1].type in ('error_leaf', 'error_node') or - _is_flow_node(new_nodes[-1])): + elif (last_node.type in ('error_leaf', 'error_node') or + _is_flow_node(new_nodes[-1])): # Error leafs/nodes don't have a defined start/end. Error # nodes might not end with a newline (e.g. if there's an # open `(`). Therefore ignore all of them unless they are @@ -568,7 +580,7 @@ self._tos = _NodesStackNode(tree_node, self._tos) self._tos.add(list(tree_node.children)) self._update_tos(tree_node.children[-1]) - elif tree_node.type in ('classdef', 'funcdef'): + elif _func_or_class_has_suite(tree_node): self._update_tos(tree_node.children[-1]) def close(self): diff -Nru parso-0.2.1/parso/python/errors.py parso-0.3.1/parso/python/errors.py --- parso-0.2.1/parso/python/errors.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/python/errors.py 2018-07-09 18:53:54.000000000 +0000 @@ -306,12 +306,12 @@ def visit_leaf(self, leaf): if leaf.type == 'error_leaf': - if leaf.original_type in ('indent', 'error_dedent'): + if leaf.token_type in ('INDENT', 'ERROR_DEDENT'): # Indents/Dedents itself never have a prefix. They are just # "pseudo" tokens that get removed by the syntax tree later. # Therefore in case of an error we also have to check for this. 
spacing = list(leaf.get_next_leaf()._split_prefix())[-1] - if leaf.original_type == 'indent': + if leaf.token_type == 'INDENT': message = 'unexpected indent' else: message = 'unindent does not match any outer indentation level' diff -Nru parso-0.2.1/parso/python/parser.py parso-0.3.1/parso/python/parser.py --- parso-0.2.1/parso/python/parser.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/python/parser.py 2018-07-09 18:53:54.000000000 +0000 @@ -1,9 +1,11 @@ from parso.python import tree -from parso.python.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER, - STRING, tok_name, NAME, FSTRING_STRING, - FSTRING_START, FSTRING_END) +from parso.python.token import PythonTokenTypes from parso.parser import BaseParser -from parso.pgen2.parse import token_to_ilabel + + +NAME = PythonTokenTypes.NAME +INDENT = PythonTokenTypes.INDENT +DEDENT = PythonTokenTypes.DEDENT class Parser(BaseParser): @@ -53,53 +55,33 @@ # Names/Keywords are handled separately _leaf_map = { - STRING: tree.String, - NUMBER: tree.Number, - NEWLINE: tree.Newline, - ENDMARKER: tree.EndMarker, - FSTRING_STRING: tree.FStringString, - FSTRING_START: tree.FStringStart, - FSTRING_END: tree.FStringEnd, + PythonTokenTypes.STRING: tree.String, + PythonTokenTypes.NUMBER: tree.Number, + PythonTokenTypes.NEWLINE: tree.Newline, + PythonTokenTypes.ENDMARKER: tree.EndMarker, + PythonTokenTypes.FSTRING_STRING: tree.FStringString, + PythonTokenTypes.FSTRING_START: tree.FStringStart, + PythonTokenTypes.FSTRING_END: tree.FStringEnd, } - def __init__(self, pgen_grammar, error_recovery=True, start_symbol='file_input'): - super(Parser, self).__init__(pgen_grammar, start_symbol, error_recovery=error_recovery) + def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'): + super(Parser, self).__init__(pgen_grammar, start_nonterminal, + error_recovery=error_recovery) self.syntax_errors = [] self._omit_dedent_list = [] self._indent_counter = 0 - # TODO do print absolute import detection here. - # try: - # del python_grammar_no_print_statement.keywords["print"] - # except KeyError: - # pass # Doesn't exist in the Python 3 grammar. - - # if self.options["print_function"]: - # python_grammar = pygram.python_grammar_no_print_statement - # else: - def parse(self, tokens): if self._error_recovery: - if self._start_symbol != 'file_input': + if self._start_nonterminal != 'file_input': raise NotImplementedError tokens = self._recovery_tokenize(tokens) - node = super(Parser, self).parse(tokens) - - if self._start_symbol == 'file_input' != node.type: - # If there's only one statement, we get back a non-module. That's - # not what we want, we want a module, so we add it here: - node = self.convert_node( - self._pgen_grammar, - self._pgen_grammar.symbol2number['file_input'], - [node] - ) + return super(Parser, self).parse(tokens) - return node - - def convert_node(self, pgen_grammar, type, children): + def convert_node(self, nonterminal, children): """ Convert raw node information to a PythonBaseNode instance. @@ -107,149 +89,114 @@ grammar rule produces a new complete node, so that the tree is build strictly bottom-up. """ - # TODO REMOVE symbol, we don't want type here. - symbol = pgen_grammar.number2symbol[type] try: - return self.node_map[symbol](children) + return self.node_map[nonterminal](children) except KeyError: - if symbol == 'suite': + if nonterminal == 'suite': # We don't want the INDENT/DEDENT in our parser tree. Those # leaves are just cancer. 
They are virtual leaves and not real # ones and therefore have pseudo start/end positions and no # prefixes. Just ignore them. children = [children[0]] + children[2:-1] - elif symbol == 'list_if': + elif nonterminal == 'list_if': # Make transitioning from 2 to 3 easier. - symbol = 'comp_if' - elif symbol == 'listmaker': + nonterminal = 'comp_if' + elif nonterminal == 'listmaker': # Same as list_if above. - symbol = 'testlist_comp' - return self.default_node(symbol, children) + nonterminal = 'testlist_comp' + return self.default_node(nonterminal, children) - def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos): + def convert_leaf(self, type, value, prefix, start_pos): # print('leaf', repr(value), token.tok_name[type]) if type == NAME: - if value in pgen_grammar.keywords: + if value in self._pgen_grammar.reserved_syntax_strings: return tree.Keyword(value, start_pos, prefix) else: return tree.Name(value, start_pos, prefix) return self._leaf_map.get(type, tree.Operator)(value, start_pos, prefix) - def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix, - add_token_callback): - def get_symbol_and_nodes(stack): - for dfa, state, (type_, nodes) in stack: - symbol = pgen_grammar.number2symbol[type_] - yield symbol, nodes - - tos_nodes = stack.get_tos_nodes() + def error_recovery(self, token): + tos_nodes = self.stack[-1].nodes if tos_nodes: last_leaf = tos_nodes[-1].get_last_leaf() else: last_leaf = None - if self._start_symbol == 'file_input' and \ - (typ == ENDMARKER or typ == DEDENT and '\n' not in last_leaf.value): - def reduce_stack(states, newstate): - # reduce - state = newstate - while states[state] == [(0, state)]: - self.pgen_parser._pop() - - dfa, state, (type_, nodes) = stack[-1] - states, first = dfa - - + if self._start_nonterminal == 'file_input' and \ + (token.type == PythonTokenTypes.ENDMARKER or + token.type == DEDENT and '\n' not in last_leaf.value): # In Python statements need to end with a newline. But since it's # possible (and valid in Python ) that there's no newline at the # end of a file, we have to recover even if the user doesn't want # error recovery. - #print('x', pprint.pprint(stack)) - ilabel = token_to_ilabel(pgen_grammar, NEWLINE, value) - - dfa, state, (type_, nodes) = stack[-1] - symbol = pgen_grammar.number2symbol[type_] - states, first = dfa - arcs = states[state] - # Look for a state with this label - for i, newstate in arcs: - if ilabel == i: - if symbol == 'simple_stmt': - # This is basically shifting - stack[-1] = (dfa, newstate, (type_, nodes)) - - reduce_stack(states, newstate) - add_token_callback(typ, value, start_pos, prefix) + if self.stack[-1].dfa.from_rule == 'simple_stmt': + try: + plan = self.stack[-1].dfa.transitions[PythonTokenTypes.NEWLINE] + except KeyError: + pass + else: + if plan.next_dfa.is_final and not plan.dfa_pushes: + # We are ignoring here that the newline would be + # required for a simple_stmt. + self.stack[-1].dfa = plan.next_dfa + self._add_token(token) return - # Check if we're at the right point - #for symbol, nodes in get_symbol_and_nodes(stack): - # self.pgen_parser._pop() - - #break - break - #symbol = pgen_grammar.number2symbol[type_] if not self._error_recovery: - return super(Parser, self).error_recovery( - pgen_grammar, stack, arcs, typ, value, start_pos, prefix, - add_token_callback) + return super(Parser, self).error_recovery(token) def current_suite(stack): # For now just discard everything that is not a suite or # file_input, if we detect an error. 
- for index, (symbol, nodes) in reversed(list(enumerate(get_symbol_and_nodes(stack)))): + for until_index, stack_node in reversed(list(enumerate(stack))): # `suite` can sometimes be only simple_stmt, not stmt. - if symbol == 'file_input': - break - elif symbol == 'suite' and len(nodes) > 1: - # suites without an indent in them get discarded. + if stack_node.nonterminal == 'file_input': break - return index, symbol, nodes + elif stack_node.nonterminal == 'suite': + # In the case where we just have a newline we don't want to + # do error recovery here. In all other cases, we want to do + # error recovery. + if len(stack_node.nodes) != 1: + break + return until_index - index, symbol, nodes = current_suite(stack) + until_index = current_suite(self.stack) - # print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index) - if self._stack_removal(pgen_grammar, stack, arcs, index + 1, value, start_pos): - add_token_callback(typ, value, start_pos, prefix) + if self._stack_removal(until_index + 1): + self._add_token(token) else: + typ, value, start_pos, prefix = token if typ == INDENT: # For every deleted INDENT we have to delete a DEDENT as well. # Otherwise the parser will get into trouble and DEDENT too early. self._omit_dedent_list.append(self._indent_counter) - error_leaf = tree.PythonErrorLeaf(tok_name[typ].lower(), value, start_pos, prefix) - stack[-1][2][1].append(error_leaf) + error_leaf = tree.PythonErrorLeaf(typ.name, value, start_pos, prefix) + self.stack[-1].nodes.append(error_leaf) + + tos = self.stack[-1] + if tos.nonterminal == 'suite': + # Need at least one statement in the suite. This happend with the + # error recovery above. + try: + tos.dfa = tos.dfa.arcs['stmt'] + except KeyError: + # We're already in a final state. + pass + + def _stack_removal(self, start_index): + all_nodes = [node for stack_node in self.stack[start_index:] for node in stack_node.nodes] - if symbol == 'suite': - dfa, state, node = stack[-1] - states, first = dfa - arcs = states[state] - intended_label = pgen_grammar.symbol2label['stmt'] - # Introduce a proper state transition. We're basically allowing - # there to be no valid statements inside a suite. 
- if [x[0] for x in arcs] == [intended_label]: - new_state = arcs[0][1] - stack[-1] = dfa, new_state, node - - def _stack_removal(self, pgen_grammar, stack, arcs, start_index, value, start_pos): - failed_stack = False - found = False - all_nodes = [] - for dfa, state, (type_, nodes) in stack[start_index:]: - if nodes: - found = True - if found: - failed_stack = True - all_nodes += nodes - if failed_stack: - stack[start_index - 1][2][1].append(tree.PythonErrorNode(all_nodes)) + if all_nodes: + self.stack[start_index - 1].nodes.append(tree.PythonErrorNode(all_nodes)) - stack[start_index:] = [] - return failed_stack + self.stack[start_index:] = [] + return bool(all_nodes) def _recovery_tokenize(self, tokens): - for typ, value, start_pos, prefix in tokens: + for token in tokens: + typ = token[0] # print(tok_name[typ], repr(value), start_pos, repr(prefix)) if typ == DEDENT: # We need to count indents, because if we just omit any DEDENT, @@ -262,4 +209,4 @@ self._indent_counter -= 1 elif typ == INDENT: self._indent_counter += 1 - yield typ, value, start_pos, prefix + yield token diff -Nru parso-0.2.1/parso/python/tokenize.py parso-0.3.1/parso/python/tokenize.py --- parso-0.2.1/parso/python/tokenize.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/python/tokenize.py 2018-07-09 18:53:54.000000000 +0000 @@ -18,14 +18,25 @@ import itertools as _itertools from codecs import BOM_UTF8 -from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap, - NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT, - ERROR_DEDENT, FSTRING_STRING, FSTRING_START, - FSTRING_END) +from parso.python.token import PythonTokenTypes from parso._compatibility import py_version from parso.utils import split_lines +STRING = PythonTokenTypes.STRING +NAME = PythonTokenTypes.NAME +NUMBER = PythonTokenTypes.NUMBER +OP = PythonTokenTypes.OP +NEWLINE = PythonTokenTypes.NEWLINE +INDENT = PythonTokenTypes.INDENT +DEDENT = PythonTokenTypes.DEDENT +ENDMARKER = PythonTokenTypes.ENDMARKER +ERRORTOKEN = PythonTokenTypes.ERRORTOKEN +ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT +FSTRING_START = PythonTokenTypes.FSTRING_START +FSTRING_STRING = PythonTokenTypes.FSTRING_STRING +FSTRING_END = PythonTokenTypes.FSTRING_END + TokenCollection = namedtuple( 'TokenCollection', 'pseudo_token single_quoted triple_quoted endpats whitespace ' @@ -242,12 +253,9 @@ class PythonToken(Token): - def _get_type_name(self, exact=True): - return tok_name[self.type] - def __repr__(self): return ('TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)' % - self._replace(type=self._get_type_name())) + self._replace(type=self.type.name)) class FStringNode(object): @@ -396,7 +404,9 @@ endmatch = endprog.match(line) if endmatch: pos = endmatch.end(0) - yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix) + yield PythonToken( + STRING, contstr + line[:pos], + contstr_start, prefix) contstr = '' contline = None else: @@ -438,16 +448,15 @@ pseudomatch = pseudo_token.match(line, pos) if not pseudomatch: # scan for tokens - if line.endswith('\n'): - new_line = True match = whitespace.match(line, pos) pos = match.end() yield PythonToken( - ERRORTOKEN, line[pos:], (lnum, pos), + ERRORTOKEN, line[pos], (lnum, pos), additional_prefix + match.group(0) ) additional_prefix = '' - break + pos += 1 + continue prefix = additional_prefix + pseudomatch.group(1) additional_prefix = '' @@ -571,13 +580,7 @@ and fstring_stack[-1].parentheses_count == 1: fstring_stack[-1].format_spec_count += 1 - try: - # This check is needed in any case to check if it's a 
valid - # operator or just some random unicode character. - typ = opmap[token] - except KeyError: - typ = ERRORTOKEN - yield PythonToken(typ, token, spos, prefix) + yield PythonToken(OP, token, spos, prefix) if contstr: yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix) diff -Nru parso-0.2.1/parso/python/token.py parso-0.3.1/parso/python/token.py --- parso-0.2.1/parso/python/token.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/python/token.py 2018-07-09 18:53:54.000000000 +0000 @@ -1,113 +1,27 @@ from __future__ import absolute_import -from itertools import count -from token import * -from parso._compatibility import py_version +class TokenType(object): + def __init__(self, name, contains_syntax=False): + self.name = name + self.contains_syntax = contains_syntax -_counter = count(N_TOKENS) -# Never want to see this thing again. -del N_TOKENS - -COMMENT = next(_counter) -tok_name[COMMENT] = 'COMMENT' - -NL = next(_counter) -tok_name[NL] = 'NL' - -# Sets the attributes that don't exist in these tok_name versions. -if py_version >= 30: - BACKQUOTE = next(_counter) - tok_name[BACKQUOTE] = 'BACKQUOTE' -else: - RARROW = next(_counter) - tok_name[RARROW] = 'RARROW' - ELLIPSIS = next(_counter) - tok_name[ELLIPSIS] = 'ELLIPSIS' - -if py_version < 35: - ATEQUAL = next(_counter) - tok_name[ATEQUAL] = 'ATEQUAL' - -ERROR_DEDENT = next(_counter) -tok_name[ERROR_DEDENT] = 'ERROR_DEDENT' - -FSTRING_START = next(_counter) -tok_name[FSTRING_START] = 'FSTRING_START' -FSTRING_END = next(_counter) -tok_name[FSTRING_END] = 'FSTRING_END' -FSTRING_STRING = next(_counter) -tok_name[FSTRING_STRING] = 'FSTRING_STRING' -EXCLAMATION = next(_counter) -tok_name[EXCLAMATION] = 'EXCLAMATION' - -# Map from operator to number (since tokenize doesn't do this) - -opmap_raw = """\ -( LPAR -) RPAR -[ LSQB -] RSQB -: COLON -, COMMA -; SEMI -+ PLUS -- MINUS -* STAR -/ SLASH -| VBAR -& AMPER -< LESS -> GREATER -= EQUAL -. DOT -% PERCENT -` BACKQUOTE -{ LBRACE -} RBRACE -@ AT -== EQEQUAL -!= NOTEQUAL -<> NOTEQUAL -<= LESSEQUAL ->= GREATEREQUAL -~ TILDE -^ CIRCUMFLEX -<< LEFTSHIFT ->> RIGHTSHIFT -** DOUBLESTAR -+= PLUSEQUAL --= MINEQUAL -*= STAREQUAL -/= SLASHEQUAL -%= PERCENTEQUAL -&= AMPEREQUAL -|= VBAREQUAL -@= ATEQUAL -^= CIRCUMFLEXEQUAL -<<= LEFTSHIFTEQUAL ->>= RIGHTSHIFTEQUAL -**= DOUBLESTAREQUAL -// DOUBLESLASH -//= DOUBLESLASHEQUAL --> RARROW -... ELLIPSIS -! EXCLAMATION -""" - -opmap = {} -for line in opmap_raw.splitlines(): - op, name = line.split() - opmap[op] = globals()[name] + def __repr__(self): + return '%s(%s)' % (self.__class__.__name__, self.name) -def generate_token_id(string): +class TokenTypes(object): """ - Uses a token in the grammar (e.g. `'+'` or `'and'`returns the corresponding - ID for it. The strings are part of the grammar file. + Basically an enum, but Python 2 doesn't have enums in the standard library. 
""" - try: - return opmap[string] - except KeyError: - pass - return globals()[string] + def __init__(self, names, contains_syntax): + for name in names: + setattr(self, name, TokenType(name, contains_syntax=name in contains_syntax)) + + +PythonTokenTypes = TokenTypes(( + 'STRING', 'NUMBER', 'NAME', 'ERRORTOKEN', 'NEWLINE', 'INDENT', 'DEDENT', + 'ERROR_DEDENT', 'FSTRING_STRING', 'FSTRING_START', 'FSTRING_END', 'OP', + 'ENDMARKER'), + contains_syntax=('NAME', 'OP'), +) diff -Nru parso-0.2.1/parso/python/tree.py parso-0.3.1/parso/python/tree.py --- parso-0.2.1/parso/python/tree.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/python/tree.py 2018-07-09 18:53:54.000000000 +0000 @@ -124,7 +124,7 @@ # indent error leafs somehow? No idea how, though. previous_leaf = self.get_previous_leaf() if previous_leaf is not None and previous_leaf.type == 'error_leaf' \ - and previous_leaf.original_type in ('indent', 'error_dedent'): + and previous_leaf.token_type in ('INDENT', 'ERROR_DEDENT'): previous_leaf = previous_leaf.get_previous_leaf() if previous_leaf is None: @@ -537,7 +537,9 @@ if child is None or child == ',': param_children = children[start:end] if param_children: # Could as well be comma and then end. - if param_children[0] == '*' and param_children[1] == ',' \ + if param_children[0] == '*' \ + and (len(param_children) == 1 + or param_children[1] == ',') \ or check_python2_nested_param(param_children[0]): for p in param_children: p.parent = parent diff -Nru parso-0.2.1/parso/tree.py parso-0.3.1/parso/tree.py --- parso-0.2.1/parso/tree.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/parso/tree.py 2018-07-09 18:53:54.000000000 +0000 @@ -229,6 +229,7 @@ class TypedLeaf(Leaf): __slots__ = ('type',) + def __init__(self, type, value, start_pos, prefix=''): super(TypedLeaf, self).__init__(value, start_pos, prefix) self.type = type @@ -351,13 +352,13 @@ A leaf that is either completely invalid in a language (like `$` in Python) or is invalid at that position. Like the star in `1 +* 1`. """ - __slots__ = ('original_type',) + __slots__ = ('token_type',) type = 'error_leaf' - def __init__(self, original_type, value, start_pos, prefix=''): + def __init__(self, token_type, value, start_pos, prefix=''): super(ErrorLeaf, self).__init__(value, start_pos, prefix) - self.original_type = original_type + self.token_type = token_type def __repr__(self): return "<%s: %s:%s, %s>" % \ - (type(self).__name__, self.original_type, repr(self.value), self.start_pos) + (type(self).__name__, self.token_type, repr(self.value), self.start_pos) diff -Nru parso-0.2.1/parso.egg-info/PKG-INFO parso-0.3.1/parso.egg-info/PKG-INFO --- parso-0.2.1/parso.egg-info/PKG-INFO 2018-05-21 11:01:06.000000000 +0000 +++ parso-0.3.1/parso.egg-info/PKG-INFO 2018-07-09 18:56:17.000000000 +0000 @@ -1,10 +1,12 @@ -Metadata-Version: 1.1 +Metadata-Version: 1.2 Name: parso -Version: 0.2.1 +Version: 0.3.1 Summary: A Python Parser Home-page: https://github.com/davidhalter/parso Author: David Halter Author-email: davidhalter88@gmail.com +Maintainer: David Halter +Maintainer-email: davidhalter88@gmail.com License: MIT Description: ################################################################### parso - A Python Parser @@ -103,6 +105,16 @@ Changelog --------- + 0.3.1 (2018-07-09) + +++++++++++++++++++ + + - Bugfixes in the diff parser and keyword-only arguments + + 0.3.0 (2018-06-30) + +++++++++++++++++++ + + - Rewrote the pgen2 parser generator. 
+ 0.2.1 (2018-05-21) +++++++++++++++++++ diff -Nru parso-0.2.1/parso.egg-info/SOURCES.txt parso-0.3.1/parso.egg-info/SOURCES.txt --- parso-0.2.1/parso.egg-info/SOURCES.txt 2018-05-21 11:01:06.000000000 +0000 +++ parso-0.3.1/parso.egg-info/SOURCES.txt 2018-07-09 18:56:17.000000000 +0000 @@ -42,9 +42,8 @@ parso.egg-info/dependency_links.txt parso.egg-info/top_level.txt parso/pgen2/__init__.py -parso/pgen2/grammar.py -parso/pgen2/parse.py -parso/pgen2/pgen.py +parso/pgen2/generator.py +parso/pgen2/grammar_parser.py parso/python/__init__.py parso/python/diff.py parso/python/errors.py @@ -66,6 +65,7 @@ test/test_absolute_import.py test/test_cache.py test/test_diff_parser.py +test/test_error_recovery.py test/test_file_python_errors.py test/test_fstring.py test/test_get_code.py diff -Nru parso-0.2.1/PKG-INFO parso-0.3.1/PKG-INFO --- parso-0.2.1/PKG-INFO 2018-05-21 11:01:06.000000000 +0000 +++ parso-0.3.1/PKG-INFO 2018-07-09 18:56:17.000000000 +0000 @@ -1,10 +1,12 @@ -Metadata-Version: 1.1 +Metadata-Version: 1.2 Name: parso -Version: 0.2.1 +Version: 0.3.1 Summary: A Python Parser Home-page: https://github.com/davidhalter/parso Author: David Halter Author-email: davidhalter88@gmail.com +Maintainer: David Halter +Maintainer-email: davidhalter88@gmail.com License: MIT Description: ################################################################### parso - A Python Parser @@ -103,6 +105,16 @@ Changelog --------- + 0.3.1 (2018-07-09) + +++++++++++++++++++ + + - Bugfixes in the diff parser and keyword-only arguments + + 0.3.0 (2018-06-30) + +++++++++++++++++++ + + - Rewrote the pgen2 parser generator. + 0.2.1 (2018-05-21) +++++++++++++++++++ diff -Nru parso-0.2.1/test/test_diff_parser.py parso-0.3.1/test/test_diff_parser.py --- parso-0.2.1/test/test_diff_parser.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/test/test_diff_parser.py 2018-07-09 18:53:54.000000000 +0000 @@ -507,3 +507,22 @@ def test_newlines_at_end(differ): differ.initialize('a\n\n') differ.parse('a\n', copies=1) + + +def test_end_newline_with_decorator(differ): + code = dedent('''\ + @staticmethod + def spam(): + import json + json.l''') + + differ.initialize(code) + module = differ.parse(code + '\n', copies=1) + decorated, endmarker = module.children + assert decorated.type == 'decorated' + decorator, func = decorated.children + suite = func.children[-1] + assert suite.type == 'suite' + newline, first_stmt, second_stmt = suite.children + assert first_stmt.get_code() == ' import json\n' + assert second_stmt.get_code() == ' json.l\n' diff -Nru parso-0.2.1/test/test_error_recovery.py parso-0.3.1/test/test_error_recovery.py --- parso-0.2.1/test/test_error_recovery.py 1970-01-01 00:00:00.000000000 +0000 +++ parso-0.3.1/test/test_error_recovery.py 2018-07-09 18:53:54.000000000 +0000 @@ -0,0 +1,85 @@ +from parso import parse, load_grammar + + +def test_with_stmt(): + module = parse('with x: f.\na') + assert module.children[0].type == 'with_stmt' + w, with_item, colon, f = module.children[0].children + assert f.type == 'error_node' + assert f.get_code(include_prefix=False) == 'f.' 
+ + assert module.children[2].type == 'name' + + +def test_one_line_function(each_version): + module = parse('def x(): f.', version=each_version) + assert module.children[0].type == 'funcdef' + def_, name, parameters, colon, f = module.children[0].children + assert f.type == 'error_node' + + module = parse('def x(a:', version=each_version) + func = module.children[0] + assert func.type == 'error_node' + if each_version.startswith('2'): + assert func.children[-1].value == 'a' + else: + assert func.children[-1] == ':' + + +def test_if_else(): + module = parse('if x:\n f.\nelse:\n g(') + if_stmt = module.children[0] + if_, test, colon, suite1, else_, colon, suite2 = if_stmt.children + f = suite1.children[1] + assert f.type == 'error_node' + assert f.children[0].value == 'f' + assert f.children[1].value == '.' + g = suite2.children[1] + assert g.children[0].value == 'g' + assert g.children[1].value == '(' + + +def test_if_stmt(): + module = parse('if x: f.\nelse: g(') + if_stmt = module.children[0] + assert if_stmt.type == 'if_stmt' + if_, test, colon, f = if_stmt.children + assert f.type == 'error_node' + assert f.children[0].value == 'f' + assert f.children[1].value == '.' + + assert module.children[1].type == 'newline' + assert module.children[1].value == '\n' + assert module.children[2].type == 'error_leaf' + assert module.children[2].value == 'else' + assert module.children[3].type == 'error_leaf' + assert module.children[3].value == ':' + + in_else_stmt = module.children[4] + assert in_else_stmt.type == 'error_node' + assert in_else_stmt.children[0].value == 'g' + assert in_else_stmt.children[1].value == '(' + + +def test_invalid_token(): + module = parse('a + ? + b') + error_node, q, plus_b, endmarker = module.children + assert error_node.get_code() == 'a +' + assert q.value == '?' + assert q.type == 'error_leaf' + assert plus_b.type == 'factor' + assert plus_b.get_code() == ' + b' + + +def test_invalid_token_in_fstr(): + module = load_grammar(version='3.6').parse('f"{a + ? + b}"') + error_node, q, plus_b, error1, error2, endmarker = module.children + assert error_node.get_code() == 'f"{a +' + assert q.value == '?' + assert q.type == 'error_leaf' + assert plus_b.type == 'error_node' + assert plus_b.get_code() == ' + b' + assert error1.value == '}' + assert error1.type == 'error_leaf' + assert error2.value == '"' + assert error2.type == 'error_leaf' diff -Nru parso-0.2.1/test/test_param_splitting.py parso-0.3.1/test/test_param_splitting.py --- parso-0.2.1/test/test_param_splitting.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/test/test_param_splitting.py 2018-07-09 18:53:54.000000000 +0000 @@ -32,3 +32,16 @@ assert_params(u'x, *args', x=None, args=None) assert_params(u'**kwargs', kwargs=None) assert_params(u'*args, **kwargs', args=None, kwargs=None) + + +def test_kw_only_no_kw(works_ge_py3): + """ + Parsing this should be working. In CPython the parser also parses this and + in a later step the AST complains. 
+ """ + module = works_ge_py3.parse('def test(arg, *):\n pass') + if module is not None: + func = module.children[0] + open_, p1, asterisk, close = func._get_param_nodes() + assert p1.get_code('arg,') + assert asterisk.value == '*' diff -Nru parso-0.2.1/test/test_pgen2.py parso-0.3.1/test/test_pgen2.py --- parso-0.2.1/test/test_pgen2.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/test/test_pgen2.py 2018-07-09 18:53:54.000000000 +0000 @@ -12,6 +12,8 @@ from parso import load_grammar from parso import ParserSyntaxError +from parso.pgen2 import generate_grammar +from parso.python import tokenize def _parse(code, version=None): @@ -270,3 +272,19 @@ def test_py3_rb(works_ge_py3): works_ge_py3.parse("rb'1'") works_ge_py3.parse("RB'1'") + + +def test_left_recursion(): + with pytest.raises(ValueError, match='left recursion'): + generate_grammar('foo: foo NAME\n', tokenize.PythonTokenTypes) + + +def test_ambiguities(): + with pytest.raises(ValueError, match='ambiguous'): + generate_grammar('foo: bar | baz\nbar: NAME\nbaz: NAME\n', tokenize.PythonTokenTypes) + + with pytest.raises(ValueError, match='ambiguous'): + generate_grammar('''foo: bar | baz\nbar: 'x'\nbaz: "x"\n''', tokenize.PythonTokenTypes) + + with pytest.raises(ValueError, match='ambiguous'): + generate_grammar('''foo: bar | 'x'\nbar: 'x'\n''', tokenize.PythonTokenTypes) diff -Nru parso-0.2.1/test/test_tokenize.py parso-0.3.1/test/test_tokenize.py --- parso-0.2.1/test/test_tokenize.py 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/test/test_tokenize.py 2018-07-09 18:53:54.000000000 +0000 @@ -6,14 +6,25 @@ from parso._compatibility import py_version from parso.utils import split_lines, parse_version_string -from parso.python.token import ( - NAME, NEWLINE, STRING, INDENT, DEDENT, ERRORTOKEN, ENDMARKER, ERROR_DEDENT, - FSTRING_START) +from parso.python.token import PythonTokenTypes from parso.python import tokenize from parso import parse from parso.python.tokenize import PythonToken +# To make it easier to access some of the token types, just put them here. +NAME = PythonTokenTypes.NAME +NEWLINE = PythonTokenTypes.NEWLINE +STRING = PythonTokenTypes.STRING +INDENT = PythonTokenTypes.INDENT +DEDENT = PythonTokenTypes.DEDENT +ERRORTOKEN = PythonTokenTypes.ERRORTOKEN +OP = PythonTokenTypes.OP +ENDMARKER = PythonTokenTypes.ENDMARKER +ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT +FSTRING_START = PythonTokenTypes.FSTRING_START + + def _get_token_list(string): # Load the current version. version_info = parse_version_string() @@ -127,7 +138,7 @@ else: # Unicode tokens in Python 2 seem to be identified as operators. # They will be ignored in the parser, that's ok. 
- assert unicode_token[0] == tokenize.ERRORTOKEN + assert unicode_token[0] == OP def test_quoted_strings(): @@ -187,17 +198,16 @@ def test_error_literal(): error_token, endmarker = _get_token_list('"\n') - assert error_token.type == tokenize.ERRORTOKEN - assert endmarker.prefix == '' - assert error_token.string == '"\n' - assert endmarker.type == tokenize.ENDMARKER - assert endmarker.prefix == '' + assert error_token.type == ERRORTOKEN + assert error_token.string == '"' + assert endmarker.type == ENDMARKER + assert endmarker.prefix == '\n' bracket, error_token, endmarker = _get_token_list('( """') - assert error_token.type == tokenize.ERRORTOKEN + assert error_token.type == ERRORTOKEN assert error_token.prefix == ' ' assert error_token.string == '"""' - assert endmarker.type == tokenize.ENDMARKER + assert endmarker.type == ENDMARKER assert endmarker.prefix == '' @@ -233,5 +243,6 @@ t1, endmarker = _get_token_list(' "\n') assert t1.type == ERRORTOKEN assert t1.prefix == ' ' - assert t1.string == '"\n' + assert t1.string == '"' + assert endmarker.prefix == '\n' assert endmarker.string == '' diff -Nru parso-0.2.1/tox.ini parso-0.3.1/tox.ini --- parso-0.2.1/tox.ini 2018-05-21 10:58:16.000000000 +0000 +++ parso-0.3.1/tox.ini 2018-07-09 18:53:54.000000000 +0000 @@ -2,6 +2,7 @@ envlist = py27, py33, py34, py35, py36, py37 [testenv] deps = + {env:_SETUPTOOLS_DEP:setuptools} {env:_PARSO_TEST_PYTEST_DEP:pytest>=3.0.7} # For --lf and --ff. pytest-cache @@ -10,6 +11,7 @@ # tox corrupts __pycache__, solution from here: PYTHONDONTWRITEBYTECODE=1 py26,py33: _PARSO_TEST_PYTEST_DEP=pytest>=3.0.7,<3.3 + py26,py33: _SETUPTOOLS_DEP=setuptools<37 commands = pytest {posargs:parso test} [testenv:cov]
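
The central refactor in this release, visible in the parso/python/token.py and tokenize.py hunks above, is that token types are no longer integers borrowed from the stdlib `token` module plus an `opmap`; they are singleton TokenType objects collected on PythonTokenTypes, and operators all tokenize as the single OP type. The following is a minimal sketch of how that new API is exercised, written against the 0.3.1 code shown in this diff; the exact tokenize() signature and the '3.6' version string are assumptions based on the test changes above, not part of the patch itself.

    from parso.python.token import PythonTokenTypes
    from parso.python.tokenize import tokenize
    from parso.utils import parse_version_string

    # Tokenize a one-line assignment against the 3.6 grammar.
    for tok in tokenize('x = 1\n', parse_version_string('3.6')):
        # tok.type is now a TokenType instance (e.g. PythonTokenTypes.NAME,
        # PythonTokenTypes.OP) instead of an int; .name gives its label.
        print(tok.type.name, repr(tok.string))

    # Expected output, roughly: NAME 'x', OP '=', NUMBER '1',
    # NEWLINE '\n', ENDMARKER '' -- note '=' is OP, not EQUAL as in 0.2.x.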