diff -Nru lark-parser-0.6.2/debian/changelog lark-parser-0.7.0/debian/changelog --- lark-parser-0.6.2/debian/changelog 2018-08-18 10:50:06.000000000 +0000 +++ lark-parser-0.7.0/debian/changelog 2019-05-02 19:00:00.000000000 +0000 @@ -1,3 +1,9 @@ +lark-parser (0.7.0-0~bionic0) bionic; urgency=low + + * New upstream version. + + -- Angelos Tzotsos Thu, 02 May 2019 21:00:00 +0200 + lark-parser (0.6.2-0~bionic0) bionic; urgency=low * New upstream version. diff -Nru lark-parser-0.6.2/docs/classes.md lark-parser-0.7.0/docs/classes.md --- lark-parser-0.6.2/docs/classes.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/classes.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,227 @@ +# Classes - Reference + +This page details the important classes in Lark. + +---- + +## Lark + +The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor. + +### Methods + +#### \_\_init\_\_(self, grammar, **options) + +The Lark class accepts a grammar string or file object, and keyword options: + +* start - The symbol in the grammar that begins the parse (Default: `"start"`) + +* parser - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`) + +* lexer - Overrides default lexer. + +* transformer - Applies the transformer instead of building a parse tree (only allowed with parser="lalr") + +* postlex - Lexer post-processing (Default: None. only works when lexer is "standard" or "contextual") + +* ambiguity (only relevant for earley and cyk) + + * "explicit" - Return all derivations inside an "_ambig" data node. + + * "resolve" - Let the parser choose the best derivation (greedy for tokens, non-greedy for rules. Default) + +* debug - Display warnings (such as Shift-Reduce warnings for LALR) + +* keep_all_tokens - Don't throw away any terminals from the tree (Default=False) + +* propagate_positions - Propagate line/column count to tree nodes (default=False) + +* lexer_callbacks - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information. + +#### parse(self, text) + +Return a complete parse tree for the text (of type Tree) + +If a transformer is supplied to `__init__`, returns whatever is the result of the transformation. + +---- + +## Tree + +The main tree class + +### Properties + +* `data` - The name of the rule or alias +* `children` - List of matched sub-rules and terminals +* `meta` - Line & Column numbers, if using `propagate_positions` + +### Methods + +#### \_\_init\_\_(self, data, children) + +Creates a new tree, and stores "data" and "children" in attributes of the same name. + +#### pretty(self, indent_str=' ') + +Returns an indented string representation of the tree. Great for debugging. + +#### find_pred(self, pred) + +Returns all nodes of the tree that evaluate pred(node) as true. + +#### find_data(self, data) + +Returns all nodes of the tree whose data equals the given data. + +#### iter_subtrees(self) + +Depth-first iteration. + +Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG). + +#### iter_subtrees_topdown(self) + +Breadth-first iteration. + +Iterates over all the subtrees, return nodes in order like pretty() does. + +#### \_\_eq\_\_, \_\_hash\_\_ + +Trees can be hashed and compared. 
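+For illustration, here is a minimal sketch that exercises these methods (the tree below is built by hand, not produced by any particular grammar):
+
+```python
+from lark import Tree
+
+# Normally Lark builds the tree while parsing; here we construct one manually.
+tree = Tree('start', [Tree('greet', ['hello']), Tree('greet', ['world'])])
+
+print(tree.pretty())                    # indented view, handy for debugging
+greets = list(tree.find_data('greet'))  # every subtree whose .data == 'greet'
+assert len(greets) == 2
+assert tree == Tree('start', list(tree.children))  # trees compare by value
+```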
+ +---- + +## Transformers & Visitors + +Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns. + +They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each method accepts the children as an argument. That can be modified using the `v_args` decorator, which allows you to inline the arguments (akin to `*args`), or add the tree's `meta` property as an argument. + +See: https://github.com/lark-parser/lark/blob/master/lark/visitors.py + +### Visitors + +Visitors visit each node of the tree, and run the appropriate method on it according to the node's data. + +They work bottom-up, starting with the leaves and ending at the root of the tree. + +**Example** +```python +class IncreaseAllNumbers(Visitor): + def number(self, tree): + assert tree.data == "number" + tree.children[0] += 1 + +IncreaseAllNumbers().visit(parse_tree) +``` + +There are two classes that implement the visitor interface: + +* Visitor - Visit every node (without recursion) + +* Visitor_Recursive - Visit every node using recursion. Slightly faster. + +### Transformers + +Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. + +They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree. + +Transformers can be used to implement map & reduce patterns. + +Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable). + +Transformers can be chained into a new transformer by using multiplication. + +**Example:** +```python +from lark import Tree, Transformer + +class EvalExpressions(Transformer): + def expr(self, args): + return eval(args[0]) + +t = Tree('a', [Tree('expr', ['1+2'])]) +print(EvalExpressions().transform( t )) + +# Prints: Tree(a, [3]) +``` + + +Here are the classes that implement the transformer interface: + +- Transformer - Recursively transforms the tree. This is the one you probably want. +- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances +- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances + +### v_args + +`v_args` is a decorator. + +By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior. + +When used on a transformer/visitor class definition, it applies to all the callback methods inside it. + +`v_args` accepts one of three flags: + +- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists). +- `meta` - Provides two arguments: `children` and `meta` (instead of just the first) +- `tree` - Provides the entire tree as the argument, instead of the children. + +Examples: + +```python +@v_args(inline=True) +class SolveArith(Transformer): + def add(self, left, right): + return left + right + + +class ReverseNotation(Transformer_InPlace): + @v_args(tree=True) + def tree_node(self, tree): + tree.children = tree.children[::-1] +``` + +### Discard + +When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent. + +## Token + +When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected.
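+To make that concrete, a quick sketch (the token type and value here are invented for illustration):
+
+```python
+from lark import Token
+
+tok = Token('NAME', 'hello')    # behaves just like the str 'hello'
+assert tok == 'hello'
+assert tok.upper() == 'HELLO'
+assert tok.type == 'NAME'       # plus the attributes listed below
+```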
Tokens also have other useful attributes: + +* `type` - Name of the token (as specified in grammar). +* `pos_in_stream` - the index of the token in the text +* `line` - The line of the token in the text (starting with 1) +* `column` - The column of the token in the text (starting with 1) +* `end_line` - The line where the token ends +* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. + + +## UnexpectedInput + +- `UnexpectedInput` + - `UnexpectedToken` - The parser recieved an unexpected token + - `UnexpectedCharacters` - The lexer encountered an unexpected string + +After catching one of these exceptions, you may call the following helper methods to create a nicer error message: + +### Methods + +#### get_context(text, span) + +Returns a pretty string pinpointing the error in the text, with `span` amount of context characters around it. + +(The parser doesn't hold a copy of the text it has to parse, so you have to provide it again) + +#### match_examples(parse_fn, examples) + +Allows you to detect what's wrong in the input text by matching against example errors. + +Accepts the parse function (usually `lark_instance.parse`) and a dictionary of `{'example_string': value}`. + +The function will iterate the dictionary until it finds a matching error, and return the corresponding value. + +For an example usage, see: [examples/error_reporting_lalr.py](https://github.com/lark-parser/lark/blob/master/examples/error_reporting_lalr.py) Binary files /tmp/tmpDAY1UK/VUjEyr1VFL/lark-parser-0.6.2/docs/comparison_memory.png and /tmp/tmpDAY1UK/i9qQf2uH52/lark-parser-0.7.0/docs/comparison_memory.png differ Binary files /tmp/tmpDAY1UK/VUjEyr1VFL/lark-parser-0.6.2/docs/comparison_runtime.png and /tmp/tmpDAY1UK/i9qQf2uH52/lark-parser-0.7.0/docs/comparison_runtime.png differ diff -Nru lark-parser-0.6.2/docs/features.md lark-parser-0.7.0/docs/features.md --- lark-parser-0.6.2/docs/features.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/features.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,32 @@ +# Main Features + - Earley parser, capable of parsing any context-free grammar + - Implements SPPF, for efficient parsing and storing of ambiguous grammars. + - LALR(1) parser, limited in power of expression, but very efficient in space and performance (O(n)). + - Implements a parse-aware lexer that provides a better power of expression than traditional LALR implementations (such as ply). + - EBNF-inspired grammar, with extra features (See: [Grammar Reference](grammar.md)) + - Builds a parse-tree (AST) automagically based on the grammar + - Stand-alone parser generator - create a small independent parser to embed in your project. + - Automatic line & column tracking + - Automatic terminal collision resolution + - Standard library of terminals (strings, numbers, names, etc.) + - Unicode fully supported + - Extensive test suite + - Python 2 & Python 3 compatible + - Pure-Python implementation + +[Read more about the parsers](parsers.md) + +# Extra features + + - Import rules and tokens from other Lark grammars, for code reuse and modularity. 
+ - Import grammars from Nearley.js + - CYK parser + +### Experimental features + - Automatic reconstruction of input from parse-tree (see examples) + +### Planned features (not implemented yet) + - Generate code in other languages than Python + - Grammar composition + - LALR(k) parser + - Full regexp-collision support using NFAs diff -Nru lark-parser-0.6.2/docs/grammar.md lark-parser-0.7.0/docs/grammar.md --- lark-parser-0.6.2/docs/grammar.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/grammar.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,161 @@ +# Grammar Reference + +## Definitions + +**A grammar** is a list of rules and terminals that together define a language. + +Terminals define the alphabet of the language, while rules define its structure. + +In Lark, a terminal may be a string, a regular expression, or a concatenation of these and other terminals. + +Each rule is a list of terminals and rules, whose location and nesting define the structure of the resulting parse-tree. + +A **parsing algorithm** is an algorithm that takes a grammar definition and a sequence of symbols (members of the alphabet), and matches the entirety of the sequence by searching for a structure that is allowed by the grammar. + +## General Syntax and notes + +Grammars in Lark are based on [EBNF](https://en.wikipedia.org/wiki/Extended_Backus–Naur_form) syntax, with several enhancements. + +Lark grammars are composed of a list of definitions and directives, each on its own line. A definition is either a named rule, or a named terminal. + +**Comments** start with `//` and last to the end of the line (C++ style) + +Lark begins the parse with the rule 'start', unless specified otherwise in the options. + +Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects on the shape of the generated parse-tree, and on the automatic construction of the lexer (aka tokenizer, or scanner). + + +## Terminals + +Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals. + +**Syntax:** + +```html +<NAME> [. <priority>] : <literals-to-match> +``` + +Terminal names must be uppercase. + +Literals can be one of: + +* `"string"` +* `/regular expression+/` +* `"case-insensitive string"i` +* `/re with flags/imulx` +* Literal range: `"a".."z"`, `"1".."9"`, etc. + +#### Notes for when using a lexer: + +When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in an order according to the following criteria: + +1. Highest priority first (priority is specified as: TERM.number: ...) +2. Length of match (for regexps, the longest theoretical match is used) +3. Length of literal / pattern definition +4. Name + +**Examples:** +```perl +IF: "if" +INTEGER : /[0-9]+/ +INTEGER2 : ("0".."9")+ //# Same as INTEGER +DECIMAL.2: INTEGER "." INTEGER //# Will be matched before INTEGER +WHITESPACE: (" " | /\t/ )+ +SQL_SELECT: "select"i +``` + +## Rules + +**Syntax:** +```html +<name> : <items-to-match> [-> <alias>] + | ... +``` + +Names of rules and aliases are always in lowercase. + +Rule definitions can be extended to the next line by using the OR operator (signified by a pipe: `|` ). + +An alias is a name for the specific rule alternative. It affects tree construction. + + +Each item is one of: + +* `rule` +* `TERMINAL` +* `"string literal"` or `/regexp literal/` +* `(item item ..)` - Group items +* `[item item ..]` - Maybe.
Same as `(item item ..)?` +* `item?` - Zero or one instances of item ("maybe") +* `item*` - Zero or more instances of item +* `item+` - One or more instances of item +* `item ~ n` - Exactly *n* instances of item +* `item ~ n..m` - Between *n* and *m* instances of item + +**Examples:** +```perl +hello_world: "hello" "world" +mul: [mul "*"] number //# Left-recursion is allowed! +expr: expr operator expr + | value //# Multi-line, belongs to expr + +four_words: word ~ 4 +``` + + +## Directives + +### %ignore + +All occurrences of the terminal will be ignored, and won't be part of the parse. + +Using the `%ignore` directive results in a cleaner grammar. + +It's especially important for the LALR(1) algorithm, because adding whitespace (or comments, or other extraneous elements) explicitly in the grammar, harms its predictive abilities, which are based on a lookahead of 1. + +**Syntax:** +```html +%ignore <TERMINAL> +``` +**Examples:** +```perl +%ignore " " + +COMMENT: "#" /[^\n]/* +%ignore COMMENT +``` +### %import + +Allows importing terminals and rules from other lark grammars. + +When importing rules, all their dependencies will be imported into a namespace, to avoid collisions. It's not possible to override their dependencies (e.g. like you would when inheriting a class). + +**Syntax:** +```html +%import <module>.<TERMINAL> +%import <module>.<rule> +%import <module>.<TERMINAL> -> <NEWTERMINAL> +%import <module>.<rule> -> <newrule> +%import <module> (<TERM1> <TERM2> <rule1> <rule2>) +``` + +If the module path is absolute, Lark will attempt to load it from the built-in directory (currently, only `common.lark` is available). + +If the module path is relative, such as `.path.to.file`, Lark will attempt to load it from the current working directory. Grammars must have the `.lark` extension. + +The rule or terminal can be imported under another name with the `->` syntax. + +**Example:** +```perl +%import common.NUMBER + +%import .terminals_file (A B C) + +%import .rules_file.rulea -> ruleb +``` + +Note that `%ignore` directives cannot be imported. Imported rules will abide by the `%ignore` directives declared in the main grammar. + +### %declare + +Declare a terminal without defining it. Useful for plugins. diff -Nru lark-parser-0.6.2/docs/how_to_develop.md lark-parser-0.7.0/docs/how_to_develop.md --- lark-parser-0.6.2/docs/how_to_develop.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/how_to_develop.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,63 @@ +# How to develop Lark - Guide + +There are many ways you can help the project: + +* Help solve issues +* Improve the documentation +* Write new grammars for Lark's library +* Write a blog post introducing Lark to your audience +* Port Lark to another language +* Help me with code development + +If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process. + + +## Unit Tests + +Lark comes with an extensive set of tests. Many of the tests will run several times, once for each parser configuration. + +To run the tests, just go to the lark project root, and run the command: +```bash +python -m tests +``` + +or + +```bash +pypy -m tests +``` + +For a list of supported interpreters, you can consult the `tox.ini` file. + +You can also run a single unittest using its class and method name, for example: +```bash +## test_package test_class_name.test_function_name +python -m tests TestLalrStandard.test_lexer_error_recovering +``` + +### tox + +To run all Unit Tests with tox, +install tox and the Python interpreters you want to test, from Python 2.7 up to the latest version supported (consult the file tox.ini).
+Then, +run the command `tox` on the root of this project (where the main setup.py file is on). + +And, for example, +if you would like to only run the Unit Tests for Python version 2.7, +you can run the command `tox -e py27` + +### pytest + +You can also run the tests using pytest: + +```bash +pytest tests +``` + +### Using setup.py + +Another way to run the tests is using setup.py: + +```bash +python setup.py test +``` \ No newline at end of file diff -Nru lark-parser-0.6.2/docs/how_to_use.md lark-parser-0.7.0/docs/how_to_use.md --- lark-parser-0.6.2/docs/how_to_use.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/how_to_use.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,46 @@ +# How To Use Lark - Guide + +## Work process + +This is the recommended process for working with Lark: + +1. Collect or create input samples, that demonstrate key features or behaviors in the language you're trying to parse. + +2. Write a grammar. Try to aim for a structure that is intuitive, and in a way that imitates how you would explain your language to a fellow human. + +3. Try your grammar in Lark against each input sample. Make sure the resulting parse-trees make sense. + +4. Use Lark's grammar features to [[shape the tree|Tree Construction]]: Get rid of superfluous rules by inlining them, and use aliases when specific cases need clarification. + + - You can perform steps 1-4 repeatedly, gradually growing your grammar to include more sentences. + +5. Create a transformer to evaluate the parse-tree into a structure you'll be comfortable to work with. This may include evaluating literals, merging branches, or even converting the entire tree into your own set of AST classes. + +Of course, some specific use-cases may deviate from this process. Feel free to suggest these cases, and I'll add them to this page. + +## Getting started + +Browse the [Examples](https://github.com/lark-parser/lark/tree/master/examples) to find a template that suits your purposes. + +Read the tutorials to get a better understanding of how everything works. (links in the [main page](/)) + +Use the [Cheatsheet (PDF)](lark_cheatsheet.pdf) for quick reference. + +Use the reference pages for more in-depth explanations. (links in the [main page](/)] + +## LALR usage + +By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure `logging` framework beforehand. For example: + +```python +from lark import Lark +import logging +logging.basicConfig(level=logging.DEBUG) + +collision_grammar = ''' +start: as as +as: a* +a: "a" +''' +p = Lark(collision_grammar, parser='lalr', debug=True) +``` diff -Nru lark-parser-0.6.2/docs/index.md lark-parser-0.7.0/docs/index.md --- lark-parser-0.6.2/docs/index.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/index.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,51 @@ +# Lark + +A modern parsing library for Python + +## Overview + +Lark can parse any context-free grammar. 
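+A minimal usage sketch (the grammar and input below are invented for illustration):
+
+```python
+from lark import Lark
+
+# A tiny grammar: a start rule made of comma-separated words.
+parser = Lark('''
+    start: WORD ("," WORD)*
+
+    %import common.WORD
+    %ignore " "
+''')
+
+tree = parser.parse("hello, world")
+print(tree.pretty())
+```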
+ +Lark provides: + +- Advanced grammar language, based on EBNF +- Three parsing algorithms to choose from: Earley, LALR(1) and CYK +- Automatic tree construction, inferred from your grammar +- Fast unicode lexer with regexp support, and automatic line-counting + +Lark's code is hosted on Github: [https://github.com/lark-parser/lark](https://github.com/lark-parser/lark) + +### Install +```bash +$ pip install lark-parser +``` + +#### Syntax Highlighting + +- [Sublime Text & TextMate](https://github.com/lark-parser/lark_syntax) +- [Visual Studio Code](https://github.com/lark-parser/vscode-lark) (Or install through the vscode plugin system) + +----- + +## Documentation Index + + +* [Philosophy & Design Choices](philosophy.md) +* [Full List of Features](features.md) +* [Examples](https://github.com/lark-parser/lark/tree/master/examples) +* Tutorials + * [How to write a DSL](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - Implements a toy LOGO-like language with an interpreter + * [How to write a JSON parser](json_tutorial.md) + * External + * [Program Synthesis is Possible](https://www.cs.cornell.edu/~asampson/blog/minisynth.html) - Creates a DSL for Z3 +* Guides + * [How to use Lark](how_to_use.md) + * [How to develop Lark](how_to_develop.md) +* Reference + * [Grammar](grammar.md) + * [Tree Construction](tree_construction.md) + * [Classes](classes.md) + * [Cheatsheet (PDF)](lark_cheatsheet.pdf) +* Discussion + * [Gitter](https://gitter.im/lark-parser/Lobby) + * [Forum (Google Groups)](https://groups.google.com/forum/#!forum/lark-parser) diff -Nru lark-parser-0.6.2/docs/json_tutorial.md lark-parser-0.7.0/docs/json_tutorial.md --- lark-parser-0.6.2/docs/json_tutorial.md 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/docs/json_tutorial.md 2019-03-28 13:53:28.000000000 +0000 @@ -79,7 +79,8 @@ Lark will accept this, if you really want to complicate your life :) -(You can find the original definitions in [common.lark](/lark/grammars/common.lark).) +You can find the original definitions in [common.lark](/lark/grammars/common.lark). +They don't strictly adhere to [json.org](https://json.org/) - but our purpose here is to accept json, not validate it. Notice that terminals are written in UPPER-CASE, while rules are written in lower-case. I'll touch more on the differences between rules and terminals later. diff -Nru lark-parser-0.6.2/docs/parsers.md lark-parser-0.7.0/docs/parsers.md --- lark-parser-0.6.2/docs/parsers.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/parsers.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,49 @@ + +Lark implements the following parsing algorithms: Earley, LALR(1), and CYK + +# Earley + +An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser capable of parsing any context-free grammar at O(n^3), and O(n^2) when the grammar is unambiguous. It can parse most LR grammars at O(n). Most programming languages are LR, and can be parsed in linear time. + +Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`. + +It's possible to bypass the dynamic lexer, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step.
Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'` + +**SPPF & Ambiguity resolution** + +Lark implements the Shared Packed Parse Forest data-structure for the Earley parser, in order to reduce the space and computation required to handle ambiguous grammars. + +You can read more about SPPF [here](http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/) + +As a result, Lark can efficiently parse and store every ambiguity in the grammar, when using Earley. + +Lark provides the following options to combat ambiguity: + +1) Lark will choose the best derivation for you (default). Users can choose between different disambiguation strategies, and can prioritize (or demote) individual rules over others, using the rule-priority syntax. + +2) Users may choose to receive the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs. + +3) As an advanced feature, users may use specialized visitors to iterate the SPPF themselves. Future versions of Lark intend to improve and simplify this interface. + + +**dynamic_complete** + +**TODO: Add documentation on dynamic_complete** + +# LALR(1) + +[LALR(1)](https://www.wikiwand.com/en/LALR_parser) is a very efficient, tried-and-tested parsing algorithm. It's incredibly fast and requires very little memory. It can parse most programming languages (For example: Python and Java). + +Lark comes with an efficient implementation that outperforms every other parsing library for Python (including PLY) + +Lark extends the traditional YACC-based architecture with a *contextual lexer*, which automatically provides feedback from the parser to the lexer, making the LALR(1) algorithm stronger than ever. + +The contextual lexer communicates with the parser, and uses the parser's lookahead prediction to narrow its choice of tokens. So at each point, the lexer only matches the subgroup of terminals that are legal at that parser state, instead of all of the terminals. It’s surprisingly effective at resolving common terminal collisions, and allows it to parse languages that LALR(1) was previously incapable of parsing. + +This is an improvement to LALR(1) that is unique to Lark. + +# CYK Parser + +A [CYK parser](https://www.wikiwand.com/en/CYK_algorithm) can parse any context-free grammar at O(n^3*|G|). + +It's too slow to be practical for simple grammars, but it offers good performance for highly ambiguous grammars. diff -Nru lark-parser-0.6.2/docs/philosophy.md lark-parser-0.7.0/docs/philosophy.md --- lark-parser-0.6.2/docs/philosophy.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/philosophy.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,63 @@ +# Philosophy + +Parsers are innately complicated and confusing. They're difficult to understand, difficult to write, and difficult to use. Even experts on the subject can become baffled by the nuances of these complicated state-machines. + +Lark's mission is to make the process of writing them as simple and abstract as possible, by following these design principles: + +### Design Principles + +1. Readability matters + +2. Keep the grammar clean and simple + +3. Don't force the user to decide on things that the parser can figure out on its own + +4.
Usability is more important than performance + +5. Performance is still very important + +6. Follow the Zen Of Python, whenever possible and applicable + + +In accordance with these principles, I arrived at the following design choices: + +----------- + +# Design Choices + +### 1. Separation of code and grammar + +Grammars are the de-facto reference for your language, and for the structure of your parse-tree. For any non-trivial language, the conflation of code and grammar always turns out convoluted and difficult to read. + +The grammars in Lark are EBNF-inspired, so they are especially easy to read & work with. + +### 2. Always build a parse-tree (unless told not to) + +Trees are always simpler to work with than state-machines. + +1. Trees allow you to see the "state-machine" visually + +2. Trees allow your computation to be aware of previous and future states + +3. Trees allow you to process the parse in steps, instead of forcing you to do it all at once. + +And anyway, every parse-tree can be replayed as a state-machine, so there is no loss of information. + +See this answer in more detail [here](https://github.com/erezsh/lark/issues/4). + +To improve performance, you can skip building the tree for LALR(1), by providing Lark with a transformer (see the [JSON example](https://github.com/erezsh/lark/blob/master/examples/json_parser.py)). + +### 3. Earley is the default + +The Earley algorithm can accept *any* context-free grammar you throw at it (i.e. any grammar you can write in EBNF, it can parse). That makes it extremely friendly to beginners, who are not aware of the strange and arbitrary restrictions that LALR(1) places on its grammars. + +As the users grow to understand the structure of their grammar, the scope of their target language, and their performance requirements, they may choose to switch over to LALR(1) to gain a huge performance boost, possibly at the cost of some language features. + +In short, "Premature optimization is the root of all evil." + +### Other design features + +- Automatically resolve terminal collisions whenever possible + +- Automatically keep track of line & column numbers + diff -Nru lark-parser-0.6.2/docs/recipes.md lark-parser-0.7.0/docs/recipes.md --- lark-parser-0.6.2/docs/recipes.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/recipes.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,76 @@ +# Recipes + +A collection of recipes to use Lark and its various features + + + +## lexer_callbacks + +Use it to interface with the lexer as it generates tokens. + +Accepts a dictionary of the form + + {TOKEN_TYPE: callback} + +Where callback is of type `f(Token) -> Token` + +It only works with the standard and contextual lexers. + +### Example 1: Replace string values with ints for INT tokens + +```python +from lark import Lark, Token + +def tok_to_int(tok): + "Convert the value of `tok` from string to int, while maintaining line number & column." 
+ # tok.type == 'INT' + return Token.new_borrow_pos(tok.type, int(tok), tok) + +parser = Lark(""" +start: INT* +%import common.INT +%ignore " " +""", parser="lalr", lexer_callbacks = {'INT': tok_to_int}) + +print(parser.parse('3 14 159')) +``` + +Prints out: + +```python +Tree(start, [Token(INT, 3), Token(INT, 14), Token(INT, 159)]) +``` + + +### Example 2: Collect all comments +```python +from lark import Lark + +comments = [] + +parser = Lark(""" + start: INT* + + COMMENT: /#.*/ + + %import common (INT, WS) + %ignore COMMENT + %ignore WS +""", parser="lalr", lexer_callbacks={'COMMENT': comments.append}) + +parser.parse(""" +1 2 3 # hello +# world +4 5 6 +""") + +print(comments) +``` + +Prints out: + +```python +[Token(COMMENT, '# hello'), Token(COMMENT, '# world')] +``` + +*Note: We don't have to return a token, because comments are ignored* diff -Nru lark-parser-0.6.2/docs/tree_construction.md lark-parser-0.7.0/docs/tree_construction.md --- lark-parser-0.6.2/docs/tree_construction.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/docs/tree_construction.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,129 @@ +# Automatic Tree Construction - Reference + + +Lark builds a tree automatically based on the structure of the grammar, where each rule that is matched becomes a branch (node) in the tree, and its children are its matches, in the order of matching. + +For example, the rule `node: child1 child2` will create a tree node with two children. If it is matched as part of another rule (i.e. if it isn't the root), the new rule's tree node will become its parent. + +Using `item+` or `item*` will result in a list of items, equivalent to writing `item item item ..`. + +### Terminals + +Terminals are always values in the tree, never branches. + +Lark filters out certain types of terminals by default, considering them punctuation: + +- Terminals that won't appear in the tree are: + + - Unnamed literals (like `"keyword"` or `"+"`) + - Terminals whose name starts with an underscore (like `_DIGIT`) + +- Terminals that *will* appear in the tree are: + + - Unnamed regular expressions (like `/[0-9]/`) + - Named terminals whose name starts with a letter (like `DIGIT`) + +Rules prefixed with `!` will retain all their literals regardless. + + + + +**Example:** + +```perl + expr: "(" expr ")" + | NAME+ + + NAME: /\w+/ + + %ignore " " +``` + +Lark will parse "((hello world))" as: + + expr + expr + expr + "hello" + "world" + +The brackets do not appear in the tree by design. The words appear because they are matched by a named terminal. + + +# Shaping the tree + +Users can alter the automatic construction of the tree using a collection of grammar features. + + +* Rules whose name begins with an underscore will be inlined into their containing rule. + +**Example:** + +```perl + start: "(" _greet ")" + _greet: /\w+/ /\w+/ +``` + +Lark will parse "(hello world)" as: + + start + "hello" + "world" + + +* Rules that receive a question mark (?) at the beginning of their definition, will be inlined if they have a single child, after filtering. + +**Example:** + +```ruby + start: greet greet + ?greet: "(" /\w+/ ")" + | /\w+/ /\w+/ +``` + +Lark will parse "hello world (planet)" as: + + start + greet + "hello" + "world" + "planet" + +* Rules that begin with an exclamation mark will keep all their terminals (they won't get filtered). 
+ +```perl + !expr: "(" expr ")" + | NAME+ + NAME: /\w+/ + %ignore " " +``` + +Will parse "((hello world))" as: + + expr + ( + expr + ( + expr + hello + world + ) + ) + +Using the `!` prefix is usually a "code smell", and may point to a flaw in your grammar design. + +* Aliases - options in a rule can receive an alias. It will be then used as the branch name for the option, instead of the rule name. + +**Example:** + +```ruby + start: greet greet + greet: "hello" + | "world" -> planet +``` + +Lark will parse "hello world" as: + + start + greet + planet diff -Nru lark-parser-0.6.2/examples/custom_lexer.py lark-parser-0.7.0/examples/custom_lexer.py --- lark-parser-0.6.2/examples/custom_lexer.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/examples/custom_lexer.py 2019-03-28 13:53:28.000000000 +0000 @@ -49,7 +49,7 @@ res = ParseToDict().transform(tree) print('-->') - print(res) # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]} + print(res) # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]} if __name__ == '__main__': diff -Nru lark-parser-0.6.2/examples/lark_grammar.py lark-parser-0.7.0/examples/lark_grammar.py --- lark-parser-0.6.2/examples/lark_grammar.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/examples/lark_grammar.py 2019-03-28 13:53:28.000000000 +0000 @@ -6,6 +6,9 @@ 'examples/python2.lark', 'examples/python3.lark', 'examples/lark.lark', + 'examples/relative-imports/multiples.lark', + 'examples/relative-imports/multiple2.lark', + 'examples/relative-imports/multiple3.lark', 'lark/grammars/common.lark', ] diff -Nru lark-parser-0.6.2/examples/lark.lark lark-parser-0.7.0/examples/lark.lark --- lark-parser-0.6.2/examples/lark.lark 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/examples/lark.lark 2019-03-28 13:53:28.000000000 +0000 @@ -10,10 +10,10 @@ priority: "." NUMBER statement: "%ignore" expansions _NL -> ignore - | "%import" import_args ["->" TOKEN] _NL -> import + | "%import" import_args ["->" name] _NL -> import | "%declare" name+ -> declare -import_args: name ("." name)* +import_args: "."? name ("." 
name)* ?expansions: alias (_VBAR alias)* diff -Nru lark-parser-0.6.2/examples/python2.lark lark-parser-0.7.0/examples/python2.lark --- lark-parser-0.6.2/examples/python2.lark 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/examples/python2.lark 2019-03-28 13:53:28.000000000 +0000 @@ -162,7 +162,7 @@ %ignore /[\t \f]+/ // WS -%ignore /\\[\t \f]*\r?\n/ // LINE_CONT +%ignore /\\[\t \f]*\r?\n/ // LINE_CONT %ignore COMMENT %declare _INDENT _DEDENT diff -Nru lark-parser-0.6.2/examples/python3.lark lark-parser-0.7.0/examples/python3.lark --- lark-parser-0.6.2/examples/python3.lark 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/examples/python3.lark 2019-03-28 13:53:28.000000000 +0000 @@ -161,7 +161,7 @@ yield_arg: "from" test | testlist -number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER +number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER string: STRING | LONG_STRING // Tokens diff -Nru lark-parser-0.6.2/examples/qscintilla_json.py lark-parser-0.7.0/examples/qscintilla_json.py --- lark-parser-0.6.2/examples/qscintilla_json.py 2018-06-26 13:29:17.000000000 +0000 +++ lark-parser-0.7.0/examples/qscintilla_json.py 2019-03-28 13:53:28.000000000 +0000 @@ -48,12 +48,12 @@ self.setFont(self.parent().font(), style) self.token_styles = { - "__COLON": 5, - "__COMMA": 5, - "__LBRACE": 5, - "__LSQB": 5, - "__RBRACE": 5, - "__RSQB": 5, + "COLON": 5, + "COMMA": 5, + "LBRACE": 5, + "LSQB": 5, + "RBRACE": 5, + "RSQB": 5, "FALSE": 0, "NULL": 0, "TRUE": 0, diff -Nru lark-parser-0.6.2/examples/README.md lark-parser-0.7.0/examples/README.md --- lark-parser-0.6.2/examples/README.md 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/README.md 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,33 @@ +# Examples for Lark + +#### How to run the examples + +After cloning the repo, open the terminal into the root directory of the project, and run the following: + +```bash +[lark]$ python -m examples. +``` + +For example, the following will parse all the Python files in the standard library of your local installation: + +```bash +[lark]$ python -m examples.python_parser +``` + +### Beginners + +- [calc.py](calc.py) - A simple example of a REPL calculator +- [json\_parser.py](json_parser.py) - A simple JSON parser (comes with a tutorial, see docs) +- [indented\_tree.py](indented\_tree.py) - A demonstration of parsing indentation ("whitespace significant" language) +- [fruitflies.py](fruitflies.py) - A demonstration of ambiguity +- [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter. +- [lark\_grammar.py](lark_grammar.py) + [lark.lark](lark.lark) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer) + +### Advanced + +- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser +- [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!) 
+- [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language +- [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language +- [custom\_lexer.py](custom_lexer.py) - Demonstrates using a custom lexer to parse a non-textual stream of data +- [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature diff -Nru lark-parser-0.6.2/examples/relative-imports/multiple2.lark lark-parser-0.7.0/examples/relative-imports/multiple2.lark --- lark-parser-0.6.2/examples/relative-imports/multiple2.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/relative-imports/multiple2.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1 @@ +start: ("0" | "1")* "0" diff -Nru lark-parser-0.6.2/examples/relative-imports/multiple3.lark lark-parser-0.7.0/examples/relative-imports/multiple3.lark --- lark-parser-0.6.2/examples/relative-imports/multiple3.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/relative-imports/multiple3.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,5 @@ +start: mod0mod0+ + +mod0mod0: "0" | "1" mod1mod0 +mod1mod0: "1" | "0" mod2mod1 mod1mod0 +mod2mod1: "0" | "1" mod2mod1 diff -Nru lark-parser-0.6.2/examples/relative-imports/multiples.lark lark-parser-0.7.0/examples/relative-imports/multiples.lark --- lark-parser-0.6.2/examples/relative-imports/multiples.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/relative-imports/multiples.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,5 @@ +start: "2:" multiple2 + | "3:" multiple3 + +%import .multiple2.start -> multiple2 +%import .multiple3.start -> multiple3 diff -Nru lark-parser-0.6.2/examples/relative-imports/multiples.py lark-parser-0.7.0/examples/relative-imports/multiples.py --- lark-parser-0.6.2/examples/relative-imports/multiples.py 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/relative-imports/multiples.py 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,28 @@ +# +# This example demonstrates relative imports with rule rewrite +# see multiples.lark +# + +# +# if b is a number written in binary, and m is either 2 or 3, +# the grammar aims to recognise m:b iif b is a multiple of m +# +# for example, 3:1001 is recognised +# because 9 (0b1001) is a multiple of 3 +# + +from lark import Lark, UnexpectedInput + +parser = Lark.open('multiples.lark', parser='lalr') + +def is_in_grammar(data): + try: + parser.parse(data) + except UnexpectedInput: + return False + return True + +for n_dec in range(100): + n_bin = bin(n_dec)[2:] + assert is_in_grammar('2:{}'.format(n_bin)) == (n_dec % 2 == 0) + assert is_in_grammar('3:{}'.format(n_bin)) == (n_dec % 3 == 0) diff -Nru lark-parser-0.6.2/examples/standalone/create_standalone.sh lark-parser-0.7.0/examples/standalone/create_standalone.sh --- lark-parser-0.6.2/examples/standalone/create_standalone.sh 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/standalone/create_standalone.sh 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1 @@ +python -m lark.tools.standalone json.lark > json_parser.py diff -Nru lark-parser-0.6.2/examples/standalone/json.lark lark-parser-0.7.0/examples/standalone/json.lark --- lark-parser-0.6.2/examples/standalone/json.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/standalone/json.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,21 @@ +?start: value + +?value: object + | array + | string + | SIGNED_NUMBER -> 
number + | "true" -> true + | "false" -> false + | "null" -> null + +array : "[" [value ("," value)*] "]" +object : "{" [pair ("," pair)*] "}" +pair : string ":" value + +string : ESCAPED_STRING + +%import common.ESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS + +%ignore WS diff -Nru lark-parser-0.6.2/examples/standalone/json_parser_main.py lark-parser-0.7.0/examples/standalone/json_parser_main.py --- lark-parser-0.6.2/examples/standalone/json_parser_main.py 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/standalone/json_parser_main.py 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,25 @@ +import sys + +from json_parser import Lark_StandAlone, Transformer, inline_args + +class TreeToJson(Transformer): + @inline_args + def string(self, s): + return s[1:-1].replace('\\"', '"') + + array = list + pair = tuple + object = dict + number = inline_args(float) + + null = lambda self, _: None + true = lambda self, _: True + false = lambda self, _: False + + +parser = Lark_StandAlone(transformer=TreeToJson()) + +if __name__ == '__main__': + with open(sys.argv[1]) as f: + print(parser.parse(f.read())) + diff -Nru lark-parser-0.6.2/examples/standalone/json_parser.py lark-parser-0.7.0/examples/standalone/json_parser.py --- lark-parser-0.6.2/examples/standalone/json_parser.py 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/examples/standalone/json_parser.py 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,779 @@ +# The file was automatically generated by Lark v0.5.5 +# +# +# Lark Stand-alone Generator Tool +# ---------------------------------- +# Generates a stand-alone LALR(1) parser with a standard lexer +# +# Git: https://github.com/erezsh/lark +# Author: Erez Shinan (erezshin@gmail.com) +# +# +# >>> LICENSE +# +# This tool and its generated code use a separate license from Lark. +# +# It is licensed under GPLv2 or above. +# +# If you wish to purchase a commercial license for this tool and its +# generated code, contact me via email. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# See . +# +# + + +import types +import functools +from contextlib import contextmanager + +Str = type(u'') + +def inline_args(f): + # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) + if isinstance(f, types.FunctionType): + @functools.wraps(f) + def _f_func(self, args): + return f(self, *args) + return _f_func + elif isinstance(f, (type, types.BuiltinFunctionType)): + @functools.wraps(f) + def _f_builtin(_self, args): + return f(*args) + return _f_builtin + elif isinstance(f, types.MethodType): + @functools.wraps(f.__func__) + def _f(self, args): + return f.__func__(self, *args) + return _f + else: + @functools.wraps(f.__call__.__func__) + def _f(self, args): + return f.__call__.__func__(self, *args) + return _f + + +try: + from contextlib import suppress # Python 3 +except ImportError: + @contextmanager + def suppress(*excs): + '''Catch and dismiss the provided exception + + >>> x = 'hello' + >>> with suppress(IndexError): + ... 
x = x[10] + >>> x + 'hello' + ''' + try: + yield + except excs: + pass + + +def is_terminal(sym): + return sym.isupper() + +class GrammarError(Exception): + pass + +class ParseError(Exception): + pass + +class UnexpectedToken(ParseError): + def __init__(self, token, expected, seq, index, considered_rules=None): + self.token = token + self.expected = expected + self.line = getattr(token, 'line', '?') + self.column = getattr(token, 'column', '?') + self.considered_rules = considered_rules + + try: + context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) + except AttributeError: + context = seq[index:index+5] + except TypeError: + context = "" + message = ("Unexpected token %r at line %s, column %s.\n" + "Expected: %s\n" + "Context: %s" % (token, self.line, self.column, expected, context)) + + super(UnexpectedToken, self).__init__(message) + + + +class Tree(object): + def __init__(self, data, children): + self.data = data + self.children = children + + def __repr__(self): + return 'Tree(%s, %s)' % (self.data, self.children) + + def _pretty_label(self): + return self.data + + def _pretty(self, level, indent_str): + if len(self.children) == 1 and not isinstance(self.children[0], Tree): + return [ indent_str*level, self._pretty_label(), '\t', '%s' % (self.children[0],), '\n'] + + l = [ indent_str*level, self._pretty_label(), '\n' ] + for n in self.children: + if isinstance(n, Tree): + l += n._pretty(level+1, indent_str) + else: + l += [ indent_str*(level+1), '%s' % (n,), '\n' ] + + return l + + def pretty(self, indent_str=' '): + return ''.join(self._pretty(0, indent_str)) +class Transformer(object): + def _get_func(self, name): + return getattr(self, name) + + def transform(self, tree): + items = [] + for c in tree.children: + try: + items.append(self.transform(c) if isinstance(c, Tree) else c) + except Discard: + pass + try: + f = self._get_func(tree.data) + except AttributeError: + return self.__default__(tree.data, items) + else: + return f(items) + + def __default__(self, data, children): + return Tree(data, children) + + def __mul__(self, other): + return TransformerChain(self, other) + + +class Discard(Exception): + pass + +class TransformerChain(object): + def __init__(self, *transformers): + self.transformers = transformers + + def transform(self, tree): + for t in self.transformers: + tree = t.transform(tree) + return tree + + def __mul__(self, other): + return TransformerChain(*self.transformers + (other,)) + + + +class InlineTransformer(Transformer): + def _get_func(self, name): # use super()._get_func + return inline_args(getattr(self, name)).__get__(self) + + +class Visitor(object): + def visit(self, tree): + for child in tree.children: + if isinstance(child, Tree): + self.visit(child) + + f = getattr(self, tree.data, self.__default__) + f(tree) + return tree + + def __default__(self, tree): + pass + + +class Visitor_NoRecurse(Visitor): + def visit(self, tree): + subtrees = list(tree.iter_subtrees()) + + for subtree in (subtrees): + getattr(self, subtree.data, self.__default__)(subtree) + return tree + + +class Transformer_NoRecurse(Transformer): + def transform(self, tree): + subtrees = list(tree.iter_subtrees()) + + def _t(t): + # Assumes t is already transformed + try: + f = self._get_func(t.data) + except AttributeError: + return self.__default__(t) + else: + return f(t) + + for subtree in subtrees: + children = [] + for c in subtree.children: + try: + children.append(_t(c) if isinstance(c, Tree) else c) + except Discard: + pass + subtree.children = 
children + + return _t(tree) + + def __default__(self, t): + return t + +class Indenter: + def __init__(self): + self.paren_level = 0 + self.indent_level = [0] + + def handle_NL(self, token): + if self.paren_level > 0: + return + + yield token + + indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces + indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len + + if indent > self.indent_level[-1]: + self.indent_level.append(indent) + yield Token.new_borrow_pos(self.INDENT_type, indent_str, token) + else: + while indent < self.indent_level[-1]: + self.indent_level.pop() + yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) + + assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) + + def process(self, stream): + for token in stream: + if token.type == self.NL_type: + for t in self.handle_NL(token): + yield t + else: + yield token + + if token.type in self.OPEN_PAREN_types: + self.paren_level += 1 + elif token.type in self.CLOSE_PAREN_types: + self.paren_level -= 1 + assert self.paren_level >= 0 + + while len(self.indent_level) > 1: + self.indent_level.pop() + yield Token(self.DEDENT_type, '') + + assert self.indent_level == [0], self.indent_level + + # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? + @property + def always_accept(self): + return (self.NL_type,) + + +class LexError(Exception): + pass + +class UnexpectedInput(LexError): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_rules=None): + context = seq[lex_pos:lex_pos+5] + message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column) + if allowed: + message += '\n\nExpecting: %s\n' % allowed + + super(UnexpectedInput, self).__init__(message) + + self.line = line + self.column = column + self.context = context + self.allowed = allowed + self.considered_rules = considered_rules + +class Token(Str): + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): + inst = Str.__new__(cls, value) + inst.type = type_ + inst.pos_in_stream = pos_in_stream + inst.value = value + inst.line = line + inst.column = column + return inst + + @classmethod + def new_borrow_pos(cls, type_, value, borrow_t): + return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) + + def __repr__(self): + return 'Token(%s, %r)' % (self.type, self.value) + + def __deepcopy__(self, memo): + return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) + + def __eq__(self, other): + if isinstance(other, Token) and self.type != other.type: + return False + + return Str.__eq__(self, other) + + __hash__ = Str.__hash__ + + +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. 
+ """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + ignore_types = list(ignore_types) + line_ctr = LineCounter() + + t = None + while True: + lexer = self.lexer + for mre, type_from_index in lexer.mres: + m = mre.match(stream, line_ctr.char_pos) + if m: + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + yield t + else: + if type_ in lexer.callback: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + lexer.callback[type_](t) + + line_ctr.feed(value, type_ in newline_types) + if t: + t.end_line = line_ctr.line + t.end_column = line_ctr.column + break + else: + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + break + +class UnlessCallback: + def __init__(self, mres): + self.mres = mres + + def __call__(self, t): + for mre, type_from_index in self.mres: + m = mre.match(t.value) + if m: + value = m.group(0) + t.type = type_from_index[m.lastindex] + break + return t + + + +from functools import partial + +class ExpandSingleChild: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + if len(children) == 1: + return children[0] + else: + return self.node_builder(children) + + +class CreateToken: + "Used for fixing the results of scanless parsing" + + def __init__(self, token_name, node_builder): + self.node_builder = node_builder + self.token_name = token_name + + def __call__(self, children): + return self.node_builder( [Token(self.token_name, ''.join(children))] ) + + +class PropagatePositions: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + res = self.node_builder(children) + + if children: + for a in children: + with suppress(AttributeError): + res.line = a.line + res.column = a.column + break + + for a in reversed(children): + with suppress(AttributeError): + res.end_line = a.end_line + res.end_column = a.end_column + break + + return res + + +class ChildFilter: + def __init__(self, to_include, node_builder): + self.node_builder = node_builder + self.to_include = to_include + + def __call__(self, children): + filtered = [] + for i, to_expand in self.to_include: + if to_expand: + if filtered: + filtered += children[i].children + else: # Optimize for left-recursion + filtered = children[i].children + else: + filtered.append(children[i]) + + return self.node_builder(filtered) + +def _should_expand(sym): + return not is_terminal(sym) and sym.startswith('_') + +def maybe_create_child_filter(expansion, filter_out): + to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out] + + if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): + return partial(ChildFilter, to_include) + + +class Callback(object): + pass + +class ParseTreeBuilder: + def __init__(self, rules, tree_class, propagate_positions=False, 
keep_all_tokens=False): + self.tree_class = tree_class + self.propagate_positions = propagate_positions + self.always_keep_all_tokens = keep_all_tokens + + self.rule_builders = list(self._init_builders(rules)) + + self.user_aliases = {} + + def _init_builders(self, rules): + filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} + filter_out |= {sym for rule in rules for sym in rule.expansion if is_terminal(sym) and sym.startswith('_')} + assert all(x.startswith('_') for x in filter_out) + + for rule in rules: + options = rule.options + keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) + expand_single_child = options.expand1 if options else False + create_token = options.create_token if options else False + + wrapper_chain = filter(None, [ + create_token and partial(CreateToken, create_token), + (expand_single_child and not rule.alias) and ExpandSingleChild, + maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out), + self.propagate_positions and PropagatePositions, + ]) + + yield rule, wrapper_chain + + + def create_callback(self, transformer=None): + callback = Callback() + + for rule, wrapper_chain in self.rule_builders: + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + + user_callback_name = rule.alias or rule.origin + try: + f = transformer._get_func(user_callback_name) + except AttributeError: + f = partial(self.tree_class, user_callback_name) + + self.user_aliases[rule] = rule.alias + rule.alias = internal_callback_name + + for w in wrapper_chain: + f = w(f) + + if hasattr(callback, internal_callback_name): + raise GrammarError("Rule '%s' already exists" % (rule,)) + setattr(callback, internal_callback_name, f) + + return callback + + + +class _Parser: + def __init__(self, parse_table, callbacks): + self.states = parse_table.states + self.start_state = parse_table.start_state + self.end_state = parse_table.end_state + self.callbacks = callbacks + + def parse(self, seq, set_state=None): + i = 0 + token = None + stream = iter(seq) + states = self.states + + state_stack = [self.start_state] + value_stack = [] + + if set_state: set_state(self.start_state) + + def get_action(key): + state = state_stack[-1] + try: + return states[state][key] + except KeyError: + expected = states[state].keys() + + raise UnexpectedToken(token, expected, seq, i) + + def reduce(rule): + size = len(rule.expansion) + if size: + s = value_stack[-size:] + del state_stack[-size:] + del value_stack[-size:] + else: + s = [] + + value = self.callbacks[rule](s) + + _action, new_state = get_action(rule.origin) + assert _action is Shift + state_stack.append(new_state) + value_stack.append(value) + + # Main LALR-parser loop + for i, token in enumerate(stream): + while True: + action, arg = get_action(token.type) + assert arg != self.end_state + + if action is Shift: + state_stack.append(arg) + value_stack.append(token) + if set_state: set_state(arg) + break # next token + else: + reduce(arg) + + while True: + _action, arg = get_action('$END') + if _action is Shift: + assert arg == self.end_state + val ,= value_stack + return val + else: + reduce(arg) + + + +class Rule(object): + """ + origin : a symbol + expansion : a list of symbols + """ + def __init__(self, origin, expansion, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.options = options + + def __str__(self): + return '<%s : %s>' % (self.origin, ' 
'.join(map(str,self.expansion))) + + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + +class RuleOptions: + def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.create_token = create_token # used for scanless postprocessing + self.priority = priority + + self.filter_out = filter_out # remove this rule from the tree + # used for "token"-rules in scanless + + def __repr__(self): + return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.create_token, + self.priority, + self.filter_out + ) + +Shift = 0 +Reduce = 1 +import re +MRES = ( +[(u'(?P<SIGNED_NUMBER>(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P<ESCAPED_STRING>\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P<WS>(?:[ \t\x0c\r\n])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])', + {1: u'SIGNED_NUMBER', + 2: u'ESCAPED_STRING', + 3: u'WS', + 4: u'__FALSE1', + 5: u'__NULL2', + 6: u'__TRUE0', + 7: u'__COLON', + 8: u'__COMMA', + 9: u'__LBRACE', + 10: u'__LSQB', + 11: u'__RBRACE', + 12: u'__RSQB'})] +) +LEXER_CALLBACK = ( +{} +) +NEWLINE_TYPES = [u'WS'] +IGNORE_TYPES = [u'WS'] +class LexerRegexps: pass +lexer_regexps = LexerRegexps() +lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES] +lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres]) + for n, mres in LEXER_CALLBACK.items()} +lexer = _Lex(lexer_regexps) +def lex(stream): + return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES) +RULES = { + 0: Rule(u'start', [u'value'], None, RuleOptions(False, True, None, None, False)), + 1: Rule(u'value', [u'string'], None, RuleOptions(False, True, None, None, False)), + 2: Rule(u'value', [u'__TRUE0'], u'true', RuleOptions(False, True, None, None, False)), + 3: Rule(u'value', [u'array'], None, RuleOptions(False, True, None, None, False)), + 4: Rule(u'value', [u'__NULL2'], u'null', RuleOptions(False, True, None, None, False)), + 5: Rule(u'value', [u'SIGNED_NUMBER'], u'number', RuleOptions(False, True, None, None, False)), + 6: Rule(u'value', [u'object'], None, RuleOptions(False, True, None, None, False)), + 7: Rule(u'value', [u'__FALSE1'], u'false', RuleOptions(False, True, None, None, False)), + 8: Rule(u'array', ['__LSQB', u'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 9: Rule(u'array', ['__LSQB', u'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 10: Rule(u'array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 11: Rule(u'object', ['__LBRACE', u'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 12: Rule(u'object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 13: Rule(u'object', ['__LBRACE', u'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 14: Rule(u'pair', [u'string', '__COLON', u'value'], None, RuleOptions(False, False, None, None, False)), + 15: Rule(u'string', [u'ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)), + 16: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', u'value'], None, None), + 17: Rule('__anon_star_0', ['__COMMA', u'value'], None, None), + 18: Rule('__anon_star_1', 
['__COMMA', u'pair'], None, None), + 19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', u'pair'], None, None), +} +parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree) +class ParseTable: pass +parse_table = ParseTable() +STATES = { + 0: {0: (1, 4), 1: (1, 4), 2: (1, 4), 3: (1, 4)}, + 1: {1: (1, 14), 2: (1, 14)}, + 2: {0: (0, 29), 1: (0, 32), 4: (0, 9)}, + 3: {1: (0, 13), 2: (0, 12)}, + 4: {0: (1, 1), 1: (1, 1), 2: (1, 1), 3: (1, 1)}, + 5: {0: (1, 10), 1: (1, 10), 2: (1, 10), 3: (1, 10)}, + 6: {2: (0, 15), 5: (0, 27), 6: (0, 16), 7: (0, 26)}, + 7: {5: (0, 34), 6: (0, 16), 7: (0, 26)}, + 8: {0: (1, 2), 1: (1, 2), 2: (1, 2), 3: (1, 2)}, + 9: {0: (0, 11), 1: (0, 22)}, + 10: {0: (1, 6), 1: (1, 6), 2: (1, 6), 3: (1, 6)}, + 11: {0: (1, 9), 1: (1, 9), 2: (1, 9), 3: (1, 9)}, + 12: {0: (1, 11), 1: (1, 11), 2: (1, 11), 3: (1, 11)}, + 13: {5: (0, 20), 6: (0, 16), 7: (0, 26)}, + 14: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 17), 14: (0, 33), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, + 15: {0: (1, 12), 1: (1, 12), 2: (1, 12), 3: (1, 12)}, + 16: {0: (1, 15), 1: (1, 15), 2: (1, 15), 3: (1, 15), 18: (1, 15)}, + 17: {3: (1, 0)}, + 18: {}, + 19: {0: (1, 3), 1: (1, 3), 2: (1, 3), 3: (1, 3)}, + 20: {1: (1, 19), 2: (1, 19)}, + 21: {0: (1, 5), 1: (1, 5), 2: (1, 5), 3: (1, 5)}, + 22: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 30), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, + 23: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 1), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, + 24: {0: (0, 5), 6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 2), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, + 25: {0: (1, 13), 1: (1, 13), 2: (1, 13), 3: (1, 13)}, + 26: {18: (0, 23)}, + 27: {1: (0, 7), 2: (0, 25), 19: (0, 3)}, + 28: {0: (1, 17), 1: (1, 17)}, + 29: {0: (1, 8), 1: (1, 8), 2: (1, 8), 3: (1, 8)}, + 30: {0: (1, 16), 1: (1, 16)}, + 31: {0: (1, 7), 1: (1, 7), 2: (1, 7), 3: (1, 7)}, + 32: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 28), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, + 33: {3: (0, 18)}, + 34: {1: (1, 18), 2: (1, 18)}, +} +TOKEN_TYPES = ( +{0: '__RSQB', + 1: '__COMMA', + 2: '__RBRACE', + 3: '$END', + 4: '__anon_star_0', + 5: u'pair', + 6: u'ESCAPED_STRING', + 7: u'string', + 8: '__LBRACE', + 9: u'__FALSE1', + 10: '__LSQB', + 11: u'object', + 12: u'SIGNED_NUMBER', + 13: u'value', + 14: 'start', + 15: u'__NULL2', + 16: u'array', + 17: u'__TRUE0', + 18: '__COLON', + 19: '__anon_star_1'} +) +parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()} + for s, acts in STATES.items()} +parse_table.start_state = 14 +parse_table.end_state = 18 +class Lark_StandAlone: + def __init__(self, transformer=None, postlex=None): + callback = parse_tree_builder.create_callback(transformer=transformer) + callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()} + self.parser = _Parser(parse_table, callbacks) + self.postlex = postlex + def parse(self, stream): + tokens = lex(stream) + if self.postlex: tokens = self.postlex.process(tokens) + return self.parser.parse(tokens) diff -Nru lark-parser-0.6.2/.gitignore lark-parser-0.7.0/.gitignore --- lark-parser-0.6.2/.gitignore 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/.gitignore 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,10 @@ +*.pyc +*.pyo +/.tox 
+/lark_parser.egg-info/** +tags +.vscode +.ropeproject +.cache +/dist +/build diff -Nru lark-parser-0.6.2/.gitmodules lark-parser-0.7.0/.gitmodules --- lark-parser-0.6.2/.gitmodules 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/.gitmodules 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,3 @@ +[submodule "tests/test_nearley/nearley"] + path = tests/test_nearley/nearley + url = https://github.com/Hardmath123/nearley diff -Nru lark-parser-0.6.2/lark/common.py lark-parser-0.7.0/lark/common.py --- lark-parser-0.6.2/lark/common.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/common.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,14 +1,3 @@ -import re -import sys - -from .utils import get_regexp_width, STRING_TYPE - -Py36 = (sys.version_info[:2] >= (3, 6)) - - -###{standalone -###} - class LexerConf: @@ -25,61 +14,3 @@ self.start = start - -class Pattern(object): - def __init__(self, value, flags=()): - self.value = value - self.flags = frozenset(flags) - - def __repr__(self): - return repr(self.to_regexp()) - - # Pattern Hashing assumes all subclasses have a different priority! - def __hash__(self): - return hash((type(self), self.value, self.flags)) - def __eq__(self, other): - return type(self) == type(other) and self.value == other.value and self.flags == other.flags - - if Py36: - # Python 3.6 changed syntax for flags in regular expression - def _get_flags(self, value): - for f in self.flags: - value = ('(?%s:%s)' % (f, value)) - return value - - else: - def _get_flags(self, value): - for f in self.flags: - value = ('(?%s)' % f) + value - return value - -class PatternStr(Pattern): - def to_regexp(self): - return self._get_flags(re.escape(self.value)) - - @property - def min_width(self): - return len(self.value) - max_width = min_width - -class PatternRE(Pattern): - def to_regexp(self): - return self._get_flags(self.value) - - @property - def min_width(self): - return get_regexp_width(self.to_regexp())[0] - @property - def max_width(self): - return get_regexp_width(self.to_regexp())[1] - -class TokenDef(object): - def __init__(self, name, pattern, priority=1): - assert isinstance(pattern, Pattern), pattern - self.name = name - self.pattern = pattern - self.priority = priority - - def __repr__(self): - return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - diff -Nru lark-parser-0.6.2/lark/exceptions.py lark-parser-0.7.0/lark/exceptions.py --- lark-parser-0.6.2/lark/exceptions.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/lark/exceptions.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,5 +1,6 @@ from .utils import STRING_TYPE +###{standalone class LarkError(Exception): pass @@ -13,6 +14,8 @@ pass class UnexpectedInput(LarkError): + pos_in_stream = None + def get_context(self, text, span=40): pos = self.pos_in_stream start = max(pos - span, 0) @@ -75,12 +78,19 @@ self.column = getattr(token, 'column', '?') self.considered_rules = considered_rules self.state = state - self.pos_in_stream = token.pos_in_stream + self.pos_in_stream = getattr(token, 'pos_in_stream', None) message = ("Unexpected token %r at line %s, column %s.\n" - "Expected: %s\n" - % (token, self.line, self.column, ', '.join(self.expected))) + "Expected one of: \n\t* %s\n" + % (token, self.line, self.column, '\n\t* '.join(self.expected))) super(UnexpectedToken, self).__init__(message) - +class VisitError(LarkError): + def __init__(self, tree, orig_exc): + self.tree = tree + self.orig_exc = orig_exc + + message = 'Error trying to process rule "%s":\n\n%s' % (tree.data, orig_exc) + 
super(VisitError, self).__init__(message) +###} diff -Nru lark-parser-0.6.2/lark/grammar.py lark-parser-0.7.0/lark/grammar.py --- lark-parser-0.6.2/lark/grammar.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/grammar.py 2019-03-28 13:53:28.000000000 +0000 @@ -17,6 +17,8 @@ def __repr__(self): return '%s(%r)' % (type(self).__name__, self.name) + fullrepr = property(__repr__) + class Terminal(Symbol): is_term = True @@ -24,6 +26,10 @@ self.name = name self.filter_out = filter_out + @property + def fullrepr(self): + return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out) + class NonTerminal(Symbol): is_term = False @@ -32,25 +38,39 @@ """ origin : a symbol expansion : a list of symbols + order : index of this expansion amongst all rules of the same name """ - def __init__(self, origin, expansion, alias=None, options=None): + __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') + def __init__(self, origin, expansion, order=0, alias=None, options=None): self.origin = origin self.expansion = expansion self.alias = alias + self.order = order self.options = options + self._hash = hash((self.origin, tuple(self.expansion))) + def __str__(self): - return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion)) def __repr__(self): return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + def __hash__(self): + return self._hash + + def __eq__(self, other): + if not isinstance(other, Rule): + return False + return self.origin == other.origin and self.expansion == other.expansion + class RuleOptions: def __init__(self, keep_all_tokens=False, expand1=False, priority=None): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 self.priority = priority + self.empty_indices = () def __repr__(self): return 'RuleOptions(%r, %r, %r)' % ( diff -Nru lark-parser-0.6.2/lark/grammars/common.lark lark-parser-0.7.0/lark/grammars/common.lark --- lark-parser-0.6.2/lark/grammars/common.lark 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/grammars/common.lark 2019-03-28 13:53:28.000000000 +0000 @@ -20,9 +20,10 @@ // // Strings // -//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/ -STRING_INNER: ("\\\""|/[^"]/) -ESCAPED_STRING: "\"" STRING_INNER* "\"" +_STRING_INNER: /.*?/ +_STRING_ESC_INNER: _STRING_INNER /(? 0 def handle_NL(self, token): if self.paren_level > 0: @@ -27,7 +28,7 @@ assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) - def process(self, stream): + def _process(self, stream): for token in stream: if token.type == self.NL_type: for t in self.handle_NL(token): @@ -47,6 +48,11 @@ assert self.indent_level == [0], self.indent_level + def process(self, stream): + self.paren_level = 0 + self.indent_level = [0] + return self._process(stream) + # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? 
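The `process`/`_process` split above resets `paren_level` and `indent_level` on every call, so one post-lexer instance can be reused across parses. For orientation, a minimal sketch of the duck-typed interface the `postlex` option relies on (a `process` generator plus `always_accept`); the `UpcaseNames` class and the tiny grammar are illustrative only, not part of Lark:

from lark import Lark, Token

class UpcaseNames:
    # Duck-typed post-lexer: Lark calls process() on the token stream, and the
    # contextual lexer consults always_accept for extra terminal names.
    always_accept = ()

    def process(self, stream):
        for tok in stream:
            if tok.type == 'NAME':
                # new_borrow_pos keeps the original token's position info
                yield Token.new_borrow_pos(tok.type, tok.value.upper(), tok)
            else:
                yield tok

parser = Lark(r"""
    start: NAME+
    NAME: /[a-z]+/
    %ignore " "
""", parser='lalr', postlex=UpcaseNames())

print(parser.parse("hello world"))
# Should print something like: Tree(start, [Token(NAME, 'HELLO'), Token(NAME, 'WORLD')])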
@property def always_accept(self): diff -Nru lark-parser-0.6.2/lark/__init__.py lark-parser-0.7.0/lark/__init__.py --- lark-parser-0.6.2/lark/__init__.py 2018-07-18 13:03:24.000000000 +0000 +++ lark-parser-0.7.0/lark/__init__.py 2019-03-28 13:53:28.000000000 +0000 @@ -2,6 +2,7 @@ from .visitors import Transformer, Visitor, v_args, Discard from .visitors import InlineTransformer, inline_args # XXX Deprecated from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters +from .lexer import Token from .lark import Lark -__version__ = "0.6.2" +__version__ = "0.6.6" diff -Nru lark-parser-0.6.2/lark/lark.py lark-parser-0.7.0/lark/lark.py --- lark-parser-0.6.2/lark/lark.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/lark/lark.py 2019-03-28 13:53:28.000000000 +0000 @@ -40,13 +40,17 @@ debug - Affects verbosity (default: False) keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) cache_grammar - Cache the Lark grammar (Default: False) - postlex - Lexer post-processing (Requires standard lexer. Default: None) + postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. start - The start symbol (Default: start) profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) + priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None """ - __doc__ += OPTIONS_DOC + if __doc__: + __doc__ += OPTIONS_DOC + def __init__(self, options_dict): o = dict(options_dict) @@ -60,16 +64,17 @@ self.transformer = o.pop('transformer', None) self.start = o.pop('start', 'start') self.profile = o.pop('profile', False) + self.priority = o.pop('priority', 'auto') self.ambiguity = o.pop('ambiguity', 'auto') self.propagate_positions = o.pop('propagate_positions', False) - self.earley__predict_all = o.pop('earley__predict_all', False) self.lexer_callbacks = o.pop('lexer_callbacks', {}) + self.maybe_placeholders = o.pop('maybe_placeholders', False) assert self.parser in ('earley', 'lalr', 'cyk', None) if self.parser == 'earley' and self.transformer: raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.' - 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)') + 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. 
LALR)') if o: raise ValueError("Unknown options: %s" % o.keys()) @@ -112,9 +117,6 @@ self.source = grammar.name except AttributeError: self.source = '' - cache_file = "larkcache_%s" % str(hash(grammar)%(2**32)) - else: - cache_file = "larkcache_%s" % os.path.basename(self.source) # Drain file-like objects to get their contents try: @@ -151,15 +153,39 @@ disambig_parsers = ['earley', 'cyk'] assert self.options.parser in disambig_parsers, ( 'Only %s supports disambiguation right now') % ', '.join(disambig_parsers) - assert self.options.ambiguity in ('resolve', 'explicit', 'auto', 'resolve__antiscore_sum') + + if self.options.priority == 'auto': + if self.options.parser in ('earley', 'cyk', ): + self.options.priority = 'normal' + elif self.options.parser in ('lalr', ): + self.options.priority = None + elif self.options.priority in ('invert', 'normal'): + assert self.options.parser in ('earley', 'cyk'), "priorities are not supported for LALR at this time" + + assert self.options.priority in ('auto', None, 'normal', 'invert'), 'invalid priority option specified: {}. options are auto, none, normal, invert.'.format(self.options.priority) + assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"' + assert self.options.ambiguity in ('resolve', 'explicit', 'auto', ) # Parse the grammar file and compose the grammars (TODO) self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF - tokens, self.rules, self.ignore_tokens = self.grammar.compile() + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile() - self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) + # If the user asked to invert the priorities, negate them all here. + # This replaces the old 'resolve__antiscore_sum' option. + if self.options.priority == 'invert': + for rule in self.rules: + if rule.options and rule.options.priority is not None: + rule.options.priority = -rule.options.priority + # Else, if the user asked to disable priorities, strip them from the + # rules. This allows the Earley parsers to skip an extra forest walk + # for improved performance, if you don't need them (or didn't specify any). 
+ elif self.options.priority == None: + for rule in self.rules: + if rule.options and rule.options.priority is not None: + rule.options.priority = None + self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) if self.options.parser: self.parser = self._build_parser() @@ -168,7 +194,8 @@ if self.profiler: self.profiler.enter_section('outside_lark') - __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC + if __init__.__doc__: + __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC def _build_lexer(self): return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) @@ -176,7 +203,7 @@ def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr') + self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders) callback = self._parse_tree_builder.create_callback(self.options.transformer) if self.profiler: for f in dir(callback): diff -Nru lark-parser-0.6.2/lark/lexer.py lark-parser-0.7.0/lark/lexer.py --- lark-parser-0.6.2/lark/lexer.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/lark/lexer.py 2019-03-28 13:53:28.000000000 +0000 @@ -2,28 +2,94 @@ import re -from .utils import Str, classify -from .common import PatternStr, PatternRE, TokenDef -from .exceptions import UnexpectedCharacters +from .utils import Str, classify, get_regexp_width, Py36 +from .exceptions import UnexpectedCharacters, LexError + +class Pattern(object): + def __init__(self, value, flags=()): + self.value = value + self.flags = frozenset(flags) + + def __repr__(self): + return repr(self.to_regexp()) + + # Pattern Hashing assumes all subclasses have a different priority! 
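A small, self-contained illustration of the `Pattern` classes that now live in `lexer.py`: `PatternStr` re-escapes a literal, `PatternRE` passes its regexp through, and `_get_flags` wraps either form with the declared flags (scoped `(?i:...)` groups on Python 3.6+, a `(?i)` prefix on older versions). The concrete values below are just examples.

import re
from lark.lexer import PatternStr, PatternRE

keyword = PatternStr('if', flags=('i',))
number = PatternRE(r'[0-9]+')

print(keyword.to_regexp())                   # '(?i:if)' on Python 3.6+, '(?i)if' before
print(number.to_regexp())                    # '[0-9]+'
print(keyword.min_width, keyword.max_width)  # 2 2 -- the literal's length
assert re.match(keyword.to_regexp(), 'IF')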
+ def __hash__(self): + return hash((type(self), self.value, self.flags)) + def __eq__(self, other): + return type(self) == type(other) and self.value == other.value and self.flags == other.flags + + def to_regexp(self): + raise NotImplementedError() + + if Py36: + # Python 3.6 changed syntax for flags in regular expression + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s:%s)' % (f, value)) + return value + + else: + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s)' % f) + value + return value + +class PatternStr(Pattern): + def to_regexp(self): + return self._get_flags(re.escape(self.value)) + + @property + def min_width(self): + return len(self.value) + max_width = min_width + +class PatternRE(Pattern): + def to_regexp(self): + return self._get_flags(self.value) + + @property + def min_width(self): + return get_regexp_width(self.to_regexp())[0] + @property + def max_width(self): + return get_regexp_width(self.to_regexp())[1] + +class TerminalDef(object): + def __init__(self, name, pattern, priority=1): + assert isinstance(pattern, Pattern), pattern + self.name = name + self.pattern = pattern + self.priority = priority + + def __repr__(self): + return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) + + ###{standalone class Token(Str): __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column') - def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): - self = super(Token, cls).__new__(cls, value) + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None): + try: + self = super(Token, cls).__new__(cls, value) + except UnicodeDecodeError: + value = value.decode('latin1') + self = super(Token, cls).__new__(cls, value) + self.type = type_ self.pos_in_stream = pos_in_stream self.value = value self.line = line self.column = column - self.end_line = None - self.end_column = None + self.end_line = end_line + self.end_column = end_column return self @classmethod def new_borrow_pos(cls, type_, value, borrow_t): - return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) + return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column) def __reduce__(self): return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) @@ -72,38 +138,42 @@ self.state = state def lex(self, stream, newline_types, ignore_types): - newline_types = list(newline_types) - ignore_types = list(ignore_types) + newline_types = frozenset(newline_types) + ignore_types = frozenset(ignore_types) line_ctr = LineCounter() - t = None - while True: + while line_ctr.char_pos < len(stream): lexer = self.lexer for mre, type_from_index in lexer.mres: m = mre.match(stream, line_ctr.char_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: + if not m: + continue + + t = None + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + if not isinstance(t, Token): + raise ValueError("Callbacks must return a token (returned %r)" % t) + yield t + else: + if type_ in lexer.callback: t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - yield t - else: - if type_ 
in lexer.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t) - - line_ctr.feed(value, type_ in newline_types) - if t: - t.end_line = line_ctr.line - t.end_column = line_ctr.column + lexer.callback[type_](t) + + line_ctr.feed(value, type_ in newline_types) + if t: + t.end_line = line_ctr.line + t.end_column = line_ctr.column - break - else: - if line_ctr.char_pos < len(stream): - raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, state=self.state) break + else: + allowed = [v for m, tfi in lexer.mres for v in tfi.values()] + raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state) + class UnlessCallback: def __init__(self, mres): @@ -113,17 +183,27 @@ for mre, type_from_index in self.mres: m = mre.match(t.value) if m: - value = m.group(0) t.type = type_from_index[m.lastindex] break return t +class CallChain: + def __init__(self, callback1, callback2, cond): + self.callback1 = callback1 + self.callback2 = callback2 + self.cond = cond + + def __call__(self, t): + t2 = self.callback1(t) + return self.callback2(t) if self.cond(t2) else t2 + + ###} -def _create_unless(tokens): - tokens_by_type = classify(tokens, lambda t: type(t.pattern)) +def _create_unless(terminals): + tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() embedded_strs = set() callback = {} @@ -141,31 +221,38 @@ if unless: callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) - tokens = [t for t in tokens if t not in embedded_strs] - return tokens, callback + terminals = [t for t in terminals if t not in embedded_strs] + return terminals, callback -def _build_mres(tokens, max_size, match_whole): +def _build_mres(terminals, max_size, match_whole): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. postfix = '$' if match_whole else '' mres = [] - while tokens: + while terminals: try: - mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size])) + mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size])) except AssertionError: # Yes, this is what Python provides us.. :/ - return _build_mres(tokens, max_size//2, match_whole) + return _build_mres(terminals, max_size//2, match_whole) + # terms_from_name = {t.name: t for t in terminals[:max_size]} mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) - tokens = tokens[max_size:] + terminals = terminals[max_size:] return mres -def build_mres(tokens, match_whole=False): - return _build_mres(tokens, len(tokens), match_whole) +def build_mres(terminals, match_whole=False): + return _build_mres(terminals, len(terminals), match_whole) def _regexp_has_newline(r): - return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r) + """Expressions that may indicate newlines in a regexp: + - newlines (\n) + - escaped newline (\\n) + - anything but ([^...]) + - any-char (.) when the flag (?s) exists + """ + return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' 
in r) class Lexer: """Lexer interface @@ -179,48 +266,52 @@ lex = NotImplemented class TraditionalLexer(Lexer): - def __init__(self, tokens, ignore=(), user_callbacks={}): - assert all(isinstance(t, TokenDef) for t in tokens), tokens + def __init__(self, terminals, ignore=(), user_callbacks={}): + assert all(isinstance(t, TerminalDef) for t in terminals), terminals - tokens = list(tokens) + terminals = list(terminals) # Sanitization - for t in tokens: + for t in terminals: try: re.compile(t.pattern.to_regexp()) except: raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) if t.pattern.min_width == 0: - raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) + raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) - assert set(ignore) <= {t.name for t in tokens} + assert set(ignore) <= {t.name for t in terminals} # Init - self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] + self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] self.ignore_types = list(ignore) - tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) + terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) - tokens, self.callback = _create_unless(tokens) + terminals, self.callback = _create_unless(terminals) assert all(self.callback.values()) for type_, f in user_callbacks.items(): - assert type_ not in self.callback - self.callback[type_] = f + if type_ in self.callback: + # Already a callback there, probably UnlessCallback + self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_) + else: + self.callback[type_] = f + + self.terminals = terminals - self.tokens = tokens + self.mres = build_mres(terminals) - self.mres = build_mres(tokens) def lex(self, stream): return _Lex(self).lex(stream, self.newline_types, self.ignore_types) class ContextualLexer(Lexer): - def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}): + def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): tokens_by_name = {} - for t in tokens: + for t in terminals: assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t @@ -238,7 +329,7 @@ self.lexers[state] = lexer - self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks) + self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) self.set_parser_state(None) # Needs to be set on the outside diff -Nru lark-parser-0.6.2/lark/load_grammar.py lark-parser-0.7.0/lark/load_grammar.py --- lark-parser-0.6.2/lark/load_grammar.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/lark/load_grammar.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,19 +1,18 @@ "Parses and creates Grammar objects" import os.path -from itertools import chain -import re +import sys from ast import literal_eval -from copy import deepcopy - -from .lexer import Token +from copy import copy, deepcopy +from .utils import bfs +from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR_TraditionalLexer -from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef +from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol -from .utils import classify, suppress +from .utils import classify, 
suppress, dedup_list from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST @@ -23,10 +22,11 @@ __path__ = os.path.dirname(__file__) IMPORT_PATHS = [os.path.join(__path__, 'grammars')] +EXT = '.lark' + _RE_FLAGS = 'imslux' -def is_terminal(sym): - return sym.isupper() +_EMPTY = Symbol('__empty__') _TERMINAL_NAMES = { '.' : 'DOT', @@ -75,6 +75,7 @@ '_RBRA': r'\]', 'OP': '[+*][?]?|[?](?![a-z])', '_COLON': ':', + '_COMMA': ',', '_OR': r'\|', '_DOT': r'\.', 'TILDE': '~', @@ -95,7 +96,7 @@ RULES = { 'start': ['_list'], '_list': ['_item', '_list _item'], - '_item': ['rule', 'token', 'statement', '_NL'], + '_item': ['rule', 'term', 'statement', '_NL'], 'rule': ['RULE _COLON expansions _NL', 'RULE _DOT NUMBER _COLON expansions _NL'], @@ -131,21 +132,27 @@ 'maybe': ['_LBRA expansions _RBRA'], 'range': ['STRING _DOT _DOT STRING'], - 'token': ['TERMINAL _COLON expansions _NL', + 'term': ['TERMINAL _COLON expansions _NL', 'TERMINAL _DOT NUMBER _COLON expansions _NL'], 'statement': ['ignore', 'import', 'declare'], 'ignore': ['_IGNORE expansions _NL'], 'declare': ['_DECLARE _declare_args _NL'], - 'import': ['_IMPORT import_args _NL', - '_IMPORT import_args _TO TERMINAL _NL'], - 'import_args': ['_import_args'], + 'import': ['_IMPORT _import_path _NL', + '_IMPORT _import_path _LPAR name_list _RPAR _NL', + '_IMPORT _import_path _TO name _NL'], + + '_import_path': ['import_lib', 'import_rel'], + 'import_lib': ['_import_args'], + 'import_rel': ['_DOT _import_args'], '_import_args': ['name', '_import_args _DOT name'], - '_declare_args': ['name', '_declare_args name'], + 'name_list': ['_name_list'], + '_name_list': ['name', '_name_list _COMMA name'], + + '_declare_args': ['name', '_declare_args name'], 'literal': ['REGEXP', 'STRING'], } - @inline_args class EBNF_to_BNF(Transformer_InPlace): def __init__(self): @@ -161,7 +168,7 @@ new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) self.i += 1 - t = NonTerminal(Token('RULE', new_name, -1)) + t = NonTerminal(new_name) tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) self.new_rules.append((new_name, tree, self.rule_options)) self.rules_by_expr[expr] = t @@ -169,7 +176,8 @@ def expr(self, rule, op, *args): if op.value == '?': - return ST('expansions', [rule, ST('expansion', [])]) + empty = ST('expansion', []) + return ST('expansions', [rule, empty]) elif op.value == '+': # a : b c+ d # --> @@ -193,6 +201,23 @@ return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) assert False, op + def maybe(self, rule): + keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens + + def will_not_get_removed(sym): + if isinstance(sym, NonTerminal): + return not sym.name.startswith('_') + if isinstance(sym, Terminal): + return keep_all_tokens or not sym.filter_out + assert False + + if any(rule.scan_values(will_not_get_removed)): + empty = _EMPTY + else: + empty = ST('expansion', []) + + return ST('expansions', [rule, empty]) + class SimplifyRule_Visitor(Visitor): @@ -223,7 +248,8 @@ tree.data = 'expansions' tree.children = [self.visit(ST('expansion', [option if i==j else other for j, other in enumerate(tree.children)])) - for option in set(child.children)] + for option in dedup_list(child.children)] + self._flatten(tree) break def alias(self, tree): @@ -237,7 +263,7 @@ def expansions(self, tree): self._flatten(tree) - tree.children = list(set(tree.children)) + tree.children = dedup_list(tree.children) class RuleTreeToText(Transformer): @@ -253,9 
+279,6 @@ @inline_args class CanonizeTree(Transformer_InPlace): - def maybe(self, expr): - return ST('expr', [expr, Token('OP', '?', -1)]) - def tokenmods(self, *args): if len(args) == 1: return list(args) @@ -263,55 +286,58 @@ return tokenmods + [value] class PrepareAnonTerminals(Transformer_InPlace): - "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them" + "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them" - def __init__(self, tokens): - self.tokens = tokens - self.token_set = {td.name for td in self.tokens} - self.token_reverse = {td.pattern: td for td in tokens} + def __init__(self, terminals): + self.terminals = terminals + self.term_set = {td.name for td in self.terminals} + self.term_reverse = {td.pattern: td for td in terminals} self.i = 0 @inline_args def pattern(self, p): value = p.value - if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags: + if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags: raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) - token_name = None + term_name = None if isinstance(p, PatternStr): try: - # If already defined, use the user-defined token name - token_name = self.token_reverse[p].name + # If already defined, use the user-defined terminal name + term_name = self.term_reverse[p].name except KeyError: - # Try to assign an indicative anon-token name + # Try to assign an indicative anon-terminal name try: - token_name = _TERMINAL_NAMES[value] + term_name = _TERMINAL_NAMES[value] except KeyError: - if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set: + if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set: with suppress(UnicodeEncodeError): - value.upper().encode('ascii') # Make sure we don't have unicode in our token names - token_name = value.upper() + value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names + term_name = value.upper() + + if term_name in self.term_set: + term_name = None elif isinstance(p, PatternRE): - if p in self.token_reverse: # Kind of a wierd placement.name - token_name = self.token_reverse[p].name + if p in self.term_reverse: # Kind of a wierd placement.name + term_name = self.term_reverse[p].name else: assert False, p - if token_name is None: - token_name = '__ANON_%d' % self.i + if term_name is None: + term_name = '__ANON_%d' % self.i self.i += 1 - if token_name not in self.token_set: - assert p not in self.token_reverse - self.token_set.add(token_name) - tokendef = TokenDef(token_name, p) - self.token_reverse[p] = tokendef - self.tokens.append(tokendef) + if term_name not in self.term_set: + assert p not in self.term_reverse + self.term_set.add(term_name) + termdef = TerminalDef(term_name, p) + self.term_reverse[p] = termdef + self.terminals.append(termdef) - return Terminal(token_name, filter_out=isinstance(p, PatternStr)) + return Terminal(term_name, filter_out=isinstance(p, PatternStr)) def _rfind(s, choices): @@ -328,7 +354,7 @@ n2 = next(i) if n2 == '\\': w += '\\\\' - elif n2 not in 'unftr': + elif n2 not in 'uxnftr': w += '\\' w += n2 w = w.replace('\\"', '"').replace("'", "\\'") @@ -371,12 +397,12 @@ assert start.type == end.type == 'STRING' start = start.value[1:-1] end = end.value[1:-1] - assert len(start) == len(end) == 1, (start, end, len(start), len(end)) + assert len(_fix_escaping(start)) == len(_fix_escaping(end)) == 1, (start, end, 
len(_fix_escaping(start)), len(_fix_escaping(end))) regexp = '[%s-%s]' % (start, end) return ST('pattern', [PatternRE(regexp)]) -class TokenTreeToPattern(Transformer): +class TerminalTreeToPattern(Transformer): def pattern(self, ps): p ,= ps return p @@ -386,14 +412,14 @@ if len(items) == 1: return items[0] if len({i.flags for i in items}) > 1: - raise GrammarError("Lark doesn't support joining tokens with conflicting flags!") + raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ()) def expansions(self, exps): if len(exps) == 1: return exps[0] if len({i.flags for i in exps}) > 1: - raise GrammarError("Lark doesn't support joining tokens with conflicting flags!") + raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags) def expr(self, args): @@ -410,6 +436,9 @@ assert len(args) == 2 return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags) + def maybe(self, expr): + return self.expr(expr + ['?']) + def alias(self, t): raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") @@ -431,37 +460,39 @@ return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) class Grammar: - def __init__(self, rule_defs, token_defs, ignore): - self.token_defs = token_defs + def __init__(self, rule_defs, term_defs, ignore): + self.term_defs = term_defs self.rule_defs = rule_defs self.ignore = ignore def compile(self): - token_defs = list(self.token_defs) - rule_defs = self.rule_defs - - # ================= - # Compile Tokens - # ================= - - # Convert token-trees to strings/regexps - transformer = PrepareLiterals() * TokenTreeToPattern() - for name, (token_tree, priority) in token_defs: - if token_tree is None: # Terminal added through %declare + # We change the trees in-place (to support huge grammars) + # So deepcopy allows calling compile more than once. + term_defs = deepcopy(list(self.term_defs)) + rule_defs = deepcopy(self.rule_defs) + + # =================== + # Compile Terminals + # =================== + + # Convert terminal-trees to strings/regexps + transformer = PrepareLiterals() * TerminalTreeToPattern() + for name, (term_tree, priority) in term_defs: + if term_tree is None: # Terminal added through %declare continue - expansions = list(token_tree.find_data('expansion')) + expansions = list(term_tree.find_data('expansion')) if len(expansions) == 1 and not expansions[0].children: raise GrammarError("Terminals cannot be empty (%s)" % name) - tokens = [TokenDef(name, transformer.transform(token_tree), priority) - for name, (token_tree, priority) in token_defs if token_tree] + terminals = [TerminalDef(name, transformer.transform(term_tree), priority) + for name, (term_tree, priority) in term_defs if term_tree] # ================= # Compile Rules # ================= # 1. Pre-process terminals - transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(tokens) # Adds to tokens + transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals) # Adds to terminals # 2. 
Convert EBNF to BNF (and apply step 1) ebnf_to_bnf = EBNF_to_BNF() @@ -469,7 +500,8 @@ for name, rule_tree, options in rule_defs: ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None tree = transformer.transform(rule_tree) - rules.append((name, ebnf_to_bnf.transform(tree), options)) + res = ebnf_to_bnf.transform(tree) + rules.append((name, res, options)) rules += ebnf_to_bnf.new_rules assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" @@ -479,7 +511,8 @@ simplify_rule = SimplifyRule_Visitor() compiled_rules = [] - for name, tree, options in rules: + for i, rule_content in enumerate(rules): + name, tree, options = rule_content simplify_rule.visit(tree) expansions = rule_tree_to_text.transform(tree) @@ -487,37 +520,112 @@ if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - assert all(isinstance(x, Symbol) for x in expansion), expansion + empty_indices = [x==_EMPTY for i, x in enumerate(expansion)] + if any(empty_indices): + exp_options = copy(options) if options else RuleOptions() + exp_options.empty_indices = empty_indices + expansion = [x for x in expansion if x!=_EMPTY] + else: + exp_options = options - rule = Rule(NonTerminal(name), expansion, alias, options) + assert all(isinstance(x, Symbol) for x in expansion), expansion + rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) compiled_rules.append(rule) - return tokens, compiled_rules, self.ignore + # Remove duplicates of empty rules, throw error for non-empty duplicates + if len(set(compiled_rules)) != len(compiled_rules): + duplicates = classify(compiled_rules, lambda x: x) + for dups in duplicates.values(): + if len(dups) > 1: + if dups[0].expansion: + raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates)) + + # Empty rule; assert all other attributes are equal + assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) + + # Remove duplicates + compiled_rules = list(set(compiled_rules)) + + # Filter out unused terminals + used_terms = {t.name for r in compiled_rules + for t in r.expansion + if isinstance(t, Terminal)} + terminals = [t for t in terminals if t.name in used_terms or t.name in self.ignore] + + return terminals, compiled_rules, self.ignore _imported_grammars = {} -def import_grammar(grammar_path): +def import_grammar(grammar_path, base_paths=[]): if grammar_path not in _imported_grammars: - for import_path in IMPORT_PATHS: - with open(os.path.join(import_path, grammar_path)) as f: - text = f.read() - grammar = load_grammar(text, grammar_path) - _imported_grammars[grammar_path] = grammar + import_paths = base_paths + IMPORT_PATHS + for import_path in import_paths: + with suppress(IOError): + with open(os.path.join(import_path, grammar_path)) as f: + text = f.read() + grammar = load_grammar(text, grammar_path) + _imported_grammars[grammar_path] = grammar + break + else: + open(grammar_path) + assert False return _imported_grammars[grammar_path] +def import_from_grammar_into_namespace(grammar, namespace, aliases): + """Returns all rules and terminals of grammar, prepended + with a 'namespace' prefix, except for those which are aliased. 
+ """ + + imported_terms = dict(grammar.term_defs) + imported_rules = {n:(n,deepcopy(t),o) for n,t,o in grammar.rule_defs} + + term_defs = [] + rule_defs = [] + + def rule_dependencies(symbol): + if symbol.type != 'RULE': + return [] + try: + _, tree, _ = imported_rules[symbol] + except KeyError: + raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace)) + return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL')) + + def get_namespace_name(name): + try: + return aliases[name].value + except KeyError: + return '%s__%s' % (namespace, name) + + to_import = list(bfs(aliases, rule_dependencies)) + for symbol in to_import: + if symbol.type == 'TERMINAL': + term_defs.append([get_namespace_name(symbol), imported_terms[symbol]]) + else: + assert symbol.type == 'RULE' + rule = imported_rules[symbol] + for t in rule[1].iter_subtrees(): + for i, c in enumerate(t.children): + if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'): + t.children[i] = Token(c.type, get_namespace_name(c)) + rule_defs.append((get_namespace_name(symbol), rule[1], rule[2])) + + return term_defs, rule_defs + + -def resolve_token_references(token_defs): +def resolve_term_references(term_defs): # TODO Cycles detection # TODO Solve with transitive closure (maybe) - token_dict = {k:t for k, (t,_p) in token_defs} - assert len(token_dict) == len(token_defs), "Same name defined twice?" + token_dict = {k:t for k, (t,_p) in term_defs} + assert len(token_dict) == len(term_defs), "Same name defined twice?" while True: changed = False - for name, (token_tree, _p) in token_defs: + for name, (token_tree, _p) in term_defs: if token_tree is None: # Terminal added through %declare continue for exp in token_tree.find_data('value'): @@ -548,7 +656,7 @@ def symbols_from_strcase(expansion): - return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion] + return [Terminal(x, filter_out=x.startswith('_')) if x.isupper() else NonTerminal(x) for x in expansion] @inline_args class PrepareGrammar(Transformer_InPlace): @@ -560,35 +668,37 @@ class GrammarLoader: def __init__(self): - tokens = [TokenDef(name, PatternRE(value)) for name, value in TERMINALS.items()] + terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] rules = [options_from_rule(name, x) for name, x in RULES.items()] - rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs] + rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, xs, o in rules for i, x in enumerate(xs)] callback = ParseTreeBuilder(rules, ST).create_callback() - lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) + lexer_conf = LexerConf(terminals, ['WS', 'COMMENT']) parser_conf = ParserConf(rules, callback, 'start') self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf) self.canonize_tree = CanonizeTree() - def load_grammar(self, grammar_text, name=''): + def load_grammar(self, grammar_text, grammar_name=''): "Parse grammar_text, verify, and create Grammar object. Display nice messages on error." 
try: tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') ) except UnexpectedCharacters as e: - raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name)) + context = e.get_context(grammar_text) + raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % + (e.line, e.column, grammar_name, context)) except UnexpectedToken as e: context = e.get_context(grammar_text) error = e.match_examples(self.parser.parse, { 'Unclosed parenthesis': ['a: (\n'], 'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'], - 'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'], + 'Expecting rule or terminal definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'], 'Alias expects lowercase name': ['a: -> "a"\n'], 'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'], 'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'], - 'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'], + 'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'], '%import expects a name': ['%import "a"\n'], '%ignore expects a value': ['%ignore %import\n'], }) @@ -602,44 +712,76 @@ # Extract grammar items defs = classify(tree.children, lambda c: c.data, lambda c: c.children) - token_defs = defs.pop('token', []) + term_defs = defs.pop('term', []) rule_defs = defs.pop('rule', []) statements = defs.pop('statement', []) assert not defs - token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs] - token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs] + term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs] + term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs] + rule_defs = [options_from_rule(*x) for x in rule_defs] # Execute statements ignore = [] - declared = [] for (stmt,) in statements: if stmt.data == 'ignore': t ,= stmt.children ignore.append(t) elif stmt.data == 'import': - dotted_path = stmt.children[0].children - name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1] - grammar_path = os.path.join(*dotted_path[:-1]) + '.lark' - g = import_grammar(grammar_path) - token_options = dict(g.token_defs)[dotted_path[-1]] - assert isinstance(token_options, tuple) and len(token_options)==2 - token_defs.append([name.value, token_options]) + if len(stmt.children) > 1: + path_node, arg1 = stmt.children + else: + path_node ,= stmt.children + arg1 = None + + if isinstance(arg1, Tree): # Multi import + dotted_path = path_node.children + names = arg1.children + aliases = names # Can't have aliased multi import, so all aliases will be the same as names + else: # Single import + dotted_path = path_node.children[:-1] + names = [path_node.children[-1]] # Get name from dotted path + aliases = [arg1] if arg1 else names # Aliases if exist + + grammar_path = os.path.join(*dotted_path) + EXT + + if path_node.data == 'import_lib': # Import from library + g = import_grammar(grammar_path) + else: # Relative import + if grammar_name == '': # Import relative to script file path if grammar is coded in script + try: + base_file = os.path.abspath(sys.modules['__main__'].__file__) + except AttributeError: + base_file = None + else: + base_file = grammar_name # Import relative to grammar file path if external grammar file + if base_file: + base_path = os.path.split(base_file)[0] + else: + base_path = os.path.abspath(os.path.curdir) + g = import_grammar(grammar_path, 
base_paths=[base_path]) + + aliases_dict = dict(zip(names, aliases)) + new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases_dict) + + term_defs += new_td + rule_defs += new_rd + elif stmt.data == 'declare': for t in stmt.children: - token_defs.append([t.value, (None, None)]) + term_defs.append([t.value, (None, None)]) else: assert False, stmt # Verify correctness 1 - for name, _ in token_defs: + for name, _ in term_defs: if name.startswith('__'): raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name) # Handle ignore tokens # XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's - # inability to handle duplicate tokens (two names, one value) + # inability to handle duplicate terminals (two names, one value) ignore_names = [] for t in ignore: if t.data=='expansions' and len(t.children) == 1: @@ -654,22 +796,21 @@ name = '__IGNORE_%d'% len(ignore_names) ignore_names.append(name) - token_defs.append((name, (t, 0))) + term_defs.append((name, (t, 1))) # Verify correctness 2 - token_names = set() - for name, _ in token_defs: - if name in token_names: - raise GrammarError("Token '%s' defined more than once" % name) - token_names.add(name) + terminal_names = set() + for name, _ in term_defs: + if name in terminal_names: + raise GrammarError("Terminal '%s' defined more than once" % name) + terminal_names.add(name) - if set(ignore_names) > token_names: - raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names)) + if set(ignore_names) > terminal_names: + raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names)) - # Resolve token references - resolve_token_references(token_defs) + resolve_term_references(term_defs) - rules = [options_from_rule(*x) for x in rule_defs] + rules = rule_defs rule_names = set() for name, _x, _o in rules: @@ -683,16 +824,14 @@ used_symbols = {t for x in expansions.find_data('expansion') for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} for sym in used_symbols: - if is_terminal(sym): - if sym not in token_names: + if sym.type == 'TERMINAL': + if sym not in terminal_names: raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name)) else: if sym not in rule_names: raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name)) - # TODO don't include unused tokens, they can only cause trouble! 
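The verification pass above turns common grammar mistakes into `GrammarError`s with readable messages. A quick illustrative check (this throwaway grammar is not from the package):

from lark import Lark, GrammarError

bad = r"""
start: WORD
WORD: /\w+/
WORD: /[a-z]+/
"""

try:
    Lark(bad, parser="lalr")
except GrammarError as e:
    print(e)   # Terminal 'WORD' defined more than once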
- - return Grammar(rules, token_defs, ignore_names) + return Grammar(rules, term_defs, ignore_names) diff -Nru lark-parser-0.6.2/lark/parser_frontends.py lark-parser-0.7.0/lark/parser_frontends.py --- lark-parser-0.6.2/lark/parser_frontends.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/lark/parser_frontends.py 2019-03-28 13:53:28.000000000 +0000 @@ -4,17 +4,19 @@ from .utils import get_regexp_width from .parsers.grammar_analysis import GrammarAnalyzer from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token - -from .exceptions import GrammarError -from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk +from .parsers import lalr_parser, earley, xearley, cyk from .tree import Tree class WithLexer: + lexer = None + parser = None + lexer_conf = None + def init_traditional_lexer(self, lexer_conf): self.lexer_conf = lexer_conf self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) - def init_contextual_lexer(self, lexer_conf, parser_conf): + def init_contextual_lexer(self, lexer_conf): self.lexer_conf = lexer_conf states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () @@ -27,8 +29,7 @@ stream = self.lexer.lex(text) if self.lexer_conf.postlex: return self.lexer_conf.postlex.process(stream) - else: - return stream + return stream def parse(self, text): token_stream = self.lex(text) @@ -37,13 +38,15 @@ class LALR_TraditionalLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - self.parser = lalr_parser.Parser(parser_conf) + debug = options.debug if options else False + self.parser = lalr_parser.Parser(parser_conf, debug=debug) self.init_traditional_lexer(lexer_conf) class LALR_ContextualLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - self.parser = lalr_parser.Parser(parser_conf) - self.init_contextual_lexer(lexer_conf, parser_conf) + debug = options.debug if options else False + self.parser = lalr_parser.Parser(parser_conf, debug=debug) + self.init_contextual_lexer(lexer_conf) class LALR_CustomLexer(WithLexer): def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): @@ -51,16 +54,6 @@ self.lexer_conf = lexer_conf self.lexer = lexer_cls(lexer_conf) - -def get_ambiguity_resolver(options): - if not options or options.ambiguity == 'resolve': - return resolve_ambig.standard_resolve_ambig - elif options.ambiguity == 'resolve__antiscore_sum': - return resolve_ambig.antiscore_sum_resolve_ambig - elif options.ambiguity == 'explicit': - return None - raise ValueError(options) - def tokenize_text(text): line = 1 col_start_pos = 0 @@ -74,8 +67,8 @@ def __init__(self, lexer_conf, parser_conf, options=None): self.init_traditional_lexer(lexer_conf) - self.parser = earley.Parser(parser_conf, self.match, - resolve_ambiguity=get_ambiguity_resolver(options)) + resolve_ambiguity = options.ambiguity == 'resolve' + self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity) def match(self, term, token): return term.name == token.type @@ -86,12 +79,11 @@ self.token_by_name = {t.name:t for t in lexer_conf.tokens} self._prepare_match(lexer_conf) - + resolve_ambiguity = options.ambiguity == 'resolve' self.parser = xearley.Parser(parser_conf, self.match, - resolve_ambiguity=get_ambiguity_resolver(options), ignore=lexer_conf.ignore, - predict_all=options.earley__predict_all, + resolve_ambiguity=resolve_ambiguity, **kw ) @@ 
-101,6 +93,8 @@ def _prepare_match(self, lexer_conf): self.regexps = {} for t in lexer_conf.tokens: + if t.priority != 1: + raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority) regexp = t.pattern.to_regexp() try: width = get_regexp_width(regexp)[0] @@ -117,7 +111,7 @@ class XEarley_CompleteLex(XEarley): def __init__(self, *args, **kw): - super(self).__init__(*args, complete_lex=True, **kw) + XEarley.__init__(self, *args, complete_lex=True, **kw) diff -Nru lark-parser-0.6.2/lark/parsers/ablalr.py lark-parser-0.7.0/lark/parsers/ablalr.py --- lark-parser-0.6.2/lark/parsers/ablalr.py 2018-02-07 15:27:22.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/ablalr.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,98 +0,0 @@ -"""This module implements an ABLALR(1) Parser - -Ambiguous Backtracking Look-Ahead(1) LR(0) -""" -# Author: Erez Shinan (2017) -# Email : erezshin@gmail.com - -from ..common import UnexpectedToken - -from .lalr_analysis import LALR_Analyzer, Shift - -class Parser: - def __init__(self, parser_conf): - assert all(r.options is None or r.options.priority is None - for r in parser_conf.rules), "LALR doesn't yet support prioritization" - self.analysis = analysis = LALR_Analyzer(parser_conf) - analysis.compute_lookahead() - callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) - for rule in analysis.rules} - - self.parser_conf = parser_conf - self.parser = _Parser(analysis.parse_table, callbacks) - self.parse = self.parser.parse - -###{standalone - -class _Parser: - def __init__(self, parse_table, callbacks): - self.states = parse_table.states - self.start_state = parse_table.start_state - self.end_state = parse_table.end_state - self.callbacks = callbacks - - def parse(self, seq, set_state=None): - i = 0 - token = None - stream = iter(seq) - states = self.states - - state_stack = [self.start_state] - value_stack = [] - - if set_state: set_state(self.start_state) - - def get_action(key): - state = state_stack[-1] - try: - return states[state][key] - except KeyError: - expected = states[state].keys() - - raise UnexpectedToken(token, expected, seq, i) - - def reduce(rule): - size = len(rule.expansion) - if size: - s = value_stack[-size:] - del state_stack[-size:] - del value_stack[-size:] - else: - s = [] - - value = self.callbacks[rule](s) - - _action, new_state = get_action(rule.origin) - assert _action is Shift - state_stack.append(new_state) - value_stack.append(value) - - # Main LALR-parser loop - try: - token = next(stream) - i += 1 - while True: - action, arg = get_action(token.type) - assert arg != self.end_state - - if action is Shift: - state_stack.append(arg) - value_stack.append(token) - if set_state: set_state(arg) - token = next(stream) - i += 1 - else: - reduce(arg) - except StopIteration: - pass - - while True: - _action, arg = get_action('$END') - if _action is Shift: - assert arg == self.end_state - val ,= value_stack - return val - else: - reduce(arg) - -###} diff -Nru lark-parser-0.6.2/lark/parsers/cyk.py lark-parser-0.7.0/lark/parsers/cyk.py --- lark-parser-0.6.2/lark/parsers/cyk.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/cyk.py 2019-03-28 13:53:28.000000000 +0000 @@ -199,7 +199,8 @@ for r in self.rules: # Validate that the grammar is CNF and populate auxiliary data structures. 
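The frontend changes above thread a `debug` flag into the LALR analyzer, collapse Earley ambiguity handling into a plain boolean (the antiscore-sum resolver is removed elsewhere in this patch), and the CYK hunk just below replaces an assert with a `ParseError` for empty rules. A hedged sketch of how these backends are selected through the public constructor; the exact acceptance rules and log output belong to the library, not to this snippet.

```python
from lark import Lark

grammar = r"""
    start: "a" start | "a"
"""

# LALR frontend: debug=True is forwarded to the analyzer, so shift/reduce
# conflicts (if any) are logged instead of being resolved silently.
Lark(grammar, parser="lalr", debug=True)

# Earley frontend: internally this is now a boolean; ambiguity="resolve"
# picks one derivation, ambiguity="explicit" keeps an _ambig node in the tree.
Lark(grammar, parser="earley", ambiguity="explicit")

# CYK frontend: per the hunk below, grammars containing empty rules are
# rejected with a ParseError rather than tripping an assert.
Lark(grammar, parser="cyk")
```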
assert isinstance(r.lhs, NT), r - assert len(r.rhs) in [1, 2], r + if len(r.rhs) not in [1, 2]: + raise ParseError("CYK doesn't support empty rules") if len(r.rhs) == 1 and isinstance(r.rhs[0], T): self.terminal_rules[r.rhs[0]].append(r) elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs): diff -Nru lark-parser-0.6.2/lark/parsers/earley_common.py lark-parser-0.7.0/lark/parsers/earley_common.py --- lark-parser-0.6.2/lark/parsers/earley_common.py 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/earley_common.py 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,75 @@ +"This module implements an Earley Parser" + +# The parser uses a parse-forest to keep track of derivations and ambiguations. +# When the parse ends successfully, a disambiguation stage resolves all ambiguity +# (right now ambiguity resolution is not developed beyond the needs of lark) +# Afterwards the parse tree is reduced (transformed) according to user callbacks. +# I use the no-recursion version of Transformer, because the tree might be +# deeper than Python's recursion limit (a bit absurd, but that's life) +# +# The algorithm keeps track of each state set, using a corresponding Column instance. +# Column keeps track of new items using NewsList instances. +# +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + +from ..grammar import NonTerminal, Terminal + +class Item(object): + "An Earley Item, the atom of the algorithm." + + __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash') + def __init__(self, rule, ptr, start): + self.is_complete = len(rule.expansion) == ptr + self.rule = rule # rule + self.ptr = ptr # ptr + self.start = start # j + self.node = None # w + if self.is_complete: + self.s = rule.origin + self.expect = None + self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None + else: + self.s = (rule, ptr) + self.expect = rule.expansion[ptr] + self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None + self._hash = hash((self.s, self.start)) + + def advance(self): + return Item(self.rule, self.ptr + 1, self.start) + + def __eq__(self, other): + return self is other or (self.s == other.s and self.start == other.start) + + def __hash__(self): + return self._hash + + def __repr__(self): + before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) + after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) + symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after)) + return '%s (%d)' % (symbol, self.start) + + +class TransitiveItem(Item): + __slots__ = ('recognized', 'reduction', 'column', 'next_titem') + def __init__(self, recognized, trule, originator, start): + super(TransitiveItem, self).__init__(trule.rule, trule.ptr, trule.start) + self.recognized = recognized + self.reduction = originator + self.column = start + self.next_titem = None + self._hash = hash((self.s, self.start, self.recognized)) + + def __eq__(self, other): + if not isinstance(other, TransitiveItem): + return False + return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.recognized == other.recognized) + + def __hash__(self): + return self._hash + + def __repr__(self): + before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) + after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) + return '{} : {} -> {}* {} ({}, {})'.format(self.recognized.name, 
self.rule.origin.name, ' '.join(before), ' '.join(after), self.column, self.start) diff -Nru lark-parser-0.6.2/lark/parsers/earley_forest.py lark-parser-0.7.0/lark/parsers/earley_forest.py --- lark-parser-0.6.2/lark/parsers/earley_forest.py 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/earley_forest.py 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,429 @@ +""""This module implements an SPPF implementation + +This is used as the primary output mechanism for the Earley parser +in order to store complex ambiguities. + +Full reference and more details is here: +http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ +""" + +from random import randint +from math import isinf +from collections import deque +from operator import attrgetter +from importlib import import_module + +from ..tree import Tree +from ..exceptions import ParseError + +class ForestNode(object): + pass + +class SymbolNode(ForestNode): + """ + A Symbol Node represents a symbol (or Intermediate LR0). + + Symbol nodes are keyed by the symbol (s). For intermediate nodes + s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol + nodes, s will be a string representing the non-terminal origin (i.e. + the left hand side of the rule). + + The children of a Symbol or Intermediate Node will always be Packed Nodes; + with each Packed Node child representing a single derivation of a production. + + Hence a Symbol Node with a single child is unambiguous. + """ + __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash') + def __init__(self, s, start, end): + self.s = s + self.start = start + self.end = end + self._children = set() + self.paths = set() + self.paths_loaded = False + + ### We use inf here as it can be safely negated without resorting to conditionals, + # unlike None or float('NaN'), and sorts appropriately. 
+ self.priority = float('-inf') + self.is_intermediate = isinstance(s, tuple) + self._hash = hash((self.s, self.start, self.end)) + + def add_family(self, lr0, rule, start, left, right): + self._children.add(PackedNode(self, lr0, rule, start, left, right)) + + def add_path(self, transitive, node): + self.paths.add((transitive, node)) + + def load_paths(self): + for transitive, node in self.paths: + if transitive.next_titem is not None: + vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end) + vn.add_path(transitive.next_titem, node) + self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn) + else: + self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node) + self.paths_loaded = True + + @property + def is_ambiguous(self): + return len(self.children) > 1 + + @property + def children(self): + if not self.paths_loaded: self.load_paths() + return sorted(self._children, key=attrgetter('sort_key')) + + def __iter__(self): + return iter(self._children) + + def __eq__(self, other): + if not isinstance(other, SymbolNode): + return False + return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end) + + def __hash__(self): + return self._hash + + def __repr__(self): + if self.is_intermediate: + rule = self.s[0] + ptr = self.s[1] + before = ( expansion.name for expansion in rule.expansion[:ptr] ) + after = ( expansion.name for expansion in rule.expansion[ptr:] ) + symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) + else: + symbol = self.s.name + return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority) + +class PackedNode(ForestNode): + """ + A Packed Node represents a single derivation in a symbol node. + """ + __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash') + def __init__(self, parent, s, rule, start, left, right): + self.parent = parent + self.s = s + self.start = start + self.rule = rule + self.left = left + self.right = right + self.priority = float('-inf') + self._hash = hash((self.left, self.right)) + + @property + def is_empty(self): + return self.left is None and self.right is None + + @property + def sort_key(self): + """ + Used to sort PackedNode children of SymbolNodes. + A SymbolNode has multiple PackedNodes if it matched + ambiguously. Hence, we use the sort order to identify + the order in which ambiguous children should be considered. + """ + return self.is_empty, -self.priority, -self.rule.order + + def __iter__(self): + return iter([self.left, self.right]) + + def __eq__(self, other): + if not isinstance(other, PackedNode): + return False + return self is other or (self.left == other.left and self.right == other.right) + + def __hash__(self): + return self._hash + + def __repr__(self): + if isinstance(self.s, tuple): + rule = self.s[0] + ptr = self.s[1] + before = ( expansion.name for expansion in rule.expansion[:ptr] ) + after = ( expansion.name for expansion in rule.expansion[ptr:] ) + symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) + else: + symbol = self.s.name + return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order) + +class ForestVisitor(object): + """ + An abstract base class for building forest visitors. + + Use this as a base when you need to walk the forest. 
+ """ + __slots__ = ['result'] + + def visit_token_node(self, node): pass + def visit_symbol_node_in(self, node): pass + def visit_symbol_node_out(self, node): pass + def visit_packed_node_in(self, node): pass + def visit_packed_node_out(self, node): pass + + def visit(self, root): + self.result = None + # Visiting is a list of IDs of all symbol/intermediate nodes currently in + # the stack. It serves two purposes: to detect when we 'recurse' in and out + # of a symbol/intermediate so that we can process both up and down. Also, + # since the SPPF can have cycles it allows us to detect if we're trying + # to recurse into a node that's already on the stack (infinite recursion). + visiting = set() + + # We do not use recursion here to walk the Forest due to the limited + # stack size in python. Therefore input_stack is essentially our stack. + input_stack = deque([root]) + + # It is much faster to cache these as locals since they are called + # many times in large parses. + vpno = getattr(self, 'visit_packed_node_out') + vpni = getattr(self, 'visit_packed_node_in') + vsno = getattr(self, 'visit_symbol_node_out') + vsni = getattr(self, 'visit_symbol_node_in') + vtn = getattr(self, 'visit_token_node') + while input_stack: + current = next(reversed(input_stack)) + try: + next_node = next(current) + except StopIteration: + input_stack.pop() + continue + except TypeError: + ### If the current object is not an iterator, pass through to Token/SymbolNode + pass + else: + if next_node is None: + continue + + if id(next_node) in visiting: + raise ParseError("Infinite recursion in grammar!") + + input_stack.append(next_node) + continue + + if not isinstance(current, ForestNode): + vtn(current) + input_stack.pop() + continue + + current_id = id(current) + if current_id in visiting: + if isinstance(current, PackedNode): vpno(current) + else: vsno(current) + input_stack.pop() + visiting.remove(current_id) + continue + else: + visiting.add(current_id) + if isinstance(current, PackedNode): next_node = vpni(current) + else: next_node = vsni(current) + if next_node is None: + continue + + if id(next_node) in visiting: + raise ParseError("Infinite recursion in grammar!") + + input_stack.append(next_node) + continue + + return self.result + +class ForestSumVisitor(ForestVisitor): + """ + A visitor for prioritizing ambiguous parts of the Forest. + + This visitor is used when support for explicit priorities on + rules is requested (whether normal, or invert). It walks the + forest (or subsets thereof) and cascades properties upwards + from the leaves. + + It would be ideal to do this during parsing, however this would + require processing each Earley item multiple times. That's + a big performance drawback; so running a forest walk is the + lesser of two evils: there can be significantly more Earley + items created during parsing than there are SPPF nodes in the + final tree. 
+ """ + def visit_packed_node_in(self, node): + return iter([node.left, node.right]) + + def visit_symbol_node_in(self, node): + return iter(node.children) + + def visit_packed_node_out(self, node): + priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options and node.rule.options.priority else 0 + priority += getattr(node.right, 'priority', 0) + priority += getattr(node.left, 'priority', 0) + node.priority = priority + + def visit_symbol_node_out(self, node): + node.priority = max(child.priority for child in node.children) + +class ForestToTreeVisitor(ForestVisitor): + """ + A Forest visitor which converts an SPPF forest to an unambiguous AST. + + The implementation in this visitor walks only the first ambiguous child + of each symbol node. When it finds an ambiguous symbol node it first + calls the forest_sum_visitor implementation to sort the children + into preference order using the algorithms defined there; so the first + child should always be the highest preference. The forest_sum_visitor + implementation should be another ForestVisitor which sorts the children + according to some priority mechanism. + """ + __slots__ = ['forest_sum_visitor', 'callbacks', 'output_stack'] + def __init__(self, callbacks = None, forest_sum_visitor = None): + self.forest_sum_visitor = forest_sum_visitor + self.callbacks = callbacks + + def visit(self, root): + self.output_stack = deque() + return super(ForestToTreeVisitor, self).visit(root) + + def visit_token_node(self, node): + self.output_stack[-1].append(node) + + def visit_symbol_node_in(self, node): + if self.forest_sum_visitor and node.is_ambiguous and isinf(node.priority): + self.forest_sum_visitor.visit(node) + return next(iter(node.children)) + + def visit_packed_node_in(self, node): + if not node.parent.is_intermediate: + self.output_stack.append([]) + return iter([node.left, node.right]) + + def visit_packed_node_out(self, node): + if not node.parent.is_intermediate: + result = self.callbacks[node.rule](self.output_stack.pop()) + if self.output_stack: + self.output_stack[-1].append(result) + else: + self.result = result + +class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor): + """ + A Forest visitor which converts an SPPF forest to an ambiguous AST. + + Because of the fundamental disparity between what can be stored in + an SPPF and what can be stored in a Tree; this implementation is not + complete. It correctly deals with ambiguities that occur on symbol nodes only, + and cannot deal with ambiguities that occur on intermediate nodes. + + Usually, most parsers can be rewritten to avoid intermediate node + ambiguities. Also, this implementation could be fixed, however + the code to handle intermediate node ambiguities is messy and + would not be performant. It is much better not to use this and + instead to correctly disambiguate the forest and only store unambiguous + parses in Trees. It is here just to provide some parity with the + old ambiguity='explicit'. + + This is mainly used by the test framework, to make it simpler to write + tests ensuring the SPPF contains the right results. 
+ """ + def __init__(self, callbacks, forest_sum_visitor = ForestSumVisitor): + super(ForestToAmbiguousTreeVisitor, self).__init__(callbacks, forest_sum_visitor) + + def visit_token_node(self, node): + self.output_stack[-1].children.append(node) + + def visit_symbol_node_in(self, node): + if self.forest_sum_visitor and node.is_ambiguous and isinf(node.priority): + self.forest_sum_visitor.visit(node) + if not node.is_intermediate and node.is_ambiguous: + self.output_stack.append(Tree('_ambig', [])) + return iter(node.children) + + def visit_symbol_node_out(self, node): + if not node.is_intermediate and node.is_ambiguous: + result = self.output_stack.pop() + if self.output_stack: + self.output_stack[-1].children.append(result) + else: + self.result = result + + def visit_packed_node_in(self, node): + if not node.parent.is_intermediate: + self.output_stack.append(Tree('drv', [])) + return iter([node.left, node.right]) + + def visit_packed_node_out(self, node): + if not node.parent.is_intermediate: + result = self.callbacks[node.rule](self.output_stack.pop().children) + if self.output_stack: + self.output_stack[-1].children.append(result) + else: + self.result = result + +class ForestToPyDotVisitor(ForestVisitor): + """ + A Forest visitor which writes the SPPF to a PNG. + + The SPPF can get really large, really quickly because + of the amount of meta-data it stores, so this is probably + only useful for trivial trees and learning how the SPPF + is structured. + """ + def __init__(self, rankdir="TB"): + self.pydot = import_module('pydot') + self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir) + + def visit(self, root, filename): + super(ForestToPyDotVisitor, self).visit(root) + self.graph.write_png(filename) + + def visit_token_node(self, node): + graph_node_id = str(id(node)) + graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"')) + graph_node_color = 0x808080 + graph_node_style = "\"filled,rounded\"" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + + def visit_packed_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "filled" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + return iter([node.left, node.right]) + + def visit_packed_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in [node.left, node.right]: + if child is not None: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) + else: + #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay. 
+ child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890)) + child_graph_node_style = "invis" + child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None") + child_edge_style = "invis" + self.graph.add_node(child_graph_node) + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style)) + + def visit_symbol_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "\"filled\"" + if node.is_intermediate: + graph_node_shape = "ellipse" + else: + graph_node_shape = "rectangle" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + return iter(node.children) + + def visit_symbol_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in node.children: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) diff -Nru lark-parser-0.6.2/lark/parsers/earley.py lark-parser-0.7.0/lark/parsers/earley.py --- lark-parser-0.6.2/lark/parsers/earley.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/earley.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,233 +1,311 @@ -"This module implements an Earley Parser" +"""This module implements an scanerless Earley parser. -# The parser uses a parse-forest to keep track of derivations and ambiguations. -# When the parse ends successfully, a disambiguation stage resolves all ambiguity -# (right now ambiguity resolution is not developed beyond the needs of lark) -# Afterwards the parse tree is reduced (transformed) according to user callbacks. -# I use the no-recursion version of Transformer, because the tree might be -# deeper than Python's recursion limit (a bit absurd, but that's life) -# -# The algorithm keeps track of each state set, using a corresponding Column instance. -# Column keeps track of new items using NewsList instances. -# -# Author: Erez Shinan (2017) -# Email : erezshin@gmail.com +The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: + https://www.sciencedirect.com/science/article/pii/S1571066108001497 + +That is probably the best reference for understanding the algorithm here. + +The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format +is better documented here: + http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ +""" + +from collections import deque -from ..tree import Tree from ..visitors import Transformer_InPlace, v_args from ..exceptions import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal - - -class Derivation(Tree): - def __init__(self, rule, items=None): - Tree.__init__(self, 'drv', items or []) - self.meta.rule = rule - self._hash = None - - def _pretty_label(self): # Nicer pretty for debugging the parser - return self.rule.origin if self.rule else self.data - - def __hash__(self): - if self._hash is None: - self._hash = Tree.__hash__(self) - return self._hash - -class Item(object): - "An Earley Item, the atom of the algorithm." 
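Both the `Item` class being removed here and the new `earley_common.Item` earlier in this patch encode the same "dotted rule" idea: a rule, a dot position, and the Earley set where the item started. A minimal self-contained illustration of that idea (this is not lark's class; the names are made up for the example):

```python
class DottedItem:
    """origin -> expansion, with a dot at position ptr, started at Earley set `start`."""
    def __init__(self, origin, expansion, ptr, start):
        self.origin, self.expansion, self.ptr, self.start = origin, expansion, ptr, start

    @property
    def is_complete(self):
        # The dot has moved past the last symbol of the expansion.
        return self.ptr == len(self.expansion)

    @property
    def expect(self):
        # The next symbol the item is waiting for.
        return self.expansion[self.ptr]

    def advance(self):
        # Move the dot one symbol to the right.
        return DottedItem(self.origin, self.expansion, self.ptr + 1, self.start)

    def __repr__(self):
        before = ' '.join(self.expansion[:self.ptr])
        after = ' '.join(self.expansion[self.ptr:])
        return '%s -> %s * %s (%d)' % (self.origin, before, after, self.start)

item = DottedItem('sum', ['sum', 'PLUS', 'product'], 1, 0)
print(item, item.expect)                      # sum -> sum * PLUS product (0)  PLUS
print(item.advance().advance().is_complete)   # True
```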
- - def __init__(self, rule, ptr, start, tree): - self.rule = rule - self.ptr = ptr - self.start = start - self.tree = tree if tree is not None else Derivation(self.rule) - - @property - def expect(self): - return self.rule.expansion[self.ptr] - - @property - def is_complete(self): - return self.ptr == len(self.rule.expansion) - - def advance(self, tree): - assert self.tree.data == 'drv' - new_tree = Derivation(self.rule, self.tree.children + [tree]) - return self.__class__(self.rule, self.ptr+1, self.start, new_tree) - - def __eq__(self, other): - return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule - - def __hash__(self): - return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ - - def __repr__(self): - before = list(map(str, self.rule.expansion[:self.ptr])) - after = list(map(str, self.rule.expansion[self.ptr:])) - return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) - -class NewsList(list): - "Keeps track of newly added items (append-only)" - - def __init__(self, initial=None): - list.__init__(self, initial or []) - self.last_iter = 0 - - def get_news(self): - i = self.last_iter - self.last_iter = len(self) - return self[i:] - - - -class Column: - "An entry in the table, aka Earley Chart. Contains lists of items." - def __init__(self, i, FIRST, predict_all=False): - self.i = i - self.to_reduce = NewsList() - self.to_predict = NewsList() - self.to_scan = [] - self.item_count = 0 - self.FIRST = FIRST - - self.predicted = set() - self.completed = {} - self.predict_all = predict_all - - def add(self, items): - """Sort items into scan/predict/reduce newslists - - Makes sure only unique items are added. - """ - for item in items: - - item_key = item, item.tree # Elsewhere, tree is not part of the comparison - if item.is_complete: - # XXX Potential bug: What happens if there's ambiguity in an empty rule? - if item.rule.expansion and item_key in self.completed: - old_tree = self.completed[item_key].tree - if old_tree == item.tree: - is_empty = not self.FIRST[item.rule.origin] - if not is_empty: - continue - - if old_tree.data != '_ambig': - new_tree = old_tree.copy() - new_tree.meta.rule = old_tree.meta.rule - old_tree.set('_ambig', [new_tree]) - old_tree.meta.rule = None # No longer a 'drv' node - - if item.tree.children[0] is old_tree: # XXX a little hacky! - raise ParseError("Infinite recursion in grammar! 
(Rule %s)" % item.rule) - - if item.tree not in old_tree.children: - old_tree.children.append(item.tree) - # old_tree.children.append(item.tree) - else: - self.completed[item_key] = item - self.to_reduce.append(item) - else: - if item.expect.is_term: - self.to_scan.append(item) - else: - k = item_key if self.predict_all else item - if k in self.predicted: - continue - self.predicted.add(k) - self.to_predict.append(item) - - self.item_count += 1 # Only count if actually added - - - def __bool__(self): - return bool(self.item_count) - __nonzero__ = __bool__ # Py2 backwards-compatibility +from .earley_common import Item, TransitiveItem +from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor class Parser: - def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True): analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity self.FIRST = analysis.FIRST - self.postprocess = {} + self.NULLABLE = analysis.NULLABLE + self.callbacks = {} self.predictions = {} + + ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than + # the slow 'isupper' in is_terminal. + self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } + self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } + + self.forest_sum_visitor = None for rule in parser_conf.rules: - self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias) + self.callbacks[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias) self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] + ## Detect if any rules have priorities set. If the user specified priority = "none" then + # the priorities will be stripped from all rules before they reach us, allowing us to + # skip the extra tree walk. We'll also skip this if the user just didn't specify priorities + # on any rules. + if self.forest_sum_visitor is None and rule.options and rule.options.priority is not None: + self.forest_sum_visitor = ForestSumVisitor() + + if resolve_ambiguity: + self.forest_tree_visitor = ForestToTreeVisitor(self.callbacks, self.forest_sum_visitor) + else: + self.forest_tree_visitor = ForestToAmbiguousTreeVisitor(self.callbacks, self.forest_sum_visitor) self.term_matcher = term_matcher - def parse(self, stream, start_symbol=None): - # Define parser functions - start_symbol = NonTerminal(start_symbol or self.parser_conf.start) + def predict_and_complete(self, i, to_scan, columns, transitives): + """The core Earley Predictor and Completer. - _Item = Item - match = self.term_matcher + At each stage of the input, we handling any completed items (things + that matched on the last cycle) and use those to predict what should + come next in the input stream. The completions and any predicted + non-terminals are recursively processed until we reach a set of, + which can be added to the scan list for the next scanner cycle.""" + # Held Completions (H in E.Scotts paper). 
+ node_cache = {} + held_completions = {} + + column = columns[i] + # R (items) = Ei (column.items) + items = deque(column) + while items: + item = items.pop() # remove an element, A say, from R + + ### The Earley completer + if item.is_complete: ### (item.s == string) + if item.node is None: + label = (item.s, item.start, i) + item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + item.node.add_family(item.s, item.rule, item.start, None, None) + + # create_leo_transitives(item.rule.origin, item.start) + + ###R Joop Leo right recursion Completer + if item.rule.origin in transitives[item.start]: + transitive = transitives[item.start][item.s] + if transitive.previous in transitives[transitive.column]: + root_transitive = transitives[transitive.column][transitive.previous] + else: + root_transitive = transitive + + new_item = Item(transitive.rule, transitive.ptr, transitive.start) + label = (root_transitive.s, root_transitive.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_path(root_transitive, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + ###R Regular Earley completer + else: + # Empty has 0 length. If we complete an empty symbol in a particular + # parse step, we need to be able to use that same empty symbol to complete + # any predictions that result, that themselves require empty. Avoids + # infinite recursion on empty symbols. + # held_completions is 'H' in E.Scott's paper. + is_empty_item = item.start == i + if is_empty_item: + held_completions[item.rule.origin] = item.node + + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] + for originator in originators: + new_item = originator.advance() + label = (new_item.s, originator.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + + ### The Earley predictor + elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) + new_items = [] + for rule in self.predictions[item.expect]: + new_item = Item(rule, 0, i) + new_items.append(new_item) + + # Process any held completions (H). 
+ if item.expect in held_completions: + new_item = item.advance() + label = (new_item.s, item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) + new_items.append(new_item) + + for new_item in new_items: + if new_item.expect in self.TERMINALS: + to_scan.add(new_item) + elif new_item not in column: + column.add(new_item) + items.append(new_item) - def predict(nonterm, column): - assert not nonterm.is_term, nonterm - return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] - - def complete(item): - name = item.rule.origin - return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] + def _parse(self, stream, columns, to_scan, start_symbol=None): + def is_quasi_complete(item): + if item.is_complete: + return True - def predict_and_complete(column): + quasi = item.advance() + while not quasi.is_complete: + if quasi.expect not in self.NULLABLE: + return False + if quasi.rule.origin == start_symbol and quasi.expect == start_symbol: + return False + quasi = quasi.advance() + return True + + def create_leo_transitives(origin, start): + visited = set() + to_create = [] + trule = None + previous = None + + ### Recursively walk backwards through the Earley sets until we find the + # first transitive candidate. If this is done continuously, we shouldn't + # have to walk more than 1 hop. while True: - to_predict = {x.expect for x in column.to_predict.get_news() - if x.ptr} # if not part of an already predicted batch - to_reduce = set(column.to_reduce.get_news()) - if not (to_predict or to_reduce): + if origin in transitives[start]: + previous = trule = transitives[start][origin] break - for nonterm in to_predict: - column.add( predict(nonterm, column) ) + is_empty_rule = not self.FIRST[origin] + if is_empty_rule: + break - for item in to_reduce: - new_items = list(complete(item)) - if item in new_items: - raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) - column.add(new_items) - - def scan(i, token, column): - next_set = Column(i, self.FIRST) - next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) - - if not next_set: - expect = {i.expect.name for i in column.to_scan} - raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan)) - - return next_set - - # Main loop starts - column0 = Column(0, self.FIRST) - column0.add(predict(start_symbol, column0)) - - column = column0 - for i, token in enumerate(stream): - predict_and_complete(column) - column = scan(i, token, column) - - predict_and_complete(column) - - # Parse ended. 
Now build a parse tree - solutions = [n.tree for n in column.to_reduce - if n.rule.origin==start_symbol and n.start is column0] + candidates = [ candidate for candidate in columns[start] if candidate.expect is not None and origin == candidate.expect ] + if len(candidates) != 1: + break + originator = next(iter(candidates)) - if not solutions: - raise ParseError('Incomplete parse: Could not find a solution to input') - elif len(solutions) == 1: - tree = solutions[0] - else: - tree = Tree('_ambig', solutions) + if originator is None or originator in visited: + break - if self.resolve_ambiguity: - tree = self.resolve_ambiguity(tree) + visited.add(originator) + if not is_quasi_complete(originator): + break + + trule = originator.advance() + if originator.start != start: + visited.clear() + + to_create.append((origin, start, originator)) + origin = originator.rule.origin + start = originator.start + + # If a suitable Transitive candidate is not found, bail. + if trule is None: + return + + #### Now walk forwards and create Transitive Items in each set we walked through; and link + # each transitive item to the next set forwards. + while to_create: + origin, start, originator = to_create.pop() + titem = None + if previous is not None: + titem = previous.next_titem = TransitiveItem(origin, trule, originator, previous.column) + else: + titem = TransitiveItem(origin, trule, originator, start) + previous = transitives[start][origin] = titem + + + + def scan(i, token, to_scan): + """The core Earley Scanner. + + This is a custom implementation of the scanner that uses the + Lark lexer to match tokens. The scan list is built by the + Earley predictor, based on the previously completed tokens. + This ensures that at each phase of the parse we have a custom + lexer context, allowing for more complex ambiguities.""" + next_to_scan = set() + next_set = set() + columns.append(next_set) + transitives.append({}) + node_cache = {} + + for item in set(to_scan): + if match(item.expect, token): + new_item = item.advance() + label = (new_item.s, new_item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) + + if not next_set and not next_to_scan: + expect = {i.expect.name for i in to_scan} + raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) + + return next_to_scan + + + # Define parser functions + match = self.term_matcher + + # Cache for nodes & tokens created in a particular parse step. + transitives = [{}] + + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. + i = 0 + for token in stream: + self.predict_and_complete(i, to_scan, columns, transitives) + + to_scan = scan(i, token, to_scan) + i += 1 + + self.predict_and_complete(i, to_scan, columns, transitives) + + ## Column is now the final column in the parse. + assert i == len(columns)-1 + + def parse(self, stream, start_symbol=None): + start_symbol = NonTerminal(start_symbol or self.parser_conf.start) + + columns = [set()] + to_scan = set() # The scan buffer. 'Q' in E.Scott's paper. 
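To make the predict/complete/scan cycle that `parse()` drives here concrete, below is a deliberately simplified, self-contained Earley recognizer: no SPPF construction, no Leo right-recursion optimization, and a toy hard-coded grammar. It only answers whether the input is derivable, which is the skeleton the production code above builds on.

```python
# Grammar symbols are plain strings; names in TERMINALS are matched against token types.
GRAMMAR = {
    'start': [['sum']],
    'sum':   [['sum', 'PLUS', 'NUM'], ['NUM']],
}
TERMINALS = {'PLUS', 'NUM'}

def recognize(tokens, start='start'):
    # An item is (origin, expansion, dot, start_set), the same data as an Earley Item.
    columns = [set() for _ in range(len(tokens) + 1)]
    columns[0] = {(start, tuple(e), 0, 0) for e in GRAMMAR[start]}

    for i, column in enumerate(columns):
        changed = True
        while changed:                              # run predictor/completer to a fixed point
            changed = False
            for origin, exp, dot, s in list(column):
                if dot < len(exp) and exp[dot] not in TERMINALS:      # predict
                    for new_exp in GRAMMAR[exp[dot]]:
                        new_item = (exp[dot], tuple(new_exp), 0, i)
                        if new_item not in column:
                            column.add(new_item)
                            changed = True
                elif dot == len(exp):                                  # complete
                    for o2, e2, d2, s2 in list(columns[s]):
                        if d2 < len(e2) and e2[d2] == origin:
                            new_item = (o2, e2, d2 + 1, s2)
                            if new_item not in column:
                                column.add(new_item)
                                changed = True
        if i < len(tokens):                                            # scan
            for origin, exp, dot, s in column:
                if dot < len(exp) and exp[dot] == tokens[i]:
                    columns[i + 1].add((origin, exp, dot + 1, s))

    return any(o == start and d == len(e) and s == 0 for o, e, d, s in columns[-1])

print(recognize(['NUM', 'PLUS', 'NUM']))   # True
print(recognize(['NUM', 'PLUS']))          # False
```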
+ + ## Predict for the start_symbol. + # Add predicted items to the first Earley set (for the predictor) if they + # result in a non-terminal, or the scanner if they result in a terminal. + for rule in self.predictions[start_symbol]: + item = Item(rule, 0, 0) + if item.expect in self.TERMINALS: + to_scan.add(item) + else: + columns[0].add(item) + + self._parse(stream, columns, to_scan, start_symbol) + + # If the parse was successful, the start + # symbol should have been completed in the last step of the Earley cycle, and will be in + # this column. Find the item for the start_symbol, which is the root of the SPPF tree. + solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] + + if not solutions: + expected_tokens = [t.expect for t in to_scan] + # raise ParseError('Incomplete parse: Could not find a solution to input') + raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) + elif len(solutions) > 1: + assert False, 'Earley should not generate multiple start symbol items!' - return ApplyCallbacks(self.postprocess).transform(tree) + # Perform our SPPF -> AST conversion using the right ForestVisitor. + return self.forest_tree_visitor.visit(solutions[0]) class ApplyCallbacks(Transformer_InPlace): diff -Nru lark-parser-0.6.2/lark/parsers/grammar_analysis.py lark-parser-0.7.0/lark/parsers/grammar_analysis.py --- lark-parser-0.6.2/lark/parsers/grammar_analysis.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/grammar_analysis.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,3 +1,4 @@ +from collections import Counter from ..utils import bfs, fzset, classify from ..exceptions import GrammarError @@ -14,9 +15,9 @@ self.index = index def __repr__(self): - before = self.rule.expansion[:self.index] - after = self.rule.expansion[self.index:] - return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after)) + before = [x.name for x in self.rule.expansion[:self.index]] + after = [x.name for x in self.rule.expansion[self.index:]] + return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after)) @property def next(self): @@ -92,7 +93,7 @@ for rule in rules: for i, sym in enumerate(rule.expansion): - if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE: + if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE: if update_set(FOLLOW[sym], FOLLOW[rule.origin]): changed = True @@ -111,7 +112,10 @@ rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])] self.rules_by_origin = classify(rules, lambda r: r.origin) - assert len(rules) == len(set(rules)) + if len(rules) != len(set(rules)): + duplicates = [item for item, count in Counter(rules).items() if count > 1] + raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates)) + for r in rules: for sym in r.expansion: if not (sym.is_term or sym in self.rules_by_origin): diff -Nru lark-parser-0.6.2/lark/parsers/lalr_analysis.py lark-parser-0.7.0/lark/parsers/lalr_analysis.py --- lark-parser-0.6.2/lark/parsers/lalr_analysis.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/lalr_analysis.py 2019-03-28 13:53:28.000000000 +0000 @@ -82,7 +82,9 @@ for k, v in lookahead.items(): if len(v) > 1: if self.debug: - logging.warn("Shift/reduce conflict for %s: %s. 
Resolving as shift.", k, v) + logging.warn("Shift/reduce conflict for terminal %s: (resolving as shift)", k.name) + for act, arg in v: + logging.warn(' * %s: %s', act, arg) for x in v: # XXX resolving shift/reduce into shift, like PLY # Give a proper warning diff -Nru lark-parser-0.6.2/lark/parsers/lalr_parser.py lark-parser-0.7.0/lark/parsers/lalr_parser.py --- lark-parser-0.6.2/lark/parsers/lalr_parser.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/lalr_parser.py 2019-03-28 13:53:28.000000000 +0000 @@ -3,14 +3,15 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com from ..exceptions import UnexpectedToken +from ..lexer import Token from .lalr_analysis import LALR_Analyzer, Shift class Parser: - def __init__(self, parser_conf): + def __init__(self, parser_conf, debug=False): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" - analysis = LALR_Analyzer(parser_conf) + analysis = LALR_Analyzer(parser_conf, debug=debug) analysis.compute_lookahead() callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) for rule in parser_conf.rules} @@ -30,7 +31,6 @@ self.callbacks = callbacks def parse(self, seq, set_state=None): - i = 0 token = None stream = iter(seq) states = self.states @@ -40,13 +40,13 @@ if set_state: set_state(self.start_state) - def get_action(key): + def get_action(token): state = state_stack[-1] try: - return states[state][key] + return states[state][token.type] except KeyError: - expected = states[state].keys() - raise UnexpectedToken(token, expected, state=state) # TODO filter out rules from expected + expected = [s for s in states[state].keys() if s.isupper()] + raise UnexpectedToken(token, expected, state=state) def reduce(rule): size = len(rule.expansion) @@ -59,15 +59,15 @@ value = self.callbacks[rule](s) - _action, new_state = get_action(rule.origin.name) + _action, new_state = states[state_stack[-1]][rule.origin.name] assert _action is Shift state_stack.append(new_state) value_stack.append(value) # Main LALR-parser loop - for i, token in enumerate(stream): + for token in stream: while True: - action, arg = get_action(token.type) + action, arg = get_action(token) assert arg != self.end_state if action is Shift: @@ -78,8 +78,9 @@ else: reduce(arg) + token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: - _action, arg = get_action('$END') + _action, arg = get_action(token) if _action is Shift: assert arg == self.end_state val ,= value_stack diff -Nru lark-parser-0.6.2/lark/parsers/resolve_ambig.py lark-parser-0.7.0/lark/parsers/resolve_ambig.py --- lark-parser-0.6.2/lark/parsers/resolve_ambig.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/resolve_ambig.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,109 +0,0 @@ -from ..utils import compare -from functools import cmp_to_key - -from ..tree import Tree - - -# Standard ambiguity resolver (uses comparison) -# -# Author: Erez Sh - -def _compare_rules(rule1, rule2): - return -compare( len(rule1.expansion), len(rule2.expansion)) - -def _sum_priority(tree): - p = 0 - - for n in tree.iter_subtrees(): - try: - p += n.meta.rule.options.priority or 0 - except AttributeError: - pass - - return p - -def _compare_priority(tree1, tree2): - tree1.iter_subtrees() - -def _compare_drv(tree1, tree2): - try: - rule1 = tree1.meta.rule - except AttributeError: - rule1 = None - - try: - rule2 = tree2.meta.rule - except AttributeError: - rule2 = 
None - - if None == rule1 == rule2: - return compare(tree1, tree2) - elif rule1 is None: - return -1 - elif rule2 is None: - return 1 - - assert tree1.data != '_ambig' - assert tree2.data != '_ambig' - - p1 = _sum_priority(tree1) - p2 = _sum_priority(tree2) - c = (p1 or p2) and compare(p1, p2) - if c: - return c - - c = _compare_rules(tree1.meta.rule, tree2.meta.rule) - if c: - return c - - # rules are "equal", so compare trees - if len(tree1.children) == len(tree2.children): - for t1, t2 in zip(tree1.children, tree2.children): - c = _compare_drv(t1, t2) - if c: - return c - - return compare(len(tree1.children), len(tree2.children)) - - -def _standard_resolve_ambig(tree): - assert tree.data == '_ambig' - key_f = cmp_to_key(_compare_drv) - best = max(tree.children, key=key_f) - assert best.data == 'drv' - tree.set('drv', best.children) - tree.meta.rule = best.meta.rule # needed for applying callbacks - -def standard_resolve_ambig(tree): - for ambig in tree.find_data('_ambig'): - _standard_resolve_ambig(ambig) - - return tree - - - - -# Anti-score Sum -# -# Author: Uriva (https://github.com/uriva) - -def _antiscore_sum_drv(tree): - if not isinstance(tree, Tree): - return 0 - - assert tree.data != '_ambig' - - return _sum_priority(tree) - -def _antiscore_sum_resolve_ambig(tree): - assert tree.data == '_ambig' - best = min(tree.children, key=_antiscore_sum_drv) - assert best.data == 'drv' - tree.set('drv', best.children) - tree.meta.rule = best.meta.rule # needed for applying callbacks - -def antiscore_sum_resolve_ambig(tree): - for ambig in tree.find_data('_ambig'): - _antiscore_sum_resolve_ambig(ambig) - - return tree diff -Nru lark-parser-0.6.2/lark/parsers/xearley.py lark-parser-0.7.0/lark/parsers/xearley.py --- lark-parser-0.6.2/lark/parsers/xearley.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/lark/parsers/xearley.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,107 +1,58 @@ -"This module implements an experimental Earley Parser with a dynamic lexer" +"""This module implements an experimental Earley parser with a dynamic lexer -# The parser uses a parse-forest to keep track of derivations and ambiguations. -# When the parse ends successfully, a disambiguation stage resolves all ambiguity -# (right now ambiguity resolution is not developed beyond the needs of lark) -# Afterwards the parse tree is reduced (transformed) according to user callbacks. -# I use the no-recursion version of Transformer and Visitor, because the tree might be -# deeper than Python's recursion limit (a bit absurd, but that's life) -# -# The algorithm keeps track of each state set, using a corresponding Column instance. -# Column keeps track of new items using NewsList instances. -# -# Instead of running a lexer beforehand, or using a costy char-by-char method, this parser -# uses regular expressions by necessity, achieving high-performance while maintaining all of -# Earley's power in parsing any CFG. -# -# -# Author: Erez Shinan (2017) -# Email : erezshin@gmail.com +The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: + https://www.sciencedirect.com/science/article/pii/S1571066108001497 + +That is probably the best reference for understanding the algorithm here. + +The Earley parser outputs an SPPF-tree as per that document. 
The SPPF tree format +is better documented here: + http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ + +Instead of running a lexer beforehand, or using a costy char-by-char method, this parser +uses regular expressions by necessity, achieving high-performance while maintaining all of +Earley's power in parsing any CFG. +""" from collections import defaultdict -from ..exceptions import ParseError, UnexpectedCharacters +from ..exceptions import UnexpectedCharacters from ..lexer import Token -from ..tree import Tree -from .grammar_analysis import GrammarAnalyzer -from ..grammar import NonTerminal, Terminal - -from .earley import ApplyCallbacks, Item, Column +from ..grammar import Terminal +from .earley import Parser as BaseParser +from .earley_forest import SymbolNode -class Parser: - def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False): - self.analysis = GrammarAnalyzer(parser_conf) - self.parser_conf = parser_conf - self.resolve_ambiguity = resolve_ambiguity +class Parser(BaseParser): + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False): + BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity) self.ignore = [Terminal(t) for t in ignore] - self.predict_all = predict_all self.complete_lex = complete_lex - self.FIRST = self.analysis.FIRST - self.postprocess = {} - self.predictions = {} - for rule in parser_conf.rules: - self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) - self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - - self.term_matcher = term_matcher - - - def parse(self, stream, start_symbol=None): - # Define parser functions - start_symbol = NonTerminal(start_symbol or self.parser_conf.start) - delayed_matches = defaultdict(list) - match = self.term_matcher - - text_line = 1 - text_column = 1 + def _parse(self, stream, columns, to_scan, start_symbol=None): - def predict(nonterm, column): - assert not nonterm.is_term, nonterm - return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] - - def complete(item): - name = item.rule.origin - return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] - - def predict_and_complete(column): - while True: - to_predict = {x.expect for x in column.to_predict.get_news() - if x.ptr} # if not part of an already predicted batch - to_reduce = column.to_reduce.get_news() - if not (to_predict or to_reduce): - break - - for nonterm in to_predict: - column.add( predict(nonterm, column) ) - for item in to_reduce: - new_items = list(complete(item)) - if item in new_items: - raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) - column.add(new_items) + def scan(i, to_scan): + """The core Earley Scanner. - def scan(i, column): - to_scan = column.to_scan - - for x in self.ignore: - m = match(x, stream, i) - if m: - delayed_matches[m.end()] += set(to_scan) - delayed_matches[m.end()] += set(column.to_reduce) - - # TODO add partial matches for ignore too? - # s = m.group(0) - # for j in range(1, len(s)): - # m = x.match(s[:-j]) - # if m: - # delayed_matches[m.end()] += to_scan - - for item in to_scan: + This is a custom implementation of the scanner that uses the + Lark lexer to match tokens. The scan list is built by the + Earley predictor, based on the previously completed tokens. 
+ This ensures that at each phase of the parse we have a custom + lexer context, allowing for more complex ambiguities.""" + + node_cache = {} + + # 1) Loop the expectations and ask the lexer to match. + # Since regexp is forward looking on the input stream, and we only + # want to process tokens when we hit the point in the stream at which + # they complete, we push all tokens into a buffer (delayed_matches), to + # be held possibly for a later parse step when we reach the point in the + # input stream at which they complete. + for item in set(to_scan): m = match(item.expect, stream, i) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[m.end()].append(item.advance(t)) + delayed_matches[m.end()].append( (item, i, t) ) if self.complete_lex: s = m.group(0) @@ -109,50 +60,90 @@ m = match(item.expect, s[:-j]) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[i+m.end()].append(item.advance(t)) + delayed_matches[i+m.end()].append( (item, i, t) ) + + # Remove any items that successfully matched in this pass from the to_scan buffer. + # This ensures we don't carry over tokens that already matched, if we're ignoring below. + to_scan.remove(item) + + # 3) Process any ignores. This is typically used for e.g. whitespace. + # We carry over any unmatched items from the to_scan buffer to be matched again after + # the ignore. This should allow us to use ignored symbols in non-terminals to implement + # e.g. mandatory spacing. + for x in self.ignore: + m = match(x, stream, i) + if m: + # Carry over any items still in the scan buffer, to past the end of the ignored items. + delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ]) + + # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed. + delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol]) + + next_to_scan = set() + next_set = set() + columns.append(next_set) + transitives.append({}) + + ## 4) Process Tokens from delayed_matches. + # This is the core of the Earley scanner. Create an SPPF node for each Token, + # and create the symbol node in the SPPF tree. Advance the item that completed, + # and add the resulting new item to either the Earley set (for processing by the + # completer/predictor) or the to_scan buffer for the next parse step. 
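This scanner is what backs Lark's dynamic lexing mode, where terminals are matched as regular expressions directly against the input at each parse step instead of by a separate lexer pass. Assuming the usual public option names (lexer="dynamic", with "dynamic_complete" mapping to the XEarley_CompleteLex frontend shown earlier in this patch), a short usage sketch:

```python
from lark import Lark

grammar = r"""
    start: NAME "is" NAME
    NAME: /[a-z]+/
    %ignore " "
"""
# The dynamic lexer matches NAME and the "is" keyword against the raw text,
# buffering candidate tokens in delayed_matches until the parse reaches the
# position at which they complete.
parser = Lark(grammar, parser="earley", lexer="dynamic")
print(parser.parse("alice is bob").pretty())
```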
+ for item, start, token in delayed_matches[i+1]: + if token is not None: + token.end_line = text_line + token.end_column = text_column + 1 + + new_item = item.advance() + label = (new_item.s, new_item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + else: + new_item = item + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) - next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) - next_set.add(delayed_matches[i+1]) del delayed_matches[i+1] # No longer needed, so unburden memory - if not next_set and not delayed_matches: + if not next_set and not delayed_matches and not next_to_scan: raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) - return next_set + return next_to_scan - # Main loop starts - column0 = Column(0, self.FIRST, predict_all=self.predict_all) - column0.add(predict(start_symbol, column0)) - - column = column0 - for i, token in enumerate(stream): - predict_and_complete(column) - column = scan(i, column) - - if token == '\n': - text_line += 1 - text_column = 1 - else: - text_column += 1 - predict_and_complete(column) + delayed_matches = defaultdict(list) + match = self.term_matcher - # Parse ended. Now build a parse tree - solutions = [n.tree for n in column.to_reduce - if n.rule.origin==start_symbol and n.start is column0] + # Cache for nodes & tokens created in a particular parse step. + transitives = [{}] - if not solutions: - expected_tokens = [t.expect for t in column.to_scan] - raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) + text_line = 1 + text_column = 1 - elif len(solutions) == 1: - tree = solutions[0] - else: - tree = Tree('_ambig', solutions) + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. + i = 0 + for token in stream: + self.predict_and_complete(i, to_scan, columns, transitives) - if self.resolve_ambiguity: - tree = self.resolve_ambiguity(tree) + to_scan = scan(i, to_scan) - return ApplyCallbacks(self.postprocess).transform(tree) + if token == '\n': + text_line += 1 + text_column = 1 + else: + text_column += 1 + i += 1 + self.predict_and_complete(i, to_scan, columns, transitives) + ## Column is now the final column in the parse. 
+ assert i == len(columns)-1 \ No newline at end of file diff -Nru lark-parser-0.6.2/lark/parse_tree_builder.py lark-parser-0.7.0/lark/parse_tree_builder.py --- lark-parser-0.6.2/lark/parse_tree_builder.py 2018-07-18 13:03:24.000000000 +0000 +++ lark-parser-0.7.0/lark/parse_tree_builder.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,12 +1,11 @@ from .exceptions import GrammarError -from .utils import suppress from .lexer import Token -from .grammar import Rule from .tree import Tree from .visitors import InlineTransformer # XXX Deprecated ###{standalone from functools import partial, wraps +from itertools import repeat, product class ExpandSingleChild: @@ -19,7 +18,6 @@ else: return self.node_builder(children) - class PropagatePositions: def __init__(self, node_builder): self.node_builder = node_builder @@ -27,43 +25,58 @@ def __call__(self, children): res = self.node_builder(children) - if children and isinstance(res, Tree): - for a in children: - if isinstance(a, Tree): - res.meta.line = a.meta.line - res.meta.column = a.meta.column - elif isinstance(a, Token): - res.meta.line = a.line - res.meta.column = a.column - break - - for a in reversed(children): - # with suppress(AttributeError): - if isinstance(a, Tree): - res.meta.end_line = a.meta.end_line - res.meta.end_column = a.meta.end_column - elif isinstance(a, Token): - res.meta.end_line = a.end_line - res.meta.end_column = a.end_column - - break + if isinstance(res, Tree): + for c in children: + if isinstance(c, Tree) and c.children and not c.meta.empty: + res.meta.line = c.meta.line + res.meta.column = c.meta.column + res.meta.start_pos = c.meta.start_pos + res.meta.empty = False + break + elif isinstance(c, Token): + res.meta.line = c.line + res.meta.column = c.column + res.meta.start_pos = c.pos_in_stream + res.meta.empty = False + break + + for c in reversed(children): + if isinstance(c, Tree) and c.children and not c.meta.empty: + res.meta.end_line = c.meta.end_line + res.meta.end_column = c.meta.end_column + res.meta.end_pos = c.meta.end_pos + res.meta.empty = False + break + elif isinstance(c, Token): + res.meta.end_line = c.end_line + res.meta.end_column = c.end_column + res.meta.end_pos = c.pos_in_stream + len(c.value) + res.meta.empty = False + break return res class ChildFilter: - def __init__(self, to_include, node_builder): + def __init__(self, to_include, append_none, node_builder): self.node_builder = node_builder self.to_include = to_include + self.append_none = append_none def __call__(self, children): filtered = [] - for i, to_expand in self.to_include: + + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none if to_expand: filtered += children[i].children else: filtered.append(children[i]) + if self.append_none: + filtered += [None] * self.append_none + return self.node_builder(filtered) class ChildFilterLALR(ChildFilter): @@ -71,7 +84,9 @@ def __call__(self, children): filtered = [] - for i, to_expand in self.to_include: + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none if to_expand: if filtered: filtered += children[i].children @@ -80,37 +95,115 @@ else: filtered.append(children[i]) + if self.append_none: + filtered += [None] * self.append_none + + return self.node_builder(filtered) + +class ChildFilterLALR_NoPlaceholders(ChildFilter): + "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" + def __init__(self, to_include, node_builder): + self.node_builder = node_builder + 
self.to_include = to_include + + def __call__(self, children): + filtered = [] + for i, to_expand in self.to_include: + if to_expand: + if filtered: + filtered += children[i].children + else: # Optimize for left-recursion + filtered = children[i].children + else: + filtered.append(children[i]) return self.node_builder(filtered) def _should_expand(sym): return not sym.is_term and sym.name.startswith('_') -def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous): - to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) - if keep_all_tokens or not (sym.is_term and sym.filter_out)] +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices): + # Prepare empty_indices as: How many Nones to insert at each index? + if _empty_indices: + assert _empty_indices.count(False) == len(expansion) + s = ''.join(str(int(b)) for b in _empty_indices) + empty_indices = [len(ones) for ones in s.split('0')] + assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion)) + else: + empty_indices = [0] * (len(expansion)+1) + + to_include = [] + nones_to_add = 0 + for i, sym in enumerate(expansion): + nones_to_add += empty_indices[i] + if keep_all_tokens or not (sym.is_term and sym.filter_out): + to_include.append((i, _should_expand(sym), nones_to_add)) + nones_to_add = 0 + + nones_to_add += empty_indices[len(expansion)] + + if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include): + if _empty_indices or ambiguous: + return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add) + else: + # LALR without placeholders + return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include]) + +class AmbiguousExpander: + """Deal with the case where we're expanding children ('_rule') into a parent but the children + are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself + ambiguous with as many copies as there are ambiguous children, and then copy the ambiguous children + into the right parents in the right places, essentially shifting the ambiguity up the tree.""" + def __init__(self, to_expand, tree_class, node_builder): + self.node_builder = node_builder + self.tree_class = tree_class + self.to_expand = to_expand + + def __call__(self, children): + def _is_ambig_tree(child): + return hasattr(child, 'data') and child.data == '_ambig' - if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): - return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include) + #### When we're repeatedly expanding ambiguities we can end up with nested ambiguities. + # All children of an _ambig node should be a derivation of that ambig node, hence + # it is safe to assume that if we see an _ambig node nested within an ambig node + # it is safe to simply expand it into the parent _ambig node as an alternative derivation. 
+ ambiguous = [] + for i, child in enumerate(children): + if _is_ambig_tree(child): + if i in self.to_expand: + ambiguous.append(i) + to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)] + child.expand_kids_by_index(*to_expand) + + if not ambiguous: + return self.node_builder(children) + + expand = [ iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children) ] + return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))]) + +def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): + to_expand = [i for i, sym in enumerate(expansion) + if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))] + if to_expand: + return partial(AmbiguousExpander, to_expand, tree_class) class Callback(object): pass -def inline_args(func): +def ptb_inline_args(func): @wraps(func) def f(children): return func(*children) return f - - class ParseTreeBuilder: - def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False): + def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): self.tree_class = tree_class self.propagate_positions = propagate_positions self.always_keep_all_tokens = keep_all_tokens self.ambiguous = ambiguous + self.maybe_placeholders = maybe_placeholders self.rule_builders = list(self._init_builders(rules)) @@ -124,8 +217,9 @@ wrapper_chain = filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None), self.propagate_positions and PropagatePositions, + self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), ]) yield rule, wrapper_chain @@ -134,8 +228,10 @@ def create_callback(self, transformer=None): callback = Callback() + i = 0 for rule, wrapper_chain in self.rule_builders: - internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(x.name for x in rule.expansion)) + internal_callback_name = '_cb%d_%s' % (i, rule.origin) + i += 1 user_callback_name = rule.alias or rule.origin.name try: @@ -143,7 +239,7 @@ assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer" # XXX InlineTransformer is deprecated! 
if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer): - f = inline_args(f) + f = ptb_inline_args(f) except AttributeError: f = partial(self.tree_class, user_callback_name) diff -Nru lark-parser-0.6.2/lark/reconstruct2.py lark-parser-0.7.0/lark/reconstruct2.py --- lark-parser-0.6.2/lark/reconstruct2.py 2018-02-10 09:22:44.000000000 +0000 +++ lark-parser-0.7.0/lark/reconstruct2.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,107 +0,0 @@ -import re -from collections import defaultdict - -from .tree import Tree -from .common import is_terminal, ParserConf, PatternStr, Terminal -from .lexer import Token -from .parsers import earley - - - -def is_discarded_terminal(t): - return is_terminal(t) and t.startswith('_') - -def is_iter_empty(i): - try: - _ = next(i) - return False - except StopIteration: - return True - -class Reconstructor: - def __init__(self, parser): - # Recreate the rules to assume a standard lexer - _tokens, rules, _grammar_extra = parser.grammar.compile(lexer='standard', start='whatever') - tokens = {t.name:t for t in _tokens} - - token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in _tokens} - - class MatchTerminal(Terminal): - def match(self, other): - if isinstance(other, Tree): - return False - return token_res[self.data].match(other) is not None - - class MatchTree(Terminal): - def match(self, other): - try: - return self.data == other.data - except AttributeError: - return False - - class WriteTokens: - def __init__(self, name, expansion): - self.name = name - self.expansion = expansion - - def f(self, args): - args2 = iter(args) - to_write = [] - for sym in self.expansion: - if is_discarded_terminal(sym): - t = tokens[sym] - assert isinstance(t.pattern, PatternStr) - to_write.append(t.pattern.value) - else: - x = next(args2) - if isinstance(x, list): - to_write += x - else: - if isinstance(x, Token): - assert x.type == sym, x - else: - assert x.data == sym, x - to_write.append(x) - - assert is_iter_empty(args2) - - return to_write - - d = defaultdict(list) - for name, (expansions, _o) in rules.items(): - for expansion, alias in expansions: - if alias: - d[alias].append(expansion) - d[name].append([alias]) - else: - d[name].append(expansion) - - rules = [] - expand1s = {name for name, (_x, options) in parser.rules.items() - if options and options.expand1} - - for name, expansions in d.items(): - for expansion in expansions: - reduced = [sym if sym.startswith('_') or sym in expand1s else - MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym) - for sym in expansion if not is_discarded_terminal(sym)] - - rules.append((name, reduced, WriteTokens(name, expansion).f, None)) - self.rules = rules - - - def _reconstruct(self, tree): - # TODO: ambiguity? 
- parser = earley.Parser(self.rules, tree.data, {}) - res = parser.parse(tree.children) - for item in res: - if isinstance(item, Tree): - for x in self._reconstruct(item): - yield x - else: - yield item - - def reconstruct(self, tree): - return ''.join(self._reconstruct(tree)) - - diff -Nru lark-parser-0.6.2/lark/reconstruct.py lark-parser-0.7.0/lark/reconstruct.py --- lark-parser-0.6.2/lark/reconstruct.py 2018-06-27 13:54:54.000000000 +0000 +++ lark-parser-0.7.0/lark/reconstruct.py 2019-03-28 13:53:28.000000000 +0000 @@ -2,9 +2,9 @@ from .tree import Tree from .visitors import Transformer_InPlace -from .common import ParserConf, PatternStr -from .lexer import Token -from .parsers import earley, resolve_ambig +from .common import ParserConf +from .lexer import Token, PatternStr +from .parsers import earley from .grammar import Rule, Terminal, NonTerminal @@ -34,7 +34,8 @@ for sym in meta.orig_expansion: if is_discarded_terminal(sym): t = self.tokens[sym.name] - assert isinstance(t.pattern, PatternStr) + if not isinstance(t.pattern, PatternStr): + raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t) to_write.append(t.pattern.value) else: x = next(iter_args) @@ -67,39 +68,40 @@ class Reconstructor: def __init__(self, parser): - # Recreate the rules to assume a standard lexer - _tokens, rules, _grammar_extra = parser.grammar.compile() + # XXX TODO calling compile twice returns different results! + tokens, rules, _grammar_extra = parser.grammar.compile() - expand1s = {r.origin for r in parser.rules if r.options and r.options.expand1} + self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}) + self.rules = list(self._build_recons_rules(rules)) - d = defaultdict(list) + def _build_recons_rules(self, rules): + expand1s = {r.origin for r in rules if r.options and r.options.expand1} + + aliases = defaultdict(list) for r in rules: - # Rules can match their alias if r.alias: - alias = NonTerminal(r.alias) - d[alias].append(r.expansion) - d[r.origin].append([alias]) - else: - d[r.origin].append(r.expansion) + aliases[r.origin].append( r.alias ) - # Expanded rules can match their own terminal - for sym in r.expansion: - if sym in expand1s: - d[sym].append([Terminal(sym.name)]) + rule_names = {r.origin for r in rules} + nonterminals = {sym for sym in rule_names + if sym.name.startswith('_') or sym in expand1s or sym in aliases } - reduced_rules = defaultdict(list) - for name, expansions in d.items(): - for expansion in expansions: - reduced = [sym if sym.name.startswith('_') or sym in expand1s else Terminal(sym.name) - for sym in expansion if not is_discarded_terminal(sym)] + for r in rules: + recons_exp = [sym if sym in nonterminals else Terminal(sym.name) + for sym in r.expansion if not is_discarded_terminal(sym)] - reduced_rules[name, tuple(reduced)].append(expansion) + # Skip self-recursive constructs + if recons_exp == [r.origin]: + continue - self.rules = [Rule(name, list(reduced), MakeMatchTree(name.name, expansions[0]), None) - for (name, reduced), expansions in reduced_rules.items()] + sym = NonTerminal(r.alias) if r.alias else r.origin - self.write_tokens = WriteTokensTransformer({t.name:t for t in _tokens}) + yield Rule(sym, recons_exp, alias=MakeMatchTree(sym.name, r.expansion)) + for origin, rule_aliases in aliases.items(): + for alias in rule_aliases: + yield Rule(origin, [Terminal(alias)], alias=MakeMatchTree(origin.name, [NonTerminal(alias)])) + yield Rule(origin, [Terminal(origin.name)], alias=MakeMatchTree(origin.name, [origin])) def _match(self, 
term, token): if isinstance(token, Tree): @@ -110,7 +112,7 @@ def _reconstruct(self, tree): # TODO: ambiguity? - parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=resolve_ambig.standard_resolve_ambig) + parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=True) unreduced_tree = parser.parse(tree.children) # find a full derivation assert unreduced_tree.data == tree.data res = self.write_tokens.transform(unreduced_tree) @@ -123,4 +125,3 @@ def reconstruct(self, tree): return ''.join(self._reconstruct(tree)) - diff -Nru lark-parser-0.6.2/lark/tools/nearley.py lark-parser-0.7.0/lark/tools/nearley.py --- lark-parser-0.6.2/lark/tools/nearley.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/tools/nearley.py 2019-03-28 13:53:28.000000000 +0000 @@ -20,20 +20,21 @@ ?expr: item [":" /[+*?]/] - ?item: rule|string|regexp + ?item: rule|string|regexp|null | "(" expansions ")" rule: NAME string: STRING regexp: REGEXP + null: "null" JS: /{%.*?%}/s js: JS? NAME: /[a-zA-Z_$]\w*/ COMMENT: /#[^\n]*/ REGEXP: /\[.*?\]/ - STRING: /".*?"/ + %import common.ESCAPED_STRING -> STRING %import common.WS %ignore WS %ignore COMMENT @@ -83,6 +84,9 @@ def regexp(self, r): return '/%s/' % r + def null(self): + return '' + def string(self, s): return self._extra_rule(s) diff -Nru lark-parser-0.6.2/lark/tools/standalone.py lark-parser-0.7.0/lark/tools/standalone.py --- lark-parser-0.6.2/lark/tools/standalone.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/lark/tools/standalone.py 2019-03-28 13:53:28.000000000 +0000 @@ -18,6 +18,9 @@ # If you wish to purchase a commercial license for this tool and its # generated code, contact me via email. # +# If GPL is incompatible with your free or open-source project, +# contact me and we'll work it out (for free). 
+# # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or @@ -42,9 +45,7 @@ import lark from lark import Lark -from lark.parsers.lalr_analysis import Shift, Reduce - -from ..grammar import Rule +from lark.parsers.lalr_analysis import Reduce _dir = path.dirname(__file__) _larkdir = path.join(_dir, path.pardir) @@ -52,9 +53,10 @@ EXTRACT_STANDALONE_FILES = [ 'tools/standalone.py', + 'exceptions.py', 'utils.py', - 'common.py', 'tree.py', + 'visitors.py', 'indenter.py', 'lexer.py', 'parse_tree_builder.py', @@ -81,33 +83,72 @@ return {name:''.join(text) for name, text in sections.items()} -class LexerAtoms: +def _prepare_mres(mres): + return [(p.pattern,{i: t for i, t in d.items()}) for p,d in mres] + +class TraditionalLexerAtoms: def __init__(self, lexer): - self.mres = [(p.pattern,d) for p,d in lexer.mres] + self.mres = _prepare_mres(lexer.mres) self.newline_types = lexer.newline_types self.ignore_types = lexer.ignore_types - self.callback = {name:[(p.pattern,d) for p,d in c.mres] + self.callback = {name:_prepare_mres(c.mres) for name, c in lexer.callback.items()} def print_python(self): print('import re') + print('class LexerRegexps: pass') + print('NEWLINE_TYPES = %s' % self.newline_types) + print('IGNORE_TYPES = %s' % self.ignore_types) + self._print_python('lexer') + + def _print_python(self, var_name): print('MRES = (') pprint(self.mres) print(')') print('LEXER_CALLBACK = (') pprint(self.callback) print(')') - print('NEWLINE_TYPES = %s' % self.newline_types) - print('IGNORE_TYPES = %s' % self.ignore_types) - print('class LexerRegexps: pass') print('lexer_regexps = LexerRegexps()') print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])') print(' for n, mres in LEXER_CALLBACK.items()}') - print('lexer = _Lex(lexer_regexps)') - print('def lex(stream):') - print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') + print('%s = (lexer_regexps)' % var_name) + + +class ContextualLexerAtoms: + def __init__(self, lexer): + self.lexer_atoms = {state: TraditionalLexerAtoms(lexer) for state, lexer in lexer.lexers.items()} + self.root_lexer_atoms = TraditionalLexerAtoms(lexer.root_lexer) + + def print_python(self): + print('import re') + print('class LexerRegexps: pass') + print('NEWLINE_TYPES = %s' % self.root_lexer_atoms.newline_types) + print('IGNORE_TYPES = %s' % self.root_lexer_atoms.ignore_types) + print('LEXERS = {}') + for state, lexer_atoms in self.lexer_atoms.items(): + lexer_atoms._print_python('LEXERS[%d]' % state) + + print('class ContextualLexer:') + print(' def __init__(self):') + print(' self.lexers = LEXERS') + print(' self.set_parser_state(None)') + print(' def set_parser_state(self, state):') + print(' self.parser_state = state') + print(' def lex(self, stream):') + print(' newline_types = NEWLINE_TYPES') + print(' ignore_types = IGNORE_TYPES') + print(' lexers = LEXERS') + print(' l = _Lex(lexers[self.parser_state], self.parser_state)') + print(' for x in l.lex(stream, newline_types, ignore_types):') + print(' yield x') + print(' l.lexer = lexers[self.parser_state]') + print(' l.state = self.parser_state') + + print('CON_LEXER = ContextualLexer()') + print('def lex(stream):') + print(' return CON_LEXER.lex(stream)') class GetRule: def __init__(self, rule_id): @@ -151,8 +192,9 @@ print(' self.postlex = postlex') print(' 
def parse(self, stream):') print(' tokens = lex(stream)') + print(' sps = CON_LEXER.set_parser_state') print(' if self.postlex: tokens = self.postlex.process(tokens)') - print(' return self.parser.parse(tokens)') + print(' return self.parser.parse(tokens, sps)') class TreeBuilderAtoms: def __init__(self, lark): @@ -160,17 +202,18 @@ self.ptb = lark._parse_tree_builder def print_python(self): + # print('class InlineTransformer: pass') print('RULES = {') for i, r in enumerate(self.rules): rule_ids[r] = i - print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) + print(' %d: Rule(%r, [%s], alias=%r, options=%r),' % (i, r.origin, ', '.join(s.fullrepr for s in r.expansion), self.ptb.user_aliases[r], r.options )) print('}') print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') def main(fobj, start): - lark_inst = Lark(fobj, parser="lalr", lexer="standard", start=start) + lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start) - lexer_atoms = LexerAtoms(lark_inst.parser.lexer) + lexer_atoms = ContextualLexerAtoms(lark_inst.parser.lexer) parser_atoms = ParserAtoms(lark_inst.parser.parser) tree_builder_atoms = TreeBuilderAtoms(lark_inst) diff -Nru lark-parser-0.6.2/lark/tree.py lark-parser-0.7.0/lark/tree.py --- lark-parser-0.6.2/lark/tree.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/tree.py 2019-03-28 13:53:28.000000000 +0000 @@ -5,10 +5,12 @@ from copy import deepcopy -class Meta: - pass ###{standalone +class Meta: + def __init__(self): + self.empty = True + class Tree(object): def __init__(self, data, children, meta=None): self.data = data @@ -42,13 +44,6 @@ def pretty(self, indent_str=' '): return ''.join(self._pretty(0, indent_str)) -###} - - def expand_kids_by_index(self, *indices): - "Expand (inline) children at the given indices" - for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices - kid = self.children[i] - self.children[i:i+1] = kid.children def __eq__(self, other): try: @@ -61,6 +56,13 @@ def __hash__(self): return hash((self.data, tuple(self.children))) +###} + + def expand_kids_by_index(self, *indices): + "Expand (inline) children at the given indices" + for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices + kid = self.children[i] + self.children[i:i+1] = kid.children def find_pred(self, pred): "Find all nodes where pred(tree) == True" @@ -100,12 +102,22 @@ yield x seen.add(id(x)) + def iter_subtrees_topdown(self): + stack = [self] + while stack: + node = stack.pop() + if not isinstance(node, Tree): + continue + yield node + for n in reversed(node.children): + stack.append(n) def __deepcopy__(self, memo): return type(self)(self.data, deepcopy(self.children, memo)) def copy(self): return type(self)(self.data, self.children) + def set(self, data, children): self.data = data self.children = children @@ -129,11 +141,17 @@ __slots__ = 'data', 'children', 'rule', '_meta' -def pydot__tree_to_png(tree, filename): - "Creates a colorful image that represents the tree (data+children, without meta)" +def pydot__tree_to_png(tree, filename, rankdir="LR"): + """Creates a colorful image that represents the tree (data+children, without meta) + + Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to + directed graphs drawn from top to bottom, from left to right, from bottom to + top, and from right to left, respectively. 
See: + https://www.graphviz.org/doc/info/attrs.html#k:rankdir + """ import pydot - graph = pydot.Dot(graph_type='digraph', rankdir="LR") + graph = pydot.Dot(graph_type='digraph', rankdir=rankdir) i = [0] diff -Nru lark-parser-0.6.2/lark/utils.py lark-parser-0.7.0/lark/utils.py --- lark-parser-0.6.2/lark/utils.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/lark/utils.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,5 +1,8 @@ +import sys from collections import deque +Py36 = (sys.version_info[:2] >= (3, 6)) + class fzset(frozenset): def __repr__(self): return '{%s}' % ', '.join(map(repr, self)) @@ -42,24 +45,28 @@ +###{standalone try: STRING_TYPE = basestring except NameError: # Python 3 STRING_TYPE = str -###{standalone import types from functools import wraps, partial from contextlib import contextmanager Str = type(u'') +try: + classtype = types.ClassType # Python2 +except AttributeError: + classtype = type # Python3 def smart_decorator(f, create_decorator): if isinstance(f, types.FunctionType): return wraps(f)(create_decorator(f, True)) - elif isinstance(f, (type, types.BuiltinFunctionType)): + elif isinstance(f, (classtype, type, types.BuiltinFunctionType)): return wraps(f)(create_decorator(f, False)) elif isinstance(f, types.MethodType): @@ -72,7 +79,14 @@ else: return create_decorator(f.__func__.__call__, True) +def dedup_list(l): + """Given a list (l), will remove duplicates from the list, + preserving the original order of the list. Assumes that + the list entries are hashable.""" + dedup = set() + return [ x for x in l if not (x in dedup or dedup.add(x))] +###} try: @@ -93,7 +107,6 @@ except excs: pass -###} @@ -105,8 +118,7 @@ return 0 elif a > b: return 1 - else: - return -1 + return -1 import sre_parse diff -Nru lark-parser-0.6.2/lark/visitors.py lark-parser-0.7.0/lark/visitors.py --- lark-parser-0.6.2/lark/visitors.py 2018-07-18 12:42:58.000000000 +0000 +++ lark-parser-0.7.0/lark/visitors.py 2019-03-28 13:53:28.000000000 +0000 @@ -1,13 +1,15 @@ -from inspect import isclass, getmembers, getmro from functools import wraps from .utils import smart_decorator from .tree import Tree +from .exceptions import VisitError, GrammarError + +###{standalone +from inspect import getmembers, getmro class Discard(Exception): pass - # Transformers class Transformer: @@ -27,16 +29,21 @@ except AttributeError: return self.__default__(tree.data, children, tree.meta) else: - if getattr(f, 'meta', False): - return f(children, tree.meta) - elif getattr(f, 'inline', False): - return f(*children) - elif getattr(f, 'whole_tree', False): - if new_children is not None: - raise NotImplementedError("Doesn't work with the base Transformer class") - return f(tree) - else: - return f(children) + try: + if getattr(f, 'meta', False): + return f(children, tree.meta) + elif getattr(f, 'inline', False): + return f(*children) + elif getattr(f, 'whole_tree', False): + if new_children is not None: + raise NotImplementedError("Doesn't work with the base Transformer class") + return f(tree) + else: + return f(children) + except (GrammarError, Discard): + raise + except Exception as e: + raise VisitError(tree, e) def _transform_children(self, children): for c in children: @@ -67,8 +74,15 @@ for name, value in getmembers(cls): if name.startswith('_') or name in libmembers: continue + if not callable(cls.__dict__[name]): + continue - setattr(cls, name, decorator(value, **kwargs)) + # Skip if v_args already applied (at the function level) + if hasattr(cls.__dict__[name], 'vargs_applied'): + continue + + static = 
isinstance(cls.__dict__[name], (staticmethod, classmethod)) + setattr(cls, name, decorator(value, static=static, **kwargs)) return cls @@ -224,7 +238,7 @@ -def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False): +def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, static=False): assert [whole_tree, meta, inline].count(True) <= 1 def create_decorator(_f, with_self): if with_self: @@ -235,7 +249,11 @@ return _f(*args, **kwargs) return f - f = smart_decorator(func, create_decorator) + if static: + f = wraps(func)(create_decorator(func, False)) + else: + f = smart_decorator(func, create_decorator) + f.vargs_applied = True f.inline = inline f.meta = meta f.whole_tree = whole_tree @@ -250,3 +268,4 @@ return _visitor_args_dec +###} diff -Nru lark-parser-0.6.2/lark_parser.egg-info/dependency_links.txt lark-parser-0.7.0/lark_parser.egg-info/dependency_links.txt --- lark-parser-0.6.2/lark_parser.egg-info/dependency_links.txt 2018-07-18 13:19:27.000000000 +0000 +++ lark-parser-0.7.0/lark_parser.egg-info/dependency_links.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ - diff -Nru lark-parser-0.6.2/lark_parser.egg-info/not-zip-safe lark-parser-0.7.0/lark_parser.egg-info/not-zip-safe --- lark-parser-0.6.2/lark_parser.egg-info/not-zip-safe 2018-06-10 07:58:37.000000000 +0000 +++ lark-parser-0.7.0/lark_parser.egg-info/not-zip-safe 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ - diff -Nru lark-parser-0.6.2/lark_parser.egg-info/PKG-INFO lark-parser-0.7.0/lark_parser.egg-info/PKG-INFO --- lark-parser-0.6.2/lark_parser.egg-info/PKG-INFO 2018-07-18 13:19:27.000000000 +0000 +++ lark-parser-0.7.0/lark_parser.egg-info/PKG-INFO 1970-01-01 00:00:00.000000000 +0000 @@ -1,43 +0,0 @@ -Metadata-Version: 1.1 -Name: lark-parser -Version: 0.6.2 -Summary: a modern parsing library -Home-page: https://github.com/erezsh/lark -Author: Erez Shinan -Author-email: erezshin@gmail.com -License: MIT -Download-URL: https://github.com/erezsh/lark/tarball/master -Description-Content-Type: UNKNOWN -Description: - Lark is a modern general-purpose parsing library for Python. - - With Lark, you can parse any context-free grammar, efficiently, with very little code. - - Main Features: - - Builds a parse-tree (AST) automagically, based on the structure of the grammar - - Earley parser - - Can parse all context-free grammars - - Full support for ambiguous grammars - - LALR(1) parser - - Fast and light, competitive with PLY - - Can generate a stand-alone parser - - CYK parser, for highly ambiguous grammars - - EBNF grammar - - Unicode fully supported - - Python 2 & 3 compatible - - Automatic line & column tracking - - Standard library of terminals (strings, numbers, names, etc.) - - Import grammars from Nearley.js - - Extensive test suite - - And much more! 
- -Keywords: Earley LALR parser parsing ast -Platform: UNKNOWN -Classifier: Development Status :: 5 - Production/Stable -Classifier: Intended Audience :: Developers -Classifier: Programming Language :: Python :: 2.7 -Classifier: Programming Language :: Python :: 3 -Classifier: Topic :: Software Development :: Libraries :: Python Modules -Classifier: Topic :: Text Processing :: General -Classifier: Topic :: Text Processing :: Linguistic -Classifier: License :: OSI Approved :: MIT License diff -Nru lark-parser-0.6.2/lark_parser.egg-info/SOURCES.txt lark-parser-0.7.0/lark_parser.egg-info/SOURCES.txt --- lark-parser-0.6.2/lark_parser.egg-info/SOURCES.txt 2018-07-18 13:19:27.000000000 +0000 +++ lark-parser-0.7.0/lark_parser.egg-info/SOURCES.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,69 +0,0 @@ -LICENSE -MANIFEST.in -README.md -setup.cfg -setup.py -docs/comparison_memory.png -docs/comparison_runtime.png -docs/json_tutorial.md -docs/lark_cheatsheet.pdf -examples/__init__.py -examples/calc.py -examples/conf_earley.py -examples/conf_lalr.py -examples/custom_lexer.py -examples/error_reporting_lalr.py -examples/fruitflies.png -examples/fruitflies.py -examples/indented_tree.py -examples/json_parser.py -examples/lark.lark -examples/lark_grammar.py -examples/python2.lark -examples/python3.lark -examples/python_parser.py -examples/qscintilla_json.py -examples/reconstruct_json.py -examples/turtle_dsl.py -lark/__init__.py -lark/common.py -lark/exceptions.py -lark/grammar.py -lark/indenter.py -lark/lark.py -lark/lexer.py -lark/load_grammar.py -lark/parse_tree_builder.py -lark/parser_frontends.py -lark/reconstruct.py -lark/reconstruct2.py -lark/tree.py -lark/utils.py -lark/visitors.py -lark/grammars/common.lark -lark/parsers/__init__.py -lark/parsers/ablalr.py -lark/parsers/cyk.py -lark/parsers/earley.py -lark/parsers/grammar_analysis.py -lark/parsers/lalr_analysis.py -lark/parsers/lalr_parser.py -lark/parsers/resolve_ambig.py -lark/parsers/xearley.py -lark/tools/__init__.py -lark/tools/nearley.py -lark/tools/standalone.py -lark_parser.egg-info/PKG-INFO -lark_parser.egg-info/SOURCES.txt -lark_parser.egg-info/dependency_links.txt -lark_parser.egg-info/not-zip-safe -lark_parser.egg-info/top_level.txt -tests/__init__.py -tests/__main__.py -tests/test_parser.py -tests/test_tools.py -tests/test_trees.py -tests/test_nearley/__init__.py -tests/test_nearley/test_nearley.py -tests/test_nearley/grammars/include_unicode.ne -tests/test_nearley/grammars/unicode.ne \ No newline at end of file diff -Nru lark-parser-0.6.2/lark_parser.egg-info/top_level.txt lark-parser-0.7.0/lark_parser.egg-info/top_level.txt --- lark-parser-0.6.2/lark_parser.egg-info/top_level.txt 2018-07-18 13:19:27.000000000 +0000 +++ lark-parser-0.7.0/lark_parser.egg-info/top_level.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -lark diff -Nru lark-parser-0.6.2/MANIFEST.in lark-parser-0.7.0/MANIFEST.in --- lark-parser-0.6.2/MANIFEST.in 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/MANIFEST.in 2019-03-28 13:53:28.000000000 +0000 @@ -1 +1 @@ -include README.md LICENSE docs/* examples/*.py examples/*.png examples/*.lark tests/*.py tests/test_nearley/*.py tests/test_nearley/grammars/* +include README.md LICENSE docs/* examples/*.py examples/*.png examples/*.lark tests/*.py tests/*.lark tests/grammars/* tests/test_nearley/*.py tests/test_nearley/grammars/* diff -Nru lark-parser-0.6.2/mkdocs.yml lark-parser-0.7.0/mkdocs.yml --- lark-parser-0.6.2/mkdocs.yml 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/mkdocs.yml 2019-03-28 
13:53:28.000000000 +0000 @@ -0,0 +1,13 @@ +site_name: Lark +theme: readthedocs +pages: + - Main Page: index.md + - Philosophy: philosophy.md + - Features: features.md + - Parsers: parsers.md + - How To Use (Guide): how_to_use.md + - How To Develop (Guide): how_to_develop.md + - Grammar Reference: grammar.md + - Tree Construction Reference: tree_construction.md + - Classes Reference: classes.md + - Recipes: recipes.md diff -Nru lark-parser-0.6.2/nearley-requirements.txt lark-parser-0.7.0/nearley-requirements.txt --- lark-parser-0.6.2/nearley-requirements.txt 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/nearley-requirements.txt 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1 @@ +Js2Py==0.50 diff -Nru lark-parser-0.6.2/PKG-INFO lark-parser-0.7.0/PKG-INFO --- lark-parser-0.6.2/PKG-INFO 2018-07-18 13:19:27.000000000 +0000 +++ lark-parser-0.7.0/PKG-INFO 1970-01-01 00:00:00.000000000 +0000 @@ -1,43 +0,0 @@ -Metadata-Version: 1.1 -Name: lark-parser -Version: 0.6.2 -Summary: a modern parsing library -Home-page: https://github.com/erezsh/lark -Author: Erez Shinan -Author-email: erezshin@gmail.com -License: MIT -Download-URL: https://github.com/erezsh/lark/tarball/master -Description-Content-Type: UNKNOWN -Description: - Lark is a modern general-purpose parsing library for Python. - - With Lark, you can parse any context-free grammar, efficiently, with very little code. - - Main Features: - - Builds a parse-tree (AST) automagically, based on the structure of the grammar - - Earley parser - - Can parse all context-free grammars - - Full support for ambiguous grammars - - LALR(1) parser - - Fast and light, competitive with PLY - - Can generate a stand-alone parser - - CYK parser, for highly ambiguous grammars - - EBNF grammar - - Unicode fully supported - - Python 2 & 3 compatible - - Automatic line & column tracking - - Standard library of terminals (strings, numbers, names, etc.) - - Import grammars from Nearley.js - - Extensive test suite - - And much more! - -Keywords: Earley LALR parser parsing ast -Platform: UNKNOWN -Classifier: Development Status :: 5 - Production/Stable -Classifier: Intended Audience :: Developers -Classifier: Programming Language :: Python :: 2.7 -Classifier: Programming Language :: Python :: 3 -Classifier: Topic :: Software Development :: Libraries :: Python Modules -Classifier: Topic :: Text Processing :: General -Classifier: Topic :: Text Processing :: Linguistic -Classifier: License :: OSI Approved :: MIT License diff -Nru lark-parser-0.6.2/README.md lark-parser-0.7.0/README.md --- lark-parser-0.6.2/README.md 2018-06-26 14:19:14.000000000 +0000 +++ lark-parser-0.7.0/README.md 2019-03-28 13:53:28.000000000 +0000 @@ -4,11 +4,11 @@ **Beginners**: Lark is not just another parser. It can parse any grammar you throw at it, no matter how complicated or ambiguous, and do so efficiently. It also constructs a parse-tree for you, without additional code on your part. -**Experts**: Lark lets you choose between Earley and LALR(1), to trade-off power and speed. It also contains a CYK parser and experimental features such as a contextual-lexer. +**Experts**: Lark implements both Earley(SPPF) and LALR(1), and several different lexers, so you can trade-off power and speed, according to your requirements. It also provides a variety of sophisticated features and utilities. 
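A minimal sketch of that power/speed trade-off, using an invented toy grammar: the same grammar can be driven by either engine just by switching the `parser` argument; Earley accepts any context-free grammar, while LALR(1) is much faster but requires an LALR-compatible grammar.

```python
from lark import Lark

grammar = r"""
start: WORD "," WORD "!"
%import common.WORD
%ignore " "
"""

# Earley (the default): handles any context-free grammar, including ambiguous ones.
earley_parser = Lark(grammar, parser='earley')

# LALR(1): much faster and lighter, but the grammar must be LALR-compatible.
lalr_parser = Lark(grammar, parser='lalr')

print(earley_parser.parse("Hello, World!"))
print(lalr_parser.parse("Hello, World!"))
```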
Lark can: - - Parse all context-free grammars, and handle all ambiguity + - Parse all context-free grammars, and handle any ambiguity - Build a parse-tree automagically, no construction code required - Outperform all other Python libraries when using LALR(1) (Yes, including PLY) - Run on every Python interpreter (it's pure-python) @@ -20,11 +20,11 @@ ### Quick links -- [Documentation wiki](https://github.com/erezsh/lark/wiki) -- [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. +- [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/lark_cheatsheet.pdf) +- [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) -- [Forum @googlegroups](https://groups.google.com/forum/#!forum/lark-parser) (New) +- [Gitter chat](https://gitter.im/lark-parser/Lobby) ### Install Lark @@ -113,6 +113,8 @@ *Note: I really wanted to add PLY to the benchmark, but I couldn't find a working JSON parser anywhere written in PLY. If anyone can point me to one that actually works, I would be happy to add it!* +*Note 2: The parsimonious code has been optimized for this specific test, unlike the other benchmarks (Lark included). Its "real-world" performance may not be as good.* + #### Feature comparison | Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone @@ -121,7 +123,6 @@ | [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No | | [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No | | [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No | -| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No | | [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No | | [ANTLR](https://github.com/antlr/antlr4) | LL(*) | EBNF | Yes | No | Yes? | Yes | No | @@ -133,6 +134,7 @@ - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer + - [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL engine by Dailymotion (Lark is used to parse the GraphQL schemas definitions) Using Lark? Send me a message and I'll add your project! @@ -159,20 +161,17 @@ Lark uses the [MIT license](LICENSE). +(The standalone tool is under GPL2) + ## Contribute -Lark is currently accepting pull-requests. +Lark is currently accepting pull-requests. See [How to develop Lark](/docs/how_to_develop.md) -There are many ways you can help the project: +## Donate -* Help solve issues -* Improve the documentation -* Write new grammars for Lark's library -* Write a blog post introducing Lark to your audience -* Port Lark to another language -* Help me with code developemnt +If you like Lark and feel like donating, you can do so at my [patreon page](https://www.patreon.com/erezsh). -If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process. +If you wish for a specific feature to get a higher priority, you can request it in a follow-up email, and I'll consider it favorably. 
## Contact diff -Nru lark-parser-0.6.2/setup.cfg lark-parser-0.7.0/setup.cfg --- lark-parser-0.6.2/setup.cfg 2018-07-18 13:19:27.000000000 +0000 +++ lark-parser-0.7.0/setup.cfg 2019-03-28 13:53:28.000000000 +0000 @@ -1,5 +1,5 @@ [global] -zip_safe = +zip_safe= [bdist_wheel] universal = 1 @@ -8,7 +8,3 @@ description-file = README.md license_file = LICENSE -[egg_info] -tag_build = -tag_date = 0 - diff -Nru lark-parser-0.6.2/tests/grammars/ab.lark lark-parser-0.7.0/tests/grammars/ab.lark --- lark-parser-0.6.2/tests/grammars/ab.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/grammars/ab.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,10 @@ +startab: expr + +expr: A B + | A expr B + +A: "a" +B: "b" + +%import common.WS +%ignore WS diff -Nru lark-parser-0.6.2/tests/grammars/test.lark lark-parser-0.7.0/tests/grammars/test.lark --- lark-parser-0.6.2/tests/grammars/test.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/grammars/test.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,3 @@ +%import common.NUMBER +%import common.WORD +%import common.WS diff -Nru lark-parser-0.6.2/tests/__main__.py lark-parser-0.7.0/tests/__main__.py --- lark-parser-0.6.2/tests/__main__.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/tests/__main__.py 2019-03-28 13:53:28.000000000 +0000 @@ -5,6 +5,7 @@ from .test_trees import TestTrees from .test_tools import TestStandalone +from .test_reconstructor import TestReconstructor try: from .test_nearley.test_nearley import TestNearley @@ -19,11 +20,11 @@ TestEarleyStandard, TestCykStandard, TestLalrContextual, - # TestEarleyScanless, TestEarleyDynamic, - # TestFullEarleyScanless, + # TestFullEarleyStandard, TestFullEarleyDynamic, + TestFullEarleyDynamic_complete, TestParsers, ) diff -Nru lark-parser-0.6.2/tests/test_nearley/test_nearley.py lark-parser-0.7.0/tests/test_nearley/test_nearley.py --- lark-parser-0.6.2/tests/test_nearley/test_nearley.py 2018-01-02 08:28:27.000000000 +0000 +++ lark-parser-0.7.0/tests/test_nearley/test_nearley.py 2019-03-28 13:53:28.000000000 +0000 @@ -75,6 +75,23 @@ parse(u'±a') + def test_backslash(self): + grammar = r'main -> "\""' + code = create_code_for_nearley_grammar(grammar, 'main', BUILTIN_PATH, './') + d = {} + exec (code, d) + parse = d['parse'] + parse(u'"') + + def test_null(self): + grammar = r'main -> "a" | null' + code = create_code_for_nearley_grammar(grammar, 'main', BUILTIN_PATH, './') + d = {} + exec (code, d) + parse = d['parse'] + parse('a') + parse('') + def test_utf8_2(self): fn = os.path.join(TEST_PATH, 'grammars/unicode.ne') nearley_tool_main(fn, 'x', NEARLEY_PATH) diff -Nru lark-parser-0.6.2/tests/test_parser.py lark-parser-0.7.0/tests/test_parser.py --- lark-parser-0.6.2/tests/test_parser.py 2018-06-27 13:28:15.000000000 +0000 +++ lark-parser-0.7.0/tests/test_parser.py 2019-03-28 13:53:28.000000000 +0000 @@ -18,7 +18,7 @@ logging.basicConfig(level=logging.INFO) from lark.lark import Lark -from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput +from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree from lark.visitors import Transformer @@ -48,8 +48,9 @@ self.assertRaises(GrammarError, Lark, g, parser='lalr') - l = Lark(g, parser='earley', lexer='dynamic') - self.assertRaises(ParseError, l.parse, 'a') + # TODO: should it? shouldn't it? 
+ # l = Lark(g, parser='earley', lexer='dynamic') + # self.assertRaises(ParseError, l.parse, 'a') def test_propagate_positions(self): g = Lark("""start: a @@ -148,9 +149,14 @@ self.assertEqual( r.children, [""] ) + def test_alias(self): + Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """) + def _make_full_earley_test(LEXER): + def _Lark(grammar, **kwargs): + return Lark(grammar, lexer=LEXER, parser='earley', propagate_positions=True, **kwargs) class _TestFullEarley(unittest.TestCase): def test_anon(self): # Fails an Earley implementation without special handling for empty rules, @@ -186,8 +192,10 @@ @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") def test_earley3(self): - "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" + """Tests prioritization and disambiguation for pseudo-terminals (there should be only one result) + By default, `+` should imitate regexp greedy-matching + """ grammar = """ start: A A A: "a"+ @@ -195,7 +203,10 @@ l = Lark(grammar, parser='earley', lexer=LEXER) res = l.parse("aaa") - self.assertEqual(res.children, ['aa', 'a']) + self.assertEqual(set(res.children), {'aa', 'a'}) + # XXX TODO fix Earley to maintain correct order + # i.e. it should imitate greedy search for terminals, but lazy search for rules + # self.assertEqual(res.children, ['aa', 'a']) def test_earley4(self): grammar = """ @@ -205,7 +216,10 @@ l = Lark(grammar, parser='earley', lexer=LEXER) res = l.parse("aaa") - self.assertEqual(res.children, ['aaa']) + assert set(res.children) == {'aa', 'a'} or res.children == ['aaa'] + # XXX TODO fix Earley to maintain correct order + # i.e. it should imitate greedy search for terminals, but lazy search for rules + # self.assertEqual(res.children, ['aaa']) def test_earley_repeating_empty(self): # This was a sneaky bug! @@ -222,6 +236,7 @@ empty_tree = Tree('empty', [Tree('empty2', [])]) self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b']) + @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer") def test_earley_explicit_ambiguity(self): # This was a sneaky bug! 
@@ -233,11 +248,11 @@ """ parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit') - res = parser.parse('ab') - - self.assertEqual( res.data, '_ambig') - self.assertEqual( len(res.children), 2) + ambig_tree = parser.parse('ab') + self.assertEqual( ambig_tree.data, '_ambig') + self.assertEqual( len(ambig_tree.children), 2) + @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer") def test_ambiguity1(self): grammar = """ start: cd+ "e" @@ -248,9 +263,32 @@ """ l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) - x = l.parse('cde') - assert x.data == '_ambig', x - assert len(x.children) == 2 + ambig_tree = l.parse('cde') + + assert ambig_tree.data == '_ambig', ambig_tree + assert len(ambig_tree.children) == 2 + + @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer") + def test_ambiguity2(self): + grammar = """ + ANY: /[a-zA-Z0-9 ]+/ + a.2: "A" b+ + b.2: "B" + c: ANY + + start: (a|c)* + """ + l = Lark(grammar, parser='earley', lexer=LEXER) + res = l.parse('ABX') + expected = Tree('start', [ + Tree('a', [ + Tree('b', []) + ]), + Tree('c', [ + 'X' + ]) + ]) + self.assertEqual(res, expected) def test_fruitflies_ambig(self): grammar = """ @@ -269,7 +307,7 @@ %ignore WS """ parser = Lark(grammar, ambiguity='explicit', lexer=LEXER) - res = parser.parse('fruit flies like bananas') + tree = parser.parse('fruit flies like bananas') expected = Tree('_ambig', [ Tree('comparative', [ @@ -284,13 +322,12 @@ ]) ]) - # print res.pretty() - # print expected.pretty() - - self.assertEqual(res, expected) + # self.assertEqual(tree, expected) + self.assertEqual(tree.data, expected.data) + self.assertEqual(set(tree.children), set(expected.children)) - @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") + @unittest.skipIf(LEXER!='dynamic_complete', "Only relevant for the dynamic_complete parser") def test_explicit_ambiguity2(self): grammar = r""" start: NAME+ @@ -299,7 +336,7 @@ """ text = """cat""" - parser = Lark(grammar, start='start', ambiguity='explicit') + parser = _Lark(grammar, start='start', ambiguity='explicit') tree = parser.parse(text) self.assertEqual(tree.data, '_ambig') @@ -350,7 +387,9 @@ def _make_parser_test(LEXER, PARSER): def _Lark(grammar, **kwargs): - return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs) + return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs) + def _Lark_open(gfilename, **kwargs): + return Lark.open(gfilename, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs) class _TestParser(unittest.TestCase): def test_basic1(self): g = _Lark("""start: a+ b a* "b" a* @@ -411,6 +450,25 @@ """) g.parse(u'\xa3\u0101\u00a3\u0203\n') + def test_hex_escape(self): + g = _Lark(r"""start: A B C + A: "\x01" + B: /\x02/ + C: "\xABCD" + """) + g.parse('\x01\x02\xABCD') + + def test_unicode_literal_range_escape(self): + g = _Lark(r"""start: A+ + A: "\u0061".."\u0063" + """) + g.parse('abc') + + def test_hex_literal_range_escape(self): + g = _Lark(r"""start: A+ + A: "\x01".."\x03" + """) + g.parse('\x01\x02\x03') @unittest.skipIf(PARSER == 'cyk', "Takes forever") def test_stack_for_ebnf(self): @@ -704,15 +762,6 @@ """) x = g.parse(r'\a') - def test_special_chars(self): - g = _Lark(r"""start: "\n" - """) - x = g.parse('\n') - - g = _Lark(r"""start: /\n/ - """) - x = g.parse('\n') - def test_backslash2(self): g = _Lark(r"""start: "\"" "-" @@ -723,6 +772,18 @@ """) x = g.parse('/-') + + + def test_special_chars(self): + g = _Lark(r"""start: "\n" + """) + x = g.parse('\n') + + g 
= _Lark(r"""start: /\n/ + """) + x = g.parse('\n') + + # def test_token_recurse(self): # g = _Lark("""start: A # A: B @@ -758,7 +819,7 @@ %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items()))) def test_float_without_lexer(self): - expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken + expected_error = UnexpectedCharacters if LEXER.startswith('dynamic') else UnexpectedToken if PARSER == 'cyk': expected_error = ParseError @@ -936,6 +997,123 @@ x = l.parse('12 elephants') self.assertEqual(x.children, ['12', 'elephants']) + + def test_import_rename(self): + grammar = """ + start: N W + + %import common.NUMBER -> N + %import common.WORD -> W + %import common.WS + %ignore WS + + """ + l = _Lark(grammar) + x = l.parse('12 elephants') + self.assertEqual(x.children, ['12', 'elephants']) + + + def test_relative_import(self): + l = _Lark_open('test_relative_import.lark', rel_to=__file__) + x = l.parse('12 lions') + self.assertEqual(x.children, ['12', 'lions']) + + + def test_relative_import_rename(self): + l = _Lark_open('test_relative_import_rename.lark', rel_to=__file__) + x = l.parse('12 lions') + self.assertEqual(x.children, ['12', 'lions']) + + + def test_relative_rule_import(self): + l = _Lark_open('test_relative_rule_import.lark', rel_to=__file__) + x = l.parse('xaabby') + self.assertEqual(x.children, [ + 'x', + Tree('expr', ['a', Tree('expr', ['a', 'b']), 'b']), + 'y']) + + + def test_relative_rule_import_drop_ignore(self): + # %ignore rules are dropped on import + l = _Lark_open('test_relative_rule_import_drop_ignore.lark', + rel_to=__file__) + self.assertRaises((ParseError, UnexpectedInput), + l.parse, 'xa abby') + + + def test_relative_rule_import_subrule(self): + l = _Lark_open('test_relative_rule_import_subrule.lark', + rel_to=__file__) + x = l.parse('xaabby') + self.assertEqual(x.children, [ + 'x', + Tree('startab', [ + Tree('grammars__ab__expr', [ + 'a', Tree('grammars__ab__expr', ['a', 'b']), 'b', + ]), + ]), + 'y']) + + + def test_relative_rule_import_subrule_no_conflict(self): + l = _Lark_open( + 'test_relative_rule_import_subrule_no_conflict.lark', + rel_to=__file__) + x = l.parse('xaby') + self.assertEqual(x.children, [Tree('expr', [ + 'x', + Tree('startab', [ + Tree('grammars__ab__expr', ['a', 'b']), + ]), + 'y'])]) + self.assertRaises((ParseError, UnexpectedInput), + l.parse, 'xaxabyby') + + + def test_relative_rule_import_rename(self): + l = _Lark_open('test_relative_rule_import_rename.lark', + rel_to=__file__) + x = l.parse('xaabby') + self.assertEqual(x.children, [ + 'x', + Tree('ab', ['a', Tree('ab', ['a', 'b']), 'b']), + 'y']) + + + def test_multi_import(self): + grammar = """ + start: NUMBER WORD + + %import common (NUMBER, WORD, WS) + %ignore WS + + """ + l = _Lark(grammar) + x = l.parse('12 toucans') + self.assertEqual(x.children, ['12', 'toucans']) + + + def test_relative_multi_import(self): + l = _Lark_open("test_relative_multi_import.lark", rel_to=__file__) + x = l.parse('12 capybaras') + self.assertEqual(x.children, ['12', 'capybaras']) + + def test_import_errors(self): + grammar = """ + start: NUMBER WORD + + %import .grammars.bad_test.NUMBER + """ + self.assertRaises(IOError, _Lark, grammar) + + grammar = """ + start: NUMBER WORD + + %import bad_test.NUMBER + """ + self.assertRaises(IOError, _Lark, grammar) + @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules") def test_earley_prioritization(self): "Tests effect of priority on result" @@ -977,7 +1155,7 @@ bb_.1: "bb" """ - l = 
_Lark(grammar, ambiguity='resolve__antiscore_sum') + l = Lark(grammar, priority="invert") res = l.parse('abba') self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_') @@ -990,7 +1168,7 @@ bb_: "bb" """ - l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum') + l = Lark(grammar, priority="invert") res = l.parse('abba') self.assertEqual(''.join(child.data for child in res.children), 'indirection') @@ -1003,7 +1181,7 @@ bb_.3: "bb" """ - l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum') + l = Lark(grammar, priority="invert") res = l.parse('abba') self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_') @@ -1016,7 +1194,7 @@ bb_.3: "bb" """ - l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum') + l = Lark(grammar, priority="invert") res = l.parse('abba') self.assertEqual(''.join(child.data for child in res.children), 'indirection') @@ -1166,6 +1344,90 @@ self.assertEqual(t.children, ['a', 'bc']) self.assertEqual(t.children[0].type, 'A') + def test_line_counting(self): + p = _Lark("start: /[^x]+/") + + text = 'hello\nworld' + t = p.parse(text) + tok = t.children[0] + self.assertEqual(tok, text) + self.assertEqual(tok.line, 1) + self.assertEqual(tok.column, 1) + if _LEXER != 'dynamic': + self.assertEqual(tok.end_line, 2) + self.assertEqual(tok.end_column, 6) + + @unittest.skipIf(PARSER=='cyk', "Empty rules") + def test_empty_end(self): + p = _Lark(""" + start: b c d + b: "B" + c: | "C" + d: | "D" + """) + res = p.parse('B') + self.assertEqual(len(res.children), 3) + + @unittest.skipIf(PARSER=='cyk', "Empty rules") + def test_maybe_placeholders(self): + # Anonymous tokens shouldn't count + p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True) + self.assertEqual(p.parse("").children, []) + + # All invisible constructs shouldn't count + p = _Lark("""start: [A] ["b"] [_c] ["e" "f" _c] + A: "a" + _c: "c" """, maybe_placeholders=True) + self.assertEqual(p.parse("").children, [None]) + self.assertEqual(p.parse("c").children, [None]) + self.assertEqual(p.parse("aefc").children, ['a']) + + # ? shouldn't apply + p = _Lark("""!start: ["a"] "b"? 
["c"] """, maybe_placeholders=True) + self.assertEqual(p.parse("").children, [None, None]) + self.assertEqual(p.parse("b").children, [None, 'b', None]) + + p = _Lark("""!start: ["a"] ["b"] ["c"] """, maybe_placeholders=True) + self.assertEqual(p.parse("").children, [None, None, None]) + self.assertEqual(p.parse("a").children, ['a', None, None]) + self.assertEqual(p.parse("b").children, [None, 'b', None]) + self.assertEqual(p.parse("c").children, [None, None, 'c']) + self.assertEqual(p.parse("ab").children, ['a', 'b', None]) + self.assertEqual(p.parse("ac").children, ['a', None, 'c']) + self.assertEqual(p.parse("bc").children, [None, 'b', 'c']) + self.assertEqual(p.parse("abc").children, ['a', 'b', 'c']) + + p = _Lark("""!start: (["a"] "b" ["c"])+ """, maybe_placeholders=True) + self.assertEqual(p.parse("b").children, [None, 'b', None]) + self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None]) + self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c']) + self.assertEqual(p.parse("babbcabcb").children, + [None, 'b', None, + 'a', 'b', None, + None, 'b', 'c', + 'a', 'b', 'c', + None, 'b', None]) + + p = _Lark("""!start: ["a"] ["c"] "b"+ ["a"] ["d"] """, maybe_placeholders=True) + self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None]) + self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd']) + self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None]) + self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None]) + + + def test_escaped_string(self): + "Tests common.ESCAPED_STRING" + grammar = r""" + start: ESCAPED_STRING+ + + %import common (WS_INLINE, ESCAPED_STRING) + %ignore WS_INLINE + """ + + parser = _Lark(grammar) + parser.parse(r'"\\" "b" "c"') + + parser.parse(r'"That" "And a \"b"') _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() @@ -1186,9 +1448,8 @@ for _LEXER, _PARSER in _TO_TEST: _make_parser_test(_LEXER, _PARSER) -for _LEXER in ('dynamic',): +for _LEXER in ('dynamic', 'dynamic_complete'): _make_full_earley_test(_LEXER) if __name__ == '__main__': unittest.main() - diff -Nru lark-parser-0.6.2/tests/test_reconstructor.py lark-parser-0.7.0/tests/test_reconstructor.py --- lark-parser-0.6.2/tests/test_reconstructor.py 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_reconstructor.py 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,116 @@ +import json +import unittest +from unittest import TestCase +from lark import Lark +from lark.reconstruct import Reconstructor + + +common = """ +%import common (WS_INLINE, NUMBER, WORD) +%ignore WS_INLINE +""" + +def _remove_ws(s): + return s.replace(' ', '').replace('\n','') + +class TestReconstructor(TestCase): + + def assert_reconstruct(self, grammar, code): + parser = Lark(grammar, parser='lalr') + tree = parser.parse(code) + new = Reconstructor(parser).reconstruct(tree) + self.assertEqual(_remove_ws(code), _remove_ws(new)) + + def test_starred_rule(self): + + g = """ + start: item* + item: NL + | rule + rule: WORD ":" NUMBER + NL: /(\\r?\\n)+\\s*/ + """ + common + + code = """ + Elephants: 12 + """ + + self.assert_reconstruct(g, code) + + def test_starred_group(self): + + g = """ + start: (rule | NL)* + rule: WORD ":" NUMBER + NL: /(\\r?\\n)+\\s*/ + """ + common + + code = """ + Elephants: 12 + """ + + self.assert_reconstruct(g, code) + + def test_alias(self): + + g = """ + start: line* + line: NL + | rule + | "hello" -> hi + rule: WORD ":" NUMBER + NL: /(\\r?\\n)+\\s*/ + """ + common + + code = 
""" + Elephants: 12 + hello + """ + + self.assert_reconstruct(g, code) + + def test_json_example(self): + test_json = ''' + { + "empty_object" : {}, + "empty_array" : [], + "booleans" : { "YES" : true, "NO" : false }, + "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ], + "strings" : [ "This", [ "And" , "That", "And a \\"b" ] ], + "nothing" : null + } + ''' + + json_grammar = r""" + ?start: value + + ?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null + + array : "[" [value ("," value)*] "]" + object : "{" [pair ("," pair)*] "}" + pair : string ":" value + + string : ESCAPED_STRING + + %import common.ESCAPED_STRING + %import common.SIGNED_NUMBER + %import common.WS + + %ignore WS + """ + + json_parser = Lark(json_grammar, parser='lalr') + tree = json_parser.parse(test_json) + + new_json = Reconstructor(json_parser).reconstruct(tree) + self.assertEqual(json.loads(new_json), json.loads(test_json)) + + +if __name__ == '__main__': + unittest.main() diff -Nru lark-parser-0.6.2/tests/test_relative_import.lark lark-parser-0.7.0/tests/test_relative_import.lark --- lark-parser-0.6.2/tests/test_relative_import.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_relative_import.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,7 @@ +start: NUMBER WORD + +%import .grammars.test.NUMBER +%import common.WORD +%import common.WS +%ignore WS + diff -Nru lark-parser-0.6.2/tests/test_relative_import_rename.lark lark-parser-0.7.0/tests/test_relative_import_rename.lark --- lark-parser-0.6.2/tests/test_relative_import_rename.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_relative_import_rename.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,7 @@ +start: N WORD + +%import .grammars.test.NUMBER -> N +%import common.WORD +%import common.WS +%ignore WS + diff -Nru lark-parser-0.6.2/tests/test_relative_multi_import.lark lark-parser-0.7.0/tests/test_relative_multi_import.lark --- lark-parser-0.6.2/tests/test_relative_multi_import.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_relative_multi_import.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,4 @@ +start: NUMBER WORD + +%import .grammars.test (NUMBER, WORD, WS) +%ignore WS diff -Nru lark-parser-0.6.2/tests/test_relative_rule_import_drop_ignore.lark lark-parser-0.7.0/tests/test_relative_rule_import_drop_ignore.lark --- lark-parser-0.6.2/tests/test_relative_rule_import_drop_ignore.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_relative_rule_import_drop_ignore.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,7 @@ +start: X expr Y + +X: "x" +Y: "y" + +%import .grammars.ab.expr + diff -Nru lark-parser-0.6.2/tests/test_relative_rule_import.lark lark-parser-0.7.0/tests/test_relative_rule_import.lark --- lark-parser-0.6.2/tests/test_relative_rule_import.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_relative_rule_import.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,7 @@ +start: X expr Y + +X: "x" +Y: "y" + +%import .grammars.ab.expr + diff -Nru lark-parser-0.6.2/tests/test_relative_rule_import_rename.lark lark-parser-0.7.0/tests/test_relative_rule_import_rename.lark --- lark-parser-0.6.2/tests/test_relative_rule_import_rename.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_relative_rule_import_rename.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,7 @@ +start: X ab Y + +X: "x" +Y: "y" + +%import .grammars.ab.expr -> ab + diff -Nru 
lark-parser-0.6.2/tests/test_relative_rule_import_subrule.lark lark-parser-0.7.0/tests/test_relative_rule_import_subrule.lark --- lark-parser-0.6.2/tests/test_relative_rule_import_subrule.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_relative_rule_import_subrule.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,7 @@ +start: X startab Y + +X: "x" +Y: "y" + +%import .grammars.ab.startab + diff -Nru lark-parser-0.6.2/tests/test_relative_rule_import_subrule_no_conflict.lark lark-parser-0.7.0/tests/test_relative_rule_import_subrule_no_conflict.lark --- lark-parser-0.6.2/tests/test_relative_rule_import_subrule_no_conflict.lark 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tests/test_relative_rule_import_subrule_no_conflict.lark 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,9 @@ +start: expr + +expr: X startab Y + +X: "x" +Y: "y" + +%import .grammars.ab.startab + diff -Nru lark-parser-0.6.2/tests/test_tools.py lark-parser-0.7.0/tests/test_tools.py --- lark-parser-0.6.2/tests/test_tools.py 2018-03-10 10:28:35.000000000 +0000 +++ lark-parser-0.7.0/tests/test_tools.py 2019-03-28 13:53:28.000000000 +0000 @@ -17,6 +17,18 @@ def setUp(self): pass + def _create_standalone(self, grammar): + code_buf = StringIO() + temp = sys.stdout + sys.stdout = code_buf + standalone.main(StringIO(grammar), 'start') + sys.stdout = temp + code = code_buf.getvalue() + + context = {} + exec(code, context) + return context + def test_simple(self): grammar = """ start: NUMBER WORD @@ -28,21 +40,36 @@ """ - code_buf = StringIO() - temp = sys.stdout - sys.stdout = code_buf - standalone.main(StringIO(grammar), 'start') - sys.stdout = temp - code = code_buf.getvalue() + context = self._create_standalone(grammar) - context = {} - exec(code, context) _Lark = context['Lark_StandAlone'] - l = _Lark() x = l.parse('12 elephants') self.assertEqual(x.children, ['12', 'elephants']) + def test_contextual(self): + grammar = """ + start: a b + a: "A" "B" + b: "AB" + """ + + context = self._create_standalone(grammar) + + _Lark = context['Lark_StandAlone'] + l = _Lark() + x = l.parse('ABAB') + + class T(context['Transformer']): + def a(self, items): + return 'a' + def b(self, items): + return 'b' + start = list + + x = T().transform(x) + self.assertEqual(x, ['a', 'b']) + if __name__ == '__main__': unittest.main() diff -Nru lark-parser-0.6.2/tests/test_trees.py lark-parser-0.7.0/tests/test_trees.py --- lark-parser-0.6.2/tests/test_trees.py 2018-06-26 13:29:25.000000000 +0000 +++ lark-parser-0.7.0/tests/test_trees.py 2019-03-28 13:53:28.000000000 +0000 @@ -6,7 +6,7 @@ import pickle from lark.tree import Tree -from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args +from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard class TestTrees(TestCase): @@ -21,6 +21,17 @@ data = pickle.dumps(s) assert pickle.loads(data) == s + def test_iter_subtrees(self): + expected = [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z'), + Tree('a', [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')])] + nodes = list(self.tree1.iter_subtrees()) + self.assertEqual(nodes, expected) + + def test_iter_subtrees_topdown(self): + expected = [Tree('a', [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')]), + Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')] + nodes = list(self.tree1.iter_subtrees_topdown()) + self.assertEqual(nodes, expected) def test_interp(self): t = Tree('a', [Tree('b', []), Tree('c', []), 'd']) @@ -97,6 +108,63 @@ res = T().transform(t) self.assertEqual(res, 2.9) + 
def test_vargs(self): + @v_args() + class MyTransformer(Transformer): + @staticmethod + def integer(args): + return 1 # some code here + + @classmethod + def integer2(cls, args): + return 2 # some code here + + hello = staticmethod(lambda args: 'hello') + + x = MyTransformer().transform( Tree('integer', [2])) + self.assertEqual(x, 1) + x = MyTransformer().transform( Tree('integer2', [2])) + self.assertEqual(x, 2) + x = MyTransformer().transform( Tree('hello', [2])) + self.assertEqual(x, 'hello') + + def test_vargs_override(self): + t = Tree('add', [Tree('sub', [Tree('i', ['3']), Tree('f', ['1.1'])]), Tree('i', ['1'])]) + + @v_args(inline=True) + class T(Transformer): + i = int + f = float + sub = lambda self, a, b: a-b + + not_a_method = {'other': 'stuff'} + + @v_args(inline=False) + def add(self, values): + return sum(values) + + res = T().transform(t) + self.assertEqual(res, 2.9) + + def test_discard(self): + class MyTransformer(Transformer): + def a(self, args): + return 1 # some code here + + def b(cls, args): + raise Discard() + + t = Tree('root', [ + Tree('b', []), + Tree('a', []), + Tree('b', []), + Tree('c', []), + Tree('b', []), + ]) + t2 = Tree('root', [1, Tree('c', [])]) + + x = MyTransformer().transform( t ) + self.assertEqual(x, t2) if __name__ == '__main__': diff -Nru lark-parser-0.6.2/tox.ini lark-parser-0.7.0/tox.ini --- lark-parser-0.6.2/tox.ini 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/tox.ini 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,24 @@ +[tox] +envlist = py27, py34, py35, py36, pypy, pypy3 +skip_missing_interpreters=true + +[travis] +2.7 = py27 +3.4 = py34 +3.5 = py35 +3.6 = py36 +pypy = pypy +pypy3 = pypy3 + +[testenv] +whitelist_externals = git +deps = + -rnearley-requirements.txt + +# to always force recreation and avoid unexpected side effects +recreate=True + +commands= + git submodule sync -q + git submodule update --init + python -m tests {posargs} diff -Nru lark-parser-0.6.2/.travis.yml lark-parser-0.7.0/.travis.yml --- lark-parser-0.6.2/.travis.yml 1970-01-01 00:00:00.000000000 +0000 +++ lark-parser-0.7.0/.travis.yml 2019-03-28 13:53:28.000000000 +0000 @@ -0,0 +1,11 @@ +language: python +python: + - "2.7" + - "3.4" + - "3.5" + - "3.6" + - "pypy" # PyPy2 5.8.0 + - "pypy3" # Pypy3 5.8.0-beta0 +install: pip install tox-travis +script: + - tox