diff --git a/CHANGELOG.md b/CHANGELOG.md index e711175..3f98323 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,17 @@ ## [Unreleased][unreleased] +## [v0.5.2][] + +**Release date: 2024-03-28** + +### Fixed + +* Optimization returns new grammar instead of mutating original ([#44]) +* Make union of choice of character classes ([#44]) +* `Flag.STRICT` now raises parsing errors in machine parser + + ## [v0.5.1][] **Release date: 2023-12-31** @@ -171,6 +182,7 @@ descent parser and a work-in-progress state-machine parser. [v0.4.0]: ../../releases/tag/v0.4.0 [v0.5.0]: ../../releases/tag/v0.5.0 [v0.5.1]: ../../releases/tag/v0.5.1 +[v0.5.2]: ../../releases/tag/v0.5.2 [#6]: https://github.com/goodmami/pe/issues/6 [#7]: https://github.com/goodmami/pe/issues/7 @@ -186,3 +198,4 @@ descent parser and a work-in-progress state-machine parser. [#31]: https://github.com/goodmami/pe/issues/31 [#36]: https://github.com/goodmami/pe/issues/36 [#38]: https://github.com/goodmami/pe/issues/38 +[#44]: https://github.com/goodmami/pe/issues/44 diff --git a/pe/_cy_machine.pyx b/pe/_cy_machine.pyx index 787d87a..e7b0c99 100644 --- a/pe/_cy_machine.pyx +++ b/pe/_cy_machine.pyx @@ -13,7 +13,7 @@ from enum import IntEnum from cpython.mem cimport PyMem_Malloc, PyMem_Free from pe._constants import Operator, Flag, FAIL as FAILURE -from pe._errors import Error +from pe._errors import Error, ParseError from pe._match import Match from pe._types import Memo from pe._definition import Definition @@ -176,6 +176,8 @@ class MachineParser(Parser): idx = self._index[self.start] end = self._parser.match(idx, s, pos, args, kwargs, memo) if end < 0: + if flags & Flag.STRICT: + raise ParseError() return None else: return Match( diff --git a/pe/_meta.py b/pe/_meta.py index 9e621aa..559ba39 100644 --- a/pe/_meta.py +++ b/pe/_meta.py @@ -2,4 +2,4 @@ Meta-information about pe. """ -__version__ = '0.5.1' +__version__ = '0.5.2' diff --git a/pe/_optimize.py b/pe/_optimize.py index ad19b3e..11a98a9 100644 --- a/pe/_optimize.py +++ b/pe/_optimize.py @@ -128,22 +128,18 @@ def _common(defn): if len(ranges) == 1 and ranges[0][1] is None and not negated: defn = Literal(ranges[0][0]) - if op == SEQ: - _common_sequence(defn.args[0]) + elif op == SEQ: + defn = _common_sequence(defn) - if op == CHC: - _common_choice(defn.args[0]) - - # Sequence(x) -> x OR Choice(x) -> x - if op in (SEQ, CHC) and len(defn.args[0]) == 1: - defn = defn.args[0][0] - op = defn.op + elif op == CHC: + defn = _common_choice(defn) return defn -def _common_sequence(subdefs): +def _common_sequence(defn): i = 0 + subdefs = list(defn.args[0]) while i < len(subdefs) - 1: d = subdefs[i] # ![...] . -> [^...] @@ -163,16 +159,18 @@ def _common_sequence(subdefs): if j - i > 1: subdefs[i:j] = [Literal(''.join(x.args[0] for x in subdefs[i:j]))] i += 1 + return Sequence(*subdefs) -def _common_choice(subdefs): +def _common_choice(defn): i = 0 + subdefs = list(defn.args[0]) while i < len(subdefs) - 1: d = subdefs[i] # [..] / [..] -> [....] # [..] / "." -> [...] if (d.op == CLS and not d.args[1]) or (d.op == LIT and len(d.args[0]) == 1): - ranges = d.args[0] if d.op == CLS else [(d.args[0], None)] + ranges = list(d.args[0]) if d.op == CLS else [(d.args[0], None)] j = i + 1 while j < len(subdefs): d2 = subdefs[j] @@ -184,8 +182,14 @@ def _common_choice(subdefs): break j += 1 if j - i > 1: - subdefs[i:j] = [Class(ranges)] + subdefs[i:j] = [Class(sorted(set(ranges), key=_range_sort_key))] i += 1 + return Choice(*subdefs) + + +def _range_sort_key(range): + """Ensure single hyphen characters are the first.""" + return (range != ("-", None), range) def _regex_dot(defn, defs, grpid): diff --git a/pe/_py_machine.py b/pe/_py_machine.py index 153dbe9..3c3e803 100644 --- a/pe/_py_machine.py +++ b/pe/_py_machine.py @@ -11,7 +11,7 @@ import re from pe._constants import FAIL as FAILURE, Operator, Flag -from pe._errors import Error +from pe._errors import Error, ParseError from pe._match import Match from pe._types import Memo from pe._definition import Definition @@ -132,6 +132,8 @@ def match(self, idx = self._index[self.start] end = _match(self.pi, idx, s, pos, args, kwargs, memo) if end < 0: + if flags & Flag.STRICT: + raise ParseError() return None else: return Match( diff --git a/test/test__optimize.py b/test/test__optimize.py index d92cbd4..0c4d8b4 100644 --- a/test/test__optimize.py +++ b/test/test__optimize.py @@ -15,11 +15,16 @@ def gload(s, inline=False, common=False, regex=False): + _, original = loads(s) start, defmap = loads(s) - return optimize(Grammar(defmap, start=start), - inline=inline, - common=common, - regex=regex) + optimized = optimize( + Grammar(defmap, start=start), + inline=inline, + common=common, + regex=regex + ) + assert original == defmap + return optimized def iload(s): @@ -67,12 +72,13 @@ def test_common(): gload(r'A <- "a"')) assert (cload(r'A <- !"a"') == gload(r'A <- !"a"')) - assert (cload(r'A <- !"a"') == - gload(r'A <- !"a"')) # single-char classes to literals assert (cload(r'A <- [a]') == gload(r'A <- "a"')) - # but not single-range + # but not multi-char class + assert (cload(r'A <- [ab]') == + gload(r'A <- [ab]')) + # and not ranges assert (cload(r'A <- [a-c]') == gload(r'A <- [a-c]')) # add "b" to avoid dropping the sequence @@ -86,15 +92,24 @@ def test_common(): # sequence of literals to literal assert (cload(r'A <- "a" "bc" "d"') == gload(r'A <- "abcd"')) - # but not sequence with classes + # or sequence of literals or single-char classes + assert (cload(r'A <- "a" [b] "c"') == + gload(r'A <- "abc"')) + # but not sequence with multi-char classes assert (cload(r'A <- "a" [bc] "d"') == gload(r'A <- "a" [bc] "d"')) - # choice of classes or single-char literals + # choice of classes + assert (cload(r'A <- [ab] / [bc]') == + gload(r'A <- [abc]')) + # or choice of classes or single-char literals assert (cload(r'A <- [ab] / "m" / [yz]') == gload(r'A <- [abmyz]')) # not negated classes though assert (cload(r'A <- (![ab] .) / "m" / [yz]') == grm({'A': Choice(Class('ab', negate=True), Class('myz'))})) + # hyphen characters are moved to start of class + assert (cload(r'A <- [(-,] / [-.]') == + gload(r'A <- [-(-,.]')) def test_regex():