From 7d2df252751d7aefb07acc46c16cfae8303a95d7 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sat, 24 Feb 2024 11:39:54 -0800 Subject: [PATCH 1/5] Ensure optimization returns new grammar object --- CHANGELOG.md | 4 ++++ pe/_optimize.py | 23 +++++++++++------------ test/test__optimize.py | 13 +++++++++---- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e711175..c8ee158 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased][unreleased] +### Fixed + +* Optimization returns new grammar instead of mutating original + ## [v0.5.1][] diff --git a/pe/_optimize.py b/pe/_optimize.py index ad19b3e..21b83c5 100644 --- a/pe/_optimize.py +++ b/pe/_optimize.py @@ -128,22 +128,18 @@ def _common(defn): if len(ranges) == 1 and ranges[0][1] is None and not negated: defn = Literal(ranges[0][0]) - if op == SEQ: - _common_sequence(defn.args[0]) + elif op == SEQ: + defn = _common_sequence(defn) - if op == CHC: - _common_choice(defn.args[0]) - - # Sequence(x) -> x OR Choice(x) -> x - if op in (SEQ, CHC) and len(defn.args[0]) == 1: - defn = defn.args[0][0] - op = defn.op + elif op == CHC: + defn = _common_choice(defn) return defn -def _common_sequence(subdefs): +def _common_sequence(defn): i = 0 + subdefs = list(defn.args[0]) while i < len(subdefs) - 1: d = subdefs[i] # ![...] . -> [^...] @@ -163,16 +159,18 @@ def _common_sequence(subdefs): if j - i > 1: subdefs[i:j] = [Literal(''.join(x.args[0] for x in subdefs[i:j]))] i += 1 + return Sequence(*subdefs) -def _common_choice(subdefs): +def _common_choice(defn): i = 0 + subdefs = list(defn.args[0]) while i < len(subdefs) - 1: d = subdefs[i] # [..] / [..] -> [....] # [..] / "." -> [...] if (d.op == CLS and not d.args[1]) or (d.op == LIT and len(d.args[0]) == 1): - ranges = d.args[0] if d.op == CLS else [(d.args[0], None)] + ranges = list(d.args[0]) if d.op == CLS else [(d.args[0], None)] j = i + 1 while j < len(subdefs): d2 = subdefs[j] @@ -186,6 +184,7 @@ def _common_choice(subdefs): if j - i > 1: subdefs[i:j] = [Class(ranges)] i += 1 + return Choice(*subdefs) def _regex_dot(defn, defs, grpid): diff --git a/test/test__optimize.py b/test/test__optimize.py index d92cbd4..33af4f6 100644 --- a/test/test__optimize.py +++ b/test/test__optimize.py @@ -15,11 +15,16 @@ def gload(s, inline=False, common=False, regex=False): + _, original = loads(s) start, defmap = loads(s) - return optimize(Grammar(defmap, start=start), - inline=inline, - common=common, - regex=regex) + optimized = optimize( + Grammar(defmap, start=start), + inline=inline, + common=common, + regex=regex + ) + assert original == defmap + return optimized def iload(s): From 71d7f159ee9be3ce0dc5e46575b8442fe5d64f89 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sat, 24 Feb 2024 11:42:01 -0800 Subject: [PATCH 2/5] Optimization unions choice of char classes Fixes #44 --- CHANGELOG.md | 1 + pe/_optimize.py | 6 +++++- test/test__optimize.py | 20 +++++++++++++++----- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c8ee158..e9f2c90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Fixed * Optimization returns new grammar instead of mutating original +* Make union of choice of character classes ## [v0.5.1][] diff --git a/pe/_optimize.py b/pe/_optimize.py index 21b83c5..773de93 100644 --- a/pe/_optimize.py +++ b/pe/_optimize.py @@ -182,11 +182,15 @@ def _common_choice(defn): break j += 1 if j - i > 1: - subdefs[i:j] = [Class(ranges)] + subdefs[i:j] = [Class(sorted(set(ranges), key=_range_sort_key))] i += 1 return Choice(*subdefs) +def _range_sort_key(range: list[tuple[str, str|None]]): + """Ensure single hyphen characters are the first.""" + return (range != ("-", None), range) + def _regex_dot(defn, defs, grpid): return Regex('(?s:.)') diff --git a/test/test__optimize.py b/test/test__optimize.py index 33af4f6..0c4d8b4 100644 --- a/test/test__optimize.py +++ b/test/test__optimize.py @@ -72,12 +72,13 @@ def test_common(): gload(r'A <- "a"')) assert (cload(r'A <- !"a"') == gload(r'A <- !"a"')) - assert (cload(r'A <- !"a"') == - gload(r'A <- !"a"')) # single-char classes to literals assert (cload(r'A <- [a]') == gload(r'A <- "a"')) - # but not single-range + # but not multi-char class + assert (cload(r'A <- [ab]') == + gload(r'A <- [ab]')) + # and not ranges assert (cload(r'A <- [a-c]') == gload(r'A <- [a-c]')) # add "b" to avoid dropping the sequence @@ -91,15 +92,24 @@ def test_common(): # sequence of literals to literal assert (cload(r'A <- "a" "bc" "d"') == gload(r'A <- "abcd"')) - # but not sequence with classes + # or sequence of literals or single-char classes + assert (cload(r'A <- "a" [b] "c"') == + gload(r'A <- "abc"')) + # but not sequence with multi-char classes assert (cload(r'A <- "a" [bc] "d"') == gload(r'A <- "a" [bc] "d"')) - # choice of classes or single-char literals + # choice of classes + assert (cload(r'A <- [ab] / [bc]') == + gload(r'A <- [abc]')) + # or choice of classes or single-char literals assert (cload(r'A <- [ab] / "m" / [yz]') == gload(r'A <- [abmyz]')) # not negated classes though assert (cload(r'A <- (![ab] .) / "m" / [yz]') == grm({'A': Choice(Class('ab', negate=True), Class('myz'))})) + # hyphen characters are moved to start of class + assert (cload(r'A <- [(-,] / [-.]') == + gload(r'A <- [-(-,.]')) def test_regex(): From 8ecd1ac2ce4616060f89cbee8a0937485fdb81c0 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sat, 24 Feb 2024 11:45:14 -0800 Subject: [PATCH 3/5] Raise on machine parser errors with Flag.STRICT Without a memo, the error message does not provide context yet. Part of #44. --- CHANGELOG.md | 6 ++++-- pe/_cy_machine.pyx | 4 +++- pe/_py_machine.py | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9f2c90..1683553 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,9 @@ ### Fixed -* Optimization returns new grammar instead of mutating original -* Make union of choice of character classes +* Optimization returns new grammar instead of mutating original ([#44]) +* Make union of choice of character classes ([#44]) +* `Flag.STRICT` now raises parsing errors in machine parser ## [v0.5.1][] @@ -191,3 +192,4 @@ descent parser and a work-in-progress state-machine parser. [#31]: https://github.com/goodmami/pe/issues/31 [#36]: https://github.com/goodmami/pe/issues/36 [#38]: https://github.com/goodmami/pe/issues/38 +[#44]: https://github.com/goodmami/pe/issues/44 diff --git a/pe/_cy_machine.pyx b/pe/_cy_machine.pyx index 787d87a..e7b0c99 100644 --- a/pe/_cy_machine.pyx +++ b/pe/_cy_machine.pyx @@ -13,7 +13,7 @@ from enum import IntEnum from cpython.mem cimport PyMem_Malloc, PyMem_Free from pe._constants import Operator, Flag, FAIL as FAILURE -from pe._errors import Error +from pe._errors import Error, ParseError from pe._match import Match from pe._types import Memo from pe._definition import Definition @@ -176,6 +176,8 @@ class MachineParser(Parser): idx = self._index[self.start] end = self._parser.match(idx, s, pos, args, kwargs, memo) if end < 0: + if flags & Flag.STRICT: + raise ParseError() return None else: return Match( diff --git a/pe/_py_machine.py b/pe/_py_machine.py index 153dbe9..0c261c3 100644 --- a/pe/_py_machine.py +++ b/pe/_py_machine.py @@ -11,7 +11,7 @@ import re from pe._constants import FAIL as FAILURE, Operator, Flag -from pe._errors import Error +from pe._errors import Error, ParseError from pe._match import Match from pe._types import Memo from pe._definition import Definition @@ -132,6 +132,8 @@ def match(self, idx = self._index[self.start] end = _match(self.pi, idx, s, pos, args, kwargs, memo) if end < 0: + if flags & flags.STRICT: + raise ParseError() return None else: return Match( From dc313e2e8d1839e62ca67b1a702fb58dda170c83 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 28 Mar 2024 21:00:13 -0700 Subject: [PATCH 4/5] Resolve linting/typing errors --- pe/_optimize.py | 3 ++- pe/_py_machine.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pe/_optimize.py b/pe/_optimize.py index 773de93..11a98a9 100644 --- a/pe/_optimize.py +++ b/pe/_optimize.py @@ -187,10 +187,11 @@ def _common_choice(defn): return Choice(*subdefs) -def _range_sort_key(range: list[tuple[str, str|None]]): +def _range_sort_key(range): """Ensure single hyphen characters are the first.""" return (range != ("-", None), range) + def _regex_dot(defn, defs, grpid): return Regex('(?s:.)') diff --git a/pe/_py_machine.py b/pe/_py_machine.py index 0c261c3..3c3e803 100644 --- a/pe/_py_machine.py +++ b/pe/_py_machine.py @@ -132,7 +132,7 @@ def match(self, idx = self._index[self.start] end = _match(self.pi, idx, s, pos, args, kwargs, memo) if end < 0: - if flags & flags.STRICT: + if flags & Flag.STRICT: raise ParseError() return None else: From 4b8e4059be34e4cf30e72e14f58c77401a64afe2 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 28 Mar 2024 21:05:38 -0700 Subject: [PATCH 5/5] Bump version to 0.5.2 --- CHANGELOG.md | 6 ++++++ pe/_meta.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1683553..3f98323 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ ## [Unreleased][unreleased] + +## [v0.5.2][] + +**Release date: 2024-03-28** + ### Fixed * Optimization returns new grammar instead of mutating original ([#44]) @@ -177,6 +182,7 @@ descent parser and a work-in-progress state-machine parser. [v0.4.0]: ../../releases/tag/v0.4.0 [v0.5.0]: ../../releases/tag/v0.5.0 [v0.5.1]: ../../releases/tag/v0.5.1 +[v0.5.2]: ../../releases/tag/v0.5.2 [#6]: https://github.com/goodmami/pe/issues/6 [#7]: https://github.com/goodmami/pe/issues/7 diff --git a/pe/_meta.py b/pe/_meta.py index 9e621aa..559ba39 100644 --- a/pe/_meta.py +++ b/pe/_meta.py @@ -2,4 +2,4 @@ Meta-information about pe. """ -__version__ = '0.5.1' +__version__ = '0.5.2'