From 71e175bff07d70d4265a5430a7e11f505af6813e Mon Sep 17 00:00:00 2001 From: William Rodriguez Date: Mon, 13 Feb 2023 12:07:50 -0600 Subject: [PATCH] Replace md5 hashing with sha256. Fixes #1250 --- lark/lark.py | 16 ++++++++-------- lark/load_grammar.py | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index c93e9e19..80528207 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -20,7 +20,7 @@ from .exceptions import ConfigurationError, assert_config, UnexpectedInput from .utils import Serialize, SerializeMemoizer, FS, isascii, logger -from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, md5_digest +from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest from .tree import Tree from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType @@ -288,7 +288,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: grammar = read() cache_fn = None - cache_md5 = None + cache_sha256 = None if isinstance(grammar, str): self.source_grammar = grammar if self.options.use_bytes: @@ -303,7 +303,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) from . import __version__ s = grammar + options_str + __version__ + str(sys.version_info[:2]) - cache_md5 = md5_digest(s) + cache_sha256 = sha256_digest(s) if isinstance(self.options.cache, str): cache_fn = self.options.cache @@ -319,7 +319,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: # specific reason - we just want a username. username = "unknown" - cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_md5, *sys.version_info[:2]) + cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_sha256, *sys.version_info[:2]) old_options = self.options try: @@ -328,9 +328,9 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: # Remove options that aren't relevant for loading from cache for name in (set(options) - _LOAD_ALLOWED_OPTIONS): del options[name] - file_md5 = f.readline().rstrip(b'\n') + file_sha256 = f.readline().rstrip(b'\n') cached_used_files = pickle.load(f) - if file_md5 == cache_md5.encode('utf8') and verify_used_files(cached_used_files): + if file_sha256 == cache_sha256.encode('utf8') and verify_used_files(cached_used_files): cached_parser_data = pickle.load(f) self._load(cached_parser_data, **options) return @@ -436,8 +436,8 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: logger.debug('Saving grammar to cache: %s', cache_fn) try: with FS.open(cache_fn, 'wb') as f: - assert cache_md5 is not None - f.write(cache_md5.encode('utf8') + b'\n') + assert cache_sha256 is not None + f.write(cache_sha256.encode('utf8') + b'\n') pickle.dump(used_files, f) self.save(f, _LOAD_ALLOWED_OPTIONS) except IOError as e: diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ce295e03..28d0b200 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -1314,7 +1314,7 @@ def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], alia except IOError: continue else: - h = md5_digest(text) + h = sha256_digest(text) if self.used_files.get(joined_path, h) != h: raise RuntimeError("Grammar file was changed during importing") self.used_files[joined_path] = h @@ -1393,7 +1393,7 @@ def verify_used_files(file_hashes): if text is None: # We don't know how to load the path. ignore it. continue - current = md5_digest(text) + current = sha256_digest(text) if old != current: logger.info("File %r changed, rebuilding Parser" % path) return False @@ -1411,13 +1411,13 @@ def load_grammar(grammar, source, import_paths, global_keep_all_tokens): return builder.build(), builder.used_files -def md5_digest(s: str) -> str: - """Get the md5 digest of a string +def sha256_digest(s: str) -> str: + """Get the sha256 digest of a string Supports the `usedforsecurity` argument for Python 3.9+ to allow running on a FIPS-enabled system. """ if sys.version_info >= (3, 9): - return hashlib.md5(s.encode('utf8'), usedforsecurity=False).hexdigest() + return hashlib.sha256(s.encode('utf8'), usedforsecurity=False).hexdigest() else: - return hashlib.md5(s.encode('utf8')).hexdigest() + return hashlib.sha256(s.encode('utf8')).hexdigest()