Support cp2k cell parsing in npt log file

robinzyb · Oct 5, 2023 · 718e4d1 · 718e4d1
2 parents d0d41c1 + 3f266d4
commit 718e4d1
Show file tree

Hide file tree

Showing 31 changed files with 9,256 additions and 88 deletions.
diff --git a/.github/release.yml b/.github/release.yml
@@ -0,0 +1,10 @@
+# .github/release.yml
+
+changelog:
+  categories:
+    - title: New Features
+      labels:
+        - New-Feature
+    - title: Other Changes
+      labels:
+        - "*"
diff --git a/README.md b/README.md
@@ -4,6 +4,7 @@
 [![Python package](https://github.com/robinzyb/cp2kdata/actions/workflows/ci.yml/badge.svg)](https://github.com/robinzyb/cp2kdata/actions/workflows/ci.yml)[![Coverage Status](https://coveralls.io/repos/github/robinzyb/cp2kdata/badge.svg)](https://coveralls.io/github/robinzyb/cp2kdata)
 ![pythonv](https://img.shields.io/pypi/pyversions/cp2kdata)
 ![pypiv](https://img.shields.io/pypi/v/cp2kdata)
+![PyPI - pip install](https://img.shields.io/pypi/dm/cp2kdata?logo=pypi&label=pip%20install)
 
 Python Package to postprocess cp2k data.
 

diff --git a/cp2kdata/block_parser/cells.py b/cp2kdata/block_parser/cells.py
@@ -1,5 +1,6 @@
 import regex as re
 import numpy as np
+from ase.geometry.cell import cellpar_to_cell
 
 ALL_CELL_RE = re.compile(
     r"""
@@ -40,3 +41,39 @@ def parse_all_cells(output_file):
         return np.array(all_cells, dtype=float)
     else:
         return None
+
+
+ALL_MD_CELL_RE = re.compile(
+    r"""
+    #parse cell lengths
+    \sMD\|\sCell\slengths\s\[ang\]\s{9}
+    \s{2}(?P<a>\d\.\d{8}E(\+|\-)\d{2})
+    \s{2}(?P<b>\d\.\d{8}E(\+|\-)\d{2})
+    \s{2}(?P<c>\d\.\d{8}E(\+|\-)\d{2})
+    \n
+    #skip two lines
+    (.{80}\n){2}
+    #parse angles
+    \sMD\|\sCell\sangles\s\[deg\]\s{10} 
+    \s{2}(?P<alpha>\d\.\d{8}E(\+|\-)\d{2})
+    \s{2}(?P<beta>\d\.\d{8}E(\+|\-)\d{2})
+    \s{2}(?P<gamma>\d\.\d{8}E(\+|\-)\d{2})
+    """,
+    re.VERBOSE
+)
+
+def parse_all_md_cells(output_file):
+    # notice that the cell of step 0 is not included
+    all_md_cells = []
+    for match in ALL_MD_CELL_RE.finditer(output_file):
+        #print(match)
+        cell = [match["a"], match["b"], match["c"],
+             match["alpha"], match["beta"], match["gamma"]]
+        cell = np.array(cell, dtype=float)
+        cell = cellpar_to_cell(cell)
+        all_md_cells.append(cell)
+
+    if all_md_cells:
+        return np.array(all_md_cells, dtype=float)
+    else:
+        return None
diff --git a/cp2kdata/block_parser/geo_opt.py b/cp2kdata/block_parser/geo_opt.py
@@ -50,7 +50,7 @@
 )
 
 
-def parse_geo_opt(output_file) -> float:
+def parse_geo_opt_info(output_file) -> float:
     geo_opt_info = []
 
     for match in GEO_OPT_INFO_FIRST_RE.finditer(output_file):

diff --git a/cp2kdata/block_parser/header_info.py b/cp2kdata/block_parser/header_info.py
@@ -1,5 +1,6 @@
 import regex as re
 from dataclasses import dataclass
+# use monty.re because it can terminate on match
 from monty.re import regrep
 
 
@@ -9,7 +10,7 @@ class Cp2kInfo:
 
 CP2K_INFO_VERSION_PATTERN = \
     r"""(?xm)
-    ^\sCP2K\|\sversion\sstring:\s{20,42}
+    ^\sCP2K\|\sversion\sstring:\s{10,42}
     CP2K\sversion\s(?P<version>\d{1,4}\.\d)(?:\s\(Development\sVersion\))?$
     """
 
@@ -83,8 +84,34 @@ def parse_dft_info(filename) -> DFTInfo:
         patterns={
             "ks_type": DFT_INFO_KS_TYPE_PATTERN,
             "multiplicity": DFT_INFO_MULTIPLICITY_PATTERN
+        },
+        terminate_on_match=True
+        )
+
+    return DFTInfo(ks_type=dft_info["ks_type"][0][0][0], multiplicity=dft_info["multiplicity"][0][0][0])
+
+
+
+
+@dataclass
+class MDInfo:
+    ensemble_type: str = None
+
+# PATTERNS
+MD_INFO_ENSEMBLE_TYPE_PATTERN = \
+    r"""(?xm)
+    ^\s(?:MD_PAR|MD)\|\sEnsemble\s(?:t|T)ype\s{39,60}(?P<ensemble_type>\w{3,16})
+    """
+
+def parse_md_info(filename):
+    md_info = {}
+
+    md_info = regrep(
+        filename=filename, 
+        patterns={
+            "ensemble_type": MD_INFO_ENSEMBLE_TYPE_PATTERN
         }, 
         terminate_on_match=True
         )
 
-    return DFTInfo(ks_type=dft_info["ks_type"][0][0][0], multiplicity=dft_info["multiplicity"][0][0][0])
+    return MDInfo(ensemble_type=md_info["ensemble_type"][0][0][0])
diff --git a/cp2kdata/block_parser/md_xyz.py b/cp2kdata/block_parser/md_xyz.py
@@ -8,12 +8,12 @@
 )
 
 def parse_md_ener(ener_file):
-    print(f"Obtian Energies From {ener_file}")
+    print(f"Parsing Energies From {ener_file}")
     energies_list = np.loadtxt(ener_file, usecols=4, ndmin=1, dtype=np.float64)
     return energies_list
 
 def parse_pos_xyz(posxyz_file):
-    print(f"Obtian Structures From {posxyz_file}")
+    print(f"Parsing Structures From {posxyz_file}")
     fp = zopen(posxyz_file, "r")
     lines = fp.readlines()
     energies_list = []
@@ -36,7 +36,7 @@ def parse_pos_xyz(posxyz_file):
     return pos_list, energies_list, chemical_symbols
 
 def parse_frc_xyz(frcxyz_file):
-    print(f"Obtian Froces From {frcxyz_file}")
+    print(f"Parsing Froces From {frcxyz_file}")
     fp = zopen(frcxyz_file, "r")
     lines = fp.readlines()
     force_list = []
@@ -57,7 +57,7 @@ def parse_frc_xyz(frcxyz_file):
 
 #NOTE: incomplete function, do not release!
 def parse_pos_xyz_from_wannier(wannier_xyz_fiel):
-    print(f"Obtian Structures From {wannier_xyz_fiel}")
+    print(f"Parsing Structures From {wannier_xyz_fiel}")
     fp = zopen(wannier_xyz_fiel, "r")
     lines = fp.readlines()
     force_list = []
@@ -79,7 +79,7 @@ def parse_pos_xyz_from_wannier(wannier_xyz_fiel):
     return force_list
 
 def parse_md_stress(stress_file):
-    print(f"Obtian Stresses From {stress_file}")
+    print(f"Parsing Stresses From {stress_file}")
     stresses_list = np.loadtxt(
         stress_file, 
         usecols=(2, 3, 4, 5, 6, 7, 8, 9, 10), 

diff --git a/cp2kdata/dpdata_plugin.py b/cp2kdata/dpdata_plugin.py
@@ -81,9 +81,10 @@ def from_labeled_system(self, file_name, **kwargs):
 
         if cells is None:
             if cp2kmd.filename:
-                cells = cp2kmd.get_init_cell()
-                cells = cells[np.newaxis, :, :]
-                cells = np.repeat(cells, repeats=num_frames, axis=0)
+                # cells = cp2kmd.get_init_cell()
+                # cells = cells[np.newaxis, :, :]
+                # cells = np.repeat(cells, repeats=num_frames, axis=0)
+                cells = cp2kmd.get_all_cells()
             else:
                 print("No cell information, please check if your inputs are correct.")
         elif isinstance(cells, np.ndarray):
@@ -168,9 +169,10 @@ def from_labeled_system(self, file_name, **kwargs):
 
         if cells is None:
             if cp2kmd.filename:
-                cells = cp2kmd.get_init_cell()
-                cells = cells[np.newaxis, :, :]
-                cells = np.repeat(cells, repeats=num_frames, axis=0)
+                # cells = cp2kmd.get_init_cell()
+                # cells = cells[np.newaxis, :, :]
+                # cells = np.repeat(cells, repeats=num_frames, axis=0)
+                cells = cp2kmd.get_all_cells()
             else:
                 print("No cell information, please check if your inputs are correct.")
         elif isinstance(cells, np.ndarray):

diff --git a/cp2kdata/output.py b/cp2kdata/output.py
@@ -1,4 +1,3 @@
-from ast import parse
 import sys
 import time
 import numpy as np
@@ -66,7 +65,7 @@ def __init__(self, output_file=None, run_type: str=None, path_prefix=".", **kwar
             with open(self.filename, 'r') as fp:
                 self.output_file = fp.read()
             self.cp2k_info = parse_cp2k_info(self.filename)
-            self.DFTInfo = parse_dft_info(self.filename)
+            self.dft_info = parse_dft_info(self.filename)
         else:
             self.cp2k_info = Cp2kInfo(version="Unknown")
 
@@ -105,7 +104,7 @@ def __init__(self, output_file=None, run_type: str=None, path_prefix=".", **kwar
         #     self.output_file)
 
         # self.mulliken_pop_list = parse_mulliken_pop_list(
-        #     self.output_file, self.DFTInfo)
+        #     self.output_file, self.dft_info)
         # self.hirshfeld_pop_list = parse_hirshfeld_pop_list(self.output_file)
         # self.dft_plus_u_occ = parse_dft_plus_u_occ(self.output_file)
 
@@ -268,7 +267,7 @@ def parse_energy_force(self):
             "atomic_kinds": parse_atomic_kinds,
             "cells": parse_all_cells
         }
-        # convert kwargs to flexible attribute! 
+        #TODO: convert kwargs to flexible attribute! 
         self.geo_opt_info = None
         self.num_frames = 1
         self.init_atomic_coordinates, self.atom_kind_list, self.chemical_symbols = parse_init_atomic_coordinates(
@@ -283,8 +282,10 @@ def parse_energy_force(self):
     def parse_geo_opt(self):
         self.init_atomic_coordinates, self.atom_kind_list, self.chemical_symbols = parse_init_atomic_coordinates(
             self.output_file)
-        self.geo_opt_info = parse_geo_opt(self.output_file)
+        self.geo_opt_info = parse_geo_opt_info(self.output_file)
         self.num_frames = len(self.geo_opt_info)
+        self.atomic_forces_list = parse_atomic_forces_list(self.output_file)
+        self.stress_tensor_list = parse_stress_tensor_list(self.output_file)
 
     def parse_cell_opt(self):
         # initial information
@@ -305,15 +306,9 @@ def parse_cell_opt(self):
         self.num_frames = len(self.energies_list)
 
     def parse_md(self):
-
-        if self.filename:
-            self.all_cells = parse_all_cells(self.output_file)
-            print(f"You are reading cell information from {self.filename}")
-            self.init_atomic_coordinates, self.atom_kind_list, self.chemical_symbols = parse_init_atomic_coordinates(
-            self.output_file)
-            self.atomic_kind = parse_atomic_kinds(self.output_file)
+        self.md_info = parse_md_info(self.filename)
+        self.check_md_type(md_type=self.md_info.ensemble_type)
 
-
         ener_file_list = glob.glob(os.path.join(self.path_prefix, "*.ener"))
         if ener_file_list:
             self.energies_list = parse_md_ener(ener_file_list[0])
@@ -324,9 +319,13 @@ def parse_md(self):
 
             if not hasattr(self, "energies_list"):
                 self.energies_list = energies_list_from_pos
+
         frc_xyz_file_list = glob.glob(os.path.join(self.path_prefix,"*frc*.xyz"))
         if frc_xyz_file_list:
             self.atomic_forces_list = parse_frc_xyz(frc_xyz_file_list[0])
+        else:
+            print(f"Parsing Forces from the CP2K output/log file: {self.filename}")
+            self.atomic_forces_list = parse_atomic_forces_list(self.output_file)
 
         stress_file_list = glob.glob(os.path.join(self.path_prefix,"*.stress"))
         if stress_file_list:
@@ -336,6 +335,35 @@ def parse_md(self):
 
         self.num_frames = len(self.energies_list)
 
+        # here parse cell information
+        if self.filename:
+
+            if (self.md_info.ensemble_type == "NVT") or \
+                (self.md_info.ensemble_type == "NVE") or \
+                (self.md_info.ensemble_type == "REFTRAJ"):
+                # parse the first cell
+                first_cell = parse_all_cells(self.output_file)
+                assert first_cell.shape == (1, 3, 3), \
+                    "cp2kdata obtains more than one cell from the output file, please check if your output file has duplicated header information."
+
+                self.all_cells = first_cell
+                self.all_cells = np.repeat(self.all_cells, repeats=self.num_frames, axis=0)
+
+            elif self.md_info.ensemble_type == "NPT_F":
+                # only parse the first cell
+                first_cell = parse_all_cells(self.output_file)
+                assert first_cell.shape == (1, 3, 3), \
+                    "cp2kdata obtains more than one cell from the output file, please check if your output file has duplicated header information."
+                # parse the rest of the cells
+                self.all_cells = parse_all_md_cells(self.output_file)
+                # prepend the first cell
+                self.all_cells = np.insert(self.all_cells, 0, first_cell[0], axis=0)
+
+            print(f"Parsing Cells Information from {self.filename}")
+            self.init_atomic_coordinates, self.atom_kind_list, self.chemical_symbols = parse_init_atomic_coordinates(
+            self.output_file)
+            self.atomic_kind = parse_atomic_kinds(self.output_file)
+
     @staticmethod
     def get_global_info(run_type=None, filename=None):
         if filename:
@@ -350,11 +378,19 @@ def get_global_info(run_type=None, filename=None):
     def check_run_type(run_type):
         implemented_run_type_parsers = \
             ["ENERGY_FORCE", "ENERGY", "MD", "GEO_OPT", "CELL_OPT"]
-        if run_type in implemented_run_type_parsers:
-            pass
-        else:
+        if run_type not in implemented_run_type_parsers:
             raise ValueError(
-                f"Parser for Run Type {run_type} Haven't Been Implemented Yet!"
+                f"Parser for Run Type {run_type} haven't been implemented yet!"
+                "Please contact the developer for more information."
                 )
 
+    @staticmethod
+    def check_md_type(md_type):
+        implemented_ensemble_type_parsers = \
+            ["NVE", "NVT", "NPT_F", "REFTRAJ"]
+        if md_type not in implemented_ensemble_type_parsers:
+            raise ValueError(
+                f"Parser for MD Type {md_type} haven't been implemented yet!\n"
+                "Please contact the developer for more information."
+            )
 
diff --git a/cp2kdata/paser_func.py b/cp2kdata/paser_func.py
@@ -1,13 +1,13 @@
-from .block_parser.dft_plus_u import parse_dft_plus_u_occ
-from .block_parser.forces import parse_atomic_forces_list
-from .block_parser.geo_opt import parse_geo_opt
-from .block_parser.header_info import parse_dft_info, parse_global_info, parse_cp2k_info
-from .block_parser.hirshfeld import parse_hirshfeld_pop_list
-from .block_parser.mulliken import parse_mulliken_pop_list
-from .block_parser.energies import parse_energies_list
-from .block_parser.coordinates import parse_init_atomic_coordinates
-from .block_parser.atomic_kind import parse_atomic_kinds
-from .block_parser.errors_handle import parse_errors
-from .block_parser.stress import parse_stress_tensor_list
-from .block_parser.cells import parse_all_cells
-from .block_parser.md_xyz import parse_md_ener, parse_pos_xyz, parse_frc_xyz, parse_md_stress
+from cp2kdata.block_parser.dft_plus_u import parse_dft_plus_u_occ
+from cp2kdata.block_parser.forces import parse_atomic_forces_list
+from cp2kdata.block_parser.geo_opt import parse_geo_opt_info
+from cp2kdata.block_parser.header_info import parse_dft_info, parse_global_info, parse_cp2k_info, parse_md_info
+from cp2kdata.block_parser.hirshfeld import parse_hirshfeld_pop_list
+from cp2kdata.block_parser.mulliken import parse_mulliken_pop_list
+from cp2kdata.block_parser.energies import parse_energies_list
+from cp2kdata.block_parser.coordinates import parse_init_atomic_coordinates
+from cp2kdata.block_parser.atomic_kind import parse_atomic_kinds
+from cp2kdata.block_parser.errors_handle import parse_errors
+from cp2kdata.block_parser.stress import parse_stress_tensor_list
+from cp2kdata.block_parser.cells import parse_all_cells, parse_all_md_cells
+from cp2kdata.block_parser.md_xyz import parse_md_ener, parse_pos_xyz, parse_frc_xyz, parse_md_stress
diff --git a/cp2kdata/pdos/__init__.py b/cp2kdata/pdos/__init__.py