Skip to content

Commit

Permalink
Add scan chiral inversion method; add sidechain modelling to relaxati…
Browse files Browse the repository at this point in the history
…on; fix reading multiple trajectories into pdbdata
  • Loading branch information
rzhu committed Oct 7, 2024
1 parent 7968cfc commit 23949d3
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 33 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -275,3 +275,5 @@ ERROR*
LOG*
*.swp
*.swo

archive/
159 changes: 127 additions & 32 deletions src/molearn/analysis/analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
from copy import deepcopy
import numpy as np
import torch.optim
from pathlib import Path
from typing import Union

try:
# from modeller import *
Expand Down Expand Up @@ -87,11 +89,11 @@ def get_dataset(self, key):
"""
return self._datasets[key]

def set_dataset(self, key, data, atomselect="*"):
def set_dataset(self, key, data, atomselect="protein"):
"""
:param data: :func:`PDBData <molearn.data.PDBData>` object containing atomic coordinates
:param str key: label to be associated with data
:param list/str atomselect: list of atom names to load, or '*' to indicate that all atoms are loaded.
:param list/str atomselect: list of atom names to load, or 'protein' to indicate that all atoms are loaded.
"""
if isinstance(data, str) and data.endswith(".pdb"):
d = PDBData()
Expand Down Expand Up @@ -453,6 +455,24 @@ def _dope_score(self, frame, refine=True, **kwargs):

return self.dope_score_class.get_score(f * self.stdval, refine=refine, **kwargs)

def _ca_chirality(N, CA, C, CB):
"""
Calculate chirality of Cα atom in a protein residue.
:param N: Cartesian coordinates of N atom
:param CA: Cartesian coordinates of CA atom
:param C: Cartesian coordinates of C atom
:param CB: Cartesian coordinates of CB atom
:return: dot product of normal vector to the plane defined by N, CA, and C
"""
ca_c = C - CA
ca_cb = CB - CA
ca_n = N - CA
normal = np.cross(ca_n, ca_c)
dot_product = np.dot(normal, ca_cb)
# L if dot_product > 0 else D
return dot_product

def get_all_ramachandran_score(self, tensor):
"""
Calculate Ramachandran score of an ensemble of atomic conrdinates.
Expand Down Expand Up @@ -560,6 +580,57 @@ def scan_ramachandran(self):

return self.surfaces["Ramachandran_favored"], self.xvals, self.yvals

def scan_ca_chirality(self):
"""
Calculate chiralities of Cα atoms on a grid sampling the latent space.
Requires a grid system to be defined via a prior call to :func:`set_dataset <molearn.analysis.MolearnAnalysis.setup_grid>`.
Requires the atom selection includes Cα atoms.
Saves a surface in memory, with key 'CA_chirality'.
:return: Number of CA inversions on the latent space NxN surface
:return: x-axis values
:return: y-axis values
"""
assert (
set(['CA','C','N','CB']).issubset(set(self.atoms))
), "Atom selection shoud at least include CA, C, N, and CB"

key = "Chirality"
if key not in self.surfaces:
assert (
"grid" in self._encoded
), "make sure to call MolearnAnalysis.setup_grid first"
decoded = self.get_decoded("grid")

mol_df = self.mol.data

# Get atom indices
indices = dict()
for resid in range(len(mol_df.resid.unique())):
resname = mol_df[mol_df['resid'] == resid].resname.unique()[0]
if not resname == 'GLY':
N_id = mol_df[(mol_df['resid'] == resid) & (mol_df['name'] == 'N')].index[0]
C_id = mol_df[(mol_df['resid'] == resid) & (mol_df['name'] == 'C')].index[0]
CA_id = mol_df[(mol_df['resid'] == resid) & (mol_df['name'] == 'CA')].index[0]
CB_id = mol_df[(mol_df['resid'] == resid) & (mol_df['name'] == 'CB')].index[0]
indices[resname + str(resid)] = (N_id, CA_id, C_id, CB_id)

results = []
for i, j in enumerate(decoded):
s = (j.view(1, 3, -1).permute(0, 2, 1) * self.stdval).numpy()
inversions = {}
for k, v in indices.items():
dot_prod = self._ca_chirality(s[0,v[0],:],
s[0,v[1],:],
s[0,v[2],:],
s[0,v[3],:])
if dot_prod < 0:
inversions[k] = dot_prod
results.append(len(inversions))
self.surfaces[key] = np.array(results).reshape(self.n_samples, self.n_samples)

return self.surfaces[key], self.xvals, self.yvals

def scan_custom(self, fct, params, key):
"""
Generate a surface coloured as a function of a user-defined function.
Expand All @@ -580,43 +651,67 @@ def scan_custom(self, fct, params, key):

return self.surfaces[key], self.xvals, self.yvals

def _relax(self, pdb_path: str, mini_path: str) -> None:
"""
relax generated structure
:param str pdb_path: path of the pdb file to relax
:param str mini_path: path where the new relaxed structure should be saved
"""
# read pdb
pdb = PDBFile(pdb_path)
# preparation
forcefield = ForceField("amber99sb.xml")
modeller = Modeller(pdb.topology, pdb.positions)
modeller.addHydrogens(forcefield)
system = forcefield.createSystem(modeller.topology, nonbondedMethod=NoCutoff)
integrator = VerletIntegrator(0.001 * picoseconds)
simulation = Simulation(modeller.topology, system, integrator)
simulation.context.setPositions(modeller.positions)
# minimize protein
simulation.minimizeEnergy(maxIterations=100)
positions = simulation.context.getState(getPositions=True).getPositions()
# write new pdb file
PDBFile.writeFile(simulation.topology, positions, open(mini_path, "w+"))

def _relax(self, pdb_file: Union[str, Path], out_path: Union[str, Path], maxIterations: int = 1000) -> None:
"""
Model the sidechains and relax generated structure
:param str pdb_file: path to the pdb file generated by the model
:param str out_path: path where the modelled/relaxed structures are be saved
"""

if not isinstance(pdb_file, Path):
pdb_file = Path(pdb_file)
if not isinstance(out_path, Path):
out_path = Path(out_path)

# Assume sidechain modelling is required if the number of selected atoms is less than 6
if len(self.atoms) < 6:
modelled_file = out_path/(pdb_file.stem + "_modelled.pdb")
try:
env = Environ()
env.libs.topology.read(file='$(LIB)/top_heav.lib')
env.libs.parameters.read(file='$(LIB)/par.lib')

mdl = complete_pdb(env, str(pdb_file))
mdl.write(str(modelled_file))
pdb_file = modelled_file
except:
print(f'Failed to model {pdb_file}')
try:
relaxed_file = out_path/(pdb_file.stem + "_relaxed.pdb")
# Read pdb
pdb = PDBFile(pdb_file)
# Add hydrogens
forcefield = ForceField("amber99sb.xml")
modeller = Modeller(pdb.topology, pdb.positions)
modeller.addHydrogens(forcefield)

system = forcefield.createSystem(modeller.topology, nonbondedMethod=NoCutoff)
integrator = VerletIntegrator(0.001 * picoseconds)
simulation = Simulation(modeller.topology, system, integrator)
simulation.context.setPositions(modeller.positions)
# Energy minimization
simulation.minimizeEnergy(maxIterations=maxIterations)
positions = simulation.context.getState(getPositions=True).getPositions()
# Write energy minimized file
PDBFile.writeFile(simulation.topology, positions, open(relaxed_file, "w+"))
except:
print(f'Failed to relax {pdb_file}')

def _pdb_file(
self,
prot_coords: np.ndarray[tuple[int, int], np.dtype[np.float64]],
outpath: str,
pdb_file: str,
) -> None:
"""
create pdb file for given coordinates
:param np.ndarray[tuple[int, int], np.dtype[np.float64]] prot_coords: coordinates of all atoms of a protein
:param str outpath: path where the pdb file should be stored
:param str pdb_file: path where the pdb file should be stored
"""
pdb_data = self.mol.data
with open(
outpath,
pdb_file,
"w+",
) as cfile:
for ck, k in enumerate(prot_coords):
Expand Down Expand Up @@ -661,13 +756,13 @@ def generate(
gen_prot_coords = s * self.stdval + self.meanval
# create pdb file
if pdb_path is not None:
for ci, i in enumerate(tqdm(gen_prot_coords, desc="Generating pdb file")):
struct_path = os.path.join(pdb_path, f"s{ci}.pdb")
self._pdb_file(i, struct_path)
for i, coord in enumerate(tqdm(gen_prot_coords, desc="Generating pdb files")):
struct_path = os.path.join(pdb_path, f"s{i}.pdb")
self._pdb_file(coord, struct_path)
# relax and save as new file
if relax:
self._relax(
struct_path, f"{os.path.splitext(struct_path)[0]}_relax.pdb"
struct_path, pdb_path, maxIterations=1000
)

return gen_prot_coords
Expand Down
2 changes: 1 addition & 1 deletion src/molearn/data/pdb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def import_pdb(self, filename: str | list[str], topology: str | None = None):
first_universe = mda.Universe(filename[0])
self._mol = mda.Universe(first_universe._topology, filename)
if isinstance(filename, list) and topology is not None:
first_universe = mda.Universe(topology[0], filename[0])
first_universe = mda.Universe(topology, filename[0])
self._mol = mda.Universe(first_universe._topology, filename)
elif topology is None:
self._mol = mda.Universe(filename)
Expand Down

0 comments on commit 23949d3

Please sign in to comment.