File format independent data loading #20

Merged · 8 commits · Jun 6, 2024

31 changes: 31 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,31 @@
name: Tests

on: [push]

jobs:
  build-linux:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.8
        uses: actions/setup-python@v3
        with:
          python-version: '3.8'
      - name: Add conda to system path
        run: |
          # $CONDA is an environment variable pointing to the root of the miniconda directory
          echo $CONDA/bin >> $GITHUB_PATH
      - name: Install dependencies
        run: |
          conda install -y python=3.8
          conda env update --file environment.yml --name base
      - name: Test
        run: |
          cd test
          python test_openmm_plugin.py
          python test_pdbdata.py
          python test_trainer.py

2 changes: 1 addition & 1 deletion README.md
@@ -24,6 +24,7 @@ The current version of molearn only supports Linux, and has been verified to support
* numpy
* PyTorch (1.7+)
* [Biobox](https://github.com/Degiacomi-Lab/biobox)
* [MDAnalysis](https://www.mdanalysis.org/)

#### Optional Packages

@@ -42,7 +43,6 @@ To calculate DOPE and Ramachandran scores during analysis:
* [cctbx](https://cctbx.github.io/)

To run the GUI:
* [MDAnalysis](https://www.mdanalysis.org/)
* [plotly](https://plotly.com/python/)
* [NGLView](http://nglviewer.org/nglview/latest/)

1 change: 1 addition & 0 deletions examples/README.md
@@ -10,6 +10,7 @@ https://journals.aps.org/prx/abstract/10.1103/PhysRevX.11.011052).
#### Training data

The files `MurD_closed.tar.gz` and `MurD_open.tar.gz` each contain 900 conformations of MurD, generated with MD simulations of its closed and open states, respectively. Extracting these files will yield `MurD_closed_selection.pdb` and `MurD_open_selection.pdb`.
To use them as training data, run `prepare_example.py` to obtain a joined and prepared trajectory (a sketch of this step follows).
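
A minimal sketch of that preparation step, run programmatically. The output file names below are assumed from the paths used in the updated example scripts; the actual outputs of `prepare_example.py` may differ:

```python
# Sketch only: run the preparation script, then check for the files the examples expect.
import os
import subprocess

subprocess.run(["python", "prepare_example.py"], check=True)

expected = [
    "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",  # joined, clustered trajectory
    "./clustered/MurD_open_selection_NEW_TOPO.pdb",            # matching topology
]
for path in expected:
    print(path, "found" if os.path.exists(path) else "missing")
```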

#### Test data

9 changes: 7 additions & 2 deletions examples/analysis_example.py
@@ -1,5 +1,8 @@
import torch
import os
# import sys

# sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
from molearn.models.foldingnet import AutoEncoder
from molearn.analysis import MolearnAnalysis
from molearn.data import PDBData
@@ -40,8 +43,10 @@ def main():
# by defining the manual seed and loading the dataset in the same order as when
# the neural network was trained, the same train-test split will be obtained
data = PDBData()
data.import_pdb(f"data{os.sep}MurD_closed_selection.pdb")
data.import_pdb(f"data{os.sep}MurD_open_selection.pdb")
data.import_pdb(
"./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
"./clustered/MurD_open_selection_NEW_TOPO.pdb",
)
data.fix_terminal()
data.atomselect(atoms=["CA", "C", "N", "CB", "O"])
data.prepare_dataset()
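Put together, the format-independent loading added in this PR reads as below. This is a sketch assembled from the updated examples: the trajectory/topology paths and the `manual_seed` value are the ones used elsewhere in this diff, and MDAnalysis (now a required dependency) is assumed to provide the underlying format handling:

```python
from molearn.data import PDBData

data = PDBData()
# New-style call: one coordinate/trajectory file plus a matching topology,
# replacing the two separate single-PDB imports used previously.
data.import_pdb(
    "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
    "./clustered/MurD_open_selection_NEW_TOPO.pdb",
)
data.fix_terminal()
data.atomselect(atoms=["CA", "C", "N", "CB", "O"])
data.prepare_dataset()

# The training scripts later in this diff recover the same train/validation split with:
# trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed=25)
```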
168 changes: 96 additions & 72 deletions examples/bb_example_subclassing_trainer.py
@@ -1,5 +1,7 @@
import sys, os
sys.path.insert(0, os.path.join(os.path.abspath(os.pardir),'src'))
import sys
import os

sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
from molearn.data import PDBData
from molearn.trainers import OpenMM_Physics_Trainer
from molearn.models.foldingnet import AutoEncoder
@@ -10,122 +12,144 @@


class CustomTrainer(OpenMM_Physics_Trainer):
#### All commented-out sections are not needed; they are here for demonstration purposes
#### All commented-out sections are not needed; they are here for demonstration purposes
### This is what common_step looks like in Trainer ###
# def common_step(self, batch):
# self._internal = {}
# encoded = self.autoencoder.encode(batch)
# self._internal['encoded'] = encoded
# decoded = self.autoencoder.decode(encoded)[:,:,:batch.size(2)]
# self._internal['decoded'] = decoded
# return dict(mse_loss = ((batch-decoded)**2).mean())
# def common_step(self, batch):
# self._internal = {}
# encoded = self.autoencoder.encode(batch)
# self._internal['encoded'] = encoded
# decoded = self.autoencoder.decode(encoded)[:,:,:batch.size(2)]
# self._internal['decoded'] = decoded
# return dict(mse_loss = ((batch-decoded)**2).mean())

### This is what common_physics_step looks like in OpenMM_Physics_Trainer ###
# def common_physics_step(self, batch, latent):
# alpha = torch.rand(int(len(batch)//2), 1, 1).type_as(latent)
# latent_interpolated = (1-alpha)*latent[:-1:2] + alpha*latent[1::2]
# generated = self.autoencoder.decode(latent_interpolated)[:,:,:batch.size(2)]
# self._internal['generated'] = generated
# energy = self.physics_loss(generated)
# energy[energy.isinf()]=1e35
# energy = torch.clamp(energy, max=1e34)
# energy = energy.nanmean()
# return {'physics_loss':energy}#a if not energy.isinf() else torch.tensor(0.0)}
# def common_physics_step(self, batch, latent):
# alpha = torch.rand(int(len(batch)//2), 1, 1).type_as(latent)
# latent_interpolated = (1-alpha)*latent[:-1:2] + alpha*latent[1::2]
# generated = self.autoencoder.decode(latent_interpolated)[:,:,:batch.size(2)]
# self._internal['generated'] = generated
# energy = self.physics_loss(generated)
# energy[energy.isinf()]=1e35
# energy = torch.clamp(energy, max=1e34)
# energy = energy.nanmean()
# return {'physics_loss':energy}#a if not energy.isinf() else torch.tensor(0.0)}

### This is what valid_step looks like in OpenMM_Physics_Trainer ###
# def valid_step(self, batch):
# results = self.common_step(batch)
# results.update(self.common_physics_step(batch, self._internal['encoded']))
# scale = (self.psf*results['mse_loss'])/(results['physics_loss'] +1e-5)
# final_loss = torch.log(results['mse_loss'])+scale*torch.log(results['physics_loss']
# results['loss'] = final_loss
# return results
# def valid_step(self, batch):
# results = self.common_step(batch)
# results.update(self.common_physics_step(batch, self._internal['encoded']))
# scale = (self.psf*results['mse_loss'])/(results['physics_loss'] +1e-5)
# final_loss = torch.log(results['mse_loss'])+scale*torch.log(results['physics_loss']
# results['loss'] = final_loss
# return results

def valid_step(self, batch):
results = super().valid_step(batch)
#rmsd
rmsd = (((batch-self._internal['decoded'])*self.std)**2).sum(dim=1).mean().sqrt()
results['RMSD'] = rmsd # 'valid_' will automatically be prepended onto this in valid_epoch, to distinguish it from train_step
results = super().valid_step(batch)
# rmsd
rmsd = (
(((batch - self._internal["decoded"]) * self.std) ** 2)
.sum(dim=1)
.mean()
.sqrt()
)
results["RMSD"] = (
rmsd # 'valid_' will automatically be prepended onto this in valid_epoch, to distinguish it from train_step
)

#calculate some dope
# calculate some dope
if self.first_valid_step:
self.first_valid_step = False
if not hasattr(self, 'dope_score_class'):
self.dope_score_class = Parallel_DOPE_Score(self.mol,processes=torch.get_num_threads())
#Calculated dope of decoded structures
if not hasattr(self, "dope_score_class"):
self.dope_score_class = Parallel_DOPE_Score(
self.mol, processes=torch.get_num_threads()
)
# Calculated dope of decoded structures
self.dope_scores = []
decoded_batch = (self._internal['decoded'].permute(0,2,1)*self.std).data.cpu().numpy()
decoded_batch = (
(self._internal["decoded"].permute(0, 2, 1) * self.std)
.data.cpu()
.numpy()
)
for f in decoded_batch:
if np.isfinite(f).all():
self.dope_scores.append(self.dope_score_class.get_score(f,refine='both'))
self.dope_scores.append(
self.dope_score_class.get_score(f, refine="both")
)

#Calculate dope of interpolated/generated structures
# Calculate dope of interpolated/generated structures
self.interp_dope_scores = []
interpolated_batch = (self._internal['generated'].permute(0,2,1)*self.std).data.cpu().numpy()
interpolated_batch = (
(self._internal["generated"].permute(0, 2, 1) * self.std)
.data.cpu()
.numpy()
)
for f in interpolated_batch:
if np.isfinite(f).all():
self.interp_dope_scores.append(self.dope_score_class.get_score(f,refine='both'))
self.interp_dope_scores.append(
self.dope_score_class.get_score(f, refine="both")
)
# These are calculated in the background; results are synchronized at the end of the epoch.
return results

def valid_epoch(self, *args, **kwargs):
self.first_valid_step = self.epoch%5==0
self.first_valid_step = self.epoch % 5 == 0
results = super().valid_epoch(*args, **kwargs)

# Might as well keep track of CUDA memory once per epoch
memory = torch.cuda.max_memory_allocated()/1000000.0
results['Memory'] = memory
memory = torch.cuda.max_memory_allocated() / 1000000.0
results["Memory"] = memory

if self.epoch%5==0:
if self.epoch % 5 == 0:
t1 = time()
#self.dope_scores contains multiprocessing result objects, get the results
#This will synchronize the code
# self.dope_scores contains multiprocessing result objects, get the results
# This will synchronize the code
dope = np.array([r.get() for r in self.dope_scores])
idope = np.array([r.get() for r in self.interp_dope_scores])

#Dope score returns (DOPE_score, refined_DOPE_score), might as well log both
results['valid_DOPE'] = dope[:,0].mean()
results['valid_DOPE_refined'] = dope[:,1].mean()
results['valid_DOPE_interp'] = idope[:,0].mean()
results['valid_DOPE_interp_refined'] = idope[:,1].mean()
results['valid_DOPE_time'] = time()-t1 # extra time taken to calculate DOPE
# Dope score returns (DOPE_score, refined_DOPE_score), might as well log both
results["valid_DOPE"] = dope[:, 0].mean()
results["valid_DOPE_refined"] = dope[:, 1].mean()
results["valid_DOPE_interp"] = idope[:, 0].mean()
results["valid_DOPE_interp_refined"] = idope[:, 1].mean()
results["valid_DOPE_time"] = (
time() - t1
) # extra time taken to calculate DOPE
return results



if __name__ == '__main__':

if __name__ == "__main__":
##### Load Data #####
data = PDBData()
data.import_pdb('data/MurD_closed_selection.pdb')
data.import_pdb('data/MurD_open_selection.pdb')
data.import_pdb(
"./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
"./clustered/MurD_open_selection_NEW_TOPO.pdb",
)
data.fix_terminal()
data.atomselect(atoms = ['CA', 'C', 'N', 'CB', 'O'])
data.atomselect(atoms=["CA", "C", "N", "CB", "O"])

##### Prepare Trainer #####
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer = CustomTrainer(device=device)

trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed = 25)
trainer.prepare_physics(remove_NB = True)

trainer.set_autoencoder(AutoEncoder, out_points = data.dataset.shape[-1])
trainer.prepare_optimiser()
trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed=25)
trainer.prepare_physics(remove_NB=True)

trainer.set_autoencoder(AutoEncoder, out_points=data.dataset.shape[-1])
trainer.prepare_optimiser()

##### Training Loop #####
#Keep training until loss does not improve for 32 consecutive epochs
# Keep training until loss does not improve for 32 consecutive epochs

runkwargs = dict(
log_filename='log_file.dat',
log_folder='xbb_foldingnet_checkpoints',
checkpoint_folder='xbb_foldingnet_checkpoints',
)
log_filename="log_file.dat",
log_folder="xbb_foldingnet_checkpoints",
checkpoint_folder="xbb_foldingnet_checkpoints",
)

best = 1e24
while True:
trainer.run(max_epochs = 32+trainer.epoch,**runkwargs)
if not best>trainer.best:
trainer.run(max_epochs=32 + trainer.epoch, **runkwargs)
if not best > trainer.best:
break
best = trainer.best
print(f'best {trainer.best}, best_filename {trainer.best_name}')
print(f"best {trainer.best}, best_filename {trainer.best_name}")
48 changes: 27 additions & 21 deletions examples/bb_foldingnet_basic.py
@@ -1,44 +1,50 @@
import sys, os
sys.path.insert(0, os.path.join(os.path.abspath(os.pardir),'src'))
import sys
import os

sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
from molearn.data import PDBData
from molearn.trainers import OpenMM_Physics_Trainer
from molearn.models.foldingnet import AutoEncoder
import torch


if __name__ == '__main__':

def main():
##### Load Data #####
data = PDBData()
data.import_pdb('data/MurD_closed_selection.pdb')
data.import_pdb('data/MurD_open_selection.pdb')
data.import_pdb(
"./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
"./clustered/MurD_open_selection_NEW_TOPO.pdb",
)
data.fix_terminal()
data.atomselect(atoms = ['CA', 'C', 'N', 'CB', 'O'])
data.atomselect(atoms=["CA", "C", "N", "CB", "O"])

##### Prepare Trainer #####
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer = OpenMM_Physics_Trainer(device=device)

trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed = 25)
trainer.prepare_physics(remove_NB = True)

trainer.set_autoencoder(AutoEncoder, out_points = data.dataset.shape[-1])
trainer.prepare_optimiser()
trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed=25)
trainer.prepare_physics(remove_NB=True)

trainer.set_autoencoder(AutoEncoder, out_points=data.dataset.shape[-1])
trainer.prepare_optimiser()

##### Training Loop #####
#Keep training until loss does not improve for 32 consecutive epochs
# Keep training until loss does not improve for 32 consecutive epochs

runkwargs = dict(
log_filename='log_file.dat',
log_folder='xbb_foldingnet_checkpoints',
checkpoint_folder='xbb_foldingnet_checkpoints',
)
log_filename="log_file.dat",
log_folder="xbb_foldingnet_checkpoints",
checkpoint_folder="xbb_foldingnet_checkpoints",
)

best = 1e24
while True:
trainer.run(max_epochs = 32+trainer.epoch,**runkwargs)
if not best>trainer.best:
trainer.run(max_epochs=32 + trainer.epoch, **runkwargs)
if not best > trainer.best:
break
best = trainer.best
print(f'best {trainer.best}, best_filename {trainer.best_name}')
print(f"best {trainer.best}, best_filename {trainer.best_name}")


if __name__ == "__main__":
main()