-
Notifications
You must be signed in to change notification settings - Fork 0
/
ModelPlay.py
96 lines (83 loc) · 4.2 KB
/
ModelPlay.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import gzip
import numpy as np
from Bio.PDB import PDBParser
import joblib
import warnings
from tqdm import tqdm
from multiprocessing import Pool
warnings.filterwarnings("ignore")
# Convert three-letter residue names to single-letter codes
def three_to_one_letter(residue_name):
conversion = {
'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
}
return conversion.get(residue_name, 'X')
# Function to calculate features from a protein structure
def calculate_features(structure):
hydrophobic_residues = ['A', 'V', 'I', 'L', 'M', 'F', 'W', 'Y', 'P', 'G']
hydrophilic_residues = ['R', 'K', 'D', 'E', 'Q', 'N', 'H', 'S', 'T', 'C']
hydrophobic_count = 0
hydrophilic_count = 0
atom_count = 0
total_residue_count = 0
distances = []
for model in structure:
for chain in model:
residues = list(chain)
total_residue_count += len(residues)
for residue in residues:
atom_count += len(list(residue.get_atoms()))
resname = three_to_one_letter(residue.get_resname())
if resname in hydrophobic_residues:
hydrophobic_count += 1
elif resname in hydrophilic_residues:
hydrophilic_count += 1
for i in range(len(residues) - 2):
for atom1 in residues[i]:
for atom2 in residues[i + 1]:
for atom3 in residues[i + 2]:
distance1 = atom1 - atom2
distance2 = atom2 - atom3
distances.append(distance1)
distances.append(distance2)
if distances:
mean_distance = np.mean(distances)
std_distance = np.std(distances)
min_distance = np.min(distances)
max_distance = np.max(distances)
else:
return np.array([0] * 7) # Fallback for structures with no measurable distances
hydrophobic_proportion = hydrophobic_count / total_residue_count if total_residue_count else 0
hydrophilic_proportion = hydrophilic_count / total_residue_count if total_residue_count else 0
return np.array([mean_distance, std_distance, min_distance, max_distance, hydrophobic_proportion, hydrophilic_proportion, atom_count])
# Main function to process PDB files and classify them using the trained model
def process_pdb_file(pdb_file):
try:
with gzip.open(os.path.join(pdb_directory, pdb_file), 'rt') as handle:
structure = PDBParser().get_structure("new_protein", handle)
features = calculate_features(structure).reshape(1, -1)
prediction_proba = clf.predict_proba(features)[0]
predicted_label = 1 if prediction_proba[1] > 0.5 else 0 # Predicted label based on the higher probability
# Log output for each file including the predicted label
print(f"File: {pdb_file}")
print(f"Predicted probabilities: Non-Resistant: {prediction_proba[0]:.4f}, Resistant: {prediction_proba[1]:.4f}")
print(f"Predicted label: {'Resistant' if predicted_label == 1 else 'Non-Resistant'}\n")
# Return true if predicted as resistant (label 1), this is useful if you're tracking performance in some way
return predicted_label == 1
except Exception as e:
print(f"Error processing file {pdb_file}: {e}")
return False
if __name__ == "__main__":
model_path = "/proj/trained_model.joblib"
pdb_directory = "/proj/Anti_bio_res_prot"
clf = joblib.load(model_path)
pdb_files = [f for f in os.listdir(pdb_directory) if f.endswith('.pdb.gz')]
with Pool(processes=os.cpu_count()) as pool:
results = list(tqdm(pool.imap(process_pdb_file, pdb_files), total=len(pdb_files)))
correct_predictions = sum(results)
accuracy = correct_predictions / len(results) if results else 0
print(f"Overall accuracy: {accuracy * 100:.2f}%")