-
Notifications
You must be signed in to change notification settings - Fork 77
/
Copy pathalign_utils.py
235 lines (203 loc) · 12 KB
/
align_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#! /usr/bin/env python3
#
# align_utils.py
#
# Copyright 2019 Luan Carvalho Martins <[email protected]>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
import numpy
import os_util
from mol_util import obmol_to_rwmol
def align_sequences_match_residues(mobile_seq, target_seq, seq_align_mat='BLOSUM80', gap_penalty=-1.0, verbosity=0):
""" Align two aminoacid sequences using Bio.pairwise2.globalds and substution matrix seq_align_mat, return a tuple
with two list of residues to be used in the 3D alignment (mobile, refence)
:param str mobile_seq: sequence of mobile protein
:param str target_seq: sequence of target protein
:param str seq_align_mat: use this substution matrix from Bio.SubsMat.MatrixInfo
:param float gap_penalty: gap penalty to the alignment; avoid values too low in module
:param int verbosity: sets the verbosity level
:rtype: tuple
"""
try:
from Bio.pairwise2 import align
from Bio.Align import substitution_matrices
seq_align_mat = substitution_matrices.load(seq_align_mat)
except ImportError as error:
os_util.local_print('Failed to import Biopython with error: {}\nBiopython is necessary for sequence '
'alignment. Sequences to be aligned:\nReference: {}\nMobile: {}'
''.format(error, target_seq, mobile_seq),
msg_verbosity=os_util.verbosity_level.error, current_verbosity=verbosity)
raise ImportError(error)
except FileNotFoundError as error:
available_matrices = substitution_matrices.load()
os_util.local_print('Failed to import substitution matrix {} with error: {}\nSubstitution matrix must be one '
'of: {})'
''.format(seq_align_mat, error, available_matrices),
msg_verbosity=os_util.verbosity_level.error, current_verbosity=verbosity)
raise FileNotFoundError(error)
else:
align_result = align.globalds(target_seq, mobile_seq, seq_align_mat, gap_penalty,
gap_penalty)[0]
os_util.local_print('This is the alignment result to be used in protein alignment:\n{}'
''.format(align_result),
msg_verbosity=os_util.verbosity_level.info, current_verbosity=verbosity)
ref_align_str = [True if res_j != '-' else False
for res_i, res_j in zip(align_result[0], align_result[1])
if res_i != '-']
mob_align_str = [True if res_i != '-' else False
for res_i, res_j in zip(align_result[0], align_result[1])
if res_j != '-']
return mob_align_str, ref_align_str
def get_position_matrix(each_mol, each_mol_str=None, atom_selection=None, verbosity=0):
"""
:param pybel.Molecule each_mol: molecule to get positions from
:param list each_mol_str: a alignment string from where residues to be used will be read
:param list atom_selection: use atoms matching this name (default: CA)
:param int verbosity: sets the verbosity level
:rtype: list
"""
if atom_selection is None:
atom_selection = ['CA']
if each_mol_str is None:
each_mol_str = [True for _ in range(len(each_mol.residues))]
added_atoms = []
return_list = []
for each_residue, residue_alignment in zip(each_mol.residues, each_mol_str):
for each_atom in each_residue.atoms:
if each_atom.OBAtom.GetResidue().GetAtomID(each_atom.OBAtom).lstrip().rstrip() in atom_selection:
atom_str = '{}{}{}'.format(each_atom.OBAtom.GetResidue().GetAtomID(each_atom.OBAtom),
each_residue.name, each_residue.idx)
if residue_alignment:
if atom_str not in added_atoms:
return_list.append(each_atom.OBAtom.GetVector())
added_atoms.append(atom_str)
else:
os_util.local_print('Atom {} found twice in your protein {}. Cannot handle multiple '
'occupancies.'.format(atom_str, each_mol.title),
msg_verbosity=os_util.verbosity_level.error, current_verbosity=verbosity)
raise SystemExit(1)
return return_list
@os_util.trace_function
def align_protein(mobile_mol, reference_mol, align_method='openbabel', seq_align_mat='BLOSUM80',
gap_penalty=-1, verbosity=0):
"""
Align mobile_mol to reference_mol using method defined in align_method. Defaults to openbabel.OBAlign, which is
fastest. rdkit's GetAlignmentTransform is much slower and may not work on larger systems.
Parameters
----------
reference_mol : [rdkit.RWMol, pybel.Molecule]
molecule to be used as alignment reference
mobile_mol : [rdkit.RWMol, pybel.Molecule]
rdkit.RWMol molecule to be aligned
align_method : str
method to be used, options are 'openbabel', 'rdkit'
seq_align_mat : str
use this matrix to sequence alignment, only used if sequences differ. Any value from Bio.SubsMat.MatrixInfo will
work
gap_penalty : float
use this gap penalty to sequence alignment, only used if sequences differ.
verbosity : int
be verbosity
Returns
-------
dict
"""
if align_method == 'rdkit':
# Uses rdkit.Chem.rdMolAlign.GetAlignmentTransform to align mobile_mol to reference_mol
import rdkit.Chem.rdMolAlign
reference_mol_rwmol = obmol_to_rwmol(reference_mol)
if reference_mol_rwmol is None:
os_util.local_print('Could not internally convert reference_mol',
msg_verbosity=os_util.verbosity_level.error, current_verbosity=verbosity)
if verbosity >= os_util.verbosity_level.info:
os_util.local_print('Dumping data to receptor_mol_error.pdb',
msg_verbosity=os_util.verbosity_level.info, current_verbosity=verbosity)
reference_mol.write('mol', 'receptor_mol_error.pdb')
raise SystemExit(1)
mobile_mol_rwmol = obmol_to_rwmol(mobile_mol)
if mobile_mol_rwmol is None:
os_util.local_print('Could not internally convert OpenBabel mobile_mol to a RDKit.Chem.Mol object.',
msg_verbosity=os_util.verbosity_level.error, current_verbosity=verbosity)
raise SystemExit(1)
os_util.local_print('Done reading and converting reference_mol {} and mobile_mol {}'
''.format(reference_mol_rwmol.GetProp('_Name'), mobile_mol_rwmol.GetProp('_Name')),
msg_verbosity=os_util.verbosity_level.debug, current_verbosity=verbosity)
# FIXME: implement this
transformation_mat = rdkit.Chem.rdMolAlign.GetAlignmentTransform(reference_mol_rwmol, mobile_mol_rwmol)
raise NotImplementedError('rdkit aligment method not implemented')
elif align_method == 'openbabel':
# FIXME: implement a Biopython-only method
try:
from openbabel.openbabel import OBAlign
from openbabel import pybel
except ImportError:
import pybel
from openbabel import OBAlign
if verbosity < os_util.verbosity_level.extra_debug:
pybel.ob.obErrorLog.SetOutputLevel(pybel.ob.obError)
else:
os_util.local_print('OpenBabel warning messages are on, expect a lot of output.',
msg_verbosity=os_util.verbosity_level.extra_debug, current_verbosity=verbosity)
reference_mol_seq = reference_mol.write('fasta').split('\n', 1)[1].replace('\n', '')
mobile_mol_seq = mobile_mol.write('fasta').split('\n', 1)[1].replace('\n', '')
if reference_mol_seq != mobile_mol_seq:
os_util.local_print('Aminoacid sequences of {} and {} differs:\nReference: {}\nMobile: {}'
''.format(reference_mol.title, mobile_mol.title, reference_mol_seq, mobile_mol_seq),
msg_verbosity=os_util.verbosity_level.info, current_verbosity=verbosity)
mob_align_str, ref_align_str = align_sequences_match_residues(mobile_mol_seq, reference_mol_seq,
seq_align_mat=seq_align_mat,
gap_penalty=gap_penalty,
verbosity=verbosity)
else:
ref_align_str = None
mob_align_str = None
# Creates a new molecule containing only the selected atoms of both proteins
ref_atom_vec = get_position_matrix(reference_mol, ref_align_str)
reference_mol_vec = pybel.ob.vectorVector3(ref_atom_vec)
mob_atom_vec = get_position_matrix(mobile_mol, mob_align_str)
mobile_mol_vec = pybel.ob.vectorVector3(mob_atom_vec)
os_util.local_print('Done extracting Ca from {} and {}'.format(reference_mol.title, mobile_mol.title),
msg_verbosity=os_util.verbosity_level.debug, current_verbosity=verbosity)
# Align mobile to reference using the Ca coordinates
align_obj = OBAlign(reference_mol_vec, mobile_mol_vec)
if not align_obj.Align():
os_util.local_print('Failed to align mobile_mol {} to reference_mol {}'
''.format(mobile_mol.title, reference_mol.title),
msg_verbosity=os_util.verbosity_level.error, current_verbosity=verbosity)
raise SystemExit(1)
os_util.local_print('Alignment RMSD is {}'.format(align_obj.GetRMSD()),
msg_verbosity=os_util.verbosity_level.info, current_verbosity=verbosity)
# Prepare translation and rotation matrices
reference_mol_center = numpy.array([[a.GetX(), a.GetY(), a.GetZ()] for a in reference_mol_vec]).mean(0)
mobile_mol_center = numpy.array([[a.GetX(), a.GetY(), a.GetZ()] for a in mobile_mol_vec]).mean(0)
translation_vector = reference_mol_center
centering_vector = -mobile_mol_center
rot_matrix = align_obj.GetRotMatrix()
rotation_matrix = numpy.reshape([rot_matrix.Get(i, j) for i in range(3) for j in range(3)], [3, 3])
os_util.local_print('Alignment data:\n\tReference: {}\n\tMobile: {}\n\tCentering: {}\n\tTranslation: {}'
'\n\tRotation matrix:\n\t\t{}, {}, {}\n\t\t{}, {}, {}\n\t\t{}, {}, {}'
''.format(reference_mol_center, mobile_mol_center, centering_vector, translation_vector,
*[rot_matrix.Get(i, j) for i in range(3) for j in range(3)]),
current_verbosity=verbosity, msg_verbosity=os_util.verbosity_level.debug)
return {'centering_vector': centering_vector, 'translation_vector': translation_vector,
'rotation_matrix': rotation_matrix}
else:
# TODO implement a internal alignment method
os_util.local_print('Unknown alignment method {}. Currently, only "openbabel" is allowed.'.format(align_method),
current_verbosity=verbosity, msg_verbosity=os_util.verbosity_level.error)
raise ValueError('Unknown alignment method {}.'.format(align_method))