From 740e74e8c61ea01a4b2120bd369b11a58cb9c304 Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Mon, 7 Oct 2024 22:07:07 +0200 Subject: [PATCH] Implementation of parallelization to `MDAnalysis.analysis.dssp` (#4721) - Fixes #4674 - Parallelization of the backend support to the class DSSP in dssp.py - Addition of parallelization tests in test_dssp.py and fixtures in conftest.py - Updated Changelog --------- Co-authored-by: Yuxuan Zhuang --- package/CHANGELOG | 3 +- package/MDAnalysis/analysis/dssp/dssp.py | 29 ++++++++++++++--- .../MDAnalysisTests/analysis/conftest.py | 11 ++++++- .../MDAnalysisTests/analysis/test_dssp.py | 31 ++++++++++--------- 4 files changed, 52 insertions(+), 22 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index ec5fe28ecad..b284ffddeec 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -62,7 +62,8 @@ Enhancements * explicitly mark `analysis.pca.PCA` as not parallelizable (Issue #4680) * enables parallelization for analysis.bat.BAT (Issue #4663) * enable parallelization for analysis.dihedrals.{Dihedral,Ramachandran,Janin} - (Issue #4673) + (Issue #4673) + * enables parallelization for analysis.dssp.dssp.DSSP (Issue #4674) * Enables parallelization for analysis.hydrogenbonds.hbond_analysis.HydrogenBondAnalysis (Issue #4664) * Improve error message for `AtomGroup.unwrap()` when bonds are not present.(Issue #4436, PR #4642) * Add `analysis.DSSP` module for protein secondary structure assignment, based on [pydssp](https://github.com/ShintaroMinami/PyDSSP) diff --git a/package/MDAnalysis/analysis/dssp/dssp.py b/package/MDAnalysis/analysis/dssp/dssp.py index d2eccf09473..21dd9423e49 100644 --- a/package/MDAnalysis/analysis/dssp/dssp.py +++ b/package/MDAnalysis/analysis/dssp/dssp.py @@ -148,7 +148,7 @@ import numpy as np from MDAnalysis import Universe, AtomGroup -from ..base import AnalysisBase +from ..base import AnalysisBase, ResultsGroup from ...due import due, Doi due.cite( @@ -196,7 +196,7 @@ class DSSP(AnalysisBase): .. Warning:: For DSSP to work properly, your atoms must represent a protein. The hydrogen atom bound to the backbone nitrogen atom is matched by name - as given by the keyword argument `hydrogen_atom`. There may only be + as given by the keyword argument `hydrogen_atom`. There may only be a single backbone nitrogen hydrogen atom per residue; the one exception is proline, for which there should not exist any such hydrogens. The default value of `hydrogen_atom` should handle the common naming @@ -229,8 +229,8 @@ class DSSP(AnalysisBase): (except proline), namely the one bound to the backbone nitrogen. .. Note:: - To work with different hydrogen-naming conventions by default, the - default selection is broad but if hydrogens are incorrectly selected + To work with different hydrogen-naming conventions by default, the + default selection is broad but if hydrogens are incorrectly selected (e.g., a :exc:`ValueError` is raised) you must customize `hydrogen_name` for your specific case. @@ -263,7 +263,7 @@ class DSSP(AnalysisBase): The :attr:`results.dssp_ndarray` attribute holds a ``(n_frames, n_residues, 3)`` shape ndarray with a *one-hot encoding* of *loop* '-' (index 0), *helix* 'H' (index 1), and *sheet* 'E' - (index 2), respectively for each frame of the trajectory. It can be + (index 2), respectively for each frame of the trajectory. It can be used to compute, for instance, the **average secondary structure**: >>> from MDAnalysis.analysis.dssp import translate, DSSP @@ -276,8 +276,23 @@ class DSSP(AnalysisBase): .. versionadded:: 2.8.0 + + Enabled **parallel execution** with the ``multiprocessing`` and ``dask`` + backends; use the new method :meth:`get_supported_backends` to see all + supported backends. """ + _analysis_algorithm_is_parallelizable = True + + @classmethod + def get_supported_backends(cls): + return ( + "serial", + "multiprocessing", + "dask", + ) + + def __init__( self, atoms: Union[Universe, AtomGroup], @@ -382,6 +397,10 @@ def _conclude(self): self.results.dssp_ndarray = np.array(self.results.dssp_ndarray) self.results.resids = self._heavy_atoms["CA"].resids + def _get_aggregator(self): + return ResultsGroup( + lookup={"dssp_ndarray": ResultsGroup.flatten_sequence}, + ) def translate(onehot: np.ndarray) -> np.ndarray: """Translate a one-hot encoding summary into char-based secondary structure diff --git a/testsuite/MDAnalysisTests/analysis/conftest.py b/testsuite/MDAnalysisTests/analysis/conftest.py index b5fe975dcd8..fc3c8a480c7 100644 --- a/testsuite/MDAnalysisTests/analysis/conftest.py +++ b/testsuite/MDAnalysisTests/analysis/conftest.py @@ -10,6 +10,7 @@ from MDAnalysis.analysis.dihedrals import Dihedral, Ramachandran, Janin from MDAnalysis.analysis.bat import BAT from MDAnalysis.analysis.gnm import GNMAnalysis +from MDAnalysis.analysis.dssp.dssp import DSSP from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import ( HydrogenBondAnalysis, ) @@ -127,8 +128,16 @@ def client_GNMAnalysis(request): def client_BAT(request): return request.param + +# MDAnalysis.analysis.dssp.dssp + +@pytest.fixture(scope="module", params=params_for_cls(DSSP)) +def client_DSSP(request): + return request.param + + # MDAnalysis.analysis.hydrogenbonds @pytest.fixture(scope='module', params=params_for_cls(HydrogenBondAnalysis)) def client_HydrogenBondAnalysis(request): - return request.param \ No newline at end of file + return request.param diff --git a/testsuite/MDAnalysisTests/analysis/test_dssp.py b/testsuite/MDAnalysisTests/analysis/test_dssp.py index 403743d9728..f7a1e118931 100644 --- a/testsuite/MDAnalysisTests/analysis/test_dssp.py +++ b/testsuite/MDAnalysisTests/analysis/test_dssp.py @@ -10,31 +10,31 @@ # Files that match glob pattern '????.pdb.gz' and matching '????.pdb.dssp' files, # containing the secondary structure assignment string, will be tested automatically. @pytest.mark.parametrize("pdb_filename", glob.glob(f"{DSSP_FOLDER}/?????.pdb.gz")) -def test_file_guess_hydrogens(pdb_filename): +def test_file_guess_hydrogens(pdb_filename, client_DSSP): u = mda.Universe(pdb_filename) with open(f"{pdb_filename.rstrip('.gz')}.dssp", "r") as fin: correct_answ = fin.read().strip().split()[0] - run = DSSP(u, guess_hydrogens=True).run() + run = DSSP(u, guess_hydrogens=True).run(**client_DSSP) answ = "".join(run.results.dssp[0]) assert answ == correct_answ -def test_trajectory(): +def test_trajectory(client_DSSP): u = mda.Universe(TPR, XTC).select_atoms("protein").universe - run = DSSP(u).run(stop=10) + run = DSSP(u).run(**client_DSSP, stop=10) first_frame = "".join(run.results.dssp[0]) last_frame = "".join(run.results.dssp[-1]) avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0))) assert first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---" protein = mda.Universe(TPR, XTC).select_atoms("protein") - run = DSSP(protein).run(stop=10) + run = DSSP(protein).run(**client_DSSP, stop=10) -def test_atomgroup(): +def test_atomgroup(client_DSSP): protein = mda.Universe(TPR, XTC).select_atoms("protein") - run = DSSP(protein).run(stop=10) + run = DSSP(protein).run(**client_DSSP, stop=10) first_frame = "".join(run.results.dssp[0]) last_frame = "".join(run.results.dssp[-1]) avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0))) @@ -42,9 +42,9 @@ def test_atomgroup(): assert first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---" -def test_trajectory_with_hydrogens(): +def test_trajectory_with_hydrogens(client_DSSP): u = mda.Universe(TPR, XTC).select_atoms("protein").universe - run = DSSP(u, guess_hydrogens=False).run(stop=10) + run = DSSP(u, guess_hydrogens=False).run(**client_DSSP, stop=10) first_frame = "".join(run.results.dssp[0]) last_frame = "".join(run.results.dssp[-1]) avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0))) @@ -53,28 +53,29 @@ def test_trajectory_with_hydrogens(): @pytest.mark.parametrize("pdb_filename", glob.glob(f"{DSSP_FOLDER}/2xdgA.pdb.gz")) -def test_trajectory_without_hydrogen_fails(pdb_filename): +def test_trajectory_without_hydrogen_fails(pdb_filename, client_DSSP): u = mda.Universe(pdb_filename) with pytest.raises(ValueError): - DSSP(u, guess_hydrogens=False).run() + DSSP(u, guess_hydrogens=False).run(**client_DSSP) @pytest.mark.parametrize( "pdb_filename", glob.glob(f"{DSSP_FOLDER}/1mr1D_failing.pdb.gz") ) -def test_trajectory_with_uneven_number_of_atoms_fails(pdb_filename): +def test_trajectory_with_uneven_number_of_atoms_fails(pdb_filename, + client_DSSP): u = mda.Universe(pdb_filename) with pytest.raises(ValueError): - DSSP(u, guess_hydrogens=True).run() + DSSP(u, guess_hydrogens=True).run(**client_DSSP) @pytest.mark.parametrize( "pdb_filename", glob.glob(f"{DSSP_FOLDER}/wrong_hydrogens.pdb.gz") ) -def test_exception_raises_with_atom_index(pdb_filename): +def test_exception_raises_with_atom_index(pdb_filename, client_DSSP): u = mda.Universe(pdb_filename) with pytest.raises( ValueError, match="Residue contains*", ): - DSSP(u, guess_hydrogens=False).run() + DSSP(u, guess_hydrogens=False).run(**client_DSSP)