Source code for TEMPy.protein.structure_parser

# =============================================================================
#     This file is part of TEMPy.
#
#     TEMPy is a software designed to help the user in the manipulation
#     and analyses of macromolecular assemblies using 3D electron microscopy
#     maps.
#
#     Copyright  2015 Birkbeck College University of London.
#
#     Authors: Maya Topf, Daven Vasishtan, Arun Prasad Pandurangan,
#     Irene Farabella, Agnel-Praveen Joseph, Harpal Sahota
#
#     This software is made available under GPL V3 license
#     http://www.gnu.org/licenses/gpl-3.0.html
#
#     Please cite your use of TEMPy in published work:
#
#     Farabella, I., Vasishtan, D., Joseph, A.P., Pandurangan, A.P., Sahota, H.
#     & Topf, M. (2015). J. Appl. Cryst. 48.
#
# =============================================================================
import os
import subprocess
import urllib
import collections

from numpy import append

from TEMPy.protein.prot_rep_biopy import (
    BioPy_Structure,
    gemmiAtom,
    gemmi_Structure,
)
import TEMPy.math.vector as Vector

# gemmi does all of the actual mmCIF/PDB parsing in this module, so a missing
# installation is reported up front with an actionable message.
try:
    from gemmi import cif
    import gemmi
except ImportError as err:
    # The two literals are concatenated by the parser, so the first one must
    # end with a space to keep the words apart.
    raise ImportError(
        'GEMMI library needs to be installed to use TEMPy\'s mmCIF and PDB '
        'parsers. Gemmi can be installed using command: pip install gemmi'
        ) from err

# Three-letter residue name -> one-letter code for the 20 amino acids listed.
AAs = dict(
    GLY='G', ALA='A', VAL='V', LEU='L', ILE='I',
    MET='M', PHE='F', TRP='W', PRO='P', SER='S',
    THR='T', CYS='C', TYR='Y', ASN='N', GLN='Q',
    ASP='D', GLU='E', LYS='K', ARG='R', HIS='H',
)


[docs]class mmCIFParser:

    """ Class for parsing mmCIF files using the GEMMI library.

    All public functions contained in the mmCIFParser class are
    "staticmethods", meaning an object does not need to be instantiated before
    using them, as shown in the :ref:`code examples<Model Parsing Code Example>`.
    """

    def __init__(self, filename):
        self.compressed = False
        self.acol_widths = []

        self._is_file_compressed(filename)

    def _is_file_compressed(self, filename):
        if filename[-2:] == 'gz':
            self.compressed = True
        else:
            self.compressed = False

    @staticmethod
    def _convertGEMMItoTEMPy(
                            data_block,
                            structure,
                            filename,
                            water=False,
                            hetatm=False
                            ):
        """
        Private function: converts the gemmi parsed info from a cif file
        into TEMPy conventions - mostly focusses on _atom_site. information
        which is used to initialise gemmiAtom instances

        Args:
            data_block: A data block from a cif file, typically accessed using
                .sole_block gemmi instance method. Every mmCIF category
                other than ``_atom_site.`` is copied into the header.
            structure: The gemmi structure to convert. It is modified in
                place: waters/ligands are removed as requested and empty
                chains are dropped.
            filename: Input filename
            water: If False, ``structure.remove_waters()`` is called.
            hetatm: If False, ``structure.remove_ligands_and_waters()`` is
                called. NOTE(review): judging by the gemmi method name this
                also strips waters, so ``water=True`` presumably has no
                effect unless ``hetatm=True`` as well - confirm.
        Returns:
            :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
                Parsed structure.
        """
        # remove waters and hetatms if required
        if not water:
            structure.remove_waters()
        if not hetatm:
            structure.remove_ligands_and_waters()

        structure.remove_empty_chains()  # deleting hetatms empties some chains

        # Walk model -> chain -> entity -> subchain -> residue -> atom so that
        # every atom is paired with the entity its subchain belongs to.
        atomList = []
        for model in structure:
            for chain in model:
                for entity in structure.entities:
                    for subchain in entity.subchains:
                        for residue in chain.get_subchain(subchain):
                            for atom in residue:
                                atomList.append(gemmiAtom(
                                                        chain,
                                                        entity,
                                                        residue,
                                                        atom))

        # grab the remaining metadata
        header = {}
        header_tags = []
        tempy_scores = collections.OrderedDict()

        for tag in data_block.get_mmcif_category_names():
            if tag != '_atom_site.':
                header_tags.append(tag)
                header[tag] = data_block.get_mmcif_category(tag, raw=True)
            if tag.startswith('_TEMPy'):
                # Fetched separately so the header and the score table never
                # share one mutable object.
                tempy_scores[tag] = data_block.get_mmcif_category(
                                                                tag,
                                                                raw=True)

        # 'tags' keeps the category names in the order they were read.
        header['tags'] = tuple(header_tags)

        return gemmi_Structure(
            atomList,
            structure,
            filename=filename,
            header=header,
            tempy_scores=tempy_scores,
            )

[docs]    @staticmethod
    def read_mmCIF_file(filename, hetatm=False, water=False):
        """Read an mmCIF file to generate a Structure instance.

        Uses the `Gemmi <https://gemmi.readthedocs.io/en/latest/index.html>`_
        library to parse the input mmCIF file.

        The first data block for which ``find_loop('_atom_site.id')`` is
        truthy is used; if no block qualifies, the last block of the file is
        used instead.

        Args:
            filename: Path to the input file
            hetatm: If True, HETATM atoms are included in the
                :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
                returned after parsing. If False, HETATM atoms are ignored.
            water: If True, water atoms are included in the
                :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
                instance returned after parsing. If False, water atoms are
                ignored.
        Returns:
            :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
                Parsed structure.
        Raises:
            ValueError: If the file contains no data blocks at all.

        """

        document = cif.read(filename)

        block = None
        for block in document:
            if block.find_loop('_atom_site.id'):
                break

        # Without this guard an empty document would surface as an obscure
        # UnboundLocalError on the next line.
        if block is None:
            raise ValueError(
                'No data blocks found in mmCIF file: %s' % filename)

        structure = gemmi.make_structure_from_block(block)

        return mmCIFParser._convertGEMMItoTEMPy(
                                                block,
                                                structure,
                                                filename,
                                                hetatm=hetatm,
                                                water=water)

[docs]    @staticmethod
    def fetch_mmCIF(
                    structure_id,
                    local_filename,
                    hetatm=False,
                    water=False,
                    ):
        """Fetch an mmCIF file from the Protein Data Bank, and use it to
        generate a Structure instance.

        Uses the `Gemmi <https://gemmi.readthedocs.io/en/latest/index.html>`_
        library to parse the downloaded mmCIF file.

        Args:
            structure_id: structure_id code of pdb file, e.g. 3agy
            local_filename: Filename under which the fetched mmCIF file is
                saved locally
            hetatm: If True, HETATM atoms are included in the
                :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
                returned after parsing. If False, HETATM atoms are ignored.
            water: If True, water atoms are included in the
                :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
                instance returned after parsing. If False, water atoms are
                ignored.
        Returns:
            :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
                Parsed structure.

        """
        # A bare `import urllib` does not load the `request` submodule, so it
        # is imported explicitly where it is needed.
        import urllib.request

        url = 'http://www.rcsb.org/pdb/files/%s.cif' % structure_id
        # urlretrieve returns (path, headers); only the saved path is needed.
        new_file, _headers = urllib.request.urlretrieve(
                                                    url,
                                                    filename=local_filename)

        return mmCIFParser.read_mmCIF_file(
                                            new_file,
                                            hetatm=hetatm,
                                            water=water)


[docs]class PDBParser:
    """
    A class to read PDB files either directly from the pdb or a structure
    instance from Biopython
    """
    def __init__(self):
        pass

[docs]    @staticmethod
    def read_PDB_file(
            structure_id,
            filename,
            hetatm=False,
            water=False,
            chain=None,
    ):
        """ Read PDB file and create Structure instance.

        Uses the `Gemmi <https://gemmi.readthedocs.io/en/latest/index.html>`_
        library to parse the input PDB file.

        Args:
            structure_id: structure_id code of pdb file. Accepted for
                interface compatibility; not used by this method.
            filename: Filename (path) of pdb file
            hetatm: If True, HETATM atoms are included in the
                :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
                returned after parsing. If False, HETATM atoms are ignored.
            water: If True, water atoms are included in the
                :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
                instance returned after parsing. If False, water atoms are
                ignored.
            chain: Currently unused.
        Returns:
            :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
                Parsed structure.
        """

        # Necessary to read pdb file and transform it to Gemmi cif format
        structure = gemmi.read_pdb(filename)

        # pdb files created by some software can have blank/empty chain names.
        # Every chain of every model is checked (not just the first one), and
        # an empty structure no longer raises IndexError here.
        has_nameless_chain = any(
            not model_chain.name
            for model in structure
            for model_chain in model
        )
        if has_nameless_chain:
            structure = gemmi_helper_fns.name_nameless_chains(structure)

        structure.remove_empty_chains()
        structure.setup_entities()
        structure.assign_label_seq_id()
        data_block = structure.make_mmcif_document().sole_block()

        return mmCIFParser._convertGEMMItoTEMPy(
                                            data_block,
                                            structure,
                                            filename,
                                            water=water,
                                            hetatm=hetatm,
                                            )

[docs]    @staticmethod
    def fetch_PDB(
            structure_id,
            local_filename,
            hetatm=False,
            water=False,
    ):
        """ Fetch PDB file from the PDB and create Structure instance based
        upon it.

        Uses the `Gemmi <https://gemmi.readthedocs.io/en/latest/index.html>`_
        library to parse the downloaded PDB file.

        Args:
            structure_id: structure_id code of pdb file
            local_filename: Filename under which the fetched pdb file is
                saved locally
            hetatm: If True, HETATM atoms are included in the
                :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
                returned after parsing. If False, HETATM atoms are ignored.
            water: If True, water atoms are included in the
                :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
                instance returned after parsing. If False, water atoms are
                ignored.
        Returns:
            :class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
                Parsed structure.
        """
        # A bare `import urllib` does not load the `request` submodule, so it
        # is imported explicitly where it is needed.
        import urllib.request

        url = 'http://www.rcsb.org/pdb/files/%s.pdb' % structure_id
        # urlretrieve returns (path, headers); only the saved path is needed.
        new_file, _headers = urllib.request.urlretrieve(
                                                    url,
                                                    filename=local_filename)

        # Forward the filtering flags; previously they were silently dropped
        # and the defaults of read_PDB_file were always used.
        return PDBParser.read_PDB_file(
            structure_id,
            new_file,
            hetatm=hetatm,
            water=water,
        )

    @staticmethod
    def _bio_strcuture_to_TEMpy(
            filename,
            structure,
            pdb_id,
            hetatm=False,
            water=False,
    ):
        """
        PRIVATE FUNCTION: build a BioPy_Structure from a parsed structure.

        Residues are split into three groups using the first character of
        the first element of ``res.get_id()``: "H" (hetero residues), "W"
        (waters) and everything else.

        Args:
            filename: File name passed on to the returned structure.
            structure: Object providing ``get_residues()``; each residue
                must provide ``get_id()`` and be iterable over its atoms.
            pdb_id: Identifier passed on to the returned structure.
            hetatm: If True, the "H" atoms are appended after the ordinary
                atoms. Default (and recommended) is False.
            water: If True, the "W" atoms are appended last. Default (and
                recommended) is False.
        Returns:
            BioPy_Structure built from the selected atoms, with empty header
            and footer strings.
        """
        # NOTE(review): gemmiAtom is called with a single argument here but
        # with (chain, entity, residue, atom) elsewhere in this module -
        # confirm that this legacy path still works.
        standard_atoms = []
        hetero_atoms = []
        water_atoms = []
        bins = {"H": hetero_atoms, "W": water_atoms}

        for res in structure.get_residues():
            flag = res.get_id()[0][0]
            target = bins.get(flag, standard_atoms)
            for atom in res:
                target.append(gemmiAtom(atom))

        # `append` is numpy.append, so the list becomes an array as soon as
        # either optional group is added.
        if hetatm:
            standard_atoms = append(standard_atoms, hetero_atoms)
        if water:
            standard_atoms = append(standard_atoms, water_atoms)

        return BioPy_Structure(
            standard_atoms,
            filename=filename,
            header='',
            footer='',
            pdb_id=pdb_id
        )

    @staticmethod
    def calc_SA(self, pdbfile, rsa=True, outsafile=None):
        """Run the external ``freesasa`` program on a PDB file.

        The program is started through the shell from the hard-coded
        location ``~/data/packages/freesasa/freesasa-1.1/src/freesasa``. Its
        stderr output is captured and discarded, and nothing is returned.

        NOTE: this method is decorated as a staticmethod yet still declares
        ``self``, so callers have to pass a placeholder first argument.

        Args:
            self: Placeholder; not used.
            pdbfile: Path of an existing PDB file.
            rsa: Not used by this method.
            outsafile: Value for the ``--rsa_file`` option. Defaults to the
                base name of *pdbfile* followed by ``_sa.out``.
        Raises:
            AssertionError: If *pdbfile* is not an existing file.
        """
        import shlex

        # Raised explicitly (instead of `assert`) so the check survives
        # `python -O`, while callers still see the same exception type.
        if not os.path.isfile(pdbfile):
            raise AssertionError('%s is not a file' % pdbfile)

        if outsafile is None:
            outsafile = os.path.basename(pdbfile) + '_sa.out'

        # The shell is kept so that `~` is expanded, therefore both file names
        # are quoted: spaces or shell metacharacters in a path must not be
        # interpreted as extra commands.
        cmd = (
                "~/data/packages/freesasa/freesasa-1.1/src/freesasa %s "
                "--rsa_file=%s --no-log --radii=naccess" %
                (shlex.quote(str(pdbfile)), shlex.quote(str(outsafile)))
        )
        p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
        p.communicate()

[docs]    @staticmethod
    def write_sasd_to_txt(sasds, pdb):
        """Write solvent accessible distances to a text (.txt) file.

        The output text file is named
        :code:`"./Jwalk_results/{pdb_code}_crosslink_list.txt"`, where
        pdb_code is *pdb* without its last four characters, for example 3agy
        for a PDB file with name 3agy.pdb. The ``./Jwalk_results`` directory
        is created if needed.

        Args:
            sasds: Iterable of crosslink records. Each record is indexed as
                ``(end1, end2, sasd, euclidean_distance)``, where both ends
                unpack to ``(residue_number, chain, residue_name)``.
            pdb: PDB file sasds were calculated on

        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs('./Jwalk_results', exist_ok=True)

        def format_row(columns):
            # Left-align every column in a 13 character wide field.
            return ' '.join('{0:<13}'.format(col) for col in columns)

        out_path = './Jwalk_results/%s_crosslink_list.txt' % pdb[:-4]
        with open(out_path, 'w') as outf:
            outf.write(format_row([
                'Index',
                'Model',
                'Atom1',
                'Atom2',
                'SASD',
                'Euclidean Distance',
            ]))
            outf.write('\n')

            for index, xl in enumerate(sasds, start=1):
                (aa1, chain1, res1) = xl[0]
                (aa2, chain2, res2) = xl[1]
                atom1 = '%s-%d-%s-CA' % (res1, aa1, chain1)
                atom2 = '%s-%d-%s-CA' % (res2, aa2, chain2)
                outf.write(format_row([
                    index,
                    pdb,
                    atom1,
                    atom2,
                    xl[2],  # SASD
                    xl[3],  # Euclidean distance
                ]))
                outf.write('\n')

[docs]    @staticmethod
    def write_sasd_to_pdb(dens_map, sasds, pdb):
        """Write solvent accessible distances to a PDB file.

        The output file is named
        :code:`"./Jwalk_results/{pdb_code}_crosslinks.pdb"`, where pdb_code
        is *pdb* without its last four characters, for example 3agy for a PDB
        file with name 3agy.pdb. Each crosslink is written as one MODEL whose
        path points are carbon atoms of GLY residues in chain A, followed by
        an END line.

        Args:
            dens_map: Object providing ``apix`` and ``origin``; they are used
                to convert the path points via ``point * apix + origin``.
            sasds: Dictionary of sasds. Keys are indexed as ``(end1, end2)``
                with each end unpacking to
                ``(residue_number, chain, residue_name)``; values are
                iterables of ``(x, y, z)`` points.
            pdb: pdb file sasds were calculated on
        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs('./Jwalk_results', exist_ok=True)

        apix = dens_map.apix
        origin = dens_map.origin

        # Convert every path from grid indices to coordinates.
        path_coord = {
            xl: [
                [
                    (x * apix) + origin[0],
                    (y * apix) + origin[1],
                    (z * apix) + origin[2],
                ]
                for (x, y, z) in sasds[xl]
            ]
            for xl in sasds
        }

        out_path = './Jwalk_results/%s_crosslinks.pdb' % pdb[:-4]
        # The handle gets its own name so that the `pdb` argument is not
        # shadowed inside the block.
        with open(out_path, 'w') as outf:
            for m_count, xl in enumerate(path_coord, start=1):
                (aa1, chain1, res1) = xl[0]
                (aa2, chain2, res2) = xl[1]
                outf.write(
                    'MODEL %d %s%d%s-%s%d%s\n' % (
                        m_count,
                        res1,
                        aa1,
                        chain1,
                        res2,
                        aa2,
                        chain2,
                    )
                )
                # The running count doubles as atom serial and residue number.
                for count, (x, y, z) in enumerate(path_coord[xl], start=1):
                    a = Vector.Vector(x, y, z).to_atom()
                    a.record_name = 'ATOM'
                    a.serial = count
                    a.atom_name = 'C'
                    a.alt_loc = ''
                    a.res = 'GLY'
                    a.chain = 'A'
                    a.res_no = count
                    a.icode = ''
                    a.occ = 1
                    a.temp_fac = 0
                    a.elem = 'C'
                    a.charge = ''
                    outf.write(a.write_to_PDB())
                outf.write('END\n')


class gemmi_helper_fns:
    """Helper functions that tidy up gemmi objects before conversion."""

    @staticmethod
    def name_nameless_chains(g_structure):
        """
        Function to give names to chains in gemmi Structure objects that have
        blank names. Occurs due to pdb files with no chain label. In such
        files, all residues and atoms will be placed into one chain with no
        name.

        A nameless chain at position n of its model is named with the n-th
        capital letter (A, B, C, ...) when that letter is available. If the
        letter is already used in the model, or n is 26 or more, the first
        unused name from A-Z, a-z, 0-9 (then two-character names, and so on)
        is taken, so names within a model stay unique and alphanumeric.

        Input: Gemmi Structure instance (with blank chain names)

        Output: The same Gemmi Structure instance, modified in place, with
                the blank chains named (in most/all cases will be one chain
                named A)
        """
        import itertools
        import string

        alphabet = (
            string.ascii_uppercase + string.ascii_lowercase + string.digits
        )

        def candidate_names():
            # 'A'..'Z', 'a'..'z', '0'..'9', then 'AA', 'AB', ...
            for length in itertools.count(1):
                for letters in itertools.product(alphabet, repeat=length):
                    yield ''.join(letters)

        for model in g_structure:
            used = {chain.name for chain in model if chain.name}
            for n, chain in enumerate(model):
                if chain.name:
                    continue
                # check ASCII table - will give chain 0 name 'A'
                name = chr(n + 65) if n < 26 else None
                if name is None or name in used:
                    name = next(
                        candidate
                        for candidate in candidate_names()
                        if candidate not in used
                    )
                chain.name = name
                used.add(name)
        return g_structure