# =============================================================================
# This file is part of TEMPy.
#
# TEMPy is a software designed to help the user in the manipulation
# and analyses of macromolecular assemblies using 3D electron microscopy
# maps.
#
# Copyright 2015 Birkbeck College University of London.
#
# Authors: Maya Topf, Daven Vasishtan, Arun Prasad Pandurangan,
# Irene Farabella, Agnel-Praveen Joseph, Harpal Sahota
#
# This software is made available under GPL V3 license
# http://www.gnu.org/licenses/gpl-3.0.html
#
# Please cite your use of TEMPy in published work:
#
# Farabella, I., Vasishtan, D., Joseph, A.P., Pandurangan, A.P., Sahota, H.
# & Topf, M. (2015). J. Appl. Cryst. 48.
#
# =============================================================================
import os
import subprocess
import urllib
import collections
from numpy import append
from TEMPy.protein.prot_rep_biopy import (
BioPy_Structure,
gemmiAtom,
gemmi_Structure,
)
import TEMPy.math.vector as Vector
try:
from gemmi import cif
import gemmi
except ImportError:
raise ImportError(
'GEMMI library needs to be installed to use TEMPy\'s mmCIF and PDB'
'parsers Gemmi can be installed using command: pip install gemmi'
)
AAs = {
'GLY': 'G',
'ALA': 'A',
'VAL': 'V',
'LEU': 'L',
'ILE': 'I',
'MET': 'M',
'PHE': 'F',
'TRP': 'W',
'PRO': 'P',
'SER': 'S',
'THR': 'T',
'CYS': 'C',
'TYR': 'Y',
'ASN': 'N',
'GLN': 'Q',
'ASP': 'D',
'GLU': 'E',
'LYS': 'K',
'ARG': 'R',
'HIS': 'H',
}
[docs]class mmCIFParser:
""" Class for parsing mmCIF files using the GEMMI library.
All functions contained in the mmCIFParser class are "staticmethods",
meaning an object does not need to initiated before using the methods, as
shown in the :ref:`code examples<Model Parsing Code Example>`.
"""
def __init__(self, filename):
self.compressed = False
self.acol_widths = []
self._is_file_compressed(filename)
def _is_file_compressed(self, filename):
if filename[-2:] == 'gz':
self.compressed = True
else:
self.compressed = False
@staticmethod
def _convertGEMMItoTEMPy(
data_block,
structure,
filename,
water=False,
hetatm=False
):
"""
Private function: converts the gemmi parsed info from a cif file
into TEMPy conventions - mostly focusses on _atom_site. information
which is used to initialise gemmiAtom instances
Args:
data_block: A data block from a cif file, typically accessed using
.sole_block gemmi instance method
filename: Input filename
hetatm: If True, HETATM atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
returned after parsing. If False, HETATM atoms are ignored.
water: If True, HETATM, water atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
instance returned after parsing. If False, HETATM atoms are
ignored.
Returns:
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
Parsed structure.
"""
# remove waters and hetatms if required
if not water:
structure.remove_waters()
if not hetatm:
structure.remove_ligands_and_waters()
structure.remove_empty_chains() # deleting hetatms empties some chains
atomList = []
for model in structure:
for chain in model:
for entity in structure.entities:
for subchain in entity.subchains:
for residue in chain.get_subchain(subchain):
for atom in residue:
atomList.append(gemmiAtom(
chain,
entity,
residue,
atom))
# grab the remaining metadata
header = {}
header_tags = []
tempy_scores = collections.OrderedDict()
tempy_headers = []
for tag in data_block.get_mmcif_category_names():
if tag != '_atom_site.':
header_tags.append(tag)
header[tag] = data_block.get_mmcif_category(tag, raw=True)
if tag.startswith('_TEMPy'):
tempy_headers.append(tag)
tempy_scores[tag] = data_block.get_mmcif_category(
tag,
raw=True)
header['tags'] = tuple(header_tags)
return gemmi_Structure(
atomList,
structure,
filename=filename,
header=header,
tempy_scores=tempy_scores,
)
[docs] @staticmethod
def read_mmCIF_file(filename, hetatm=False, water=False):
"""Read an mmCIF file to generate a Structure instance.
Uses the `Gemmi <https://gemmi.readthedocs.io/en/latest/index.html>`_
library to parse the input PDB file.
Args:
filename: Path to the input file
hetatm: If True, HETATM atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
returned after parsing. If False, HETATM atoms are ignored.
water: If True, HETATM, water atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
instance returned after parsing. If False, HETATM atoms are
ignored.
Returns:
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
Parsed structure.
"""
blocks = cif.read(filename)
for block in blocks:
if block.find_loop('_atom_site.id'):
break
structure = gemmi.make_structure_from_block(block)
return mmCIFParser._convertGEMMItoTEMPy(
block,
structure,
filename,
hetatm=hetatm,
water=water)
[docs] @staticmethod
def fetch_mmCIF(
structure_id,
local_filename,
hetatm=False,
water=False,
):
"""Fetch an mmCIF file from the Protein Data Bank, and use it to
generate a Structure instance.
Uses the `Gemmi <https://gemmi.readthedocs.io/en/latest/index.html>`_
library to parse the input PDB file.
Args:
structure_id: structure_id code of pdb file, e.g. 3agy
local_filename: Filename for locally saved the fetched mmCIF file
hetatm: If True, HETATM atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
returned after parsing. If False, HETATM atoms are ignored.
water: If True, HETATM, water atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
instance returned after parsing. If False, HETATM atoms are
ignored.
Returns:
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
Parsed structure.
"""
url = 'http://www.rcsb.org/pdb/files/%s.cif' % structure_id
new_file, someinfo = urllib.request.urlretrieve(
url,
filename=local_filename)
return mmCIFParser.read_mmCIF_file(
new_file,
hetatm=hetatm,
water=water)
[docs]class PDBParser:
"""
A class to read PDB files either directly from the pdb or a structure
instance from Biopython
"""
def __init__(self):
pass
[docs] @staticmethod
def read_PDB_file(
structure_id,
filename,
hetatm=False,
water=False,
chain=None,
):
""" Read PDB file and create Structure instance.
Uses the `Gemmi <https://gemmi.readthedocs.io/en/latest/index.html>`_
library to parse the input PDB file.
Args:
structure_id: structure_id code of pdb file
filename: Filename (path) of pdb file
hetatm: If True, HETATM atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
returned after parsing. If False, HETATM atoms are ignored.
water: If True, HETATM, water atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
instance returned after parsing. If False, HETATM atoms are
ignored.
Returns:
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
Parsed structure.
"""
# Necessary to read pdb file and transform it to Gemmi cif format
structure = gemmi.read_pdb(filename)
# pdb files created by some software can have blank/empty chain names
if not structure[0][0].name:
structure = gemmi_helper_fns.name_nameless_chains(structure)
structure.remove_empty_chains()
structure.setup_entities()
structure.assign_label_seq_id()
data_block = structure.make_mmcif_document().sole_block()
return mmCIFParser._convertGEMMItoTEMPy(
data_block,
structure,
filename,
water=water,
hetatm=hetatm,
)
[docs] @staticmethod
def fetch_PDB(
structure_id,
local_filename,
hetatm=False,
water=False,
):
""" Fetch PDB file from the PDB and create Structure instance based
upon it.
Uses the `Gemmi <https://gemmi.readthedocs.io/en/latest/index.html>`_
library to parse the input PDB file.
Args:
structure_id: structure_id code of pdb file
local_filename: Filename for locally saved the fetched pdb file
hetatm: If True, HETATM atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
returned after parsing. If False, HETATM atoms are ignored.
water: If True, HETATM, water atoms are included in the
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`
instance returned after parsing. If False, HETATM atoms are
ignored.
Returns:
:class:`Structure instance <TEMPy.protein.prot_rep_biopy.gemmi_Structure>`:
Parsed structure.
"""
url = 'http://www.rcsb.org/pdb/files/%s.pdb' % structure_id
new_file, someinfo = urllib.request.urlretrieve(
url,
filename=local_filename)
return PDBParser.read_PDB_file(structure_id, new_file)
@staticmethod
def _bio_strcuture_to_TEMpy(
filename,
structure,
pdb_id,
hetatm=False,
water=False,
):
"""
PRIVATE FUNCTION to convert to Structure Instance
filename = name of mmCIF file
hetatm = Boolean representing whether to add hetatm to the
structure.Default and Raccomanded is False.
water = Boolean representing whether to add water to the
structure.Default and Raccomanded is False.
"""
atomList = []
hetatomList = []
wateratomList = []
footer = ''
header = ''
residues = structure.get_residues()
for res in residues:
hetfield = res.get_id()[0]
if hetfield[0] == "H":
for atom in res:
# BioPyAtom(atom)
hetatomList.append(gemmiAtom(atom))
elif hetfield[0] == "W":
for atom in res:
# BioPyAtom(atom)
wateratomList.append(gemmiAtom(atom))
else:
for atom in res:
# BioPyAtom(atom)
atomList.append(gemmiAtom(atom))
if hetatm:
atomList = append(atomList, hetatomList)
if water:
atomList = append(atomList, wateratomList)
return BioPy_Structure(
atomList,
filename=filename,
header=header,
footer=footer,
pdb_id=pdb_id
)
@staticmethod
def calc_SA(self, pdbfile, rsa=True, outsafile=None):
assert os.path.isfile(pdbfile)
if outsafile is None:
outsafile = os.path.basename(pdbfile) + '_sa.out'
cmd = (
"~/data/packages/freesasa/freesasa-1.1/src/freesasa %s "
"--rsa_file=%s --no-log --radii=naccess" %
(pdbfile, outsafile)
)
p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
p.communicate()
[docs] @staticmethod
def write_sasd_to_txt(sasds, pdb):
"""Write solvent accessible distances to a text (.txt) file.
The output text file has the default name
:code:`"./Jwalk_results/{pdb_code}_crosslinks.pdb"`, where pdb_code
is, for example, 3agy for PDB file with name 3agy.pdb.
Args:
sasds: Dictionary of sasds
pdb: PDB file sasds were calculated on
"""
if not os.path.exists('./Jwalk_results'):
os.makedirs('./Jwalk_results')
with open(
'./Jwalk_results/%s_crosslink_list.txt' % pdb[:-4],
'w'
) as outf:
outf.write(
' '.join('{0:<13}'.format(col) for col in [
'Index',
'Model',
'Atom1',
'Atom2',
'SASD',
'Euclidean Distance',
])
)
outf.write('\n')
index = 1
for xl in sasds:
(aa1, chain1, res1) = xl[0]
(aa2, chain2, res2) = xl[1]
atom1 = ('%s-%d-%s-CA' % (res1, aa1, chain1))
atom2 = ('%s-%d-%s-CA' % (res2, aa2, chain2))
sasd = xl[2]
ed = xl[3]
outf.write(
' '.join('{0:<13}'.format(col) for col in [
index,
pdb,
atom1,
atom2,
sasd,
ed,
])
)
outf.write('\n')
index += 1
[docs] @staticmethod
def write_sasd_to_pdb(dens_map, sasds, pdb):
"""Write solvent accessible distances to a PDB file.
The output text file has the default name
:code:`"./Jwalk_results/{pdb_code}_crosslinks.pdb"`, where pdb_code
is, for example, 3agy for PDB file with name 3agy.pdb.
Args:
dens_map: Solvent accessible surface on masked array
sasds: Dictionary of sasds
pdb: pdb file sasds were calculated on
"""
if not os.path.exists('./Jwalk_results'):
os.makedirs('./Jwalk_results')
apix = dens_map.apix
origin = dens_map.origin
path_coord = {}
for xl in sasds:
a = []
for (x, y, z) in sasds[xl]:
a.append(
[
(x * apix) + origin[0],
(y * apix) + origin[1],
(z * apix) + origin[2]
]
)
path_coord[xl] = a
with open('./Jwalk_results/%s_crosslinks.pdb' % pdb[:-4], 'w') as pdb:
m_count = 1
for xl in path_coord:
(aa1, chain1, res1) = xl[0]
(aa2, chain2, res2) = xl[1]
count = 1
pdb.write(
'MODEL %d %s%d%s-%s%d%s\n' % (
m_count,
res1,
aa1,
chain1,
res2,
aa2,
chain2,
)
)
m_count = m_count+1
for (x, y, z) in path_coord[xl]:
p = Vector.Vector(x, y, z)
a = p.to_atom()
a.record_name = 'ATOM'
a.serial = count
a.atom_name = 'C'
a.alt_loc = ''
a.res = 'GLY'
a.chain = 'A'
a.res_no = count
a.icode = ''
a.occ = 1
a.temp_fac = 0
a.elem = 'C'
a.charge = ''
pdb.write(a.write_to_PDB())
count += 1
pdb.write('END\n')
class gemmi_helper_fns:
@staticmethod
def name_nameless_chains(g_structure):
"""
Function to give names to chains in gemmi Structure objects that have
blank names. Occurs due to pdb files with no chain label. In such
files, all residues and atoms will be placed into one chain with no
name.
Input: Gemmi Structure instance (with blank chain names)
Output: Gemmi Structure instance with chains named A, B, C ....
(in most/all cases will be one chain named A)
"""
for model in g_structure:
for n, chain in enumerate(model):
if not chain.name:
# check ASCII table - will give chain 0 name 'A'
chain.name = chr(n+65)
return g_structure