Source code for pyemap.graph_mining.frequent_subgraph

# Copyright (c) 2017-2022, James Gayvert, Ruslan Tazhigulov, Ksenia Bravaya
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import numpy as np
from Bio.SVDSuperimposer import SVDSuperimposer
from .utils import get_graph_matcher, write_graph_smiles, make_pretty_subgraph
from ..utils import draw_mpl_graph
from networkx.drawing.nx_agraph import to_agraph
import networkx as nx
from PIL import Image
from functools import total_ordering
import tempfile
import warnings


def _gen_groups(cc, all_graphs):
    groups = {}
    for group_idx, group in enumerate(cc):
        graph_list = []
        for graph_idx in group:
            graph = all_graphs[graph_idx]
            graph_list.append(graph)
        groups[group_idx + 1] = [x.graph['id'] for x in graph_list]
    return groups


[docs] @total_ordering class SubgraphPattern(): ''' Stores all information regarding an identified subgraph pattern. Attributes ---------- id: str Unique identifier for subgraph pattern. G: :class:`networkx.Graph` Graph representation of this subgraph pattern. support: list of str List of PDB IDs which contain this subgraph protein_subgraphs: dict of str: :class:`networkx.Graph` Dict which contains protein subgraphs which match this pattern. Each entry has a unique identifier and a :class:`networkx.Graph` derived from the graphs generated by the :class:`~pyemap.emap` class which match the pattern of this pattern. groups: dict of str: list of str Protein subgraph (IDs) clustered into groups based on similarity support_number: int Number of PDBs this subgraph pattern was identified in '''
[docs] def __init__(self, G, graph_number, support, res_to_num_label, edge_thresholds, rmsd_thresh): '''Initializes SubgraphPattern object. Parameters ---------- G: :class:`networkx.Graph` Graph representation of this subgraph pattern graph_number: int Unique numerical ID for this subgraph pattern support: list of str List of PDB IDs which contain this subgraph res_to_num_label: dict of str: int Mapping of residue types to numerical node labels edge_thresholds: list of float Edge thresholds which define edge labels rmsd_thresh: float threshold for determining structural similarity between identified subgraphs ''' self.G = G.copy() self.support = support self.total_support = {} self.protein_subgraphs = {} self.unsorted_graphs = {} self.groups = {} self.res_to_num_label = res_to_num_label self.edge_thresholds = edge_thresholds self.rmsd_thresh = rmsd_thresh self.support_number = len(support) self.id = str(graph_number + 1) + "_" + str(write_graph_smiles(self.G)) + "_" + str(self.support_number) if "#" in self.id: self._file_id = self.id.replace("#", "NP") else: self._file_id = self.id for node in self.G.nodes: if self.G.nodes[node]['label'] == "#": self.G.nodes[node]['label'] = "NP"
def __lt__(self, other): if self.support_number != other.support_number: return self.support_number < other.support_number elif str(write_graph_smiles(self.G)) != str(write_graph_smiles(other.G)): return str(write_graph_smiles(self.G)) < str(write_graph_smiles(other.G)) else: return self.G.size(weight="num_label") < other.G.size(weight="num_label") def __eq__(self, other): return self.G == other.G def _update_id(self, graph_number): self.id = str(graph_number + 1) + self.id[self.id.index('_'):] if "#" in self.id: self._file_id = self.id.replace("#", "NP") else: self._file_id = self.id
[docs] def general_report(self): ''' Generates general report which describes this subgraph pattern. Returns -------- full_str: str General report which describes this subgraph pattern ''' full_str = "" full_str += "ID:" + str(self.id) + "\n" full_str += "Support:" + str(self.support_number) + "\n" full_str += "Where:" + str(list(self.support.keys())) + "\n" full_str += "Adjacency list:\n" G = self.G for node in G.nodes: full_str += G.nodes[node]['label'] + str(node) + ":[" for neighbor in G.neighbors(node): full_str += G.nodes[neighbor]['label'] + str(neighbor) + "(" + str( G.edges[(node, neighbor)]['num_label']) + "), " full_str = full_str[:-2] full_str += "]\n" return full_str
[docs] def full_report(self): ''' Returns a full report of all protein subgraphs which match this pattern. Returns ------- full_str: str Report of protein subgraphs which match this pattern ''' full_str = self.general_report() if len(self.protein_subgraphs) == 0: full_str += "Please run `find_protein_subgraphs' to get a full report.\n" return full_str full_str += str(len(self.protein_subgraphs)) + " subgraphs matching this pattern were found.\n" full_str += "Graphs are classified based on " + self.clustering_option + " similarity.\n\n" if self.clustering_option == 'structural': full_str += "Structural clustering is based on an RMSD threshold of " + str(self.rmsd_thresh) + ".\n\n" for key in self.groups: graphs = self.groups[key] full_str += "Group " + str(key) + ": " + str(len(graphs)) + " members\n---------------\n" for graph in graphs: full_str += self._report_for_graph(self.protein_subgraphs[graph]) + "\n" return full_str
def _report_for_graph(self, G): ''' Generates report for a given protein subgraph ''' full_str = "" full_str += str(G.graph['id']) + "\n" full_str += "Nodes\n" for node in G.nodes: full_str += G.nodes[node]['label'] full_str += " Position in alignment:" + str(G.nodes[node]['aligned_resnum']) full_str += "\n" full_str += "Adjacency list:\n" for node in G.nodes: full_str += G.nodes[node]['label'] + ":[" for neighbor in G.neighbors(node): dist = '{0:.2f}'.format(G.edges[(node, neighbor)]['distance']) full_str += G.nodes[neighbor]['label'] + "(" + str(dist) + "), " full_str = full_str[:-2] full_str += "]\n" return full_str
[docs] def visualize_subgraph_in_nglview(self,id,view): '''Visualize pathway in nglview widget Parameters ---------- id: str Subgraph id to be visualized view: :class:`nglview.widget.NGLWidget` NGL Viewer widget ''' cur_subgraph = self.protein_subgraphs[id] cur_emap = self.support[cur_subgraph.graph['pdb_id']] label_texts, _, color_list, selection_strs = self._visualize_subgraph_in_ngl(cur_emap,cur_subgraph) for i in range(0,len(selection_strs)): first_atm_select = selection_strs[i][:selection_strs[i].index(')')+1]+"]" view.add_representation('ball+stick',sele=selection_strs[i],color=color_list[i]) view.add_representation('label',color="black",sele=first_atm_select,labelText=label_texts[i])
def _visualize_subgraph_in_ngl(self, emap, G): ''' Gets visualization of subgraph in NGL viewer Parameters ---------- emap: :class:`~pyemap.emap` :class:`~pyemap.emap` object containing the protein subgraph G: class:`networkx.Graph` Protein subgraph to visualize in NGL viewer ''' colors = {"F": "orange", "Y": "blue", "W": "red", "H": "green"} label_texts = [] labeled_atoms = [] color_list = [] selection_strs = [] for res in G.nodes: label_texts.append(res) try: if res not in emap.eta_moieties: color_list.append(colors[res[0]]) labeled_atoms.append(".CA") else: color_list.append("pink") labeled_atoms.append(next(emap.residues[res].get_atoms()).name) except KeyError: color_list.append("pink") labeled_atoms.append(next(emap.residues[res].get_atoms()).name) selection_strs.append(emap.residues[res].ngl_string) return label_texts, labeled_atoms, color_list, selection_strs
[docs] def find_protein_subgraphs(self, clustering_option="structural", rmsd_thresh=.5, *args, **kwargs): ''' Finds protein subgraphs which match this pattern. This function must be executed to analyze protein subgraphs. Parameters ----------- clustering_option: str, optional Either 'structural' or 'sequence' rmsd_thresh: float threshold for determining structural similarity between identified subgraphs Notes ------ Graphs are clustered by both sequence and structrual similarity, and the results are stored in `self._structural_groups` and `self._sequence_groups`. The clustering_option argument used here determines which one of these groupings is used for `self.groups`. This can be changed at any time by calling :func:`~pyemap.graph_mining.SubgraphPattern.set_clustering` and specifying the other clustering option. ''' self.groups = {} self.protein_subgraphs = {} all_graphs = [] for pdb_id in self.support: all_graphs += self._find_subgraph_in_pdb(pdb_id) all_graphs.sort(key=lambda x: x.size(weight="weight")) for i, graph in enumerate(all_graphs): unique_id = graph.graph['pdb_id'] + "_" + str(i + 1) graph.graph['id'] = unique_id self.protein_subgraphs[unique_id] = graph if len(all_graphs) > 1: self._do_clustering(all_graphs, rmsd_thresh=rmsd_thresh) self.set_clustering(clustering_option) self.rmsd_thresh = rmsd_thresh else: self.groups[1] = [x.graph['id'] for x in all_graphs] self.clustering_option = clustering_option self._structural_groups = self.groups self._sequence_groups = self.groups
[docs] def set_clustering(self, clustering_option): ''' Sets clustering option. Parameters ---------- clustering_option: str Either 'structural' or 'sequence'. Notes ------ Since both types of clustering are always computed by :func:`pyemap.graph_mining.SubgraphPattern.find_protein_subgraphs` all this function actually does is swap some private variables. The purpose of this function is to determine what kind of clustering gets shown in the report. ''' if clustering_option == "structural": self.groups = self._structural_groups elif clustering_option == "sequence": self.groups = self._sequence_groups else: raise Exception("Either structural or sequence.") self.clustering_option = clustering_option
[docs] def _do_clustering(self, all_graphs, rmsd_thresh): '''Compute the supergraphs and find the connected components''' num_graphs = len(all_graphs) num_nodes = len(all_graphs[0].nodes) G_seq = nx.Graph() G_struct = nx.Graph() for i in range(0, len(all_graphs)): G_seq.add_node(i) G_struct.add_node(i) seq_sum = 0 rmsd_sum = 0 for i in range(0, num_graphs): for j in range(i + 1, num_graphs): GM = get_graph_matcher(all_graphs[i], all_graphs[j]) rmsds = [] seq_dists = [] for mapping in GM.subgraph_isomorphisms_iter(): seq_dists.append(self._subgraph_seq_dist(all_graphs[i], all_graphs[j], mapping)) rmsds.append(self._subgraph_rmsd(all_graphs[i], all_graphs[j], mapping)) seq_dist = np.min(seq_dists) rmsd = np.min(rmsds) seq_sum += seq_dist rmsd_sum += rmsd if seq_dist < num_nodes: G_seq.add_edge(i, j) if rmsd <= float(rmsd_thresh): G_struct.add_edge(i, j) self._structural_groups = _gen_groups( [c for c in sorted(nx.connected_components(G_struct), key=len, reverse=True)], all_graphs) self._sequence_groups = _gen_groups([c for c in sorted(nx.connected_components(G_seq), key=len, reverse=True)], all_graphs)
[docs] def _subgraph_seq_dist(self, sg1, sg2, mapping): ''' Computes sequence distance between two protein subgraphs ''' total_dist = 0 for key, val in mapping.items(): if sg1.nodes[key]['aligned_resnum'] != "X" and sg2.nodes[val]['aligned_resnum'] != "X": total_dist += np.absolute(sg1.nodes[key]['aligned_resnum'] - sg2.nodes[val]['aligned_resnum']) return total_dist
[docs] def _subgraph_rmsd(self, sg1, sg2, mapping): ''' Computes RMSD between two protein subgraphs ''' emap1 = self.support[sg1.graph['pdb_id']] emap2 = self.support[sg2.graph['pdb_id']] atoms1 = [] atoms2 = [] for key, val in mapping.items(): res1 = emap1.residues[key] res2 = emap2.residues[val] if 'CA' in res1 and 'CA' in res2: atoms1.append(res1['CA'].coord) atoms2.append(res2['CA'].coord) else: shared_id = None for atm in res1: if atm.id in res2: shared_id = atm.id break if shared_id is not None: atoms1.append(res1[shared_id].coord) atoms2.append(res2[shared_id].coord) else: return float("inf") if len(atoms1) >= 2 and len(atoms1) == len(atoms2): si = SVDSuperimposer() si.set(np.array(atoms1), np.array(atoms2)) si.run() return si.get_rms() else: return float("inf")
def _find_subgraph_in_pdb(self, pdb_id): ''' Finds all monomorphisms of this subgrpah class in a given PDB. Parameters ----------- pdb_id: str PDB ID of graph to be searched Returns -------- sgs: list of :class:`networkx.Graph` ''' GM = get_graph_matcher(self.support[pdb_id].init_graph, self.G) subgraph_isos = GM.subgraph_monomorphisms_iter() sgs = [] degree_dicts = [] for mapping in subgraph_isos: sg = self._generate_protein_subgraph(mapping, self.support[pdb_id].init_graph, self.G, self.support[pdb_id]) # eliminate redundant subgraphs degree_dict = dict(sg.degree) if degree_dict not in degree_dicts: degree_dicts.append(degree_dict) sgs.append(sg) self.total_support[pdb_id] = len(sgs) return sgs def _generate_protein_subgraph(self, mapping, protein_graph, G, emap_obj): ''' Generates protein subgraph for a given monomorphism Parameters ----------- mapping: dict of int:int Mapping of nodes in protein graph and generic subgraph protein_graph: :class:`networkx.Graph` Protein graph generated by pyemap G: :class:`networkx.Graph` Graph corresponding to subgraph pattern emap_obj: :class:`~pyemap.emap` eMap object corresponding to protein graph Returns -------- protein_subgraph: :class:`networkx.Graph` Protein subgraph corresponding to mapping ''' mapping = dict((v, k) for k, v in mapping.items()) protein_subgraph = G.copy() protein_subgraph = nx.relabel_nodes(protein_subgraph, mapping) # the access order is determined by residue number, maybe this will help sorted_graph = nx.Graph() for node in sorted(protein_subgraph.nodes(), key=lambda n: emap_obj.residues[n].full_id[3][1]): sorted_graph.add_node(node) sorted_graph.nodes[node]['shape'] = protein_graph.nodes[node]['shape'] sorted_graph.nodes[node]['label'] = str(node) sorted_graph.nodes[node]['num_label'] = protein_graph.nodes[node]['num_label'] sorted_graph.nodes[node]['aligned_resnum'] = emap_obj.residues[node].aligned_residue_number sorted_graph.nodes[node]['resnum'] = emap_obj.residues[node].full_id[3][1] sorted_graph.graph['pdb_id'] = protein_graph.graph['pdb_id'] for edge in protein_subgraph.edges(): sorted_graph.add_edge(edge[0], edge[1]) for key in protein_graph.edges[edge]: sorted_graph.edges[edge][key] = protein_graph.edges[edge][key] return sorted_graph
[docs] def subgraph_to_Image(self, id=None): '''Returns PIL image of subgraph pattern or protein subgraph Parameters ----------- id: str, optional Protein subgraph ID. If not specified, generic subgraph pattern will be drawn Returns -------- img: :class:`PIL.Image.Image` ''' if id is None: G = self.G.copy() else: G = self.protein_subgraphs[id].copy() make_pretty_subgraph(G) try: import pygraphviz agraph = to_agraph(G) agraph.graph_attr.update() agraph.edge_attr.update(len='1.0') fout = tempfile.NamedTemporaryFile(suffix=".png") agraph.draw(fout.name, prog='dot') except (ModuleNotFoundError, ImportError) as e: warnings.warn("Unable to import pygraphviz. Falling back on matplotlib...") import matplotlib.pyplot as plt draw_mpl_graph(G) fout = tempfile.NamedTemporaryFile(suffix=".png") plt.savefig(fout) plt.clf() img = Image.open(fout.name) return img
[docs] def subgraph_to_file(self, id=None, dest=""): '''Saves image of subgraph pattern or protein subgraph to file Parameters ----------- id: str, optional Protein subgraph ID. If not specified, generic subgraph pattern will be drawn dest; str,optional Destination to save the graph ''' if id is None: temp_G = make_pretty_subgraph(self.G.copy()) if dest == "": dest = self.id + ".png" else: temp_G = make_pretty_subgraph(self.protein_subgraphs[id]) if dest == "": dest = self.id + "_" + id + ".png" try: import pygraphviz agraph = to_agraph(temp_G) agraph.graph_attr.update() agraph.edge_attr.update(len='1.0') agraph.draw(dest, prog='dot') except (ModuleNotFoundError, ImportError) as e: warnings.warn("Unable to import pygraphviz. Falling back on matplotlib...") import matplotlib.pyplot as plt draw_mpl_graph(temp_G) plt.savefig(dest) plt.clf()