Source code for pdb2graph

# August George, 2022, PNNL

import numpy as np
import pandas as pd
import networkx as nx
from Bio.PDB import PDBParser, MMCIFParser  # BioPython PDB doc: https://biopython.org/docs/1.75/api/Bio.PDB.html
from scipy.spatial.distance import pdist,squareform
import argparse
import matplotlib.pyplot as plt
from pathlib import Path


[docs]def get_hydrophobicity(name, warn=False):
    """Gets hydophobicity based on amino acid name.


    ref: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html 


    note: returns NaN if input name is invalid!

    Args:
        name ([str]): Amino acid name
        warn ([bool]): if True, print a warning when hydrophobicity can't be determined

    Returns:
        [float]: hydrophobicity
    """
    acid_names = ['Ile', 'Val', 'Leu', 'Phe', 'Cys', 'Met', 'Ala', 'Gly', 'Thr', 'Ser', 'Trp', 'Tyr', 'Pro', 'His', 'Glu', 'Gln', 'Asp', 'Asn', 'Lys', 'Arg']
    acid_kdH = [4.5,	4.2, 3.8, 2.8, 2.5, 1.9, 1.8, -0.4, -0.7, -0.8, -0.9, -1.3, -1.6, -3.2, -3.5, -3.5, -3.5, -3.5, -3.9, -4.5] 
    hydro_dict = dict(zip(acid_names, acid_kdH))
    try:
        hydrophobicity = hydro_dict[f'{name.title()}']  # make sure string is in correct format 
    except:   
        hydrophobicity = np.nan  
        if warn == True:
            print(f'warning! could not assign hydrophobicity for {name.title()}. setting hydrophobicity=NaN') 
    return hydrophobicity 
   

[docs]def PDB_to_df(pdb_code, fname, pdbx, offset, CA_only=1):
    """Loads a PDB (or PDBx) file and stores the atom coordinates and residue name and number into a dataframe. 
    
    Note: if the PDB file has more than one model, the first model is chosen. 

    Args:
        pdb_code ([str]): PDB ID / label for the protein of interest
        fname ([str]): filename for the protein of interest. Can be PDB or PDBx format
        pdbx ([int]): Set=1 if using the newer PDBx file format. 
        offest ([int]): index offset incase first residue ID in PDB file is not the first physical residue (e.g. PDB starts at 5th residue). 
        CA_only ([int]): Set=1 [default] if using only alpha carbons, else all atoms are used. 
        
    Returns:
        [Pandas dataframe object]: dataframe containing every atom's x,y,z coord and serial number
    """
    # pick a file reader based on the filetype. Newer/bigger structures will use PDBx 
    if pdbx == 1:
        parser = MMCIFParser()  # for PDBx files (.CIF)
    else:
        parser = PDBParser()  # for PDB files (.PDB)
    structure = parser.get_structure(pdb_code, fname)  # parse pdb file and store as a PDB structure object

    # makes lists storing the alpha carbon (CA) coordinates and serial number 
    atom_coord_list = [] 
    res_id_list = []
    hydro_list = []

    # loop through PDB structure object to get alpha carbon (or all atom) coordinates and serial number
    for i, model in enumerate(structure):  # iterate through each model in the structure, exit loop after first model 
        if i > 0: # NMR files have multiple models, but only one model is used for the graph
            break  
        for chain in model:  # iterate through each chain in the model
            for residue in chain:  # iterate through each residue in the chain
                for atom in residue:  # iterate through each atom in residue
                    if CA_only == 1:
                        if atom.get_name() == 'CA':  # if atom is an alpha carbon then append its coordinates and serial number
                            atom_coord_list.append(atom.get_vector()[:])  # convert vector object into list of numbers
                            atom_sn = atom.get_serial_number()

                            # biopython PDB module residue.__repr__() returns a string representing the residue:
                            # e.g. '<Residue THR het=  resseq=1 icode= >'
                            # 'resseq' is the residue serial number
                            # --> split the string until the number after 'resseq=' is all that remains
                            res_repr = residue.__repr__()
                            res_repr_tmp = res_repr.split('resseq=',1)[1]
                            res_sn = int(res_repr_tmp.split(' ',1)[0])  + offset  # get residue serial number (+ offest)
                            res_name = residue.get_resname() # get first letters of residue name
                            hydro_list.append(get_hydrophobicity(res_name))  # get hydrophobicity based on residue name

                            # store residue ID: <first letter of residue name><residue serial number + offset>
                            res_id_list.append(f'{res_name}{res_sn}-{atom_sn}') 
                    else:  # get all atoms
                        if(residue.get_resname() != 'HOH'):
                            atom_coord_list.append(atom.get_vector()[:])  # convert vector object into list of numbers
                            atom_sn = atom.get_serial_number()

                            # biopython PDB module residue.__repr__() returns a string representing the residue:
                            # e.g. '<Residue THR het=  resseq=1 icode= >'
                            # 'resseq' is the residue serial number
                            # --> split the string until the number after 'resseq=' is all that remains
                            res_repr = residue.__repr__()
                            res_repr_tmp = res_repr.split('resseq=',1)[1]
                            res_sn = int(res_repr_tmp.split(' ',1)[0])  + offset  # get residue serial number (+ offest)
                            res_name = residue.get_resname() # get first letters of residue name
                            hydro_list.append(get_hydrophobicity(res_name))  # get hydrophobicity based on residue name

                            # store residue ID: <first letter of residue name><residue serial number + offset>
                            res_id_list.append(f'{res_name}{res_sn}-{atom_sn}') 
                 
    # create dataframes for atom coordinates, serial numbers, and then combine them. 
    # there's probably a better way to do this in a single line
    atom_df = pd.DataFrame(atom_coord_list, columns=['x','y','z'])
    res_id_df = pd.DataFrame(res_id_list, columns=['residue id'])
    hydro_df = pd.DataFrame(hydro_list, columns=['hydrophobicity'])
    df = pd.concat([atom_df, res_id_df, hydro_df], axis=1)
    return df
          

[docs]def PDB_df_to_G(PDB_df, d_cut=8.0):
    """Converts a dataframe containing alpha carbon / atom coordinates (in Angstroms) into a graph, G(V,E). 

    Each vertex, V, is an alpha carbon / atom. Two alpha carbons / atoms with a distance (in Angstroms) less than a cutoff, d_cut, are connected by an edge, E. 

    Args:
        PDB_df ([Pandas dataframe object]): a dataframe containing alpha carbon / atom coordinate columns labeled: 'x', 'y', and 'z'
        d_cut ([float]): Threshold for two alpha carbons / atoms to be connected (in Angstroms) by an edge. Defaults to 8.0

    Returns:
        G ([networkX graph object]): protein structure network graph, G(V,E)
    """
    # create distance matrix where element i,j is the Euclidean distance between vertex i and vertex j
    d_matrix = squareform(pdist(PDB_df[['x','y','z']], 'euclid'))  
    d_matrix_thresh = np.where(d_matrix>d_cut, 0, d_matrix)  # if the distance is > d_cut, replace it with 0 (i.e. remove edge)
    G = nx.convert_matrix.from_numpy_matrix(d_matrix_thresh)   # convert distance matrix to networkx graph object

    # create mapping to relabel nodes from index (i.e. 0...n-1) to residue id (i.e. str(<resname><res s/n + offset>) for each residue)
    label_mapping = {}
    hydro_mapping = {}
    for node in (G.nodes()):  # here the node value is an index (0-n-1)
        label_mapping[node] = PDB_df['residue id'][node]
        hydro_mapping[node] = PDB_df['hydrophobicity'][node]

    betweenness = nx.betweenness_centrality(G)
    nx.set_node_attributes(G, betweenness, 'betweenness')
    nx.set_node_attributes(G, hydro_mapping, 'hydrophobicity')
    H = nx.relabel_nodes(G, label_mapping)  # update the nodes and store as new graph object
    return H


[docs]def save_data(df, G, df_name, G_name):
    """Convenience function that stores dataframe as .csv and graph as .gexf file

    Args:
        df ([Pandas dataframe object]): dataframe to save
        G ([NetworkX graph object]): graph to save
        df_name ([str]): output filename for dataframe .csv
        G_name ([str]): output filename for graph .gexf

    """
    df.to_csv(f'{df_name}.csv')  # write dataframe to .csv 
    nx.write_gexf(G, f'{G_name}.gexf')  # write graph to .gexf (graph network XML file)


[docs]def save_data_at_this_folder(data_path, df, G, df_name, G_name):
    """Convenience function that stores dataframe as .csv and graph as .gexf file

    Args:
        data_path ([str] or [Path]): output directory path
        df ([Pandas dataframe object]): dataframe to save
        G ([NetworkX graph object]): graph to save
        df_name ([str]): output filename for dataframe .csv
        G_name ([str]): output filename for graph .gexf
    """
    df_name = Path(data_path,df_name)    
    df.to_csv(f'{df_name}.csv')  # write dataframe to .csv 
    G_name = Path(data_path,G_name)
    nx.write_gexf(G, f'{G_name}.gexf')  # write graph to .gexf (graph network XML file)


[docs]def plot_coordinates(xyz_data, figsize=5):
    """creates a 3D scatter plot containing the xyz data 

    Args:
        xyz_data ([numpy array]):  A[0] = [x0,y0,z0]
        figsize (int, optional): size of figure (figsize x figsize). Defaults to 5.

    Returns:
        [matplotlib figure object]: 3d scatter plot figure
    """
    fig = plt.figure(figsize=(figsize, figsize))
    plt.title('x y z coordinates')
    ax = fig.add_subplot(projection='3d')
    ax.scatter(xyz_data[:,0], xyz_data[:,1], xyz_data[:,2], c='purple', s=2, alpha=0.3)
    return fig


[docs]def plot_FA_and_CA_coordinates(FA_xyz, CA_xyz, figsize=5):
    """creates a 3D scatter plot containing CA and FA atom coordinates

    Args:
        FA_xyz ([numpy array]): A[0] = [x0,y0,z0] for all atom coordinate data
        CA_xyz ([numpy array]): A[0] = [x0,y0,z0] for alpha carbon only coordinate data
        figsize (int, optional): size of figure (figsize x figsize). Defaults to 5.

    Returns:
        [matplotlib figure object]: 3d scatter plot figure
    """
    fig = plt.figure(figsize=(figsize, figsize))
    
    plt.title('x y z coordinates - FA and CA')
    ax = fig.add_subplot(projection='3d')
    ax.scatter(FA_xyz[:,0],FA_xyz[:,1], FA_xyz[:,2], c='purple', s=5, alpha=0.3, label='FA')
    ax.scatter(CA_xyz[:,0],CA_xyz[:,1], CA_xyz[:,2], c='black', s=15, alpha=0.5, label='CA') 
    ax.legend()
    return fig


[docs]def main(args):
    """Takes a .pdb(x) file, converts it into a graph, and saves the atom coordinates to .csv and graph as .gexf 

    Args:
        args ([argument parser object]):

            - args.pdb_code: PDB id / protein name

            - args.fname: PDB/PDBx filename

            - args.d_cut: Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms)

            - args.o: PDB residue index offset integer. Default is 0.

            - args.pdbx: set=1 to use pdbx file parser

            - args.CA_only: set=1 to use only alpha carbons (0 for all atoms)
    """
    # get arguments
    pdb_code = args.pdb_code
    fname = args.fname
    d_cut = args.d_cut
    o = args.o
    pdbx = args.pdbx
    CA_only = args.CA_only

    df = PDB_to_df(pdb_code, fname, pdbx, o, CA_only)  # convert pdb file into dataframe of atom/alpha carbon coordinates and s/n
    G = PDB_df_to_G(df, d_cut)  # convert coordinate dataframe into network graph
    save_data(df, G, pdb_code, pdb_code)  # save dataframe as .csv and graph as .gexf using the same name as the pdb code


if __name__ == '__main__':
    # example: >> python pdb2graph.py PDBid pdb_file.pdb 8 0 0 1
    # get input arguments from command line
    parser = argparse.ArgumentParser()
    parser.add_argument("pdb_code", help="PDB id / protein name", type=str)
    parser.add_argument("fname", help="PDB/PDBx filename", type=str)
    parser.add_argument("d_cut", help="Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms)", type=float)
    parser.add_argument("o", help="PDB residue index offset integer. Default is 0.", type=int)
    parser.add_argument("pdbx", help="set=1 to use pdbx file parser", type=int)
    parser.add_argument("CA_only", help="set=1 to use only alpha carbons (0 for all atoms)", type=int)
    args = parser.parse_args()
    main(args)
Source code for pdb2graph

grip-tomo

Navigation

Related Topics