Source code for pdb2graph

# August George, 2022, PNNL

import numpy as np
import pandas as pd
import networkx as nx
from Bio.PDB import PDBParser, MMCIFParser  # BioPython PDB doc: https://biopython.org/docs/1.75/api/Bio.PDB.html
from scipy.spatial.distance import pdist,squareform
import argparse
import matplotlib.pyplot as plt
from pathlib import Path


[docs]def get_hydrophobicity(name, warn=False): """Gets hydophobicity based on amino acid name. ref: https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html note: returns NaN if input name is invalid! Args: name ([str]): Amino acid name warn ([bool]): if True, print a warning when hydrophobicity can't be determined Returns: [float]: hydrophobicity """ acid_names = ['Ile', 'Val', 'Leu', 'Phe', 'Cys', 'Met', 'Ala', 'Gly', 'Thr', 'Ser', 'Trp', 'Tyr', 'Pro', 'His', 'Glu', 'Gln', 'Asp', 'Asn', 'Lys', 'Arg'] acid_kdH = [4.5, 4.2, 3.8, 2.8, 2.5, 1.9, 1.8, -0.4, -0.7, -0.8, -0.9, -1.3, -1.6, -3.2, -3.5, -3.5, -3.5, -3.5, -3.9, -4.5] hydro_dict = dict(zip(acid_names, acid_kdH)) try: hydrophobicity = hydro_dict[f'{name.title()}'] # make sure string is in correct format except: hydrophobicity = np.nan if warn == True: print(f'warning! could not assign hydrophobicity for {name.title()}. setting hydrophobicity=NaN') return hydrophobicity
[docs]def PDB_to_df(pdb_code, fname, pdbx, offset, CA_only=1): """Loads a PDB (or PDBx) file and stores the atom coordinates and residue name and number into a dataframe. Note: if the PDB file has more than one model, the first model is chosen. Args: pdb_code ([str]): PDB ID / label for the protein of interest fname ([str]): filename for the protein of interest. Can be PDB or PDBx format pdbx ([int]): Set=1 if using the newer PDBx file format. offest ([int]): index offset incase first residue ID in PDB file is not the first physical residue (e.g. PDB starts at 5th residue). CA_only ([int]): Set=1 [default] if using only alpha carbons, else all atoms are used. Returns: [Pandas dataframe object]: dataframe containing every atom's x,y,z coord and serial number """ # pick a file reader based on the filetype. Newer/bigger structures will use PDBx if pdbx == 1: parser = MMCIFParser() # for PDBx files (.CIF) else: parser = PDBParser() # for PDB files (.PDB) structure = parser.get_structure(pdb_code, fname) # parse pdb file and store as a PDB structure object # makes lists storing the alpha carbon (CA) coordinates and serial number atom_coord_list = [] res_id_list = [] hydro_list = [] # loop through PDB structure object to get alpha carbon (or all atom) coordinates and serial number for i, model in enumerate(structure): # iterate through each model in the structure, exit loop after first model if i > 0: # NMR files have multiple models, but only one model is used for the graph break for chain in model: # iterate through each chain in the model for residue in chain: # iterate through each residue in the chain for atom in residue: # iterate through each atom in residue if CA_only == 1: if atom.get_name() == 'CA': # if atom is an alpha carbon then append its coordinates and serial number atom_coord_list.append(atom.get_vector()[:]) # convert vector object into list of numbers atom_sn = atom.get_serial_number() # biopython PDB module residue.__repr__() returns a string representing the residue: # e.g. '<Residue THR het= resseq=1 icode= >' # 'resseq' is the residue serial number # --> split the string until the number after 'resseq=' is all that remains res_repr = residue.__repr__() res_repr_tmp = res_repr.split('resseq=',1)[1] res_sn = int(res_repr_tmp.split(' ',1)[0]) + offset # get residue serial number (+ offest) res_name = residue.get_resname() # get first letters of residue name hydro_list.append(get_hydrophobicity(res_name)) # get hydrophobicity based on residue name # store residue ID: <first letter of residue name><residue serial number + offset> res_id_list.append(f'{res_name}{res_sn}-{atom_sn}') else: # get all atoms if(residue.get_resname() != 'HOH'): atom_coord_list.append(atom.get_vector()[:]) # convert vector object into list of numbers atom_sn = atom.get_serial_number() # biopython PDB module residue.__repr__() returns a string representing the residue: # e.g. '<Residue THR het= resseq=1 icode= >' # 'resseq' is the residue serial number # --> split the string until the number after 'resseq=' is all that remains res_repr = residue.__repr__() res_repr_tmp = res_repr.split('resseq=',1)[1] res_sn = int(res_repr_tmp.split(' ',1)[0]) + offset # get residue serial number (+ offest) res_name = residue.get_resname() # get first letters of residue name hydro_list.append(get_hydrophobicity(res_name)) # get hydrophobicity based on residue name # store residue ID: <first letter of residue name><residue serial number + offset> res_id_list.append(f'{res_name}{res_sn}-{atom_sn}') # create dataframes for atom coordinates, serial numbers, and then combine them. # there's probably a better way to do this in a single line atom_df = pd.DataFrame(atom_coord_list, columns=['x','y','z']) res_id_df = pd.DataFrame(res_id_list, columns=['residue id']) hydro_df = pd.DataFrame(hydro_list, columns=['hydrophobicity']) df = pd.concat([atom_df, res_id_df, hydro_df], axis=1) return df
[docs]def PDB_df_to_G(PDB_df, d_cut=8.0): """Converts a dataframe containing alpha carbon / atom coordinates (in Angstroms) into a graph, G(V,E). Each vertex, V, is an alpha carbon / atom. Two alpha carbons / atoms with a distance (in Angstroms) less than a cutoff, d_cut, are connected by an edge, E. Args: PDB_df ([Pandas dataframe object]): a dataframe containing alpha carbon / atom coordinate columns labeled: 'x', 'y', and 'z' d_cut ([float]): Threshold for two alpha carbons / atoms to be connected (in Angstroms) by an edge. Defaults to 8.0 Returns: G ([networkX graph object]): protein structure network graph, G(V,E) """ # create distance matrix where element i,j is the Euclidean distance between vertex i and vertex j d_matrix = squareform(pdist(PDB_df[['x','y','z']], 'euclid')) d_matrix_thresh = np.where(d_matrix>d_cut, 0, d_matrix) # if the distance is > d_cut, replace it with 0 (i.e. remove edge) G = nx.convert_matrix.from_numpy_matrix(d_matrix_thresh) # convert distance matrix to networkx graph object # create mapping to relabel nodes from index (i.e. 0...n-1) to residue id (i.e. str(<resname><res s/n + offset>) for each residue) label_mapping = {} hydro_mapping = {} for node in (G.nodes()): # here the node value is an index (0-n-1) label_mapping[node] = PDB_df['residue id'][node] hydro_mapping[node] = PDB_df['hydrophobicity'][node] betweenness = nx.betweenness_centrality(G) nx.set_node_attributes(G, betweenness, 'betweenness') nx.set_node_attributes(G, hydro_mapping, 'hydrophobicity') H = nx.relabel_nodes(G, label_mapping) # update the nodes and store as new graph object return H
[docs]def save_data(df, G, df_name, G_name): """Convenience function that stores dataframe as .csv and graph as .gexf file Args: df ([Pandas dataframe object]): dataframe to save G ([NetworkX graph object]): graph to save df_name ([str]): output filename for dataframe .csv G_name ([str]): output filename for graph .gexf """ df.to_csv(f'{df_name}.csv') # write dataframe to .csv nx.write_gexf(G, f'{G_name}.gexf') # write graph to .gexf (graph network XML file)
[docs]def save_data_at_this_folder(data_path, df, G, df_name, G_name): """Convenience function that stores dataframe as .csv and graph as .gexf file Args: data_path ([str] or [Path]): output directory path df ([Pandas dataframe object]): dataframe to save G ([NetworkX graph object]): graph to save df_name ([str]): output filename for dataframe .csv G_name ([str]): output filename for graph .gexf """ df_name = Path(data_path,df_name) df.to_csv(f'{df_name}.csv') # write dataframe to .csv G_name = Path(data_path,G_name) nx.write_gexf(G, f'{G_name}.gexf') # write graph to .gexf (graph network XML file)
[docs]def plot_coordinates(xyz_data, figsize=5): """creates a 3D scatter plot containing the xyz data Args: xyz_data ([numpy array]): A[0] = [x0,y0,z0] figsize (int, optional): size of figure (figsize x figsize). Defaults to 5. Returns: [matplotlib figure object]: 3d scatter plot figure """ fig = plt.figure(figsize=(figsize, figsize)) plt.title('x y z coordinates') ax = fig.add_subplot(projection='3d') ax.scatter(xyz_data[:,0], xyz_data[:,1], xyz_data[:,2], c='purple', s=2, alpha=0.3) return fig
[docs]def plot_FA_and_CA_coordinates(FA_xyz, CA_xyz, figsize=5): """creates a 3D scatter plot containing CA and FA atom coordinates Args: FA_xyz ([numpy array]): A[0] = [x0,y0,z0] for all atom coordinate data CA_xyz ([numpy array]): A[0] = [x0,y0,z0] for alpha carbon only coordinate data figsize (int, optional): size of figure (figsize x figsize). Defaults to 5. Returns: [matplotlib figure object]: 3d scatter plot figure """ fig = plt.figure(figsize=(figsize, figsize)) plt.title('x y z coordinates - FA and CA') ax = fig.add_subplot(projection='3d') ax.scatter(FA_xyz[:,0],FA_xyz[:,1], FA_xyz[:,2], c='purple', s=5, alpha=0.3, label='FA') ax.scatter(CA_xyz[:,0],CA_xyz[:,1], CA_xyz[:,2], c='black', s=15, alpha=0.5, label='CA') ax.legend() return fig
[docs]def main(args): """Takes a .pdb(x) file, converts it into a graph, and saves the atom coordinates to .csv and graph as .gexf Args: args ([argument parser object]): - args.pdb_code: PDB id / protein name - args.fname: PDB/PDBx filename - args.d_cut: Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms) - args.o: PDB residue index offset integer. Default is 0. - args.pdbx: set=1 to use pdbx file parser - args.CA_only: set=1 to use only alpha carbons (0 for all atoms) """ # get arguments pdb_code = args.pdb_code fname = args.fname d_cut = args.d_cut o = args.o pdbx = args.pdbx CA_only = args.CA_only df = PDB_to_df(pdb_code, fname, pdbx, o, CA_only) # convert pdb file into dataframe of atom/alpha carbon coordinates and s/n G = PDB_df_to_G(df, d_cut) # convert coordinate dataframe into network graph save_data(df, G, pdb_code, pdb_code) # save dataframe as .csv and graph as .gexf using the same name as the pdb code
if __name__ == '__main__': # example: >> python pdb2graph.py PDBid pdb_file.pdb 8 0 0 1 # get input arguments from command line parser = argparse.ArgumentParser() parser.add_argument("pdb_code", help="PDB id / protein name", type=str) parser.add_argument("fname", help="PDB/PDBx filename", type=str) parser.add_argument("d_cut", help="Alpha Carbon / atom pairwise contact distance cutoff (in Angstroms)", type=float) parser.add_argument("o", help="PDB residue index offset integer. Default is 0.", type=int) parser.add_argument("pdbx", help="set=1 to use pdbx file parser", type=int) parser.add_argument("CA_only", help="set=1 to use only alpha carbons (0 for all atoms)", type=int) args = parser.parse_args() main(args)