# August George, 2022, PNNL
import argparse
import mrcfile
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
import networkx as nx
from scipy.spatial.distance import pdist,squareform
from pathlib import Path
[docs]def load_density_file(fname):
"""load a .mrc file using the mrcfile package
Args:
fname ([str]): filename / filepath
Returns:
[mrcfile object]: MRC data
"""
# load .mrc tomogram file as a MRC object which has header and data properties.
# see: https://mrcfile.readthedocs.io/en/latest/usage_guide.html
mrc = mrcfile.mmap(fname, mode=u'r') # memory mapped mode for large files
return mrc
[docs]def normalize_and_threshold_data(mrc, t, noise_stdev=0.0, norm_T=False):
"""normalizes threshold value and densities then applies a cutoff threshold
Args:
mrc ([mrcfile object]): mrc data
t ([float]): raw (unormalized) pixel intensity cutoff threshold
noise_stdev ([float]): Standard deviation of Gaussian noise (mean=0) to add. Default is 0 (no noise added)
norm_T ([bool]): threshold value is normalized (True) or not normalized (False). Default is False.
Returns:
[numpy array]: array of x,y,z coordinates which are above the cutoff threshold. A[0] = [x0,y0,z0]
"""
# load and normalize data, normalize threshold value
if noise_stdev == 0:
D = mrc.data
else:
assert(noise_stdev>=0)
D = add_Gaussian_noise(mrc,scale=noise_stdev)
D_min = np.min(D)
D_max = np.max(D)
D_norm = (D - D_min)/(D_max-D_min) # normalize to 0,1: (x_i-x_min) / (x_max - x_min)
if norm_T ==True:
t_norm = t
else:
t_norm = (t-D_min) / (D_max-D_min)
# get x,y,z coordinates above threshold
x,y,z = np.where(D_norm > t_norm)
xyz_data = np.transpose(np.stack([x,y,z]))
return xyz_data
[docs]def cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples):
"""Clusters data using DBSCAN from sklearn
Args:
xyz_data ([numpy array]): A[0] = [x0,y0,z0]
DBSCAN_epsilon ([float]): DBSCAN epsilon value (in pixels)
DBSCAN_min_samples ([int]): DBSCAN min_samples
Returns:
[sklearn DBSCAN cluster object]: clustering results stored in an object
"""
model = DBSCAN(eps=DBSCAN_epsilon, min_samples=DBSCAN_min_samples) # apply coarse-graining (DBSCAN)
model.fit_predict(xyz_data)
return model
[docs]def get_cluster_centroids(xyz_data, model):
"""Coarse grain density model using cluster centroids
Args:
xyz_data ([numpy array]): A[0] = [x0,y0,z0]
model ([sklearn DBSCAN cluster object]): clustering results stored in an object
Returns:
[numpy array]: array of cluster centroids, A[0] = [centroid_x0, centroid_y0, centroid_z0]
"""
samples_w_lbls = np.concatenate((xyz_data,model.labels_[:,np.newaxis]),axis=1)
if -1 in set(model.labels_): # if noise detected
coarse_model = np.zeros((len(set(model.labels_))-1,3)) # remove last label which is noise
for i in range(len(set(model.labels_))-1):
# https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python
tmp_T = np.transpose(samples_w_lbls[np.in1d(samples_w_lbls[:,-1], np.asarray([i]))])
x_mean = np.mean(tmp_T[0])
y_mean = np.mean(tmp_T[1])
z_mean = np.mean(tmp_T[2])
coarse_model[i] = [x_mean,y_mean,z_mean]
else:
coarse_model = np.zeros((len(set(model.labels_)),3))
for i in range(len(set(model.labels_))):
# https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python
tmp_T = np.transpose(samples_w_lbls[np.in1d(samples_w_lbls[:,-1], np.asarray([i]))])
x_mean = np.mean(tmp_T[0])
y_mean = np.mean(tmp_T[1])
z_mean = np.mean(tmp_T[2])
coarse_model[i] = [x_mean,y_mean,z_mean]
return coarse_model
[docs]def plot_clustering_results(xyz_data, coarse_model, figsize=3):
"""creates a 3D scatter plot containing both the xyz data and the cluster centroids.
Note: should rotate afterwards for better visualization.
Args:
xyz_data ([numpy array]): A[0] = [x0,y0,z0]
coarse_model ([numpy array]): array of cluster centroids, A[0] = [centroid_x0, centroid_y0, centroid_z0]
Returns:
[matplotlib figure object]: 3d scatter plot figure
"""
fig = plt.figure(figsize=(figsize, figsize))
plt.title('clustering results')
ax = fig.add_subplot(projection='3d')
ax.scatter(xyz_data[:,0], xyz_data[:,1], xyz_data[:,2], c='purple', s=2, alpha=0.3)
ax.scatter(coarse_model[:,0], coarse_model[:,1], coarse_model[:,2], c='k', s=10, alpha=0.9)
return fig
[docs]def create_and_save_graph(coarse_model, proximity_px, out_fname, save=True):
"""creates a Networkx graph from the coarse grained model (cluster centroids) and saves it as a graph XML file (.gexf)
Args:
coarse_model ([numpy array]): array of cluster centroids, A[0] = [centroid_x0, centroid_y0, centroid_z0]
proximity_px ([float]): pairwise cutoff distance for assigning edges to nodes, in pixels.
out_fname ([string]): filename for output
save ([boolean]): flag to save file (True) or not (False)
Returns:
[networkx graph object]: graph representation of coarse model (cluster centroids)
"""
d_matrix = squareform(pdist(coarse_model, 'euclid'))
d_matrix_thresh = np.where(d_matrix>proximity_px, 0, d_matrix) # if the distance is > t, replace it with 0 (i.e. remove edge)
G = nx.convert_matrix.from_numpy_matrix(d_matrix_thresh) # convert distance matrix to networkx graph object
if save:
nx.write_gexf(G,f'{out_fname}.gexf')
return G
[docs]def add_Gaussian_noise(mrc, loc=0.0, scale=1.0):
"""Adds Gaussian white noise to the data in an mrc file
ref: https://numpy.org/doc/stable/reference/random/generated/numpy.random.normal.html
Args:
mrc (mrcfile object): mrc object to add noise to
loc (float, optional): mean of Gaussian distribution. Defaults to 0.
scale (float, optional): standard deviation of Gaussian distribution. Defaults to 1.
Returns:
[numpy array]: data with noise added
"""
D = mrc.data
noise = np.random.normal(loc=loc, scale=scale, size=D.shape)
D_w_noise = D + noise
return D_w_noise
[docs]def main(args):
"""Takes a 3D density (.mrc), applies threshold, coarse-grains data, and converts it into a graph network.
Outputs a .png file of the coarse grained model, and a .gexf graph xml file.
Args:
args ([argument parser object]):
- args.fname: .mrc filename (white density w/ black background)
- args.t: unormalized pixel intensity threshold level
- args.eps: DBSCAN epsilon (inter cluster distance)
- args.ms: DBSCAN min samples (minimum number of samples in cluster)
- args.d_cut: pairwise distance cutoff for assigning edges to graph, in pixels
"""
fname = Path(args.fname)
t = args.t
DBSCAN_epsilon = args.eps # try 1
DBSCAN_min_samples = args.ms # try 4
d_cut = args.d_cut # try 8
out_fname = fname.with_suffix('')
mrc = load_density_file(fname)
xyz_data = normalize_and_threshold_data(mrc,t)
model = cluster_data(xyz_data,DBSCAN_epsilon,DBSCAN_min_samples)
coarse_model = get_cluster_centroids(xyz_data,model)
G = create_and_save_graph(coarse_model,d_cut,out_fname)
fig = plot_clustering_results(xyz_data,coarse_model)
if __name__ == '__main__':
# example: >> python density2graph.py fname.mrc 0.5 1 4 8
parser = argparse.ArgumentParser()
parser.add_argument("fname", help="tomogram .mrc filename (white density w/ black background)", type=str)
parser.add_argument("t", help="pixel intensity threshold cutoff (unormalized)", type=float)
parser.add_argument("eps", help="DBSCAN epsilon (inter cluster distance) in pixels", type=float)
parser.add_argument("ms", help="DBSCAN min samples (minimum number of samples in cluster)", type=float)
parser.add_argument("d_cut", help="pairwise distance cutoff for assigning edges in pixels", type=float)
args = parser.parse_args()
main(args)