Plot and colour a minimum spanning tree with networkx
Background
A Minimum Spanning Tree (MST) is a concept from graph theory. Given a connected, undirected graph with weighted edges, an MST is a subset of the edges that connects all vertices together, without any cycles, and with the minimum possible total edge weight. While MSTs and phylogenetic trees both deal with the concept of connecting nodes in a graph, their purposes are different. Phylogenetic Trees represent evolutionary relationships among species or genes and have a root (representing a common ancestor) and branches (representing divergence). MSTs connect a set of points and do not represent evolutionary paths or divergence explicitly. While MSTs are not typically used to represent phylogenetic relationships in the traditional sense, they can be useful for showing genetic relatioships based on pairwise distance metrics. Visualizing genetic relationships in a network format rather than a strict tree format can sometimes be more intuitive.
Code
import pylab as plt
import numpy as np
import pandas as pd
def make_legend(fig, colormap, loc=(1.05, .6), title='',fontsize=12):
"""Make a figure legend wth provided color mapping"""
import matplotlib.patches as mpatches
pts=[]
for c in colormap:
pts.append(mpatches.Patch(color=colormap[c],label=c))
fig.legend(handles=pts,bbox_to_anchor=loc,fontsize=fontsize,title=title)
return pts
def get_color_mapping(df, col, cmap=None, seed=1):
"""Get random color map for categorcical dataframe column"""
c = df[col].unique()
if cmap == None:
rcolors = random_colors(len(c),seed)
else:
cmap = mpl.cm.get_cmap(cmap)
rcolors = [cmap(i) for i in range(len(c))]
colormap = dict(zip(c, rcolors))
newcolors = [colormap[i] if i in colormap else 'Black' for i in df[col]]
return newcolors, colormap
def dist_matrix_to_mst(distance_matrix, df=None, colorcol=None, labelcol=None, colormap=None,
cmap_name='Set1', node_size=4, font_size=6, with_labels=False,
edge_labels=False, legend_loc=(1, .7), ax=None):
"""
Dist matrix to minimum spanning tree
Args:
distance_matrix: matrix as dataframe
df: meta data with index corresponding to node names of tree
colorcol: column in meta table to color nodes by
labelcol: column in meta table to label nodes by
colormap: a mapping of node names to colors
cmap_name: name of an mpl colormap to apply instead of providing colormap
node_size: size of nodes
with_labels: whether to plot labels on nodes
legend_loc: location of legend
"""
if ax == None:
fig,ax=plt.subplots()
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
G = nx.Graph()
for i, row in distance_matrix.iterrows():
for j, weight in row.items():
G.add_edge(i, j, weight=weight)
T = nx.minimum_spanning_tree(G, algorithm='kruskal')
# Compute edge lengths based on distances
edge_lengths = [T[u][v]['weight'] for u, v in T.edges()]
# Plot the minimum spanning tree with edge lengths proportional to distances
pos = graphviz_layout(T)
labels = nx.get_edge_attributes(T, 'weight')
if df is not None:
l = [label for label in T.nodes if label in df.index]
df = df.loc[l]
if colormap is None:
colors,cmap = get_color_mapping(df, colorcol, cmap_name)
else:
#custom colormap if provided
colors = [colormap[i] if i in colormap else 'black' for i in df[colorcol]]
cmap = colormap
#print (cmap)
C = dict(zip(df.index, colors))
node_colors = [C[node] if node in C else 'Black' for node in T.nodes()]
#checks that colormap matches nodes so legend doesn't have crap in it
cmap = check_keys(cmap, df[colorcol].unique())
#add legend for node colors
p = make_legend(ax.figure, cmap, loc=legend_loc, title=colorcol,fontsize=9)
else:
node_colors = 'black'
nx.draw_networkx(T, pos, node_color=node_colors, node_size=node_size,
font_size=font_size, with_labels=with_labels, ax=ax)
if edge_labels == True:
nx.draw_networkx_edge_labels(T, pos, edge_labels=labels, font_size=font_size*.7, ax=ax)
if labelcol not in [None,'']:
node_labels = {node:df.loc[node][labelcol] if node in df.index else '' for node in T.nodes()}
#print (node_labels)
nx.draw_networkx_labels(T, pos, labels=node_labels, font_size=font_size,
horizontalalignment='right',verticalalignment='top')
ax.axis('off')
return T, pos
Here is some code to make test matrix and meta data:
def generate_distance_matrix(n, dim=4):
from scipy.spatial import distance_matrix
# Generate random coordinates in a latent space
coordinates = np.random.rand(n, dim)
# Compute the pairwise distances between the coordinates
dist_matrix = distance_matrix(coordinates, coordinates)
# Create row and column labels
labels = [f"item{i+1}" for i in range(n)]
# Convert the matrix to a pandas DataFrame
distance_df = (pd.DataFrame(dist_matrix, index=labels, columns=labels)*10).round(1)
return distance_df
def generate_random_metadata(names):
import random
# Define possible values for the metadata fields
countries = ['Ireland', 'UK', 'USA', 'Canada', 'Australia']
codes = ['A', 'B', 'C', 'D', 'E']
years = list(range(2000, 2025))
# List to store metadata for each name
metadata = []
for name in names:
# Randomly select values for the metadata fields
country = random.choice(countries)
code = random.choice(codes)
year = random.choice(years)
# Append the metadata as a dictionary
metadata.append({
'name': name,
'country': country,
'code': code,
'year': year
})
# Convert the list of metadata dictionaries to a pandas DataFrame
metadata_df = pd.DataFrame(metadata).set_index('name')
return metadata_df
Usage
Simple plot
dm = generate_distance_matrix(20)
meta = generate_random_metadata(dm.index)
T,pos=dist_matrix_to_mst(dm, meta, 'country', node_size=100)
Add edge labels
fig,ax=plt.subplots(1,1,figsize=(10,6))
T,pos = dist_matrix_to_mst(dm,meta,colorcol='country',cmap_name='tab10',edge_labels=True,font_size=9,
node_size=60,legend_loc=(1,.9),ax=ax)
Add node labels
T,pos = dist_matrix_to_mst(dm,meta,colorcol='year',cmap_name='tab20',with_labels=True,font_size=9,
node_size=600,legend_loc=(1,.9),ax=ax)
Clustering
Finally, let’s illustrate the relationship between clustering and tree visualization. Visualizing an MST shows the shortest paths to connect all nodes. The MST algorithm (e.g., Kruskal’s or Prim’s) selects edges based on the minimum weights without regard to the global structure of clusters. This can result in nodes that are close in distance but part of different local structures being connected in the MST. You can see this below when we apply labels from hierarchical clustering and plot them. It is useful to keep in mind.
def dm_cluster(distance_matrix, t, linkage='average'):
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(distance_threshold=t, n_clusters=None,
linkage=linkage, metric='precomputed').fit(distance_matrix)
labels = clustering.labels_+1
clusters = pd.DataFrame(labels,columns=['cluster'],index=distance_matrix.index)
return labels, clusters
labels,cl = dm_cluster(dm, 7)
meta['cluster'] = labels
#now color by cluster label
T,pos = dist_matrix_to_mst(dm,meta,colorcol='cluster',cmap_name='tab20',
node_size=40,legend_loc=(1,.9))
Links
- https://networkx.org/