Source code for tfcomb.network


from datetime import datetime
import copy
import multiprocessing as mp
import pandas as pd
import numpy as np
import random
import itertools
import scipy
import re
import matplotlib
import matplotlib.pyplot as plt
from packaging import version

if version.parse(matplotlib.__version__) < version.parse("3.6.0"):
	from mpl_toolkits.axes_grid.inset_locator import inset_axes
else:  # axes_grid was removed in matplotlib 3.6.0
	from mpl_toolkits.axes_grid1.inset_locator import inset_axes

#Network analysis
import networkx as nx
import community as community_louvain

import tfcomb.utils
from tfcomb.logging import TFcombLogger, InputError
from tfcomb.utils import check_columns, check_type, check_string

#-------------------------------------------------------------------------------#
#------------------------ Build network from table -----------------------------#
#-------------------------------------------------------------------------------#

def _is_symmetric(a, rtol=1e-05, atol=1e-08):
	""" Utility to check if a matrix is symmetric (within a tolerance).
	Source: https://stackoverflow.com/a/42913743

	Parameters
	----------
	a : array-like
		The matrix to check. Anything accepted by np.asarray.
	rtol : float, optional
		Relative tolerance passed to np.allclose. Default: 1e-05.
	atol : float, optional
		Absolute tolerance passed to np.allclose. Default: 1e-08.

	Returns
	-------
	bool
		True if 'a' is a square 2D matrix which equals its transpose within the given tolerances, else False.
	"""

	matrix = np.asarray(a)

	#Only square 2D matrices can be symmetric. Without this guard, np.allclose would broadcast
	#e.g. shape (1,n) against (n,1) and could report a non-square matrix as symmetric.
	if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
		return False

	return np.allclose(matrix, matrix.T, rtol=rtol, atol=atol)


def _establish_node_attributes(table, node):
	""" 
	Returns a list of columns fitting to 'node' values.

	A column fits to 'node' if its values are in one-to-one correspondence with the values
	of the 'node' column, i.e. both columns have identical codes after factorization.
	Columns containing missing values are never reported.

	Parameters
	----------
	table : pd.DataFrame 
		Edges table including node/edge attributes.
	node : str
		Name of column contain node names.

	Returns
	-------
	list
		List of column names
	"""	

	sub = table[:100000] #subset in interest of performance
	if len(sub) == 0:
		return([]) #nothing to compare against

	#Codes are assigned in order of first appearance; two columns have identical codes
	#exactly when their values map one-to-one onto each other.
	node_codes = pd.factorize(sub[node])[0]

	node_attributes = []
	for attribute in table.columns:
		if attribute == node:
			continue

		attribute_codes = pd.factorize(sub[attribute])[0]

		#pd.factorize marks missing values with -1; such columns are not assigned to nodes
		if (attribute_codes < 0).any():
			continue

		#Exact comparison of codes. A chi-square p-value of 1.0 is not a reliable test for this,
		#as the p-value rounds to exactly 1.0 for many non-identical columns once the table is large.
		if np.array_equal(node_codes, attribute_codes):
			node_attributes.append(attribute)
	
	return(node_attributes)

def _get_table_dtypes(table):
	""" Get a simplified dtype name for each column of a table.

	The dtype name of each column is stripped of any digits (e.g. 'int64' -> 'int'),
	and the dtype 'object' is reported as 'string'.

	Parameters
	----------
	table : pd.DataFrame
		The table to collect column dtypes from.

	Returns
	-------
	dict
		Dictionary of format {column name: simplified dtype name}.
	"""

	dtype_dict = {}
	for column, dtype in zip(table.columns, table.dtypes.values):
		simplified = re.sub(r'[0-9]+', '', str(dtype)) #remove bit-width, e.g. 'float32' -> 'float'
		dtype_dict[column] = "string" if simplified == "object" else simplified

	return(dtype_dict)

def build_network(edge_table, 
					node1="TF1", 
					node2="TF2", 
					node_table=None,
					directed=False, 
					multi=False,
					tool="networkx",
					verbosity=1):
	""" Build a network object from a table using either 'networkx' or 'graph-tool'.
	
	Parameters
	-----------
	edge_table : pd.DataFrame 
		Table containing rows of edges and edge information between node1/node2. The table is not changed.
	node1 : str, optional
		The column to use as node1 ID. Default: "TF1".
	node2 : str, optional
		The column to use as node2 ID. Default: "TF2".
	node_table : pandas.DataFrame 
		A table of attributes to use for nodes. The index of the table is used as node names, and each column
		is added as a node attribute. The table is not changed. Default: node attributes are estimated from the columns in edge_table.
	directed : bool, optional
		Whether edges are directed or not. Default: False.
	multi : bool, optional
		Allow multiple edges between two vertices. NOTE: Only valid for tool == 'networkx'. If False, the first occurrence of TF1-TF2/TF2-TF1 in the table is used. Default: False.
	tool : str, optional
		Which module to use for generating network. Must be one of 'networkx' or 'graph-tool'. Default: 'networkx'.
	verbosity : int, optional
		Verbosity of logging (0/1/2/3). Default: 1.

	Returns
	--------
	if tool is 'networkx': networkx.Graph / networkx.DiGraph / networkx.MultiGraph / networkx.MultiDiGraph - depending on parameters given.
	if tool is 'graph-tool': graph_tool.Graph 

	Raises
	-------
	InputError
		If 'multi=True' is given together with 'tool=graph-tool'.
	"""

	#Check given input
	check_type(edge_table, pd.DataFrame, "table")
	check_string(tool, ["networkx", "graph-tool"], "tool")

	if tool == "graph-tool":
		if tfcomb.utils.check_graphtool(): #check if graph-tool is installed
			import graph_tool

	#Setup table (a copy, so that the input table is left untouched)
	table = edge_table.copy()
	check_columns(table, [node1, node2])

	#Setup logger
	logger = TFcombLogger(verbosity)

	#########################################
	############# Prepare edges #############
	#########################################

	if multi and tool == "graph-tool":
		raise InputError("The option 'multi=True' is not compatible with 'tool=graph-tool'. Please adjust parameters.")

	# Subset edges if multi is not allowed
	if not multi:

		#Remove duplicates of the same edge (first occurrence is kept)
		table = table.drop_duplicates([node1, node2])

		#Collect unique pairs (first occurrence is kept); B-A is skipped if A-B was already seen
		table = table.set_index([node1, node2])
		to_keep = {} #dict is used for fast lookup while keeping the order of pairs
		for pair in table.index:
			if pair[::-1] not in to_keep: #if opposite was not already found
				to_keep[pair] = ""

		#Subset table
		table = table.loc[list(to_keep.keys())]
		table.reset_index(inplace=True)

	logger.spam("Edges table ({0}) head: {1}".format(table.shape, table.head()))

	#########################################
	####### Setup network attributes ########
	#########################################

	attribute_columns = [col for col in table.columns if col not in [node1, node2]]

	if node_table is None:
		
		#Establish node attributes
		node1_attributes = _establish_node_attributes(table, node1)
		logger.debug("node1_attributes: {0}".format(node1_attributes))
		
		node2_attributes = _establish_node_attributes(table, node2)
		node2_attributes = list(set(node2_attributes) - set(node1_attributes)) #prevent the same columns from being assigned to both TF1 and TF2)
		logger.debug("node2_attributes: {0}".format(node2_attributes))

		#Setup tables for node1 and node2 information
		node1_table = table[[node1] + node1_attributes].drop_duplicates().set_index(node1, drop=False) #also includes node1
		node2_table = table[[node2] + node2_attributes].drop_duplicates().set_index(node2, drop=False) #also includes node2

		#Merge node information to dict for network
		node_table = node1_table.merge(node2_table, left_index=True, right_index=True, how="outer")
		node_table.drop_duplicates(inplace=True)
	
	else:
		#TODO: check that node_table fits with node1/node2
		node_table = node_table.copy() #the graph-tool branch adds a column; do not change the table of the caller

	logger.spam("node_table ({0}) head:\n{1}".format(node_table.shape, node_table.head()))
	node_attributes = list(node_table.columns)
	logger.debug("node_attributes: {0}".format(node_attributes))
	node_attribute_dict = {i: {att: row[att] for att in node_attributes} for i, row in node_table.iterrows()}
	logger.spam("node_attribute_dict: {0} (...)".format({i: node_attribute_dict[i] for i in list(node_attribute_dict.keys())[:5]}))
	
	######## Setup edge attributes #######	
	#NOTE(review): node_attribute_dict is keyed by node names, so this only excludes columns named like a node; 
	#columns found to be node attributes are therefore also kept as edge attributes. 
	#Possibly 'node_attributes' was intended - kept as is, since callers may read these attributes from edges.
	edge_attributes = [col for col in attribute_columns if col not in node_attribute_dict]
	logger.debug("edge_attributes: {0}".format(edge_attributes))
	edges_list = [(row[node1], row[node2], {att: row[att] for att in edge_attributes}) for i, row in table.iterrows()]
	logger.spam("edges_list: {0} (...)".format(edges_list[:3]))

	#########################################
	############## Build Graph ##############
	#########################################

	if tool == "networkx":
		if multi:
			G = nx.MultiDiGraph() if directed else nx.MultiGraph()
		else:
			G = nx.DiGraph() if directed else nx.Graph()

		#Add collected edges
		G.add_edges_from(edges_list)
		
		#Add node attributes
		nx.set_node_attributes(G, node_attribute_dict)

		return(G)

	elif tool == "graph-tool":

		#Initialize graph
		g = graph_tool.Graph(directed=directed)

		node_table["_name"] = node_table.index #Add the node table index to the node info

		#Node attributes; one vertex property per column available in node_table
		dtype_dict = _get_table_dtypes(node_table)
		for att in node_table.columns:
			g.vertex_properties[att] = g.new_vertex_property(dtype_dict[att])

		#Edge attributes
		dtype_dict = _get_table_dtypes(table[edge_attributes])
		for att in edge_attributes:
			g.edge_properties[att] = g.new_edge_property(dtype_dict[att])

		## Add nodes with properties
		name2idx = {} #TF name to idx
		for name, row in node_table.iterrows():
			v = g.add_vertex()
			
			name2idx[name] = v #idx of node			
			for prop in g.vertex_properties:
				g.vertex_properties[prop][v] = row[prop]

		## Add edges with properties
		for _, row in table.iterrows(): #loop over all edges in table
			v1, v2 = name2idx[row[node1]], name2idx[row[node2]]
			e = g.add_edge(v1, v2)
			
			for prop in g.edge_properties:
				g.edge_properties[prop][e] = row[prop]

		return(g)


#-------------------------------------------------------------------------------#
#------------------------- Network analysis algorithms -------------------------#
#-------------------------------------------------------------------------------#

def get_degree(G, weight=None, direction="both"):
	"""
	Get degree per node in graph. If weight is given, the degree is the sum of weighted edges.
	The degree is additionally saved to each node of G as the attribute 'degree' (direction 'both'), 
	'in_degree' (direction 'in') or 'out_degree' (direction 'out').

	Parameters
	-----------
	G : networkx.Graph
		An instance of networkx.Graph
	weight : str, optional
		Name of an edge attribute within network. Default: None.
	direction : str, optional
		Which edge direction to use for calculating degrees. Can be one of: ["both", "in", "out"]. 
		NOTE: 'in'/'out' use G.in_degree/G.out_degree, which presumably requires a directed graph. Default: 'both'.

	Returns
	--------
	DataFrame
		A table with node names as index and one column 'degree', sorted from highest to lowest degree.

	Raises
	-------
	ValueError
		If 'weight' is given, but is not an edge attribute within the network.
	"""

	#Check input
	tfcomb.utils.check_type(G, [nx.Graph])
	tfcomb.utils.check_type(weight, [str, type(None)], "weight")
	tfcomb.utils.check_string(direction, ["both", "in", "out"], "direction")

	#Check that weight is available within network
	if weight is not None:

		#Collect attribute names across all edges (in order of appearance); also valid for networks without edges
		edge_attributes = list(dict.fromkeys(att for _, _, data in G.edges(data=True) for att in data))
		if weight not in edge_attributes:
			raise ValueError("Weight '{0}' is not an edge attribute of given network. Available attributes are: {1}".format(weight, edge_attributes))

	#Calculate degree; the attribute is looked up by name, as in_degree/out_degree are not available for all graph types
	degree_attributes = {"both": "degree", "in": "in_degree", "out": "out_degree"}
	degree_view = getattr(G, degree_attributes[direction])
	d = dict(degree_view(weight=weight))

	#Convert dict to df; giving 'columns' ensures that an empty network yields an empty table instead of an error
	df = pd.DataFrame.from_dict(d, orient="index", columns=["degree"])
	df.sort_values("degree", inplace=True, ascending=False)    

	#Add attribute to nodes
	att = direction + "_degree" if direction in ["in", "out"] else "degree" 
	for node in d:
		G.nodes[node][att] = d[node] 

	return(df)

def get_betweenness_centrality(G, weight=None):
	"""
	Get betweenness centrality per node in graph, as calculated by networkx.betweenness_centrality.
	The centrality is additionally saved to each node of G as the attribute 'betweenness_centrality'.

	Parameters
	-----------
	G : networkx.Graph
		An instance of networkx.Graph
	weight : str, optional
		Edge attribute, which is passed on to networkx.betweenness_centrality. Default: None.

	Returns
	--------
	DataFrame
		A table with node names as index and one column 'betweenness_centrality', sorted from highest to lowest centrality.
	"""

	att = "betweenness_centrality"
	centrality = nx.betweenness_centrality(G, weight=weight)

	#Add attribute to nodes
	for node, value in centrality.items():
		G.nodes[node][att] = value

	#Convert dict to df
	df = pd.DataFrame.from_dict(centrality, orient="index", columns=[att])
	df.sort_values(att, inplace=True, ascending=False)

	return(df)


def subset_graph(G, nodes, depth=0):
	"""
	Subset a graph to a subset of nodes and their neighborhoods at the depth given by 'depth'.

	Parameters
	-------------
	G : networkx.Graph
		An instance of networkx.Graph.
	nodes : str or list of str
		Nodes to keep.
	depth : int, optional
		The maximum number of edges between a given node and the neighbors to include. 
		Neighbors are found by networkx.single_source_shortest_path_length, i.e. by following edges 
		as networkx traverses them from the given nodes. Default: 0 (only edges between given nodes)

	Returns
	--------
	A new graph of the same type as G, containing the selected nodes and all edges between them. G is not changed.
	"""

	if isinstance(nodes, str):
		nodes = [nodes]

	#Collect all nodes within 'depth' steps of any of the given nodes (each given node has distance 0 to itself)
	nodes_to_keep = set()
	for source in nodes:
		within_depth = nx.single_source_shortest_path_length(G, source, cutoff=depth)
		nodes_to_keep.update(within_depth)

	#Copy the subgraph view to return an independent graph
	return(G.subgraph(nodes_to_keep).copy())


#Graph clustering
def cluster_louvain(G, weight=None, attribute_name="cluster", logger=None):
	"""
	Cluster a network using community louvain clustering. By default, sets the attribute "cluster" to each node.
	Clusters are named "1", "2", (...) and saved as strings.

	Parameters
	----------
	G : networkx.Graph 
		An instance of a network graph to cluster. Must be undirected.
	weight : str
		Attribute in graph to use as weight. The higher the weight, the stronger the link. Default: None.
	attribute_name : str
		The attribute name to use for saving clustering. Default: "cluster".
	logger : a logger object
		An instance of a logger. Default: No logging.

	Returns
	--------
	None - clustering is added to 'G' in place.

	Raises
	-------
	TypeError
		If G is a directed graph.
	ValueError
		If 'weight' is given, but is not an edge attribute within the network.
	"""

	if logger is None:
		logger = TFcombLogger(0)

	tfcomb.utils.check_type(G, [nx.Graph])

	#network must be undirected
	if G.is_directed():
		raise TypeError("Bad graph type, use only non directed graph")

	#Collect attribute names across all edges (in order of appearance); also valid for networks without edges
	edge_attributes = list(dict.fromkeys(att for _, _, data in G.edges(data=True) for att in data))

	#Process weights
	if weight is None:
		#choose a weight name which is not within network to ensure that all weights are set to 1. 
		#Ref: https://github.com/taynaud/python-louvain/issues/73#issuecomment-751483227

		weight = "None"
		while weight in edge_attributes: #if weight was in edge_attributes, get random string
			weight = tfcomb.utils.random_string()

	elif weight not in edge_attributes:
		raise ValueError("Weight '{0}' is not an edge attribute in network. Available edge attributes are: {1}".format(weight, edge_attributes))

	logger.debug("'weight' is set to: '{0}'".format(weight))	
		
	#Cluster network
	logger.debug("Running community_louvain.best_partition()")
	cluster_dict = community_louvain.best_partition(G, weight=weight, random_state=1) #random_state ensures that results are reproducible

	#Add partition information to each node; set per node to keep any existing node attributes
	for node, cluster_idx in cluster_dict.items():
		G.nodes[node][attribute_name] = str(cluster_idx + 1) #cluster names start at 1

	#No return - G is changed in place

def cluster_blockmodel(g, attribute_name="cluster"):
	""" Clustering of a graph-tool graph using stochastic block model minimization.
	
	Parameters
	-----------
	g : a graph.tool graph
		An instance of a graph.tool graph.
	attribute_name : str
		The attribute name to use for saving clustering. Default: "cluster".

	Returns
	--------
	None - clustering is added to 'g' in place as a vertex property of type 'string'.
	"""
	
	#check if graph-tool is installed
	if tfcomb.utils.check_graphtool():
		import graph_tool.inference #the inference submodule is imported explicitly, as it is needed below

	#Infer blocks
	state = graph_tool.inference.minimize_blockmodel_dl(g)
	blocks = state.get_blocks()

	#Add vertex property to graph
	clustering_prop = g.new_vertex_property("string")
	g.vertex_properties[attribute_name] = clustering_prop
	
	for i in range(g.num_vertices()):
		clustering_prop[i] = str(blocks[i]) #the property holds strings

	#No return - g was changed in place

def get_node_table(G):
	""" Get a table containing node names and node attributes for G.

	Parameters
	-----------
	G : a networkx Graph object or graph_tool Graph object
	
	Returns
	--------
	pandas.DataFrame
		A table with one row per node and one column per node attribute. For networkx graphs, the index 
		contains the node names; for graph-tool graphs, the index contains the vertex indices.

	Raises
	-------
	InputError
		If G is neither a networkx graph nor a graph-tool graph.
	"""

	#networkx graph; attributes are read directly from the node view
	if isinstance(G, nx.Graph):
		return(pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index'))

	#check if graph-tool is installed
	if tfcomb.utils.check_graphtool():
		import graph_tool

	if not isinstance(G, graph_tool.Graph):
		raise InputError("Unknown format of input Graph")

	#graph-tool graph; collect the value of each vertex property per vertex index
	properties = list(G.vertex_properties.keys())
	data = {i: {prop: G.vertex_properties[prop][i] for prop in properties} for i in range(G.num_vertices())}

	return(pd.DataFrame.from_dict(data, orient="index"))

def get_edge_table(G):
	""" Get a table containing edge names and edge attributes for G.

	Parameters
	-----------
	G : a networkx Graph object.
	
	Returns
	--------
	pandas.DataFrame
		A table with one row per edge, keyed by the pair of node names, and one column per edge attribute.
		NOTE: Rows are keyed by the node pair only, so for edges sharing the same pair of nodes, only the last edge is kept.
	"""

	#dict of (TF1,TF2):{edge_att}
	edge_dict = {(source, target): attributes for source, target, attributes in G.edges(data=True)}
	table = pd.DataFrame.from_dict(edge_dict, orient='index')

	return(table)

def create_random_network(nodes, edges):
	""" 
	Create a random network with the given list of nodes and total number of edges.

	Parameters
	-------------
	nodes : list
		List of nodes to use in network.
	edges : int
		Number of edges between nodes. If more edges are requested than there are 
		possible pairs of nodes, all possible pairs are connected.

	Returns
	---------
	networkx.Graph containing random edges between nodes.
	"""

	nodes = list(dict.fromkeys(nodes)) #unique nodes in given order; prevents duplicate pairs below

	G_rand = nx.Graph()
	G_rand.add_nodes_from(nodes)
	
	#Setup edges; pairs are sampled without replacement to ensure that the network contains the requested number of edges
	combis = list(itertools.combinations(nodes, 2))
	n_edges = max(0, min(edges, len(combis)))
	edges_list = random.sample(combis, k=n_edges)
	G_rand.add_edges_from(edges_list)
	
	return(G_rand)

def plot_powerlaw(G, title="Node degree powerlaw fit", color="blue", save=None):
	""" 
	Fit and plot a powerlaw distribution to the node degrees in the network. 
	NOTE: Node degrees are calculated using get_degree, which also saves the degree as an attribute to the nodes of G.

	Parameters
	------------
	G : a networkx Graph object
		Networkx containing nodes and edges to analyze.
	title : str, optional
		The title of the resulting plot. Default: "Node degree powerlaw fit".
	color : str, optional
		The color of the data plotted. Default: "blue".
	save : str, optional
		If not None, save the plot to the given path. Default: None.

	Returns
	--------
	ax : matplotlib.axes
		Axes object containing the plot.
	"""

	tfcomb.utils.check_module("powerlaw")
	import powerlaw

	#Get node degrees
	degrees = get_degree(G)
	
	#Fit power-law
	data = list(degrees["degree"].values)
	fit = powerlaw.Fit(data, discrete=True, estimate_discrete=True, verbose=False, xmin=1)
	
	#todo: Calculate R2
	
	#### Plot
	fig, ax = plt.subplots()

	#Plot fit and data to 'ax' explicitly, so that the plot does not depend on which axes are currently active
	fit.power_law.plot_pdf(ax=ax, color='black', linestyle='--', label='Powerlaw fit')
	fit.plot_pdf(ax=ax, color=color, label="Data (n={0})".format(len(degrees)))
	ax.legend(fontsize=12, loc="upper right")
	
	#Histogram of degrees as a small inset (loc=3 is the lower left corner)
	ax1in = inset_axes(ax, width = "30%", height = "30%", loc=3)
	ax1in.hist(data, density=True, color=color)
	ax1in.set_xticks([])
	ax1in.set_yticks([])
	ax1in.set_title("Histogram", fontsize=12)
	
	ax.set_xlabel("Node degree", fontsize=12)
	ax.set_ylabel("Density", fontsize=12)
	ax.set_title(title, fontsize=16)

	if save is not None:
		fig.savefig(save, dpi=600, bbox_inches="tight")

	return(ax)