Source code for tfcomb.annotation

import os
import pandas as pd
pd.options.mode.chained_assignment = None #suppress 'SettingWithCopyWarning' prints
import copy
import os
import numpy as np
import gzip
import shutil
import requests
import tobias
import glob
import multiprocessing as mp
import psutil

#UROPA annotation
import logging
import pysam
import uropa
from uropa.annotation import annotate_single_peak
from uropa.utils import format_config

#GO-term analysis
from goatools.base import download_ncbi_associations
from goatools.base import download_go_basic_obo
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.obo_parser import GODag

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#Import tfcomb
import tfcomb
from tfcomb.utils import check_type, check_value, check_string
from tfcomb.logging import TFcombLogger, InputError

#Load internal data
import pkg_resources
DATA_PATH = pkg_resources.resource_filename("tfcomb", 'data/')
gene_tables = glob.glob(DATA_PATH + "*_genes.txt")
available_organisms = [os.path.basename(gene_table.replace("_genes.txt", "")) for gene_table in gene_tables]

taxid_table = pd.read_csv(DATA_PATH + "tax_ids.txt", sep="\t", header=None)
organism_to_taxid = dict(zip(taxid_table[0], taxid_table[1]))

#-------------------------------------------------------------------------------#
#----------------------------- Annotation of sites -----------------------------#
#-------------------------------------------------------------------------------#

[docs]def annotate_regions(regions, gtf, config=None, best=True, threads=1, verbosity=1):
	"""
	Annotate regions with genes from .gtf using UROPA [1]_. 

	Parameters
	----------
	regions : tobias.utils.regions.RegionList() or pandas.DataFrame
		A RegionList object with positions of genomic elements e.g. TFBS or a DataFrame containing chr/start/stop-coordinates. If DataFrame, the function assumes that the order of columns is: 
		'chromosome', 'start', 'end', 'id', 'score', 'strand'.
	gtf : str
		Path to .gtf file containing genomic elements for annotation.
	config : dict, optional
		A dictionary indicating how regions should be annotated. Default is to annotate feature 'gene' within -10000;1000bp of the gene start. See 'Examples' of how to set up a custom configuration dictionary.
	best : boolean
		Whether to return the best annotation or all valid annotations. Default: True (only best are kept).
	threads : int, optional
		Number of threads to use for multiprocessing. Default: 1.
	verbosity : int, optional
		Level of verbosity of logger. One of 0,1, 2. Default: 1.

	Returns
	--------
	pd.DataFrame or None
		Dataframe including regions and annotation information (if applicable, otherwise a warning will be displayed and None is returned).
		

	References
	----------
		.. [1] Kondili M, Fust A, Preussner J, Kuenne C, Braun T, and Looso M. UROPA: a tool for Universal RObust Peak Annotation. Scientific Reports 7 (2017), doi: 10.1038/s41598-017-02464-y

	Examples
	---------
	>>> custom_config = {"queries": [{"distance": [10000, 1000], 
	...					 "feature_anchor": "start", 
	...					 "feature": "gene"}],
	...					 "priority": True, 
	...					 "show_attributes": "all"}
	
	#Annotate regions (data/ refers to the data directory of the tfcomb github repository) \n
	
	>>> regions = pd.read_csv("data/GM12878_hg38_chr4_ATAC_peaks.bed")				
	>>> annotate_regions(regions, gtf="data/chr4_genes.gtf",
							  config=custom_config)
	"""
	
	#Check input types
	check_type(regions, [list, tobias.utils.regions.RegionList, pd.DataFrame], "regions")
	check_type(gtf, str, "gtf")
	check_type(config, [type(None), dict], "config")
	check_value(threads, vmin=1, name="threads")

	#Setup logger (also checks verbosity is valid)
	logger = TFcombLogger(verbosity)
	
	#Establish configuration dict
	if config is None:
		cfg_dict = {"queries": [{"distance": [10000, 1000], 
								"feature_anchor": "start", 
								"feature": "gene"}],
					"priority": True, 
					"show_attributes": "all"}
	else:
		cfg_dict = config

	cfg_dict = copy.deepcopy(cfg_dict) #make sure that config is not being changed in place
	cfg_dict = format_config(cfg_dict, logger=logger)
	logger.debug("Config dictionary: {0}".format(cfg_dict))

	#Convert regions to dict for uropa
	region_dicts = []
	if type(regions) == pd.DataFrame:	
		for idx, row in regions.iterrows():
			elements = row.iloc[:6].tolist()
			d = {"peak_chr": elements[0],
				 "peak_start": int(elements[1]),
				 "peak_end": int(elements[2]),
				 "peak_id": elements[3],
				 "peak_score": elements[4],
				 "peak_strand": elements[5]}
			region_dicts.append(d)
	else:
		for region in regions:
			d = {"peak_chr": region.chrom, 
				"peak_start": region.start, 
				"peak_end": region.end, 
				"peak_id": region.name,
				"peak_score": region.score,
				"peak_strand": region.strand}
			region_dicts.append(d)
	
	#Index tabix
	gtf_given = gtf 	#gtf before indexing
	gtf_index = gtf + ".tbi"
	if not os.path.exists(gtf_index):
		try:
			gtf = pysam.tabix_index(gtf, preset="gff", keep_original=True)
		except OSError: #gtf is already gzipped
			gtf_gz = gtf + ".gz"
			gtf_index = gtf_gz + ".tbi"
			if not os.path.exists(gtf_index):
				gtf_gz = pysam.tabix_index(gtf_gz, preset="gff", keep_original=True)
			gtf = gtf_gz

	#Force close of gtf file left open; pysam issue 1038
	proc = psutil.Process()
	for f in proc.open_files():
		if f.path == os.path.abspath(gtf_given):
			os.close(f.fd) 

	#Split input regions into cores
	n_reg = len(region_dicts)
	per_chunk = int(np.ceil(n_reg/float(threads)))
	region_dict_chunks = [region_dicts[i:i+per_chunk] for i in range(0, n_reg, per_chunk)]

	#Calculate annotations for each chunk
	annotations = []
	if threads == 1:
		for region_chunk in region_dict_chunks:
			chunk_annotations = _annotate_peaks_chunk(region_chunk, gtf, cfg_dict)
			annotations.extend(chunk_annotations)
	else:
		
		#Start multiprocessing pool
		pool = mp.Pool(threads)

		#Start job for each chunk
		jobs = []
		for region_chunk in region_dict_chunks:
			job = pool.apply_async(_annotate_peaks_chunk, (region_chunk, gtf, cfg_dict, ))
			jobs.append(job)
		pool.close()
	
		#TODO: Print progress
		n_done = [job.ready() for job in jobs]

		#Collect results:
		for job in jobs:
			chunk_annotations = job.get()
			annotations.extend(chunk_annotations)

		pool.join()

	#Select best annotations
	if best == True:
		annotations = [annotations[i] for i, anno in enumerate(annotations) if anno["best_hit"] == 1]
	
	
	found_annotation = False
	#Extend feat_attributes per annotation and format for output
	del_keys = ["raw_distance", "anchor_pos", "query", "peak_center", "peak_length", "feat_length", "feat_center"] 
	for anno in annotations:
		if "feat_attributes" in anno:
			# at least one annotation was found
			found_annotation = True
			for key in anno["feat_attributes"]:
				anno[key] = anno["feat_attributes"][key]
			del anno["feat_attributes"]

		#remove certain keys
		for key in del_keys:
			if key in anno:
				del anno[key]

		#Remove best_hit column if best is True
		if best == True:
			del anno["best_hit"]

		#Convert any lists to string
		for key in anno:
			if isinstance(anno[key], list):
				anno[key] = anno[key][0]


	# handle no annotations found
	if not found_annotation:
		logger.warning("No annotations found for given config. Return type is None.")
		return None

	#Convert to pandas table
	annotations_table = pd.DataFrame(annotations)

	#Merge original sites with annotations
	if type(regions) == pd.DataFrame:
		region_columns = regions.columns.tolist()[:6]
		annotation_columns = ["peak_chr", "peak_start", "peak_end",	"peak_id", "peak_score", "peak_strand"]
		anno_unique = annotations_table.drop_duplicates()
		annotations_table = regions.merge(anno_unique, left_on=region_columns, right_on=annotation_columns, how="left")
		annotations_table.drop(columns=annotation_columns, inplace=True)

	#Add information to .annotation for each peak
	#for i, region in enumerate(regions):
	#	region.annotation = best_annotations[i]
	#logger.info("Attribute '.annotation' was added to each region")
	
	return(annotations_table)

def _annotate_peaks_chunk(region_dicts, gtf, cfg_dict):
	""" Multiprocessing safe function to annotate a chunk of regions """

	logger = uropa.utils.UROPALogger()

	#Open tabix file
	tabix_obj = pysam.TabixFile(gtf)

	#For each peak in input peaks, collect all_valid_annotations
	all_valid_annotations = []
	for i, region in enumerate(region_dicts):
	
		#Annotate single peak
		valid_annotations = annotate_single_peak(region, tabix_obj, cfg_dict, logger=logger)
		all_valid_annotations.extend(valid_annotations)

	tabix_obj.close()

	return(all_valid_annotations)

[docs]def get_annotated_genes(regions, attribute="gene_name"):
	""" Get list of genes from the list of annotated regions from annotate_regions(). 
	
	Parameters
	-----------
	regions : RegionList() or list of OneTFBS objects 
	
	attribute : str
		The name of the attribute in the 9th column of the .gtf file. Default: 'gene_name'.
	"""
	
	genes = []
	for region in regions:
		
		#Check if region has any annotation
		if "feature" in region.annotation: 
			att_dict = region.annotation.get("feat_attributes", {})
			gene = att_dict.get(attribute, [None])[0]
			
			#if gene == None:
			#    print(region.annotation)
			genes.append(gene)
	
	#Format gene list
	genes = [gene for gene in genes if gene is not None] #Remove None from list
	genes = list(set(genes)) #remove duplicates
	
	return(genes)

#-------------------------------------------------------------------------------#
#------------------------------- GO-term enrichment ----------------------------#
#-------------------------------------------------------------------------------#

[docs]class GOAnalysis(pd.DataFrame):

	aspect_translation = {"BP": "Biological Process",
						  "MF": "Molecular Function",
						  "CC": "Cellular Component"}
	@property
	def _constructor(self):
		return GOAnalysis

	@staticmethod
	def _fmt_field(v):
		""" Format a object to string depending on type. For use when formatting table entries of tuple/list """

		if isinstance(v, str):
			s = v
		elif isinstance(v, tuple):
			s = "/".join([str(i) for i in v])
		elif isinstance(v, list) or isinstance(v, set):
			o = sorted(list(v))
			s = ", ".join([str(i) for i in o])
		else: #float/int
			s = v
		return(s)

	@staticmethod
	def _match_column(table, lst):
		""" Returns the column name with the best match to the list of ids/values """

		#Which column from gene_table has the best match?
		match = []
		for column in table.columns:
			tup = (column, sum([value in table[column].tolist() for value in lst]))
			match.append(tup)
		
		id_col = sorted(match, key=lambda t: -t[1])[0][0] #column with best matching IDs

		return(id_col)	

[docs]	def enrichment(self, genes, 
						organism="hsapiens", 
						background=None, 
						propagate_counts=True,
						min_depth=1, 
						verbosity=1):
		"""
		Perform a GO-term enrichment based on a list of genes. This is a TF-COMB wrapper for goatools.

		Parameters
		-----------
		gene_ids : list
			A list of gene ids.
		organism : :obj:`str`, optional
			The organism of which the gene_ids originate. Defaults to 'hsapiens'.
		background : list, optional
			A specific list of background gene ids to use. Default: The list of protein coding genes of the 'organism' given. 
		propagate_counts : bool
			Whether to propagate counts up the tree to parent GO's. Default: True.
		min_depth : int
			Minimum depth of GO-terms to show in output table. Default: 1.
		verbosity : int, optional
			Default: 1.

		See also
		---------
		tfcomb.annotation.GOAnalysis.plot_bubble()

		Returns
		--------
		GOAnalysis object containing enrichment results

		Reference
		----------
		https://www.nature.com/articles/s41598-018-28948-z
		
		"""
		
		#verbosity 0/1/2

		#setup logger
		logger = TFcombLogger(verbosity)
		
		#TODO: Gene_ids must be a list
		tfcomb.utils.check_type(genes, [list, set], "gene_ids")
		
		#Organism must be in human/mouse
		if organism not in available_organisms:
			raise InputError("Organism '{0}' not available. Please choose any of: {1}".format(organism, available_organisms))
		taxid = organism_to_taxid[organism]
		logger.info("Running GO-term enrichment for organism '{0}' (taxid: {1})".format(organism, taxid))
		
		############### Read data ###############

		## Setup GOATOOLS GO DAG
		obo_fname = "go-basic.obo"
		if not os.path.isfile(obo_fname):
			logger.info("Downloading ontologies")
			obo_fname = download_go_basic_obo()
		obodag = GODag(obo_fname)
		
		## Setup gene -> GO term associations
		#check if gene2go contains any data; delete if not
		fin_gene2go = "gene2go"
		if os.path.exists(fin_gene2go):
			s = os.path.getsize(fin_gene2go)
			if s == 0:
				logger.warning("gene2go has size 0; deleting the file")
				os.remove(fin_gene2go)

		else: #file does not exist; try downloading
			logger.info("Downloading NCBI associations")
			try:
				fin_gene2go = download_ncbi_associations()
			except:
				logger.warning("An error occurred downloading NCBI associations using goatools")
				logger.warning("TF-COMB will attempt to download and extract the file manually")
				
				url = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz"
				base = os.path.basename(url) #gene2go.gz
				
				logger.debug("Downloading {0}".format(url))
				with open(base, "wb") as f:
					r = requests.get(url)
					f.write(r.content)

				logger.debug("Decompressing {0}".format(base))
				with gzip.open(base, 'rb') as f_in:
					with open(fin_gene2go, 'wb') as f_out:
						shutil.copyfileobj(f_in, f_out)

		logger.debug("fin_gene2go: {0}".format(fin_gene2go))

		logger.debug("Setting up Gene2GoReader")
		objanno = Gene2GoReader(fin_gene2go, taxids=[taxid])
		logger.debug(objanno)

		ns2assoc = objanno.get_ns2assc()
		#logger.debug(ns2assoc)

		
		############### Setup analysis ###############

		#Read data from package
		gene_table = pd.read_csv(DATA_PATH + organism + "_genes.txt", sep="\t")

		###### Setup gene ids ####### 
		logger.info("Setting up gene ids")

		target_col = "entrezgene_id"

		#Check if genes are in ns2assoc; else, try to convert
		all_gene_ids = set(sum([list(ns2assoc[aspect].keys()) for aspect in ns2assoc.keys()], []))
		n_found = sum([gene_id in all_gene_ids for gene_id in genes])
		
		if n_found == 0:
			
			#Which column from gene_table has the best match?
			id_col = self._match_column(gene_table, genes)
			logger.debug("genes best match column: {0}".format(id_col))
			
			#Find out how many ids can be converted
			not_found = set(genes) - set(gene_table[id_col])
			if len(not_found) > 0:
				logger.warning("{0}/{1} gene ids from 'genes' could not be converted to entrez ids for {2}".format(len(not_found), len(set(genes)), organism))

			#Convert to entrez id
			ids2entrez = dict(zip(gene_table[id_col], gene_table[target_col]))
			gene_ids_entrez = [ids2entrez.get(s, -1) for s in genes]
			gene_ids_entrez = set([gene for gene in gene_ids_entrez if gene > 0]) #only genes for which there was a match

			#Save dict for converting back to original values
			entrez2id = dict(zip(gene_table[target_col], gene_table[id_col]))

		else:
			gene_ids_entrez = genes #gene ids were already entrez
			entrez2id = dict(zip(genes, genes))

		###### Setup background gene ids ######
		if background == None:

			#Use data from package
			logger.debug("Getting background_gene_ids from gene_table")
			background_gene_ids_entrez = list(set(gene_table["entrezgene_id"].tolist())) #unique gene ids from table

		else:

			#Find best-fitting column in gene_table
			id_col = self._match_column(gene_table, background)
			ids2entrez = dict(zip(gene_table[id_col], gene_table[target_col]))
			background_gene_ids_entrez = [ids2entrez.get(s, -1) for s in background]
			background_gene_ids_entrez = set([gene for gene in background_gene_ids_entrez if gene > 0]) #only genes for which there was a match
		
		#Check if genes were in ns2assoc




		#Setup goeaobj
		logger.info("Setting up GO enrichment")
		goeaobj = GOEnrichmentStudyNS(
					background_gene_ids_entrez, # List of protein-coding genes in entrezgene format
					ns2assoc, # geneid/GO associations
					obodag, # Ontologies
					propagate_counts = propagate_counts,
					alpha = 0.05, # default significance cut-off
					methods = ['fdr_bh'], # defult multipletest correction method
					prt=None, 
					log=None) 
		
		############### Run study ###############
		logger.debug("Running .run_study():")
		if verbosity <= 1:
			goea_results_all = goeaobj.run_study(gene_ids_entrez, prt=None)
		else:
			goea_results_all = goeaobj.run_study(gene_ids_entrez)
		
		#Convert study items (entrez) to original ids
		for res in goea_results_all:
			res.study_items = [entrez2id[entrez] for entrez in res.study_items]

		############### Format to dataframe ###############

		keys_to_use = ["GO", "name", "NS", "depth", "enrichment", "ratio_in_study", "ratio_in_pop", 
					"p_uncorrected", "p_fdr_bh", "study_count", "study_items"]
		lines = [[self._fmt_field(res.__dict__[key]) for key in keys_to_use] for res in goea_results_all]
		table = pd.DataFrame(lines, columns=keys_to_use)

		#Set minimum depth (stop "propagate" from collecting counts to upper aspect categories)
		table = table[table["depth"] >= min_depth]

		#Convert e (enriched)/p (purified) to increased/decreased
		translation_dict = {"e": "increased", "p": "decreased"}
		table.replace({"enrichment": translation_dict}, inplace=True)

		#Sort by FDR
		table.sort_values(["p_fdr_bh", "study_count"], ascending=[True, False], inplace=True)

		#Convert to GOAnalysis object
		table = GOAnalysis(table)
		logger.debug("Finished go_enrichment!")

		return(table)


[docs]	def plot_bubble(self, aspect="BP", n_terms=20, threshold=0.05, title=None, save=None):
		"""
		Plot a bubble-style plot of GO-enrichment results.

		Parameters
		--------------
		aspect : str
			The aspect for which GO-terms should be shown. Must be one of ["BP", "MF", "CC"]. Default: "BP".
		n_terms : int
			Maximum number of terms to show in graph. Default: 20
		threshold : float between 0-1
			FDR threshold for significant GO-terms. Default: 0.05.	
		title : str
			Custom title for the plot. Default: 'GO-terms for <aspect>'.
		save : str, optional
			Save the plot to the file given in 'save'. Default: None.
		
		Returns
		----------
		ax

		"""
		
		check_string(aspect, ["BP", "CC", "MF"], "aspect") #aspect has to be one of {'BP', 'CC', 'MF'}
		
		#Choose aspect
		table = self #self is a subclass of pd.dataframe
		aspect_table = table[table["NS"] == aspect]
		aspect_table.loc[:,"-log(FDR)"] = -np.log(aspect_table["p_fdr_bh"])
		aspect_table.loc[:,"n_genes"] = aspect_table["study_count"]

		#Sort by pvalue and ngenes
		aspect_table = aspect_table.sort_values(["-log(FDR)", "n_genes"], ascending=False)
		aspect_table.loc[:,"-log(FDR)"] = np.round(aspect_table.loc[:,"-log(FDR)"], 1) #round to prevent long floats on figure
		aspect_table = aspect_table.iloc[:n_terms,:] #first n rows

		#Setup plot
		w, h = 6, (n_terms/20)*4.5 #size 6x4.5 for 20 terms
		fig, ax = plt.subplots(figsize=(w,h))

		#Plot enriched terms 
		ax = sns.scatterplot(x="-log(FDR)", 
									y="name",
									size="n_genes",
									#sizes=(20,500),
									#alpha=0.5,
									hue="-log(FDR)",
									data=aspect_table, 
									ax=ax
									)
		
		#Plot adjustments
		aspect_name = self.aspect_translation[aspect]
		ax.set_ylabel(aspect_name)
		ax.axvline(-np.log(threshold), color="red")
		ax.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)
		ax.grid()
		ax.set_axisbelow(True) #puts the grid behind data points

		#Set title for plot depending on input
		if title is None:
			ax.set_title("GO-terms for '{0}'".format(aspect_name)) #, pad=20, size=15)
		else:
			ax.set_title(title)

		if save is not None:
			plt.savefig(save, dpi=600, bbox_inches="tight")

		return(ax)

	
[docs]	def compare(self, compare_table):
		"""
		IN PROGRESS: Plot a comparison of two GO-term analysis

		Parameters
		-------------
		compare_table : GOAnalysis object
		"""
		pass