
"""
interpretation_classes.py
=========================
Contains core classes and functions for extracting explanations of predictions at single samples, then interrogating the important CpGs for biological plausibility.
"""

import shap, numpy as np, pandas as pd
import torch
from methylnet.datasets import RawBetaArrayDataSet, Transformer
from torch.autograd import Variable
from torch.utils.data import DataLoader
from os.path import join
import click
import os, copy
import pickle
from pymethylprocess.MethylationDataTypes import MethylationArray, MethylationArrays, extract_pheno_beta_df_from_pickle_dict


"""Normal breast 5hmC
chip seq enrichment
roadmap -> hmm chromatin state learning
Encode
Blueprint
Geo
lola r packages
https://bioconductor.org/packages/release/bioc/html/rGREAT.html
https://academic.oup.com/bioinformatics/article/26/13/1662/201195
http://great.stanford.edu/public/html/
https://www.google.com/search?ei=_7wXXNPfEKyK5wLbpI2oAw&q=great+enrichment+genomic+cpg+r&oq=great+enrichment+genomic+cpg+r&gs_l=psy-ab.3..33i299l3.529.724..899...0.0..0.84.160.2......0....1..gws-wiz.dJT2-0two-Q
https://www.rdocumentation.org/packages/missMethyl/versions/1.6.2/topics/gometh
https://bioconductor.org/packages/devel/bioc/vignettes/methylGSA/inst/doc/methylGSA-vignette.html

5hmc project

Read Lola and great papers
"""



class ShapleyData:
    """Stores SHAP results: feature importances of CpGs at varying degrees of granularity.

    Attributes
    ----------
    top_cpgs : dict
        Quickly accessible CpGs that have the n highest SHAP scores.
    shapley_values : dict
        All SHAP values for CpGs at varying degrees of granularity.
        For classification problems, this only includes SHAP scores particular to the actual class.
    """

    def __init__(self):
        """Nested layout of the stored data::

            top_cpgs:
                by_class:
                    class:
                        by_individual:
                            individual: top_cpgs
                        overall: top_cpgs
                overall: top_cpgs
            shapley_values:
                by_class:
                    class: shapley_values (individuals x CpGs)
                overall: shapley_values

        TODO: methods to access data (force_plot, summary_plot, explainer.expected_value),
        save figures, add methods for misclassified samples, return global shapley scores.
        """
        self.top_cpgs={'by_class':{},'overall':{}}
        self.shapley_values={'by_class':{},'overall':{}}
    def add_class(self, class_name, shap_df, cpgs, n_top_cpgs, add_top_negative=False, add_top_abs=False):
        """Store SHAP scores for a particular class in the ShapleyData object.
        Saves feature importances at a granular level: explanations for individual and aggregate predictions.

        Parameters
        ----------
        class_name : str
            Particular class name to be stored in the dictionary structure.
        shap_df : pd.DataFrame
            SHAP data to pull from.
        cpgs : np.ndarray
            All CpGs.
        n_top_cpgs : int
            Number of top CpGs to save.
        add_top_negative : bool
            Only consider the top negative scores.
        add_top_abs : bool
            Rank individual top CpGs by absolute SHAP value.
        """
        signed = -1 if not add_top_negative else 1
        self.shapley_values['by_class'][class_name]=shap_df
        shap_vals=shap_df.values
        class_importance_shaps = shap_vals.mean(0)
        top_idx = np.argsort(class_importance_shaps*signed)[:n_top_cpgs]
        self.top_cpgs['by_class'][class_name]={'by_individual':{},'overall':{}}
        self.top_cpgs['by_class'][class_name]['overall']=pd.DataFrame(np.hstack([cpgs[top_idx][:,np.newaxis],class_importance_shaps[top_idx][:,np.newaxis]]),columns=['cpg','shapley_value'])
        top_idxs = np.argsort(shap_vals*signed if not add_top_abs else -np.abs(shap_vals))[:,:n_top_cpgs]
        for i,individual in enumerate(list(shap_df.index)):
            self.top_cpgs['by_class'][class_name]['by_individual'][individual]=pd.DataFrame(shap_df.iloc[i,top_idxs[i,:]].T.reset_index(drop=False).values,columns=['cpg','shapley_value'])
    def add_global_importance(self, global_importance_shaps, cpgs, n_top_cpgs):
        """Add overall feature importances, aggregated globally across all samples.

        Parameters
        ----------
        global_importance_shaps : np.ndarray
            Overall SHAP values to be saved, aggregated across all samples.
        cpgs : np.ndarray
            All CpGs.
        n_top_cpgs : int
            Number of top CpGs to rank and save as well.
        """
        self.shapley_values['overall']=pd.DataFrame(global_importance_shaps[:,np.newaxis],columns=['shapley_values'],index=cpgs)
        top_ft_idx=np.argsort(global_importance_shaps*-1)[:n_top_cpgs]
        self.top_cpgs['overall']=pd.DataFrame(np.hstack([cpgs[top_ft_idx][:,np.newaxis],global_importance_shaps[top_ft_idx][:,np.newaxis]]),columns=['cpg','shapley_value'])
    def to_pickle(self, output_pkl, output_dict=False):
        """Export Shapley data to pickle.

        Parameters
        ----------
        output_pkl : str
            Output file for the SHAPley data.
        output_dict : bool
            Dump a plain dictionary of the SHAP values and top CpGs rather than the ShapleyData object itself.
        """
        os.makedirs(os.path.dirname(output_pkl) or '.', exist_ok=True) # os.path.dirname is safer than slicing on rfind('/'), which truncated the filename when no directory was given
        if output_dict:
            pickle.dump(dict(shapley_values=self.shapley_values,top_cpgs=self.top_cpgs), open(output_pkl,'wb'))
        else:
            pickle.dump(self, open(output_pkl,'wb'))
    @classmethod
    def from_pickle(cls, input_pkl, from_dict=False):
        """Load SHAPley data from pickle.

        Parameters
        ----------
        input_pkl : str
            Input pickle.
        from_dict : bool
            Whether the pickle stores a plain dictionary rather than a ShapleyData object.
        """
        if from_dict:
            d=pickle.load(open(input_pkl,'rb'))
            shapley_data = ShapleyData()
            shapley_data.shapley_values=d['shapley_values']
            shapley_data.top_cpgs=d['top_cpgs']
            return shapley_data
        else:
            return pickle.load(open(input_pkl,'rb'))
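
# A minimal round-trip sketch for persisting SHAP results; the pickle path and the
# helper name below are hypothetical, for illustration only.
def _example_shapley_data_roundtrip(shapley_data):
    shapley_data.to_pickle('./interpretations/shapley_data.p')  # hypothetical path
    return ShapleyData.from_pickle('./interpretations/shapley_data.p')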
class ShapleyDataExplorer:
    """Datatype used to explore saved ShapleyData.

    Parameters
    ----------
    shapley_data : ShapleyData
        ShapleyData instance to be explored.

    Attributes
    ----------
    indiv2class : dict
        Maps individuals to their classes, used for quick look-up.
    shapley_data
    """

    def __init__(self, shapley_data):
        self.shapley_data=shapley_data
        self.indiv2class={}
        for class_name in self.shapley_data.top_cpgs['by_class']:
            for individual in self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'].keys():
                self.indiv2class[individual]=class_name
    def return_shapley_data_by_methylation_status(self, methyl_array, threshold):
        """Return dictionary containing two SHAPley datasets, split by low/high levels of methylation.
        Todo: Define this using median methylation value vs 0.5.

        Parameters
        ----------
        methyl_array : MethylationArray
            MethylationArray instance.
        threshold : str
            'mean' to threshold at the grand mean beta value, 'original' to threshold at 0.5.

        Returns
        -------
        dict
            Contains shapley data by methylation status.
        """
        threshold = {'mean':methyl_array.beta.mean().mean(),'original':0.5}[threshold]
        methylation_shapley_data_dict = {'hyper':copy.deepcopy(self.shapley_data),'hypo':copy.deepcopy(self.shapley_data)}
        for individual in self.list_individuals(return_list=True):
            class_name,individual,top_shap_df=self.view_methylation(individual,methyl_array)
            hyper_df = top_shap_df[top_shap_df['methylation']>=threshold]
            hypo_df = top_shap_df[top_shap_df['methylation']<threshold]
            methylation_shapley_data_dict['hyper'].top_cpgs['by_class'][class_name]['by_individual'][individual]=hyper_df[['cpg','shapley_value']]
            methylation_shapley_data_dict['hypo'].top_cpgs['by_class'][class_name]['by_individual'][individual]=hypo_df[['cpg','shapley_value']]
        for class_name,individuals in self.list_individuals().items():
            top_shap_df=self.extract_class(class_name)
            top_shap_df['methylation']=methyl_array.beta.loc[individuals,self.shapley_data.top_cpgs['by_class'][class_name]['overall']['cpg'].values].mean(axis=0).values
            hyper_df = top_shap_df[top_shap_df['methylation']>=threshold]
            hypo_df = top_shap_df[top_shap_df['methylation']<threshold]
            methylation_shapley_data_dict['hyper'].top_cpgs['by_class'][class_name]['overall']=hyper_df[['cpg','shapley_value']]
            methylation_shapley_data_dict['hypo'].top_cpgs['by_class'][class_name]['overall']=hypo_df[['cpg','shapley_value']]
        return methylation_shapley_data_dict
    def make_shap_scores_abs(self):
        """Replace the per-class SHAP value DataFrames with their absolute values."""
        for class_name in self.list_classes():
            self.shapley_data.shapley_values['by_class'][class_name] = self.shapley_data.shapley_values['by_class'][class_name].abs()
    def return_binned_shapley_data(self, original_class_name, outcome_col, add_top_negative=False):
        """Convert existing SHAP data based on continuous variable predictions into categorical classes.

        Parameters
        ----------
        original_class_name : str
            Regression results were split into ClassName_pos and ClassName_neg; what is the ClassName?
        outcome_col : pd.Series
            Column from the pheno sheet on which to bin samples.
        add_top_negative : bool
            Whether to include negative SHAPs.

        Returns
        -------
        ShapleyData
            With new class labels, built from regression results.
        """
        class_names = outcome_col.unique()
        new_shapley_data = ShapleyData()
        new_shapley_data.shapley_values['overall']=self.shapley_data.shapley_values['overall']
        cpgs = new_shapley_data.shapley_values['overall'].index.values
        new_shapley_data.top_cpgs['overall']=self.shapley_data.top_cpgs['overall']
        n_top_cpgs = new_shapley_data.top_cpgs['overall'].shape[0]
        regression_classes = self.shapley_data.shapley_values['by_class'].keys()
        for regression_class in regression_classes:
            print(regression_class,original_class_name)
            if regression_class.startswith(original_class_name):
                shap_df = self.shapley_data.shapley_values['by_class'][regression_class]
                print(shap_df)
                for class_name in class_names:
                    print(class_name)
                    print(outcome_col[outcome_col==class_name])
                    idx = outcome_col[outcome_col==class_name].index.values
                    new_shapley_data.add_class(class_name,shap_df.loc[idx,:],cpgs,n_top_cpgs, add_top_negative)
        return new_shapley_data
    def add_bin_continuous_classes(self):
        """Bin continuous outcome variable and extract top positive and negative CpGs for each new class."""
        pass
    def add_abs_value_classes(self):
        """WIP."""
        pass
    def return_cpg_sets(self):
        """Return top sets of CpGs from classes and individuals, and the complementary set.
        Returns dictionary of sets of CpGs and the union of all the other sets minus the current set.

        Returns
        -------
        cpg_sets
            Dictionary of individuals and classes; CpGs contained in set for particular individual/class
        cpg_exclusion_sets
            Dictionary of individuals and classes; CpGs not contained in set for particular individual/class
        """
        from functools import reduce
        def return_cpg_set(shap_df):
            return set(shap_df['cpg'].values.tolist())
        cpg_sets={}
        for class_name,individuals in self.list_individuals().items():
            cpg_set = [set(self.extract_class(class_name)['cpg'].values.tolist())]
            for individual in individuals:
                cpg_set.append(set(self.extract_individual(individual)[1]['cpg'].values.tolist()))
            cpg_set=set(reduce(lambda x,y: x.union(y),cpg_set))
            cpg_sets[class_name] = cpg_set
        cpg_exclusion_sets={}
        for class_name in cpg_sets:
            #print("Calculating exclusion set for {}".format(class_name))
            cpg_exclusion_sets[class_name]=set(reduce(lambda x,y:x.union(y),[cpg_set for class_name_query,cpg_set in cpg_sets.items() if class_name_query != class_name]))
        return cpg_sets, cpg_exclusion_sets
    def extract_class(self, class_name, class_intersect=False, get_shap_values=False):
        """Extract the top CpGs from a class.

        Parameters
        ----------
        class_name : str
            Class to extract from.
        class_intersect : bool
            Whether to aggregate SHAP values from the individuals of the class (should have been computed already).
        get_shap_values : bool
            Return the mean SHAP values per CpG instead of the top-CpG DataFrame.

        Returns
        -------
        DataFrame
            CpGs and SHAP values.
        """
        if get_shap_values:
            return self.shapley_data.shapley_values['by_class'][class_name].mean(axis=0)
        else:
            if class_intersect:
                shap_dfs=[]
                for individual in self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'].keys():
                    shap_dfs.append(self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'][individual].set_index('cpg'))
                df=pd.concat(shap_dfs,axis=1,join='inner') # was pd.concat([shap_dfs],...), which nests the list and fails
                df['shapley_value']=df.values.sum(axis=1)
                df['cpg'] = np.array(list(df.index))
                return df[['cpg','shapley_value']]
            else:
                return self.shapley_data.top_cpgs['by_class'][class_name]['overall']
    def extract_individual(self, individual, get_shap_values=False):
        """Extract the top CpGs from an individual.

        Parameters
        ----------
        individual : str
            Individual to extract from.

        Returns
        -------
        Tuple
            Class name of individual, DataFrame of CpGs and SHAP values.
        """
        class_name=self.indiv2class[individual]
        return class_name,(self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'][individual] if not get_shap_values else self.shapley_data.shapley_values['by_class'][class_name].loc[individual])
    def regenerate_individual_shap_values(self, n_top_cpgs, abs_val=False, neg_val=False):
        """Use original SHAP scores to rebuild the nested dictionary of top CpGs ranked by SHAP score; can rank by absolute or negative SHAP scores as well.

        Parameters
        ----------
        n_top_cpgs : int
            Number of top CpGs to store per class and individual.
        abs_val : bool
            Rank by absolute SHAP value and strip _pos/_neg suffixes from class names.
        neg_val : bool
            Rank most negative SHAP scores first.

        Returns
        -------
        ShapleyData
            Copy of the stored ShapleyData with regenerated top CpGs.
        """
        shapley_data = copy.deepcopy(self.shapley_data)
        take_abs = abs_val # capture the flag; the lambdas below would otherwise shadow it and always be truthy
        mod_name = lambda name: (name.replace('_pos','').replace('_neg','') if take_abs else name)
        abs_val = lambda x: (np.abs(x) if take_abs else x)
        neg_val = (1 if neg_val else -1)
        for class_name in list(shapley_data.top_cpgs['by_class'].keys()):
            new_class_name = mod_name(class_name)
            cpgs=np.array(list(shapley_data.shapley_values['by_class'][class_name]))
            shap_df=shapley_data.shapley_values['by_class'][class_name]
            shap_vals=abs_val(shap_df.values)
            class_importance_shaps = abs_val(shap_vals).mean(0)
            top_idx = np.argsort(abs_val(class_importance_shaps)*neg_val)[:n_top_cpgs]
            shapley_data.top_cpgs['by_class'][class_name]['overall']=pd.DataFrame(np.hstack([cpgs[top_idx][:,np.newaxis],class_importance_shaps[top_idx][:,np.newaxis]]),columns=['cpg','shapley_value'])
            top_idxs = np.argsort(abs_val(shap_vals)*neg_val)[:,:n_top_cpgs]
            for i,individual in enumerate(list(shapley_data.top_cpgs['by_class'][class_name]['by_individual'].keys())): # list() so renamed keys can be deleted mid-loop
                new_indiv_name = mod_name(individual)
                shapley_data.top_cpgs['by_class'][class_name]['by_individual'][new_indiv_name]=pd.DataFrame(shap_df.iloc[i,top_idxs[i,:]].T.reset_index(drop=False).values,columns=['cpg','shapley_value'])
                if new_indiv_name != individual:
                    del shapley_data.top_cpgs['by_class'][class_name]['by_individual'][individual]
            shapley_data.top_cpgs['by_class'][new_class_name]=shapley_data.top_cpgs['by_class'][class_name]
            shapley_data.shapley_values['by_class'][new_class_name]=shapley_data.shapley_values['by_class'][class_name]
            if new_class_name != class_name:
                del shapley_data.top_cpgs['by_class'][class_name], shapley_data.shapley_values['by_class'][class_name]
        return shapley_data
    def view_methylation(self, individual, methyl_arr):
        """Use MethylationArray to output top SHAP values for individual, with methylation data appended to the output data frame.

        Parameters
        ----------
        individual : str
            Individual to query from ShapleyData.
        methyl_arr : MethylationArray
            Input MethylationArray.

        Returns
        -------
        class_name
        individual
        top_shap_df
            Top Shapley DataFrame with top CpGs for individual, methylation data appended.
        """
        class_name,top_shap_df=self.extract_individual(individual)
        #print(top_shap_df['cpg'].values,len(top_shap_df['cpg'].values))
        top_shap_df['methylation']=methyl_arr.beta.loc[individual,top_shap_df['cpg'].values].values
        return class_name,individual,top_shap_df
    def extract_methylation_array(self, methyl_arr, classes_only=True, global_vals=False, n_extract=1000, class_name=''):
        """Subset MethylationArray beta values by some top SHAP CpGs from classes or overall.

        Parameters
        ----------
        methyl_arr : MethylationArray
            MethylationArray.
        classes_only : bool
            Setting this to False will include the overall top CpGs per class and the top CpGs for individuals in that class.
        global_vals : bool
            Use top CpGs overall, across the entire dataset.
        n_extract : int
            Number of top CpGs to subset.
        class_name : str
            Which class to subset top CpGs from? Blank to use union of all top CpGs.

        Returns
        -------
        MethylationArray
            Stores methylation beta and pheno data, reduced here by queried CpGs.
        """
        from functools import reduce
        total_cpgs=[]
        if global_vals:
            all_cpgs=self.shapley_data.top_cpgs['overall']['cpg'].values
            if n_extract < len(all_cpgs):
                all_cpgs=all_cpgs[:n_extract]
        else:
            if class_name:
                class_names = [class_name]
            else:
                class_names = list(self.shapley_data.top_cpgs['by_class'].keys())
            for class_name in class_names:
                cpgs=np.array(list(self.shapley_data.top_cpgs['by_class'][class_name]['overall'].values[:,0]))
                if n_extract < len(cpgs):
                    cpgs=cpgs[:n_extract]
                if not classes_only:
                    cpgs = reduce(np.union1d,[cpgs]+[self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'][individual].values[:,0] for individual in self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'].keys()])
                total_cpgs.append(cpgs)
            all_cpgs = reduce(np.union1d,total_cpgs)
        methyl_arr.beta=methyl_arr.beta.loc[:,all_cpgs]
        return methyl_arr
    def limit_number_top_cpgs(self, n_top_cpgs):
        """Reduce the number of top CpGs.

        Parameters
        ----------
        n_top_cpgs : int
            Number of top CpGs to retain.

        Returns
        -------
        ShapleyData
            Returns shapley data with fewer top CpGs.
        """
        shapley_data = copy.deepcopy(self.shapley_data)
        if isinstance(shapley_data.top_cpgs['overall'],pd.DataFrame):
            shapley_data.top_cpgs['overall']=shapley_data.top_cpgs['overall'].iloc[:n_top_cpgs]
        for class_name in shapley_data.top_cpgs['by_class']:
            shapley_data.top_cpgs['by_class'][class_name]['overall']=shapley_data.top_cpgs['by_class'][class_name]['overall'].iloc[:n_top_cpgs]
            for individual in shapley_data.top_cpgs['by_class'][class_name]['by_individual']:
                shapley_data.top_cpgs['by_class'][class_name]['by_individual'][individual]=shapley_data.top_cpgs['by_class'][class_name]['by_individual'][individual].iloc[:n_top_cpgs]
        return shapley_data
    def list_individuals(self, return_list=False):
        """List the individuals in the ShapleyData object.

        Parameters
        ----------
        return_list : bool
            Return a flat list of individual names rather than a dictionary of class:individuals key-value pairs.

        Returns
        -------
        list or dict
            Class-to-individuals dictionary, or list of individuals.
        """
        if not return_list:
            individuals={class_name:list(self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'].keys()) for class_name in self.shapley_data.top_cpgs['by_class']}
            return individuals
        else:
            from functools import reduce
            return list(reduce(lambda x,y:x+y, [list(self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'].keys()) for class_name in self.shapley_data.top_cpgs['by_class']]))
    def list_classes(self):
        """List classes in ShapleyData object.

        Returns
        -------
        List
            List of classes
        """
        classes = list(self.shapley_data.top_cpgs['by_class'].keys())
        return classes
    def return_top_cpgs(self, classes=[], individuals=[], class_intersect=False, cpg_exclusion_sets=None, cpg_sets=None):
        """Given lists of classes and individuals, export a dictionary containing data frames of top CpGs and their SHAP scores.

        Parameters
        ----------
        classes : list
            Higher-level classes to extract top CpGs from.
        individuals : list
            Individual samples to extract top CpGs from.
        class_intersect : bool
            Whether the top CpGs should be chosen by aggregating the remaining individual scores.
        cpg_exclusion_sets : dict
            Dict of sets of CpGs; for each class, the CpGs not particular to that class.
        cpg_sets : dict
            Contains top CpGs found for each class.

        Returns
        -------
        Dict
            Top CpGs accessed and returned for further processing.
        """
        top_cpgs={}
        if classes:
            for class_name in classes:
                top_cpg_df = self.extract_class(class_name, class_intersect)
                if cpg_exclusion_sets is not None:
                    unique_cpgs = np.array(list(set(top_cpg_df['cpg'].values.tolist())-cpg_exclusion_sets[class_name]))
                    top_cpgs[class_name]= top_cpg_df[np.isin(top_cpg_df['cpg'].values,unique_cpgs)]
                else:
                    top_cpgs[class_name]= top_cpg_df
        if individuals:
            for indiv in individuals:
                class_name,top_cpg_df=self.extract_individual(indiv)
                if cpg_exclusion_sets is not None:
                    unique_cpgs = np.array(list(set(top_cpg_df['cpg'].values.tolist())-cpg_exclusion_sets[class_name]))
                    print(len(top_cpg_df['cpg'].values),len(unique_cpgs))
                    top_cpgs['{}_{}'.format(class_name,indiv)]=top_cpg_df[np.isin(top_cpg_df['cpg'].values,unique_cpgs)]
                else:
                    top_cpgs['{}_{}'.format(class_name,indiv)]=top_cpg_df
        if cpg_sets is not None:
            print(cpg_sets)
            intersected_cpgs=np.array(list(set.intersection(*list(cpg_sets.values()))))
            top_cpgs['intersection']=pd.DataFrame(intersected_cpgs[:,np.newaxis],columns=['cpg'])
            top_cpgs['intersection']['shapley_value']=-1
        return top_cpgs
    def return_global_importance_cpgs(self):
        """Return overall globally important CpGs.

        Returns
        -------
        pd.DataFrame
            Top CpGs and their global SHAP scores.
        """
        return self.shapley_data.top_cpgs['overall']
    def jaccard_similarity_top_cpgs(self, class_names, individuals=False, overall=False, cooccurrence=False):
        """Calculate the Jaccard similarity matrix between classes, and individuals within those classes, based on how they share sets of CpGs.

        Parameters
        ----------
        class_names : list
            Classes to include; pass ['all'] to include every class.
        individuals : bool
            Include individuals.
        overall : bool
            Whether to use overall class top CpGs versus aggregate.
        cooccurrence : bool
            Output co-occurrence counts instead of Jaccard scores.

        Returns
        -------
        pd.DataFrame
            Similarity matrix.
        """
        from itertools import combinations
        from functools import reduce
        x={}
        if class_names[0]=='all':
            class_names=list(self.shapley_data.top_cpgs['by_class'].keys())
        for class_name in class_names:
            if overall:
                x[class_name]=self.shapley_data.top_cpgs['by_class'][class_name]['overall']['cpg'].values.tolist()
            if individuals:
                for indiv,df in list(self.shapley_data.top_cpgs['by_class'][class_name]['by_individual'].items()):
                    x['{}_{}'.format(class_name,indiv)]=df['cpg'].values.tolist()
        indivs=list(x.keys())
        similarity_matrix=pd.DataFrame(np.eye(len(x)),index=indivs,columns=indivs)
        for i,j in combinations(indivs,r=2):
            similarity_matrix.loc[i,j] = (round(jaccard_similarity(x[i],x[j]),3) if not cooccurrence else cooccurrence_fn(x[i],x[j]))
            similarity_matrix.loc[j,i] = similarity_matrix.loc[i,j]
        if cooccurrence:
            for i in indivs:
                similarity_matrix.loc[i,i]=len(x[i])
        return similarity_matrix
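
# A minimal exploration sketch, assuming a previously saved ShapleyData pickle;
# the path and the helper name are hypothetical, for illustration only.
def _example_explore_shapley_data(shapley_pkl='./interpretations/shapley_data.p'):
    explorer = ShapleyDataExplorer(ShapleyData.from_pickle(shapley_pkl))
    print(explorer.list_classes())      # classes with stored SHAP scores
    print(explorer.list_individuals())  # class -> individuals mapping
    top_df = explorer.extract_class(explorer.list_classes()[0])
    trimmed = explorer.limit_number_top_cpgs(100)  # ShapleyData keeping 100 top CpGs
    return top_df, trimmed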
def jaccard_similarity(list1, list2):
    """Jaccard score between two lists.

    Parameters
    ----------
    list1
    list2

    Returns
    -------
    float
        Jaccard similarity between elements in list.
    """
    s1 = set(list1)
    s2 = set(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))
def cooccurrence_fn(list1, list2):
    """Co-occurrence of elements between two lists.

    Parameters
    ----------
    list1
    list2

    Returns
    -------
    int
        Number of elements shared by the two lists.
    """
    return len(set(list1).intersection(set(list2)))
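
# A quick sanity check of the two similarity helpers on toy CpG lists (the CpG
# names are made up): the lists share one of three distinct CpGs.
def _example_similarity_helpers():
    a, b = ['cg0001', 'cg0002'], ['cg0002', 'cg0003']
    assert abs(jaccard_similarity(a, b) - 1. / 3) < 1e-9
    assert cooccurrence_fn(a, b) == 1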
class PlotCircos:
    """Plot Circos diagram using ggbio (regular circos software may be better).

    Attributes
    ----------
    generate_base_plot : type
        Function to generate base plot.
    add_plot_data : type
        Function to add more plot data.
    generate_final_plot : type
        Function to plot data using Circos.
    """

    def __init__(self):
        from rpy2.robjects.packages import importr
        import rpy2.robjects as robjects
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()
        self.ggbio=importr('ggbio')
        self.data = robjects.r("""data("CRC", package = "biovizBase")""")
        self.hg19 = importr('IlluminaHumanMethylation450kanno.ilmn12.hg19')
        self.GRanges = importr('GenomicRanges')
        self.generate_base_plot = robjects.r("""function () {
            p <- ggbio()
            return(p)}""")
        self.add_plot_data = robjects.r("""function(p_data_new, p_data=c()) {
            p_data=c(p_data,p_data_new)
            return(p_data)}""")
        self.generate_final_plot = robjects.r("""function(p,grs) {
            p <- ggbio()
            for (gr in grs) {
                seqlevelsStyle(gr) <- "NCBI"
                p <- p + circle(gr, geom = "rect", color = "steelblue")
            }
            p <- p + circle(hg19sub, geom = "ideo", fill = "gray70") +
                circle(hg19sub, geom = "scale", size = 2) +
                circle(hg19sub, geom = "text", aes(label = seqnames), vjust = 0, size = 3)
            return(p)}""")
    def plot_cpgs(self, top_cpgs, output_dir='./'):
        """Plot the genomic locations of top CpGs.

        Parameters
        ----------
        top_cpgs : dict
            Input dataframes of top CpG names and SHAP scores. Future: plot SHAP scores at locations?
        output_dir : str
            Where to output plots.
        """
        import rpy2.robjects as robjects
        from rpy2.robjects import pandas2ri
        from collections import defaultdict
        pandas2ri.activate()
        self.top_cpgs = top_cpgs
        cpg_locations = pandas2ri.ri2py(robjects.r['as'](robjects.r('getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)'),'data.frame')).set_index(keys='Name',drop=False)
        print(cpg_locations)
        cpgs_grange = robjects.r('makeGRangesFromDataFrame')(pandas2ri.py2ri(cpg_locations),start_field='pos',end_field='pos',starts_in_df_are_0based=False)
        output_dfs = {}
        plots={}
        p=self.generate_base_plot()
        #ggsave = robjects.r("""function (file.name, p) {ggsave(file.name,p)}""")
        for i,k in enumerate(list(self.top_cpgs.keys())):
            list_cpgs=np.squeeze(self.top_cpgs[k].values[:,0])
            cpg_location_subset = cpg_locations.loc[list_cpgs,:]
            location_subset = robjects.r('makeGRangesFromDataFrame')(pandas2ri.py2ri(cpg_location_subset),start_field='pos',end_field='pos',starts_in_df_are_0based=False)
            locations=self.add_plot_data(location_subset,locations) if i>0 else self.add_plot_data(location_subset)
            #self.add_plot(p,location_subset)
        p=self.generate_final_plot(p,locations) # keep the composed plot so it is not discarded before saving
        self.ggbio.ggsave(join(output_dir,'{}.png'.format('_'.join([k for k in self.top_cpgs]))))
class CpGExplainer:
    """Produces SHAPley explanation scores for individual predictions, approximating a complex model with a basic linear one, one coefficient per CpG per individual.

    Parameters
    ----------
    prediction_function : type
        Model or function that makes predictions on the data; can be sklearn .predict method or pytorch forward pass, etc.
    cuda : type
        Run on GPUs?

    Attributes
    ----------
    explainer : type
        Type of SHAPley explanation method; kernel (uses LIME model-agnostic explainer), deep (uses DeepLift) or gradient (integrated gradients and backprop for explanations)?
    prediction_function
    cuda
    """
    # consider shap.kmeans or grab representative sample of each outcome in training set for background ~ 39 * 2 samples, 39 cancers,
    # should speed things up, small training set when building explainer https://github.com/slundberg/shap/issues/372

    def __init__(self, prediction_function=None, cuda=False):
        self.prediction_function=prediction_function
        self.cuda = cuda
        self.explainer=None
    def build_explainer(self, train_methyl_array, method='kernel', batch_size=100): # can interpret latent dimensions
        """Build SHAP explainer using background samples.

        Parameters
        ----------
        train_methyl_array : type
            Train Methylation Array from which to populate background samples to make estimated explanations.
        method : type
            SHAP explanation method?
        batch_size : type
            Break up prediction explanation creation into smaller batch sizes for lower memory consumption.
        """
        self.method = method
        if self.method == 'kernel':
            self.explainer=shap.KernelExplainer(self.prediction_function, train_methyl_array.return_raw_beta_array(), link="identity")
        elif self.method == 'deep':
            self.explainer=shap.DeepExplainer(self.prediction_function, to_tensor(train_methyl_array.return_raw_beta_array()) if not self.cuda else to_tensor(train_methyl_array.return_raw_beta_array()).cuda())
        elif self.method == 'gradient':
            self.explainer=shap.GradientExplainer(self.prediction_function, to_tensor(train_methyl_array.return_raw_beta_array()) if not self.cuda else to_tensor(train_methyl_array.return_raw_beta_array()).cuda(), batch_size=batch_size)
        else:
            print('Not Implemented, default to kernel explainer.')
            self.method = 'kernel'
            self.explainer=shap.KernelExplainer(self.prediction_function, train_methyl_array.return_raw_beta_array(), link="identity")
    def return_shapley_scores(self, test_methyl_array, n_samples, n_outputs, shap_sample_batch_size=None, top_outputs=None): # TODO: split up into multiple methods
        """Generate explanations for individual predictions on the supplied test data: one SHAP score per individual prediction per CpG, and more if multiclass/multi-output.
        For multiclass/multivariate outcomes, scores are generated for each outcome class. Stores the result in self.shap_values.

        Parameters
        ----------
        test_methyl_array : type
            Testing MethylationArray.
        n_samples : type
            Number of SHAP score samples to produce. More SHAP samples provides convergence to correct score.
        n_outputs : type
            Number of outcome classes.
        shap_sample_batch_size : type
            If not None, break up SHAP score sampling into batches of this size to be averaged.
        top_outputs : type
            For deep explainer, limit number of output classes due to memory consumption.
        """
        n_batch = 1
        if shap_sample_batch_size is not None and self.method != 'deep': # and not feature_selection
            n_batch = int(n_samples/shap_sample_batch_size)
            n_samples = shap_sample_batch_size
        test_arr = test_methyl_array.return_raw_beta_array()
        shap_values = np.zeros((top_outputs if top_outputs is not None else n_outputs,)+test_arr.shape)
        additional_opts = {}
        if self.method != 'kernel':
            additional_opts['ranked_outputs']=top_outputs
            test_arr=to_tensor(test_arr) if not self.cuda else to_tensor(test_arr).cuda()
        for i in range(n_batch):
            click.echo("Batch {}".format(i))
            shap_values += return_shap_values(test_arr, self.explainer, self.method, n_samples, additional_opts)
        shap_values/=float(n_batch)
        self.shap_values = shap_values
    def classifier_assign_scores_to_shap_data(self, test_methyl_array, n_top_features, interest_col='disease', prediction_classes=None):
        """Assign the SHAP scores to a ShapleyData object, populating nested dictionaries (containing class-individual information) with SHAP information and "top CpGs".

        Parameters
        ----------
        test_methyl_array : type
            Testing MethylationArray.
        n_top_features : type
            Number of top SHAP scores to use for a subdict called "top CpGs".
        interest_col : type
            Column in pheno sheet in which the class names reside.
        prediction_classes : type
            User-supplied prediction classes.
        """
        cpgs=test_methyl_array.return_cpgs()
        shapley_data = ShapleyData()
        self.cpg_global_shapley_scores = np.abs(self.shap_values).mean(0).mean(0)
        shapley_data.add_global_importance(self.cpg_global_shapley_scores,cpgs, n_top_features)
        #if cross_class: self.shap_values = np.abs(self.shap_values).mean(axis=0)
        for i in range(self.shap_values.shape[0]):
            class_name = prediction_classes[i] if prediction_classes is not None else str(i)
            shap_df = pd.DataFrame(self.shap_values[i,...],index=test_methyl_array.beta.index,columns=cpgs)
            if shap_df.shape[0]:
                if prediction_classes is not None:
                    shap_df = shap_df.loc[test_methyl_array.pheno[interest_col].values == class_name,:]
                shapley_data.add_class(class_name, shap_df, cpgs, n_top_features)
        self.shapley_data = shapley_data
    def regressor_assign_scores_to_shap_data(self, test_methyl_array, n_top_features, cell_names=[]):
        """Assigns the SHAP scores to a SHAPley data object, populating nested dictionaries (containing multioutput and single output regression-individual information) with SHAPley information and "top CpGs".

        Parameters
        ----------
        test_methyl_array : type
            Testing MethylationArray.
        n_top_features : type
            Number of top SHAP scores to use for a subdict called "top CpGs".
        cell_names : type
            If multi-output regression, create separate regression classes to access for all individuals.
        """
        n_classes = (self.shap_values.shape[0] if len(self.shap_values.shape) == 3 else 0)
        cpgs=test_methyl_array.return_cpgs()
        shapley_data = ShapleyData()
        self.cpg_global_shapley_scores = (np.abs(self.shap_values).mean(0).mean(0) if n_classes else np.abs(self.shap_values).mean(0))
        shapley_data.add_global_importance(self.cpg_global_shapley_scores,cpgs, n_top_features)
        if n_classes:
            for i in range(self.shap_values.shape[0]):
                class_name = str(i) if not cell_names else cell_names[i]
                shap_df = pd.DataFrame(self.shap_values[i,...],index=test_methyl_array.beta.index,columns=cpgs)
                if shap_df.shape[0]:
                    shapley_data.add_class(class_name, shap_df, cpgs, n_top_features, add_top_abs=True)
                    #shapley_data.add_class('{}_pos'.format(class_name), shap_df, cpgs, n_top_features)
                    #shapley_data.add_class('{}_neg'.format(class_name), shap_df, cpgs, n_top_features, add_top_negative=True)
        else: # single-output regression task
            shap_df = pd.DataFrame(self.shap_values,index=test_methyl_array.beta.index,columns=cpgs)
            shapley_data.add_class('regression' if not cell_names else '{}'.format(cell_names[0]), shap_df, cpgs, n_top_features, add_top_abs=True)
            #shapley_data.add_class('regression_pos' if not cell_names else '{}_pos'.format(cell_names[0]), shap_df, cpgs, n_top_features)
            #shapley_data.add_class('regression_neg' if not cell_names else '{}_neg'.format(cell_names[0]), shap_df, cpgs, n_top_features, add_top_negative=True)
        self.shapley_data = shapley_data
    def feature_select(self, methyl_array, n_top_features):
        """Perform feature selection based on the best overall SHAP scores across all samples.

        Parameters
        ----------
        methyl_array : type
            MethylationArray to run feature selection on.
        n_top_features : type
            Number of CpGs to select.

        Returns
        -------
        MethylationArray
            Subsetted by top overall CpGs, overall positive contributions to prediction. May need to update.
        """
        cpgs = methyl_array.return_cpgs()
        print(cpgs)
        cpgs=cpgs[np.argsort(self.cpg_global_shapley_scores*-1)[:n_top_features]]
        print(cpgs)
        return methyl_array.subset_cpgs(cpgs)
    def return_shapley_predictions(self, test_methyl_array, sample_name, interest_col, encoder=None):
        """Method in development or may be deprecated."""
        prediction_class = test_methyl_array['pheno'].loc[sample_name,interest_col]
        prediction_class_labelled = None
        if encoder is not None:
            prediction_class_labelled = encoder.transform(prediction_class)
        return "In development"
    @classmethod
    def from_explainer(cls, explainer, method, cuda):
        """Load custom SHAPley explainer."""
        cpg_explainer = cls(cuda=cuda) # classmethod was missing the cls argument
        cpg_explainer.explainer = explainer
        cpg_explainer.method = method # store the method so score generation knows how to call the explainer
        return cpg_explainer
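
# An end-to-end explanation sketch, assuming train/test MethylationArrays and a
# trained pytorch model exposing `forward_predict`; all argument values here are
# illustrative defaults, not prescriptions from the original code.
def _example_explain(model, train_methyl_array, test_methyl_array, n_classes):
    predict = main_prediction_function(n_workers=4, batch_size=64, model=model, cuda=False)
    explainer = CpGExplainer(prediction_function=predict, cuda=False)
    explainer.build_explainer(train_methyl_array, method='kernel')
    explainer.return_shapley_scores(test_methyl_array, n_samples=200, n_outputs=n_classes)
    explainer.classifier_assign_scores_to_shap_data(test_methyl_array, n_top_features=1000, interest_col='disease')
    return explainer.shapley_data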
class BioInterpreter:
    """Interrogate CpGs found to be important from SHAP through enrichment and overlap tests.

    Parameters
    ----------
    dict_top_cpgs : type
        Dictionary of top CpGs output from a ShapleyDataExplorer object.

    Attributes
    ----------
    top_cpgs : type
        Dictionary of top CpGs to be interrogated.
    """

    def __init__(self, dict_top_cpgs):
        self.top_cpgs = dict_top_cpgs
        from rpy2.robjects.packages import importr
        self.hg19 = importr('IlluminaHumanMethylation450kanno.ilmn12.hg19')
        self.GRanges = importr('GenomicRanges')
        self.missMethyl=importr('missMethyl')
        self.limma=importr('limma')
        self.lola=importr('LOLA')
        self.flow=importr('FlowSorted.Blood.EPIC')
        self.cgager=importr('cgageR')
        importr('simpleCache')
        #if self.prediction_classes == None: self.prediction_classes = list(range(len(self.top_cpgs)))
        #else: self.prediction_classes=list(map(lambda x: x.replace(' ',''),prediction_classes))
    def gometh(self, collection='GO', allcpgs=[], length_output=20, gsea_analyses=[], gsea_pickle=''):
        # consider turning into a generator; add rgreat, lola, roadmap-chromatin hmm, atac-seq, chip-seq, gometh, Hi-C, bedtools, Owen's analysis
        """Run GO, KEGG, GSEA analyses or return nearby genes.

        Parameters
        ----------
        collection : type
            GO, KEGG, GSEA, GENE overlaps?
        allcpgs : type
            Defines the CpG universe; empty to use all CpGs. A smaller group of CpGs may yield more significant results.
        length_output : type
            Maximum number of results to output.
        gsea_analyses : type
            GSEA analyses/collections to target.
        gsea_pickle : type
            Location of GSEA pickle containing gene sets; may need to run download_help_data to acquire.

        Returns
        -------
        Dict
            Dictionary containing results from each test run on all of the keys in top_cpgs.
        """
        import rpy2.robjects as robjects
        from rpy2.robjects import pandas2ri
        #robjects.packages.importr('org.Hs.eg.db')
        pandas2ri.activate()
        output_dfs={}
        if allcpgs:
            allcpgs=robjects.vectors.StrVector(allcpgs)
        else:
            allcpgs=robjects.r('NULL')
        if gsea_analyses:
            gsea_collections=pickle.load(open(gsea_pickle,'rb'))
            gene_sets=[]
            for analysis in gsea_analyses:
                gene_sets.extend(list(gsea_collections[analysis].items()))
            gene_sets=robjects.ListVector(dict(gene_sets))
        for k in self.top_cpgs:
            list_cpgs=self.top_cpgs[k].values
            print('Start Prediction {} BIO'.format(k))
            list_cpgs=robjects.vectors.StrVector(list_cpgs[:,0].tolist())
            if collection == 'GENE':
                mappedEz = self.missMethyl.getMappedEntrezIDs(sig_cpg=list_cpgs, all_cpg = allcpgs, array_type='450K')
                gometh_output = robjects.r('function (mappedEz, length.output) {data.frame(mappedEz$sig.eg[1:length.output])}')(mappedEz,length_output) # sig.eg[1:10]
                print(gometh_output)
            elif collection in ['GO','KEGG']:
                gometh_output = self.missMethyl.gometh(sig_cpg=list_cpgs,all_cpg=allcpgs,collection=collection,prior_prob=True)
                gometh_output = self.limma.topKEGG(gometh_output, number=length_output) if collection=='KEGG' else self.limma.topGO(gometh_output, number=length_output)
            else:
                gometh_output = self.missMethyl.gsameth(sig_cpg=list_cpgs,all_cpg=allcpgs,collection=gene_sets,prior_prob=True)
                gometh_output = robjects.r('as.table')(self.missMethyl.topGSA(gometh_output, number=length_output))
            # FIXME add get genes: genes = self.missMethyl.getMappedEntrezIDs(sig.cpg, all.cpg = NULL, array.type, anno = NULL)
            output_dfs['prediction_{}'.format(k)]=pandas2ri.ri2py(robjects.r['as'](gometh_output,'data.frame')) if not gsea_analyses else pandas2ri.ri2py(robjects.r('as.data.frame')(gometh_output)).pivot(index='Var1',columns='Var2',values='Freq').sort_values('P.DE')
            print(output_dfs['prediction_{}'.format(k)].head())
        return output_dfs
    def get_nearby_cpg_shapleys(self, all_cpgs, max_gap):
        """Query for CpGs that are within max_gap of the top CpGs in the genome; helps find CpGs that could be highly correlated and thus also important.

        Parameters
        ----------
        all_cpgs : type
            List of all CpGs in MethylationArray.
        max_gap : type
            Max radius to search for nearby CpGs.

        Returns
        -------
        Dict
            Results for each class/indiv's top CpGs, in the form of nearby CpGs to each of these CpGs sets.
        """
        import rpy2.robjects as robjects
        from rpy2.robjects import pandas2ri
        from collections import defaultdict
        pandas2ri.activate()
        cpg_locations = pandas2ri.ri2py(robjects.r['as'](robjects.r('getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)'),'data.frame')).set_index(keys='Name',drop=False)
        cpgs_grange = robjects.r('makeGRangesFromDataFrame')(pandas2ri.py2ri(cpg_locations),start_field='pos',end_field='pos',starts_in_df_are_0based=False)
        output_dfs = {}
        for k in self.top_cpgs:
            cpg_dict = defaultdict(float) # newly discovered neighbors default to a SHAP value of 0.; a bare defaultdict() would raise KeyError
            list_cpgs=self.top_cpgs[k].values[:,0]
            cpg_dict.update(self.top_cpgs[k].values.tolist())
            cpg_location_subset = cpg_locations.loc[list_cpgs,:]
            location_subset = robjects.r('makeGRangesFromDataFrame')(pandas2ri.py2ri(cpg_location_subset),start_field='pos',end_field='pos',starts_in_df_are_0based=False)
            cpgs_overlap = list(pandas2ri.ri2py(robjects.r['as'](self.GRanges.findOverlaps(cpgs_grange,location_subset,maxgap=max_gap, type='any'),'data.frame')).index)
            for cpg in cpgs_overlap:
                cpg_dict[cpg]
            top_cpgs=np.array(list(cpg_dict.items()))
            output_dfs['prediction_{}'.format(k)]=pd.DataFrame(top_cpgs,columns=['CpG','Shapley Value']) # pd.DataFrame has no index_col argument
        return output_dfs
    def return_overlap_score(self, set_cpgs='IDOL', platform='450k', all_cpgs=[], output_csv='output_bio_intersect.csv', extract_library=False):
        """Perform overlap test, overlapping top CpGs with IDOL CpGs, age-related CpGs, etc.

        Parameters
        ----------
        set_cpgs : type
            Set of reference CpGs.
        platform : type
            '450k' or 'epic'.
        all_cpgs : type
            All CpGs to build a universe.
        output_csv : type
            Output CSV for results of overlaps between classes, i.e. how much they share these overlapped CpGs.
        extract_library : type
            Can extract a list of the CpGs that ended up being overlapped with.

        Returns
        -------
        List
            Optional output of overlapped library of CpGs.
        """
        import rpy2.robjects as robjects
        from rpy2.robjects import pandas2ri
        from functools import reduce
        pandas2ri.activate()
        if set_cpgs == 'IDOL':
            cpg_reference_set=set(robjects.r('IDOLOptimizedCpGs') if platform=='epic' else robjects.r('IDOLOptimizedCpGs450klegacy'))
        elif set_cpgs == 'clock':
            cpg_reference_set=set(open('data/age_cpgs.txt').read().splitlines()) # clock
        elif set_cpgs == 'epitoc':
            cpg_reference_set=set(robjects.r('EpiTOCcpgs'))
        elif set_cpgs == 'horvath':
            cpg_reference_set=set(robjects.r('as.vector(HorvathLongCGlist$MR_var)'))
        elif set_cpgs == 'hannum':
            cpg_reference_set=set(robjects.r('as.vector(hannumModel$marker)'))
        else:
            cpg_reference_set=set(robjects.r('as.vector(HorvathLongCGlist$MR_var)'))
        if all_cpgs:
            cpg_reference_set = cpg_reference_set.intersection(all_cpgs)
        intersected_cpg_sets = {}
        for k in self.top_cpgs:
            query_cpgs=set(self.top_cpgs[k].values[:,0])
            intersected_cpgs=query_cpgs.intersection(cpg_reference_set)
            intersected_cpg_sets[k]=intersected_cpgs
            overlap_score = round(float(len(intersected_cpgs)) / len(cpg_reference_set) * 100,2)
            print("{} top cpgs overlap with {}% of {} cpgs".format(k,overlap_score,set_cpgs))
        keys=list(intersected_cpg_sets.keys())
        if len(keys)>1:
            from itertools import combinations
            similarity_matrix=pd.DataFrame(np.eye(len(keys)),index=keys,columns=keys)
            for i,j in combinations(keys,r=2):
                similarity_matrix.loc[i,j]=len(intersected_cpg_sets[i].intersection(intersected_cpg_sets[j]))
                similarity_matrix.loc[j,i]=similarity_matrix.loc[i,j]
            for k in keys:
                similarity_matrix.loc[k,k]=len(intersected_cpg_sets[k])
                print("{} shared cpgs: {}/{}".format(k,len(set(reduce(lambda x,y:x.union(y),[intersected_cpg_sets[k].intersection(intersected_cpgs) for k2,intersected_cpgs in intersected_cpg_sets.items() if k2 != k]))),similarity_matrix.loc[k,k]))
            similarity_matrix.loc[np.sort(keys),np.sort(keys)].to_csv(output_csv)
        if extract_library:
            return reduce(np.union1d,[list(intersected_cpg_sets[k]) for k in keys])
        return None
    def run_lola(self, all_cpgs=[], lola_db='', cores=8, collections=[], depletion=False):
        """Run LOLA enrichment test.

        Parameters
        ----------
        all_cpgs : type
            CpG universe for LOLA.
        lola_db : type
            Location of LOLA database, can be downloaded using methylnet-interpret. Set to extended or core, and collections correspond to these.
        cores : type
            Number of cores to use.
        collections : type
            LOLA collections to run, leave empty to run all.
        depletion : type
            Set true to look for depleted regions over enriched.

        Returns
        -------
        Dict
            LOLA enrichment (or depletion) results for each top-CpG set.
        """
        import rpy2.robjects as robjects
        from rpy2.robjects import pandas2ri
        order_by_max_rnk=robjects.r("function (dt) {dt[order(meanRnk, decreasing=FALSE),]}")
        pandas2ri.activate()
        cpg_locations = pandas2ri.ri2py(robjects.r['as'](robjects.r('getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)'),'data.frame')).set_index(keys='Name',drop=False)
        all_cpg_regions = robjects.r('makeGRangesFromDataFrame')(pandas2ri.py2ri(cpg_locations if not all_cpgs else cpg_locations.loc[all_cpgs,:]),start_field='pos',end_field='pos',starts_in_df_are_0based=False)
        #robjects.r('load("{}")'.format(lola_rdata))
        lolaDB = self.lola.loadRegionDB(lola_db,collections=(robjects.r('NULL') if not collections else robjects.vectors.StrVector(collections))) # was rpy2.robjects.vectors.StrVector, but only robjects is bound here
        output_dfs={}
        for k in self.top_cpgs:
            list_cpgs=self.top_cpgs[k].values[:,0]
            cpg_location_subset = cpg_locations.loc[list_cpgs,:]
            location_subset = robjects.r('makeGRangesFromDataFrame')(pandas2ri.py2ri(cpg_location_subset),start_field='pos',end_field='pos',starts_in_df_are_0based=False)
            lola_output=self.lola.runLOLA(location_subset,all_cpg_regions,lolaDB,cores=cores,direction=('depletion' if depletion else 'enrichment'))
            output_dfs['prediction_{}'.format(k)]=pandas2ri.ri2py(robjects.r['as'](order_by_max_rnk(lola_output),'data.frame'))#.iloc[:20,:]
            print(output_dfs['prediction_{}'.format(k)].head())
        return output_dfs
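
# A minimal enrichment sketch, assuming a ShapleyDataExplorer `explorer` built as
# above; the LOLA database path is hypothetical and must be downloaded separately.
def _example_bio_interpretation(explorer):
    top_cpgs = explorer.return_top_cpgs(classes=explorer.list_classes())
    interpreter = BioInterpreter(top_cpgs)
    go_results = interpreter.gometh(collection='GO', length_output=20)
    #lola_results = interpreter.run_lola(lola_db='./lola_db/core/', cores=8) # hypothetical path
    return go_results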
#https://academic.oup.com/bioinformatics/article/32/4/587/1743969
def return_shap_values(test_arr, explainer, method, n_samples, additional_opts):
    """Return SHAP values, sampled a number of times, used in CpGExplainer class.

    Parameters
    ----------
    test_arr : type
        Array of test beta values (or the corresponding tensor for deep/gradient methods).
    explainer : type
        SHAP explainer object.
    method : type
        Method of explaining.
    n_samples : type
        Number of samples to estimate SHAP scores.
    additional_opts : type
        Additional options to be passed into non-kernel/gradient methods.

    Returns
    -------
    np.array/list
        Shapley scores.
    """
    if method == 'kernel' or method == 'gradient': # ranked_outputs=ranked_outputs, add if feature_selection
        svals=(explainer.shap_values(test_arr, nsamples=n_samples, **additional_opts)[0] if (method == 'gradient' and additional_opts['ranked_outputs'] is not None) else explainer.shap_values(test_arr, nsamples=n_samples))
        return np.stack(svals,axis=0) if type(svals) == type([]) else svals
    else:
        return (explainer.shap_values(test_arr, **additional_opts)[0] if additional_opts['ranked_outputs'] is not None else explainer.shap_values(test_arr))
def to_tensor(arr):
    """Turn np.array into tensor."""
    return Transformer().generate()(arr)
def return_predict_function(model, cuda):
    """Build the supplied prediction function; important for kernel explanations. (A closure factory, not a decorator.)

    Parameters
    ----------
    model : type
        Prediction function with predict method.
    cuda : type
        Run using cuda.

    Returns
    -------
    function
        Predict function.
    """
    def predict(loader): # model can be VAE_MLP, TybaltTitusVAE, or CVAE
        model.eval()
        outputs=[]
        for input in loader:
            input=Variable(input)
            if cuda:
                input = input.cuda()
            outputs.append(np.squeeze(model.forward_predict(input).detach().cpu().numpy()))
        outputs=np.vstack(outputs)
        return outputs
    return predict
# https://github.com/slundberg/shap/blob/master/shap/common.py # Provided model function fails when applied to the provided data set.
def return_dataloader_construct(n_workers, batch_size):
    """Build a dataloader constructor compatible with KernelExplainer for SHAP.

    Parameters
    ----------
    n_workers : type
        Number of CPU workers.
    batch_size : type
        Batch size to load data into pytorch.

    Returns
    -------
    function
        Function that wraps a raw beta array in a pytorch DataLoader.
    """
    def construct_data_loader(raw_beta_array):
        raw_beta_dataset=RawBetaArrayDataSet(raw_beta_array,Transformer())
        raw_beta_dataloader=DataLoader(dataset=raw_beta_dataset,
                                       num_workers=n_workers,
                                       batch_size=batch_size,
                                       shuffle=False)
        return raw_beta_dataloader
    return construct_data_loader
def main_prediction_function(n_workers, batch_size, model, cuda):
    """Combine dataloader and prediction function into a final prediction function that takes in tensor data, for kernel explanations.

    Parameters
    ----------
    n_workers : type
        Number of workers.
    batch_size : type
        Batch size for input data.
    model : type
        Model/prediction function.
    cuda : type
        Running on GPUs?

    Returns
    -------
    function
        Final prediction function.
    """
    dataloader_constructor=return_dataloader_construct(n_workers,batch_size)
    predict_function=return_predict_function(model, cuda)
    def main_predict(raw_beta_array):
        return predict_function(dataloader_constructor(raw_beta_array))
    return main_predict
def plot_lola_output_(lola_csv, plot_output_dir, description_col, cell_types):
    """Plot any LOLA output in the form of a forest plot.

    Parameters
    ----------
    lola_csv : type
        CSV containing lola results.
    plot_output_dir : type
        Plot output directory.
    description_col : type
        Column that will label the points on the plot.
    cell_types : type
        Column containing cell-types; colors plots and are rows to be compared.
    """
    os.makedirs(plot_output_dir,exist_ok=True)
    import matplotlib
    matplotlib.use('Agg')
    import seaborn as sns
    import matplotlib.pyplot as plt
    # Earlier rpy2/ggpubr implementation, retained for reference:
    # from rpy2.robjects.packages import importr
    # import rpy2.robjects as robjects
    # from rpy2.robjects import pandas2ri
    # pandas2ri.activate()
    # ggplot=importr("ggplot2")
    # importr("ggpubr")
    # forcats=importr('forcats')
    # create_bar_chart=robjects.r("""function(df,fill.col=NULL){
    #     ggboxplot(df, x = "description", y = "oddsRatio", fill = fill.col,
    #     color = "white", # Set bar border colors to white
    #     palette = "jco", # jco journal color palette, see ?ggpar
    #     orientation = "horiz")}""")
    def create_bar_chart(df, description='description', cell_type=None):
        sns.set(style="whitegrid")
        f, ax = plt.subplots(figsize=(10,7))
        sns.despine(bottom=True, left=True)
        sns.stripplot(x="oddsRatio", y=description, hue=cell_type, data=df, dodge=True, jitter=True, alpha=.25, zorder=1)
        sns.pointplot(x="oddsRatio", y=description, hue=cell_type, data=df, dodge=.532, join=False, palette="dark", markers="d", scale=.75, ci=None)
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[3:], labels[3:], title="Cell Types", handletextpad=0, columnspacing=1, loc="lower right", ncol=3, frameon=True)
        ax.set_xlim(left=0)
        return f,ax
    lola_results=pd.read_csv(lola_csv)#[['collection','description','oddsRatio','cellType']] # filter cell types here
    collections_df=lola_results.groupby('collection')
    for name,df in collections_df:
        top_results_df=df.iloc[:min(25,df.shape[0]),:].reset_index(drop=True)
        # TODO: reset factors before plotting (remnant of the rpy2 implementation)
        top_results_df=top_results_df.fillna('NULL')
        plt.figure()
        f,ax=create_bar_chart(top_results_df,description_col,'cellType')
        plt.tight_layout()
        plt.savefig(join(plot_output_dir,lola_csv.split('/')[-1].replace('.csv','_{}.png'.format(name))))
        plt.close()
#ggplot.ggsave(join(plot_output_dir,lola_csv.split('/')[-1].replace('.csv','_{}.png'.format(name)))) # add shore island breakdown
class DistanceMatrixCompute:
    """From any embeddings, calculate pairwise distances between the classes that label the embeddings.

    Parameters
    ----------
    methyl_array : MethylationArray
        Methylation array storing beta and pheno data.
    pheno_col : str
        Column name from which to extract sample names and group them by class.

    Attributes
    ----------
    col : pd.DataFrame
        Column of pheno array.
    classes : np.array
        Unique classes of pheno column.
    embeddings : pd.DataFrame
        Embeddings of beta values.
    """

    def __init__(self, methyl_array, pheno_col):
        self.embeddings = methyl_array.beta
        self.col = methyl_array.pheno[pheno_col]
        self.classes = self.col.unique()
    def compute_distances(self, metric='cosine', trim=0.):
        """Compute distance matrix between classes as the (trimmed) average of distances between points of the classes.

        Parameters
        ----------
        metric : str
            Scikit-learn distance metric.
        trim : float
            Fraction to cut off of both tails when trim-averaging the pairwise distances.
        """
        from itertools import combinations
        from sklearn.metrics import pairwise_distances
        from scipy.stats import trim_mean
        distance_calculator = lambda x,y: trim_mean(pairwise_distances(x,y,metric=metric).flatten(),trim)
        self.distances = pd.DataFrame(0,index=self.classes,columns=self.classes)
        for i,j in combinations(self.classes,r=2):
            x1=self.embeddings.loc[self.col[self.col==i].index]
            x2=self.embeddings.loc[self.col[self.col==j].index]
            self.distances.loc[i,j]=distance_calculator(x1,x2)
            self.distances.loc[j,i]=self.distances.loc[i,j]
    def calculate_p_values(self):
        """Compute pairwise p-values between different clusters using MANOVA."""
        from statsmodels.multivariate.manova import MANOVA
        from itertools import combinations
        from sklearn.preprocessing import OneHotEncoder
        test_id = 0 # index of Wilks' lambda in the MANOVA results table
        self.p_values = pd.DataFrame(1,index=self.classes,columns=self.classes)
        for i,j in combinations(self.classes,r=2):
            if i!=j:
                cluster_labels = self.col[np.isin(self.col,np.array([i,j]))]
                embeddings = np.array(self.embeddings.loc[cluster_labels.index].values)
                cluster_labels = OneHotEncoder().fit_transform(cluster_labels.values[:,np.newaxis]).todense().astype(int)
                #print(embeddings,cluster_labels)
                test_results = MANOVA(embeddings, cluster_labels)
                #print(test_results)
                p_val = test_results.mv_test().results['x0']['stat'].values[test_id, 4]
                self.p_values.loc[i,j]=p_val
                self.p_values.loc[j,i]=self.p_values.loc[i,j]
    #cluster_labels = cluster_labels.map({v:k for k,v in enumerate(self.col.unique().tolist())})
    def return_distances(self):
        """Return the distance matrix.

        Returns
        -------
        pd.DataFrame
            Distance matrix between classes.
        """
        return self.distances
    def return_p_values(self):
        """Return the p-value matrix.

        Returns
        -------
        pd.DataFrame
            MANOVA p-values between classes.
        """
        return self.p_values
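
# A minimal sketch of the distance/p-value workflow, assuming `methyl_array` is a
# MethylationArray whose pheno sheet has a 'disease' column (column name hypothetical).
def _example_class_distances(methyl_array):
    dist = DistanceMatrixCompute(methyl_array, pheno_col='disease')
    dist.compute_distances(metric='cosine', trim=0.1)
    dist.calculate_p_values()
    return dist.return_distances(), dist.return_p_values()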