import pandas as pd import matplotlib.pyplot as plt import seaborn as sb import numpy as np from matplotlib import gridspec as gs import glob as glob import pickle as pkl import argparse,argcomplete import os,subprocess exec(open("/home/ramn/workdir/NGS-pipeline/scripts/HEADER/python-header.py",'r').read()) exec(open("/home/ramn/workdir/NGS-pipeline/scripts/src/plotter.py",'r').read()) exec(open("/home/ramn/workdir/NGS-pipeline/scripts/src/bookkeeping.py",'r').read()) def adjust_boxplot_whiskers(ax=None,width=0.5,color='black',cmap=None): for i,artist in enumerate(ax.artists): for j in range(i*6,i*6+6): line = ax.lines[j] line.set_color(color) line.set_mfc(color) line.set_mec(color) line.set_lw(width) def read_gdac_data(cancers=[],tcgafolder='../RSEM/',genes=[],filename='assembled_data.pkl',diagnosis=['tumor']): assert len(genes)>0,'No genes to plot; Exiting' if len(cancers)==0: cancers = glob.glob(tcgafolder+'/*') groups = [_cancer+'/'+filename for _cancer in cancers] cancers = [_xx.split('/')[-1] for _xx in cancers] else: groups = [tcgafolder+_cancer+'/'+filename for _cancer in cancers] _tcga = pd.DataFrame(columns=['cancer','diagnosis','Symbol','variable','value']) for _group in groups: cancername = _group.split('/'+filename)[0].split('/')[-1] try: _temp = pkl.load(open(_group,'rb')) for _diag in diagnosis: _temp1 = _temp['rna'][['Symbol']+_temp['group'][_diag]] _temp1 = _temp1[_temp1.Symbol.isin(genes)].melt(id_vars=['Symbol']) _temp1['diagnosis']=_diag _temp1['cancer'] = _group.split('/')[-2] _tcga = _tcga.append(_temp1,sort=False) except: pass return [_tcga,cancers] def plot_distribution(_tcga=None,cancer=[],genes=[],diagnosis=['tumor'],filename='./distribution_plot',log2=True): plt.style.use('seaborn-ticks') assert len(genes)>0,'Exiting: Genes not specified' fig,ax = plt.subplots(nrows=len(genes),ncols=1,figsize=(8,len(genes)*2),squeeze=False,sharex=False) _cmap = {'normal':'green','tumor':'red'} if log2: _tcga['value'] = np.log2(_tcga['value']+1) for _mm,_gene in enumerate(genes): _tcgafilt = _tcga[(_tcga.Symbol==_gene)] sb.boxplot(x='cancer',y='value',hue='diagnosis',data=_tcgafilt,ax=ax[_mm][0],dodge=True,order=cancer, boxprops=dict(alpha=1.,lw=0.1),fliersize=0.6,palette=_cmap,notch=True,hue_order=diagnosis,width=0.75) ax[_mm][0].set_xlabel('') if log2: ax[_mm][0].set_ylabel('log2(RSEM+1)') else: ax[_mm][0].set_ylabel('RSEM') ax[_mm][0].legend().remove() ax[_mm][0].set_title(_gene+' levels in TCGA studies (n='+str(len(cancer))+')') sb.despine(ax=ax[_mm][0]) for _ax in ax[_mm][0].get_xticklabels():_ax.set_rotation(90) adjust_boxplot_whiskers(ax=ax[_mm][0]) ax[_mm][0].legend(loc=(0.8,1.0),markerscale=0.5,handlelength=0.5,ncol=2) plt.tight_layout() plt.savefig(filename+'.svg') def plot_distribution_gg(_tcga=None,cancer=[],genes=[],diagnosis=['tumor'],filename='./distribution_plot',log2=True,palette='hls'): plt.style.use('seaborn-ticks') assert len(genes)>0,'Exiting: Genes not specified' fig,ax = plt.subplots(nrows=len(diagnosis),ncols=1,figsize=(8,len(diagnosis)*3),squeeze=False,sharex=False) if log2: _tcga['value'] = np.log2(_tcga['value']+1) for _mm,_diag in enumerate(diagnosis): _tcgafilt = _tcga[(_tcga.diagnosis==_diag)] sb.boxplot(x='cancer',y='value',hue='Symbol',data=_tcgafilt,ax=ax[_mm][0],dodge=True,order=cancer, boxprops=dict(alpha=1.,lw=0.1),fliersize=0.4,palette=palette,notch=True,hue_order=genes,width=0.75) ax[_mm][0].set_xlabel('') if log2: ax[_mm][0].set_ylabel('log2(RSEM+1)') else: ax[_mm][0].set_ylabel('RSEM') ax[_mm][0].legend().remove() ax[_mm][0].set_title('expression levels in TCGA studies ('+_diag+')') sb.despine(ax=ax[_mm][0]) for _ax in ax[_mm][0].get_xticklabels():_ax.set_rotation(90) adjust_boxplot_whiskers(ax=ax[_mm][0]) ax[_mm][0].legend(loc=(0.8,1.0),markerscale=0.5,handlelength=0.5,ncol=2) plt.tight_layout() plt.savefig(filename+'.svg') def plot_distribution_plotly(_tcga=None,cancer=[],genes=[],diagnosis=['tumor'],filename='./distribution_plot',log2=True): from bokeh.palettes import Category20_7 as colors assert len(genes)>0,'Exiting: Genes not specified' _cmap = {'normal':colors[4],'tumor':colors[6]} if log2: _tcga['value'] = np.log10(_tcga['value']+1) plyfig = splots.make_subplots(rows=len(genes),cols=1,shared_yaxes=False,shared_xaxes=False, subplot_titles=tuple([_gene+' expression (RSEM)' for _gene in genes])) for _mm,_gene in enumerate(genes): data = [] for _diag in diagnosis: _tcgafilt = _tcga[(_tcga.Symbol==_gene) & (_tcga.diagnosis==_diag)] plyfig.append_trace(go.Box(x=_tcgafilt['cancer'].tolist(),y=_tcgafilt['value'].tolist(),whiskerwidth=1, name=_diag,marker=dict(color=_cmap[_diag],size=2)),_mm+1,1) plyfig['layout'].update(width=min(1200,400+len(cancer)*40),height=len(genes)*600,xaxis=dict(tickfont=dict(size=18)), yaxis=dict(tickfont=dict(size=18)),boxmode='group') for _nn in range(2,len(genes)+1): plyfig['layout']['xaxis'+str(_nn)].update(dict(tickfont=dict(size=18)), automargin=True) plyfig['layout']['yaxis'+str(_nn)].update(dict(tickfont=dict(size=18)), automargin=True) save_plotly_html(plyfig,filename) def plot_distribution_plotly_gg(_tcga=None,cancer=[],genes=[],diagnosis=['tumor'],filename='./distribution_plot',log2=True): from bokeh.palettes import Category20_7 as colors assert len(genes)>0,'Exiting: Genes not specified' if log2: _tcga['value'] = np.log10(_tcga['value']+1) plyfig = splots.make_subplots(rows=len(diagnosis),cols=1,shared_yaxes=False,shared_xaxes=False, subplot_titles=tuple(['expression in '+_diag for _diag in diagnosis])) _cmap = {_gene:colors[_nn%20] for _nn,_gene in enumerate(genes)} for _mm,_diag in enumerate(diagnosis): data = [] for _gene in genes: _tcgafilt = _tcga[(_tcga.Symbol==_gene) & (_tcga.diagnosis==_diag)] plyfig.append_trace(go.Box(x=_tcgafilt['cancer'].tolist(),y=_tcgafilt['value'].tolist(),whiskerwidth=1, name=_gene,marker=dict(color=_cmap[_gene],size=2)),_mm+1,1) plyfig['layout'].update(width=min(1200,400+len(cancer)*40),height=len(diagnosis)*600,xaxis=dict(tickfont=dict(size=18)), yaxis=dict(tickfont=dict(size=18)),boxmode='group') for _nn in range(2,len(diagnosis)+1): plyfig['layout']['xaxis'+str(_nn)].update(dict(tickfont=dict(size=18)), automargin=True) plyfig['layout']['yaxis'+str(_nn)].update(dict(tickfont=dict(size=18)), automargin=True) save_plotly_html(plyfig,filename) def main(): parser = argparse.ArgumentParser(description="Plot heatmaps of ChIP coverage") parser.add_argument('-g','--genes',nargs='+',help='-g/--gene ',default=[]) parser.add_argument('-c','--cancer',nargs='+',help='-c/--cancer ; default=all',default=[]) parser.add_argument('-d','--diagnosis',nargs='+',help='-d/--diagnosis tumor/normal; default=tumor',default=['tumor']) parser.add_argument('-o','--out',dest='out',help='-o/--out ',default='./TCGA_expression') parser.add_argument('-m','--mode',dest='mode',help='-m/--mode group_diagnosis/group_genes',default='group_diagnosis') parser.add_argument('--mpl_palette',help='--mpl_palette ...',default='hls') argcomplete.autocomplete(parser) args=parser.parse_args() tcga,args.cancer=read_gdac_data(cancers=args.cancer,genes=args.genes,diagnosis=args.diagnosis) outdir = os.path.dirname(os.path.abspath(args.out)) if not os.path.isdir(outdir): os.system('mkdir -pv '+outdir) if args.mode=='group_diagnosis': plot_distribution(_tcga=tcga,genes=args.genes,diagnosis=args.diagnosis,filename=args.out,cancer=args.cancer) plot_distribution_plotly(_tcga=tcga,genes=args.genes,diagnosis=args.diagnosis,filename=args.out,cancer=args.cancer) else: plot_distribution_gg(_tcga=tcga,genes=args.genes,diagnosis=args.diagnosis,filename=args.out,cancer=args.cancer,palette=args.mpl_palette) plot_distribution_plotly_gg(_tcga=tcga,genes=args.genes,diagnosis=args.diagnosis,filename=args.out,cancer=args.cancer) tcga.to_csv(args.out+'.csv',index=False) return tcga if __name__=='__main__': tcga=main()