exec(open('/home/ramn/workdir/NGS-pipeline/scripts/HEADER/python-header.py').read()) exec(open('/home/ramn/workdir/NGS-pipeline/scripts/src/plotter.py').read()) class analysis: def __init__(self,assembly=None,deseq='../microarray/RA_vs_DMSO/DESeq/analysis/atleast_2fold',minfpkm=0.5, outfolder=None,survival=True,plotgenes=[]): self.assembly = assembly self.deseq = deseq self._gexpression = [] self.clinical = [] self.group = [] self.minfpkm = minfpkm self.g_normal,self.g_tumor,self.sampcols = [],[],[] self.dehuman = [] self.outfolder=outfolder self.read_assembly() self.read_deseq() self.run_analysis(survival=survival,plotgenes=plotgenes) def read_assembly(self): import pickle as pkl import pandas as pd datadict=pkl.load(open(self.assembly,'rb')) self._gexpression = datadict['rna'] self.clinical = datadict['clinical'] self.clinical = self.clinical.set_index('tumor',drop=False) self.g_normal,self.g_tumor = datadict['group']['normal'],datadict['group']['tumor'] self.sampcols = self.g_normal + self.g_tumor self._gexpression['mean_fpkm'] = self._gexpression[self.sampcols].mean(axis=1) self._gexpression['std_fpkm'] = self._gexpression[self.sampcols].std(axis=1) self._gexpression['slope']=0.0 self._gexpression['intercept']=0.0 self._gexpression['correlation']=0.0 self._gexpression['pvalue']=0.0 self._gexpression = self._gexpression[self._gexpression.mean_fpkm>self.minfpkm] def read_deseq(self): dehuman={'up':pd.read_csv(self.deseq+'/human/group_DGE_up_top1000.rnk',sep='\t',names=['gene','lfc']), 'down':pd.read_csv(self.deseq+'/human/group_DGE_down_top1000.rnk',sep='\t',names=['gene','lfc'])} self.dehuman = pd.DataFrame() self.dehuman = self.dehuman.append(dehuman['up']) self.dehuman = self.dehuman.append(dehuman['down']) self.dehuman = self.dehuman def write_datatable_json(self,dataframe,filename,cssid='datatable',elementid="tableinput",fixpos=None,script='local',title=None,visible=[]): ''' Convert a dataframe into a html file. Here the data is stored as a JSON file which is rendered in the html file using a JAVASCRIPT ''' visibleoff = ['0'] # for the index for nn,visibleflag in enumerate(visible): if not visibleflag: visibleoff.append(str(nn+1)) # set the visibility of the columns based on the boolean strings passed if not fixpos: fixpos = visibleoff[-1] jsonfile = filename+'.json' fp=open(filename+'.html','w') fp.write('' + '\n') fp.write('' + '\n') if script.lower()=='local': fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') else: fp.write(' ' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write(' ' + '\n') fp.write(' ' + '\n') fp.write(' ' + '\n') fp.write(' ' + '\n') fp.write(' ' + '\n') fp.write(' ' + '\n') fp.write(' ' + '\n') fp.write(' ' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write('' + '\n') fp.write(''+'\n') fp.write('') fp.close() dataframe.to_json(jsonfile,orient='table') def run_analysis(self,foldcuts=[2.0,3.0,4.0],survival=True,plotgenes=[]): hrcols = ["Haldar_Class",'Haldar_Rich_Class'] for foldcut in foldcuts: _fprefix = self.outfolder+'/analysis/FC_'+str(foldcut) _corrtable=_fprefix+'/correlation_table' _RAscoretable=_fprefix+'/RAscore_table' if not os.path.isdir(_fprefix): subprocess.call(['mkdir', '-pv', _fprefix]) _lfoldcut = np.log2(foldcut) _dehuman = self.dehuman[abs(self.dehuman.lfc)>=_lfoldcut] DEGEXP = (self._gexpression[self._gexpression.Symbol.isin(_dehuman.gene)])[['Symbol']+hrcols+self.sampcols].copy(deep=True) DEGEXP['DRS'] = 'up' _downindex = DEGEXP[DEGEXP.Symbol.isin(_dehuman[_dehuman.lfc<=-foldcut].gene)].index DEGEXP['DRS'].loc[_downindex] = 'down' zscore = stats.zscore(DEGEXP[self.sampcols].values,axis=1) DEGEXP[self.sampcols]=(zscore) DEGEXP = DEGEXP[['Symbol','DRS']+hrcols + self.sampcols] rowsort = plot_interactive_heatmap(DEGEXP,self.sampcols,'Symbol',self.sampcols, _fprefix+'/heatmap_sortgenes',xgap=0.1, vmin=-5,vmax=5,pwidth=4,pheight=2) rowcolsort = plot_interactive_heatmap(rowsort,self.sampcols,'Symbol',self.sampcols, _fprefix+'/heatmap_sortgenes_samples',xgap=0.1, vmin=-5,vmax=5,pwidth=4,pheight=2,sortmode='col') _reorgcols = rowcolsort.columns[1:] _datasum = DEGEXP[_reorgcols].sum(axis=0) _absdatasum = abs(DEGEXP[_reorgcols]).sum(axis=0) _RAscoredict = {_xx:_yy for _xx,_yy in zip(_reorgcols,_datasum)} _RAscoreframe = pd.DataFrame.from_dict(_RAscoredict,orient='index',columns=['RAscore']) _RAscoreframe ['sample'] = _RAscoreframe.index.tolist() _RAscoreframe ['group'] = 'normal' _RAscoreframe.group.loc[self.g_tumor] = 'tumor' _RAscoreframe['abs_RAscore'] = _absdatasum #_RAscoreframe.to_csv(_RAscoretable+'.csv') #self.write_datatable_json(_RAscoreframe,filename=_RAscoretable) ### plot distribution of normareturn GDAN vs tumor plt.style.use('seaborn-ticks') fig,ax = plt.subplots(1,1,figsize=(2.5,2.5)) sb.boxplot(x='group',y='RAscore',data=_RAscoreframe,ax=ax,dodge=True,order=['normal','tumor']) ax.set_xlabel('') plt.tight_layout() plt.savefig(_fprefix+'/RAscore_distribution.svg') #### plot RAscore for all samples colordict = {'normal':'blue','tumor':'red'} _color = [colordict[_xx] for _xx in _RAscoreframe['group'].tolist()] _xval = np.array([_xx for _xx,_ in enumerate(self.sampcols)]) trace_RA = go.Scatter(x=_RAscoreframe['sample'],y=_RAscoreframe.RAscore,hovertext=_RAscoreframe['sample'], mode='lines+markers',name='RA score',marker=dict(size=8,color=_color), line=dict(color='gray')) trace_absRA = go.Scatter(x=_RAscoreframe['sample'],y=_RAscoreframe.abs_RAscore,hovertext=_RAscoreframe['sample'], mode='lines+markers',name='|RA score|',marker=dict(size=8,color=_color), line=dict(color='black')) layout = go.Layout(autosize=True,width=1000,height=600, xaxis=dict(automargin=True), yaxis=dict(automargin=True)) fig = go.Figure(data=[trace_RA],layout=layout) save_plotly_html(fig,filename=_fprefix+'/RAscore_allsamples') ##### assign samples to a group (top 25% are RA_UP, lower 25% are RA_DOWN, and rest are RA_MID) # apply RA grouping to clinical data _RAscoreframe_T = _RAscoreframe.loc[self.g_tumor] _tclinical = self.clinical.copy(deep=True) _tclinical['tumorid']=[_xx[0:12] for _xx in _tclinical.tumor] _tclinical['RAgroup'] = 'RA_MID' lower25 = _RAscoreframe_T.RAscore.quantile(0.25) _lindex = _RAscoreframe_T[_RAscoreframe_T.RAscore<=lower25].index _lindex = _lindex[_lindex.isin(_tclinical.index)] # retain groups present in clinical data _tclinical['RAgroup'].loc[_lindex] = 'RA_LOW' upper25 = _RAscoreframe_T.RAscore.quantile(0.75) _uindex = _RAscoreframe_T[_RAscoreframe_T.RAscore>=upper25].index _uindex = _uindex[_uindex.isin(_tclinical.index)] # retain groups present in clinical data _tclinical['RAgroup'].loc[_uindex] = 'RA_HIGH' _tclinical.to_csv(_fprefix+'/clinical_RAgrouping.csv') _arr_rascore = _RAscoreframe_T.RAscore.values if survival: ### write Rscript to run survival plot biolinkscript = _fprefix+'/TCGA_biolinks.R' fp=open(biolinkscript,'w') fp.write('#!/bin/env/Rscript \n') fp.write("setwd('" + os.getcwd()+'/'+_fprefix+"')\n") fp.write('library(TCGAbiolinks)\nlibrary(dplyr)\nlibrary(DT)\n') fp.write("clinical <- read.csv('./clinical_RAgrouping.csv')\n") fp.write('clin.kirp <- GDCquery_clinic("TCGA-'+self.outfolder+'", "clinical")\n') fp.write('row.names(clin.kirp) <- clin.kirp$bcr_patient_barcode\n') fp.write('clin.kirp <- clin.kirp[c(clinical$tumorid),]\n') fp.write('clin.kirp$RAgroup <- clinical$RAgroup\n') fp.write('TCGAanalyze_survival(clin.kirp,"RAgroup",main ="'+self.outfolder+'",height = 10, width=10,color = c("blue","red","green","black"),conf.int = 0.01)\n') fp.write('save.image(file="TCGA_biolinks.RData")\n') fp.close() ### Run the Rscript p=subprocess.Popen('Rscript '+biolinkscript+' > Routput.log 2>&1',shell=True) print ("\t\t |-- PID for survival plot run : ",p.pid+1) os.waitpid(p.pid,0) print ("\t\t |-- Generated survival plot") ### correlation plot for each gene print ('Computing correlation over all genes') _expression_frame = self._gexpression.copy(deep=True) _expression_frame['GID'] = _expression_frame.index.tolist() _expression_frame['HTML']='-' _expression_frame['SVG']='-' _expression_frame = _expression_frame.reset_index() ## correlation calculations if len(plotgenes)==0: for ind,row in _expression_frame.iterrows(): _arr_expression = row[self.g_tumor].values.tolist() _slope,_intercept = np.polyfit(_arr_rascore,_arr_expression,1) _fitval = _slope*_arr_rascore+_intercept corrcoeff,pvalue = stats.pearsonr(_arr_expression,_fitval) _expression_frame.at[ind,'slope'] = _slope _expression_frame.at[ind,'intercept'] = _intercept _expression_frame.at[ind,'correlation'] = corrcoeff _expression_frame.at[ind,'pvalue'] = pvalue _expression_frame = _expression_frame.fillna('?') _hrexpression_frame = _expression_frame[~(_expression_frame[hrcols[1]] == '?')] else: _stexpression_frame=pd.read_csv(_corrtable+'.csv') _stexpression_frame.set_index('GID',drop=False) _stexpression_frame = _stexpression_frame.reindex(_expression_frame.index.tolist()) for _xxx in ['HTML','SVG','slope','intercept','correlation','pvalue']: if _xxx in ['slope','intercept','correlation','pvalue']: _expression_frame[_xxx]=_stexpression_frame[_xxx].astype('float64').tolist() else: _expression_frame[_xxx]=_stexpression_frame[_xxx] _expression_frame = _expression_frame.fillna('?') if plotgenes[0].lower()=='all': _hrexpression_frame = _expression_frame else: _hrexpression_frame = _expression_frame[_expression_frame.Symbol.isin(plotgenes)] _HRCplot=_fprefix+'/HRC_correlation_plots' if not os.path.isdir(_HRCplot): subprocess.call(['mkdir', '-pv', _HRCplot]) _xval = _RAscoreframe_T.RAscore.values _url='https://cbi-asang-009.pmacs.upenn.edu/NGS/Samir/GDAC_data/'+_HRCplot+'/' for ind,row in _hrexpression_frame.iterrows(): ### pub quality SVG plots plt.style.use('seaborn-white') fig,ax = plt.subplots(1,1,figsize=(3,3)) _yval = row[self.g_tumor].values ax.scatter(_xval,_yval,s=20,alpha=0.2) #print (ind,row.slope,row.intercept,_xval) _fitval = row.slope*_xval+row.intercept corrcoeff = round(row.correlation,3) _fitcolor='black' if row.slope > 0.0: _fitcolor='blue' if row.slope < 0.0: _fitcolor='red' ax.plot(_xval,_fitval,lw=2,zorder=10,color=_fitcolor) label = row.Symbol +'\n'+'m,c='+str(round(row.slope,3))+','+str(round(row.intercept,3))+'\n'+'r='+str(corrcoeff) ax.set_title(label,y=0.8,fontsize=10,x=0.2) plt.tight_layout() plt.savefig(_HRCplot+'/'+row.Symbol+'.svg') plt.close(fig) ## interactive html plot trace1 = go.Scatter(x=_xval,y=_yval,mode='markers',name='FPKM',hovertext=_RAscoreframe_T['sample'] ,marker=dict(size=12)) trace2 = go.Scatter(x=_xval,y=_fitval,mode='lines',line=dict(color=_fitcolor,width=2),name='linear fit',hovertext=label) layout = go.Layout(autosize=True,width=1000,height=600, xaxis=dict(automargin=True), yaxis=dict(automargin=True)) ifig = go.Figure(data=[trace1,trace2],layout=layout) save_plotly_html(ifig,filename=_HRCplot+'/'+row.Symbol,debug=False) _expression_frame.at[ind,'HTML'] = ' HTML ' _expression_frame.at[ind,'SVG'] = ' SVG ' _printcols = ['GID','Symbol']+hrcols+['slope','intercept','correlation','pvalue','HTML','SVG'] _expression_frame[_printcols].to_csv(_corrtable+'.csv') self.write_datatable_json(_expression_frame[_printcols],filename=_corrtable)