import pandas as pd import matplotlib.pyplot as plt from matplotlib import cm import numpy as np import scipy.stats as stats import pickle as pkl class assembler: ''' Script and data structure to assemble gdac data ''' def __init__(self,folder=None,rnaseqfile=None,annotationfile=None, featurefile=None,clinicalfile=None,cli_HRC = 'patient.bcr_patient_barcode'): self.location=folder self.rnaseqfile=folder+'/'+rnaseqfile self.annotationfile=annotationfile self.featurefile = featurefile self.clinicalfile = folder+'/'+clinicalfile self.cli_HRC = cli_HRC self.datadict = {'rna':[], 'clinical':[],'group': []} if self.rnaseqfile: self.read_RNAseqdata() if self.clinicalfile: self.read_clinicaldata() def read_RNAseqdata(self): print ('1.1 Read annotation',self.annotationfile) gene_annot = pd.read_csv(self.annotationfile,sep='\t',low_memory=False) gene_annot = gene_annot[gene_annot.Status=='live'] gene_annot = gene_annot.set_index('GeneID',drop=False) print ('1.2 Read features',self.featurefile) HRC = pd.read_csv(self.featurefile,sep='\t') HRC['NCBI gene ID'] = HRC['NCBI gene ID'].astype('str').str.replace('?','nan') HRC = HRC[~(HRC['NCBI gene ID']=='nan')] HRC['NCBI gene ID'] = HRC['NCBI gene ID'].astype('int') HRC = HRC.set_index('NCBI gene ID',drop=False) HRC = HRC[~(HRC['NCBI gene ID'].duplicated())] print ('1.3 Read rnaseqfile',self.rnaseqfile) gdac_rna = pd.read_csv(self.rnaseqfile,sep='\t',low_memory=False) gdac_rna = gdac_rna.drop(gdac_rna.index[0]) gdac_rna['GeneID'] = [_xx.split('|')[1] for _xx in gdac_rna['Hybridization REF']] gdac_rna[gdac_rna.columns[1:]] = gdac_rna[gdac_rna.columns[1:]].astype('float64') gdac_rna['GeneID'] = gdac_rna['GeneID'].astype('int') gdac_rna = gdac_rna.set_index('GeneID') gdac_rna['Symbol']='UNDEFINED' gdac_rna['Haldar_Class']='UNDEFINED' gdac_rna['Haldar_Rich_Class']='UNDEFINED' print ('1.4 Assemble the data') ####### Compare to gene annotation and assign gene symbol to Gene ID######## gdac_rna_v1 = gdac_rna.reindex(gene_annot.GeneID).dropna() for _ind,row in gdac_rna_v1.iterrows(): gdac_rna_v1.at[_ind, 'Symbol'] = gene_annot.loc[_ind,'Symbol'] try: gdac_rna_v1.at[_ind, 'Haldar_Class'] = HRC.loc[_ind,'Haldar Class'] gdac_rna_v1.at[_ind, 'Haldar_Rich_Class'] = HRC.loc[_ind,'HaldarRichClass'] except: pass _nindex1 = gdac_rna_v1[gdac_rna_v1['Haldar_Class']=='UNDEFINED'].index _nindex2 = gdac_rna_v1[gdac_rna_v1['Haldar_Rich_Class']=='UNDEFINED'].index gdac_rna_v1['Haldar_Class'].loc[_nindex1] = np.NaN gdac_rna_v1['Haldar_Rich_Class'].loc[_nindex2] = np.NaN _tempcols = gdac_rna_v1.columns.tolist() gdac_rna_v1 = gdac_rna_v1[[_tempcols[0]]+_tempcols[-3:]+_tempcols[1:-3]] gdac_rna_v1 = gdac_rna_v1[~(gdac_rna_v1['Symbol'].duplicated())] print ('1.5 classify the columns') ### classify the column samples samplegroup={'normal':[],'tumor':[],'control':[]} for _xx in gdac_rna_v1.columns[4:]: marker = int(_xx[13:15]) if marker<10: samplegroup['tumor'].append(_xx) elif marker>=10 and marker<20: samplegroup['normal'].append(_xx) elif marker>=20: samplegroup['control'].append(_xx) for _list in samplegroup: print ('\t==>',_list, len(samplegroup[_list])) self.datadict['rna'] = gdac_rna_v1 self.datadict['group'] = samplegroup def read_clinicaldata(self): print ('2. Reading clinical data') clinical = pd.read_csv(self.clinicalfile,sep='\t',low_memory=False).transpose() clinical = clinical.rename(columns=clinical.iloc[0]).drop(clinical.index[0]) clinical[self.cli_HRC] = clinical[self.cli_HRC].str.upper() clinical['normal'] ='' clinical['tumor'] ='' clinical['control'] ='' rnaseqdata = self.datadict['rna'] for _ind,_row in clinical.iterrows(): allfound = rnaseqdata.columns[rnaseqdata.columns.str.contains(_row[self.cli_HRC])] for _found in allfound: marker = int(_found[13:15]) if marker<10: clinical.at[_ind,'tumor'] = _found elif marker>=10 and marker<20: clinical.at[_ind,'normal'] = _found elif marker>=20: clinical.at[_ind,'control'] = _found self.datadict['clinical'] = clinical