#!/usr/bin/env python
"""Assemble per-cancer GDAC data and merge it into one master table.

For every sample folder the script either builds the assembled data with
``GDAS.assembler`` and caches its ``datadict`` as
``<folder>/assembled_data.pkl``, or reloads that cache when the file
already exists.  For the 'normal' and 'tumor' groups, the 'Symbol' column
plus the group's columns of the ``rna`` table are tagged with the cancer
name and the group, stacked, and written to ``GDAC_master_dataframe.json``
in the current working directory.
"""
import argparse
import glob
import os
import pickle as pkl

import argcomplete
import pandas as pd

import _gdac_assembler as GDAS
import _gdac_analysis as GDAN

# `os` and `pd` are imported explicitly above so this script does not rely
# on the exec'd header below to provide them.

# Shared pipeline code is executed in this module's namespace, so every name
# those files define becomes a global here.
# NOTE(review): exec of files at fixed absolute paths runs whatever they
# contain; turning them into importable modules would be safer.
for _script in (
    '/home/ramn/workdir/NGS-pipeline/scripts/HEADER/python-header.py',
    '/home/ramn/workdir/NGS-pipeline/scripts/src/plotter.py',
    '/home/ramn/workdir/NGS-pipeline/scripts/src/bookkeeping.py',
):
    with open(_script) as _handle:
        _code = _handle.read()
    # Compiling with the real filename keeps tracebacks pointing at the file.
    exec(compile(_code, _script, 'exec'))

parser = argparse.ArgumentParser(description="assemble GDAC data")
parser.add_argument('--sample', help="--sample default=all",
                    nargs='+', default=[])
parser.add_argument('--expression',
                    help="--expression default=RNA-seq/geneexpression_RSEM.txt",
                    default="RNA-seq/geneexpression_RSEM.txt")
parser.add_argument('--clinical',
                    help="--clinical default=CLINICAL/clinical_merged.txt",
                    default="CLINICAL/clinical_merged.txt")
parser.add_argument('--annotation',
                    help="--annotation default=./annotation.gtf",
                    default="./annotation.gtf")
parser.add_argument('--features',
                    help="--features default=./Haldar_rich_class_annotation.csv",
                    default="./Haldar_rich_class_annotation.csv")
parser.add_argument('--plotgenes', nargs='+',
                    help="--plotgenes default= all in Haldar_rich_class",
                    default=[])
# store_false: args.nosurvival is True unless the flag is passed, in which
# case it becomes False.  The value is not read anywhere in this script.
parser.add_argument('--nosurvival',
                    help="--nosurvival switches off survival plots",
                    action='store_false')
argcomplete.autocomplete(parser)
args = parser.parse_args()

# Without --sample, use every sibling directory that has a CLINICAL entry.
if not args.sample:
    args.sample = [_xx.split('/CLINICAL')[0]
                   for _xx in glob.glob('../*/CLINICAL')]

# Columns that lead the master table; 'variable' and 'value' are never
# filled by this script and therefore stay empty in the output.
_leading_columns = ['cancer', 'group', 'Symbol', 'variable', 'value']
_frames = []

for _sample in args.sample:
    print (_sample)
    outpklname = _sample + '/assembled_data.pkl'
    if not os.path.isfile(outpklname):
        print ('Processing sample:', _sample)
        _assembled = GDAS.assembler(folder=_sample,
                                    rnaseqfile=args.expression,
                                    annotationfile=args.annotation,
                                    featurefile=args.features,
                                    clinicalfile=args.clinical)
        # Work with the same object that is cached, so a fresh build and a
        # cache hit feed identical data to the merge below.
        _temp = _assembled.datadict
        with open(outpklname, 'wb') as _pklfile:
            pkl.dump(_temp, _pklfile)
        print ('Completed analysis for:', _sample, '\n\n')
    else:
        # NOTE: unpickling can execute arbitrary code; only caches written
        # by this pipeline should ever be placed at this path.
        with open(outpklname, 'rb') as _pklfile:
            _temp = pkl.load(_pklfile)

    for _group in ['normal', 'tumor']:
        # len() rather than truthiness: the container type of the group
        # members is not visible here (assumed list-like — TODO confirm).
        if len(_temp['group'][_group]) > 0:
            _temp1 = _temp['rna'][['Symbol'] + _temp['group'][_group]]
            # assign() returns a new frame, leaving the cached table intact.
            _frames.append(_temp1.assign(cancer=_sample.split('../')[-1],
                                         group=_group))

if _frames:
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    _tcga = pd.concat(_frames, sort=False)
    _tcga = _tcga.reindex(
        columns=_leading_columns
        + [_col for _col in _tcga.columns if _col not in _leading_columns])
    if not _tcga.index.is_unique:
        # to_json's default orient refuses duplicate row labels, which occur
        # as soon as more than one slice of an rna table is stacked.
        # NOTE(review): this discards the original row labels; if they carry
        # gene identifiers that are needed, keep them as a column instead.
        _tcga = _tcga.reset_index(drop=True)
else:
    _tcga = pd.DataFrame(columns=_leading_columns)

_tcga.to_json('GDAC_master_dataframe.json')