#mapEnhancerFromFactor.py ''' PROGRAM TO STITCH TOGETHER REGIONS TO FORM ENHANCERS, MAP READ DENSITY TO STITCHED REGIONS, AND RANK ENHANCERS BY READ DENSITY TO DISCOVER SUPER-ENHANCERS APRIL 11, 2013 VERSION 0.1 CONTACT: youngcomputation@wi.mit.edu ''' import sys import pandas as pd import ROSE_utils import time import os from string import upper,join from collections import defaultdict import subprocess as subprocess #================================================================== #=================CHECKING REFERENCE COLLECTION==================== #================================================================== def checkRefCollection(referenceCollection): ''' makes sure the names of all loci in the reference collection are unique ''' namesList = [locus.ID() for locus in referenceCollection.getLoci()] if len(namesList) != len(ROSE_utils.uniquify(namesList)): print("ERROR: REGIONS HAVE NON-UNIQUE IDENTIFIERS") print("THE SECOND COLUMN OF THE INPUT .GFF OR THE FOURTH COLUMN OF THE INPUT .BED MUST HAVE A UNIQUE IDENTIFIER FOR EACH REGION") sys.exit() else: print("REFERENCE COLLECTION PASSES QC") return #================================================================== #=====================REGION STITCHING============================= #================================================================== def regionStitching(inputGFF,stitchWindow,tssWindow,annotFile,removeTSS=True): print('PERFORMING REGION STITCHING') #first have to turn bound region file into a locus collection #need to make sure this names correctly... each region should have a unique name boundCollection = ROSE_utils.gffToLocusCollection(inputGFF) debugOutput = [] #filter out all bound regions that overlap the TSS of an ACTIVE GENE if removeTSS: #first make a locus collection of TSS startDict = ROSE_utils.makeStartDict(annotFile) #now makeTSS loci for active genes removeTicker=0 #this loop makes a locus centered around +/- tssWindow of transcribed genes #then adds it to the list tssLoci tssLoci = [] for geneID in startDict.keys(): tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,tssWindow,tssWindow)) #this turns the tssLoci list into a LocusCollection #50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = ROSE_utils.LocusCollection(tssLoci,50) #gives all the loci in boundCollection boundLoci = boundCollection.getLoci() #this loop will check if each bound region is contained by the TSS exclusion zone #this will drop out a lot of the promoter only regions that are tiny #typical exclusion window is around 2kb for locus in boundLoci: if len(tssCollection.getContainers(locus,'both'))>0: #if true, the bound locus overlaps an active gene boundCollection.remove(locus) debugOutput.append([locus.__str__(),locus.ID(),'CONTAINED']) removeTicker+=1 print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker)) #boundCollection is now all enriched region loci that don't overlap an active TSS stitchedCollection = boundCollection.stitchCollection(stitchWindow,'both') if removeTSS: #now replace any stitched region that overlap 2 distinct genes #with the original loci that were there fixedLoci = [] tssLoci = [] for geneID in startDict.keys(): tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,50,50)) #this turns the tssLoci list into a LocusCollection #50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = ROSE_utils.LocusCollection(tssLoci,50) removeTicker = 0 originalTicker = 0 for stitchedLocus in stitchedCollection.getLoci(): overlappingTSSLoci = tssCollection.getOverlap(stitchedLocus,'both') tssNames = [startDict[tssLocus.ID()]['name'] for tssLocus in overlappingTSSLoci] tssNames = ROSE_utils.uniquify(tssNames) if len(tssNames) > 2: #stitchedCollection.remove(stitchedLocus) originalLoci = boundCollection.getOverlap(stitchedLocus,'both') originalTicker+=len(originalLoci) fixedLoci+=originalLoci debugOutput.append([stitchedLocus.__str__(),stitchedLocus.ID(),'MULTIPLE_TSS']) removeTicker+=1 else: fixedLoci.append(stitchedLocus) print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker)) print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker)) fixedCollection = ROSE_utils.LocusCollection(fixedLoci,50) return fixedCollection,debugOutput else: return stitchedCollection,debugOutput #================================================================== #=====================REGION LINKING MAPPING======================= #================================================================== def mapCollection(stitchedCollection,referenceCollection,bamFileList,mappedFolder,output,refName): ''' makes a table of factor density in a stitched locus and ranks table by number of loci stitched together ''' print('FORMATTING TABLE') loci = stitchedCollection.getLoci() locusTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE']] #strip out any that are in chrY for locus in list(loci): if locus.chr() == 'chrY': loci.remove(locus) lociLenList = [locus.len() for locus in loci] #modified NR lenOrder = ROSE_utils.order(lociLenList,decreasing=True) for ticker, i in enumerate(lenOrder): #modified NR locus = loci[i] #First get the size of the enriched regions within the stitched locus refEnrichSize = 0 refOverlappingLoci = referenceCollection.getOverlap(locus,'both') for refLocus in refOverlappingLoci: refEnrichSize += refLocus.len() try: stitchCount = int(locus.ID().split('_')[0]) except ValueError: stitchCount = 1 locusTable.append([locus.ID(),locus.chr(),locus.start(),locus.end(),stitchCount,refEnrichSize]) print('GETTING MAPPED DATA') for bamFile in bamFileList: bamFileName = bamFile.split('/')[-1] print('GETTING MAPPING DATA FOR %s' % bamFile) #assumes standard convention for naming enriched region gffs #opening up the mapped GFF print('OPENING %s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName)) mappedGFF = ROSE_utils.parseTable('%s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName),'\t') signalDict = defaultdict(float) print('MAKING SIGNAL DICT FOR %s' % (bamFile)) mappedLoci = [] for line in mappedGFF[1:]: chrom = line[1].split('(')[0] start = int(line[1].split(':')[-1].split('-')[0]) end = int(line[1].split(':')[-1].split('-')[1]) mappedLoci.append(ROSE_utils.Locus(chrom,start,end,'.',line[0])) try: signalDict[line[0]] = float(line[2])*(abs(end-start)) except ValueError: print('WARNING NO SIGNAL FOR LINE:') print(line) continue mappedCollection = ROSE_utils.LocusCollection(mappedLoci,500) locusTable[0].append(bamFileName) for i in range(1,len(locusTable)): signal=0.0 line = locusTable[i] lineLocus = ROSE_utils.Locus(line[1],line[2],line[3],'.') overlappingRegions = mappedCollection.getOverlap(lineLocus,sense='both') for region in overlappingRegions: signal += signalDict[region.ID()] locusTable[i].append(signal) ROSE_utils.unParseTable(locusTable,output,'\t') #================================================================== #=========================MAIN METHOD============================== #================================================================== debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage = usage) #required flags parser.add_option("-i","--i", dest="input",nargs = 1, default=None, help = "Enter a .gff or .bed file of binding sites used to make enhancers") parser.add_option("-r","--rankby", dest="rankby",nargs = 1, default=None, help = "bamfile to rank enhancer by") parser.add_option("-o","--out", dest="out",nargs = 1, default=None, help = "Enter an output folder") parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None, help = "Enter the genome build (MM9,MM8,HG18,HG19)") #optional flags parser.add_option("-b","--bams", dest="bams",nargs = 1, default=None, help = "Enter a comma separated list of additional bam files to map to") parser.add_option("-c","--control", dest="control",nargs = 1, default=None, help = "bamfile to rank enhancer by") parser.add_option("-s","--stitch", dest="stitch",nargs = 1, default=12500, help = "Enter a max linking distance for stitching") parser.add_option("-t","--tss", dest="tss",nargs = 1, default=0, help = "Enter a distance from TSS to exclude. 0 = no TSS exclusion") #RETRIEVING FLAGS (options,args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() #making the out folder if it doesn't exist outFolder = ROSE_utils.formatFolder(options.out,True) #figuring out folder schema gffFolder = ROSE_utils.formatFolder(outFolder+'gff/',True) mappedFolder = ROSE_utils.formatFolder(outFolder+ 'mappedGFF/',True) #GETTING INPUT FILE if options.input.split('.')[-1] == 'bed': #CONVERTING A BED TO GFF inputGFFName = options.input.split('/')[-1][0:-4] inputGFFFile = '%s%s.gff' % (gffFolder,inputGFFName) ROSE_utils.bedToGFF(options.input,inputGFFFile) elif options.input.split('.')[-1] =='gff': #COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile,gffFolder)) else: print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT') #COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile,gffFolder)) #GETTING THE LIST OF BAMFILES TO PROCESS if options.control: bamFileList = [options.rankby,options.control] else: bamFileList = [options.rankby] if options.bams: bamFileList += options.bams.split(',') bamFileLIst = ROSE_utils.uniquify(bamFileList) #optional args #Stitch parameter stitchWindow = int(options.stitch) #tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False #GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS print('USING %s AS THE INPUT GFF' % (inputGFFFile)) inputName = inputGFFFile.split('/')[-1].split('.')[0] #GETTING THE GENOME genome = options.genome print('USING %s AS THE GENOME' % genome) #GETTING THE CORRECT ANNOT FILE cwd = os.getcwd() genomeDict = { 'HG18':'%s/annotation/hg18_refseq.ucsc' % (cwd), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd), 'HG19':'%s/annotation/hg19_refseq.ucsc' % (cwd), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd), 'MM10':'%s/annotation/mm10_refseq.ucsc' % (cwd), } annotFile = genomeDict[upper(genome)] #MAKING THE START DICT print('MAKING START DICT') startDict = ROSE_utils.makeStartDict(annotFile) #LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = ROSE_utils.gffToLocusCollection(inputGFFFile) #CHECKING INPUT REGIONS FOR FORMATTING print('CHECKING INPUT TO MAKE SURE EACH REGION HAS A UNIQUE IDENTIFIER') checkRefCollection(referenceCollection) #makes sure that all input regions have a unique ID #NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection,debugOutput = regionStitching(inputGFFFile,stitchWindow,tssWindow,annotFile,removeTSS) #NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF=ROSE_utils.locusCollectionToGFF(stitchedCollection) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder,inputName,stitchWindow/1000) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,stitchWindow/1000) debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder,inputName,stitchWindow/1000) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder,inputName,stitchWindow/1000) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName,stitchWindow/1000) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder,inputName,stitchWindow/1000) #WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) ROSE_utils.unParseTable(debugOutput,debugOutFile,'\t') #WRITE THE GFF TO DISK print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) ROSE_utils.unParseTable(stitchedGFF,stitchedGFFFile,'\t') #SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) # bin for bam mapping nBin =1 njobs=0 recompute_coverage = False for bamFile in bamFileList: bamFileName = bamFile.split('/')[-1] #MAPPING TO THE STITCHED GFF mappedOut1 ='%s%s_%s_MAPPED.gff' % (mappedFolder,stitchedGFFName,bamFileName) cmd1 = "python ROSE_bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s &" % (nBin,bamFile,stitchedGFFFile,mappedOut1) #MAPPING TO THE ORIGINAL GFF mappedOut2 ='%s%s_%s_MAPPED.gff' % (mappedFolder,inputName,bamFileName) cmd2 = "python ROSE_bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s &" % (nBin,bamFile,inputGFFFile,mappedOut2) if not os.path.isfile(mappedOut1) or recompute_coverage: subprocess.call(cmd1,shell=True) njobs += 1 if not os.path.isfile(mappedOut1) or recompute_coverage: subprocess.call(cmd2,shell=True) njobs += 1 if njobs>0: print('PAUSING TO MAP') time.sleep(10) #CHECK FOR MAPPING OUTPUT outputDone = False ticker = 0 print('WAITING FOR MAPPING TO COMPLETE. ELAPSED TIME (MIN):') while not outputDone: ''' check every 5 minutes for completed output ''' outputDone = True if ticker%6 == 0: print(ticker*5) ticker +=1 #CHANGE THIS PARAMETER TO ALLOW MORE TIME TO MAP if ticker == 144: print('ERROR: OPERATION TIME OUT. MAPPING OUTPUT NOT DETECTED') exit() break for bamFile in bamFileList: #GET THE MAPPED OUTPUT NAMES HERE FROM MAPPING OF EACH BAMFILE bamFileName = bamFile.split('/')[-1] mappedOut1 ='%s%s_%s_MAPPED.gff' % (mappedFolder,stitchedGFFName,bamFileName) try: mapFile = open(mappedOut1,'r') mapFile.close() except IOError: outputDone = False mappedOut2 ='%s%s_%s_MAPPED.gff' % (mappedFolder,inputName,bamFileName) try: mapFile = open(mappedOut2,'r') mapFile.close() except IOError: outputDone = False if outputDone == True: break time.sleep(1) print('MAPPING TOOK %s MINUTES' % (ticker*5)) print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') #CALCULATE DENSITY BY REGION mapCollection(stitchedCollection,referenceCollection,bamFileList,mappedFolder,outputFile1,refName = stitchedGFFName) print('CALLING AND PLOTTING SUPER-ENHANCERS') if options.control: rankbyName = options.rankby.split('/')[-1] controlName = options.control.split('/')[-1] cmd = 'R --no-save %s %s %s %s < ROSE_callSuper.R' % (outFolder,outputFile1,inputName,controlName) else: rankbyName = options.rankby.split('/')[-1] controlName = 'NONE' cmd = 'R --no-save %s %s %s %s < ROSE_callSuper.R' % (outFolder,outputFile1,inputName,controlName) #subprocess.call(cmd,shell=True)