#bamToGFF.py #script to grab reads from a bam that align to a .gff file import sys import re import pandas as pd import ROSE_utils from collections import defaultdict import os from string import join,upper,maketrans #===================================================================== #====================MAPPING BAM READS TO GFF REGIONS================= #===================================================================== def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False,matrix = None): #def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0,density = False,rpm = False,binSize = 25,clusterGram = None,matrix = None,raw = False,includeJxnReads = False): '''maps reads from a bam to a gff''' floor = int(floor) #USING BAM CLASS bam = ROSE_utils.Bam(bamFile) #new GFF to write to newGFF = [] #millionMappedReads if rpm: MMR= round(float(bam.getTotalReads('mapped'))/1000000,4) else: MMR = 1 print('using a MMR value of %s' % (MMR)) testdata = pd.read_table(gff,header=None) senseTrans = maketrans('-+.','+-+') if ROSE_utils.checkChrStatus(bamFile) == 1: hasChrFlag = 1 else: print "does not have chr" hasChrFlag = 0 if type(gff) == str: gff = ROSE_utils.parseTable(gff,'\t') #setting up a maxtrix table newGFF.append(['GENE_ID','locusLine'] + ['bin_'+str(n)+'_'+bamFile.split('/')[-1] for n in range(1,int(matrix)+1,1)]) #getting and processing reads for gff lines print('Number lines processed') for ticker, line in enumerate(gff): line = line[0:9] if ticker%100 == 0: print ticker if not hasChrFlag: line[0] = re.sub(r"chr",r"",line[0]) gffLocus = ROSE_utils.Locus(line[0],int(line[3]),int(line[4]),line[6],line[1]) searchLocus = ROSE_utils.makeSearchLocus(gffLocus,int(extension),int(extension)) reads = bam.getReadsLocus(searchLocus,'both',False,'none') #now extend the reads and make a list of extended reads extendedReads = [] for locus in reads: if locus.sense() == '+' or locus.sense() == '.': locus = ROSE_utils.Locus(locus.chr(),locus.start(),locus.end()+extension,locus.sense(), locus.ID()) if locus.sense() == '-': locus = ROSE_utils.Locus(locus.chr(),locus.start()-extension,locus.end(),locus.sense(),locus.ID()) extendedReads.append(locus) if gffLocus.sense() == '+' or gffLocus.sense == '.': senseReads = filter(lambda x:x.sense() == '+' or x.sense() == '.',extendedReads) antiReads = filter(lambda x:x.sense() == '-',extendedReads) else: senseReads = filter(lambda x:x.sense() == '-' or x.sense() == '.',extendedReads) antiReads = filter(lambda x:x.sense() == '+',extendedReads) senseHash = defaultdict(int) antiHash = defaultdict(int) #filling in the readHashes if sense == '+' or sense == 'both' or sense =='.': for read in senseReads: for x in range(read.start(),read.end()+1,1): senseHash[x]+=1 if sense == '-' or sense == 'both' or sense == '.': for read in antiReads: for x in range(read.start(),read.end()+1,1): antiHash[x]+=1 #now apply flooring and filtering for coordinates keys = ROSE_utils.uniquify(senseHash.keys()+antiHash.keys()) if floor > 0: keys = filter(lambda x: (senseHash[x]+antiHash[x]) > floor,keys) #coordinate filtering keys = filter(lambda x: gffLocus.start() < x < gffLocus.end(),keys) import numpy as np mink,maxk = min(keys),max(keys) covmatrix = np.zeros(maxk-mink+1,'d') for nnn,kkey in enumerate(keys): covmatrix[nnn] = senseHash[kkey] + antiHash[kkey] #setting up the output table clusterLine = [gffLocus.ID(),gffLocus.__str__()] #getting the binsize binSize = (gffLocus.len()-1)/int(matrix) nBins = int(matrix) if binSize == 0: clusterLine+=['NA']*int(matrix) newGFF.append(clusterLine) continue n=0 if gffLocus.sense() == '+' or gffLocus.sense() =='.' or gffLocus.sense() == 'both': i = gffLocus.start() while n