import os

import re

from string import *
import sys
import subprocess
import datetime

from collections import defaultdict

#SET OF UTILITY FUNCTIONS FOR mapEnhancerFromFactor.py

#==================================================================
#==========================I/O FUNCTIONS===========================
#==================================================================

#unParseTable 4/14/08
#takes in a table generated by parseTable and writes it to an output file
#takes as parameters (table, output, sep), where sep is how the file is delimited
#example call unParseTable(table, 'table.txt', '\t') for a tab del file

def unParseTable(table, output, sep):
    fh_out = open(output,'w')
    if len(sep) == 0:
        for i in table:
            fh_out.write(str(i))
            fh_out.write('\n')
    else:
        for line in table:
            line = [str(x) for x in line]
            line = join(line,sep)

            fh_out.write(line)
            fh_out.write('\n')

    fh_out.close()

#parseTable 4/14/08
#takes in a table where columns are separated by a given symbol and outputs
#a nested list such that list[row][col]
#example call:
#table = parseTable('file.txt','\t')
def parseTable(fn, sep, header = False,excel = False):
    fh = open(fn)
    if header == True:
        header = fh.readline() #disposes of the header

    table = []
    for line in fh:
        line = line.rstrip().split(sep)
        table.append(line)

    fh.close()

    return table


def formatBed(bed,output=''):

    '''
    formats a bed file from UCSC or MACS into a WUSTL gateway compatible bed
    '''


    newBed = []

    if type(bed) == str:
        bed = parseTable(bed,'\t')

    indexTicker = 1
    for line in bed:

        newLine = line[0:4]
        try:
            strand = line[5]
        except IndexError:
            strand = '.'
        newLine+= [indexTicker,strand]
        indexTicker +=1
        newBed.append(newLine)
    
    if len(output) > 0:
        unParseTable(newBed,output,'\t')
    else:
        return newBed
    
    
def bedToGFF(bed,output=''):

    '''
    turns a bed into a gff file
    '''
    if type(bed) == str:
        bed = parseTable(bed,'\t')
    
    bed = formatBed(bed)

    gff = []
    for line in bed:
        gffLine = [line[0],line[3],'',line[1],line[2],line[4],line[5],'',line[3]]
        gff.append(gffLine)

    if len(output) > 0:
        unParseTable(gff,output,'\t')
    else:
        return gff


def gffToBed(gff,output= ''):
    '''
    turns a gff to a bed file
    '''
    bed = []
    indexTicker = 1
    for line in gff:
        newLine = [line[0],line[3],line[4],line[1],indexTicker,line[6]]
        indexTicker+=1
        bed.append(newLine)
    if len(output) == 0:
        return bed
    else:
        unParseTable(bed,output,'\t')

def formatFolder(folderName,create=False):

    '''
    makes sure a folder exists and if not makes it
    returns a bool for folder
    '''
    
    if folderName[-1] != '/':
        folderName +='/'

    try: 
        foo = os.listdir(folderName)
        return folderName
    except OSError:
        print('folder %s does not exist' % (folderName))
        if create:
            os.system('mkdir %s' % (folderName))
            return folderName
        else:
                    
            return False 


#==================================================================
#===================ANNOTATION FUNCTIONS===========================
#==================================================================


def makeStartDict(annotFile,geneList = []):
    '''
    makes a dictionary keyed by refseq ID that contains information about 
    chrom/start/stop/strand/common name
    '''

    if type(geneList) == str:
        geneList = parseTable(geneList,'\t')
        geneList = [line[0] for line in geneList]
            
    if upper(annotFile).count('REFSEQ') == 1:
        refseqTable,refseqDict = importRefseq(annotFile)
        if len(geneList) == 0:
            geneList = refseqDict.keys()
        startDict = {}
        for gene in geneList:
            if refseqDict.has_key(gene) == False:
                continue
            startDict[gene]={}
            startDict[gene]['sense'] = refseqTable[refseqDict[gene][0]][3]
            startDict[gene]['chr'] = refseqTable[refseqDict[gene][0]][2]
            startDict[gene]['start'] = getTSSs([gene],refseqTable,refseqDict)
            if startDict[gene]['sense'] == '+':
                startDict[gene]['end'] =[int(refseqTable[refseqDict[gene][0]][5])]
            else:
                startDict[gene]['end'] = [int(refseqTable[refseqDict[gene][0]][4])]
            startDict[gene]['name'] = refseqTable[refseqDict[gene][0]][12]
    return startDict


#generic function to get the TSS of any gene
def getTSSs(geneList,refseqTable,refseqDict):
    #refseqTable,refseqDict = importRefseq(refseqFile)
    if len(geneList) == 0:
        refseq = refseqTable
    else:
        refseq = refseqFromKey(geneList,refseqDict,refseqTable)
    TSS = []
    for line in refseq:
        if line[3] == '+':
            TSS.append(line[4])
        if line[3] == '-':
            TSS.append(line[5])
    TSS = map(int,TSS)
    
    return TSS


#12/29/08
#refseqFromKey(refseqKeyList,refseqDict,refseqTable)
#function that grabs refseq lines from refseq IDs
def refseqFromKey(refseqKeyList,refseqDict,refseqTable):
    typeRefseq = []
    for name in refseqKeyList:
        if refseqDict.has_key(name):
            typeRefseq.append(refseqTable[refseqDict[name][0]])
    return typeRefseq


#10/13/08
#importRefseq
#takes in a refseq table and makes a refseq table and a refseq dictionary for keying the table

def importRefseq(refseqFile, returnMultiples = False):

    '''
    opens up a refseq file downloaded by UCSC
    '''
    refseqTable = parseTable(refseqFile,'\t')
    refseqDict = {}
    ticker = 1
    for line in refseqTable[1:]:
        if refseqDict.has_key(line[1]):
            refseqDict[line[1]].append(ticker)
        else:
            refseqDict[line[1]] = [ticker]
        ticker = ticker + 1

    multiples = []
    for i in refseqDict:
        if len(refseqDict[i]) > 1:
            multiples.append(i)

    if returnMultiples == True:
        return refseqTable,refseqDict,multiples
    else:
        return refseqTable,refseqDict


#==================================================================
#========================LOCUS INSTANCE============================
#==================================================================

#Locus and LocusCollection instances courtesy of Graham Ruby


class Locus:
    # this may save some space by reducing the number of chromosome strings
    # that are associated with Locus instances (see __init__).
    __chrDict = dict()
    __senseDict = {'+':'+', '-':'-', '.':'.'}
    # chr = chromosome name (string)
    # sense = '+' or '-' (or '.' for an ambidexterous locus)
    # start,end = ints of the start and end coords of the locus;
    #      end coord is the coord of the last nucleotide.
    def __init__(self,chr,start,end,sense,ID=''):
        coords = [int(start),int(end)]
        coords.sort()
        # this method for assigning chromosome should help avoid storage of
        # redundant strings.
        if not(self.__chrDict.has_key(chr)): self.__chrDict[chr] = chr
        self._chr = self.__chrDict[chr]
        self._sense = self.__senseDict[sense]
        self._start = int(coords[0])
        self._end = int(coords[1])
        self._ID = ID
    def ID(self): return self._ID
    def chr(self): return self._chr
    def start(self): return self._start  ## returns the smallest coordinate
    def end(self): return self._end   ## returns the biggest coordinate
    def len(self): return self._end - self._start + 1
    def getAntisenseLocus(self):
        if self._sense=='.': return self
        else:
            switch = {'+':'-', '-':'+'}
            return Locus(self._chr,self._start,self._end,switch[self._sense])
    def coords(self): return [self._start,self._end]  ## returns a sorted list of the coordinates
    def sense(self): return self._sense
    # returns boolean; True if two loci share any coordinates in common

    def overlaps(self,otherLocus):
        if self.chr()!=otherLocus.chr(): return False
        elif not(self._sense=='.' or \
                 otherLocus.sense()=='.' or \
                 self.sense()==otherLocus.sense()): return False
        elif self.start() > otherLocus.end() or otherLocus.start() > self.end(): return False
        else: return True
        
    # returns boolean; True if all the nucleotides of the given locus overlap
    #      with the self locus

    def contains(self,otherLocus):
        if self.chr()!=otherLocus.chr(): return False
        elif not(self._sense=='.' or \
                 otherLocus.sense()=='.' or \
                 self.sense()==otherLocus.sense()): return False
        elif self.start() > otherLocus.start() or otherLocus.end() > self.end(): return False
        else: return True
        
    # same as overlaps, but considers the opposite strand
    def overlapsAntisense(self,otherLocus):
        return self.getAntisenseLocus().overlaps(otherLocus)
    # same as contains, but considers the opposite strand
    def containsAntisense(self,otherLocus):
        return self.getAntisenseLocus().contains(otherLocus)
    def __hash__(self): return self._start + self._end
    def __eq__(self,other):
        if self.__class__ != other.__class__: return False
        if self.chr()!=other.chr(): return False
        if self.start()!=other.start(): return False
        if self.end()!=other.end(): return False
        if self.sense()!=other.sense(): return False
        return True
    def __ne__(self,other): return not(self.__eq__(other))
    def __str__(self): return self.chr()+'('+self.sense()+'):'+'-'.join(map(str,self.coords()))
    def checkRep(self):
        pass


class LocusCollection:
    def __init__(self,loci,windowSize):
        ### top-level keys are chr, then strand, no space
        self.__chrToCoordToLoci = dict()
        self.__loci = dict()
        self.__winSize = windowSize
        for lcs in loci: self.__addLocus(lcs)

    def __addLocus(self,lcs):
        if not(self.__loci.has_key(lcs)):
            self.__loci[lcs] = None
            if lcs.sense()=='.': chrKeyList = [lcs.chr()+'+', lcs.chr()+'-']
            else: chrKeyList = [lcs.chr()+lcs.sense()]
            for chrKey in chrKeyList:
                if not(self.__chrToCoordToLoci.has_key(chrKey)): self.__chrToCoordToLoci[chrKey] = dict()
                for n in self.__getKeyRange(lcs):
                    if not(self.__chrToCoordToLoci[chrKey].has_key(n)): self.__chrToCoordToLoci[chrKey][n] = []
                    self.__chrToCoordToLoci[chrKey][n].append(lcs)

    def __getKeyRange(self,locus):                                    # convert real loci start and end to a scaled coordinate
        start = locus.start() / self.__winSize
        end = locus.end() / self.__winSize + 1 ## add 1 because of the range
        return range(start,end)

    def __len__(self): return len(self.__loci)
        
    def append(self,new): self.__addLocus(new)
    def extend(self,newList):
        for lcs in newList: self.__addLocus(lcs)
    def hasLocus(self,locus):
        return self.__loci.has_key(locus)
    def remove(self,old):
        if not(self.__loci.has_key(old)): raise ValueError("requested locus isn't in collection")
        del self.__loci[old]
        if old.sense()=='.': senseList = ['+','-']
        else: senseList = [old.sense()]
        for k in self.__getKeyRange(old):
            for sense in senseList:
                self.__chrToCoordToLoci[old.chr()+sense][k].remove(old)

    def getWindowSize(self): return self.__winSize
    def getLoci(self): return self.__loci.keys()
    def getChrList(self):
        # i need to remove the strand info from the chromosome keys and make
        # them non-redundant.
        tempKeys = dict()
        for k in self.__chrToCoordToLoci.keys(): tempKeys[k[:-1]] = None
        return tempKeys.keys()
            
    def __subsetHelper(self,locus,sense):
        sense = sense.lower()
        if ['sense','antisense','both'].count(sense)!=1:
            raise ValueError("sense command invalid: '"+sense+"'.")
        matches = dict()
        senses = ['+','-']

        if locus.sense()=='.' or sense=='both': lamb = lambda s: True
        elif sense=='sense': lamb = lambda s: s==locus.sense()
        elif sense=='antisense': lamb = lambda s: s!=locus.sense()
        else: raise ValueError("sense value was inappropriate: '"+sense+"'.")
        for s in filter(lamb, senses):
            chrKey = locus.chr()+s
            if self.__chrToCoordToLoci.has_key(chrKey):
                temparr = self.__getKeyRange(locus) 
                #print ('min, max =',temparr[0],temparr[-1])
                for n in self.__getKeyRange(locus):
                    if self.__chrToCoordToLoci[chrKey].has_key(n):
                        for lcs in self.__chrToCoordToLoci[chrKey][n]: matches[lcs] = None
        return matches.keys()
        
    # sense can be 'sense' (default), 'antisense', or 'both'
    # returns all members of the collection that overlap the locus
    def getOverlap(self,locus,sense='sense'):
        matches = self.__subsetHelper(locus,sense)
        ### now, get rid of the ones that don't really overlap
        realMatches = dict()
        if sense=='sense' or sense=='both':
            for i in filter(lambda lcs: lcs.overlaps(locus), matches):
                realMatches[i] = None
        if sense=='antisense' or sense=='both':
            for i in filter(lambda lcs: lcs.overlapsAntisense(locus), matches):
                realMatches[i] = None
        return realMatches.keys()

    # sense can be 'sense' (default), 'antisense', or 'both'
    # returns all members of the collection that are contained by the locus
    def getContained(self,locus,sense='sense'):
        matches = self.__subsetHelper(locus,sense)
        ### now, get rid of the ones that don't really overlap
        realMatches = dict()
        if sense=='sense' or sense=='both':
            for i in filter(lambda lcs: locus.contains(lcs), matches):
                realMatches[i] = None
        if sense=='antisense' or sense=='both':
            for i in filter(lambda lcs: locus.containsAntisense(lcs), matches):
                realMatches[i] = None
        return realMatches.keys()

    # sense can be 'sense' (default), 'antisense', or 'both'
    # returns all members of the collection that contain the locus
    def getContainers(self,locus,sense='sense'):
        matches = self.__subsetHelper(locus,sense)
        ### now, get rid of the ones that don't really overlap
        realMatches = dict()
        if sense=='sense' or sense=='both':
            for i in filter(lambda lcs: lcs.contains(locus), matches):
                realMatches[i] = None
        if sense=='antisense' or sense=='both':
            for i in filter(lambda lcs: lcs.containsAntisense(locus), matches):
                realMatches[i] = None
        return realMatches.keys()

    def stitchCollection(self,stitchWindow=1,sense='both'):

        '''
        reduces the collection by stitching together overlapping loci
        returns a new collection
        '''

        #initializing stitchWindow to 1 
        #this helps collect directly adjacent loci

        locusList = self.getLoci()
        oldCollection = LocusCollection(locusList,500)            # start with a master list of peaks
        stitchedCollection = LocusCollection([],500)

        for locus in locusList:                                   # start with the first peak the master list
            
            if oldCollection.hasLocus(locus):                                       
                oldCollection.remove(locus)                     # remove the locus that is being processed
                #extend peaks locus on either sides
                extlocus = Locus(locus.chr(),locus.start()-stitchWindow,locus.end()+stitchWindow,locus.sense(),locus.ID())
                overlappingLoci = oldCollection.getOverlap(extlocus,sense) #compute it overlap with all peaks in the master list
                
                stitchTicker = 1
                while len(overlappingLoci) > 0:               # run over all peaks identified to overlap
                    stitchTicker += len(overlappingLoci)
                    overlapCoords = locus.coords() 
                    for overlappingLocus in overlappingLoci:
                        overlapCoords += overlappingLocus.coords()
                        oldCollection.remove(overlappingLocus)              #remove the overlapping peak from the master list
                    if sense == 'both':
                        locus = Locus(locus.chr(),min(overlapCoords),max(overlapCoords),'.',locus.ID())
                    else:
                        locus = Locus(locus.chr(),min(overlapCoords),max(overlapCoords),locus.sense(),locus.ID())   # merge all overlapping peaks

                    #use the merged peak as the reference and compute overlap with remaining peaks in the master list
                    overlappingLoci = oldCollection.getOverlap(Locus(locus.chr(),locus.start()-stitchWindow,locus.end()+stitchWindow,locus.sense()),sense)
                
                ''' The above loop continues until there are no further overlaps '''

                locus._ID = '%s_%s_lociStitched' % (stitchTicker,locus.ID())        # give a name to the merged peak
                stitchedCollection.append(locus)                                    # add the locus to a merged list
            else:
                continue
        return stitchedCollection


#==================================================================
#========================LOCUS FUNCTIONS===========================
#==================================================================
#06/11/09
#turns a locusCollection into a gff
#does not write to disk though
def locusCollectionToGFF(locusCollection):
    lociList = locusCollection.getLoci()
    gff = []
    for locus in lociList:
        newLine = [locus.chr(),locus.ID(),'',locus.coords()[0],locus.coords()[1],'',locus.sense(),'',locus.ID()]
        gff.append(newLine)
    return gff


def gffToLocusCollection(gff,window =500):

    '''
    opens up a gff file and turns it into a LocusCollection instance
    '''

    lociList = []
    if type(gff) == str:
        gff = parseTable(gff,'\t')


    nameList = []
    for line in gff:
        #USE line[2] as the locus ID.  If that is empty use line[8]
        if len(line) < 7:
            print('SKIPPING THIS LINE')
            print(line)
            continue


        if line[0][0] == '#':
            continue

        if len(line[1]) > 0:
            name = line[1]
        elif len(line[8]) >0:
            name = line[8]
        else:
            name = '%s:%s:%s-%s' % (line[0],line[6],line[3],line[4])
        nameList.append(name)
        lociList.append(Locus(line[0],line[3],line[4],line[6],name))

    #check that all the names are unique
    if len(nameList) != len(uniquify(nameList)):
        print('ERROR: FOR GFFS, ALL REGIONS MUST HAVE A UNIQUE IDENTIFIER IN COLUMN 2')
        sys.exit()
    return LocusCollection(lociList,window)


#maketranscriptCollection
#04/07/09
#makes a LocusCollection w/ each transcript as a locus
#bob = makeTranscriptCollection('/Users/chazlin/genomes/mm8/mm8refseq.txt')
def makeTranscriptCollection(annotFile,upSearch,downSearch,window = 500,geneList = []):
    '''
    makes a LocusCollection w/ each transcript as a locus
    takes in a refseqfile
    '''

    if upper(annotFile).count('REFSEQ') == 1:
        refseqTable,refseqDict = importRefseq(annotFile)
        locusList = []
        ticker = 0
        if len(geneList) == 0:
            geneList =refseqDict.keys()
        for line in refseqTable[1:]:
            if geneList.count(line[1]) > 0:
                if line[3] == '-':
                    locus = Locus(line[2],int(line[4])-downSearch,int(line[5])+upSearch,line[3],line[1])
                else:
                    locus = Locus(line[2],int(line[4])-upSearch,int(line[5])+downSearch,line[3],line[1])
                locusList.append(locus)
                ticker = ticker + 1
                if ticker%1000 == 0:
                    print(ticker)
            

    transCollection = LocusCollection(locusList,window)

    return transCollection


def makeTSSLocus(gene,startDict,upstream,downstream):
    '''
    given a startDict, make a locus for any gene's TSS w/ upstream and downstream windows
    '''
    
    start = startDict[gene]['start'][0]
    if startDict[gene]['sense'] =='-':
        return Locus(startDict[gene]['chr'],start-downstream,start+upstream,'-',gene)
    else:
        return Locus(startDict[gene]['chr'],start-upstream,start+downstream,'+',gene)


#06/11/09
#takes a locus and expands it by a fixed upstream/downstream amount. spits out the new larger locus
def makeSearchLocus(locus,upSearch,downSearch):
    if locus.sense() == '-':
        searchLocus = Locus(locus.chr(),locus.start()-downSearch,locus.end()+upSearch,locus.sense(),locus.ID())
    else:
        searchLocus = Locus(locus.chr(),locus.start()-upSearch,locus.end()+downSearch,locus.sense(),locus.ID())
    return searchLocus


#==================================================================
#==========================BAM CLASS===============================
#==================================================================

#11/11/10
#makes a new class Bam for dealing with bam files and integrating them into the SolexaRun class


def checkChrStatus(bamFile):
    command = 'samtools view %s | head -n 1' % (bamFile)
    #print "TESTING"
    #print command
    stats = subprocess.Popen(command,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True)
    statLines = stats.stdout.readlines()
    stats.stdout.close()
    chrPattern = re.compile('chr')
    for line in statLines:
      #print line
      sline = line.split("\t")
      #print sline[2]
      if re.search(chrPattern, sline[2]):
	return 1
      else:
	return 0
	    
def convertBitwiseFlag(flag):
   if int(flag) & 16:
	return "-";
   else:
	return "+";
           
class Bam:
    '''A class for a sorted and indexed bam file that allows easy analysis of reads'''
    def __init__(self,bamFile):
        self._bam = bamFile
        

    def getTotalReads(self,readType = 'mapped'):
        command = 'samtools flagstat %s' % (self._bam)
        stats = subprocess.Popen(command,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True)
        statLines = stats.stdout.readlines()
        stats.stdout.close()
        if readType == 'mapped':
            for line in statLines:
                if line.count('mapped (') == 1:
                    
                    return int(line.split(' ')[0])
        if readType == 'total':
            return int(statLines[0].split(' ')[0])
    
    def convertBitwiseFlag(self,flag):
      if flag & 16:
	return "-";
      else:
	return "+";

    def getRawReads(self,locus,sense,unique = False,includeJxnReads = False,printCommand = False):
        '''
        gets raw reads from the bam using samtools view.
        can enforce uniqueness and strandedness
        '''
        locusLine = locus.chr()+':'+str(locus.start())+'-'+str(locus.end())
        
        command = 'samtools view %s %s' % (self._bam,locusLine)
        if printCommand:
            print(command)
        getReads = subprocess.Popen(command,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True)
        reads = getReads.communicate()
        reads = reads[0].split('\n')[:-1]
        reads = [read.split('\t') for read in reads]
        if includeJxnReads == False:
            reads = filter(lambda x: x[5].count('N') < 1,reads)

        #convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-'}
        convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-','256':'+','272':'-','99':'+','147':'-'}
        
        
        #BJA added 256 and 272, which correspond to 0 and 16 for multi-mapped reads respectively:
        #http://onetipperday.blogspot.com/2012/04/understand-flag-code-of-sam-format.html
        #convert = string.maketrans('160','--+')
        keptReads = []
        seqDict = defaultdict(int)
        if sense == '-':
          strand = ['+','-']
          strand.remove(locus.sense())
          strand = strand[0]
        else:
            strand = locus.sense()
        for read in reads:
            #readStrand = read[1].translate(convert)[0]
	    #print read[1], read[0]
            #readStrand = convertDict[read[1]]
            readStrand = convertBitwiseFlag(read[1])

            if sense == 'both' or sense == '.' or readStrand == strand:

                if unique and seqDict[read[9]] == 0:
                    keptReads.append(read)
                elif not unique:
                    keptReads.append(read)
            seqDict[read[9]]+=1
            
        return keptReads

    def readsToLoci(self,reads,IDtag = 'sequence,seqID,none'):
        '''
        takes raw read lines from the bam and converts them into loci
        '''
        loci = []
        ID = ''
        if IDtag == 'sequence,seqID,none':
            print('please specify one of the three options: sequence, seqID, none')
            return
        #convert = string.maketrans('160','--+')
        #convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-'}
	#convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-','256':'+','272':'-'}
        
        #BJA added 256 and 272, which correspond to 0 and 16 for multi-mapped reads respectively:
        #http://onetipperday.blogspot.com/2012/04/understand-flag-code-of-sam-format.html
        #convert = string.maketrans('160','--+')
        numPattern = re.compile('\d*')
        for read in reads:
            chrom = read[2]
            #strand = read[1].translate(convert)[0]
            #strand = convertDict[read[1]]
            strand = convertBitwiseFlag(read[1])
            if IDtag == 'sequence':
                ID = read[9]
            elif IDtag == 'seqID':
                ID = read[0]
            else:
                ID = ''
                
            length = len(read[9])
            start = int(read[3])
            if read[5].count('N') == 1:
                #this awful oneliner first finds all of the numbers in the read string
                #then it filters out the '' and converts them to integers
                #only works for reads that span one junction
                
                [first,gap,second] = [int(x) for x in filter(lambda x: len(x) > 0, re.findall(numPattern,read[5]))][0:3]
                if IDtag == 'sequence':
                    loci.append(Locus(chrom,start,start+first,strand,ID[0:first]))
                    loci.append(Locus(chrom,start+first+gap,start+first+gap+second,strand,ID[first:]))
                else:
                    loci.append(Locus(chrom,start,start+first,strand,ID))
                    loci.append(Locus(chrom,start+first+gap,start+first+gap+second,strand,ID))
            elif read[5].count('N') > 1:
                continue
            else:
                loci.append(Locus(chrom,start,start+length,strand,ID))
        return loci

    def getReadsLocus(self,locus,sense = 'both',unique = True,IDtag = 'sequence,seqID,none',includeJxnReads = False):
        '''
        gets all of the reads for a given locus
        '''
        reads = self.getRawReads(locus,sense,unique,includeJxnReads)
        
        loci = self.readsToLoci(reads,IDtag)
            
        return loci

    def getReadSequences(self,locus,sense = 'both',unique = True,includeJxnReads = False):

        reads = self.getRawReads(locus,sense,unique,includeJxnReads)

        return [read[9] for read in reads]
    
    def getReadStarts(self,locus,sense = 'both',unique = False,includeJxnReads = False):
        reads = self.getRawReads(locus,sense,unique,includeJxnReads)

        return [int(read[3]) for read in reads]
        

    def getReadCount(self,locus,sense = 'both',unique = True,includeJxnReads = False):
        reads = self.getRawReads(locus,sense,unique,includeJxnReads)

        return len(reads)
                                    

#==================================================================
#========================MISC FUNCTIONS============================
#==================================================================


#uniquify function
#by Peter Bengtsson
#Used under a creative commons license
#sourced from  here: http://www.peterbe.com/plog/uniqifiers-benchmark

def uniquify(seq, idfun=None): 
    # order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker in seen: continue
        seen[marker] = 1
        result.append(item)
    return result


#082009
#taken from http://code.activestate.com/recipes/491268/

def order(x, NoneIsLast = True, decreasing = False):
    """
    Returns the ordering of the elements of x. The list
    [ x[j] for j in order(x) ] is a sorted version of x.

    Missing values in x are indicated by None. If NoneIsLast is true,
    then missing values are ordered to be at the end.
    Otherwise, they are ordered at the beginning.
    """

    omitNone = False
    if NoneIsLast == None:
        NoneIsLast = True
        omitNone = True
        
    n  = len(x)
    ix = range(n)
    
    if None not in x:
        ix.sort(reverse = decreasing, key = lambda j : x[j])
    else:
        # Handle None values properly.
        def key(i, x = x):
            elem = x[i]
            # Valid values are True or False only.
            if decreasing == NoneIsLast:
                return not(elem is None), elem
            else:
                return elem is None, elem
        ix = range(n)
        ix.sort(key=key, reverse=decreasing)
            
    if omitNone:
        n = len(x)
        for i in range(n-1, -1, -1):
            if x[ix[i]] == None:
                n -= 1
        return ix[:n]
    return ix