##################################
#                                #
# Last modified 09/05/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def reverseComplement(sequence,DNA):
    
    reversesequence=''
    for i in range(len(sequence)):
        reversesequence=reversesequence+DNA[sequence[len(sequence)-i-1]]
    
    return reversesequence

def run():

    if len(sys.argv) < 7:
        print 'usage: python %s datafilename <datafilename_type (fasta | bowtie)> minLength maxLenght minOverlap maxOverlap outfilename' % sys.argv[0]
        print '       If the data type is fasta, the script will look for overlap between reads' 
        print '       If the data type is bowtie, the script will look for overlap between reads but will also require that they map accordingly' 
        print '       The script will take the 1U reads and look for overlap with the other reads; it will exclude reads of the 1U 10A kind' 
        sys.exit(1)

    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n','-':'-'}
    RNA = {'A':'U','U':'A','G':'C','C':'G','N':'N','a':'u','u':'a','g':'c','c':'g','n':'n','-':'-'}

    datafilename = sys.argv[1]
    type = sys.argv[2]
    minLength = int(sys.argv[3])
    maxLength = int(sys.argv[4])
    minOverlap = int(sys.argv[5])
    maxOverlap = int(sys.argv[6])
    outfilename = sys.argv[7]

    outfile = open(outfilename, 'w')

    doU=False
    if '-1U' in sys.argv:
        doU = True

    ScoreDict={}
    StartKmerDict={}
    for i in range(minOverlap,maxOverlap+1):
        StartKmerDict[i]={}
        ScoreDict[i]=0

    TotalReads=0
    lineslist = open(datafilename)
    if type == 'fasta':
        ReadList=[]
        t=0
        for line in lineslist:
            t+=1
            if t % 2000000 == 0:
                print t/2, 'reads processed'
            if line[0]=='>':
                TotalReads+=1
                continue
            read=line.strip().replace('U','T')
            if len(read) < minLength or len(read) > maxLength:
                continue
            if read[0] == 'T' and read[9] == 'A':
                continue
            if read[0] != 'T':
                for i in range(minOverlap,maxOverlap+1):
                    kmer = read[0:i]
                    StartKmerDict[i][kmer]=''
            else:
                ReadList.append(read)
        t=0
        print 'finished importing reads'
        for read in ReadList:
            t+=1
            if t % 1000000 == 0:
                print t, 'reads processed'
            largestKmer=0
            for i in range(minOverlap,maxOverlap+1):
                kmer = read[0:i]
                revkmer = reverseComplement(kmer,DNA)
                if StartKmerDict[i].has_key(revkmer):
                    largestKmer=i
            if largestKmer != 0:
                ScoreDict[largestKmer]+=1
            if largestKmer == 10:
                print read, read[0:10], reverseComplement(read[0:10],DNA)
        print 'finished matching reads'

    outline = '#offset\t1U_Read_Number\tTotal_1U_Reads\tTotal_Reads\tPercentage\n'
    outfile.write(outline)
    keys=ScoreDict.keys()
    keys.sort()
    for i in keys:
        score = ScoreDict[i] / (len(ReadList) + 0.0)
        outline = str(i) + '\t' + str(ScoreDict[i]) + '\t' + str(len(ReadList)) + '\t' + str(TotalReads) + '\t' + str(score)
        outfile.write(outline + '\n')
    outfile.close()
        
run()

