##################################
#                                #
# Last modified 09/05/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def reverseComplement(sequence,DNA):
    
    reversesequence=''
    for i in range(len(sequence)):
        reversesequence=reversesequence+DNA[sequence[len(sequence)-i-1]]
    
    return reversesequence

def run():

    if len(sys.argv) < 7:
        print 'usage: python %s 1Ufasta 10Afasta minLength maxLength minOverlap maxOverlap outfilename [-writeReads overLap[,overlap2,overlap3...] outfilename]' % sys.argv[0]
        print '       The script will take the 1U reads and look for overlap with the 10A reads; it will exclude reads of the 1U 10A kind' 
        sys.exit(1)

    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n','-':'-'}
    RNA = {'A':'U','U':'A','G':'C','C':'G','N':'N','a':'u','u':'a','g':'c','c':'g','n':'n','-':'-'}

    datafilename1U = sys.argv[1]
    datafilename10A = sys.argv[2]
    minLength = int(sys.argv[3])
    maxLength = int(sys.argv[4])
    minOverlap = int(sys.argv[5])
    maxOverlap = int(sys.argv[6])
    outfilename = sys.argv[7]

    doWriteReads = False
    if '-writeReads' in sys.argv:
        doWriteReads = True
        overlapValues = sys.argv[sys.argv.index('-writeReads')+1].split(',')
        WriteReadsOverlapDict = {}
        for overlap in overlapValues:
            WriteReadsOverlapDict[int(overlap)]=0
        WriteReadOutfile = open(sys.argv[sys.argv.index('-writeReads')+2],'w')
        OW=0

    outfile = open(outfilename, 'w')

    ScoreDict={}
    StartKmerDict={}
    for i in range(minOverlap,maxOverlap+1):
        StartKmerDict[i]={}
        ScoreDict[i]=0

    Total1UReads=0
    lineslist = open(datafilename1U)
    ReadList1U=[]
    for line in lineslist:
        if line[0]=='>':
            Total1UReads+=1
            continue
        read=line.strip().replace('U','T')
        if len(read) < minLength or len(read) > maxLength:
            continue
        if read[0] == 'T' and read[9] == 'A':
            continue
        ReadList1U.append(read)

    Total10AReads=0
    lineslist = open(datafilename10A)
    ReadList10A=[]
    for line in lineslist:
        if line[0]=='>':
            Total10AReads+=1
            continue
        read=line.strip().replace('U','T')
        if len(read) < minLength or len(read) > maxLength:
            continue
        if read[0] == 'T' and read[9] == 'A':
            continue
        ReadList10A.append(read)
        for i in range(minOverlap,maxOverlap+1):
            kmer = read[0:i]
            StartKmerDict[i][kmer]=''

    for read in ReadList1U:
        largestKmer=0
        for i in range(minOverlap,maxOverlap+1):
            kmer = read[0:i]
            revkmer = reverseComplement(kmer,DNA)
            if StartKmerDict[i].has_key(revkmer):
                largestKmer=i
        if largestKmer != 0:
            ScoreDict[largestKmer]+=1
        if doWriteReads and WriteReadsOverlapDict.has_key(largestKmer):
            OW+=1
            WriteReadOutfile.write('>read'+str(OW)+'\n')
            WriteReadOutfile.write(read+'\n')

    if doWriteReads:
        WriteReadOutfile.close()


    outline = '#offset\t1U_Read_Number\t1U_Reads\tTotal_1U_Reads\t10A_Reads\tTotal_10A_Reads\tPercentage\n'
    outfile.write(outline)
    keys=ScoreDict.keys()
    keys.sort()
    for i in keys:
        score = ScoreDict[i] / (len(ReadList1U) + 0.0)
        outline = str(i) + '\t' + str(ScoreDict[i]) + '\t' + str(len(ReadList1U)) + '\t' + str(Total1UReads) + '\t' + str(len(ReadList10A)) + '\t' + str(Total10AReads) + '\t' + str(score)
        outfile.write(outline + '\n')
    outfile.close()
        
run()

