##################################
#                                #
# Last modified 04/25/2014       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import time
import math
import random
from sets import Set

def reverseComplement(sequence,DNA):
    
    reversesequence=''
    for i in range(len(sequence)):
        reversesequence=reversesequence+DNA[sequence[len(sequence)-i-1]]
    
    return reversesequence

def run():

    if len(sys.argv) < 8:
        print 'usage: python %s fasta1 fasta2 number_reads_to_sample iterations minLength maxLength Overlap outfilename [-collapseDups]' % sys.argv[0]
        print '\tNote: This script will not enforce the 1U rule' 
        sys.exit(1)

    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n','-':'-'}
    RNA = {'A':'U','U':'A','G':'C','C':'G','N':'N','a':'u','u':'a','g':'c','c':'g','n':'n','-':'-'}


    doCollapseDups = False
    if '-collapseDups' in sys.argv:
        doCollapseDups = True

    fasta1 = sys.argv[1]
    fasta2 = sys.argv[2]
    numReads = int(sys.argv[3])
    iterations = int(sys.argv[4])
    minLength = int(sys.argv[5])
    maxLength = int(sys.argv[6])
    Overlap = int(sys.argv[7])
    outfilename = sys.argv[8]

    ReadList1 = []
    ReadList2 = []

    lineslist = open(fasta1)
    for line in lineslist:
        if line[0]=='>':
            continue
        read=line.strip().replace('U','T')
        if len(read) < minLength or len(read) > maxLength:
            continue
        ReadList1.append(read)

    if doCollapseDups:
        ReadList1 = list(Set(ReadList1))

    lineslist = open(fasta2)
    for line in lineslist:
        if line[0]=='>':
            continue
        read=line.strip().replace('U','T')
        if len(read) < minLength or len(read) > maxLength:
            continue
        ReadList2.append(read)

    if doCollapseDups:
        ReadList2 = list(Set(ReadList2))

    outfile = open(outfilename, 'w')
    outline = '#Iteration\tTotalReads1\tTotalReads2\tSampleReads1\tSampleReads2\tSampledReadsInPingPongPairs-1\tFraction-1'
    outfile.write(outline + '\n')

    for i in range(iterations):
        print i
        start = time.time()
        sampledReadList1 = random.sample(ReadList1,min(numReads,len(ReadList1)))
        sampledReadList2 = random.sample(ReadList2,min(numReads,len(ReadList2)))
        First10KmerDict = {}
        First10RevKmerDict = {}
        Palindromic = 0
        InPingPongPairs = 0.0
        for read in sampledReadList1:
            kmer = read[0:Overlap]
            First10KmerDict[kmer] = 1
        for read in sampledReadList2:
            revkmer = reverseComplement(read[0:Overlap],DNA)
            First10RevKmerDict[revkmer] = 1
        for read in sampledReadList1:
            kmer = read[0:Overlap]
            if First10RevKmerDict.has_key(kmer):
                InPingPongPairs += 1
        outline = str(i) + '\t' + str(len(ReadList1)) + '\t' + str(len(ReadList2)) + '\t' + str(len(sampledReadList1)) + '\t' + str(len(sampledReadList2)) + '\t' + str(InPingPongPairs) + '\t' + str(InPingPongPairs/(len(sampledReadList1)))
        outfile.write(outline + '\n')
        end = time.time()
        print end - start

    outfile.close()
        
run()

