##################################
#                                #
# Last modified 01/23/2014       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import time
import math
import random
from sets import Set

def reverseComplement(sequence,DNA):
    
    reversesequence=''
    for i in range(len(sequence)):
        reversesequence=reversesequence+DNA[sequence[len(sequence)-i-1]]
    
    return reversesequence

def run():

    if len(sys.argv) < 7:
        print 'usage: python %s fasta number_reads_to_sample iterations minLength maxLength Overlap outfilename [-collapseDups]' % sys.argv[0]
        print '\tNote: This script will not enforce the 1U rule' 
        sys.exit(1)

    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n','-':'-'}
    RNA = {'A':'U','U':'A','G':'C','C':'G','N':'N','a':'u','u':'a','g':'c','c':'g','n':'n','-':'-'}


    doCollapseDups = False
    if '-collapseDups' in sys.argv:
        doCollapseDups = True

    fasta = sys.argv[1]
    numReads = int(sys.argv[2])
    iterations = int(sys.argv[3])
    minLength = int(sys.argv[4])
    maxLength = int(sys.argv[5])
    Overlap = int(sys.argv[6])
    outfilename = sys.argv[7]

    ReadList = []

    lineslist = open(fasta)
    for line in lineslist:
        if line[0]=='>':
            continue
        read=line.strip().replace('U','T')
        if len(read) < minLength or len(read) > maxLength:
            continue
        ReadList.append(read)

    if doCollapseDups:
        ReadList = list(Set(ReadList))

    outfile = open(outfilename, 'w')
    outline = '#Iteration\tTotalReads\tSampleReads\tSampledReadsInPingPongPairs\tFraction'
    outfile.write(outline + '\n')

    for i in range(iterations):
        print i
        start = time.time()
        sampledReadList = random.sample(ReadList,min(numReads,len(ReadList)))
        First10KmerDict = {}
        First10RevKmerDict = {}
        Palindromic = 0
        InPingPongPairs = 0.0
        for read in sampledReadList:
            kmer = read[0:Overlap]
            revkmer = reverseComplement(read[0:Overlap],DNA)
            if kmer == revkmer:
                Palindromic += 1
#                print read
                continue
            First10KmerDict[kmer] = 1
            First10RevKmerDict[revkmer] = 1
        for read in sampledReadList:
            kmer = read[0:Overlap]
            if First10RevKmerDict.has_key(kmer):
                InPingPongPairs += 1
        outline = str(i) + '\t' + str(len(ReadList)) + '\t' + str(len(sampledReadList) - Palindromic) + '\t' + str(InPingPongPairs) + '\t' + str(InPingPongPairs/(len(sampledReadList) - Palindromic))
        outfile.write(outline + '\n')
        end = time.time()
        print end - start

    outfile.close()
        
run()

