##################################
#                                #
# Last modified 04/11/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

try:
	import psyco
	psyco.full()
except:
	pass

def reverseComplement(DNA,sequence):
    
    reversesequence=''
    for i in range(len(sequence)):
        reversesequence=reversesequence+DNA[sequence[len(sequence)-i-1]]
    
    return reversesequence

def HammingDistance(str1,str2):

    distance=0
    for i in range(len(str1)):
        if str1[i]!=str2[i]:
            distance+=1
    return distance

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s fastq1 fastq2 minMatch max_Differcene outfile' % sys.argv[0]
        sys.exit(1)

    fastq1 = sys.argv[1]
    fastq2 = sys.argv[2]
    minMatch = int(sys.argv[3])
    maxN = int(sys.argv[4])
    outfile = open(sys.argv[5], 'w')

    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}

    ReadDict={}
    input_stream = open(fastq1)
    input_stream2 = open(fastq2)
    i=1
    j=0
    for line in input_stream:
        line2=input_stream2.readline()
        if i==1 and line[0]=='@':
            ID1=line.strip()[0:-2]
            ID2=line2.strip()[0:-2]
            if ID1 != ID2:
                print 'read IDs not matching, exiting'
                sys.exit(1)
            i=2
            continue
        if i==2:
            i=3
            j=j+1
            if j % 1000000 == 0:
                print j, 'reads processed'
            sequence1=line.strip()
            NCounts1=sequence1.count('N')
            sequence2=line2.strip()
            NCounts2=sequence2.count('N')
            sequence2=reverseComplement(DNA,sequence2)
            continue
        if i==3 and line[0]=='+':
            i=4
            continue
        if i==4:
            i=1
            scores1=line.strip()
            scores2=line2.strip()
            if NCounts1 > maxN or NCounts2 > maxN:
                continue
            read2substring = sequence2[0:minMatch]
            matchPos=sequence1.find(read2substring)
            if matchPos == -1:
                continue
            mismatches = HammingDistance(sequence1[matchPos:],sequence2[0:len(sequence2)-matchPos])
            if mismatches <= maxN:
                sequence=sequence1[0:matchPos]+sequence2
                scores=scores1[0:matchPos]+scores2
                outfile.write(ID1+'\n')
                outfile.write(sequence+'\n')
                outfile.write('+\n')
                outfile.write(scores+'\n')

    outfile.close()

run()

