##################################
#                                #
# Last modified 04/11/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import difflib

try:
	import psyco
	psyco.full()
except:
	pass

def reverseComplement(DNA,sequence):
    
    reversesequence=''
    for i in range(len(sequence)):
        reversesequence=reversesequence+DNA[sequence[len(sequence)-i-1]]
    
    return reversesequence

def HammingDistance(str1,str2):

    distance=0
    for i in range(len(str1)):
        if str1[i]!=str2[i]:
            distance+=1
    return distance

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s fastq1 fastq2 minMatch max_Differcene outfile' % sys.argv[0]
        sys.exit(1)

    fastq1 = sys.argv[1]
    fastq2 = sys.argv[2]
    minMatch = int(sys.argv[3])
    maxN = int(sys.argv[4])
    outfile = open(sys.argv[5], 'w')

    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}

    ReadDict={}
    input_stream = open(fastq1)
    i=1
    j=0
    for line in input_stream:
        if i==1 and line[0]=='@':
            ID=line.strip()[0:-2]
            i=2
            continue
        if i==2:
            i=3
            j=j+1
            if j % 1000000 == 0:
                print j, 'reads processed in first file'
            sequence=line.strip()
            NCounts=sequence.count('N')
            continue
        if i==3 and line[0]=='+':
            i=4
            continue
        if i==4:
            i=1
            scores=line.strip()
            if NCounts < maxN:
                ReadDict[ID]=[]
                ReadDict[ID].append((sequence,scores))
            scores=''
            NCounts=''
            sequence=''
            continue

    input_stream = open(fastq2)
    i=1
    j=0
    for line in input_stream:
        if i==1 and line[0]=='@':
            ID=line.strip()[0:-2]
            i=2
            continue
        if i==2:
            i=3
            j=j+1
            if j % 1000000 == 0:
                print j, 'reads processed in second file'
            sequence=line.strip()
            NCounts=sequence.count('N')
            continue
        if i==3 and line[0]=='+':
            i=4
            continue
        if i==4:
            i=1
            scores=line.strip()
            if ReadDict.has_key(ID):
                if NCounts < maxN:
                    ReadDict[ID].append((reverseComplement(DNA,sequence),scores))
                else:
                    del ReadDict[ID]
            scores=''
            NCounts=''
            sequence=''
            continue

    IDlist=ReadDict.keys()
    k=0
    singleRead=0
    print len(IDlist)
    for ID in IDlist:
        k+=1
        if k % 1000000 == 0:
            print k, 'read pairs processed'
        if len(ReadDict[ID]) < 2:
            singleRead+=1
            continue
        sequence1=ReadDict[ID][0][0]
        sequence2=ReadDict[ID][1][0]
        read2substring = sequence2[0:minMatch]
        matchPos=sequence1.find(read2substring)
        if matchPos == -1:
            continue
#        print matchPos, len(sequence2)-matchPos
#        print sequence1[matchPos:]
#        print sequence2[0:len(sequence2)-matchPos]
        mismatches = HammingDistance(sequence1[matchPos:],sequence2[0:len(sequence2)-matchPos])
#        print 'mismatches', mismatches
        if mismatches <= maxN:
            sequence=sequence1[0:matchPos]+sequence2
            scores1=ReadDict[ID][0][1]
            scores2=ReadDict[ID][1][1]
            scores=scores1[0:matchPos]+scores2
            outfile.write(ID+'\n')
            outfile.write(sequence+'\n')
            outfile.write('+\n')
            outfile.write(scores+'\n')

    print singleRead

    outfile.close()

run()

