##################################
#                                #
# Last modified 2019/01/06       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
import math
from sets import Set
import Levenshtein
import gzip

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s inputfilename fieldID minOverlap(bp) outfile' % sys.argv[0]
        print '\tUse - to specify standard input'
        print '\tone kmer per line, in any column'
        sys.exit(1)

    inputfilename = sys.argv[1]
    fieldID = int(sys.argv[2])
    minOL = int(sys.argv[3])
    outprefix = sys.argv[4]

    KMerDict = []

    doStdIn = False
    if inputfilename != '-':
        if inputfilename.endswith('.bz2'):
            cmd = 'bzip2 -cd ' + inputfilename
        elif inputfilename.endswith('.gz'):
            cmd = 'gunzip -c ' + inputfilename
        else:
            cmd = 'cat ' + inputfilename
        p = os.popen(cmd, "r")
    else:
        doStdIn = True
    line = 'line'
    i=1
    while line != '':
        if doStdIn:
            line = sys.stdin.readline()
        else:
            line = p.readline()
        if line == '':
            break
        fields = line.strip().split('\t')
        KMerDict[fields[fieldID]] = []

    KMersList = KMerDict.keys()

    for kmer1 in KMersList:
        for kmer2 in KMersList:
            if kmer1 == kmer2:
                continue
            M = Levenshtein.matching_blocks(Levenshtein.editops(kmer1,kmer2),kmer1,kmer2)
            if M[0][1] != 0 or M[0][2] < minOL:
                continue
            if kmer1.endswith(kmer2[0:M[0][2]]):
                KMerDict[kmer1].append(kmer2)

    
                

# get_matching_blocks(	)
# Return list of triples describing matching subsequences. Each triple is of the form (i, j, n), and means that a[i:i+n] == b[j:j+n]. 
# The triples are monotonically increasing in i and j.
# SequenceMatcher(None,a,b).get_matching_blocks()
# [Match(a=3, b=0, size=5), Match(a=16, b=7, size=0)]
# Note: does not show multiple matches

    outfile = open(outprefix + '.end2.fastq', 'w')

        ID = fields[0]
        sequence = fields[1]
        scores = fields[2]
        outfile.write(ID + '\n')
        outfile.write(sequence + '\n')
        outfile.write('+' + '\n')
        outfile.write(scores + '\n')
    outfile.close()

run()

