##################################
#                                #
# Last modified 2019/06/10       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
import string
from sets import Set

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s config window_size outputfilename' % sys.argv[0]
        print '\tconfig format:' 
        print '\t\tlabel <tab> fasta_file_name' 
        print '\tassumed chromosome format:' 
        print '\t\tchrI::AWRI1631' 
        print '\tit is assumed all chromosome are of the same size' 
        sys.exit(1)

    inputfilename = sys.argv[1]
    W = int(sys.argv[2])
    outfilename = sys.argv[3]

    GenomeDict={}
    linelist = open(inputfilename)
    for line in linelist:
        fields = line.strip().split('\t')
        label = fields[0]
        fasta = fields[1]
        GenomeDict[label] = {}
        sequence=''
        inputdatafile = open(fasta)
        for line in inputdatafile:
            if line[0]=='>':
                if sequence != '':
                    GenomeDict[label][chr] = ''.join(sequence)
                chr = line.strip().split('>')[1].split('::')[0]
                print label, chr
                sequence=[]
                continue
            else:
                sequence.append(line.strip())
        GenomeDict[label][chr] = ''.join(sequence)

    outfile = open(outfilename, 'w')

    labels = GenomeDict.keys()

    labels.sort()

    ref_label = labels[0]

    FinalSeqDict = {}
    for label in labels:
        FinalSeqDict[label] = []

    D = 0

    for chr in GenomeDict[ref_label].keys():
        for pos in range(0,len(GenomeDict[ref_label][chr]),W):
            sequences = []
            for label in labels:
                sequences.append(GenomeDict[label][chr][pos:pos+W])
            if len(Set(sequences)) > 1:
                D += 1
                if D % 100 == 0:
                    print D, chr, pos, len(Set(sequences))
                for label in labels:
                    FinalSeqDict[label].append(GenomeDict[label][chr][pos:pos+W])

    print 'found', D, 'windows with sequence differences'

    for label in labels:
        FinalSeqDict[label] = ''.join(FinalSeqDict[label])
        outline = '>' + label
        outfile.write(outline + '\n')
        outline = FinalSeqDict[label]
        outfile.write(outline + '\n')
   
run()
