##################################
#                                #
# Last modified 03/18/2016       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
from sets import Set

def SS(seq1,seq2):

    NumIdent = 0.0
    L = 0
    for i in range(len(seq1)):
        if seq1[i] == '-':
            continue
        L += 1
        if seq1[i] == seq2[i]:
            NumIdent += 1

    return NumIdent/L

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s aligment_fasta output [-max] [-pairedOutput]' % sys.argv[0]
        sys.exit(1)

    doMax = False
    if '-max' in sys.argv:
        doMax = True

    doPO = False
    if '-pairedOutput' in sys.argv:
        doPO = True

    fasta = sys.argv[1]
    outfilename = sys.argv[2]

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1].split(' ')[0]
            sequence=[]
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence)

    seqList = GenomeDict.keys()
    seqList.sort()

    outfile = open(outfilename,'w')

    if doPO:
        outline = '#seq1\tLength\tseq2\tLength\tSimilarity_AB\tSimilarity_BA\tSimilarity_max'
        outfile.write(outline + '\n')
        for i in range(len(seqList)):
            seq1 = seqList[i]
            for j in range(i+1,len(seqList)):
                seq2 = seqList[j]
                outline = seq1 + '\t' + str(len(GenomeDict[seq1].replace('-','')))
                outline = outline + '\t' + seq2 + '\t' + str(len(GenomeDict[seq2].replace('-','')))
                outline = outline + '\t' + str(SS(GenomeDict[seq1],GenomeDict[seq2]))
                outline = outline + '\t' + str(SS(GenomeDict[seq2],GenomeDict[seq1]))
                outline = outline + '\t' + str(max(SS(GenomeDict[seq1],GenomeDict[seq2]),SS(GenomeDict[seq2],GenomeDict[seq1])))
                outfile.write(outline + '\n')
    else:
        outline = '#seq\t'
        for seq in seqList:
            outline = outline + '\t' + seq
        outfile.write(outline + '\n')
        outline = '#\tlength'
        for seq in seqList:
            outline = outline + '\t' + str(len(GenomeDict[seq].replace('-','')))
        outfile.write(outline + '\n')

        for seq1 in seqList:
            outline = seq1 + '\t' + str(len(GenomeDict[seq1].replace('-','')))
            for seq2 in seqList:
                if doMax:
                    outline = outline + '\t' + str(max(SS(GenomeDict[seq1],GenomeDict[seq2]),SS(GenomeDict[seq2],GenomeDict[seq1])))
                else:
                    outline = outline + '\t' + str(SS(GenomeDict[seq1],GenomeDict[seq2]))
            outfile.write(outline + '\n')

    outfile.close()

run()

