##################################
#                                #
# Last modified 10/12/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s list_of_files outputfilename [-FastaBlocks]' % sys.argv[0]
        print '\tThe list of files should contain a list of multiple sequence alignment files in fasta file'
        print '\tThe script assumes that each sequence is labelled like this:'
        print '\t\t>Oxytricha_trifallax::lcl|JN383843.1_prot_AEV66700.1_86'
        print '\t\ti.e., species name, then ::, then other stuff'
        print '\tIt also assumes that species names are consistent across files and that there is only one entry per species per file'
        print '\tMissing data, i.e. species that are missing from some alignments files will be set to ? in that part of the supermatrix'
        sys.exit(1)

    listoffiles = sys.argv[1]
    outfilename = sys.argv[2]

    doFB = False
    if '-FastaBlocks' in sys.argv:
        doFB = True

    SpeciesIDs = {}

    inputdatafile = open(listoffiles)
    SequenceDict = {}
    for line1 in inputdatafile:
        fasta = line1.strip().split('\t')[0]
        SequenceDict[fasta] = {}
        sequence=''
        linelist = open(fasta)
        for line in linelist:
            if line[0]=='>':
                if sequence != '':
                    SequenceDict[fasta][chr] = ''.join(sequence)
                chr = line.strip().split('>')[1].split('::')[0]
                SpeciesIDs[chr] = 1
                sequence=[]
                continue
            else:
                sequence.append(line.strip())
        SequenceDict[fasta][chr] = ''.join(sequence)

    species = SpeciesIDs.keys()
    species.sort()

    print species

    FinalSequenceDict = {}
    for s in species:
        FinalSequenceDict[s] = ''

    genes = SequenceDict.keys()
    genes.sort()

    for fasta in genes:
        L = len(SequenceDict[fasta][SequenceDict[fasta].keys()[0]])
        for s in species:
            if SequenceDict[fasta].has_key(s):
                FinalSequenceDict[s] += SequenceDict[fasta][s]
            else:
                FinalSequenceDict[s] += L*'?'

    outfile = open(outfilename, 'w')

    blocksize = 1000

    for s in species:
        outline = '>' + s
        outfile.write(outline + '\n')
        sequence = FinalSequenceDict[s]
        if doFB:
            for i in range(0,len(sequence),blocksize):
                outfile.write(sequence[i:min(i+blocksize, len(sequence))] + '\n')
        else:
            outfile.write(sequence + '\n')
    outfile.close()
   
run()
