##################################
#                                #
# Last modified 2020/10/12       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def _read_config(path, groups, assigned):
    """Parse the merge config into *groups* and record its contigs in *assigned*.

    Each non-blank line has the form ``new_chr_name <tab> chr1(,chr2,...)``.
    ``groups[new_chr_name]`` is set to the ordered list of member contigs
    (a later line with the same name replaces an earlier one) and every
    member name is added to the ``assigned`` set.

    Raises:
        IndexError: a non-blank line has no tab-separated contig list.
    """
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank lines (e.g. a trailing newline) carry no mapping.
                continue
            if len(fields) < 2:
                raise IndexError(
                    'malformed config line, expected '
                    '"new_chr_name <tab> chr1,chr2,...": %r' % line)
            members = fields[1].split(',')
            groups[fields[0]] = members
            assigned.update(members)


def _read_fasta(path):
    """Load a FASTA file.

    Returns:
        ``(order, sequences)`` where ``order`` lists record names in file
        order (one entry per header line, duplicates included) and
        ``sequences`` maps each name to its concatenated sequence.  When a
        name occurs more than once the last record wins.

    Raises:
        ValueError: sequence text appears before the first header line.
    """
    order = []
    sequences = {}
    name = None
    chunks = []
    with open(path) as handle:
        for line in handle:
            if line[0] == '>':
                if name is not None:
                    sequences[name] = ''.join(chunks)
                # Record name is the text between the first and second '>'.
                name = line.strip().split('>')[1]
                order.append(name)
                chunks = []
                continue
            text = line.strip()
            if not text:
                continue
            if name is None:
                raise ValueError(
                    'sequence data before first fasta header: %r' % line)
            chunks.append(text)
    if name is not None:
        sequences[name] = ''.join(chunks)
    return order, sequences


def _write_merged(path, groups, sequences, spacer, blocksize=100):
    """Write one record per entry of *groups*, sorted by new name.

    The member sequences of each group are joined with *spacer* and the
    result is wrapped at *blocksize* characters per line.
    """
    with open(path, 'w') as outfile:
        for new_name in sorted(groups):
            outfile.write('>' + new_name + '\n')
            merged = spacer.join(sequences[c] for c in groups[new_name])
            outfile.writelines(
                merged[i:i + blocksize] + '\n'
                for i in range(0, len(merged), blocksize))


def run():
    """Merge FASTA contigs into new chromosomes as described by a config file.

    Command line (read from ``sys.argv``)::

        python script fasta config N_length outfile [-unassigneddebris name]

    Every config line names a new chromosome and the comma-separated contigs
    that form it; the contigs are concatenated in the listed order with
    ``N_length`` ``N`` characters between neighbours.  With
    ``-unassigneddebris name`` every FASTA record not mentioned in the config
    is appended, in file order, to a chromosome called ``name``.  Output
    records are written sorted by name, 100 bases per line.

    Exits with status 1 (after printing usage) when a required argument is
    missing.

    Raises:
        KeyError: a contig requested for output is absent from the FASTA.
    """
    argv = sys.argv
    flag = '-unassigneddebris'

    # Four positional arguments are mandatory, so argv needs >= 5 entries;
    # the flag, when present, must be followed by a name.
    flag_without_name = flag in argv and argv.index(flag) + 1 >= len(argv)
    if len(argv) < 5 or flag_without_name:
        print('usage: python %s fasta config N_length outfile [-unassigneddebris name]' % argv[0])
        print('\tconfig format: new_chr_name <tab> chr1(,chr2,chr3,...,chrN)')
        sys.exit(1)

    fasta = argv[1]
    config = argv[2]
    spacer = int(argv[3]) * 'N'
    outfilename = argv[4]

    groups = {}
    assigned = set()

    debris_name = None
    if flag in argv:
        debris_name = argv[argv.index(flag) + 1]
        # Registered before the config is read so that a config entry with
        # the same name takes over the list (unassigned contigs are then
        # appended after its configured members).
        groups[debris_name] = []
        print('will merge unassigned contigs into %s' % debris_name)

    _read_config(config, groups, assigned)

    order, sequences = _read_fasta(fasta)
    if debris_name is not None:
        groups[debris_name].extend(
            name for name in order if name not in assigned)

    print('finished inputting fasta')

    # Validate before opening the output so a bad config does not leave a
    # truncated file behind.
    for new_name in sorted(groups):
        for contig in groups[new_name]:
            if contig not in sequences:
                raise KeyError(
                    'contig %r (listed for %r) not found in %s'
                    % (contig, new_name, fasta))

    _write_merged(outfilename, groups, sequences, spacer)

# Script entry point: run() is invoked unconditionally, i.e. whenever this
# file is executed or imported (there is no __main__ guard).
run()
