##################################
#                                #
# Last modified 2019/05/02       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import gzip

def _open_input(path):
    """Open `path` for line-by-line text reading; '.gz' files are decompressed."""
    if path.endswith('.gz'):
        if sys.version_info[0] >= 3:
            # On Python 3 gzip.open defaults to binary mode and yields bytes;
            # 'rt' gives str lines so comparisons against '>' work.
            return gzip.open(path, 'rt')
        return gzip.open(path)
    return open(path)


def run():
    """Split a FASTA-style file into one output file per chromosome ID.

    Command-line arguments (read from sys.argv):
        sys.argv[1]  input file name; a name ending in '.gz' is read via gzip
        sys.argv[2]  output prefix; records go to '<prefix>.<ID>.fa'

    The ID of a record is the text between the leading '>' of its header
    line and the first ':' (e.g. '>hg38-chr1:0-25' -> 'hg38-chr1').  Each
    header line, and every following non-header line, is written to the
    file for that ID.  One output handle per distinct ID stays open until
    the whole input has been read.

    Exits with status 1 after printing usage if either argument is missing.

    Raises:
        ValueError: if a non-header line appears before the first header,
            since there is no output file to route it to.
    """
    # Both the input name and the output prefix are required; sys.argv[2]
    # is read unconditionally below.
    if len(sys.argv) < 3:
        print('usage: python %s inputfilename <outfile prefix>' % sys.argv[0])
        print('\tassumed format of readIDs: >hg38-chr1:0-25, i.e. chromosome are separated from coordiantes by a ":" symbol')
        sys.exit(1)

    inputfilename = sys.argv[1]
    outprefix = sys.argv[2]

    outfileDict = {}
    currentFile = None

    lineslist = _open_input(inputfilename)
    try:
        for i, line in enumerate(lineslist):
            # i counts lines; the message is labelled in reads, which
            # presumably assumes two lines (header + sequence) per read.
            if i % 2000000 == 0:
                print(str(i // 2000000) + 'M reads processed')
            if line[0] == '>':
                ID = line.strip().split('>')[1].split(':')[0]
                if ID not in outfileDict:
                    outfileDict[ID] = open(outprefix + '.' + ID + '.fa', 'w')
                    print(ID)
                currentFile = outfileDict[ID]
            elif currentFile is None:
                raise ValueError(
                    'sequence data found before the first ">" header line in %s'
                    % inputfilename)
            currentFile.write(line)
    finally:
        # Release every handle even when the loop aborts part-way through.
        lineslist.close()
        for outfile in outfileDict.values():
            outfile.close()

# Script entry point. There is no __main__ guard, so this call also executes
# if the file is imported as a module.
run()
