##################################
#                                #
# Last modified 2026/01/14       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set

# Psyco is an optional accelerator: use it when it is installed, and carry on
# without it otherwise.  Only ImportError is tolerated, so Ctrl-C and
# SystemExit are no longer swallowed the way a bare ``except:`` would.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

def _read_gene_spans(gtf_path):
    """Group feature coordinates of a GTF-style file by gene.

    Lines starting with '#' and blank lines are skipped.  Every other line is
    split on tabs; column 1 is the chromosome, columns 4 and 5 the integer
    start/end, column 7 the strand and column 9 the attribute string, which
    must contain ``gene_id "..."`` and may contain ``gene_name "..."`` (the
    gene ID is used as the name when it is absent).

    Returns a dict keyed by ``(chromosome, gene ID, gene name)`` whose values
    are dicts with the keys 'strand', 'chr' and 'coordinates' (a flat list of
    every start and end seen for that gene).  The strand recorded is the one
    on the first line seen for the gene.

    Raises IndexError on lines with fewer than nine columns or without a
    gene_id attribute, and ValueError on non-integer coordinates.
    """
    genes = {}
    with open(gtf_path) as gtf:
        for line in gtf:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip().split('\t')
            attributes = fields[8]
            gene_id = attributes.split('gene_id "')[1].split('"')[0]
            if 'gene_name "' in attributes:
                gene_name = attributes.split('gene_name "')[1].split('"')[0]
            else:
                gene_name = gene_id
            chrom = fields[0]
            start = int(fields[3])
            end = int(fields[4])
            key = (chrom, gene_id, gene_name)
            if key not in genes:
                genes[key] = {'strand': fields[6], 'chr': chrom, 'coordinates': []}
            genes[key]['coordinates'].append(start)
            genes[key]['coordinates'].append(end)
    return genes


def run():
    """Write one interval (or a series of bins) per gene of a GTF-style file.

    Arguments are taken from ``sys.argv``::

        inputfilename outfilename [-ext TSS_upstream_bp TTS_downstream_bp]
                                  [-split binsize distanceToTSS distanceToTTS]

    For every (chromosome, gene ID, gene name) the smallest and largest
    coordinate over all of its lines define the gene interval.

    -ext    widens the interval by TSS_upstream_bp on the TSS side and by
            TTS_downstream_bp on the TTS side.  The TSS side is the left end
            for every strand except '-', where it is the right end.
    -split  trims distanceToTSS / distanceToTTS off the respective ends, skips
            genes whose remainder is shorter than binsize, and writes
            consecutive bins of binsize named ``name_0``, ``name_1``, ...
            The last bin may extend past the trimmed right end.

    The output is tab-separated with the header
    ``#chr left right strand ID name``.

    Exits with status 1 after printing a message when fewer than two file
    names are given or when the -split binsize is not positive.
    """
    # Both the input and the output file name are mandatory.
    if len(sys.argv) < 3:
        print('usage: python %s inputfilename outfilename [-ext TSS_upstream_bp TTS_downstream_bp] [-split binsize distanceToTSS distanceToTTS]' % sys.argv[0])
        sys.exit(1)

    inputfilename = sys.argv[1]
    outputfilename = sys.argv[2]

    doSplit = '-split' in sys.argv
    binsize = 0
    TSSdist = 0
    TTSdist = 0
    if doSplit:
        at = sys.argv.index('-split')
        binsize = int(sys.argv[at + 1])
        TSSdist = int(sys.argv[at + 2])
        TTSdist = int(sys.argv[at + 3])
        # A zero or negative bin size would never advance the bin start.
        if binsize <= 0:
            print('-split binsize must be a positive integer')
            sys.exit(1)

    TSSext = 0
    TTSext = 0
    if '-ext' in sys.argv:
        at = sys.argv.index('-ext')
        TSSext = int(sys.argv[at + 1])
        TTSext = int(sys.argv[at + 2])

    # Parse the whole input before touching the output file, so a malformed
    # input does not leave a truncated result behind.
    genes = _read_gene_spans(inputfilename)

    with open(outputfilename, 'w') as outfile:
        outfile.write('#chr\tleft\tright\tstrand\tID\tname' + '\n')

        for key in genes:
            (chrom, gene_id, gene_name) = key
            gene = genes[key]
            strand = gene['strand']

            # On the minus strand the TSS is the right end of the interval.
            # Any other strand value (including unstranded '.') is handled
            # like '+', so every gene gets freshly computed coordinates.
            if strand == '-':
                left_ext, right_ext = TTSext, TSSext
                left_trim, right_trim = TTSdist, TSSdist
            else:
                left_ext, right_ext = TSSext, TTSext
                left_trim, right_trim = TSSdist, TTSdist

            left = min(gene['coordinates']) - left_ext
            right = max(gene['coordinates']) + right_ext

            if not doSplit:
                outfile.write('\t'.join([chrom, str(left), str(right), strand, gene_id, gene_name]) + '\n')
                continue

            left += left_trim
            right -= right_trim
            # binsize > 0, so this also covers an inverted (right < left) span.
            if right - left < binsize:
                continue

            bin_start = left
            bin_index = 0
            while bin_start < right:
                bin_name = gene_name + '_' + str(bin_index)
                outfile.write('\t'.join([chrom, str(bin_start), str(bin_start + binsize), strand, gene_id, bin_name]) + '\n')
                bin_start += binsize
                bin_index += 1

# Run the command-line tool only when this file is executed as a script, so
# that importing the module does not parse sys.argv or write any files.
if __name__ == '__main__':
    run()

