##################################
#                                #
# Last modified 2024/10/19       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import gzip
import string
from sets import Set
import math

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s gff JGI|GFF3|GMAP|GiardiaDB|Candida|AUGUSTUS|AUGUSTUS-SGD|YeastGDB|YeastGDB-AGAPE|PomBase|Flatworm|NCBI-Prok|NCBI-Prok2|NCBI-Prok3|Fugacium outfile ' % sys.argv[0]
        print '\tNote: for simplicity, the script will only output exons and CDS/UTR annotations'
        print '\tNote: the AUGUSTUS option currently outputs only CDS features, not UTRs!!1'
        sys.exit(1)

    GFF = sys.argv[1]
    GTF = sys.argv[3]

    GeneDict={}
    outfile = open(GTF, 'w')
    if sys.argv[2] == 'NCBI-Prok':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] != 'CDS' and fields[2] != 'exon':
                continue
            geneID = fields[8].split(';gene=')[1].split(';')[0]
            try:
                geneName = fields[8].split(';product=')[1].split(';')[0]
            except:
                if 'pseudo=true' in fields[8]:
                    geneName = 'pseudogene'
                else:
                    print 'problem with gene name:' 
                    print fields
            GeneDict[geneID] = geneName
        linelist=open(GFF)
        for line in linelist:
            if line.startswith('#'):
                outfile.write(line)
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'region' or fields[2] == 'repeat_region':
                continue
            if fields[2] == 'exon' or fields[2] == 'CDS':
                geneID = fields[8].split(';gene=')[1].split(';')[0]
                geneName = GeneDict[geneID]
                outline = fields[0] + '\t'
                outline = outline + fields[1] + '\t'
                outline = outline + fields[2] + '\t'
                outline = outline + fields[3] + '\t'
                outline = outline + fields[4] + '\t'
                outline = outline + fields[5] + '\t'
                outline = outline + fields[6] + '\t'
                outline = outline + fields[7] + '\t'
                outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + geneID + '"; gene_name "' + geneName + '"; transcript_name "' + geneName + '";'
            elif fields[2] == 'gene':
                geneID = fields[8].split(';gene=')[1].split(';')[0]
                if GeneDict.has_key(geneID):
                    geneName = GeneDict[geneID]
                else:
                    if ';product=' in fields[8]:
                        geneName = fields[8].split(';product=')[1].split(';')[0]
                    elif 'Note=' in fields[8]: 
                        geneName = fields[8].split(';Note=')[1].split(';')[0]
                    else:
                        geneName = geneID
                outline = fields[0] + '\t'
                outline = outline + fields[1] + '\t'
                outline = outline + 'exon' + '\t'
                outline = outline + fields[3] + '\t'
                outline = outline + fields[4] + '\t'
                outline = outline + fields[5] + '\t'
                outline = outline + fields[6] + '\t'
                outline = outline + fields[7] + '\t'
                outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + geneID + '"; gene_name "' + geneName + '"; transcript_name "' + geneName + '";'
            else:
                continue
            outfile.write(outline + '\n')
    if sys.argv[2] == 'NCBI-Prok2':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] != 'CDS' and fields[2] != 'exon':
                continue
            geneID = fields[8].split('Parent=')[1].split(';')[0]
            try:
                geneName = fields[8].split(';product=')[1].split(';')[0]
            except:
                if 'pseudo=true' in fields[8]:
                    geneName = 'pseudogene'
                else:
                    print 'problem with gene name:' 
                    print fields
	            GeneDict[geneID] = geneName
        linelist=open(GFF)
        for line in linelist:
            if line.startswith('#'):
                outfile.write(line)
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'region' or fields[2] == 'repeat_region':
                continue
            if fields[2] == 'exon' or fields[2] == 'CDS':
                geneID = fields[8].split('Parent=')[1].split(';')[0]
                geneName = GeneDict[geneID]
                outline = fields[0] + '\t'
                outline = outline + fields[1] + '\t'
                outline = outline + fields[2] + '\t'
                outline = outline + fields[3] + '\t'
                outline = outline + fields[4] + '\t'
                outline = outline + fields[5] + '\t'
                outline = outline + fields[6] + '\t'
                outline = outline + fields[7] + '\t'
                outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + geneID + '"; gene_name "' + geneName + '"; transcript_name "' + geneName + '";'
                if fields[2] == 'CDS':
                    outfile.write(outline.replace('\tCDS\t','\texon\t') + '\n')
            elif fields[2] == 'gene':
                geneID = fields[8].split('ID=')[1].split(';')[0]
                if GeneDict.has_key(geneID):
                    geneName = GeneDict[geneID]
                else:
                    if ';product=' in fields[8]:
                        geneName = fields[8].split(';product=')[1].split(';')[0]
                    elif 'Note=' in fields[8]: 
                        geneName = fields[8].split(';Note=')[1].split(';')[0]
                    else:
                        geneName = geneID
                outline = fields[0] + '\t'
                outline = outline + fields[1] + '\t'
                outline = outline + 'gene' + '\t'
                outline = outline + fields[3] + '\t'
                outline = outline + fields[4] + '\t'
                outline = outline + fields[5] + '\t'
                outline = outline + fields[6] + '\t'
                outline = outline + fields[7] + '\t'
                outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + geneID + '"; gene_name "' + geneName + '"; transcript_name "' + geneName + '";'
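#            Disabled branch: ncRNA/rRNA/tRNA/tmRNA features currently fall through to the 'else: continue' below.
#            elif fields[2] == 'ncRNA' or fields[2] == 'rRNA' or fields[2] == 'tRNA' or fields[2] == 'tmRNA':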
            else:
                continue
            outfile.write(outline + '\n')
    if sys.argv[2] == 'JGI':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            chr=fields[0]
            if fields[2] == 'CDS':
                geneID=fields[8].split('name "')[1].split('";')[0]
                transcriptID=fields[8].split('proteinId ')[1].split(';')[0]
                transcriptName=transcriptID
                geneName=geneID
                GeneDict[(chr,geneID,geneName,transcriptID,transcriptName)]['gBT'] = 'protein_coding'
                continue
            if fields[2] != 'exon':
                continue
            left = int(fields[3])
            right = int(fields[4])
            strand = fields[6]
            geneID=fields[8].split('name "')[1].split('";')[0]
            transcriptID=fields[8].split('transcriptId ')[1].split(';')[0]
            transcriptName=transcriptID
            geneName=geneID
            if GeneDict.has_key((chr,geneID,geneName,transcriptID,transcriptName)):
                pass
            else:
                GeneDict[(chr,geneID,geneName,transcriptID,transcriptName)]={}
                GeneDict[(chr,geneID,geneName,transcriptID,transcriptName)]['exons'] = []
                GeneDict[(chr,geneID,geneName,transcriptID,transcriptName)]['gBT'] = 'unknown'
            GeneDict[(chr,geneID,geneName,transcriptID,transcriptName)]['exons'].append((left,right,strand))
        linelist=open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'exon' or fields[2] == 'CDS':
                pass
            else:
                continue
            geneID=fields[8].split('name "')[1].split('";')[0]
            if fields[2] == 'exon':
                transcriptID=fields[8].split('transcriptId ')[1].split(';')[0]
            if fields[2] == 'CDS':
                transcriptID=fields[8].split('proteinId ')[1].split(';')[0]
            transcriptName=transcriptID
            geneName=geneID
            chr=fields[0]
            gBT = GeneDict[(chr,geneID,geneName,transcriptID,transcriptName)]['gBT']
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; gene_type "' + gBT + '";'
            outfile.write(outline + '\n')
    if sys.argv[2] == 'GMAP':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            chr = fields[0]
            if fields[2] != 'exon':
                continue
            if 'Name=' in fields[8]:
                pass
            else:
                print 'problem with gene name:' 
                print fields
                continue
            left = int(fields[3])
            right = int(fields[4])
            strand = fields[6]
            geneID = fields[8].split('Name=')[1].split('";')[0].split(':')[0]
            geneName = fields[8].split('Name=')[1].split('";')[0].split(':')[1]
            transcriptID = fields[8].split('Name=')[1].split('";')[0].split(':')[2]
            transcriptName = fields[8].split('Name=')[1].split('";')[0].split(':')[3]
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '";'
            outfile.write(outline + '\n')
    if sys.argv[2] == 'GFF3':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        GeneTypeDict = {}
        TranscriptParentDict = {}
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.replace('PARENT=','Parent=').strip().split('\t')
            if fields[2].endswith('_gene') or fields[2] == 'gene' or ((fields[2] == 'pseudogene' or fields[2] == 'pseudogenic_tRNA' or fields[2] == 'RNA') and 'Parent=' not in fields[8]):
                if 'biotype=' in fields[8]:
                    gBT = fields[8].split('biotype=')[1].split(';')[0]
                else:
                    gBT = 'NaN'
                geneID = fields[8].split('ID=')[1].split(';')[0]
                if 'external_name=' in fields[8]:
                    geneName = fields[8].split('external_name=')[1].split(';')[0]
                else:
                    geneName = geneID
                GeneTypeDict[geneID] = (geneName,gBT)
                continue
            if fields[2] == 'transcript' or fields[2] == 'miRNA' or fields[2] == 'pseudogene' or fields[2] == 'rRNA' or fields[2] == 'snRNA' or fields[2] == 'snoRNA':
                geneID = fields[8].split('Parent=')[1].split(';')[0]
                transcriptID = fields[8].split('ID=')[1].split(';')[0]
                TranscriptParentDict[transcriptID] = geneID
                continue
        linelist=open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.replace('PARENT=','Parent=').strip().split('\t')
            if fields[2] == 'exon' or fields[2] == 'CDS':
                pass
            else:
                continue
            transcriptID=fields[8].split('Parent=')[1].split(';')[0]
            transcriptName = transcriptID
            geneID = TranscriptParentDict[transcriptID]
            (geneName,gBT) = GeneTypeDict[geneID]
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; gene_type "' + gBT + '";'
            outfile.write(outline + '\n')
    if sys.argv[2] == 'GiardiaDB':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        GeneTypeDict = {}
        TranscriptParentDict = {}
        ExonCDSParentDict = {}
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'mRNA' or fields[2] == 'rRNA' or fields[2] == 'tRNA' or fields[2] == 'RNase_MRP_RNA' or fields[2] == 'RNase_P_RNA' or fields[2] == 'SRP_RNA_encoding' or fields[2] == 'snRNA':
                transcriptID = fields[8].split('ID=')[1].split(';')[0]
                geneID = fields[8].split('Parent=')[1].split(';')[0]
                gBT = fields[2]
                GeneTypeDict[geneID] = gBT
                TranscriptParentDict[transcriptID] = geneID
                continue
            if fields[2] == 'exon' or fields[2] == 'CDS':
                ID = fields[8].split('ID=')[1].split(';')[0]
                transcriptID = fields[8].split('Parent=')[1].split(';')[0]
                ExonCDSParentDict[ID] = transcriptID
                continue
        linelist=open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'exon' or fields[2] == 'CDS':
                pass
            else:
                continue
            ID=fields[8].split('ID=')[1].split(';')[0]
            transcriptID = ExonCDSParentDict[ID]
            geneID = TranscriptParentDict[transcriptID]
            gBT = GeneTypeDict[geneID]
            geneName = geneID
            transcriptName = transcriptID
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; gene_type "' + gBT + '";'
            outfile.write(outline + '\n')
    if sys.argv[2] == 'Flatworm':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        GeneTypeDict = {}
        TranscriptParentDict = {}
        ExonCDSParentDict = {}
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'mRNA' or fields[2] == 'rRNA' or fields[2] == 'tRNA' or fields[2] == 'RNase_MRP_RNA' or fields[2] == 'RNase_P_RNA' or fields[2] == 'SRP_RNA_encoding' or fields[2] == 'snRNA':
                transcriptID = fields[8].split('ID=')[1].split(';')[0]
                geneID = fields[8].split('Parent=')[1].split(';')[0]
                gBT = fields[2]
                GeneTypeDict[geneID] = gBT
                TranscriptParentDict[transcriptID] = geneID
                continue
            if fields[2] == 'exon' or fields[2] == 'CDS':
                ID = fields[8].split('ID=')[1].split(';')[0]
                transcriptID = fields[8].split('Parent=')[1].split(';')[0]
                ExonCDSParentDict[ID] = transcriptID
                continue
        linelist=open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'exon' or fields[2] == 'CDS':
                pass
            else:
                continue
            if 'Parent=gene' in fields[8]:
                continue
            ID=fields[8].split('ID=')[1].split(';')[0]
            transcriptID = ExonCDSParentDict[ID]
            geneID = TranscriptParentDict[transcriptID]
            gBT = GeneTypeDict[geneID]
            geneName = geneID
            transcriptName = transcriptID
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; gene_type "' + gBT + '";'
            outfile.write(outline + '\n')
    if sys.argv[2] == 'Candida':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        GeneTypeDict = {}
        TranscriptParentDict = {}
        ExonCDSParentDict = {}
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'mRNA' or fields[2] == 'rRNA' or fields[2] == 'tRNA' or fields[2] == 'RNase_MRP_RNA' or fields[2] == 'RNase_P_RNA' or fields[2] == 'SRP_RNA_encoding' or fields[2] == 'snRNA' or fields[2] == 'snoRNA' or fields[2] == 'ncRNA':
                transcriptID = fields[8].split('ID=')[1].split(';')[0]
                if transcriptID.endswith('-T') or transcriptID.endswith('-P'):
                    transcriptID = transcriptID[0:-2]
                if '-T-' in transcriptID:
                    transcriptID = transcriptID.split('-T-')[0]
                geneID = fields[8].split('Parent=')[1].split(';')[0]
                if geneID.endswith('-T') or geneID.endswith('-P'):
                    geneID = transcriptID[0:-2]
                if '-T-' in transcriptID:
                    geneID = geneID.split('-T-')[0]
                gBT = fields[2]
                GeneTypeDict[geneID] = gBT
                TranscriptParentDict[transcriptID] = geneID
                continue
            if fields[2] == 'exon' or fields[2] == 'CDS':
                ID = fields[8].split('ID=')[1].split(';')[0]
                if ID.endswith('-T') or transcriptID.endswith('-P') :
                    ID = ID[0:-2]
                if '-T-' in ID:
                    ID = ID.split('-T-')[0]
                transcriptID = fields[8].split('Parent=')[1].split(';')[0]
                if transcriptID.endswith('-T') or transcriptID.endswith('-P') :
                    transcriptID = transcriptID[0:-2]
                if '-T-' in transcriptID:
                    transcriptID = transcriptID.split('-T-')[0]
                ExonCDSParentDict[ID] = transcriptID
                continue
        linelist=open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] == 'exon' or fields[2] == 'CDS':
                pass
            else:
                continue
            ID=fields[8].split('ID=')[1].split(';')[0]
            if ID.endswith('-T') or transcriptID.endswith('-P') :
                ID = ID[0:-2]
            if '-T-' in ID:
                ID = ID.split('-T-')[0]
            transcriptID = ExonCDSParentDict[ID]
#            print fields
#            print transcriptID, ID, TranscriptParentDict[transcriptID]
            try:
                geneID = TranscriptParentDict[transcriptID]
            except:
                geneID = transcriptID
                print 'no geneID found, assigning transcript ID to gene', transcriptID
            if GeneTypeDict.has_key(geneID):
                gBT = GeneTypeDict[geneID]
            else:
                if 'pseudo=true' in fields[8]:
                    gBT = 'pseudogene'
                else:
                    gBT = 'NaN'
            geneName = geneID
            transcriptName = transcriptID
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; gene_type "' + gBT + '";'
            outfile.write(outline + '\n')
    if sys.argv[2] == 'AUGUSTUS':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if fields[2] != 'CDS':
                continue
            geneName = fields[8].split('Gene=')[1].split(';')[0]
            transcriptName = geneName
            geneID = geneName
            transcriptID = geneName
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '";'
            outfile.write(outline + '\n')
            outfile.write(outline)
            outfile.write(outline.replace('\tCDS\t','\texon\t') + '\n')
    if sys.argv[2] == 'AUGUSTUS-SGD':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if fields[2] != 'CDS':
                continue
            geneID = fields[8].split('Gene=')[1].split(';')[0]
            geneName = fields[8].split('SGD=')[1].split(';')[0]
            transcriptName = geneName
            transcriptID = geneID
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '";'
            outfile.write(outline + '\n')
            outfile.write(outline.replace('\tCDS\t','\texon\t') + '\n')
    if sys.argv[2] == 'YeastGDB':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if len(fields) < 9:
                continue
            if fields[2] != 'CDS':
                continue
            geneID = fields[8].split('Name=')[1].split(';')[0]
            if 'gene=' in fields[8]:
                geneName = fields[8].split('gene=')[1].split(';')[0]
            else:
                geneName = geneID
            transcriptName = geneName
            transcriptID = geneID
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '";'
            outfile.write(outline + '\n')
            outfile.write(outline.replace('\tCDS\t','\texon\t') + '\n')
    if sys.argv[2] == 'YeastGDB-AGAPE':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        for line in linelist:
            if line.startswith('#'):
                continue
            fields = line.strip().split(' ')
            if len(fields) < 9:
                continue
            if fields[2] != 'CDS':
                continue
            geneID = fields[8].split(',')[0]
            geneName = geneID
            transcriptName = geneName
            transcriptID = geneID
            outline = fields[0] + '\t'
            outline = outline + fields[1] + '\t'
            outline = outline + fields[2] + '\t'
            outline = outline + fields[3] + '\t'
            outline = outline + fields[4] + '\t'
            outline = outline + fields[5] + '\t'
            outline = outline + fields[6] + '\t'
            outline = outline + fields[7] + '\t'
            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '";'
            outfile.write(outline + '\n')
            outfile.write(outline.replace('\tCDS\t','\texon\t') + '\n')
    if sys.argv[2] == 'PomBase':
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        GeneTypeDict = {}
        GeneParentDict = {}
        TranscriptParentDict = {}
        TranscriptDict = {}
        ExonCDSParentDict = {}
        for line in linelist:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if fields[2] == 'gene' or fields[2] == 'ncRNA_gene':
                ID = fields[8].split('ID=')[1].split(';')[0]
                if 'Name=' in fields[8]:
                    name = fields[8].split('Name=')[1].split(';')[0]
                else:
                    name = ID
                GeneParentDict[ID] = name
                continue
            if fields[2] == 'mRNA' or fields[2] == 'rRNA' or fields[2] == 'tRNA' or fields[2] == 'RNase_MRP_RNA' or fields[2] == 'RNase_P_RNA' or fields[2] == 'SRP_RNA_encoding' or fields[2] == 'snRNA' or fields[2] == 'snoRNA' or fields[2] == 'ncRNA' or fields[2] == 'pseudogenic_transcript':
                transcriptID = fields[8].split('ID=')[1].split(';')[0]
                geneID = fields[8].split('Parent=')[1].split(';')[0]
                gBT = fields[2]
                GeneTypeDict[geneID] = gBT
                TranscriptParentDict[transcriptID] = geneID
                continue
            if fields[2] == 'five_prime_UTR' or fields[2] == 'three_prime_UTR' or fields[2] == 'CDS':
                ID = fields[8].split('ID=')[1].split(';')[0]
                transcriptID = fields[8].split('Parent=')[1].split(';')[0]
                ExonCDSParentDict[ID] = transcriptID
                if TranscriptDict.has_key(transcriptID):
                    pass
                else:
                    TranscriptDict[transcriptID] = {}
                    TranscriptDict[transcriptID]['CDS'] = []
                    TranscriptDict[transcriptID]['five_prime_UTR'] = []
                    TranscriptDict[transcriptID]['three_prime_UTR'] = []
                    TranscriptDict[transcriptID]['exonic'] = []
                chr = fields[0]
                left = int(fields[3])
                right = int(fields[4])
                strand = fields[6]
                TranscriptDict[transcriptID][fields[2]].append((chr,left,right,strand))
                TranscriptDict[transcriptID]['exonic'].append((chr,left,right,strand))
                continue
        for transcriptID in TranscriptDict.keys():
            transcriptName = transcriptID
            geneID = TranscriptParentDict[transcriptID]
            geneName = GeneParentDict[geneID]
            gBT = GeneTypeDict[geneID]
            TranscriptDict[transcriptID]['exonic'].sort()
            NewExons = []
            NewExons.append(TranscriptDict[transcriptID]['exonic'][0])
            for i in range(1,len(TranscriptDict[transcriptID]['exonic'])):
                if NewExons[-1][2] + 1 == TranscriptDict[transcriptID]['exonic'][i][1]:
                    NewExons[-1] = (NewExons[-1][0],NewExons[-1][1],TranscriptDict[transcriptID]['exonic'][i][2],NewExons[-1][3])
                else:
                    NewExons.append(TranscriptDict[transcriptID]['exonic'][i])
            for (chr,left,right,strand) in NewExons:
                outline = chr + '\t'
                outline = outline + 'PomBase' + '\t'
                outline = outline + 'exon' + '\t'
                outline = outline + str(left) + '\t'
                outline = outline + str(right) + '\t'
                outline = outline + '.' + '\t'
                outline = outline + strand + '\t'
                outline = outline + '.' + '\t'
                outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; gene_type "' + gBT + '";'
                outfile.write(outline + '\n')
            for (chr,left,right,strand) in TranscriptDict[transcriptID]['CDS']:
                outline = chr + '\t'
                outline = outline + 'PomBase' + '\t'
                outline = outline + 'CDS' + '\t'
                outline = outline + str(left) + '\t'
                outline = outline + str(right) + '\t'
                outline = outline + '.' + '\t'
                outline = outline + strand + '\t'
                outline = outline + '.' + '\t'
                outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; gene_type "' + gBT + '";'
                outfile.write(outline + '\n')
    if sys.argv[2] == 'NCBI-Prok3':
        # GFF3 flavour in which IDs carry 'gene:', 'transcript:' and 'CDS:'
        # prefixes; the prefixes are stripped before the IDs are stored.
        linelist = gzip.open(GFF) if GFF.endswith('.gz') else open(GFF)
        GeneTypeDict = {}          # gene ID -> feature type of its transcript
        GeneParentDict = {}        # gene ID -> gene name (falls back to the ID)
        TranscriptParentDict = {}  # transcript ID -> gene ID
        TranscriptDict = {}        # transcript ID -> {'CDS': [...], 'exonic': [...]}
        ExonCDSParentDict = {}     # CDS ID -> transcript ID
        geneFeatures = ('gene', 'ncRNA_gene', 'pseudogene')
        transcriptFeatures = ('mRNA', 'rRNA', 'tRNA', 'RNase_MRP_RNA',
                              'RNase_P_RNA', 'SRP_RNA_encoding', 'snRNA',
                              'snoRNA', 'ncRNA', 'pseudogenic_transcript')
        # First pass: collect genes, transcripts and their exon/CDS intervals.
        for line in linelist:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            feature = fields[2]
            if feature in geneFeatures:
                geneKey = fields[8].split('ID=')[1].split(';')[0].split('gene:')[1]
                geneLabel = geneKey
                if 'Name=' in fields[8]:
                    geneLabel = fields[8].split('Name=')[1].split(';')[0]
                GeneParentDict[geneKey] = geneLabel
            elif feature in transcriptFeatures:
                transcriptID = fields[8].split('ID=')[1].split(';')[0].split('transcript:')[1]
                geneID = fields[8].split('Parent=')[1].split(';')[0].split('gene:')[1]
                # The transcript's feature type doubles as the gene type.
                GeneTypeDict[geneID] = feature
                TranscriptParentDict[transcriptID] = geneID
            elif feature == 'CDS':
                cdsKey = fields[8].split('ID=')[1].split(';')[0].split('CDS:')[1]
                transcriptID = fields[8].split('Parent=')[1].split(';')[0].split('transcript:')[1]
                ExonCDSParentDict[cdsKey] = transcriptID
                record = TranscriptDict.setdefault(transcriptID, {'CDS': [], 'exonic': []})
                record['CDS'].append((fields[0], int(fields[3]), int(fields[4]), fields[6]))
            elif feature == 'exon':
                transcriptID = fields[8].split('Parent=')[1].split(';')[0].split('transcript:')[1]
                record = TranscriptDict.setdefault(transcriptID, {'CDS': [], 'exonic': []})
                record['exonic'].append((fields[0], int(fields[3]), int(fields[4]), fields[6]))
        # Second pass: for every transcript write its sorted exons, then its
        # sorted CDS intervals, as GTF lines.
        for transcriptID in TranscriptDict:
            transcriptName = transcriptID
            geneID = TranscriptParentDict[transcriptID]
            geneName = GeneParentDict[geneID]
            gBT = GeneTypeDict[geneID]
            attributes = 'gene_id "%s"; transcript_id "%s"; gene_name "%s"; transcript_name "%s"; gene_type "%s";' % (geneID, transcriptID, geneName, transcriptName, gBT)
            record = TranscriptDict[transcriptID]
            record['exonic'].sort()
            record['CDS'].sort()
            for (featureName, key) in (('exon', 'exonic'), ('CDS', 'CDS')):
                for (chrom, start, end, orientation) in record[key]:
                    columns = [chrom, 'NCBI-Prok3', featureName, str(start), str(end), '.', orientation, '.', attributes]
                    outfile.write('\t'.join(columns) + '\n')
    if sys.argv[2] == 'Fugacium':
        # GFF3 flavour in which the ID= and Parent= attribute values are used
        # verbatim (no prefixes are stripped).  Exons and CDS intervals are
        # grouped per transcript and written out as GTF lines.
        if GFF.endswith('.gz'):
            linelist = gzip.open(GFF)
        else:
            linelist = open(GFF)
        GeneNameDict = {}          # gene names already handed out (for de-duplication)
        GeneTypeDict = {}          # gene ID -> feature type of its transcript
        GeneParentDict = {}        # gene ID -> (de-duplicated) gene name
        TranscriptParentDict = {}  # transcript ID -> gene ID
        TranscriptDict = {}        # transcript ID -> {'CDS': [...], 'exonic': [...]}
        ExonCDSParentDict = {}     # CDS ID -> transcript ID
        transcriptTypes = ('mRNA', 'rRNA', 'tRNA', 'RNase_MRP_RNA',
                           'RNase_P_RNA', 'SRP_RNA_encoding', 'snRNA',
                           'snoRNA', 'ncRNA', 'pseudogenic_transcript')
        try:
            for line in linelist:
                if line.startswith('#'):
                    continue
                fields = line.strip().split('\t')
                # Skip blank lines and anything that is not a full 9-column
                # GFF record (e.g. sequence lines after a ##FASTA directive)
                # instead of failing with an IndexError.
                if len(fields) < 9:
                    continue
                if fields[2] == 'gene':
                    ID = fields[8].split('ID=')[1].split(';')[0]
                    if 'Name=' in fields[8]:
                        name = fields[8].split('Name=')[1].split(';')[0]
                        # If this Name was already used by an earlier gene,
                        # append _1, _2, ... until an unused name is found.
                        if name in GeneNameDict:
                            N = 1
                            newname = name + '_' + str(N)
                            while newname in GeneNameDict:
                                N += 1
                                newname = name + '_' + str(N)
                            name = newname
                        GeneNameDict[name] = 1
                    else:
                        name = ID
                    GeneParentDict[ID] = name
                    continue
                if fields[2] in transcriptTypes:
                    transcriptID = fields[8].split('ID=')[1].split(';')[0]
                    geneID = fields[8].split('Parent=')[1].split(';')[0]
                    # The transcript's feature type doubles as the gene type.
                    GeneTypeDict[geneID] = fields[2]
                    TranscriptParentDict[transcriptID] = geneID
                    continue
                if fields[2] == 'CDS':
                    transcriptID = fields[8].split('Parent=')[1].split(';')[0]
                    # Key the mapping on the CDS record's own ID.  Previously
                    # the variable left over from the last 'gene' line was
                    # used here (or a NameError was raised when no gene line
                    # had been seen yet).
                    if 'ID=' in fields[8]:
                        cdsID = fields[8].split('ID=')[1].split(';')[0]
                        ExonCDSParentDict[cdsID] = transcriptID
                    if transcriptID not in TranscriptDict:
                        TranscriptDict[transcriptID] = {'CDS': [], 'exonic': []}
                    chr = fields[0]
                    left = int(fields[3])
                    right = int(fields[4])
                    strand = fields[6]
                    TranscriptDict[transcriptID]['CDS'].append((chr, left, right, strand))
                    continue
                if fields[2] == 'exon':
                    transcriptID = fields[8].split('Parent=')[1].split(';')[0]
                    if transcriptID not in TranscriptDict:
                        TranscriptDict[transcriptID] = {'CDS': [], 'exonic': []}
                    chr = fields[0]
                    left = int(fields[3])
                    right = int(fields[4])
                    strand = fields[6]
                    TranscriptDict[transcriptID]['exonic'].append((chr, left, right, strand))
                    continue
        finally:
            # Release the input handle even if parsing fails part-way.
            linelist.close()
        # Write, per transcript, the sorted exons followed by the sorted CDS
        # intervals.  The GTF source column is written as '.'.
        for transcriptID in TranscriptDict:
            transcriptName = transcriptID
            geneID = TranscriptParentDict[transcriptID]
            geneName = GeneParentDict[geneID]
            gBT = GeneTypeDict[geneID]
            attributes = 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; gene_type "' + gBT + '";'
            TranscriptDict[transcriptID]['exonic'].sort()
            TranscriptDict[transcriptID]['CDS'].sort()
            for (chr, left, right, strand) in TranscriptDict[transcriptID]['exonic']:
                outline = '\t'.join([chr, '.', 'exon', str(left), str(right), '.', strand, '.', attributes])
                outfile.write(outline + '\n')
            for (chr, left, right, strand) in TranscriptDict[transcriptID]['CDS']:
                outline = '\t'.join([chr, '.', 'CDS', str(left), str(right), '.', strand, '.', attributes])
                outfile.write(outline + '\n')

    outfile.close()
   
if __name__ == '__main__':
    # Run the conversion only when this file is executed as a script, so that
    # importing the module does not trigger command-line parsing and file I/O.
    run()
