##################################
#                                #
# Last modified 06/06/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
import math

def convertCoordinates(S):

    strand = '+' 
    if S.startswith('complement'):
        strand = '-'
    newS = S.replace('(','').replace(')','').replace('<','').replace('>','').replace('complement','').replace('join','')
    exons = newS.split(',')
    newCoords = []
    for e in exons:
        if '..' not in e:
            e1 = str(int(e)-1)
            e2 = e
        else:
            e1 = e.split('..')[0]
            e2 = e.split('..')[1]
        newCoords.append((e1,e2))

    return (strand,newCoords)

def tabline(line):

    newline = line
    while '  ' in newline:
        newline = newline.replace('  ',' ')
    
    return newline.strip().replace(' ','\t')

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s GenBank outfile ' % sys.argv[0]
        print 'Note: the scripts behavior is unpredictable for file containing more than one isoform per gene'
        sys.exit(1)

    GB = sys.argv[1]
    GTF = sys.argv[2]

    GeneDict={}

    CurrentChr = ''
    InFeatures = False
    InSource = False
    CurrentFeature = {}
    InJoin = False
    InF1 = False

    outfile = open(GTF,'w')

    linelist=open(GB)
    i=0
    for line in linelist:
        i+=1
        if line.strip().startswith('/') or InF1 or InJoin:
            newline = line.strip()
        else:
            newline = tabline(line)
#        print i, '............'
#        print newline
#        print 'InJoin', InJoin
#        print 'InF1', InF1
#        print CurrentFeature
        fields = line.strip().split('\t')
        if line.strip() == 'ORIGIN':
            InFeatures = False
            continue
        if line.startswith('FEATURES'):
            InFeatures = True
            continue
        if InFeatures:
            fields = newline.strip().split('\t')
            if fields[0] == 'source':
                InSource = True
                continue
            elif fields[0].startswith('/'):
                pass
            else:
                InSource = False
                pass
            if InSource:
                if fields[0].startswith('/chromosome'):
                    CurrentChr = 'chr' + fields[0].split('/chromosome="')[1].split('"')[0]
                continue
            if fields[0].startswith('/'):
                f1 = fields[0].split('/')[1].split('=')[0]
                if '=' not in fields[0]:
                    CurrentFeature[f1] = True
                    continue
                if '"' not in fields[0]:
                    continue
                f2 = fields[0].split('"')[1].split('"')[0]
                CurrentFeature[f1] = f2
                if fields[0].endswith('"'):
                    InF1 = False
                else:
                    InF1 = True
                    continue
            elif InJoin and InF1:
                CurrentFeature['coordinates'] += newline
                if newline.endswith(')'):
                    InJoin = False
            elif InJoin:
                CurrentFeature['coordinates'] += newline
                if newline.endswith(')'):
                    InJoin = False
            elif InF1:
                CurrentFeature[f1] += newline
                if newline.endswith('"'):
                    InF1 = False
            else:
                if CurrentFeature == {}:
                    pass
                else:
                    if CurrentFeature['type'] == 'gene':
                        pass
                    if CurrentFeature['type'] == 'rRNA':
                        coordinates = convertCoordinates(CurrentFeature['coordinates'])
                        if 'db_xref' in CurrentFeature.keys():
                            geneID = CurrentFeature['db_xref']
                        else:
                            geneID = 'rRNA' + '-' + CurrentChr + '-' + coordinates[1][0][1] + '-' + coordinates[1][-1][1] + '-strand=' + coordinates[0]
                        geneName = CurrentFeature['product']
                        transcriptID = geneID
                        transcriptName = geneName
                        strand = coordinates[0]
                        if 'pseudo' in CurrentFeature.keys():
                            TT = 'pseudogene'
                        else:
                            TT = 'rRNA'
                        for (left,right) in coordinates[1]:
                            outline = CurrentChr + '\t' + 'GenBank' + '\t' + 'exon' + '\t' + left + '\t' + right + '\t' + '.' + '\t' + strand + '\t' + '.' + '\t'
                            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; transcript_type "' + TT + '";'
                            outfile.write(outline + '\n')
                    if CurrentFeature['type'] == 'tRNA':
                        coordinates = convertCoordinates(CurrentFeature['coordinates'])
                        geneName = CurrentFeature['product'] + '-'
                        if 'codon_recognized' in CurrentFeature.keys():
                            geneName = geneName + CurrentFeature['codon_recognized']
                        elif 'note' in CurrentFeature.keys():
                            if 'recognized: ' in CurrentFeature['note']:
                                geneName = geneName + CurrentFeature['note'].split('recognized: ')[1]
                            elif 'recognized ' in CurrentFeature['note']:
                                geneName = geneName + CurrentFeature['note'].split('recognized ')[1]
                            else:
                                print 'no codon found in tRNA note'
                                print CurrentFeature['note']
                                geneName = geneName + CurrentFeature['note']
#                                sys.exit(1)
                        else:
                            pass
                        geneID = geneName + '-' + CurrentChr + '-' + coordinates[1][0][1] + '-' + coordinates[1][-1][1] + '-strand=' + coordinates[0]
                        transcriptID = geneID
                        transcriptName = geneName
                        strand = coordinates[0]
                        if 'pseudo' in CurrentFeature.keys():
                            TT = 'pseudogene'
                        else:
                            TT = 'tRNA'
                        for (left,right) in coordinates[1]:
                            outline = CurrentChr + '\t' + 'GenBank' + '\t' + 'exon' + '\t' + left + '\t' + right + '\t' + '.' + '\t' + strand + '\t' + '.' + '\t'
                            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; transcript_type "' + TT + '";'
                            outfile.write(outline + '\n')
                    if CurrentFeature['type'] == 'ncRNA':
                        coordinates = convertCoordinates(CurrentFeature['coordinates'])
                        if 'product' in CurrentFeature.keys():
                            geneName = CurrentFeature['product']
                        else:
                            geneName = CurrentFeature['note']
                        if 'locus_tag' in CurrentFeature.keys():
                            geneID = CurrentFeature['locus_tag'] + '-' + CurrentChr + '-' + coordinates[1][0][1] + '-' + coordinates[1][-1][1] + '-strand=' + coordinates[0]
                        else:
                            geneID = CurrentFeature['ncRNA_class'] + '-' + CurrentChr + '-' + coordinates[1][0][1] + '-' + coordinates[1][-1][1] + '-strand=' + coordinates[0]
                        transcriptID = geneID
                        transcriptName = geneName
                        strand = coordinates[0]
                        if 'pseudo' in CurrentFeature.keys():
                            TT = 'pseudogene'
                        else:
                            TT = CurrentFeature['ncRNA_class']
                        for (left,right) in coordinates[1]:
                            outline = CurrentChr + '\t' + 'GenBank' + '\t' + 'exon' + '\t' + left + '\t' + right + '\t' + '.' + '\t' + strand + '\t' + '.' + '\t'
                            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; transcript_type "' + TT + '";'
                            outfile.write(outline + '\n')
                    if CurrentFeature['type'] == 'mRNA':
                        coordinates = convertCoordinates(CurrentFeature['coordinates'])
                        if 'product' in CurrentFeature.keys():
                            geneName = CurrentFeature['product']
                        elif 'gene' in CurrentFeature.keys():
                            geneName = CurrentFeature['gene']
                        else:
                            geneName = CurrentFeature['locus_tag']
                        if CurrentFeature.has_key('db_xref'):
                            geneID = CurrentFeature['db_xref']
                        elif CurrentFeature.has_key('locus_tag'):
                            geneID = CurrentFeature['locus_tag']
                        elif CurrentFeature.has_key('locus_tag'):
                            geneID = CurrentFeature['locus_tag']
                        elif CurrentFeature.has_key('gene'):
                            geneID = CurrentFeature['gene']
                        else:
                            geneID = 'gene' + '-' + CurrentChr + '-' + coordinates[1][0][1] + '-' + coordinates[1][-1][1] + '-strand=' + coordinates[0]
                        transcriptID = geneID
                        transcriptName = geneName
                        strand = coordinates[0]
                        if 'pseudo' in CurrentFeature.keys():
                            TT = 'pseudogene'	
                        else:
                            TT = 'protein_coding'
                        for (left,right) in coordinates[1]:
                            outline = CurrentChr + '\t' + 'GenBank' + '\t' + 'exon' + '\t' + left + '\t' + right + '\t' + '.' + '\t' + strand + '\t' + '.' + '\t'
                            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; transcript_type "' + TT + '";'
                            outfile.write(outline + '\n')
                    if CurrentFeature['type'] == 'CDS':
                        coordinates = convertCoordinates(CurrentFeature['coordinates'])
                        if 'product' in CurrentFeature.keys():
                            geneName = CurrentFeature['product']
                        else:
                            geneName = CurrentFeature['gene']
                        if CurrentFeature.has_key('db_xref'):
                            geneID = CurrentFeature['db_xref']
                        elif CurrentFeature.has_key('locus_tag'):
                            geneID = CurrentFeature['locus_tag']
                        elif CurrentFeature.has_key('gene'):
                            geneID = CurrentFeature['gene']
                        else:
                            geneID = 'gene' + '-' + CurrentChr + '-' + coordinates[1][0][1] + '-' + coordinates[1][-1][1] + '-strand=' + coordinates[0]
                        transcriptID = geneID
                        transcriptName = geneName
                        strand = coordinates[0]
                        if 'pseudo' in CurrentFeature.keys():
                            TT = 'pseudogene'
                        else:
                            TT = 'protein_coding'
                        for (left,right) in coordinates[1]:
                            outline = CurrentChr + '\t' + 'GenBank' + '\t' + 'CDS' + '\t' + left + '\t' + right + '\t' + '.' + '\t' + strand + '\t' + '.' + '\t'
                            outline = outline + 'gene_id "' + geneID + '"; transcript_id "' + transcriptID + '"; gene_name "' + geneName + '"; transcript_name "' + transcriptName + '"; transcript_type "' + TT + '";'
                            outfile.write(outline + '\n')
                if len(fields) < 2:
                    continue
                CurrentFeature = {}
                CurrentFeature['type'] = fields[0]
                CurrentFeature['coordinates'] = fields[1]
                if 'join' in fields[1]:
                    if fields[1].endswith(')'):
                        InJoin = False
                    else:
                        InJoin = True
               

    outfile.close()
   
run()
