##################################
#                                #
# Last modified 05/15/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set
import os
import subprocess

# Characters that make up the dashed ruler printed under the hit-table header.
_RULER_CHARS = frozenset(' -')

# Stripped lines that end the part of a hit table this script converts.
_TABLE_END_LINES = (
    '------ inclusion threshold ------',
    '[No hits detected that satisfy reporting thresholds]',
)

# Prefixes of stripped lines inside a hit table that carry no hit record.
_NON_HIT_PREFIXES = ('Hit scores:', 'Description:', 'rank')


def _print_usage():
    """Print the command-line help text to stdout."""
    print('usage: python %s cmscan output [-skip skipped-list]' % sys.argv[0])
    print('\tonly use this script for prokaryote genomes!!!')
    print('\tskipped-list format: one name per line; Note: the script will look for names beginning with these strings!!!!')


def _read_skip_prefixes(path):
    """Return the stripped, non-blank lines of the file at *path* as a tuple."""
    with open(path) as handle:
        entries = (line.strip() for line in handle)
        # A blank entry would be a prefix of every model name and would
        # therefore silently discard all hits, so blank lines are ignored.
        return tuple(entry for entry in entries if entry)


def _parse_hits(lines, skip_prefixes=()):
    """Collect the hit-table rows found in an iterable of cmscan output lines.

    Returns a dict mapping each query name (second token of a ``Query:``
    line) to a list of ``(name, start, end, strand)`` tuples in input order.
    Rows whose model name begins with any string in *skip_prefixes* are
    dropped.  The second and later occurrences of a model name (counted over
    the whole input, not per query) get a ``-<count>`` suffix.
    """
    genes = {}
    occurrences = {}
    query = ''
    in_table = False
    for line in lines:
        text = line.strip()
        if not text or line.startswith('#'):
            continue
        if line.startswith('Query:'):
            query = text.split()[1]
            in_table = True
            continue
        if not in_table:
            continue
        if text.startswith(_NON_HIT_PREFIXES):
            continue
        if _RULER_CHARS.issuperset(text):
            # Dashed ruler under the column headers.
            continue
        if text in _TABLE_END_LINES or text.startswith('Hit alignments:'):
            # Everything up to the next 'Query:' line is ignored, which also
            # drops the rows listed below the inclusion threshold.
            in_table = False
            continue

        # Collapse whitespace runs, then glue the '!' marker onto the rank so
        # the model name, start, end and strand land at indices 4 to 7.
        fields = ' '.join(text.split()).replace(' !', '!').split(' ')
        name = fields[4]
        # str.startswith accepts a tuple; an empty tuple never matches.
        if name.startswith(skip_prefixes):
            continue

        count = occurrences.get(name, 0) + 1
        occurrences[name] = count
        if count > 1:
            name = name + '-' + str(count)

        start = int(fields[5])
        end = int(fields[6])
        strand = fields[7]
        genes.setdefault(query, []).append((name, start, end, strand))
    return genes


def _gtf_line(chrom, name, start, end, strand):
    """Format one hit as a tab-separated GTF 'gene' record (no newline)."""
    attributes = ('gene_id "%s"; gene_name "%s"; '
                  'transcript_id "%s"; transcript_name "%s";'
                  % (name, name, name, name))
    columns = [
        chrom,
        'Infernal',
        'gene',
        str(min(start, end)),  # coordinates are written low-to-high
        str(max(start, end)),
        '1000',
        strand,
        '.',
        attributes,
    ]
    return '\t'.join(columns)


def run():
    """Convert the hit tables of a cmscan output file into a GTF file.

    Reads its arguments from ``sys.argv``:

    * ``sys.argv[1]`` -- path of the cmscan output to read
    * ``sys.argv[2]`` -- path of the GTF file to write
    * ``-skip FILE``  -- optional; FILE holds one string per line, and every
      hit whose model name begins with one of those strings is omitted

    Prints the usage text and exits with status 1 when fewer than two
    positional arguments are given or when ``-skip`` has no file after it.
    Records are written grouped by query name in sorted order; within a
    query they keep the order in which they appear in the input.
    """
    argv = sys.argv
    # Both the input and the output path are mandatory.
    if len(argv) < 3:
        _print_usage()
        sys.exit(1)

    cmscan_path = argv[1]
    gtf_path = argv[2]

    skip_prefixes = ()
    if '-skip' in argv:
        list_index = argv.index('-skip') + 1
        if list_index >= len(argv):
            _print_usage()
            sys.exit(1)
        skip_prefixes = _read_skip_prefixes(argv[list_index])

    with open(cmscan_path) as handle:
        genes = _parse_hits(handle, skip_prefixes)

    # The output is opened only after parsing succeeded, so a malformed input
    # does not leave a truncated output file behind.
    with open(gtf_path, 'w') as outfile:
        for chrom in sorted(genes):
            for (name, start, end, strand) in genes[chrom]:
                outfile.write(_gtf_line(chrom, name, start, end, strand) + '\n')

# Run the conversion only when this file is executed as a script, so that
# importing the module does not parse sys.argv or exit the interpreter.
if __name__ == '__main__':
    run()