##################################
#                                #
# Last modified 2023/08/25       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import random
import math
import os
from sets import Set

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s gene_prefix tblastn_folder chrom.sizes_folder outfilename [-round N] [-minSpecies N]' % sys.argv[0]
        sys.exit(1)


    gene = sys.argv[1]
    tbalstnF = sys.argv[2]
    CSF = sys.argv[3]
    outfilename = sys.argv[4]

    ChromSizeDict = {}

    minSpecies = 100
    if '-minSpecies' in sys.argv:
        minSpecies = int(sys.argv[sys.argv.index('-minSpecies')+1])
        print 'will not calculate entropy in the available species are less than', minSpecies

    roundN = 2
    if '-round' in sys.argv:
        roundN = int(sys.argv[sys.argv.index('-round')+1])
        print 'will use bin size rounded to the', roundN, 'decimal point'

    print 'importing chromosome size info'

    cmd = 'ls -1 ' + CSF
    p = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = p.readline()
        if line == '':
            break
        fields = line.strip().split('\t')
        species = line.split('.chrom.sizes')[0]
        print species
        CSfile = CSF + '/' + line.strip()
        ChromSizeDict[species] = {}
        linelist = open(CSfile)
        for L in linelist:
            fields = L.strip().split('\t') 
            chr = fields[0]
            chrSize = int(fields[1])
            if chrSize > 1000000:
                ChromSizeDict[species][chr] = chrSize

    relative_positions = []

    print 'importing tblastn results'

    outfile = open(outfilename, 'w')

    cmd = 'ls -1 ' + tbalstnF
    p = os.popen(cmd, "r")
    line = 'line'
    i=0
    while line != '':
        line = p.readline()
        if line == '':
            break
        if '.tblastn' not in line:
            continue
        species = line.split(gene + '.')[1].split('.tblastn')[0]
        TBNfile = tbalstnF + '/' + line.strip()
        linelist = open(TBNfile)
        L = linelist.readline()
        fields = L.strip().split('\t')
        if len(fields) < 10:
            continue
        chr = fields[1]
        pos = int(fields[8]) + 0.0
        EVal = float(fields[10])
        if EVal > 1e-10:
            continue
        if ChromSizeDict[species].has_key(chr):
            pass
        else:
            continue
        RelPos = pos/ChromSizeDict[species][chr]
        if RelPos > 0.5:
            RelPos = 1 - RelPos
        outline = species + '\t' + str(RelPos) + '\t' + chr + '\t' + str(pos) + '\t' + str(ChromSizeDict[species][chr])
        outfile.write(outline + '\n')
        relative_positions.append(RelPos)

    PosDict = {}
    for RelPos in relative_positions:
        strRelPos = round(RelPos, roundN)
        if PosDict.has_key(strRelPos):
            pass
        else:
            PosDict[strRelPos] = 0.0
        PosDict[strRelPos] += 1

    PosEnt = 0
    for strRelPos in PosDict.keys():
        p = PosDict[strRelPos]/len(relative_positions)
        print strRelPos, PosDict[strRelPos], len(relative_positions), p, PosEnt
        PosEnt += (-1)*p*math.log(p)

    if len(relative_positions) > minSpecies:
        outline = 'entropy:\t' + str(PosEnt)
    else:
        outline = 'entropy:\t' + 'nan'
    outfile.write(outline + '\n')

    outfile.close()

run()
