##################################
#                                #
# Last modified 06/14/2016       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
from collections import Counter

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s GTF1 species list_of_HOG_fasta_files windows_size fraction_match outfilename' % sys.argv[0]
        print '\tExample of a gene name in the OrthologousGroups.txt file:'
        print '\t\tParamecium_biaurelia_V1-4:PBIGNG33362|PBIGNG33362|PBIGNT33362|PBIGNT33362|scaffold_0398|2934,25827'
        print '\tin this case the prefix name would be Paramecium_biaurelia_V1-4'
        sys.exit(1)

    GTF = sys.argv[1]
    prefix = sys.argv[2]
    FFlist = sys.argv[3]
    window = int(sys.argv[4])
    fraction = float(sys.argv[5])
    outfilename = sys.argv[6]

    HOGDict = {}
    GeneToHOGDict = {}
    linelist = open(FFlist)
    for fileline in linelist:
        if line.startswith('#'):
            continue
        F = fileline.split('/')[-1].split('.fa')[0]
        if HOGDict.has_key(F):
            print 'duplicate file names detected, exiting'
            print F
            sys.exit(1)
        HOGDict[F] = {}
        fasta = fileline.strip().split('\t')[0]
        inputdatafile = open(fasta)
        for line in inputdatafile:
            if line[0]=='>':
                species = line.strip().split(']')[0].split('[')[1]
                if species != prefix:
                    continue
                gene = line.strip().split('>')[1].split('[')[0].strip()
                HOGDict[F].append(gene)
                GeneToHOGDict[gene] = F

    print 'finished inputting HOG'

    GeneDict = {}
    linelist=open(GTF)
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        if fields[2] != 'exon':
            continue
        chr = fields[0]
        left = int(fields[3])
        right = int(fields[4])
        strand = fields[6]
        geneID = fields[8].split('gene_id "')[1].split('";')[0]
        transcriptID=fields[8].split('transcript_id "')[1].split('";')[0]
        if GeneDict.has_key(geneID):
            pass
        else:
            GeneDict[geneID]={}
        if GeneDict[geneID].has_key(transcriptID):
            pass
        else:
            GeneDict[geneID][transcriptID]=[]
        GeneDict[geneID][transcriptID].append((chr,left,right,strand))

    print 'finished inputting', GTF

    chrGenesDict = {}
    chrGenesDictCoordToID = {}
    chrGenesDictIdToCoord = {}

    for geneID in GeneDict.keys():
        chr = GeneDict[geneID][GeneDict[geneID].keys()[0]][0][0]
        strand = GeneDict[geneID][GeneDict[geneID].keys()[0]][0][-1]
        coordinates = []
        for transcriptID in GeneDict[geneID].keys():
            for (chr,left,right,strand) in GeneDict[geneID][transcriptID]:
                coordinates.append(left)
                coordinates.append(right)
        left = min(coordinates)
        right = max(coordinates)
        if chrGenesDict.has_key(chr):
            pass
        else:
            chrGenesDict[chr]=[]
            chrGenesDictCoordToID[chr]={}
            chrGenesDictIdToCoord[chr]={}
        chrGenesDict[chr].append((left,right,geneID,strand))
        chrGenesDictCoordToID[chr][(left,right,geneID,strand)] = geneID
        chrGenesDictIdToCoord[chr][geneID] = (left,right,geneID,strand)

    for chr in chrGenesDict.keys():
        chrGenesDict[chr].sort()

    pos = 0
    ParalogyDict = {}

    for chr in chrGenesDict.keys():
        windowsGeneIDs = {}
        while pos <= len(chrGenesDict[chr]) - window:
            geneIDs = []
            for i in range(pos,pos + window):
                geneID = chrGenesDictCoordToID[chr][chrGenesDict[chr][i]]
                geneIDs.append(geneID)
                windowsGeneIDs[pos] = geneIDs
                pos +=1
        for pos in windowsGeneIDs:
            geneHOGDict = {}
            chrtochrDict = {}
            for geneID in windowsGeneIDs[pos]:
                F = GeneToHOGDict[gene]
                if len(HOGDict[F]) > 1:
                    geneHOGDict[geneID] = []
                    for gene in HOGDict[F]:
                        if gene != geneID:
                            geneHOGDict[geneID].append(gene)
                            HOGchr = GeneDict[gene][GeneDict[gene].keys()[0]][0][0]
                            if chrtochrDict[chr].has_key(HOGchr):
                                pass
                            else:
                                chrtochrDict[chr][HOGchr] = []
                                chrtochrDict[chr][HOGchr].append(gene)
                else:
                    continue
            if len(geneHOGDict.keys())/(window + 0.0) < fraction:
                continue
            HOGgenecounts = []
            for HOGchr in chrtochrDict[chr].keys():
                HOGgenecounts.append((len(chrtochrDict[chr][HOGchr]),HOGchr))
            HOGgenecounts.sort()
            HOGgenecounts.reverse()
            for i in range(len(HOGgenecounts))
                if HOGgenecounts[i][0]/(window + 0.0) < fraction:
                    break
                HOGcoordinates = []
                HOGchr = HOGgenecounts[i][1]
                for gene in chrtochrDict[chr][HOGchr]:
                    (left,right,geneID,strand) = chrGenesDictIdToCoord[HOGchr][gene]
                    HOGcoordinates.append(chrGenesDict[chr].index((left,right,geneID,strand)))
                HOGcoordinates.sort()
                if max(HOGcoordinates) - min(HOGcoordinates) > window:
                    maxC = max(HOGcoordinates)
                    minC = min(HOGcoordinates)
                    while maxC - minC > window:
                        for j in range(len(HOGcoordinates)):
                            for k in range(len(HOGcoordinates)-j):
                                maxC = HOGcoordinates[-(1+k)]
                                minC = HOGcoordinates[j]
                    if k == 0:
                        HOGcoordinates = HOGcoordinates[j:]
                    else:
                        HOGcoordinates = HOGcoordinates[j:-k]
                if len(HOGcoordinates) >= fraction/(window + 0.0):
                    if ParalogyDict.has_key(chr):
                        pass
                    else:            
                        ParalogyDict[chr] = {}
                    if ParalogyDict.has_key(HOGchr):
                        pass
                    else:
                        ParalogyDict[chr][HOGchr] = {}
                    ParalogyDict[chr][HOGchr][(pos,pos + window)] = {} 
                    ParalogyDict[chr][HOGchr][(pos,pos + window)][(HOGcoordinates[0],HOGcoordinates[1])] = HOGcoordinates

    for chr in ParalogyDict.keys():
        for HOGchr in ParalogyDict[chr],keys():
            for (p1,p1+w) in ParalogyDict[chr][HOGchr].keys():
                

    outfile.close()

run()
