##################################
#                                #
# Last modified 5/6/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

# This script takes the genome database and a file with gene expression values from an RNA-seq experiment
# and returns a lits of all genes present in both, with chromosome number, chromosome positions and orientation 
#
try:
	import psyco
	psyco.full()
except:
	print 'psyco not running'
from cistematic.core.geneinfo import geneinfoDB
from cistematic.genomes import Genome
import sys

if len(sys.argv) < 3:
    print 'usage: python %s genome expressionfile outfile' % sys.argv[0]
    sys.exit(1)

genome = sys.argv[1]
expressionfile = sys.argv[2]
outfilename = sys.argv[3]

hg = Genome(genome)
idb = geneinfoDB()

expressionDataFile = open(expressionfile)
expressionData = expressionDataFile.readlines()
RegulatedGenes = []

geneinfoDict = idb.getallGeneInfo(genome)
featDict = hg.getallGeneFeatures()
geneIDs = featDict.keys()
genes = []
i=0

for k in geneIDs:

#    # discard pseudogenes and entries in the database for which there is no geneinfo 
#
#    if str(featDict[k][0][0])=='PSEUDO':
#        continue
#    if idb.getGeneInfo((genome,k))==[]:
#        continue

    genes.append(i)
    genes[i] = []
    genes[i].append(k)
    if idb.getGeneInfo((genome,k))==[]:
        genes[i].append('LOC'+str(k))
    else:
        genes[i].append(idb.getGeneInfo((genome,k))[0])
    genes[i].append('chr' + str(featDict[k][0][1]))
    leftPos = []
    rightPos = []
    for feature in featDict[k]:
        leftPos.append(feature[2])
        rightPos.append(feature[3])
    genes[i].append(min(leftPos))
    genes[i].append(max(rightPos))
    genes[i].append(str(featDict[k][0][4]))
    print i
    i+=1

outfile = open(outfilename,'w')

expressiondatagenes = []
for line in expressionData:
    fields = line.split('\t')
    geneID = fields[0]
    expressiondatagenes.append(geneID)

print 'genes', len(genes)
print 'expressiondatagenes', len(expressiondatagenes)
print 'len(geneIDs)', len(geneIDs)

for gene in genes:
    if gene[0] in expressiondatagenes:
        outfile.write(gene[0])
        outfile.write('\t')
        outfile.write(gene[1])
        outfile.write('\t')
        outfile.write(str(gene[2]))
        outfile.write('\t')
        outfile.write(str(gene[3]))
        outfile.write('\t')
        outfile.write(str(gene[4]))
        outfile.write('\t')
        outfile.write(gene[5])
        outfile.write('\n')

outfile.close()