##################################
#                                #
# Last modified 10/05/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB
from commoncode import *

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s genome tile_size rds-files-directory IDtoRDSfilename outputfilename [-nomulti] [-cache size] ' % sys.argv[0]

        sys.exit(1)
    
    genome = sys.argv[1]
    tile = int(sys.argv[2])
    rdsdirectory = sys.argv[3]
    IDtoRDSfilename = sys.argv[4]
    outfilename = sys.argv[5]

    outfile = open(outfilename, 'w')

    doMulti=True
    if '-nomulti' in sys.argv:
        doMulti=False 
   
    cachePages = -1
    doCache = False
    if '-cache' in sys.argv:
        doCache = True
        cachePages =  int(sys.argv[sys.argv.index('-cache') + 1])

    hg = Genome(genome)
    ChromosomesList = hg.allChromNames()
    print ChromosomesList
    files = os.listdir(rdsdirectory)
    ID=0
    IDtoRDSdict={}
    for hitfilename in files:
        if hitfilename.endswith('.rds'):
            pass
        else:
            print hitfilename, 'not an rds file; skipping'
            continue
        ID+=1
        IDtoRDSdict[ID]=hitfilename

    for chr in ChromosomesList:
        outline = chr + '\n'
        outfile.write(outline)
    outfile.close()

    outfile = open(IDtoRDSfilename, 'w')
    keys=IDtoRDSdict.keys()
    keys.sort()
    for ID in keys:
        outline=str(ID)+' => '+'"'+IDtoRDSdict[ID]+'"\n'
        outfile.write(outline)
    outfile.close()

    outfile = open(outfilename, 'r')
    ChromosomesList=[]
    linelist=outfile.readlines()
    for line in linelist:
        ChromosomesList.append(line.strip())
    outfile.close()

    ChromosomesList.sort()
    print ChromosomesList

    outfile = open(outfilename, 'w')
    outfile.close()

    print IDtoRDSdict
    for chromosome in ChromosomesList:
        outfile = open(outfilename, 'a')
        PartitionDict={}
        lenchr = len(hg.getChromosomeSequence(chromosome))
        print chromosome
        for i in range(0,lenchr,tile):
            if i % 10000000 == 0:
                print chromosome, i
            PartitionDict[i]={}
        IDtoRDSdictKeys=IDtoRDSdict.keys()
        IDtoRDSdictKeys.sort()
        posList=PartitionDict.keys()
        posList.sort()
        for ID in IDtoRDSdictKeys:
            hitfilename=IDtoRDSdict[ID]
            print hitfilename
            hitfilename=rdsdirectory+hitfilename
            hitRDS = readDataset(hitfilename, verbose = True, cache=True)
            #sqlite default_cache_size is 2000 pages
            if cachePages > hitRDS.getDefaultCacheSize():
                hitRDS.setDBcache(cachePages)
            metadata = hitRDS.getMetadata()
            readlen = int(metadata['readsize'])
            dataType = metadata['dataType']
            readlenRange = range(readlen)
            normalizeBy = len(hitRDS)/1000000.
            chr='chr'+chromosome
            for pos in posList:
                rmin=pos
                rmax=pos+tile
                value=hitRDS.getCounts(chrom=chr, rmin=rmin, rmax=rmax, uniqs=True, multi=doMulti, splices=False, reportCombined=True)
                RPKM=value/(normalizeBy*((tile)/1000.0))
                if RPKM != 0:
                    PartitionDict[pos][ID]=RPKM
                if pos % 10000000==0:
                    print ID, hitfilename, chr, rmin, rmax, RPKM
        hitRDS=''
        print 'writing output to file; chr', chr
        for pos in posList:
            ModList=PartitionDict[pos].keys()
            ModList.sort()
            for mod in ModList:
                outline=str(chr)+'\t'+str(pos)+'\t'+str(mod)+'\t'+str(PartitionDict[pos][mod])+'\n'
                outfile.write(outline)
        outfile.close()
       
run()
