##################################
#                                #
# Last modified 9/21/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import random
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB
from commoncode import *

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s genome regionfile chrField rds-files-directory outputfilename [-cache size] [-addreadwhenzero] [-nomulti]' % sys.argv[0]
        sys.exit(1)
    
    genome = sys.argv[1]
    regionfile = sys.argv[2]
    chrField = int(sys.argv[3])
    rdsdirectory = sys.argv[4]
    outfilename = sys.argv[5]

    outfile = open(outfilename, 'w')

    doMulti=True
    if '-nomulti' in sys.argv:
        doMulti=False 
   
    cachePages = -1
    doCache = False
    if '-cache' in sys.argv:
        doCache = True
        cachePages =  int(sys.argv[sys.argv.index('-cache') + 1])

    doAddRead=False
    if '-addreadwhenzero' in sys.argv:
        doAddRead=True

    hg = Genome(genome)

    RegionDict={}
    randomRegionDict=[]

    listoflines = open(regionfile)
    lineslist = listoflines.readlines()
    for line in lineslist:
        if line[0]=='#' or line[0]=='G':
            continue
        fields = line.split('\t')
        chromosome=fields[chrField]
        length=int(fields[chrField+2])-int(fields[chrField+1])
        if RegionDict.has_key(chromosome):
            RegionDict[chromosome].append(length)
        else:
            RegionDict[chromosome]=[]
            RegionDict[chromosome].append(length)
    outlineDict={}
    for chromosome in RegionDict.keys():
        lenchr = len(hg.getChromosomeSequence(chromosome[3:len(chromosome)]))
        for regionLength in RegionDict[chromosome]:
            randomPos=random.randint(0, lenchr)
            ID=chromosome+':'+str(randomPos)+'-'+str(randomPos+regionLength)
            randomRegionDict.append((chromosome,randomPos,randomPos+regionLength,ID))
            outlineDict[ID]=chromosome+'\t'+str(randomPos)+'\t'+str(randomPos+regionLength)

    outline='#chr\tstart\tstop\t'
    files = os.listdir(rdsdirectory)
    for hitfilename in files:
        if '.rds' not in hitfilename:
            continue
        print hitfilename
        outline=outline+hitfilename.split('.rds')[0]+'\t'
        hitRDS = readDataset(hitfilename, verbose = True, cache=True)
        #sqlite default_cache_size is 2000 pages
        if cachePages > hitRDS.getDefaultCacheSize():
            hitRDS.setDBcache(cachePages)
        metadata = hitRDS.getMetadata()
        readlen = int(metadata['readsize'])
        dataType = metadata['dataType']
        readlenRange = range(readlen)
        normalizeBy = len(hitRDS)/1000000.
        i=0
        for (chr,rmin,rmax,ID) in randomRegionDict:
            i+=1
            if i % 1000==0:
                print i
            value=hitRDS.getCounts(chrom=chr, rmin=rmin, rmax=rmax, uniqs=True, multi=doMulti, splices=False, reportCombined=True)
            if doAddRead:
                value=value+1
            RPKM=value/(normalizeBy*((rmax-rmin)/1000.0))
            outlineDict[ID]=outlineDict[ID]+'\t'+str(RPKM)
        hitRDS=''
    outfile.write(outline + '\n')
    for ID in outlineDict.keys():
        outfile.write(outlineDict[ID]+'\n')
    outfile.close()
   
run()
