##################################
#                                #
# Last modified 5/6/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from cistematic.core.geneinfo import geneinfoDB
from cistematic.genomes import Genome

def getSequence(genome,chromosome,start,stop,sense):
    
    hg = Genome(genome)
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N'}
    chromosome = chromosome[3:len(chromosome)]
    if sense=='F' or sense=='+':
        sequence = string.upper(hg.sequence(chromosome,start,stop-start))
    if sense=='R' or sense=='-':
        preliminarysequence = string.upper(hg.sequence(chromosome,start,stop-start))
        sequence=''
        for i in range(len(preliminarysequence)):
            sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-i-1]]
    
    return sequence

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s  genome inputfilename outpurfilename [-top number] [-extend bp] [-startField fieldID]' % sys.argv[0]
        sys.exit(1)

    cachePages = 2000000
    
    genome = sys.argv[1]
    inputfilename = sys.argv[2]
    outfilename = sys.argv[3]
    doExtend=False
    fieldID=0
    if '-extend' in sys.argv:
        doExtend=True
        extension = int(sys.argv[sys.argv.index('-extend') + 1])
        print 'will extend regions with', extension ,'bp'
    if '-startField' in sys.argv:
        fieldID= int(sys.argv[sys.argv.index('-startField') + 1])


    outfile = open(outfilename, 'w')
    hg = Genome(genome)
    idb = geneinfoDB()
    
    inputdatafile = open(inputfilename)
    inputdatalist = inputdatafile.readlines()
    if doExtend:
        for line in inputdatalist:
            if 'random' in line:
                continue
            fields = line.split('\n')[0].split('\t')
            sense = 'F'
            chromosome = fields[fieldID]
            start = int(fields[fieldID+1])-extension
            stop = int(fields[fieldID+2])+extension
            try:
                sequence=getSequence(genome,chromosome,start,stop,sense)
                outfile.write('>'+chromosome+':'+str(start)+'-'+str(stop)+'\n')
                outfile.write(sequence)
                outfile.write('\n')
            except:
                print 'could not find', chromosome, start, stop
    else:
        for line in inputdatalist:
            if 'random' in line:
                continue
            fields = line.split('\n')[0].split('\t')
            sense = 'F'
            chromosome = fields[fieldID]
            start = int(fields[fieldID+1])
            stop = int(fields[fieldID+2])
            try:
                sequence=getSequence(genome,chromosome,start,stop,sense)
                outfile.write('>'+chromosome+':'+str(start)+'-'+str(stop)+'\n')
                outfile.write(sequence)
                outfile.write('\n')
            except:
                print 'could not find', chromosome, start, stop
   
run()
