##################################
#                                #
# Last modified 7/8/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
import string
from cistematic.core.geneinfo import geneinfoDB
from cistematic.genomes import Genome



def run():

    if len(sys.argv) < 4:
        print 'usage: python %s genome repeatdirectory RepeatType outputfilename' % sys.argv[0]
        sys.exit(1)

    cachePages = 2000000
    
    genome = sys.argv[1]
    rmaskdir = sys.argv[2]
    RepeatType = sys.argv[3]
    outfilename = sys.argv[4]
    doExtend=False

    outfile = open(outfilename, 'w')
    hg = Genome(genome)
    idb = geneinfoDB()

    files = os.listdir(rmaskdir)
    for filename in files:
        if '_rmsk' not in filename:
            continue
        print filename
        infile = open(rmaskdir + '/' + filename)
        linelist = infile.readlines()
        i=0
        for line in linelist:
            if i % 10000 == 0:
                print i, 'lines processed'
            i+=1
            fields=line.strip().split('\t')
            if fields[11][len(fields[11])-len(RepeatType):len(fields[11])]==RepeatType:
                start=int(fields[6])
                stop=int(fields[7])
                chromosome=fields[5]
                chr=chromosome[3:len(chromosome)]
                sequence=hg.sequence(chr,start,stop-start)
                outfile.write('>'+chromosome+':'+str(start)+'-'+str(stop)+'\n')
                outfile.write(sequence)
                outfile.write('\n')
    outfile.close()
  
run()
