import os
import sys
import re
import string
import time
import random

def getLineList(hitfile, verbose=False):
	"""Read a text file and return its lines as a list.

	hitfile -- path of the file to read.
	verbose -- when true, print the number of lines that were read.

	Returns the result of readlines(): one string per line, each still
	carrying its line terminator.
	"""
	# The context manager closes the file even if reading raises.
	with open(hitfile) as hitfile_stream:
		line_list = hitfile_stream.readlines()
	if verbose:
		# Single-argument parenthesised print behaves the same on Python 2 and 3.
		print(len(line_list))
	return line_list

def loadRegions(regionList):
	"""Group region lengths by chromosome.

	regionList -- iterable of whitespace-delimited lines. The second field
	              is used as the chromosome name; the third and fourth
	              fields are parsed as integers.

	Returns a dict mapping each chromosome name to a list of
	(fourth field - third field) values, in input order.

	Blank or whitespace-only lines are skipped. A non-blank line with fewer
	than four fields raises IndexError, and a non-integer coordinate raises
	ValueError.
	"""
	read_dictionary = {}
	for line in regionList:
		fields = line.split()
		if not fields:
			# e.g. a trailing empty line in the regions file
			continue
		length = int(fields[3]) - int(fields[2])
		read_dictionary.setdefault(fields[1], []).append(length)
	return read_dictionary


def seqtostr(f1):
	"""Return the cleaned sequence text of an open FASTA-style file.

	f1 -- file-like object providing readlines().

	Lines whose first character is '>' are dropped. From the remaining text
	every lowercase ASCII letter, digit, 'N' and whitespace character is
	removed, and the result is upper-cased. Whitespace includes carriage
	returns and tabs, so files with CRLF line endings do not inflate the
	returned length.
	"""
	# Join once instead of growing a string line by line.
	body = ''.join(
		line for line in f1.readlines() if not line.startswith('>')
	)
	# The removals are independent per-character deletions, so one
	# character-class pass gives the same result as chained substitutions.
	cleaned = re.sub(r'[a-z\dN\s]+', '', body)
	return cleaned.upper()


# Chromosomes processed, in output order: chr1..chr19, then chrX and chrY.
mouse_chromosome_tags = ['chr' + str(i) for i in range(1, 20)]
mouse_chromosome_tags.append('chrX')
mouse_chromosome_tags.append('chrY')


def pickFakeInterval(length, seq_length):
	"""Pick a random interval of the given length inside [0, seq_length].

	length     -- required interval length (end - start).
	seq_length -- length of the sequence the interval must fit in.

	Returns a (start, end) tuple with 0 <= start and end <= seq_length, or
	None when no such interval exists (negative length, or length greater
	than seq_length).
	"""
	if not 0 <= length <= seq_length:
		return None
	# A bounded draw always terminates; rejection sampling would spin
	# forever for a region longer than the sequence.
	start = random.randint(0, seq_length - length)
	return start, start + length


def main(argv):
	"""Write randomly placed regions matching the lengths of real regions.

	argv -- argument list in sys.argv layout:
	        [program, regions file, sequence path, output file].

	For every tag in mouse_chromosome_tags the file
	<sequence path><tag>.fa is read and its cleaned length is printed.
	For every region of that chromosome listed in the regions file, one
	tab-separated line
	    fake_<tag>_<n>  <tag>  <start>  <end>  0
	is written to the output file. Regions that cannot fit in the cleaned
	sequence are reported on stderr and skipped.

	Exits with status 1 after printing the usage line when fewer than three
	arguments are supplied.
	"""
	try:
		regions_file = argv[1]
		sequence_file_path = argv[2]
		output_file = argv[3]
	except IndexError:
		print('proper usage:  <regions file> <sequence path> <output_file>')
		# Stop here instead of failing later with a NameError.
		sys.exit(1)

	read_dictionary = loadRegions(getLineList(regions_file, False))

	with open(output_file, 'w') as outfile:
		for chromosome in mouse_chromosome_tags:
			# Plain concatenation: the sequence path must already end with
			# a path separator (or be a filename prefix).
			sequence_file = sequence_file_path + chromosome + '.fa'
			with open(sequence_file) as seq:
				seq_length = len(seqtostr(seq))
			print('%s %d' % (chromosome, seq_length))
			if chromosome not in read_dictionary:
				continue
			for total, entry in enumerate(read_dictionary[chromosome], 1):
				name = 'fake_' + chromosome + '_' + str(total)
				interval = pickFakeInterval(int(entry), seq_length)
				if interval is None:
					sys.stderr.write(
						'skipping %s: region length %d does not fit in '
						'sequence of length %d\n'
						% (name, int(entry), seq_length)
					)
					continue
				fake_start, fake_end = interval
				outfile.write('\t'.join(
					[name, chromosome, str(fake_start), str(fake_end), '0']
				) + '\n')


if __name__ == '__main__':
	main(sys.argv)
		
		


