##################################
#                                #
# Last modified 5/6/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
import random

def run():
    """Sample rows of a sites-genes-pairs table into three position classes.

    Arguments are read from sys.argv:

        sys.argv[1]  sites-genes-pairs file, tab-delimited. Its first line is
                     copied to the output unchanged; every other line is a
                     data row.
        sys.argv[2]  exclusion file, tab-delimited. The first column of each
                     line is an identifier; data rows whose second column
                     equals one of these identifiers are dropped.
        sys.argv[3]  maximum number of rows to write per class (int).
        sys.argv[4]  output filename.
        -limitsize N optional; drop data rows where
                     int(column 9) - int(column 8) is greater than N.

    Data rows are classified by the integer in their 10th column
    (index 9), using inclusive bounds:

        Upstream     -60000 .. -1000
        1kbProximal   -1000 ..  1000
        Downstream     1000 .. 60000

    Because the bounds are inclusive, a value of exactly -1000 or 1000 is
    placed in two classes.
    NOTE(review): confirm that this overlap is intended.

    For each class, if more than `NumberToBeSelected` rows are available a
    random sample of that size is written; otherwise all rows are written.

    Exits with status 1 after printing a usage line when fewer than the
    four required arguments are supplied. Malformed (non-blank) data rows
    still raise IndexError / ValueError.
    """
    # Four positional arguments are required, so argv must hold at least
    # five entries (argv[0] is the script name).
    if len(sys.argv) < 5:
        print('usage: python %s  sites-genes-pairs-filename genestobeexcludedfilename NumberToBeSelected outfilename [-limitsize size]' % sys.argv[0])
        sys.exit(1)

    sitesgenespairsfilename = sys.argv[1]
    genestobeexcludedfilename = sys.argv[2]
    number = int(sys.argv[3])
    outfilename = sys.argv[4]

    doLimitSize = '-limitsize' in sys.argv
    size = None
    if doLimitSize:
        print('doLimitSize')
        size = int(sys.argv[sys.argv.index('-limitsize') + 1])

    # Collect the identifiers to exclude (first column of each line) in a
    # set so the per-row membership test below is O(1).
    excluded = set()
    genestobeexcludedfile = open(genestobeexcludedfilename)
    try:
        for line in genestobeexcludedfile:
            if not line.strip():
                continue
            excluded.add(line.rstrip('\r\n').split('\t')[0])
    finally:
        genestobeexcludedfile.close()

    sitesgenespairsfile = open(sitesgenespairsfilename)
    try:
        sitesgenespairslist = sitesgenespairsfile.readlines()
    finally:
        sitesgenespairsfile.close()

    # The first line is passed through to the output untouched; an empty
    # input file simply yields an empty header.
    if sitesgenespairslist:
        line1 = sitesgenespairslist[0]
    else:
        line1 = ''

    sitesgenespairsUpstream = []
    sitesgenespairs1kbProximal = []
    sitesgenespairsDownstream = []
    for line in sitesgenespairslist[1:]:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) carry no fields.
            continue
        fields = line.rstrip('\r\n').split('\t')
        if fields[1] in excluded:
            continue
        span = int(fields[8]) - int(fields[7])
        # Per-row diagnostic output retained from the original script.
        print(span)
        if doLimitSize and span > size:
            continue
        position = int(fields[9])
        # Independent tests (not elif): boundary values fall in two classes.
        if -60000 <= position <= -1000:
            sitesgenespairsUpstream.append(line)
        if -1000 <= position <= 1000:
            sitesgenespairs1kbProximal.append(line)
        if 1000 <= position <= 60000:
            sitesgenespairsDownstream.append(line)

    print('len(sitesgenespairsUpstream) %d' % len(sitesgenespairsUpstream))
    print('len(sitesgenespairs1kbProximal) %d' % len(sitesgenespairs1kbProximal))
    print('len(sitesgenespairsDownstream) %d' % len(sitesgenespairsDownstream))
    print('number %d' % number)

    # NOTE(review): the middle heading reads '1kbUpstream Sites' although it
    # introduces the 1kbProximal rows; the text is kept as emitted originally.
    sections = (
        ('\n\nUpstream Sites\n\n', sitesgenespairsUpstream),
        ('\n\n1kbUpstream Sites\n\n', sitesgenespairs1kbProximal),
        ('\n\nDownstream Sites\n\n', sitesgenespairsDownstream),
    )

    # The output file is opened only once all input has been parsed, and is
    # always closed so buffered data is flushed to disk.
    outfile = open(outfilename, 'w')
    try:
        outfile.write(line1)
        for heading, candidates in sections:
            outfile.write(heading)
            outfile.writelines(_pick_lines(candidates, number))
    finally:
        outfile.close()


def _pick_lines(candidates, number):
    """Return a random sample of `number` items, or all items if not enough."""
    if number < len(candidates):
        return random.sample(candidates, number)
    return candidates

# Script entry point: run() is invoked unconditionally whenever this file is
# executed (there is no __main__ guard, so importing the module runs it too).
run()
