##################################
#                                #
# Last modified 11/25/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import random
from sets import Set

def run():
    """Randomly subsample a tab-delimited file by first-column ID.

    Command line: ``python <script> inputfilename number outfilename``

    The ID of a line is the first tab-separated field of the line after
    surrounding whitespace has been stripped.  The input file is read twice:
    the first pass collects the distinct IDs, from which ``number`` IDs are
    drawn at random without replacement; the second pass copies every input
    line whose ID was drawn to the output file, unchanged and in input order.

    Progress is reported on stdout every 1,000,000 lines in each pass, and
    the number of IDs picked is printed before the output is written.

    Raises:
        SystemExit: with status 1 if fewer than three arguments are given.
        ValueError: if ``number`` is not an integer, is negative, or is
            larger than the number of distinct IDs in the input file.
    """
    # sys.argv[0] is the script name, so three positional arguments mean at
    # least four entries; sys.argv[3] is read below.
    if len(sys.argv) < 4:
        print('usage: python %s inputfilename number outfilename' % sys.argv[0])
        sys.exit(1)

    input_path = sys.argv[1]
    N = int(sys.argv[2])
    outfilename = sys.argv[3]

    # First pass: gather the distinct IDs directly into a set so duplicates
    # never accumulate in memory.
    unique_ids = set()
    with open(input_path) as listofgenesfile:
        for i, line in enumerate(listofgenesfile, 1):
            if i % 1000000 == 0:
                print('%d lines processed' % i)
            unique_ids.add(line.strip().split('\t', 1)[0])

    # random.sample needs a sequence, hence the list(); the sample is kept as
    # a set for constant-time membership tests in the second pass.
    chosen = set(random.sample(list(unique_ids), N))

    print('read IDs picked, outputing results')

    print(len(chosen))

    # Second pass: copy through every line that belongs to a chosen ID.
    with open(outfilename, 'w') as outfile:
        with open(input_path) as lineslist:
            for i, line in enumerate(lineslist, 1):
                if line.strip().split('\t', 1)[0] in chosen:
                    outfile.write(line)
                if i % 1000000 == 0:
                    print('%d lines processed' % i)

# Script entry point: run() is invoked unconditionally at module level, so it
# also executes (reading sys.argv) if this file is imported rather than run.
run()
