##################################
#                                #
# Last modified 2018/04/23       #
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import os
import random	
from sets import Set

def _iter_lines(path):
    """Yield the raw lines of *path*, decompressing it when needed.

    Files ending in ``.bz2`` are streamed through ``bzip2 -cd`` and files
    ending in ``.gz`` through ``gunzip -c``; anything else is opened
    directly.  The external tools are started with an argument list (no
    shell), so paths containing spaces or shell metacharacters are safe.

    Exits the program with an error message if the decompressor returns a
    non-zero status.
    """
    import subprocess  # local import: only this helper needs it

    if path.endswith('.bz2'):
        cmd = ['bzip2', '-cd', path]
    elif path.endswith('.gz'):
        cmd = ['gunzip', '-c', path]
    else:
        with open(path) as handle:
            for line in handle:
                yield line
        return

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            universal_newlines=True)
    try:
        for line in proc.stdout:
            yield line
    finally:
        # Always release the pipe and reap the child process.
        proc.stdout.close()
        status = proc.wait()
    if status != 0:
        sys.exit('error: "%s" exited with status %d'
                 % (' '.join(cmd), status))


def _read_guides(path):
    """Return ``(safes, genes)``: the stripped guide lines of *path*.

    Lines starting with ``#`` and blank lines are skipped.  Lines starting
    with ``0Safe_safe`` go to ``safes``; lines starting with ``0None_none``
    are dropped (they are never printed); every other line goes to
    ``genes``.  File order is preserved in both lists.
    """
    safes = []
    genes = []
    for raw in _iter_lines(path):
        if raw.startswith('#'):
            continue
        guide = raw.strip()
        if not guide:
            continue
        if guide.startswith('0None_none'):
            continue
        if guide.startswith('0Safe_safe'):
            safes.append(guide)
        else:
            genes.append(guide)
    return safes, genes


def _read_wanted(path, field_index):
    """Return the set of values found in column *field_index* of *path*.

    The file is tab-separated; lines starting with ``#`` and blank lines
    are skipped.  Each line is stripped before splitting, so trailing
    empty columns are not counted.

    Raises IndexError if a non-blank line has no column *field_index*.
    """
    wanted = set()
    for raw in _iter_lines(path):
        if raw.startswith('#'):
            continue
        stripped = raw.strip()
        if not stripped:
            continue
        wanted.add(stripped.split('\t')[field_index])
    return wanted


def run():
    """Print a random subset of safe guides plus all guides of wanted genes.

    Command-line arguments (read from ``sys.argv``):
        1. guide file (plain, ``.gz`` or ``.bz2``)
        2. wanted-gene file (tab-separated; plain, ``.gz`` or ``.bz2``)
        3. zero-based column of the wanted file holding the gene identifier
        4. ``geneID`` or ``geneName``: whether the identifier is matched
           against the first or the second ``_``-separated token of each
           guide line
        5. number of ``0Safe_safe`` guides to sample at random

    Output goes to stdout: first the sampled safe guides, then every
    matching gene guide in file order.  Exits with status 1 on a usage
    error or when more safe guides are requested than are available.
    """
    # Five positional arguments are required, i.e. argv must hold six items.
    if len(sys.argv) < 6:
        print('usage: python %s sgRNA_human_10_ref.csv.gz wanted fieldID geneID|geneName N_safes' % sys.argv[0])
        print('\tnote: the script will print to stdout by default')
        sys.exit(1)

    guides_path = sys.argv[1]
    wanted_path = sys.argv[2]
    wanted_field = int(sys.argv[3])
    id_type = sys.argv[4]
    n_safes = int(sys.argv[5])

    # Position of the gene identifier among the '_'-separated guide tokens.
    token_index_by_type = {'geneID': 0, 'geneName': 1}
    if id_type not in token_index_by_type:
        sys.exit('error: identifier type must be geneID or geneName, got %r'
                 % id_type)
    token_index = token_index_by_type[id_type]

    safes, genes = _read_guides(guides_path)
    wanted = _read_wanted(wanted_path, wanted_field)

    if n_safes < 0 or n_safes > len(safes):
        sys.exit('error: requested %d safe guides but %d are available'
                 % (n_safes, len(safes)))

    for safe in random.sample(safes, n_safes):
        print(safe)

    for guide in genes:
        tokens = guide.split('_')
        # A guide without the requested token cannot match any wanted gene.
        if len(tokens) <= token_index:
            continue
        if tokens[token_index] in wanted:
            print(guide)
           

# Only run when executed as a script, so importing this module has no
# side effects (argument parsing, file reads, sys.exit).
if __name__ == '__main__':
    run()
