##################################
#                                #
# Last modified 2019/03/27       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import os
import string
import subprocess
import sys
from sets import Set

def _parse_bound(spec):
    """Parse a min/max command-line bound.

    Returns a ``(value, quantile)`` pair:
      * ``'Inf'``    -> ``(None, None)``: the bound is not applied;
      * ``'q-n-m'``  -> ``(None, (n, m))``: the bound is the value found at
        the start of bin ``m`` (0-based) when the sorted values are cut into
        ``n`` bins; it is resolved later from the data;
      * otherwise    -> ``(float(spec), None)``: a fixed numeric bound.
    """
    if spec == 'Inf':
        return None, None
    if spec.startswith('q-'):
        parts = spec.split('-')
        try:
            n_bins = float(parts[1])
            bin_no = float(parts[2])
        except (IndexError, ValueError):
            sys.exit('malformed quantile specification: %s' % spec)
        # n == 0 would divide by zero and m > n would index past the data.
        if n_bins <= 0 or bin_no < 0 or bin_no > n_bins:
            sys.exit('quantile specification out of range: %s' % spec)
        return None, (n_bins, bin_no)
    return float(spec), None


def _option_value(flag):
    """Return the command-line argument that follows ``flag``."""
    pos = sys.argv.index(flag) + 1
    if pos >= len(sys.argv):
        sys.exit('%s requires a value' % flag)
    return sys.argv[pos]


def _iter_lines(path):
    """Yield the lines of ``path``, decompressing .gz / .bz2 files on the fly.

    Compressed files are streamed through ``gunzip`` / ``bzip2``.  The command
    is passed as an argument list (no shell), so file names containing spaces
    or shell metacharacters are handled safely.  Raises IOError if the
    decompressor exits with a non-zero status.
    """
    if path.endswith('.bz2'):
        command = ['bzip2', '-cd', path]
    elif path.endswith('.gz'):
        command = ['gunzip', '-c', path]
    else:
        command = None

    if command is None:
        handle = open(path)
        try:
            for line in handle:
                yield line
        finally:
            handle.close()
        return

    # universal_newlines=True makes the pipe yield text lines rather than
    # bytes on Python 3 as well as Python 2.
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            universal_newlines=True)
    try:
        for line in proc.stdout:
            yield line
    finally:
        proc.stdout.close()
        status = proc.wait()
    if status != 0:
        raise IOError('%s exited with status %d' % (' '.join(command), status))


def _line_value(fields, field_id, chr_field_id):
    """Return the number a line is filtered on, or None if it cannot be parsed.

    When ``chr_field_id`` is not None the value is the region length
    ``fields[chr_field_id + 2] - fields[chr_field_id + 1]``; otherwise it is
    ``float(fields[field_id])``.
    """
    try:
        if chr_field_id is not None:
            right = int(fields[chr_field_id + 2])
            left = int(fields[chr_field_id + 1])
            return right - left
        return float(fields[field_id])
    except (ValueError, IndexError):
        return None


def _quantile_threshold(sorted_values, n_bins, bin_no):
    """Return ``(index, value)`` of the first element of bin ``bin_no``.

    The index is clamped to the last element so that ``bin_no == n_bins``
    selects the maximum instead of indexing past the end of the list.
    """
    index = int(bin_no * (len(sorted_values) / n_bins))
    index = min(index, len(sorted_values) - 1)
    return index, sorted_values[index]


def run():
    """Filter the lines of a delimited text file by a numeric value.

    Command line (read from ``sys.argv``):
        datafilename fieldID min max outfilename
        [-regionLength chrFieldID] [-splitby string]

    Lines whose value lies within [min, max] are copied to ``outfilename``;
    lines starting with '#' are always copied.  The value is
    ``float(fields[fieldID])`` or, with ``-regionLength``, the difference
    between the two fields following ``chrFieldID``.  A bound given as 'Inf'
    is not applied; a bound given as 'q-n-m' is resolved from the data in a
    preliminary pass over the input.  Lines whose value cannot be parsed are
    reported on stdout and skipped.
    """
    # Five positional arguments are required, i.e. argv must hold at least 6.
    if len(sys.argv) < 6:
        print('usage: python %s  datafilename fieldID min max outfilename [-regionLength chrFieldID] [-splitby string]' % sys.argv[0])
        print('\tuse "Inf" to indicate greater or smaller than')
        print('\tuse "q-n-m" to indicate quartile, where n is the number of bins to be split, and m is the number of the bin (0-based)')
        print("\tNote: you can use gzipped and bzipped files, the script will detect those automatically if they end in .gz or .bz2")
        sys.exit(1)

    in_path = sys.argv[1]
    field_id = int(sys.argv[2])
    min_value, min_quantile = _parse_bound(sys.argv[3])
    max_value, max_quantile = _parse_bound(sys.argv[4])
    out_path = sys.argv[5]

    chr_field_id = None
    if '-regionLength' in sys.argv:
        chr_field_id = int(_option_value('-regionLength'))

    split_by = '\t'
    if '-splitby' in sys.argv:
        split_by = _option_value('-splitby')

    if min_quantile is not None or max_quantile is not None:
        # First pass: collect every value so quantile bounds can be resolved.
        # The same value (field or region length) is used here and in the
        # filtering pass, and '#' lines are NOT written here because the
        # filtering pass below already copies them once.
        values = []
        for line in _iter_lines(in_path):
            if line.startswith('#'):
                continue
            fields = line.strip().split(split_by)
            value = _line_value(fields, field_id, chr_field_id)
            if value is None:
                print('skipping line %s' % (fields,))
                continue
            values.append(value)
        if not values:
            sys.exit('no numeric values found in %s; cannot compute quantiles' % in_path)
        values.sort()
        print('%s %s' % (values[0], values[-1]))
        report = []
        if min_quantile is not None:
            index, min_value = _quantile_threshold(values, *min_quantile)
            report.append('%s %s' % (index, min_value))
        if max_quantile is not None:
            index, max_value = _quantile_threshold(values, *max_quantile)
            report.append('%s %s' % (index, max_value))
        print(' '.join(report))

    # Second pass: copy header lines and every line whose value is in range.
    with open(out_path, 'w') as outfile:
        for line in _iter_lines(in_path):
            if line.startswith('#'):
                outfile.write(line)
                continue
            fields = line.strip().split(split_by)
            value = _line_value(fields, field_id, chr_field_id)
            if value is None:
                print('skipping line %s' % (fields,))
                continue
            if min_value is not None and value < min_value:
                continue
            if max_value is not None and value > max_value:
                continue
            outfile.write(line)

# Run the filter only when this file is executed as a script, so that
# importing the module does not parse sys.argv or touch any files.
if __name__ == '__main__':
    run()

