##################################
#                                #
# Last modified 2024/02/13       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import gzip
import string
from sets import Set

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s  datafilename listofwantedgenes datafileFieldIDs outfilename [-singleName name] [-startFields fieldID1,....] [-containedInLine] [-splitline string] [-ignoreCap] [-header]' % sys.argv[0]
        print '\note: use - for stdin'
        sys.exit(1)

    expressiondatafilename = sys.argv[1]
    listofgenesfilename = sys.argv[2]
    outfilename = sys.argv[4]
    fields = sys.argv[3].split(',')
    fieldIDs = []
    for f in fields:
        fieldIDs.append(int(f))
    fieldIDs.sort()

    doCap=False
    if '-ignoreCap' in sys.argv:
        doCap=True

    doContainedInLine=False
    doMultipleFields=False
    if '-containedInLine' in sys.argv:
        doContainedInLine=True

    doHeader=False
    if '-header' in sys.argv:
        doHeader=True

    startFields=[0]
    if '-startFields' in sys.argv:
        startFields =[]
        fields = sys.argv[sys.argv.index('-startFields') + 1].split(',')
        for f in fields:
            startFields.append(int(f))
        startFields.sort()

    doSplitLine=False
    if '-splitline' in sys.argv:
        doSplitLine=True
        splitLineString=sys.argv[sys.argv.index('-splitline')+1]
        if splitLineString=='upp':
            splitLineString='"'
        print splitLineString

    doSingleName=False
    if '-singleName' in sys.argv:
        doSingleName=True
        singleName=sys.argv[sys.argv.index('-singleName')+1]
        print singleName

    outfile = open(outfilename, 'w')

    wantedgenes = {}
    wantedgenesList = []

  
    lineslist = open(listofgenesfilename)
    for line in lineslist:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        if len(fields) <= max(startFields):
            print 'skipping line:', fields
            continue
        label = []
        for ID in startFields:
            if doCap:
                label.append(fields[ID].upper())
            else:
                label.append(fields[ID])
        wantedgenesList.append(tuple(label))

    wantedgenesList=list(Set(wantedgenesList))
    for gene in wantedgenesList:
        wantedgenes[gene]=''
    if doSingleName:
        wantedgenes={}
        singleName = (singleName)
        wantedgenes[singleName]=''

    if expressiondatafilename == '-':
        lineslist = sys.stdin
    elif expressiondatafilename.endswith('.gz'):
        lineslist = gzip.open(expressiondatafilename)
    else:
        lineslist = open(expressiondatafilename)
    if doHeader:
        line = lineslist.readline()
        outfile.write(line)
    i=0
    if doContainedInLine:
        for line in lineslist:
            i=i+1
            if i % 100000 == 0:
                print i
            for G in wantedgenes.keys():
                if G[0] in line:
                    outfile.write(line)
                    break
    else:
        for line in lineslist:
            i=i+1
            if line[0]=='#':
                continue
            if i % 10000 == 0:
                print i
            if doSplitLine:
                fields = line.strip().split(splitLineString)
            else:
                fields = line.strip().split('\t')
            if len(fields) < (max(fieldIDs) + 1):
                print 'skipping', fields
                continue
            G=[]
            for ID in fieldIDs:
                if doCap:
                    G.append(fields[ID].upper())
                else:
                    G.append(fields[ID])
            G=tuple(G)
            if wantedgenes.has_key(G):
                outfile.write(line)

    outfile.close()

run()

