##################################
#                                #
# Last modified 2022/03/15       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set
import os
import gzip
import string

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s inputfilename sequenceFieldID labelFieldID(s)|count outfilename [-minSeqLen number] [-ACGTonly]' % sys.argv[0]
        print '\tNote: use - instead of an input file name to indicate stanard input'
        print '\tNote: use - instead of an output file name to indicate stanard output'
        print '\tNote: if there is no read ID in the file and you want the read to be numbers, use "count" for the labelFieldID(s) parameter'
        sys.exit(1)

    inputfilename = sys.argv[1]
    seqID = int(sys.argv[2])
    labelIDs=[]
    doNumber = False
    if sys.argv[3] == 'count':
        doNumber = True
    else:
        fields = sys.argv[3].split(',')
        for ID in fields:
            labelIDs.append(int(ID))
        labelIDs.sort()
    outputfilename = sys.argv[4]

    doACGT = False
    if '-ACGTonly' in sys.argv:
        doACGT = True

    doStdInput = False
    if inputfilename == '-':
        doStdInput = True

    doStdOutput = False
    if outputfilename == '-':
        doStdOutput = True

    if not doStdOutput:
        print labelIDs

    doMinSeqLen=False
    if '-minSeqLen' in sys.argv:
        doMinSeqLen=True
        minSeqLen = int(sys.argv[sys.argv.index('-minSeqLen')+1])

    if not doStdOutput:
        outfile = open(outputfilename, 'w')

    if doStdInput:
        lineslist  = sys.stdin
    else:
        lineslist  = open(inputfilename)
    i=0
    for line in lineslist:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        if len(fields) < seqID+1:
            continue
        if doMinSeqLen and len(fields[seqID].strip()) < minSeqLen:
            continue
        if doACGT:
            if set(fields[seqID].upper()) == set(['A', 'C', 'T', 'G']):
                pass
            else:
                continue
        outline = '>' 
        i+=1
        if doNumber:
            outline = outline + 'read' + str(i) + '|'
        else:
            for ID in labelIDs:
                outline = outline + fields[ID] + '|'
        if not doStdOutput:
            outfile.write(outline[0:-1] + '\n')
            outline = fields[seqID]
            outfile.write(outline.strip() + '\n')
        else:
            print outline[0:-1]
            print fields[seqID].strip()

    if not doStdOutput:   
        outfile.close()

run()

