##################################
#                                #
# Last modified 02/15/2013       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set
import os
import subprocess

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s input minFASTQscore minReadLength QualityScoreScale output [-stdout]'
        print ' QualityScore options: [Sanger | Solexa64 | Illumina1.3 | Illumina1.5 | Illumina1.8]'
        print ' use - if the input is ffrom standard input'
        print ' use -stdout if you want to print to standard output'
        sys.exit(1)

    fastq = sys.argv[1]
    minFASTQscore = int(sys.argv[2])
    minReadLength = int(sys.argv[3])
    Scale = sys.argv[4]
    outfilename = sys.argv[5]

    doStdInput = False
    if fastq == '-':
        doStdInput = True

    doStdOutput = False
    if outfilename == '-stdout':
        doStdOutput = True

    if not doStdOutput:
        print 'will remove all residues after finding a residue with a quality score lower than', minFASTQscore, 'or fater a N residue'
        print 'will only keep reads longer than', minReadLength, 'after trimming'

    QualityScoresScale = {}

    QualityScoresScale['!'] = 33
    QualityScoresScale['"'] = 34
    QualityScoresScale['#'] = 35
    QualityScoresScale['$'] = 36
    QualityScoresScale['%'] = 37
    QualityScoresScale['&'] = 38
    QualityScoresScale["'"] = 39
    QualityScoresScale['('] = 40
    QualityScoresScale[')'] = 41
    QualityScoresScale['*'] = 42
    QualityScoresScale['+'] = 43
    QualityScoresScale[','] = 44
    QualityScoresScale['-'] = 45
    QualityScoresScale['.'] = 46
    QualityScoresScale['/'] = 47
    QualityScoresScale['0'] = 48
    QualityScoresScale['1'] = 49
    QualityScoresScale['2'] = 50
    QualityScoresScale['3'] = 51
    QualityScoresScale['4'] = 52
    QualityScoresScale['5'] = 53
    QualityScoresScale['6'] = 54
    QualityScoresScale['7'] = 55
    QualityScoresScale['8'] = 56
    QualityScoresScale['9'] = 57
    QualityScoresScale[':'] = 58
    QualityScoresScale[';'] = 59
    QualityScoresScale['<'] = 60
    QualityScoresScale['='] = 61
    QualityScoresScale['>'] = 62
    QualityScoresScale['?'] = 63
    QualityScoresScale['@'] = 64
    QualityScoresScale['A'] = 65
    QualityScoresScale['B'] = 66
    QualityScoresScale['C'] = 67
    QualityScoresScale['D'] = 68
    QualityScoresScale['E'] = 69
    QualityScoresScale['F'] = 70
    QualityScoresScale['G'] = 71
    QualityScoresScale['H'] = 72
    QualityScoresScale['I'] = 73
    QualityScoresScale['J'] = 74
    QualityScoresScale['K'] = 75
    QualityScoresScale['L'] = 76
    QualityScoresScale['M'] = 77
    QualityScoresScale['N'] = 78
    QualityScoresScale['O'] = 79
    QualityScoresScale['P'] = 80
    QualityScoresScale['Q'] = 81
    QualityScoresScale['R'] = 82
    QualityScoresScale['S'] = 83
    QualityScoresScale['T'] = 84
    QualityScoresScale['U'] = 85
    QualityScoresScale['V'] = 86
    QualityScoresScale['W'] = 87
    QualityScoresScale['X'] = 88
    QualityScoresScale['Y'] = 89
    QualityScoresScale['Z'] = 90
    QualityScoresScale['['] = 91
    QualityScoresScale['\\'] = 92
    QualityScoresScale[']'] = 93
    QualityScoresScale['^'] = 94
    QualityScoresScale['_'] = 95
    QualityScoresScale['`'] = 96
    QualityScoresScale['a'] = 97
    QualityScoresScale['b'] = 98
    QualityScoresScale['c'] = 99
    QualityScoresScale['d'] = 100
    QualityScoresScale['e'] = 101
    QualityScoresScale['f'] = 102
    QualityScoresScale['g'] = 103
    QualityScoresScale['h'] = 104
    QualityScoresScale['i'] = 105
    QualityScoresScale['j'] = 106
    QualityScoresScale['k'] = 107
    QualityScoresScale['l'] = 108
    QualityScoresScale['m'] = 109
    QualityScoresScale['n'] = 110
    QualityScoresScale['o'] = 111
    QualityScoresScale['p'] = 112
    QualityScoresScale['q'] = 113
    QualityScoresScale['r'] = 114
    QualityScoresScale['s'] = 115
    QualityScoresScale['t'] = 116
    QualityScoresScale['u'] = 117
    QualityScoresScale['v'] = 118
    QualityScoresScale['w'] = 119
    QualityScoresScale['x'] = 120
    QualityScoresScale['y'] = 121
    QualityScoresScale['z'] = 122
    QualityScoresScale['{'] = 123
    QualityScoresScale['|'] = 124
    QualityScoresScale['}'] = 125
    QualityScoresScale['~'] = 126

    if Scale == 'Sanger' or Scale == 'Illumina1.8':
        QualityScoreOffset = 33
    if Scale == 'Solexa64' or Scale == 'Illumina1.3' or Scale == 'Illumina1.5':
        QualityScoreOffset = 64    

    if not doStdOutput:
        print Scale, QualityScoreOffset
        outfile = open(outfilename, 'w')

    TotalReads=0
    PostFilteringReadLengthDistribution={}
    PostFilteringReadLengthDistribution[0]=0

    if doStdInput:
        lineslist  = sys.stdin
    else:
        lineslist  = open(fastq)
    i=0
    pos=1
    for line in lineslist:
        if i % 20000000 == 0:
            if not doStdOutput:
                print str(i/4000000) + 'M reads processed'
        i+=1
        if pos==1:
            read={}
            if line.startswith('@'):
                pass
            else:
                print 'fastq file format broken, exiting'
                print pos, i, line.strip()
                sys.exit(1)
            read['ID']=line.strip()
            pos=2
            continue
        if pos==2:
            read['sequence']=line.strip()
            pos=3
            continue
        if pos==3:
            if line.startswith('+'):
                pass
            else:
                print 'fastq file format broken, exiting'
                print pos, i, line.strip()
                sys.exit(1)
            pos=4
            continue
        if pos==4:
            read['QualityScores']=line.strip()
            pos=1
            firstN = read['sequence'].find('N')
            if firstN == -1:
                firstN = len(read['sequence'])
            scorebreak = len(read['QualityScores'])
            for n in range(len(read['QualityScores'])):
                Q = QualityScoresScale[read['QualityScores'][n]]
                if Q - QualityScoreOffset < minFASTQscore:
                    scorebreak = n
                    break
            if firstN < minReadLength or scorebreak < minReadLength:
                PostFilteringReadLengthDistribution[0]+=1
                continue
            else:
                trimmedReadLength = min(firstN,scorebreak)
                if PostFilteringReadLengthDistribution.has_key(trimmedReadLength):
                    PostFilteringReadLengthDistribution[trimmedReadLength]+=1
                else:
                    PostFilteringReadLengthDistribution[trimmedReadLength]=1
                if not doStdOutput:
                    outfile.write(read['ID'] + '\n')
                    outfile.write(read['sequence'][0:trimmedReadLength] + '\n')
                    outfile.write('+\n')
                    outfile.write(read['QualityScores'][0:trimmedReadLength] + '\n')
                else:
                    print read['ID']
                    print read['sequence'][0:trimmedReadLength]
                    print '+'
                    print read['QualityScores'][0:trimmedReadLength]
            continue

    if not doStdOutput:
        outfile.close()
        outfile = open(outfilename +'.log', 'w')
        print 'Read length distribution after filtering:'
        keys = PostFilteringReadLengthDistribution.keys()
        keys.sort()
        for k in keys:
            print k, PostFilteringReadLengthDistribution[k]
            outfile.write(str(k) + '\t' + str(PostFilteringReadLengthDistribution[k]) + '\n')
        
run()