##################################
#                                #
# Last modified 11/20/2013       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import os
import subprocess

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s FASTQ1 FASTQ2 [-convertToFASTA] [-filter minFASTQscore minReadLength QualityScoreScale]'
        print '\tQualityScore options: [Sanger | Solexa64 | Illumina1.3 | Illumina1.5 | Illumina1.8]'
        print '\tFiles ending with .bz2 and .gz will be read from stdin by unzipping them with the correspondign program'
        print '\tthe script will print to stdout by default'
        sys.exit(1)

    fastq1 = sys.argv[1]
    fastq2 = sys.argv[2]

    doFA = False
    if '-convertToFASTA' in sys.argv:
        doFA = True
    
    doFilter = False
    if '-filter' in sys.argv:
        doFilter = True
        minFASTQscore = int(sys.argv[sys.argv.index('-filter')+1])
        minReadLength = int(sys.argv[sys.argv.index('-filter')+2])
        Scale = sys.argv[sys.argv.index('-filter')+3]
        if Scale == 'Sanger' or Scale == 'Illumina1.8':
            QualityScoreOffset = 33
        if Scale == 'Solexa64' or Scale == 'Illumina1.3' or Scale == 'Illumina1.5':
            QualityScoreOffset = 64    

    QualityScoresScale = {}

    QualityScoresScale['!'] = 33
    QualityScoresScale['"'] = 34
    QualityScoresScale['#'] = 35
    QualityScoresScale['$'] = 36
    QualityScoresScale['%'] = 37
    QualityScoresScale['&'] = 38
    QualityScoresScale["'"] = 39
    QualityScoresScale['('] = 40
    QualityScoresScale[')'] = 41
    QualityScoresScale['*'] = 42
    QualityScoresScale['+'] = 43
    QualityScoresScale[','] = 44
    QualityScoresScale['-'] = 45
    QualityScoresScale['.'] = 46
    QualityScoresScale['/'] = 47
    QualityScoresScale['0'] = 48
    QualityScoresScale['1'] = 49
    QualityScoresScale['2'] = 50
    QualityScoresScale['3'] = 51
    QualityScoresScale['4'] = 52
    QualityScoresScale['5'] = 53
    QualityScoresScale['6'] = 54
    QualityScoresScale['7'] = 55
    QualityScoresScale['8'] = 56
    QualityScoresScale['9'] = 57
    QualityScoresScale[':'] = 58
    QualityScoresScale[';'] = 59
    QualityScoresScale['<'] = 60
    QualityScoresScale['='] = 61
    QualityScoresScale['>'] = 62
    QualityScoresScale['?'] = 63
    QualityScoresScale['@'] = 64
    QualityScoresScale['A'] = 65
    QualityScoresScale['B'] = 66
    QualityScoresScale['C'] = 67
    QualityScoresScale['D'] = 68
    QualityScoresScale['E'] = 69
    QualityScoresScale['F'] = 70
    QualityScoresScale['G'] = 71
    QualityScoresScale['H'] = 72
    QualityScoresScale['I'] = 73
    QualityScoresScale['J'] = 74
    QualityScoresScale['K'] = 75
    QualityScoresScale['L'] = 76
    QualityScoresScale['M'] = 77
    QualityScoresScale['N'] = 78
    QualityScoresScale['O'] = 79
    QualityScoresScale['P'] = 80
    QualityScoresScale['Q'] = 81
    QualityScoresScale['R'] = 82
    QualityScoresScale['S'] = 83
    QualityScoresScale['T'] = 84
    QualityScoresScale['U'] = 85
    QualityScoresScale['V'] = 86
    QualityScoresScale['W'] = 87
    QualityScoresScale['X'] = 88
    QualityScoresScale['Y'] = 89
    QualityScoresScale['Z'] = 90
    QualityScoresScale['['] = 91
    QualityScoresScale['\\'] = 92
    QualityScoresScale[']'] = 93
    QualityScoresScale['^'] = 94
    QualityScoresScale['_'] = 95
    QualityScoresScale['`'] = 96
    QualityScoresScale['a'] = 97
    QualityScoresScale['b'] = 98
    QualityScoresScale['c'] = 99
    QualityScoresScale['d'] = 100
    QualityScoresScale['e'] = 101
    QualityScoresScale['f'] = 102
    QualityScoresScale['g'] = 103
    QualityScoresScale['h'] = 104
    QualityScoresScale['i'] = 105
    QualityScoresScale['j'] = 106
    QualityScoresScale['k'] = 107
    QualityScoresScale['l'] = 108
    QualityScoresScale['m'] = 109
    QualityScoresScale['n'] = 110
    QualityScoresScale['o'] = 111
    QualityScoresScale['p'] = 112
    QualityScoresScale['q'] = 113
    QualityScoresScale['r'] = 114
    QualityScoresScale['s'] = 115
    QualityScoresScale['t'] = 116
    QualityScoresScale['u'] = 117
    QualityScoresScale['v'] = 118
    QualityScoresScale['w'] = 119
    QualityScoresScale['x'] = 120
    QualityScoresScale['y'] = 121
    QualityScoresScale['z'] = 122
    QualityScoresScale['{'] = 123
    QualityScoresScale['|'] = 124
    QualityScoresScale['}'] = 125
    QualityScoresScale['~'] = 126

    if fastq1.endswith('.bz2'):
        cmd1 = 'bzip2 -cd ' + fastq1
    elif fastq1.endswith('.gz'):
        cmd1= 'gunzip -c ' + fastq1
    else:
        cmd1 = 'cat ' + fastq1
    p1 = os.popen(cmd1, "r")

    if fastq2.endswith('.bz2'):
        cmd2 = 'bzip2 -cd ' + fastq2
    elif fastq2.endswith('.gz'):
        cmd2 = 'gunzip -c ' + fastq2
    else:
        cmd2 = 'cat ' + fastq2
    p2 = os.popen(cmd2, "r")

    
    ID1 = 'line1'
    ID2 = 'line2'
    i=0
    while ID1 != '':
        i+=1
        ID1 = p1.readline().strip()
        if ID1 == '':
            break
        ID2 = p2.readline().strip()
        seq1 = p1.readline().strip()
        seq2 = p2.readline().strip()
        plus1 = p1.readline().strip()
        plus2 = p2.readline().strip()
        q1 = p1.readline().strip()
        q2 = p2.readline().strip()
        if ID1.startswith('@') and ID2.startswith('@') and plus1.startswith('+') and plus2.startswith('+'):
            pass
        else:
            print 'broken input files, exiting', i*4, i*4+1, i*4+2, i*4+3
            print ID1
            print ID2
            print seq1
            print seq2
            print plus1
            print plus2
            print q1
            print q2
        scorebreak1 = len(seq1)
        scorebreak2 = len(seq2)
        firstN1 = len(seq1)
        firstN2 = len(seq2)
        if doFilter:
            firstN1 = seq1.find('N')
            firstN2 = seq2.find('N')
            if firstN1 == -1:
                firstN1 = len(seq1)
            if firstN2 == -1:
                firstN2 = len(seq2)
            for n in range(len(seq1)):
                Q = QualityScoresScale[q1[n]]
                if Q - QualityScoreOffset < minFASTQscore:
                    scorebreak1 = n
                    break
            for n in range(len(seq2)):
                Q = QualityScoresScale[q2[n]]
                if Q - QualityScoreOffset < minFASTQscore:
                    scorebreak2 = n
                    break
            if firstN1 < minReadLength or scorebreak1 < minReadLength or firstN2 < minReadLength or scorebreak2 < minReadLength:
                continue
        trimmedReadLength1 = min(firstN1,scorebreak1)
        trimmedReadLength2 = min(firstN2,scorebreak2)
        if doFA:
            print '>' + ID1[1:len(ID1)]
            print seq1[0:trimmedReadLength1]
            print '>' + ID2[1:len(ID2)]
            print seq2[0:trimmedReadLength2]
        else:
            print ID1
            print seq1[0:trimmedReadLength1]
            print '+'
            print q1[0:trimmedReadLength1]
            print ID2
            print seq2[0:trimmedReadLength2]
            print '+'
            print q2[0:trimmedReadLength2]
        
run()