##################################
#                                #
# Last modified 12/15/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math

# Format description: RepeatMasker .out record
# field	example	SQL type	info	description
# bin	585	smallint(5) unsigned	range	Indexing field to speed chromosome range queries.
# swScore	778	int(10) unsigned	range	Smith Waterman alignment score
# milliDiv	167	int(10) unsigned	range	Base mismatches in parts per thousand
# milliDel	7	int(10) unsigned	range	Bases deleted in parts per thousand
# milliIns	20	int(10) unsigned	range	Bases inserted in parts per thousand
# genoName	chr2L	varchar(255)	values	Genomic sequence name
# genoStart	1	int(10) unsigned	range	Start in genomic sequence
 # genoEnd	154	int(10) unsigned	range	End in genomic sequence
# genoLeft	-23011390	int(11)	range	-#bases after match in genomic sequence
# strand	+	char(1)	values	Relative orientation + or -
# repName	HETRP_DM	varchar(255)	values	Name of repeat
# repClass	Satellite	varchar(255)	values	Class of repeat
# repFamily	Satellite	varchar(255)	values	Family of repeat
# repStart	1519	int(11)	range	Start (if strand is +) or -#bases after match (if strand is -) in repeat sequence
# repEnd	1669	int(11)	range	End in repeat sequence
# repLeft	-203	int(11)	range	-#bases after match (if strand is +) or start (if strand is -) in repeat sequence
# id	1	char(1)	values	First digit of id field in RepeatMasker .out file. Best ignored.

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s repeatMasker maxMismatches maxDeletions maxInsertions minLength minFraction out' % sys.argv[0]
        print '\tmaxMismatches, maxDeletions and maxInsertions should be integers corresponding to parts per thousand' 
        print '\tminLength should be in bases, minFraction should be a float between 0 and 1' 
        sys.exit(1)

    repeatMasker = sys.argv[1]
    maxMM = int(sys.argv[2])
    maxD = int(sys.argv[3])
    maxI = int(sys.argv[4])
    minLength = int(sys.argv[5])
    minFraction = float(sys.argv[6])
    outfilename = sys.argv[7]
    
    outfile = open(outfilename, 'w')

    inputdatafile = open(repeatMasker)
    chr=''
    for line in inputdatafile:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        MM = int(fields[2])
        if MM > maxMM:
            continue
        D = int(fields[3])
        if D > maxD:
            continue
        I = int(fields[4])
        if I > maxI:
            continue
        strand = fields[9]
        if strand == '+':
            start = int(fields[13])
            end = int(fields[14])
            basesLeft = -int(fields[15])
        if strand == '-':
            start = int(fields[15])
            end = int(fields[14])
            basesLeft = -int(fields[13])
        length = end - start
        if length < minLength:
            continue 
        repeatLength = end + basesLeft
#        print fields, MM, D, I, strand, start, end, basesLeft, length, repeatLength
        fraction = length/(repeatLength + 0.0)
        if fraction < minFraction:
            continue
        else:
            outfile.write(line)

    outfile.close()
   
run()
