##################################
#                                #
# Last modified 2018/09/18       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import numpy as np
import random
import os
import math
from sets import Set

def run():

    if len(sys.argv) < 7:
        print 'usage: python %s genome.fa motifs_file chrFieldID leftFieldID rightFieldID seq1[,seq2,...,seqN] outfilename [-minWidth bp]' % sys.argv[0]
        print '\tthe script will print to stdout by default'
        print '\tuse the -minWidth option if you want regions shorter than it to be extended (from the middle)'
        sys.exit(1)

    fasta = sys.argv[1]
    motifs = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    sequences = sys.argv[6].split(',')
    outiflename = sys.argv[7]

    minW = 0
    if '-minWidth' in sys.argv:
        minW = int(sys.argv[sys.argv.index('-minWidth') + 1])

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence).upper()
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence).upper()

    print 'finished inputting genomic sequence'

    Total = 0.0
    Positions = 0.0

    CountDict = {}

    if motifs.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + motifs
    elif motifs.endswith('.gz'):
        cmd = 'gunzip -c ' + motifs
    elif motifs.endswith('.zip'):
        cmd = 'unzip -p ' + motifs
    else:
        cmd = 'cat ' + motifs
    RN = 0
    p = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = p.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        left = int(fields[leftFieldID])
        right = int(fields[rightFieldID])
        if right - left < minW:
            left = int((right + left)/2.0 - minW/2.0)
            right = int((right + left)/2.0 + minW/2.0)
        S = 0
        for seq in sequences:
            S += GenomeDict[chr][left:right].count(seq)
        if CountDict.has_key(S):
            pass
        else:
            CountDict[S] = 0
        CountDict[S] += 1

    outfile = open(outiflename, 'w')

    outline = '#Number_informative_bases\tnumber_features'
    outfile.write(outline + '\n')

    Ks = CountDict.keys()
    Ks.sort()

    for S in Ks:
        outline = str(S) + '\t' + str(CountDict[S])
        outfile.write(outline + '\n')

    outfile.close()
            
run()

