##################################
#                                #
# Last modified 2018/08/10       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import numpy as np
import random
import os
import math
from sets import Set

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s genome.fa seq1[,seq2,...,seqN]' % sys.argv[0]
        print '\tthe script will print to stdout by default'
        sys.exit(1)

    fasta = sys.argv[1]
    sequences = sys.argv[2].split(',')

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence).upper()
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence).upper()

    print 'finished inputting genomic sequence'

    Total = 0.0
    Positions = 0.0

    outline = '#feature\tfeature_size\ttotal_positions_in_subset\tfraction\tresolution'
    print outline

    for chr in GenomeDict.keys():
        L = len(GenomeDict[chr])
        P = 0.01
        Total += L
        for seq in sequences:
            P += GenomeDict[chr].count(seq)
        Positions += P
        outline = chr + '\t' + str(L) + '\t' + str(P) + '\t' + str(P/L) + '\t' + str(L/P)
        print outline

    outline = 'genome-wide\t' + str(Total) + '\t' + str(Positions) + '\t' + str(Positions/Total) + '\t' + str(Total/Positions)
    print outline
            
run()

