##################################
#                                #
# Last modified 2023/05/13       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import random
import os
import math
from sets import Set

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s methylation_reads_all.tsv outfile [-NumDecimal N]' % sys.argv[0]
        sys.exit(1)

    reads = sys.argv[1]
    outfilename = sys.argv[2]

    doDecimal = False
    if '-NumDecimal' in sys.argv:
         doDecimal = True
         DecN = int(sys.argv[sys.argv.index('-NumDecimal') + 1])
         print 'will only consider the first', DecN, 'decimal values'

    PDict = {}
    Total = 0

    if reads.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + reads
    elif reads.endswith('.gz') or reads.endswith('.bgz'):
        cmd = 'zcat ' + reads
    elif reads.endswith('.zip'):
        cmd = 'unzip -p ' + reads
    else:
        cmd = 'cat ' + reads
    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        RN += 1
        if RN % 10000 == 0:
           print RN, 'reads processed'
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        loglike = fields[7].split(',')
        for LL in loglike:
            if doDecimal:
                L = round(float(LL), DecN)
            else:
                L = float(LL)
            if PDict.has_key(L):
                pass
            else:
                PDict[L] = 0.0
            PDict[L] += 1
        Total += len(loglike)

    outfile = open(outfilename,'w')
    outline = '#p\tcounts\tfraction'
    outfile.write(outline + '\n')

    Ps = PDict.keys()
    Ps.sort()

    for P in Ps:
        outline = str(P) + '\t' + str(PDict[P]) + '\t' + str(PDict[P]/Total)
        outfile.write(outline + '\n')

    outfile.close()
            
run()

