##################################
#                                #
# Last modified 2018/06/03       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
import math
from sets import Set
import re

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s methylation_reads.tsv loglike_threshold outfile' % sys.argv[0]
        sys.exit(1)

    reads = sys.argv[1]
    LLthreshold = float(sys.argv[2])
    outfilename = sys.argv[3]

    CoverageDict = {}
    
    if reads.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + reads
    elif reads.endswith('.gz') or reads.endswith('.bgz'):
        cmd = 'zcat ' + reads
    elif reads.endswith('.zip'):
        cmd = 'unzip -p ' + reads
    else:
        cmd = 'cat ' + reads
    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('chromosome\tstart\tend\tread_name'):
            continue
        fields = line.strip().split('\t')
        RN += 1
        if RN % 1000000 == 0:
            print RN, 'lines processed'
        chr = fields[0]
        sequence = fields[9]
        if CoverageDict.has_key(chr):
            pass
        else:
            CoverageDict[chr] = {}
        left = int(fields[1])
        right = int(fields[2])
        ll = float(fields[4])
        matches = [m.start() for m in re.finditer('CG', sequence)]
        minPos = min(matches)
        for p in matches:
            pos = left + p - minPos
            if CoverageDict[chr].has_key(pos):
                pass
            else:
                CoverageDict[chr][pos] = [0,0]
            if ll < LLthreshold:
                CoverageDict[chr][pos][1] += 1
            else:        
                CoverageDict[chr][pos][0] += 1

    print 'finished inputting reads'

    chromosomes = CoverageDict.keys()
    chromosomes.sort()

    outfile = open(outfilename,'w')

    outline = '#chr\tstart\tend\tmeth\tunmeth\tcov'
    outfile.write(outline + '\n')

    K=0
    for chr in chromosomes:
        print chr
        positions = CoverageDict[chr].keys()
        positions.sort()
        for pos in positions:
            outline = chr + '\t' + str(pos) + '\t' + str(pos + 1)
            outline = outline + '\t' + str(CoverageDict[chr][pos][1])
            outline = outline + '\t' + str(CoverageDict[chr][pos][0])
            outline = outline + '\t' + str(CoverageDict[chr][pos][0] + CoverageDict[chr][pos][1])
            outfile.write(outline + '\n')
            
    outfile.close()

            
run()

