#################################
#                                #
# Last modified 2019/01/03       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
import math
from sets import Set

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s methylation_reads_all.tsv_1,methylation_reads_all_2.tsv(,etc.) outfile' % sys.argv[0]
        print '\tNote: this script will take two (or more) comma-separated files for (presumably) the same set of reads and combine the modification calls for each read into a new file'
        sys.exit(1)

    reads_files = sys.argv[1].split(',')
    outfilename = sys.argv[2]

    ReadDict = {}

    for reads in reads_files:
        if reads.endswith('.bz2'):
            cmd = 'bzip2 -cd ' + reads
        elif reads.endswith('.gz') or reads.endswith('.bgz'):
            cmd = 'zcat ' + reads
        elif reads.endswith('.zip'):
            cmd = 'unzip -p ' + reads
        else:
            cmd = 'cat ' + reads
        P = os.popen(cmd, "r")
        line = 'line'
        while line != '':
            line = P.readline().strip()
            if line == '':
                break
            fields = line.strip().split('\t')
            if len(fields) < 8:
                print 'skipping:', fields
                continue
            chr = fields[0]
            strand = fields[3]
            ID = fields[4]
            Ps = []
            for p in fields[6].split(','):
                Ps.append(int(p))
            LLs = []
            for ll in fields[7].split(','):
                LLs.append(float(ll))
            if ReadDict.has_key(ID):
                pass
            else:
                ReadDict[ID] = {}
                ReadDict[ID]['chr'] = chr
                ReadDict[ID]['strand'] = strand
                ReadDict[ID]['meth'] = []
            if ReadDict[ID]['chr'] != chr:
                print 'multiple chromosomes detected for read ID', ID, 'exiting'
                sys.exit(1)
            if ReadDict[ID]['strand'] != strand:
                print 'multiple strands detected for read ID', ID, 'exiting'
                sys.exit(1)
            ReadDict[ID]['meth'] += zip(Ps,LLs)

    outfile = open(outfilename,'w')

    for ID in ReadDict.keys():
        ReadDict[ID]['meth'].sort()
        [Ps,LLs] = zip(*ReadDict[ID]['meth'])
        outline = ReadDict[ID]['chr'] + '\t' + str(min(Ps)) + '\t' + str(max(Ps)) + '\t' + ReadDict[ID]['strand'] + '\t' + ID + '\t.\t'
        outline = outline + str(Ps).replace('(','',).replace(')','',).replace(' ','',)
        outline = outline + '\t' + str(LLs).replace('(','',).replace(')','',).replace(' ','',)
        outfile.write(outline + '\n')
            
    outfile.close()

            
run()

