##################################
#                                #
# Last modified 2021/01/16       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s config MACS2_location genome_size(bp) samtools BAMPseudoReps.py IDR_location' % sys.argv[0]
        print '\tconfig format:'
        print '\tlabel\tChIP-Rep1.bam\tInput-Rep1.bam\tChIP-Rep2.bam\tInput-Rep2.bam'
        print '\tThe script will print to stdout'
        sys.exit(1)

    print 'fix BAMPseudoReps part -- samtools genome.fa'
    sys.exit(1)

    config = sys.argv[1]
    MACS2 = sys.argv[2]
    GS = sys.argv[3]
    samtools = sys.argv[4]
    BAMpseudoreps = sys.argv[5]
    IDR2 = sys.argv[6]

    CommandDict = {
                   'PeakCallRep12':[],
                   'PeakCallPooled':[],
                   'PeakCallPooledPseudoRep':[],
                   'PeakCallIndividualPseudoRep':[],
                   'IDRRep12':[],
                   'IDRPooledPseudoRep':[],
                   'IDRIndividualPseudoRep':[],
                   'merge':[],
                   'sort':[],
                   'index':[],
                   'BAMPseudoRep':[]
                    }
    DataList = []

    linelist = open(config)
    for line in linelist:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        label = fields[0]
        ChIPRep1 = fields[1]
        InputRep1 = fields[2]
        ChIPRep2 = fields[3]
        InputRep2 = fields[4]
        DataList.append((label,ChIPRep1,InputRep1,ChIPRep2,InputRep2))

    for (label,ChIPRep1,InputRep1,ChIPRep2,InputRep2) in DataList:
#        Rep1Dir = 'SPP-300K-' + label + '-Rep1'
#        Rep2Dir = 'SPP-300K-' + label + '-Rep2'
#        CommandDict['mkdirRep'].append('mkdir ' + Rep1Dir)
#        CommandDict['mkdirRep'].append('mkdir ' + Rep2Dir)

#        PooledDir = 'SPP-300K-' + label + '-Pooled'
#        CommandDict['mkdirPooled'].append('mkdir ' + PooledDir)

#        PooledPseudoRep1Dir = 'SPP-300K-' + label + '-PooledPseudoRep1'
#        PooledPseudoRep2Dir = 'SPP-300K-' + label + '-PooledPseudoRep2'
#        CommandDict['mkdirPooledPseudoRep'].append('mkdir ' + PooledPseudoRep1Dir)
#        CommandDict['mkdirPooledPseudoRep'].append('mkdir ' + PooledPseudoRep2Dir)

#        Rep1IndividualPseudoRep1Dir = 'SPP-300K-' + label + '-Rep1PseudoRep1'
#        Rep1IndividualPseudoRep2Dir = 'SPP-300K-' + label + '-Rep1PseudoRep2'
#        Rep2IndividualPseudoRep1Dir = 'SPP-300K-' + label + '-Rep2PseudoRep1'
#        Rep2IndividualPseudoRep2Dir = 'SPP-300K-' + label + '-Rep2PseudoRep2'
#        CommandDict['mkdirIndidividualPseudoRep'].append('mkdir ' + Rep1IndividualPseudoRep1Dir)
#        CommandDict['mkdirIndidividualPseudoRep'].append('mkdir ' + Rep1IndividualPseudoRep2Dir)
#        CommandDict['mkdirIndidividualPseudoRep'].append('mkdir ' + Rep2IndividualPseudoRep1Dir)
#        CommandDict['mkdirIndidividualPseudoRep'].append('mkdir ' + Rep2IndividualPseudoRep2Dir)

        PooledChIPBAMmerged = label + '.ChIP.pooled.bam'
        PooledInputBAMmerged = label + '.Control.pooled.bam'
        CommandDict['merge'].append(samtools + ' merge ' + PooledChIPBAMmerged + ' ' + ChIPRep1 + ' ' + ChIPRep2)
        CommandDict['merge'].append(samtools + ' merge ' + PooledInputBAMmerged + ' ' + InputRep1 + ' ' + InputRep2)

        PooledChIPBAMsorted = label + '.ChIP.pooled.sorted.bam'
        PooledInputBAMsorted = label + '.Control.pooled.sorted.bam'
        CommandDict['sort'].append(samtools + ' sort ' + PooledChIPBAMmerged + ' ' + PooledChIPBAMsorted[0:-4])
        CommandDict['sort'].append(samtools + ' sort ' + PooledInputBAMmerged + ' ' + PooledInputBAMsorted[0:-4])

        CommandDict['index'].append(samtools + ' index ' + PooledChIPBAMsorted)
        CommandDict['index'].append(samtools + ' index ' + PooledInputBAMsorted)

        CommandDict['BAMPseudoRep'].append('python ' + BAMpseudoreps + ' ' + PooledChIPBAMsorted)
        CommandDict['BAMPseudoRep'].append('python ' + BAMpseudoreps + ' ' + PooledInputBAMsorted)
        CommandDict['BAMPseudoRep'].append('python ' + BAMpseudoreps + ' ' + ChIPRep1)
        CommandDict['BAMPseudoRep'].append('python ' + BAMpseudoreps + ' ' + ChIPRep2)
        CommandDict['BAMPseudoRep'].append('python ' + BAMpseudoreps + ' ' + InputRep1)
        CommandDict['BAMPseudoRep'].append('python ' + BAMpseudoreps + ' ' + InputRep2)

        PooledChIPBAMPseudoRep1 = PooledChIPBAMsorted[0:-4] + '.pseudoRep1.bam'
        PooledChIPBAMPseudoRep2 = PooledChIPBAMsorted[0:-4] + '.pseudoRep2.bam'
        PooledInputBAMPseudoRep1 = PooledInputBAMsorted[0:-4] + '.pseudoRep1.bam'
        PooledInputBAMPseudoRep2 = PooledInputBAMsorted[0:-4] + '.pseudoRep2.bam'
        Rep1ChIPIndividualPseudoRep1 = ChIPRep1[0:-4] + '.pseudoRep1.bam'
        Rep2ChIPIndividualPseudoRep1 = ChIPRep2[0:-4] + '.pseudoRep1.bam'
        Rep1InputIndividualPseudoRep1 = InputRep1[0:-4] + '.pseudoRep1.bam'
        Rep2InputIndividualPseudoRep1 = InputRep2[0:-4] + '.pseudoRep1.bam'
        Rep1ChIPIndividualPseudoRep2 = ChIPRep1[0:-4] + '.pseudoRep2.bam'
        Rep2ChIPIndividualPseudoRep2 = ChIPRep2[0:-4] + '.pseudoRep2.bam'
        Rep1InputIndividualPseudoRep2 = InputRep1[0:-4] + '.pseudoRep2.bam'
        Rep2InputIndividualPseudoRep2 = InputRep2[0:-4] + '.pseudoRep2.bam'

        CommandDict['PeakCallRep12'].append(MACS2 + ' callpeak -t ' + ChIPRep1 + ' -c ' + InputRep1 + ' -n ' + label + '.Rep1.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')
        CommandDict['PeakCallRep12'].append(MACS2 + ' callpeak -t ' + ChIPRep1 + ' -c ' + InputRep1 + ' -n ' + label + '.Rep1.MACS-2.1.0 -g ' + GS +  ' -f BAMPE --keep-dup all')
        CommandDict['PeakCallRep12'].append(MACS2 + ' callpeak -t ' + ChIPRep2 + ' -c ' + InputRep2 + ' -n ' + label + '.Rep2.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')
        CommandDict['PeakCallRep12'].append(MACS2 + ' callpeak -t ' + ChIPRep2 + ' -c ' + InputRep2 + ' -n ' + label + '.Rep2.MACS-2.1.0 -g ' + GS +  ' -f BAMPE --keep-dup all')

        CommandDict['PeakCallPooled'].append(MACS2 + ' callpeak -t ' + PooledChIPBAMsorted + ' -c ' + PooledInputBAMsorted + ' -n ' + label + '.pooled.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')

        CommandDict['PeakCallPooledPseudoRep'].append(MACS2 + ' callpeak -t ' + PooledChIPBAMPseudoRep1 + ' -c ' + PooledInputBAMPseudoRep1 + ' -n ' + label + '.pooled_pseudorep1.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')
        CommandDict['PeakCallPooledPseudoRep'].append(MACS2 + ' callpeak -t ' + PooledChIPBAMPseudoRep2 + ' -c ' + PooledInputBAMPseudoRep2 + ' -n ' + label + '.pooled_pseudorep2.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')

        CommandDict['PeakCallIndividualPseudoRep'].append(MACS2 + ' callpeak -t ' + Rep1ChIPIndividualPseudoRep1 + ' -c ' + Rep1InputIndividualPseudoRep1 + ' -n ' + label + '.rep1_pseudorep1.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')
        CommandDict['PeakCallIndividualPseudoRep'].append(MACS2 + ' callpeak -t ' + Rep1ChIPIndividualPseudoRep2 + ' -c ' + Rep1InputIndividualPseudoRep2 + ' -n ' + label + '.rep1_pseudorep2.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')
        CommandDict['PeakCallIndividualPseudoRep'].append(MACS2 + ' callpeak -t ' + Rep2ChIPIndividualPseudoRep1 + ' -c ' + Rep2InputIndividualPseudoRep1 + ' -n ' + label + '.rep2_pseudorep1.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')
        CommandDict['PeakCallIndividualPseudoRep'].append(MACS2 + ' callpeak -t ' + Rep2ChIPIndividualPseudoRep2 + ' -c ' + Rep2InputIndividualPseudoRep2 + ' -n ' + label + '.rep2_pseudorep2.MACS-2.1.0.p1e-1 -g ' + GS +  ' -f BAMPE --keep-dup all --to-large -p 1e-1')


        CommandDict['IDRRep12'].append(IDR2 + ' --samples ' + label + '.Rep1.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz ' + label + '.Rep2.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz --peak-list ' + label + '.pooled.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz --input-file-type narrowPeak --output-file ' + label + '.indreps.IDR --rank p.value --soft-idr-threshold 0.05 --plot --use-best-multisummit-IDR')
        CommandDict['IDRPooledPseudoRep'].append(IDR2 + ' --samples ' + label + '.pooled_pseudorep1.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz ' + label + '.pooled_pseudorep1.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz --peak-list ' + label + '.pooled.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz --input-file-type narrowPeak --output-file ' + label + '.pseudoreps.IDR --rank p.value --soft-idr-threshold 0.05 --plot --use-best-multisummit-IDR')
        CommandDict['IDRIndividualPseudoRep'].append(IDR2 + ' --samples ' + label + '.rep1_pseudorep1.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz ' + label + '.rep1_pseudorep2.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz --peak-list ' + label + '.pooled.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz --input-file-type narrowPeak --output-file ' + label + '.rep1_pseudoreps.IDR --rank p.value --soft-idr-threshold 0.05 --plot --use-best-multisummit-IDR')
        CommandDict['IDRIndividualPseudoRep'].append(IDR2 + ' --samples ' + label + '.rep2_pseudorep1.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz ' + label + '.rep2_pseudorep2.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz --peak-list ' + label + '.pooled.MACS-2.1.0.p1e-1_peaks.narrowPeak.gz --input-file-type narrowPeak --output-file ' + label + '.rep2_pseudoreps.IDR --rank p.value --soft-idr-threshold 0.05 --plot --use-best-multisummit-IDR')

    print '\n# merge replicate BAM files:'
    for command in CommandDict['merge']:
        print command
    print '\n# sort merged BAM files:'
    for command in CommandDict['sort']:
        print command
    print '\n# index sorted BAM files:'
    for command in CommandDict['index']:
        print command
    print '\n# Generate pseudoreplicates:'
    for command in CommandDict['BAMPseudoRep']:
        print command
    print '\n# Peak calls for individual replicates:'
    for command in CommandDict['PeakCallRep12']:
        print command
    print '\n# Peak calls for for pooled datasets:'
    for command in CommandDict['PeakCallPooled']:
        print command
    print '\n# Peak calls for for pooled pseudoreplicates:'
    for command in CommandDict['PeakCallPooledPseudoRep']:
        print command
    print '\n# Peak calls for for individual pseudoreplicates:'
    for command in CommandDict['PeakCallIndividualPseudoRep']:
        print command
    print '\n# Clean up pseudoreplicate BAM files:'
    print 'rm *.pooled.bam'
    print 'rm *.sorted.bam'
    print 'rm *.pseudoRep1.bam'
    print 'rm *.pseudoRep2.bam'
    print 'rm *.sorted.bamb.bai'
    print 'rm *.pseudoRep1.bam.bai'
    print 'rm *.pseudoRep2.bam.bai'
    print '\n# unzip peak call files:'
    print 'gzip *MACS-2.1.0_peaks.narrowPeak'
    print 'rm *_model.r'
    print 'rm *peaks.xls'
    print 'rm *summits.bed'
    print '\n# IDR for individual replicates:'
    for command in CommandDict['IDRRep12']:
        print command
    print '\n# IDR for pooled pseudoreplicates:'
    for command in CommandDict['IDRPooledPseudoRep']:
        print command
    print '\n# IDR for individual pseudoreplicates:'
    for command in CommandDict['IDRIndividualPseudoRep']:
        print command

run()

