#!/bin/bash

##############################################################
# ##### Please replace PATHTO with your own directory ###### #
##############################################################

PATHTO='/woldlab/castor/data00/home/georgi/SICER_v1.01/SICER'
SICER=/woldlab/castor/data00/home/georgi/SICER_v1.01/SICER
PYTHONPATH=$SICER/lib
export PYTHONPATH

if [ $# -lt 4 ]; then
    echo ""
    echo 1>&2 Usage: $0 ["bed file"] ["control file"] ["window size (bp)"] ["gap size (bp)"] ["FDR"] 
    echo ""
    exit 1
fi

#================================================================================
echo "#############################################"
echo "# ##### Parameters for running SICER ###### #"
echo "#############################################"
# Input sample bed file
SAMPLEBED=$1
SAMPLE=${SAMPLEBED%.*}
echo "Chip library: $SAMPLEBED"

# Input control bed file
CONTROLBED=$2
CONTROL=${CONTROLBED%.*}
echo "Control library: $CONTROLBED"

# Species, for allowed species see GenomeData.py
SPECIES=hg18
echo "Species: $SPECIES"

# Effective Genome as fraction of the genome size. It depends on read length.
EFFECTIVEGENOME=0.74
echo "Effective genome size as a fraction of reference genome: $EFFECTIVEGENOME"

# THRESHOLD is the threshold is for redundancy allowed for reads. THRESHOLD=n
# means that each read has at most n copy after preprocessing.
THRESHOLD=1
echo "Threshold for redundancy allowed for reads: $THRESHOLD"

# WINDOW_SIZE is the size of the windows to scan the genome width. 
# One WINDOW_SIZE is the smallest possible island.
WINDOW_SIZE=$3
echo "Window size: $WINDOW_SIZE bps"

# FRAGMENT_SIZE is for determination of the amound of shift for center
# of the DNA fragments represented by the reads. FRAGMENT_SIZE=150
# means the shift is 75.
FRAGMENT_SIZE=150
echo "Fragment size: $FRAGMENT_SIZE bps. The shift for reads is half of $FRAGMENT_SIZE"

#GAP_SIZE is in base pairs.
GAP_SIZE=$4
echo "Gap size: $GAP_SIZE bps."

#EVALUE is the number of islands expected in random background. The E value is used for identification of candidate islands that exhibit clustering.
EVALUE=1000
echo "Evalue for identification of candidate islands that exhibit clustering: $EVALUE"

#False discovery rate controlling significance
FDR=$5
echo "False discovery rate controlling significance: $FDR "
#================================================================================
#############################################
# ######  SET UP DIRECTORY STRUCTURE ###### #
#############################################
# If data files are not in the current directory, replace this with
# the path to the data files.
DATADIR=.
SAMPLEDIR=$DATADIR
CONTROLDIR=$DATADIR
# If You want the output files not in the current directory, replace this.
OUTPUTDIR=.
SUMMARY_DIR=$OUTPUTDIR
ISLANDS_BED_DIR=$OUTPUTDIR

#================================================================================
#############################################
# ###### DEFINITION OF OUTPUT FILES  ###### #
#############################################

# This file stores the preprocessed raw bed file.
FILTEREDSAMPLEBED=$SAMPLE-${THRESHOLD}-removed.bed
FILTEREDCONTROLBED=$CONTROL-${THRESHOLD}-removed.bed

# This file stores the summary graph.  
SUMMARY=$SAMPLE-W$WINDOW_SIZE.graph

#This file stores the candidate islands.
ISLAND=$SAMPLE-W$WINDOW_SIZE-G$GAP_SIZE.probscoreisland

#This file stores the summary of candidate islands, including tags counts, 
# pvalue, fold change and qvalue
ISLANDSIGNIFICANCEFILE=$SAMPLE-W$WINDOW_SIZE-G$GAP_SIZE-islands-summary

#This file stores the summary of significant islands identified with FDR criterion.
SIGNIFICANTISLANDS=$SAMPLE-W$WINDOW_SIZE-G$GAP_SIZE-islands-summary-FDR$FDR

echo " "
echo " "
echo "Preprocess the raw $SAMPLE file to remove redundancy with threshold $THRESHOLD..."
echo "python $SICER/src/remove_multiple_identical_tags.py -s $SPECIES -b $SAMPLEDIR/$SAMPLEBED -t $THRESHOLD -o $OUTPUTDIR/$FILTEREDSAMPLEBED"
python $SICER/src/remove_multiple_identical_tags.py -s $SPECIES -b $SAMPLEDIR/$SAMPLEBED -t $THRESHOLD -o $OUTPUTDIR/$FILTEREDSAMPLEBED

echo " "
echo " "
echo "Preprocess the raw $CONTROL file to remove redundancy ..."
echo "python $SICER/src/remove_multiple_identical_tags.py -s $SPECIES -b $CONTROLDIR/$CONTROL.bed -t $THRESHOLD -o $OUTPUTDIR/$FILTEREDCONTROLBED"
python $SICER/src/remove_multiple_identical_tags.py -s $SPECIES -b $CONTROLDIR/$CONTROL.bed -t $THRESHOLD -o $OUTPUTDIR/$FILTEREDCONTROLBED


echo " "
echo " "
echo "Partion the genome in windows ..."
#if test -f $SUMMARY_DIR/$SUMMARY; then
#	echo "$SUMMARY already exists, skipping this step ..."
#	continue;
#else 
echo "Generate summary files ..."
echo "python $SICER/src/run-make-graph-file-by-chrom.py -s $SPECIES -b $OUTPUTDIR/$FILTEREDSAMPLEBED -w $WINDOW_SIZE -i $FRAGMENT_SIZE -o $SUMMARY_DIR/$SUMMARY"
python $SICER/src/run-make-graph-file-by-chrom.py -s $SPECIES -b $OUTPUTDIR/$FILTEREDSAMPLEBED -w $WINDOW_SIZE -i $FRAGMENT_SIZE -o $SUMMARY_DIR/$SUMMARY 
#fi



echo " "
echo " "
echo "Find candidate islands exhibiting clustering ..."
echo "python $SICER/src/find_islands_in_pr.py -s $SPECIES -b $SUMMARY_DIR/$SUMMARY -w $WINDOW_SIZE -g $GAP_SIZE -t $EFFECTIVEGENOME -e $EVALUE -f $ISLANDS_BED_DIR/$ISLAND"
python $SICER/src/find_islands_in_pr.py -s $SPECIES -b $SUMMARY_DIR/$SUMMARY -w $WINDOW_SIZE -g $GAP_SIZE -t $EFFECTIVEGENOME -e $EVALUE  -f $ISLANDS_BED_DIR/$ISLAND



echo ""
echo ""
echo "Calculate significance of candidate islands using the control library ..."
echo "python $SICER/src/associate_tags_with_chip_and_control_w_fc_q.py -s $SPECIES  -a $OUTPUTDIR/$FILTEREDSAMPLEBED -b $OUTPUTDIR/$FILTEREDCONTROLBED -d $ISLANDS_BED_DIR/$ISLAND -f $FRAGMENT_SIZE -t $EFFECTIVEGENOME -o $ISLANDS_BED_DIR/$ISLANDSIGNIFICANCEFILE"
python $SICER/src/associate_tags_with_chip_and_control_w_fc_q.py -s $SPECIES  -a $OUTPUTDIR/$FILTEREDSAMPLEBED -b $OUTPUTDIR/$FILTEREDCONTROLBED -d $ISLANDS_BED_DIR/$ISLAND -f $FRAGMENT_SIZE -t $EFFECTIVEGENOME -o $ISLANDS_BED_DIR/$ISLANDSIGNIFICANCEFILE

echo ""
echo ""
echo "Identify significant islands using FDR criterion ..."
echo "python $SICER/src/find_significant_islands.py -i $ISLANDS_BED_DIR/$ISLANDSIGNIFICANCEFILE -q $FDR -o  $ISLANDS_BED_DIR/$SIGNIFICANTISLANDS"
python $SICER/src/find_significant_islands.py -i $ISLANDS_BED_DIR/$ISLANDSIGNIFICANCEFILE -q $FDR -o  $ISLANDS_BED_DIR/$SIGNIFICANTISLANDS
