#!/bin/bash
##############################################################
# ##### Please replace PATHTO with your own directory ###### #
##############################################################
SICER=/woldlab/castor/data00/home/georgi/SICER_v1.01/SICER/
PYTHONPATH=$SICER/lib
export PYTHONPATH

if [ $# -lt 4 ]; then
    echo ""
    echo 1>&2 Usage: $0 ["bed file"] ["window size (bp)"] ["gap size (bp)"] ["E-value"] 
    echo ""
    exit 1
fi

#================================================================================
#Parameters for running SICER without control library and using random background model.
echo "#############################################"
echo "# ##### Parameters for running SICER ###### #"
echo "#############################################"
# Input sample bed file
SAMPLEBED=$1
SAMPLE=${SAMPLEBED%.*}
echo "Chip library: $SAMPLEBED"

# Species, for allowed species see GenomeData.py
SPECIES=hg18
echo "Species: $SPECIES"

# Effective Genome as fraction of the genome size. It depends on read length.
EFFECTIVEGENOME=0.74
echo "Effective genome size as a fraction of reference genome: $EFFECTIVEGENOME"

# THRESHOLD is the threshold is for redundancy for reads. THRESHOLD=n
# means that each read has at most n copy after preprocessing.
THRESHOLD=1
echo "Threshold for redundancy allowed for reads: $THRESHOLD"

# WINDOW_SIZE is the size of the windows to scan the genome width. 
# One WINDOW_SIZE is the smallest possible island.
WINDOW_SIZE=$2
echo "Window size: $WINDOW_SIZE bps"

# FRAGMENT_SIZE is for determination of the amound of shift for center
# of the DNA fragments represented by the reads. FRAGMENT_SIZE=150
# means the shift is 75.
FRAGMENT_SIZE=150

echo "Fragment size: $FRAGMENT_SIZE bps. The shift for reads is half of $FRAGMENT_SIZE"
#GAP_SIZE is in base pairs.
GAP_SIZE=$3
echo "Gap size: $GAP_SIZE bps"

#EVALUE is the number of islands expected in random background. The
#EVALUE is used to determine the score threshold s_T.
EVALUE=$4
echo "Evalue for identification of significant islands: $EVALUE"
#================================================================================

# If data files are not in the current directory, replace this with
# the path to the data files.

DATADIR=.
SAMPLEDIR=$DATADIR
CONTROLDIR=$DATADIR
OUTPUTDIR=.
SUMMARY_DIR=$OUTPUTDIR
ISLANDS_BED_DIR=$OUTPUTDIR

# This file stores the preprocessed raw bed file.
FILTEREDSAMPLEBED=$SAMPLE-${THRESHOLD}-removed.bed

# This file stores the summary graph.  
SUMMARY=$SAMPLE-W$WINDOW_SIZE.graph
# This file stores the histogram of window read-count.  
SUMMARYHIST=$SAMPLE-W$WINDOW_SIZE.graphhist

#This file stores the candidate islands.
ISLAND=$SAMPLE-W$WINDOW_SIZE-G$GAP_SIZE.probscoreisland
# This file stores the windows on islands. 
FILTEREDSUMMARY=$SAMPLE-W$WINDOW_SIZE-G$GAP_SIZE-islandfiltered.scoregraph
# This file stores the histogram of island scores
ISLANDSCOREHIST=$SAMPLE-W$WINDOW_SIZE-G$GAP_SIZE.islandscorehist
# This file stores the histogram of island lengths
ISLANDLENGTHHIST=$SAMPLE-W$WINDOW_SIZE-G$GAP_SIZE.islandlengthhist


echo " "
echo " "
echo "Preprocess the raw $SAMPLE file to remove redundancy with threshold $THRESHOLD..."
echo "python $SICER/src/remove_multiple_identical_tags.py -s $SPECIES -b $SAMPLEDIR/$SAMPLEBED -t $THRESHOLD -o $OUTPUTDIR/$FILTEREDSAMPLEBED"
python $SICER/src/remove_multiple_identical_tags.py -s $SPECIES -b $SAMPLEDIR/$SAMPLEBED -t $THRESHOLD -o $OUTPUTDIR/$FILTEREDSAMPLEBED

echo " "
echo " "
echo "Partion the genome in windows ..."
#if test -f $SUMMARY_DIR/$SUMMARY; then
#	echo "$SUMMARY already exists, skipping this step ..."
#	continue;
#else 
echo "Generate summary files ..."
echo "python $SICER/src/run-make-graph-file-by-chrom.py -s $SPECIES -b $OUTPUTDIR/$FILTEREDSAMPLEBED -w $WINDOW_SIZE -i $FRAGMENT_SIZE -o $SUMMARY_DIR/$SUMMARY"
python $SICER/src/run-make-graph-file-by-chrom.py -s $SPECIES -b $OUTPUTDIR/$FILTEREDSAMPLEBED -w $WINDOW_SIZE -i $FRAGMENT_SIZE -o $SUMMARY_DIR/$SUMMARY 
    
## Generating window read-count statistics
#     echo " "
#     echo " "
#     echo "Generate window read count histograms  ... "
#     echo "python $SICER/utility/get_windows_histogram.py -s $SPECIES -b $SUMMARY_DIR/$SUMMARY -w $WINDOW_SIZE -o $SUMMARY_DIR/$SUMMARYHIST"
#     python $SICER/utility/get_windows_histogram.py -s $SPECIES -b $SUMMARY_DIR/$SUMMARY -w $WINDOW_SIZE -o $SUMMARY_DIR/$SUMMARYHIST
#fi


echo " "
echo " "
echo "Find significant islands with E-value $EVALUE..."
echo "python $SICER/src/find_islands_in_pr.py -s $SPECIES -b $SUMMARY_DIR/$SUMMARY -w $WINDOW_SIZE -g $GAP_SIZE -t $EFFECTIVEGENOME -e $EVALUE -f $ISLANDS_BED_DIR/$ISLAND"
python $SICER/src/find_islands_in_pr.py -s $SPECIES -b $SUMMARY_DIR/$SUMMARY -w $WINDOW_SIZE -g $GAP_SIZE -t $EFFECTIVEGENOME -e $EVALUE  -f $ISLANDS_BED_DIR/$ISLAND

# Generating island statistics, optional
# echo " "
# echo " "
# echo "Get island statistics ..."
# echo "python $SICER/utility/islands_statistics_pr.py -s $SPECIES -b $SUMMARY_DIR/$SUMMARY -i $ISLANDS_BED_DIR/$ISLAND -w $WINDOW_SIZE -o $ISLANDS_BED_DIR/$ISLANDSCOREHIST   -q $ISLANDS_BED_DIR/$ISLANDLENGTHHIST -r $SUMMARY_DIR/$FILTEREDSUMMARY"
# python $SICER/utility/islands_statistics_pr.py -s $SPECIES -b $SUMMARY_DIR/$SUMMARY -i $ISLANDS_BED_DIR/$ISLAND -w $WINDOW_SIZE -o $ISLANDS_BED_DIR/$ISLANDSCOREHIST   -q $ISLANDS_BED_DIR/$ISLANDLENGTHHIST -r $ISLANDS_BED_DIR/$FILTEREDSUMMARY


