# Notes about installing packages.
# Use tools.ra as the one true definition of tools currently installed and used.

# The path to these tools is:  /hive/groups/encode/encode3/tools (aka $EAP_TOOLS_DIR)
# All tools used directly by pipeline should be in $EAP_TOOLS_DIR/tools.ra
# NOTE: toolsRaCheck.py will check validity and -fix will fix some issues of tools.ra

# 2014-03-25 downloaded gencode.v19.annotation.gtf.gz from ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
# moved to: /hive/groups/encode/encode3/encValData/hg19 and gunzipped.

# Needed but part of encode-01 install
tool java
installed /usr/bin/java
version 1.6.0_24

# cd Python/Python-2.7.6
# ./configure --prefix=/hive/groups/encode/encode3/tools/Python; make; make install

# cd Python/pysam.0.7.4
# Had to edit setup.py to change line 169 to:   include_os = [ "/hive/groups/encode/encode3/tools/Python/include/python2.7" ]
# ../bin/python2.7 setup.py build; ../bin/python2.7 setup.py install --prefix=/hive/groups/encode/encode3/tools/Python 

# cd Python/numpy-1.6.2
# ../bin/python2.7 setup.py build; ../bin/python2.7 setup.py install --prefix=/hive/groups/encode/encode3/tools/Python 

# cd Python/scipy-0.12.0
# setup.py install --prefix=/hive/groups/encode/encode3/tools/Python

# phantomTools R dependencies
# (from tools) R-2.15.2/bin/R...; install.packages('caTools',dependencies=TRUE); install.packages('snow',dependencies=TRUE)
# cd phantompeakqualtools; ../R-2.15.2/bin/R CMD INSTALL spp_1.10.1.tar.gz

# MACS2
# ../../python2.7 setup.py build; ../../python2.7 setup.py install --prefix=/hive/groups/encode/encode3/tools/Python

# Rscript: ./configure; make (no install!).  Performed on hgwdev to get F77 (Fortran).  Hope it works fine running on encode-01!


###############
# STAR install
# https://github.com/alexdobin/STAR/archive/ENCODE_2014-03-16_2.3.1z1.tar.gz
# Note: this was after v2.3.0e which worked fine.  However RNA-seq working group prefers 2.3.1
# tar -xzf STAR-ENCODE_2014-03-16_2.3.1z1.tar.gz; cd STAR-ENCODE_2014-03-16_2.3.1z1/; make
# cd ..; mv STAR-ENCODE_2014-03-16_2.3.1z1.tar.gz STAR-ENCODE_2014-03-16_2.3.1z1/
# ln -sf STAR-ENCODE_2014-03-16_2.3.1z1/STAR STAR
# edit tools.ra

# STAR: making genome indexes
# Note: this was after the previous STAR version and the previous gencode.v16 annotations
cd /hive/groups/encode/encode3/encValData/female.hg19/
mkdir starData; cd starData; mkdir ERCC; cd ERCC
STAR --runMode genomeGenerate \
  --genomeDir ${EAP_REF_DIR}/female.hg19/starData/ERCC/ \
  --sjdbGTFfile ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
  --sjdbOverhang 100 \
  --genomeFastaFiles ${EAP_REF_DIR}/female.hg19/female.hg19.fa \
    ${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTP.fasta \
  --runThreadN 6 > ${EAP_REF_DIR}/female.hg19/starData/ERCC/command.v19.log 2>&1
Mar 27 00:50:49 ..... Started STAR run
Mar 27 00:50:49 ... Starting to generate Genome files
Mar 27 00:52:42 ... finished processing splice junctions database ...
Mar 27 00:53:20 ... starting to sort  Suffix Array. This may take a long time...
Mar 27 00:53:54 ... sorting Suffix Array chunks and saving them to disk...
Mar 27 01:25:46 ... loading chunks from disk, packing SA...
Mar 27 01:27:56 ... writing Suffix Array to disk ...
Mar 27 01:29:20 ... Finished generating suffix array
Mar 27 01:29:20 ... starting to generate Suffix Array index...
Mar 27 01:59:20 ... writing SAindex to disk
Mar 27 01:59:26 ..... Finished successfully
cd ..; mkdir WSC; cd WSC
STAR --runMode genomeGenerate \
  --genomeDir ${EAP_REF_DIR}/female.hg19/starData/WSC/ \
  --sjdbGTFfile ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
  --sjdbOverhang 100 \
  --genomeFastaFiles ${EAP_REF_DIR}/female.hg19/female.hg19.fa \
    ${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTO.fasta \
  --runThreadN 6 > ${EAP_REF_DIR}/female.hg19/starData/WSC/command.v19.log 2>&1
Mar 27 00:51:26 ..... Started STAR run
Mar 27 00:51:26 ... Starting to generate Genome files
Mar 27 00:53:20 ... finished processing splice junctions database ...
Mar 27 00:53:59 ... starting to sort  Suffix Array. This may take a long time...
Mar 27 00:54:33 ... sorting Suffix Array chunks and saving them to disk...
Mar 27 01:29:15 ... loading chunks from disk, packing SA...
Mar 27 01:33:06 ... writing Suffix Array to disk ...
Mar 27 01:34:34 ... Finished generating suffix array
Mar 27 01:34:34 ... starting to generate Suffix Array index...
Mar 27 02:03:50 ... writing SAindex to disk
Mar 27 02:03:56 ..... Finished successfully
cd /hive/groups/encode/encode3/encValData/male.hg19/
mkdir starData; cd starData; mkdir ERCC; cd ERCC
STAR --runMode genomeGenerate \
  --genomeDir ${EAP_REF_DIR}/male.hg19/starData/ERCC/ \
  --sjdbGTFfile ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
  --sjdbOverhang 100 \
  --genomeFastaFiles ${EAP_REF_DIR}/male.hg19/male.hg19.fa \
    ${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTP.fasta \
  --runThreadN 6 > ${EAP_REF_DIR}/male.hg19/starData/ERCC/command.v19.log 2>&1
Mar 26 21:42:26 ..... Started STAR run
Mar 26 21:42:26 ... Starting to generate Genome files
Mar 26 21:44:21 ... finished processing splice junctions database ...
Mar 26 21:44:59 ... starting to sort  Suffix Array. This may take a long time...
Mar 26 21:45:34 ... sorting Suffix Array chunks and saving them to disk...
Mar 26 22:17:02 ... loading chunks from disk, packing SA...
Mar 26 22:19:40 ... writing Suffix Array to disk ...
Mar 26 22:21:08 ... Finished generating suffix array
Mar 26 22:21:08 ... starting to generate Suffix Array index...
Mar 26 22:48:41 ... writing SAindex to disk
Mar 26 22:48:44 ..... Finished successfully
cd ..; mkdir WSC; cd WSC
STAR --runMode genomeGenerate \
  --genomeDir ${EAP_REF_DIR}/male.hg19/starData/WSC/ \
  --sjdbGTFfile ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
  --sjdbOverhang 100 \
  --genomeFastaFiles ${EAP_REF_DIR}/male.hg19/male.hg19.fa \
    ${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTO.fasta \
  --runThreadN 6 > ${EAP_REF_DIR}/male.hg19/starData/WSC/command.v19.log 2>&1
Mar 27 00:49:47 ..... Started STAR run
Mar 27 00:49:47 ... Starting to generate Genome files
Mar 27 00:51:36 ... finished processing splice junctions database ...
Mar 27 00:52:15 ... starting to sort  Suffix Array. This may take a long time...
Mar 27 00:52:53 ... sorting Suffix Array chunks and saving them to disk...
Mar 27 01:28:13 ... loading chunks from disk, packing SA...
Mar 27 01:31:59 ... writing Suffix Array to disk ...
Mar 27 01:33:18 ... Finished generating suffix array
Mar 27 01:33:19 ... starting to generate Suffix Array index...
Mar 27 02:01:46 ... writing SAindex to disk
Mar 27 02:01:49 ..... Finished successfully
# ln -sf ${EAP_REF_DIR}/male.hg19/starData ${EAP_REF_DIR}/hg19/starData
# Write starData/README


###############
# tophat install (precompiled linux version)
# http://tophat.cbcb.umd.edu/downloads/tophat-2.0.8.Linux_x86_64.tar.gz
# Note: this was after trying several other versions (2.0.11) which crashed 30hrs into alignment run
# tar -xzf tophat-2.0.8.Linux_x86_64.tar.gz; NO make REQUIRED
# mv tophat-2.0.8.Linux_x86_64.tar.gz tophat-2.0.8.Linux_x86_64/
# ln -sf tophat-2.0.8.Linux_x86_64/tophat tophat
# edit tools.ra

# Bowtie2 install (precompiled linux version) (required by tophat)
# http://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.1.0/bowtie2-2.1.0-linux-x86_64.zip/download
# Note: this was after trying bowtie2-2.2.1, which with tophat-2.0.11 crashed 30hrs into alignment
# unzip bowtie2-2.1.0-linux-x86_64.zip; # no make required
# mv bowtie2-2.1.0-linux-x86_64.zip bowtie2-2.1.0/
# Installed the precompiled linux version
# ln -sf bowtie2-2.1.0/bowtie2-align bowtie2-align
# ln -sf bowtie2-2.1.0/bowtie2-build bowtie2-build
# ln -sf bowtie2-2.1.0/bowtie2-inspect bowtie2-inspect
# edit tools.ra

# tophat/bowtie2: making genome indexes
# Note: this was after previous tophat/bowtie2 versions and the previous gencode.v16 annotations
cd /hive/groups/encode/encode3/encValData/female.hg19/
mkdir tophatData; cd tophatData
bowtie2-build --offrate 3 -f ${EAP_REF_DIR}/female.hg19/female.hg19.fa,${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTP.fasta \
    ${EAP_REF_DIR}/female.hg19/tophatData/ERCC > ERCC_command.log 2>&1 
tail ERCC_command.log
    sideSz: 64
    sideBwtSz: 48
    sideBwtLen: 192
    numSides: 14769670
    numLines: 14769670
    ebwtTotLen: 945258880
    ebwtTotSz: 945258880
    color: 0
    reverse: 1
Total time for backward call to driver() for mirror index: 01:27:47
bowtie2-build --offrate 3 -f ${EAP_REF_DIR}/female.hg19/female.hg19.fa,${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTO.fasta \
    ${EAP_REF_DIR}/female.hg19/tophatData/WSC > WSC_command.log 2>&1
tail WSC_command.log
    sideSz: 64
    sideBwtSz: 48
    sideBwtLen: 192
    numSides: 14769358
    numLines: 14769358
    ebwtTotLen: 945238912
    ebwtTotSz: 945238912
    color: 0
    reverse: 1
Total time for backward call to driver() for mirror index: 01:33:27
mkdir annotation; cd annotation
# Because I don't know the bowtie command and tophat doesn't support creating this directly, 
# I created it with a mini run from the testing dir
tophat --no-discordant --no-mixed -p 8 -z0 --min-intron-length 20 --max-intron-length 1000000 \
           --read-mismatches 4 --read-edit-dist 4 --max-multihits 20 --library-type fr-firststrand \
           --GTF ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
           --transcriptome-index ${EAP_REF_DIR}/female.hg19/tophatData/annotation/gencode.v19 \
       --min-anchor-length 8 --splice-mismatches 0 --read-gap-length 2 \
       --mate-inner-dist 50 --mate-std-dev 20 --segment-length 25 \
       --b2-L 20 --b2-N 0 --b2-D 15 --b2-R 2 \
           ${EAP_REF_DIR}/female.hg19/tophatData/ERCC tmpR1.fq.gz tmpR2.fq.gz > $1.log 2>&1 &
cd /hive/groups/encode/encode3/encValData/male.hg19/
mkdir tophatData; cd tophatData
bowtie2-build --offrate 3 -f ${EAP_REF_DIR}/male.hg19/male.hg19.fa,${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTP.fasta \
    ${EAP_REF_DIR}/male.hg19/tophatData/ERCC > ERCC_command.log 2>&1
tail ERCC_command.log
    sideSz: 64
    sideBwtSz: 48
    sideBwtLen: 192
    numSides: 14889381
    numLines: 14889381
    ebwtTotLen: 952920384
    ebwtTotSz: 952920384
    color: 0
    reverse: 1
Total time for backward call to driver() for mirror index: 01:25:48
bowtie2-build --offrate 3 -f ${EAP_REF_DIR}/male.hg19/male.hg19.fa,${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTO.fasta \
    ${EAP_REF_DIR}/male.hg19/tophatData/WSC > WSC_command.log 2>&1
tail WSC_command.log
    sideSz: 64
    sideBwtSz: 48
    sideBwtLen: 192
    numSides: 14889069
    numLines: 14889069
    ebwtTotLen: 952900416
    ebwtTotSz: 952900416
    color: 0
    reverse: 1
Total time for backward call to driver() for mirror index: 01:28:23
mkdir annotation; cd annotation
# Because I don't know the bowtie command and tophat doesn't support creating this directly, 
# I created it with a mini run from the testing dir
tophat --no-discordant --no-mixed -p 8 -z0 --min-intron-length 20 --max-intron-length 1000000 \
           --read-mismatches 4 --read-edit-dist 4 --max-multihits 20 --library-type fr-firststrand \
           --GTF ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
           --transcriptome-index ${EAP_REF_DIR}/male.hg19/tophatData/annotation/gencode.v19 \
       --min-anchor-length 8 --splice-mismatches 0 --read-gap-length 2 \
       --mate-inner-dist 50 --mate-std-dev 20 --segment-length 25 \
       --b2-L 20 --b2-N 0 --b2-D 15 --b2-R 2 \
           ${EAP_REF_DIR}/male.hg19/tophatData/ERCC tmpR1.fq.gz tmpR2.fq.gz > $1.log 2>&1 &
# ln -sf ${EAP_REF_DIR}/male.hg19/tophatData ${EAP_REF_DIR}/hg19/tophatData
# Write tophatData/README
# Note that the first run of each tophat alignment first recreates the fa file 
# (combined genome and spike-ins) from index files.  To avoid this in the future, 
# I copied the recreated/combined fa file into the appropriate reference dir.

# tophat_bam_xsA_tag_fix.pl install (tophat alignments still need to be patched up)
# https://github.com/xweigit/xweiEncodeScripts/archive/v1.0.tar.gz
# tar -xzf xweiEncodeScripts-1.0.tar.gz; mv xweiEncodeScripts-1.0.tar.gz xweiEncodeScripts-1.0/
# ln -sf xweiEncodeScripts-1.0/tophat_bam_xsA_tag_fix.pl tophat_bam_xsA_tag_fix.pl
# edit tools.ra

###############
# makewigglefromBAM-NH.py install (Georgi script to make filtered bigWigs from bams)
# https://github.com/georgimarinov/GeorgiScripts.git
# No release!
# unzip GeorgiScripts-master.zip; mv GeorgiScripts-master.zip GeorgiScripts/
# ln -sf GeorgiScripts/makewigglefromBAM-NH.py makewigglefromBAM-NH.py
# edit tools.ra

###############
# RSEM install (quantify RNA-seq known transcript results)
# https://github.com/bli25wisc/RSEM/archive/v1.2.12.tar.gz
# tar -xzf RSEM-1.2.12.tar.gz; mv RSEM-1.2.12.tar.gz RSEM-1.2.12/; cd RSEM-1.2.12/; make
# cd ..; ln -sf RSEM-1.2.12/rsem-calculate-expression rsem-calculate-expression
# edit tools.ra

# RSEM/bowtie2: making genome indexes
cd /hive/groups/encode/encode3/encValData/female.hg19/
mkdir rsemData; cd rsemData
${EAP_TOOLS_DIR}/RSEM-1.2.12/rsem-prepare-reference --no-polyA \
             --bowtie2 --bowtie2-path ${EAP_TOOLS_DIR}/bowtie2-2.1.0 \
             --gtf ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
             ${EAP_REF_DIR}/hg19/female.hg19.fa,${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTP.fasta \
             ${EAP_REF_DIR}/female.hg19/rsemData/ERCC > ERCC_command.log 2>&1 
tail ERCC_command.log
    sideBwtSz: 48
    sideBwtLen: 192
    numSides: 1490815
    numLines: 1490815
    ebwtTotLen: 95412160
    ebwtTotSz: 95412160
    color: 0
    reverse: 1
Total time for backward call to driver() for mirror index: 00:19:40
${EAP_TOOLS_DIR}/RSEM-1.2.12/rsem-prepare-reference --no-polyA \
             --bowtie2 --bowtie2-path ${EAP_TOOLS_DIR}/bowtie2-2.1.0 \
             --gtf ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
             ${EAP_REF_DIR}/hg19/female.hg19.fa,${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTO.fasta \
             ${EAP_REF_DIR}/female.hg19/rsemData/WSC > WSC_command.log 2>&1 
tail WSC_command.log
    sideBwtSz: 48
    sideBwtLen: 192
    numSides: 1490815
    numLines: 1490815
    ebwtTotLen: 95412160
    ebwtTotSz: 95412160
    color: 0
    reverse: 1
Total time for backward call to driver() for mirror index: 00:20:30
cd /hive/groups/encode/encode3/encValData/male.hg19/
mkdir rsemData; cd rsemData
${EAP_TOOLS_DIR}/RSEM-1.2.12/rsem-prepare-reference --no-polyA \
             --bowtie2 --bowtie2-path ${EAP_TOOLS_DIR}/bowtie2-2.1.0 \
             --gtf ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
             ${EAP_REF_DIR}/hg19/male.hg19.fa,${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTP.fasta \
             ${EAP_REF_DIR}/male.hg19/rsemData/ERCC > ERCC_command.log 2>&1 
tail ERCC_command.log 
    sideBwtSz: 48
    sideBwtLen: 192
    numSides: 1495360
    numLines: 1495360
    ebwtTotLen: 95703040
    ebwtTotSz: 95703040
    color: 0
    reverse: 1
Total time for backward call to driver() for mirror index: 00:21:16
${EAP_TOOLS_DIR}/RSEM-1.2.12/rsem-prepare-reference --no-polyA \
             --bowtie2 --bowtie2-path ${EAP_TOOLS_DIR}/bowtie2-2.1.0 \
             --gtf ${EAP_REF_DIR}/hg19/gencode.v19.annotation.gtf \
             ${EAP_REF_DIR}/hg19/male.hg19.fa,${EAP_REF_DIR}/hg19/rnaSpikeIns.ENCFF001RTO.fasta \
             ${EAP_REF_DIR}/male.hg19/rsemData/WSC > WSC_command.log 2>&1 
tail WSC_command.log
    sideBwtSz: 48
    sideBwtLen: 192
    numSides: 1495360
    numLines: 1495360
    ebwtTotLen: 95703040
    ebwtTotSz: 95703040
    color: 0
    reverse: 1
Total time for backward call to driver() for mirror index: 00:20:27
# ln -sf ${EAP_REF_DIR}/male.hg19/rsemData ${EAP_REF_DIR}/hg19/rsemData
# Write rsemData/README


###############
# ChIP-seq peak callers
#
# Installation procedure followed from existing tools:
#
# mkdir tool-rel
# ln -s tool-rel tool
# cd tool
# ftp package
# uncompress and untar package

# There are 3:  SPP (Park lab, Harvard), PeakSeq (Gerstein lab, Yale), GEM (Gifford lab, MIT)

# SPP (Peter Kharchenko), modified by Anshul Kundaje for use with IDR
# http://compbio.med.harvard.edu/Supplements/ChIP-seq/tutorial.html
# https://sites.google.com/site/anshulkundaje/projects/idr#TOC-CALL-PEAKS-ON-INDIVIDUAL-REPLICATES
# https://sites.google.com/site/anshulkundaje/projects/idr#TOC-CALL-PEAKS-ON-POOLED-REPLICATES
# https://code.google.com/p/phantompeakqualtools/

# PeakSeq (Joel Rozowsky)
# http://wiki.encodedcc.org/index.php/PeakSeq
# http://info.gersteinlab.org/PeakSeq

# GEM
# http://wiki.encodedcc.org/index.php/GPS/GEM
# http://www.psrg.csail.mit.edu/gem/

mkdir gem.v2.4.1
http://www.psrg.csail.mit.edu/gem/download/gem.v2.4.1.tar.gz
cd gem

java -jar -Xmx10G gem.jar
# Example: java -Xmx10G -jar gem.jar --d Read_Distribution_default.txt --g mm8.info --genome your_path/mm8 --s 2000000000 --expt SRX000540_mES_CTCF.bed --ctrl SRX000543_mES_GFP.bed --f BED --out mouseCTCF --k_min 6 --k_max 13
# ENCODE example: java -Xmx15G -jar gem.jar --g hg19.info --d Read_Distribution_default.txt --s 2400000000 --expt wgEncodeSydhTfbsGm12878Ctcfsc15914c20StdAlnRep1.bam --expt wgEncodeSydhTfbsGm12878Ctcfsc15914c20StdAlnRep2.bam --ctrl wgEncodeSydhTfbsGm12878InputIggrabAlnRep1.bam --f SAM --out CTCF_GM12878 --genome /yourpath/hg19 --k_min 6 --k_max 13 --outNP
# For use with IDR (relaxed peaks), add -q 0

# QUESTIONS:  
#   Why both replicates in example -- is this for pooled results ? (try it)
# NOTE: -t to specify #cpus (default is number of cpu's on host?)
# Curious what is default -q value ? (significance threshold)

# required params:  --d, --exptX
# use --f SAM for BAM input
# use --outNP for narrowPeak output