# quakeExpr DONE Chris Eisenhart 2017-06-06 
mkdir /hive/data/genomes/hg38/bed/quakeBrainExpression
cd /hive/data/genomes/hg38/bed/quakeBrainExpression

# Go here and download the accession list and runInfo list 
# https://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP057196
# Use awk to turn it into a list of fastq dump commands
cat SRR_Acc_List.txt | awk '{print "~/sratoolkit.2.8.0-centos_linux64/bin/fastq-dump "$1" --gzip --split-files"}' > commandList.txt
chmod 755 commandList.txt
./commandList.txt 
# This has a nasty habit of downloading .sra files to a folder in the home dir ~/ncbi/public/sra/
# these can be removed.  In this case these data are in our system several times already for CIRM

# Get the transcriptome John Vivian used for the Toil recompute. 
sftp ceisenha@cirm-01:/data/cirm/annotation/humanTranscriptome/Homo_sapiens.GRCh38.rel79.cdna.all.fa.index .

# Make a command list and run kallisto on the paired end samples
# Could use parasol here, just make a joblist and pass to parasol instead of 
# running the command list.  
cat SRR_Acc_List.txt | awk '{print "~/bin/x86_64/kallisto quant -i Homo_sapiens.GRCh38.rel79.cdna.all.fa.index -o kallistoOut/"$1" "$1"_1.fastq.gz "$1"_2.fastq.gz"}' > commandList2.txt
chmod 755 commandList2.txt
./commandList2.txt 

# Merge the tpm files into a tpm matrix and gpm matrix
cd kallistoOut
kallistoToMatrix . quakeMatrix -t 20

# Download the meta data file, convert to tagStorm then to tab
wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE67nnn/GSE67835/soft/GSE67835_family.soft.gz
gunzip GSE67835_family.soft.gz 
~/kent/src/tagStorm/softToTagStorm.py GSE67835_family.soft GSE67835_family.tags
tagStormToTab GSE67835_family.tags GSE67835_family.tab

# Generate the sample list, needs SRR names and the cell type which are stored in 
# two different files.  Also we want to throw out the fetal samples.  
echo "#meta" > cleanQuakeSamples.tab
join <(cut -f 62,79 GSE67835_family.tab | awk '{print $2"\t"$1}') <(cut -f 8,10 SraRunTable.txt | sort -k2 | awk '{print $2"\t"$1}') | awk '{print $3"\t"$2}' | grep -v fetal >> cleanQuakeSamples.tab

#head -4 cleanQuakeSamples.tab
#meta	biosample_cell_type
#SRR1974543	oligodendrocytes
#SRR1974544	hybrid
#SRR1974545	oligodendrocytes

# Go through and make the cell types capitalized (fits with the Gtex tissue convention)
#head -4 cleanQuakeSamples.tab
#meta	biosample_cell_type
#SRR1974543	Oligodendrocytes
#SRR1974544	Hybrid
#SRR1974545	Oligodendrocytes


# Clean up the matrices removing the fetal samples.  
rowsToCols kallistoOut/quakeMatrix.tpm.tab tspsdQuakeTranscMatrix.tab
join <(cut -f 1 cleanQuakeSamples.tab | sed 's/meta/transcript/g' | sort) <(sort tspsdQuakeTranscMatrix.tab) > cleanTspsdQuakeTranscMatrix.tab
rowsToCols cleanTspsdQuakeTranscMatrix.tab cleanQuakeTranscMatrix.tab

rowsToCols kallistoOut/quakeMatrix.geneTpm.tab tspsdQuakeGeneMatrix.tab
join <(cut -f 1 cleanQuakeSamples.tab | sed 's/meta/gene/g' | sort) <(sort tspsdQuakeGeneMatrix.tab) > cleanTspsdQuakeGeneMatrix.tab
rowsToCols cleanTspsdQuakeGeneMatrix.tab cleanQuakeGeneMatrix.tab

# Remove the 'meta' to make the script happy. 
sed -i '1D' cleanQuakeSamples.tab 

# Make up the coordinate files that map the transcript/gene names to coordinates
hgsql hg38 -e "select name,chrom,strand,txStart,txEnd from ensGene" > transcDump.tab
hgsql hg38 -e "select name2,chrom,strand,txStart,txEnd from ensGene" > geneDump.tab
cp /hive/data/outside/gencode/release_22/transToGene.tab .
cut -f 3 transToGene.tab > hugoGenes.txt 
cut -f 1 transToGene.tab | cut -f 1 -d "." > trans1.txt
cut -f 2 transToGene.tab | cut -f 1 -d "." > gene1.txt
paste trans1.txt hugoGenes.txt > transcToHugoGenes.tab
paste gene1.txt hugoGenes.txt > geneToHugoGenes.tab
join <(sort transcToHugoGenes.tab) <(sort transcDump.tab) | awk '{print $3"\t"$5"\t"$6"\t"$1"\t0\t"$4"\t"$2}' > transcCoords.bed6+1.tab 
join <(sort geneToHugoGenes.tab) <(sort geneDump.tab) | awk '{print $3"\t"$5"\t"$6"\t"$1"\t0\t"$4"\t"$2}' | sort -k4,4 -u > geneCoords.bed6+1.tab 

# Generate a bed 6+5 barChart file.
time expMatrixToBarchartBed cleanQuakeSamples.tab cleanQuakeTranscMatrix.tab transcCoords.bed6+1.tab quakeTranscExpr.bed 
sort -k1,1 -k2,2n quakeTranscExpr.bed > sortedQuakeTranscExpr.bed
bedToBigBed -as=$HOME/kent/src/hg/lib/barChartTranscExp.as -type=bed6+5 sortedQuakeTranscExpr.bed /hive/data/genomes/hg38/chrom.sizes quakeTranscExpr.bb

time expMatrixToBarchartBed cleanQuakeSamples.tab cleanQuakeGeneMatrix.tab geneCoords.bed6+1.tab quakeGeneExpr.bed 
sort -k1,1 -k2,2n quakeGeneExpr.bed > sortedQuakeGeneExpr.bed
bedToBigBed -as=$HOME/kent/src/hg/lib/barChartGeneExp.as -type=bed6+5 sortedQuakeGeneExpr.bed /hive/data/genomes/hg38/chrom.sizes quakeGeneExpr.bb

cd /gbdb/hgFixed/human/expMatrix/
ln -s /hive/data/genomes/hg38/bed/quakeBrainExpression/cleanQuakeTranscMatrix.tab .
ln -s /hive/data/genomes/hg38/bed/quakeBrainExpression/cleanQuakeGeneMatrix.tab .
ln -s /hive/data/genomes/hg38/bed/quakeBrainExpression/cleanQuakeSamples.tab .
ln -s /hive/data/genomes/hg38/bed/quakeBrainExpression/quakeTranscExpr.bb .
ln -s /hive/data/genomes/hg38/bed/quakeBrainExpression/quakeGeneExpr.bb .

# Get the colors for the barcharts from Figure 1:D of the publication, I used an online
# color to hex converter to get each color's hex code from the paper.  There are a bunch
# of these tools and all provide the same results, http://html-color-codes.info/colors-from-image/