# miRNA Tissue Atlas Track for hg38

##############################################################################
#  miRna tissue expression atlas from https://ccb-web.cs.uni-saarland.de/tissueatlas/
#  expression data from 61 tissues across two individuals

    # Working directory for this track build (-p: a re-run does not fail if it exists).
    mkdir -p /hive/data/genomes/hg38/bed/miRnaAtlas
    cd /hive/data/genomes/hg38/bed/miRnaAtlas

    # Fetch the atlas expression archive and unpack it into expression_data/,
    # which is the directory the later steps read data_matrix_quantile.txt from.
    # NOTE(review): assumes the zip has no top-level folder of its own -- confirm
    # with `unzip -l expression_data.zip`.
    wget https://ccb-web.cs.uni-saarland.de/tissueatlas/static/data/expression_data.zip
    mkdir -p expression_data
    unzip -d expression_data expression_data.zip

    # Count the atlas miRNA names that are absent from the hg38 wgRna table:
    # some miRNAs in this set (especially mature miRNAs) are not in the current
    # mirbase version.
    hgsql -Ne "select name from wgRna where type='miRNA'" hg38 > mirbaseV22Names.txt
    # first column of every row after the header line = atlas miRNA name
    cut -f1 expression_data/data_matrix_quantile.txt | tail -n +2 > atlasNames.txt
    # comm -23 keeps the lines that occur only in the first (atlas) list
    comm -23 <(sort atlasNames.txt) <(sort mirbaseV22Names.txt) | wc -l
    # 1364

    # use mirbase v21 gff for coordinates instead
    wget ftp://mirbase.org/pub/mirbase/21/genomes/hsa.gff3 -O mirbaseV21.gff3

    # everything is there:
    # tail -n +14 drops the first 13 lines (presumably the GFF header); turning
    # ';' and '=' into tabs splits the attribute column so that column 14 can be
    # cut out as the miRNA name.
    tail -n +14 mirbaseV21.gff3 | tr ';=' '\t\t' | cut -f14 > mirbaseV21.names
    # Compare against the name list written on the previous line; comparing
    # against a mirbaseV21Names.gff3 file that is never created cannot produce
    # the 0 recorded below.
    comm -23 <(sort atlasNames.txt) <(sort mirbaseV21.names) | wc -l
    # 0

    # get gff lines of interest, and miRNA precursor if applicable
    # After ';' and '=' become tabs the fields used are: 1=chrom 4=start 5=end
    # 6=score 7=strand 14=miRNA name 16=precursor reference (when present).
    # GFF3 start coordinates are 1-based whereas BED chromStart is 0-based, so
    # the start is decremented by one; the end is the same in both conventions.
    tail -n +14 mirbaseV21.gff3 | tr ';=' '\t\t' \
        | tawk '{print $1,$4-1,$5,$14,$6,$7,$16}' > mirbaseV21.bed6Plus

    # make autoSql file for bigBed: six standard BED fields plus five extra
    # fields. The here-doc delimiter is quoted so the text is written literally.
    cat > miRnaAtlas.as << 'EOF'
table miRnaAtlas
"BED6+5 with additional fields for category count and median values, and sample matrix fields"
    (
    string chrom;       "Reference sequence chromosome or scaffold"
    uint   chromStart;  "Start position in chromosome"
    uint   chromEnd;    "End position in chromosome"
    string name;        "Mirbase v21 name"
    uint   score;       "Score from 0-1000, typically derived from total of median value from all categories"
    char[1] strand;     "+ or - for strand. Use . if not applicable"
    string name2;       "Precursor miRna from Mirbase if applicable"
    uint expCount;      "Number of categories"
    float[expCount] expScores; "Comma separated list of category values"
    bigint _dataOffset; "Offset of sample data in data matrix file, for boxplot on details page"
    int _dataLen;       "Length of sample data row in data matrix file"
    )
EOF

    # Keep the miRBase v21 records whose line contains an atlas name as a whole
    # word (fixed-string match, patterns read from atlasNames.txt), then count.
    grep -F -w -f atlasNames.txt mirbaseV21.bed6Plus > atlasCoordinates.bed6Plus
    wc -l atlasCoordinates.bed6Plus
    # 1501
    # Some of these miRNAs are present in the genome twice, like hsa-mir-392,
    # which is present in two places with a single base mismatch.

    # expMatrixToBarchartBed makes the final barChart but needs a sample mapping
    # first: column 1 = raw header label of the matrix, column 2 = cleaned
    # "<individual>-<tissue>" name.
    head -1 expression_data/data_matrix_quantile.txt | tr '\t' '\n' > tissues.txt
    # Clean the labels: drop the trailing character and squeeze runs of dots,
    # then rev | sed | rev turns only the LAST '.' into a tab so the individual
    # number can be moved in front of the tissue; remaining dots become '_'.
    sed -e 's/\._/\./' -e 's/.$//' tissues.txt | tr -s '.' \
        | rev | sed -e 's/\./\t/' | rev \
        | tawk '{print $2,$1}' | sed -e 's/\t/-/' | tr '.' '_' > samplesCleaned2.txt
    paste tissues.txt samplesCleaned2.txt > atlasSamples2.txt
    head -2 atlasSamples2.txt
    # adipocyte.1.    1-adipocyte
    # arachnoid_mater.1.      1-arachnoid_mater

    # cleanup matrix file: prefix the header line with a "#gene" column label and
    # turn every ',' into '.' (decimal commas -> decimal points)
    sed -e '1s/^/\#gene\t/' -e 's/,/\./g' expression_data/data_matrix_quantile.txt > data_matrix_decimals.txt

    # Use the sample mapping written by the paste step above (atlasSamples2.txt);
    # no step in this doc creates an atlasSamples.txt.
    expMatrixToBarchartBed atlasSamples2.txt data_matrix_decimals.txt atlasCoordinates.bed6Plus miRnaAtlas.bed
    # this is only writing two bed lines, I believe because of duplicates.
    # fixed by commit: 87a97a71

    # turn into a bigBed: sort by chrom then numeric start, convert with the
    # autoSql written above, then print summary statistics of the result
    chromSizes=../../chrom.sizes
    sort -k1,1 -k2,2n -o miRnaAtlas.bed.sorted miRnaAtlas.bed
    bedToBigBed -as=miRnaAtlas.as -tab -type=bed6+5 miRnaAtlas.bed.sorted "$chromSizes" miRnaAtlas.bb
    bigBedInfo miRnaAtlas.bb
    # version: 4
    # fieldCount: 11
    # hasHeaderExtension: yes
    # isCompressed: yes
    # isSwapped: 0
    # extraIndexCount: 0
    # itemCount: 1,501
    # primaryDataSize: 240,503
    # primaryIndexSize: 6,388
    # zoomLevels: 10
    # chromCount: 24
    # basesCovered: 30,327
    # meanDepth (of bases covered): 1.014838
    # minDepth: 1.000000
    # maxDepth: 2.000000
    # std of depth: 0.120907

    # expose the bigBed under /gbdb
    ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/miRnaAtlas.bb /gbdb/hg38/bbi/miRnaAtlas.bb

##############################################################################
# make a composite track with one subtrack for each individual
##############################################################################

    cd /hive/data/genomes/hg38/bed/miRnaAtlas
    mkdir /hive/data/genomes/hg38/bed/miRnaAtlas/composite
    cd /hive/data/genomes/hg38/bed/miRnaAtlas/composite

    # Sorted tissue lists per individual, taken from the text after the '-' in
    # the cleaned "<individual>-<tissue>" sample names.
    grep '1-' ../samplesCleaned2.txt | cut -d - -f 2 | sort -o person1Tissues.txt
    grep '2-' ../samplesCleaned2.txt | cut -d - -f 2 | sort -o person2Tissues.txt
    # number of individual-2 tissues that match an individual-1 tissue name
    grep -c -F -w -f person1Tissues.txt person2Tissues.txt
    # 14
    wc -l person1Tissues.txt person2Tissues.txt
    # 24 person1Tissues.txt
    # 37 person2Tissues.txt
    # 61 total
    # About half of the first individual's tissues were also sampled in
    # individual 2, so 14 bars will have shared colors for easy comparison.

    # Per-individual sample mappings: once '-' becomes a tab the columns are raw
    # header label, individual, tissue; keep the label and the tissue.
    grep "1-" ../atlasSamples2.txt | tr '-' '\t' | cut -f1,3  > person1Samples.txt
    grep "2-" ../atlasSamples2.txt | tr '-' '\t' | cut -f1,3  > person2Samples.txt

    # cut matrix into person1 and person2 matrices: column 1 plus the next 24
    # columns go to person1, column 1 plus everything from column 26 to person2.
    # NOTE(review): data_matrix_decimals2.txt is not created by any step shown
    # in this doc (only data_matrix_decimals.txt is) -- confirm how it was made
    # and that its first 24 sample columns belong to individual 1.
    matrix=../data_matrix_decimals2.txt
    head -1 "$matrix" | tl
    cut -f1-25 "$matrix" > person1.matrix
    cut -f1,26- "$matrix" > person2.matrix

    # download TSI values and incorporate into the bed file
    wget https://ccb-web.cs.uni-saarland.de/tissueatlas/static/data/tsi_values.csv
    # NOTE(review): cut splits on tabs by default, so this assumes
    # tsi_values.csv is tab-separated despite its extension -- confirm.
    # join requires both inputs to be ordered on the join field alone, using the
    # same collation as join itself. Sort on exactly that one tab-delimited field
    # and pin LC_ALL=C for sort and join so the orderings always agree.
    tail -n +2 tsi_values.csv | cut -f1,5 | LC_ALL=C sort -t $'\t' -k1,1 > tsi_quantiles_person1.txt
    tail -n +2 tsi_values.csv | cut -f1,6 | LC_ALL=C sort -t $'\t' -k1,1 > tsi_quantiles_person2.txt
    LC_ALL=C sort -t $'\t' -k4,4 ../atlasCoordinates.bed6Plus > atlasSortedByName.bed
    # join prints the join field (the name) first, followed by the remaining bed
    # fields and then the TSI value; tawk moves the name back to column 4.
    LC_ALL=C join -t $'\t' -1 4 -2 1 atlasSortedByName.bed tsi_quantiles_person1.txt \
        | tawk '{print $2,$3,$4,$1,$5,$6,$7,$8}' > atlasCoordsPlusTSIPerson1.bed
    LC_ALL=C join -t $'\t' -1 4 -2 1 atlasSortedByName.bed tsi_quantiles_person2.txt \
        | tawk '{print $2,$3,$4,$1,$5,$6,$7,$8}' > atlasCoordsPlusTSIPerson2.bed

    # Build one barChart bed per individual; stdout and stderr go to the log.
    # NOTE(review): miRnaAtlasReordered.as, person1.order2/person2.order2 and
    # allTissueNames.txt are not created by any step shown in this doc --
    # confirm where they come from.
    expMatrixToBarchartBed --autoSql miRnaAtlasReordered.as --groupOrderFile person1.order2 \
        person1Samples.txt person1.matrix atlasCoordsPlusTSIPerson1.bed person1.bed1 &> person1.log2
    expMatrixToBarchartBed --autoSql miRnaAtlasReordered.as --groupOrderFile person2.order2 \
        person2Samples.txt person2.matrix atlasCoordsPlusTSIPerson2.bed person2.bed1 &> person2.log2
    # The two lists below are notes, not commands, so they are kept as comments.
    # column order person1:
    # adipocyte artery colon dura_mater kidney liver lung muscle myocardium skin spleen stomach testis thyroid small_intestine bone gallbladder fascia bladder epididymis tunica_albuginea nerve_nervus_intercostalis arachnoid_mater brain
    # column order person2:
    # adipocyte artery colon dura_mater kidney liver lung muscle myocardium skin spleen stomach testis thyroid small_intestine_duodenum small_intestine_jejunum pancreas kidney_glandula_suprarenalis kidney_cortex_renalis kidney_medulla_renalis esophagus prostate bone_marrow vein lymph_node nerve_not_specified pleura brain_pituitary_gland spinal_cord brain_thalamus brain_white_matter brain_nucleus_caudatus brain_gray_matter brain_cerebral_cortex_temporal brain_cerebral_cortex_frontal brain_cerebral_cortex_occipital brain_cerebellum

    # to get corresponding colors for trackDb: for each tissue in group order,
    # print column 2 of its allTissueNames.txt line, all on a single line
    for tissue in $(cat person1.order2); do grep -w -- "$tissue" allTissueNames.txt ; done | cut -f2 | tr '\n' ' ' ; echo
    for tissue in $(cat person2.order2); do grep -w -- "$tissue" allTissueNames.txt ; done | cut -f2 | tr '\n' ' ' ; echo

    # Sort each individual's bed by chrom then numeric start and convert it to a
    # bigBed (personN.bed1 -> sampleN.bb) using the reordered autoSql.
    chromSizes=../../../chrom.sizes
    for n in 1 2; do
        sort -k1,1 -k2,2n -o "person${n}.bed1.sorted" "person${n}.bed1"
        bedToBigBed -as=miRnaAtlasReordered.as -tab -type=bed6+6 "person${n}.bed1.sorted" "$chromSizes" "sample${n}.bb"
    done

    # get right sample mapping/matrices into gbdb for hgc pages:
    ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/composite/person1Samples.txt /gbdb/hgFixed/human/expMatrix/miRnaAtlasSample1.txt
    ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/composite/person2Samples.txt /gbdb/hgFixed/human/expMatrix/miRnaAtlasSample2.txt
    # The per-individual matrices were written above as person1.matrix and
    # person2.matrix; link those files so the /gbdb symlinks do not dangle
    # (no personNMatrix.txt is ever created). The /gbdb names stay unchanged.
    ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/composite/person1.matrix /gbdb/hgFixed/human/expMatrix/miRnaAtlasSample1Matrix.txt
    ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/composite/person2.matrix /gbdb/hgFixed/human/expMatrix/miRnaAtlasSample2Matrix.txt