# miRNA Tissue Atlas Track for hg38
##############################################################################
# miRNA tissue expression atlas from https://ccb-web.cs.uni-saarland.de/tissueatlas/
# Expression data from 61 tissues across two individuals.
#
# Overview of what the commands below do:
#   1. download the expression matrix and take coordinates from the miRBase v21 GFF3
#   2. build one barChart bigBed covering all 61 samples (miRnaAtlas.bb)
#   3. build a composite with one barChart bigBed per individual (sample1.bb, sample2.bb)
#
# NOTE(review): `tawk` is not defined in this file; presumably the usual
# awk -F'\t' -v OFS='\t' alias -- confirm it is available before re-running.

mkdir -p /hive/data/genomes/hg38/bed/miRnaAtlas
cd /hive/data/genomes/hg38/bed/miRnaAtlas
wget https://ccb-web.cs.uni-saarland.de/tissueatlas/static/data/expression_data.zip
# Unpack into expression_data/, the directory every later step reads from.
# (This used to unpack into expressionData/, which no other command referenced.)
# NOTE(review): assumes data_matrix_quantile.txt sits at the top level of the
# archive -- TODO confirm.
mkdir -p expression_data
unzip -d expression_data expression_data.zip

# some miRNAs (especially mature miRNAs) from this set are not in the current mirbase version
hgsql -Ne "select name from wgRna where type='miRNA'" hg38 > mirbaseV22Names.txt
tail -n +2 expression_data/data_matrix_quantile.txt | cut -f1 > atlasNames.txt
comm -23 <(sort atlasNames.txt) <(sort mirbaseV22Names.txt) | wc -l
# 1364

# use mirbase v21 gff for coordinates instead
wget ftp://mirbase.org/pub/mirbase/21/genomes/hsa.gff3 -O mirbaseV21.gff3
# everything is there:
# (tail -n +14 skips the first 13 lines of the gff3; splitting the attribute
# column on ';' and '=' leaves the miRNA name in field 14)
tail -n +14 mirbaseV21.gff3 | tr ';' '\t' | tr '=' '\t' | cut -f14 > mirbaseV21.names
# compare against the names file written on the previous line
# (this used to read mirbaseV21Names.gff3, which is never created)
comm -23 <(sort atlasNames.txt) <(sort mirbaseV21.names) | wc -l
# 0

# get gff lines of interest, and miRNA precursor if applicable
# Output columns: chrom, chromStart, chromEnd, name ($14), score ($6),
# strand ($7), name2 ($16, the precursor when present).
# GFF3 starts are 1-based inclusive while BED starts are 0-based, hence $4-1;
# the end coordinate is the same number in both conventions.
tail -n +14 mirbaseV21.gff3 | tr ';' '\t' | tr '=' '\t' \
    | tawk '{print $1,$4-1,$5,$14,$6,$7,$16}' > mirbaseV21.bed6Plus

# make autoSql file for bigBed:
cat << EOF > miRnaAtlas.as
table miRnaAtlas
"BED6+5 with additional fields for category count and median values, and sample matrix fields"
    (
    string chrom;       "Reference sequence chromosome or scaffold"
    uint chromStart;    "Start position in chromosome"
    uint chromEnd;      "End position in chromosome"
    string name;        "Mirbase v21 name"
    uint score;         "Score from 0-1000, typically derived from total of median value from all categories"
    char[1] strand;     "+ or - for strand. Use . if not applicable"
    string name2;       "Precursor miRna from Mirbase if applicable"
    uint expCount;      "Number of categories"
    float[expCount] expScores; "Comma separated list of category values"
    bigint _dataOffset; "Offset of sample data in data matrix file, for boxplot on details page"
    int _dataLen;       "Length of sample data row in data matrix file"
    )
EOF

# keep only the coordinate lines whose name appears in the atlas
grep -Fwf atlasNames.txt mirbaseV21.bed6Plus > atlasCoordinates.bed6Plus
wc -l atlasCoordinates.bed6Plus
# 1501
# some of these miRNAs are present in the genome twice, like hsa-mir-392, which is present
# in two places with a single base mismatch

# use expMatrixToBarchartBed to make final barChart, first we need a sample mapping:
# tissues.txt holds the raw matrix column names; samplesCleaned2.txt holds the
# matching "<individual>-<tissue>" labels, one per line in the same order.
head -1 expression_data/data_matrix_quantile.txt | tr '\t' '\n' > tissues.txt
head -1 expression_data/data_matrix_quantile.txt | tr '\t' '\n' | \
    sed -e 's/\._/\./' | sed -e 's/.$//' | tr -s '.' | rev | sed -e 's/\./\t/' | rev | \
    tawk '{print $2,$1}' | sed -e 's/\t/-/' | tr '.' '_' > samplesCleaned2.txt
paste tissues.txt samplesCleaned2.txt > atlasSamples2.txt
head -2 atlasSamples2.txt
# adipocyte.1.	1-adipocyte
# arachnoid_mater.1.	1-arachnoid_mater

# cleanup matrix file
# (prepend "#gene" to the header row and turn decimal commas into decimal points)
sed -e '1s/^/\#gene\t/' expression_data/data_matrix_quantile.txt | sed -e 's/,/\./g' > data_matrix_decimals.txt
# use the sample mapping built above
# (this used to read atlasSamples.txt, which is never created)
expMatrixToBarchartBed atlasSamples2.txt data_matrix_decimals.txt atlasCoordinates.bed6Plus miRnaAtlas.bed
# this is only writing two bed lines, I believe because of duplicates.
# fixed by commit: 87a97a71

# turn into a bigBed
chromSizes=../../chrom.sizes
sort -k1,1 -k2,2n miRnaAtlas.bed > miRnaAtlas.bed.sorted
bedToBigBed -as=miRnaAtlas.as -tab -type=bed6+5 miRnaAtlas.bed.sorted "$chromSizes" miRnaAtlas.bb
bigBedInfo miRnaAtlas.bb
# Output recorded from the original run, made before chromStart was converted
# to 0-based, so basesCovered and the depth statistics will differ slightly:
# version: 4
# fieldCount: 11
# hasHeaderExtension: yes
# isCompressed: yes
# isSwapped: 0
# extraIndexCount: 0
# itemCount: 1,501
# primaryDataSize: 240,503
# primaryIndexSize: 6,388
# zoomLevels: 10
# chromCount: 24
# basesCovered: 30,327
# meanDepth (of bases covered): 1.014838
# minDepth: 1.000000
# maxDepth: 2.000000
# std of depth: 0.120907
ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/miRnaAtlas.bb /gbdb/hg38/bbi/miRnaAtlas.bb

##############################################################################
# make composite track, one subtrack for each individual
##############################################################################
cd /hive/data/genomes/hg38/bed/miRnaAtlas
mkdir -p /hive/data/genomes/hg38/bed/miRnaAtlas/composite
cd /hive/data/genomes/hg38/bed/miRnaAtlas/composite

# tissue lists per individual, taken from the "<individual>-<tissue>" labels
grep '1-' ../samplesCleaned2.txt | cut -d'-' -f2 | sort > person1Tissues.txt
grep '2-' ../samplesCleaned2.txt | cut -d'-' -f2 | sort > person2Tissues.txt
grep -Fwf person1Tissues.txt person2Tissues.txt | wc -l
# 14
wc -l person1Tissues.txt person2Tissues.txt
# 24 person1Tissues.txt
# 37 person2Tissues.txt
# 61 total
# about half of the first individual's tissues were also sampled in individual 2,
# so 14 bars will have shared colors for easy comparison

# per-individual sample mappings: raw matrix column name <tab> tissue
grep "2-" ../atlasSamples2.txt | tr '-' '\t' | cut -f1,3 > person2Samples.txt
grep "1-" ../atlasSamples2.txt | tr '-' '\t' | cut -f1,3 > person1Samples.txt

# cut matrix into person1 and person2 matrices:
# (reads the cleaned matrix written above; this used to read
# ../data_matrix_decimals2.txt, which is never created)
# NOTE(review): `tl` is not defined in this file; presumably a personal alias
# for eyeballing the header columns. The line is inspection only.
head -1 ../data_matrix_decimals.txt | tl
# column 1 is the miRNA name, columns 2-25 are the 24 individual-1 samples,
# columns 26 onward are the individual-2 samples
cut -f1-25 ../data_matrix_decimals.txt > person1.matrix
cut -f1,26- ../data_matrix_decimals.txt > person2.matrix

# download TSI values and incorporate into the bed file
wget https://ccb-web.cs.uni-saarland.de/tissueatlas/static/data/tsi_values.csv
# NOTE(review): cut splits on tabs by default, so this assumes tsi_values.csv is
# tab-separated despite its extension -- TODO confirm.
# Both join inputs are sorted on exactly the tab-delimited join field, which is
# what join expects.
tail -n +2 tsi_values.csv | cut -f1,5 | sort -t $'\t' -k1,1 > tsi_quantiles_person1.txt
tail -n +2 tsi_values.csv | cut -f1,6 | sort -t $'\t' -k1,1 > tsi_quantiles_person2.txt
sort -t $'\t' -k4,4 ../atlasCoordinates.bed6Plus > atlasSortedByName.bed
# join prints the join field (the name) first, so tawk restores bed order and
# leaves the TSI value as the last column
join -t $'\t' -1 4 -2 1 atlasSortedByName.bed tsi_quantiles_person1.txt \
    | tawk '{print $2,$3,$4,$1,$5,$6,$7,$8}' > atlasCoordsPlusTSIPerson1.bed
join -t $'\t' -1 4 -2 1 atlasSortedByName.bed tsi_quantiles_person2.txt \
    | tawk '{print $2,$3,$4,$1,$5,$6,$7,$8}' > atlasCoordsPlusTSIPerson2.bed

# NOTE(review): miRnaAtlasReordered.as, person1.order2 and person2.order2 are
# not created anywhere in this doc; they must already exist in this directory.
expMatrixToBarchartBed --autoSql miRnaAtlasReordered.as --groupOrderFile person1.order2 person1Samples.txt person1.matrix atlasCoordsPlusTSIPerson1.bed person1.bed1 &> person1.log2
expMatrixToBarchartBed --autoSql miRnaAtlasReordered.as --groupOrderFile person2.order2 person2Samples.txt person2.matrix atlasCoordsPlusTSIPerson2.bed person2.bed1 &> person2.log2
# column order person1:
#   adipocyte artery colon dura_mater kidney liver lung muscle myocardium skin
#   spleen stomach testis thyroid small_intestine bone gallbladder fascia bladder
#   epididymis tunica_albuginea nerve_nervus_intercostalis arachnoid_mater brain
# column order person2:
#   adipocyte artery colon dura_mater kidney liver lung muscle myocardium skin
#   spleen stomach testis thyroid small_intestine_duodenum small_intestine_jejunum
#   pancreas kidney_glandula_suprarenalis kidney_cortex_renalis
#   kidney_medulla_renalis esophagus prostate bone_marrow vein lymph_node
#   nerve_not_specified pleura brain_pituitary_gland spinal_cord brain_thalamus
#   brain_white_matter brain_nucleus_caudatus brain_gray_matter
#   brain_cerebral_cortex_temporal brain_cerebral_cortex_frontal
#   brain_cerebral_cortex_occipital brain_cerebellum

# to get corresponding colors for trackDb:
# NOTE(review): allTissueNames.txt is not created in this doc; the loops print
# its second column for each tissue, in group order -- confirm its source.
for tissue in $(cat person1.order2); do grep -w "$tissue" allTissueNames.txt; done | cut -f2 | tr '\n' ' '; echo
for tissue in $(cat person2.order2); do grep -w "$tissue" allTissueNames.txt; done | cut -f2 | tr '\n' ' '; echo

chromSizes=../../../chrom.sizes
sort -k1,1 -k2,2n person1.bed1 > person1.bed1.sorted
sort -k1,1 -k2,2n person2.bed1 > person2.bed1.sorted
bedToBigBed -as=miRnaAtlasReordered.as -tab -type=bed6+6 person1.bed1.sorted "$chromSizes" sample1.bb
bedToBigBed -as=miRnaAtlasReordered.as -tab -type=bed6+6 person2.bed1.sorted "$chromSizes" sample2.bb

# get right sample mapping/matrices into gbdb for hgc pages:
ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/composite/person1Samples.txt /gbdb/hgFixed/human/expMatrix/miRnaAtlasSample1.txt
ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/composite/person2Samples.txt /gbdb/hgFixed/human/expMatrix/miRnaAtlasSample2.txt
# link the matrices under the names they were written with above
# (these used to point at person1Matrix.txt / person2Matrix.txt, which are never
# created, leaving dangling symlinks); the /gbdb names are unchanged
ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/composite/person1.matrix /gbdb/hgFixed/human/expMatrix/miRnaAtlasSample1Matrix.txt
ln -s /hive/data/genomes/hg38/bed/miRnaAtlas/composite/person2.matrix /gbdb/hgFixed/human/expMatrix/miRnaAtlasSample2Matrix.txt