#!/bin/bash
#### obtain all the genes for the 'mito' browser
###  This script will use the mito.chrom.sizes listing which defines
###  the species used in the mito browser.  It will parse the names
###  to determine if they are UCSC database browsers or GenArk browsers.
###  Then, it will check those browser build directories to find the
###  appropriate gene definitions and gather them all together
###  in this /hive/data/genomes/mito/trackData/genes/
###  directory.
###
###  Output: one <asmName>/<asmChr>.gp.gz file per sequence named in
###  mito.chrom.sizes, with column 2 (chrom) rewritten to the mito browser
###  sequence name.  Existing non-empty outputs are left alone and reported
###  as DONE, so the script can be re-run safely.
#########################################################################

genesDir="/hive/data/genomes/mito/trackData/genes"

mkdir -p "${genesDir}"
# every output path below is relative: refuse to continue from the wrong place
cd "${genesDir}" || exit 1

#########################################################################
# alreadyDone outFile
#   Returns 0 and prints a DONE line when outFile exists and is non-empty,
#   returns 1 (printing nothing) otherwise.
alreadyDone() {
  [[ -s "$1" ]] || return 1
  printf "DONE\t%s\n" "$1"
}

#########################################################################
# copyChrMFromGp newChrom outDir srcGpGz
#   From the gzipped genePred srcGpGz keep the rows whose column 2 is
#   exactly "chrM", replace the first "chrM" on each row with newChrom and
#   write the result to outDir/newChrom.gp.gz
copyChrMFromGp() {
  local newChrom="$1" outDir="$2" srcGpGz="$3"
  mkdir -p "${outDir}"
  zcat "${srcGpGz}" \
    | awk '$2 == "chrM"' \
    | sed -e "s/chrM/${newChrom}/;" \
    | gzip -c > "${outDir}/${newChrom}.gp.gz"
}

#########################################################################
# extractGenArkGenes chrName newChrom outFile gpFile...
#   Collect the unique rows of the given genePred files that contain
#   chrName as a whole word, set column 2 to newChrom and gzip to outFile.
#   (tawk is an external helper, not defined here; presumably awk with tab
#   field separators - confirm)
extractGenArkGenes() {
  local chrName="$1" newChrom="$2" outFile="$3"
  shift 3
  grep -E -h -w -- "${chrName}" "$@" \
    | sort -u | tawk -v chr="${newChrom}" '{$2=chr; print $0}' \
    | gzip -c > "${outFile}"
}

#########################################################################
# processGenArk asmChr
#   asmChr looks like GCx_nnnnnnnnnvN_<sequence name>.  The first two '_'
#   fields (with the first 'v' turned into '.') give the assembly
#   accession, the remainder is the sequence name to look for in the
#   ncbiRefSeq genePred files of the assembly hub build directory.
processGenArk() {
  local asmChr="$1"
  local asmName chrName gcX d0 d1 d2
  local buildDir asmId ncbiRefSeq refSeqGp otherGp outFile
  local -a candidates

  asmName=$(printf '%s\n' "${asmChr}" | cut -d'_' -f1-2 | sed -e 's/v/./;')
  chrName=$(printf '%s\n' "${asmChr}" | cut -d'_' -f3-)
  # GCF_012345678.1 lives under allBuild/GCF/012/345/678/
  gcX="${asmName:0:3}"
  d0="${asmName:4:3}"
  d1="${asmName:7:3}"
  d2="${asmName:10:3}"

  # Glob directly instead of parsing 'ls -d' output.  As before, anything
  # other than exactly one matching directory counts as a missing build
  # (an unmatched pattern stays literal and fails the -d test).
  candidates=( /hive/data/genomes/asmHubs/allBuild/"${gcX}"/"${d0}"/"${d1}"/"${d2}"/"${asmName}"_* )
  if [[ "${#candidates[@]}" -ne 1 || ! -d "${candidates[0]}" ]]; then
    printf "%s: GCx missing buildDir\n" "${asmName}"
    return
  fi

  buildDir=$(realpath "${candidates[0]}")
  asmId=$(basename "${buildDir}")
  ncbiRefSeq="${buildDir}/trackData/ncbiRefSeq"
  if [[ ! -d "${ncbiRefSeq}" ]]; then
    printf "%s: no ncbiRefSeq\n" "${asmName}"
    return
  fi

  outFile="${asmName}/${asmChr}.gp.gz"
  if [[ -s "${outFile}" ]]; then
    printf "DONE\t%s %s %s\n" "${asmId}" "${asmChr}" "${chrName}"
    return
  fi

  printf "%s %s %s\n" "${asmId}" "${asmChr}" "${chrName}"
  mkdir -p "${asmName}"
  refSeqGp="${ncbiRefSeq}/process/${asmId}.ncbiRefSeq.gp"
  otherGp="${ncbiRefSeq}/process/${asmId}.other.gp"

  # use whichever of the two genePred files are present and non-empty
  if [[ -s "${otherGp}" && -s "${refSeqGp}" ]]; then
    extractGenArkGenes "${chrName}" "${asmChr}" "${outFile}" \
      "${refSeqGp}" "${otherGp}"
    printf "# DBG proceess %s both\n" "${asmChr}" 1>&2
  elif [[ -s "${refSeqGp}" ]]; then
    extractGenArkGenes "${chrName}" "${asmChr}" "${outFile}" "${refSeqGp}"
    # this branch used to be labelled "other" although it reads ncbiRefSeq.gp
    printf "# DBG proceess %s ncbiRefSeq\n" "${asmChr}" 1>&2
  elif [[ -s "${otherGp}" ]]; then
    extractGenArkGenes "${chrName}" "${asmChr}" "${outFile}" "${otherGp}"
    printf "# DBG %s\n" "${asmChr}" 1>&2
  else
    printf "# can not find ncbiRefSeq.gp or other.gp\n"
  fi
}

#########################################################################
# processUcscDb asmChr
#   asmChr looks like <db>_<chrom>.  Tries the gene tables of the UCSC
#   database <db> in priority order and keeps the first one that has rows
#   for <chrom>.
processUcscDb() {
  local asmChr="$1"
  local asmName chr outFile tbl xTbl rowCount
  local -a geneTables=()

  asmName=$(printf '%s\n' "${asmChr}" | cut -d'_' -f1)
  chr=$(printf '%s\n' "${asmChr}" | cut -d'_' -f2-)

  if [[ ! -d "/hive/data/genomes/${asmName}" ]]; then
    printf "%s: a-z missing\n" "${asmName}"
    return
  fi

  # one query serves both the listing and the priority search below
  mapfile -t geneTables < <(hgsql -N -e 'show tables;' "${asmName}" | grep -i gene)

  # label goes to stderr, the table list itself to stdout (as before)
  printf "DBG %s:\t" "${asmChr}" 1>&2
  for tbl in "${geneTables[@]}"; do
    printf " %s" "${tbl}"
  done
  printf "\n"

  outFile="${asmName}/${asmChr}.gp.gz"
  # priority of gene tables
  for xTbl in ncbiRefSeq ensGene refGene augustusGene; do
    for tbl in "${geneTables[@]}"; do
      [[ "${tbl}" == "${xTbl}" ]] || continue
      mkdir -p "${asmName}"
      if alreadyDone "${outFile}"; then
        continue
      fi
      printf 'DBG select * from %s where chrom="%s"; %s\n' \
        "${tbl}" "${chr}" "${asmName}" 1>&2
      # cut -f2- drops the first table column (presumably the bin
      # column - confirm for every table in the priority list)
      hgsql -N -e "select * from ${tbl} where chrom=\"${chr}\";" "${asmName}" \
        | cut -f2- | tawk -v chr="${asmChr}" '{$2=chr; print $0}' \
        | gzip -c > "${outFile}"
      # a gzip of nothing is still a non-empty file: count the rows and
      # discard an empty result so the next table in the list gets a turn
      rowCount=$(zgrep -c . "${outFile}")
      if [[ "${rowCount}" == "0" ]]; then
        rm -f "${outFile}"
      fi
    done
  done
}

#########################################################################
# main: one pass over the sequence names in column 1 of mito.chrom.sizes
mapfile -t asmChrList < <(cut -f1 ../../mito.chrom.sizes)

for asmChr in "${asmChrList[@]}"; do
  case "${asmChr}" in
    GC*)
      processGenArk "${asmChr}"
      ;;
    mm10_chrM)
      if ! alreadyDone "mm10/${asmChr}.gp.gz"; then
        printf "%s\n" "mm10/${asmChr}.gp.gz"
        copyChrMFromGp "${asmChr}" mm10 \
          /hive/data/genomes/mm10/bed/ncbiRefSeq.p6.2021-04-22/process/mm10.curated.gp.gz
      fi
      ;;
    mm39_chrM)
      # this test used to look in hg39/, which never exists, so the
      # file was rebuilt on every run
      if ! alreadyDone "mm39/${asmChr}.gp.gz"; then
        copyChrMFromGp "${asmChr}" mm39 \
          /hive/data/genomes/mm39/bed/ncbiRefSeq.2023-04-19/process/mm39.other.gp.gz
      fi
      ;;
    hg19a_chrM)
      if ! alreadyDone "hg19/${asmChr}.gp.gz"; then
        mkdir -p hg19
        hgsql -N -e 'select * from ensGene where chrom="chrM";' hg19 \
          | cut -f2- | sed -e "s/chrM/${asmChr}/;" \
          | gzip -c > "hg19/${asmChr}.gp.gz"
      fi
      ;;
    hg19b_chrMT)
      # NOTE(review): reads the hg38 genePred for the hg19 chrMT sequence;
      # presumably intentional (same mitochondrial sequence) - confirm
      if ! alreadyDone "hg19/${asmChr}.gp.gz"; then
        copyChrMFromGp "${asmChr}" hg19 \
          /hive/data/genomes/hg38/bed/ncbiRefSeq.p14.2023-03-29/process/hg38.other.gp.gz
      fi
      ;;
    hg38_chrM)
      if ! alreadyDone "hg38/${asmChr}.gp.gz"; then
        copyChrMFromGp "${asmChr}" hg38 \
          /hive/data/genomes/hg38/bed/ncbiRefSeq.p14.2023-03-29/process/hg38.other.gp.gz
      fi
      ;;
    [a-z]*)
      # any other lower-case name is taken to be <db>_<chrom>
      processUcscDb "${asmChr}"
      ;;
  esac
done

exit $?
9 refGene 23 augustusGene 9 ensGene 9 ensemblToGeneName 3 geneName 2 geneid 1 mgcGenes 5 ncbiRefSeq 5 no 1 nscanGene 1 sgpGene 1 transMapAlnUcscGenes 1 transMapInfoUcscGenes 19 xenoRefGene