# for emacs: -*- mode: sh; -*-

# Use the NCBI taxonomy database and hgcentraltest.dbDb to construct a tree
# of the species offered in the GB, encoded as a JS file.

# When this is stable, we should consider folding it into makeGenomeDb.pl.

# NOTE: the "set date = ..." assignments below are csh/tcsh syntax.  Under
# sh/bash that form sets the positional parameters and leaves $date unset;
# use date=... there instead.

#############################################################################
# FETCH LATEST NCBI TAXONOMY (DONE 4/18/16 angie)
set date = `date +%Y_%m_%d`
mkdir -p /hive/data/outside/ncbi/taxonomy/$date
cd /hive/data/outside/ncbi/taxonomy/$date
wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar xvzf taxdump.tar.gz

#############################################################################
# BUILD DBDBTAXONOMY.JS (DONE 4/18/16 angie)
# Find latest NCBI Taxonomy dump directory and set its name here:
set date = 2016_04_18
cd /hive/data/outside/ncbi/taxonomy/$date

# Dump some dbDb columns for active databases, filtering out experimental
# databases that use 0 or 1 as a placeholder taxId:
hgsql hgcentraltest \
  -NBe 'select taxId,name,genome,scientificName from dbDb where active=1 and taxId > 1' \
  | sort > dbDb.taxId.alphSort.txt

# Look for dbDb.taxId values that are out of date.
# Sort before de-duplicating (sort -u) so that repeated taxIds are removed
# even when they are not on adjacent lines.
cut -f 1 dbDb.taxId.alphSort.txt | sort -u > taxId.alphSort.txt
sort names.dmp > names.alphSort.txt
# join -a 1 also prints taxIds that have no match in names.dmp; grep -v '|'
# then keeps only lines without a '|' (presumably the unmatched ones, since
# joined lines carry the '|' separators from names.dmp).
join -a 1 taxId.alphSort.txt names.alphSort.txt | grep -v '|'
# If there is any output, it is a list of outdated taxIds that are not in names.dmp.
# They might appear in merged.dmp or delnodes.dmp.
# If any outdated taxIds appear, then find out what the latest taxIds are by searching
# for the dbDb.scientificName values in names.dmp.
# Then update dbDb.taxId values and start over.

# When everything looks good, make a file with just taxId and genome.
# Filter out some wacky experimental dbs that also use Human & Mouse's taxIds.
# Baboon has two species with one dbDb.genome value; add species to disambiguate.
# Use full-line uniq instead of sort -k -u because we want to find lines that have
# the same genome, just different taxIds.
# Pipeline stages: keep the taxId and genome columns; drop lines matching the
# listed experimental-db and clade names; relabel the two Baboon taxIds and
# Zaire ebolavirus; then sort by genome and remove exact duplicate lines.
cut -f 1,3 dbDb.taxId.alphSort.txt | uniq \
  | grep -Ev 'GRCh38.p|GRCm38.p|GRCh37.p|Venter|hg19Haplotypes|tarInv|Mm10haps' \
  | grep -Ev 'Catarrhini|Euarchontoglires|Eutheria|Glires|Laurasiatheria' \
  | sed -re 's/^9555\tBaboon$/9555\tBaboon (anubis)/; s/^9562\tBaboon$/9562\tBaboon (hamadryas)/; s/Zaire ebolavirus/Ebola virus/;' \
  | sort -k2 | uniq > taxIdGenome.txt

# Compare this result with the previous ../someDate/taxIdGenome.txt
# to verify it isn't going crazy.

# Look in taxIdGenome.rr.txt for cases of same genome value, different species
# (aside from Baboon).
# NOTE(review): this doc never creates taxIdGenome.rr.txt; the checks below
# read taxIdGenome.txt -- confirm which file was meant.
# The first check counts lines per taxId, so a count above 1 means one taxId
# carries more than one genome value.  Example failure:
cut -f1 taxIdGenome.txt | sort | uniq -c | sort -rn | head
#      2 186538
# That was two names for the same thing: Zaire ebolavirus == Ebola virus

# There are cases where there are two entries for the same name with
# different taxIds:
cut -f2 taxIdGenome.txt | sort | uniq -c | sort -rn | head
#      2 S. cerevisiae
#      2 D. pseudoobscura
#      2 A. gambiae
# This seems to be harmless and the taxIds cannot be reconciled; they
# are actually different assemblies for the same species.

# Run a script to create the Javascript file:
~/kent/src/hg/utils/dbDbTaxonomy.pl taxIdGenome.txt nodes.dmp names.dmp \
  > ~/kent/src/hg/js/dbDbTaxonomy.js

# Look at the git diff:
cd ~/kent/src/hg/js
git diff dbDbTaxonomy.js
# Verify that changes are as expected (new genome?).
# git commit and push

#############################################################################