# With the release of Ensembl 100, idKeys for Ensembl assemblies
# were calculated in:
#   /hive/data/outside/ensembl/genomes/release-100/
#
# New assemblies have been built for genbank, refseq and ucsc
#
# Working in:

mkdir /hive/data/inside/assemblyEquivalence/2020-06-15
cd /hive/data/inside/assemblyEquivalence/2020-06-15

mkdir ensembl refseq genbank ucsc

# keySigRow sigFile id
#   Print one tab-separated row on stdout:
#     <contents of sigFile>  <id>  <line count>  <unique column-1 count>
#   where both counts are taken from <id>.idKeys.txt located in the same
#   directory as sigFile.
#     sigFile - path to an <id>.keySignature.txt file
#     id      - assembly / database name used to locate <id>.idKeys.txt
keySigRow() {
  local sigFile="$1"
  local id="$2"
  local idKeys fullCount uniqueCount keySig
  idKeys="$(dirname "${sigFile}")/${id}.idKeys.txt"
  fullCount=$(wc -l < "${idKeys}")
  uniqueCount=$(cut -f1 "${idKeys}" | sort -u | wc -l)
  keySig=$(cat "${sigFile}")
  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
}

# hubKeySigs topDir
#   For every directory exactly four levels below topDir, print a keySigRow
#   when <dir>/idKeys/<dirName>.keySignature.txt exists and is non-empty.
#   Directories without such a file are skipped silently.
hubKeySigs() {
  local topDir="$1"
  local buildDir asmId sigFile
  find "${topDir}" -maxdepth 4 -mindepth 4 -type d \
    | while IFS= read -r buildDir; do
        asmId=$(basename "${buildDir}")
        sigFile="${buildDir}/idKeys/${asmId}.keySignature.txt"
        if [ -s "${sigFile}" ]; then
          keySigRow "${sigFile}" "${asmId}"
        fi
      done
}

########### Ensembl keys ##################

cd ensembl

rsync -av --stats \
  rsync://ftp.ensembl.org/ensembl/pub/release-100/species_EnsemblVertebrates.txt \
  ./

# Same procedure for release 100 and release 99; each release gets its own
# ensembl-<release>.keySignatures.txt, sorted on the signature column.
# -name plus the basename suffix strip select exactly the
# <id>.keySignature.txt files, instead of a loose grep/sed pattern match.
for release in 100 99; do
  find "/hive/data/outside/ensembl/genomes/release-${release}/idKeys" \
      -type f -name '*.keySignature.txt' \
    | while IFS= read -r K; do
        keySigRow "${K}" "$(basename "${K}" .keySignature.txt)"
      done \
    | sort -k1,1 > "ensembl-${release}.keySignatures.txt"
done

# Merge both releases, dropping rows that are identical in each.
sort -u ensembl-100.keySignatures.txt ensembl-99.keySignatures.txt \
  > ensembl.keySignatures.txt

########### refseq keys ##################

cd ../refseq

hubKeySigs /hive/data/genomes/asmHubs/refseqBuild/GCF \
  | sort -k1,1 > refseq.keySignatures.txt

########### genbank keys ##################

cd ../genbank

hubKeySigs /hive/data/genomes/asmHubs/genbankBuild/GCA \
  | sort -k1,1 > genbank.keySignatures.txt

########### UCSC keys ##################

cd ../ucsc

# Glob the database directories directly rather than parsing ls output.
# Entries without a non-empty bed/idKeys/<db>.keySignature.txt are skipped.
for dbDir in /hive/data/genomes/*; do
  db=$(basename "${dbDir}")
  keySig="${dbDir}/bed/idKeys/${db}.keySignature.txt"
  if [ -s "${keySig}" ]; then
    keySigRow "${keySig}" "${db}"
  fi
done | sort -k1,1 > ucsc.keySignatures.txt

cd /hive/data/inside/assemblyEquivalence/2020-06-15

ln -s /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/exact.sh .
########### Exact matches ##################

# NOTE(review): line breaks in these notes were reconstructed; exact.sh is
# assumed to run on its own before exactTableTsv.pl - confirm.
./exact.sh

~/kent/src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl \
  | sort > table.2020-06-16.tsv

########### Near matches ##################

mkdir /hive/data/inside/assemblyEquivalence/2020-06-15/notExact
cd /hive/data/inside/assemblyEquivalence/2020-06-15/notExact

ln -s /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/runOne ./

time ~/kent/src/hg/makeDb/doc/assemblyEquivalence/allByAll.sh
# real 10m22.694s

# Strip everything from "<tab>count A " to end of line in each result file
# and collect the remainder into one table.
sed -e 's/\tcount A .*//;' results/match.*.txt > notExact.table.2020-06-16.tsv

########### Table contents to load ##################

# Near matches plus the exact-match table from the parent directory,
# de-duplicated.
sort -u notExact.table.2020-06-16.tsv ../table.2020-06-16.tsv \
  > hgFixed.asmEquivalent.tsv

### Compare to existing:
hgsql -N -e 'select * from asmEquivalent;' hgFixed | sort \
  > existing.2020-06-16.tsv

# NOTE(review): join, as invoked below, pairs lines on the first field only,
# so the counts are per-key pairings rather than whole-line comparisons;
# comm(1) would compare entire lines.

### most should be identical to before
join hgFixed.asmEquivalent.tsv existing.2020-06-16.tsv | wc
#   4725 61165 645508

### probably should *not* be losing any from before:
join -v 2 hgFixed.asmEquivalent.tsv existing.2020-06-16.tsv | wc -l
#   1

# This single one is due to an update for a genbank assembly that
# now matches exactly where it was a near match before
#   GCA_004886185.2_ASM488618v2 vs: GCA_004886185.1_Basenji_breed-1.1

### there should be some new ones
join -v 1 hgFixed.asmEquivalent.tsv existing.2020-06-16.tsv | wc
#   139 973 12459

#### To load up new table contents:
hgLoadSqlTab hgFixed asmEquivalent ~/kent/src/hg/lib/asmEquivalent.sql \
  hgFixed.asmEquivalent.tsv

# Dump the freshly loaded table and compare its size with the previous one.
hgsql -N -e 'select * from asmEquivalent;' hgFixed \
  | sort > updated.2020-06-16.tsv

wc -l updated.2020-06-16.tsv existing.2020-06-16.tsv
#   2320 updated.2020-06-16.tsv
#   2118 existing.2020-06-16.tsv