idKeys for Ensembl assemblies were calculated in: /hive/data/outside/ensembl/genomes/release-104/ New assemblies have been built for genbank, refseq and ucsc Working in: mkdir /hive/data/inside/assemblyEquivalence/2021-05-11 cd /hive/data/inside/assemblyEquivalence/2021-05-11 mkdir ensembl refseq genbank ucsc ########### refseq keys ################## cd refseq find /hive/data/genomes/asmHubs/refseqBuild/GCF \ -maxdepth 4 -mindepth 4 -type d | while read buildDir do asmId=`basename "${buildDir}"` keySig="${buildDir}/idKeys/${asmId}.keySignature.txt" idKeysDir=`dirname "${keySig}"` if [ -s "${keySig}" ]; then idKeys="$idKeysDir/$asmId.idKeys.txt" fullCount=`cat $idKeys | wc -l` uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${keySig}"` printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}" fi done | sort -k1,1 > refseq.keySignatures.txt ########### genbank keys ################## cd ../genbank find /hive/data/genomes/asmHubs/genbankBuild/GCA \ -maxdepth 4 -mindepth 4 -type d | while read buildDir do asmId=`basename "${buildDir}"` keySig="${buildDir}/idKeys/${asmId}.keySignature.txt" idKeysDir=`dirname "${keySig}"` if [ -s "${keySig}" ]; then idKeys="$idKeysDir/$asmId.idKeys.txt" fullCount=`cat $idKeys | wc -l` uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${keySig}"` printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}" fi done | sort -k1,1 > genbank.keySignatures.txt ########### UCSC keys ################## cd ../ucsc ls -d /hive/data/genomes/* | while read dbDir do db=`basename "${dbDir}"` keySig="${dbDir}/bed/idKeys/${db}.keySignature.txt" idKeysDir=`dirname "${keySig}"` if [ -s "${keySig}" ]; then idKeys="$idKeysDir/$db.idKeys.txt" fullCount=`cat $idKeys | wc -l` uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${keySig}"` printf "%s\t%s\t%d\t%d\n" "${keySig}" "${db}" "${fullCount}" "${uniqueCount}" fi done | sort -k1,1 > ucsc.keySignatures.txt cd /hive/data/inside/assemblyEquivalence/2021-05-11 ln -s /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/exact.sh . ########### Ensembl keys ################## cd ../ensembl # does not always work, use the wget instead # rsync -av --stats \ # rsync://ftp.ensembl.org/ensembl/pub/release-104/species_EnsemblVertebrates.txt \ # ./ wget --timestamping \ ftp://ftp.ensembl.org/pub/release-104/species_EnsemblVertebrates.txt # extract the GCx assembly names: awk -F$'\t' '{printf "%s_%s\n", $6,$5}' species_EnsemblVertebrates.txt \ | grep "^G" | sort > ensembl.v104.asmId.txt # compare this to the ones we have on hand here cut -f2 ../genbank/genbank.keySignatures.txt | sort > genbank.asmId.here.txt wc -l ensembl.v104.asmId.txt genbank.asmId.here.txt | sed -e 's/^/# /;' # 295 ensembl.v104.asmId.txt # 1448 genbank.asmId.here.txt comm -12 ensembl.v104.asmId.txt genbank.asmId.here.txt | wc -l # 288 # will need to catch up the assembly hubs for these missing 7: comm -23 ensembl.v104.asmId.txt genbank.asmId.here.txt # GCA_000001215.4_BDGP6.32 # GCA_000004035.1_Meug_1.0 # GCA_000090745.2_AnoCar2.0v2 # GCA_000224145.1_KH # GCA_000409795.2_ChlSab1.1 # GCA_013347765.1_ASM1334776v1 # GCA_900186095.1_CHOK1GS_HDv1 # checking the listings, it isn't clear all these are needed, appears to # be outdated versions at Ensembl, and perhaps only one new one for # GCA_013347765.1_ASM1334776v1 # it exists as a GCF assembly find -L /hive/data/outside/ensembl/genomes/release-104/idKeys -type f \ | grep keySignature.txt | while read K do idKeysDir=`dirname "${K}"` id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` idKeys="$idKeysDir/$id.idKeys.txt" fullCount=`cat $idKeys | wc -l` uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${K}"` printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" done | sort -k1,1 > ensembl-104.keySignatures.txt find -L /hive/data/outside/ensembl/genomes/release-103/idKeys -type f \ | grep keySignature.txt | while read K do idKeysDir=`dirname "${K}"` id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` idKeys="$idKeysDir/$id.idKeys.txt" fullCount=`cat $idKeys | wc -l` uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${K}"` printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" done | sort -k1,1 > ensembl-103.keySignatures.txt find -L /hive/data/outside/ensembl/genomes/release-101/idKeys -type f \ | grep keySignature.txt | while read K do idKeysDir=`dirname "${K}"` id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` idKeys="$idKeysDir/$id.idKeys.txt" fullCount=`cat $idKeys | wc -l` uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${K}"` printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" done | sort -k1,1 > ensembl-101.keySignatures.txt find /hive/data/outside/ensembl/genomes/release-100/idKeys -type f \ | grep keySignature.txt | while read K do idKeysDir=`dirname "${K}"` id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` idKeys="$idKeysDir/$id.idKeys.txt" fullCount=`cat $idKeys | wc -l` uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${K}"` printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" done | sort -k1,1 > ensembl-100.keySignatures.txt find /hive/data/outside/ensembl/genomes/release-99/idKeys -type f \ | grep keySignature.txt | while read K do idKeysDir=`dirname "${K}"` id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` idKeys="$idKeysDir/$id.idKeys.txt" fullCount=`cat $idKeys | wc -l` uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${K}"` printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" done | sort -k1,1 > ensembl-99.keySignatures.txt # something is odd with three of these. Their names changed between # versions but the sum stayed the same. Can't have this. # Find them via cut -f1 ensembl.keySignatures.txt | sort | uniq -c | sort -rn | head 2 2e374245cf85b32eddf186cf50f28f82 2 298de0770996a131c6f90a49ebebebb8 2 0eec31acf1f1120af29d62874cdd9952 1 ffb6ffd6b48667dd28ac3a2364b6c10e egrep -h "2e374245cf85b32eddf186cf50f28f82|298de0770996a131c6f90a49ebebebb8|0eec31acf1f1120af29d62874cdd9952" *.txt | cut -f2 | sort | uniq -c 5 Anolis_carolinensis.AnoCar2.0 2 Anolis_carolinensis.AnoCar2.0v2 2 Canis_familiaris.CanFam3.1 5 Canis_lupus_familiaris.CanFam3.1 4 Drosophila_melanogaster.BDGP6.28 3 Drosophila_melanogaster.BDGP6.32 # Eliminate the old ones: sort -u ensembl-10?.keySignatures.txt \ ensembl-99.keySignatures.txt \ | egrep -v -w "Anolis_carolinensis.AnoCar2.0|Canis_familiaris.CanFam3.1|Drosophila_melanogaster.BDGP6.28" \ > ensembl.keySignatures.txt ########### Exact matches ################## cd /hive/data/inside/assemblyEquivalence/2021-05-11 ./exact.sh ~/kent/src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl \ | sort > table.2021-05-11.tsv ########### Near matches ################## mkdir /hive/data/inside/assemblyEquivalence/2021-05-11/notExact cd /hive/data/inside/assemblyEquivalence/2021-05-11/notExact sed -e 's/release-99/release-104/;' \ /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/runOne > runOne chmod +x runOne time (~/kent/src/hg/makeDb/doc/assemblyEquivalence/allByAll.sh) > all.log 2>&1 # real 17m52.128s sed -e 's/\tcount A .*//;' results/match.*.txt > notExact.table.2021-05-11.tsv wc -l notExact.table.2021-05-11.tsv # 1200 notExact.table.2021-05-11.tsv ########### Table contents to load ################## sort -u notExact.table.2021-05-11.tsv ../table.2021-05-11.tsv \ > hgFixed.asmEquivalent.tsv ### Compare to existing: hgsql -N -e 'select * from asmEquivalent;' hgFixed | sort \ > existing.2021-05-11.tsv wc -l hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv 2594 hgFixed.asmEquivalent.tsv 2546 existing.2021-05-11.tsv ### How about the important ones: egrep "mm10|mm39|hg38|hg19" hgFixed.asmEquivalent.tsv GCA_000001405.27_GRCh38.p12 hg38 genbank ucsc 595 595 595 GCA_000001635.8_GRCm38.p6 mm10 genbank ucsc 239 239 239 GCA_000001635.9_GRCm39 mm39 genbank ucsc 61 61 61 GCF_000001635.26_GRCm38.p6 mm10 refseq ucsc 239 239 239 GCF_000001635.27_GRCm39 mm39 refseq ucsc 61 61 61 Mus_musculus.GRCm39 mm39 ensembl ucsc 61 61 61 hg38 GCA_000001405.27_GRCh38.p12 ucsc genbank 595 595 595 mm10 GCA_000001635.8_GRCm38.p6 ucsc genbank 239 239 239 mm10 GCF_000001635.26_GRCm38.p6 ucsc refseq 239 239 239 mm39 GCA_000001635.9_GRCm39 ucsc genbank 61 61 61 mm39 GCF_000001635.27_GRCm39 ucsc refseq 61 61 61 mm39 Mus_musculus.GRCm39 ucsc ensembl 61 61 61 ### most should be identical to before join hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv | wc -l 5473 ### probably should *not* be losing any from before: join -v 2 hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv | wc -l 7 # if not 0, investigate. Sometimes a new assembly is now an # exact match to something where it was a near match before to # a previous assembly of that organism. # In this case, it is the different names same idKeys that were eliminated # this time that weren't taken care of before: join -t$'\t' -v 2 hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv \ | cut -f1-3 Anolis_carolinensis.AnoCar2.0 GCA_000090745.2_AnoCar2.0 ensembl Anolis_carolinensis.AnoCar2.0 GCF_000090745.1_AnoCar2.0 ensembl Anolis_carolinensis.AnoCar2.0 anoCar2 ensembl Canis_familiaris.CanFam3.1 GCF_000002285.3_CanFam3.1 ensembl Canis_familiaris.CanFam3.1 canFam3 ensembl Drosophila_melanogaster.BDGP6.28 GCF_000001215.4_Release_6_plus_ISO1_MT ensembl Drosophila_melanogaster.BDGP6.28 dm6 ensembl ### note previous comments: ### ### Showing 5 this time: ### ### This is because the nearMiss calculation is only taking place on the ### ### most recent Ensembl release, not all the past ones. These two assemblies ### ### have new version in ens104 release, thus these older Ensembl assemblies ### ### are not being checked for near miss. For now, going to leave these ### ### missing from the table. Can't maintain all older Ensembl equivalents. ### Esox_lucius.Eluc_V3 GCA_000721915.3_Eluc_V3 ensembl genbank 1018 1019 1018 ### GCA_000721915.3_Eluc_V3 Esox_lucius.Eluc_V3 genbank ensembl 1018 1018 1019 ### Sarcophilus_harrisii.DEVIL7.0 GCA_000189315.1_Devil_ref_v7.0 ensembl genbank 35974 35975 35974 ### Sarcophilus_harrisii.DEVIL7.0 GCF_000189315.1_Devil_ref_v7.0 ensembl refseq 35974 35975 35974 ### Sarcophilus_harrisii.DEVIL7.0 sarHar1 ensembl ucsc 35974 35975 35974 ### there should be some new ones join -v 1 hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv | wc -l 42 ### There should be no duplicate equivalents: cut -f1,2 hgFixed.asmEquivalent.tsv | sort | uniq -c | sort -rn | head 1 zonAlb1 Zonotrichia_albicollis.Zonotrichia_albicollis-1.0.1 1 zonAlb1 GCF_000385455.1_Zonotrichia_albicollis-1.0.1 1 zonAlb1 GCA_000385455.1_Zonotrichia_albicollis-1.0.1 1 xipMac1 GCA_000241075.1_Xiphophorus_maculatus-4.4.2 1 xenTro9 Xenopus_tropicalis.Xenopus_tropicalis_v9.1 ... etc ... #### To load up new table contents: hgLoadSqlTab hgFixed asmEquivalent ~/kent/src/hg/lib/asmEquivalent.sql \ hgFixed.asmEquivalent.tsv hgsql -N -e 'select * from asmEquivalent;' hgFixed \ | sort > updated.2021-05-11.tsv wc -l updated.2021-05-11.tsv existing.2021-05-11.tsv 2594 updated.2021-05-11.tsv 2546 existing.2021-05-11.tsv # Previously: 2546 updated.2021-03-10.tsv 2344 existing.2021-03-10.tsv