# for emacs: -*- mode: sh; -*- # This file describes browser build for the ambMex1 # Assembly name: ASM291563v1 # Organism name: Ambystoma mexicanum (axolotl) # Infraspecific name: strain=DD151 # Sex: male # Taxid: 8296 # BioSample: SAMN06554622 # BioProject: PRJNA378970 # Submitter: Max Planck Society # Date: 2018-2-5 # Assembly type: haploid # Release type: major # Assembly level: Scaffold # Genome representation: full # WGS project: PGSH01 # Assembly method: MARVEL v. 2016-10-10 # Expected final version: no # Reference guided assembly: de-novo # Genome coverage: 30.0x # Sequencing technology: PacBio # GenBank assembly accession: GCA_002915635.1 # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_002915645.1 Primary Assembly ############################################################################# # obtain photograph (DONE - 2018-07-05 - Hiram) mkdir -p /hive/data/genomes/ambMex1/photo cd /hive/data/genomes/ambMex1/photo wget --timestamping \ 'https://upload.wikimedia.org/wikipedia/commons/1/1e/Ambystoma_mexicanum_%286337857516%29.jpg' convert -sharpen 0 -normalize -geometry 400x400 -quality 80 \ 'Ambystoma_mexicanum_(6337857516).jpg' \ Ambystoma_mexicanum.jpg printf "photoCreditURL\thttps://www.flickr.com/people/35871148@N04 photoCreditName\tRUben Undheim/Flickr\n" > ../photoReference.txt ######################################################################### # Initial steps (DONE - 2018-04-03 - Hiram) # To start this initialBuild.txt document, from a previous assembly document: mkdir ~/kent/src/hg/makeDb/doc/ambMex1 cd ~/kent/src/hg/makeDb/doc/ambMex1 # best to use a most recent document since it has the latest features and # procedures: sed -e 's/neoSch1/ambMex1/g; s/NeoSch1/AmbMex1/g; s/DONE/TBD/g;' ../neoSch1/initialBuild.txt ############################################################################# # fetch sequence from new style download directory (DONE - 2018-07-05 - Hiram) mkdir -p /hive/data/genomes/ambMex1/genbank cd /hive/data/genomes/ambMex1/genbank time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_other/Ambystoma_mexicanum/all_assembly_versions/GCA_002915635.1_ASM291563v1/ ./ # sent 258 bytes received 20485755990 bytes 28631385.39 bytes/sec # total size is 20480754977 speedup is 1.00 # real 11m55.350s # measure what we have here, this one is hefty: : time faSize GCA_002915635.1_ASM291563v1_genomic.fna.gz # 32393605577 bases (4026911109 N's 28366694468 real 28365740082 upper # 954386 lower) in 125724 sequences in 1 files # Total size: mean 257656.5 sd 973486.9 min 1033 (PGSH01113832.1) # max 21669615 (PGSH01077959.1) median 47471 # %0.00 masked total, %0.00 masked real # real 9m26.732s # with commas, that is: 32,393,605,577 =~ 32Gb ############################################################################# # fixup to UCSC naming scheme (DONE - 2018-07-05 - Hiram) mkdir /hive/data/genomes/ambMex1/ucsc cd /hive/data/genomes/ambMex1/ucsc # verify no duplicate sequences: note the use of the -long argument # on this gigantic amount of sequence time faToTwoBit -long ../genbank/*1_genomic.fna.gz genbank.2bit # real 13m46.258s time twoBitDup genbank.2bit # real 3m57.514s # should be silent output, otherwise the duplicates need to be removed # since this is an unplaced contig assembly, verify all names are .1: twoBitInfo genbank.2bit stdout | awk '{print $1}' \ | sed -e 's/[0-9]\+//;' | sort | uniq -c # 125724 PGSH.1 # in this case, all the .1's can be changed to: v1 time zcat ../genbank/GCA_002915635.1_ASM291563v1_genomic.fna.gz \ | sed -e 's/.1 Ambystoma.*/v1/;' | gzip -c \ > ucsc.ambMex1.fa.gz # real 159m37.622s # -rw-rw-r-- 1 8562523710 Jul 5 15:42 ucsc.ambMex1.fa.gz # and there is no AGP file with the assembly, construct one: time hgFakeAgp -minContigGap=1 ucsc.ambMex1.fa.gz ucsc.ambMex1.fake.agp # real 7m3.852s # do not need the chrM sequences, the chrM is already found as # contamination in the primary assembly # bash syntax here mitoAcc="NC_005797.1" printf "# mitoAcc %s\n" "$mitoAcc" # mitoAcc NC_005797.1 wget -O ${mitoAcc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc" echo ">chrM" > chrM.fa grep -v "^>" ${mitoAcc}.fa >> chrM.fa export mSize=`faCount chrM.fa | grep total | awk '{print $2}'` printf "chrM\t1\t$mSize\t1\tF\t$mitoAcc\t1\t$mSize\t+\n" > chrM.agp time gzip chr*.fa # real 12m14.210s # verify fasta and AGP match: time faToTwoBit -long ucsc.ambMex1.fa.gz test.2bit # real 13m54.200s # verify still silent: time twoBitDup test.2bit # real 3m57.287s # and check AGP vs. fasta correspondence: time cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail # All AGP and FASTA entries agree - both files are valid # real 2m47.252s # verify nothing lost compared to genbank: time twoBitToFa test.2bit stdout | faSize stdin # 32393605577 bases (4026911109 N's 28366694468 real 28365740082 upper # 954386 lower) in 125724 sequences in 1 files # Total size: mean 257656.5 sd 973486.9 min 1033 (PGSH01113832v1) # max 21669615 (PGSH01077959v1) median 47471 # %0.00 masked total, %0.00 masked real # real 8m50.269s # the original genbank count: # 32393605577 bases (4026911109 N's 28366694468 real 28365740082 upper # 954386 lower) in 125724 sequences in 1 files # no longer needed: rm -f genbank.2bit test.2bit ############################################################################# # Initial database build (DONE - 2018-07-06 - Hiram) cd /hive/data/genomes/ambMex1 # establish the config.ra file: # usually would use this ~/kent/src/hg/utils/automation/prepConfig.pl ambMex1 vertebrate axolotl \ genbank/*_assembly_report.txt > ambMex1.config.ra # going to need a mitoAcc ? # verify this looks OK: cat ambMex1.config.ra # config parameters for makeGenomeDb.pl: db ambMex1 clade vertebrate genomeCladePriority 70 scientificName Ambystoma mexicanum commonName Axolotl assemblyDate Feb. 2018 assemblyLabel Max Planck Society assemblyShortLabel ASM291563v1 orderKey 1944 # no mito sequence needed # mitoAcc none fastaFiles /hive/data/genomes/equCab3/ucsc/ucsc.ambMex1.fa.gz agpFiles /hive/data/genomes/equCab3/ucsc/ucsc.ambMex1.fake.agp # qualFiles none dbDbSpeciesDir axolotl photoCreditURL https://www.flickr.com/people/35871148@N04 photoCreditName Ruben Undheim/Flickr ncbiGenomeId 381 ncbiAssemblyId 1553901 ncbiAssemblyName ASM291563v1 ncbiBioProject 378970 ncbiBioSample SAMN06554622 genBankAccessionID GCA_002915635.1 taxId 8296 # working up new code in makeGenomeDb.pl to recognize this large genome # and process it correctly # verify sequence and AGP are OK: time (~/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp ambMex1.config.ra) > agp.log 2>&1 # real 36m50.267s # verify it ran OK: # *** All done! (through the 'agp' step) # then finish it off: time (~/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse=hgwdev \ -dbHost=hgwdev -fileServer=hgwdev -continue=db \ ambMex1.config.ra ) > db.log 2>&1 # real 222m22.739s # check in the trackDb files created and add to trackDb/makefile # temporary symlink until after masking ln -s `pwd`/ambMex1.unmasked.2bit /gbdb/ambMex1/ambMex1.2bit ############################################################################# # cytoBandIdeo - (DONE - 2018-07-06 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/cytoBand cd /hive/data/genomes/ambMex1/bed/cytoBand time makeCytoBandIdeo.csh ambMex1 # real 0m4.110s ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2018-07-06 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/cpgIslandsUnmasked cd /hive/data/genomes/ambMex1/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/ambMex1/ambMex1.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku ambMex1) > do.log 2>&1 # real 68m7.460s cat fb.ambMex1.cpgIslandExtUnmasked.txt # 1497260787 bases of 28366694468 (5.278%) in intersection ############################################################################# # running repeat masker (DONE - 2018-07-06 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/repeatMasker cd /hive/data/genomes/ambMex1/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku ambMex1) > do.log 2>&1 & # curiously, ran very fast ? There are hardly any repeats in the templates # used by RM # Completed: 71471 of 71471 jobs # CPU time in finished jobs: 7334652s 122244.20m 2037.40h 84.89d 0.233 y # IO & Wait Time: 217376s 3622.93m 60.38h 2.52d 0.007 y # Average job time: 106s 1.76m 0.03h 0.00d # Longest finished job: 512s 8.53m 0.14h 0.01d # Submission to last job: 11477s 191.28m 3.19h 0.13d cat faSize.rmsk.txt # 2400839308 bases (53716773 N's 2347122535 real 1331243472 upper # 1015879063 lower) in 7872 sequences in 1 files # Total size: mean 304984.7 sd 3152710.5 min 1000 (NW_018732762v1) # max 84771923 (NW_018734349v1) median 1998 # %42.31 masked total, %43.28 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; time featureBits -countGaps ambMex1 rmsk # 1016100984 bases of 2400839308 (42.323%) in intersection # real 0m44.536s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result on high contig count assemblies:: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' ambMex1 \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" # total 1016100984.000000 # real 0m36.665s ########################################################################## # running simple repeat (DONE - 2018-07-06 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/simpleRepeat cd /hive/data/genomes/ambMex1/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409=5 ambMex1) > do.log 2>&1 & # real 26m16.578s cat fb.simpleRepeat # 1399128075 bases of 28366694468 (4.932%) in intersection # add to this simple repeat to windowMasker: cd /hive/data/genomes/ambMex1 /cluster/home/hiram/kent/src/hg/utils/twoBitMask/twoBitMask -add \ bed/windowMasker/splitFa/ambMex1.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed ambMex1.2bit twoBitToFa ambMex1.2bit stdout | faSize stdin > faSize.ambMex1.2bit.txt # real 9m59.223s cat faSize.ambMex1.2bit.txt egrep "bases|Total|masked" faSize.ambMex1.2bit.txt \ | fold -w 75 -s | sed -e 's/^/# /;' # 32393605577 bases (4026911109 N's 28366694468 real 18209209746 upper # 10157484722 lower) in 125724 sequences in 1 files # Total size: mean 257656.5 sd 973486.9 min 1033 (PGSH01113832v1) max # 21669615 (PGSH01077959v1) median 47471 # %31.36 masked total, %35.81 masked real # reset symlink rm /gbdb/ambMex1/ambMex1.2bit ln -s `pwd`/ambMex1.2bit /gbdb/ambMex1/ambMex1.2bit ############################################################################# # CREATE MICROSAT TRACK (DONE - 2018-07-08 - Hiram) ssh hgwdev mkdir /cluster/data/ambMex1/bed/microsat cd /cluster/data/ambMex1/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed ambMex1 microsat microsat.bed # Read 56937 elements of size 4 from microsat.bed ############################################################################# # ucscToINSDC table/track (DONE - 2018-07-10 - Hiram) # this is simple since there isn't any RefSeq assembly, and the # names used here are different from INSDC only due to the .1 -> v1 switch cd /hive/data/genomes/ambMex1/bed/ucscToINSDC cut -f1 ../../chrom.sizes | awk '{printf "%s\t%s\n", $1, $1};' \ | sed -e 's/v1/.1/;' | awk '{printf "%s\t%s\n", $2, $1}' \ | sort > ucsc.insdc.txt join -t$'\t' <(sort -k1,1 ../../chrom.sizes) ucsc.insdc.txt \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 14 # use the 14 in this sed sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab ambMex1 ucscToINSDC stdin ucscToINSDC.bed # should cover %100 entirely: time featureBits -countGaps ambMex1 ucscToINSDC # 32393605577 bases of 32393605577 (100.000%) in intersection # real 0m47.018s ########################################################################## # run up idKeys files for chromAlias (DONE - 2018-07-08 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/idKeys cd /hive/data/genomes/ambMex1/bed/idKeys time (doIdKeys.pl -twoBit=/hive/data/genomes/ambMex1/ambMex1.unmasked.2bit -buildDir=`pwd` ambMex1) > do.log 2>&1 & # real 60m51.086s cat ambMex1.keySignature.txt # 5c21fa448b2fcbcd8249a2b5d84bf460 ########################################################################## # add chromAlias table (DONE - 2018-07-10 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/chromAlias cd /hive/data/genomes/ambMex1/bed/chromAlias hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' ambMex1 \ > ucsc.genbank.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > ambMex1.chromAlias.tab for t in genbank do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t ambMex1.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking genbank: 125724 =? 125724 OK hgLoadSqlTab ambMex1 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ ambMex1.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2018-07-10 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/axolotl/ambMex1 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" ambMex1 \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c # sample of output: # 114752 PGSHv1 # 10972 PGSHv1_1 # 8870 PGSHv1_10 # ... etc ... PGSH # implies a search rule of: 'PGSH[0-9]+(v1_[0-9]+)?' # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" ambMex1 | wc -l # 892108 hgsql -N -e "select frag from gold;" ambMex1 \ | egrep -e 'PGSH[0-9]+(v1_[0-9]+)?' | wc -l # 892108 hgsql -N -e "select frag from gold;" ambMex1 \ | egrep -v -e 'PGSH[0-9]+(v1_[0-9]+)?' | wc -l # 0 # hence, add to trackDb/rhesus/ambMex1/trackDb.ra searchTable gold shortCircuit 1 termRegex PGSH[0-9]+(v1_[0-9]+)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 git commit -m 'add gold table search rule refs #19859' trackDb.ra # verify searches work in the position box ########################################################################## ## WINDOWMASKER (DONE - 2018-07-06 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/windowMasker cd /hive/data/genomes/ambMex1/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev ambMex1) > do.log 2>&1 # real 2857m15.802s # broke down due to broken twoBitMask situation # this procedure was carried out on a split setup of sequences mkdir /hive/data/genomes/ambMex1/bed/windowMasker/splitFa cd /hive/data/genomes/ambMex1/bed/windowMasker/splitFa # generate lists of contigs up to 2Gb in size ~/kent/src/hg/makeDb/doc/ambMex1/twoGbChunks.pl # turning to fasta done in a small cluster job on hgwdev printf '#!/bin/bash # input is one of chunkN.list export c=$1 export chunk=`echo $c | sed -e 's/.list//;'` fa="$chunk/$chunk.fa.gz" mkdir -p $chunk # echo $c $fa faSomeRecords ambMex1.fa $c stdout | gzip -c > ${fa} ' > mkChunkFa.sh chmod +x mkChunkFa.sh printf '#LOOP ./mkChunkFa.sh $(path1) #ENDLOOP ' > split.template ls chunk*.list > run.list gensub2 run.list single split.template jobList para create jobList para push cat run.time # Completed: 16 of 16 jobs # CPU time in finished jobs: 11256s 187.59m 3.13h 0.13d 0.000 y # IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y # Average job time: 679s 11.31m 0.19h 0.01d # Longest finished job: 790s 13.17m 0.22h 0.01d # Submission to last job: 795s 13.25m 0.22h 0.01d # then turn those fasta into 2bit files: ls -d chunk[0-9] chunk1[0-9] | while read D do ls -ld ${D}/*.fa.gz echo faToTwoBit ${D}/${D}.fa.gz ${D}/${D}.2bit faToTwoBit ${D}/${D}.fa.gz ${D}/${D}.2bit & done wait # Then, a second batch to run WM on each chunk printf '#!/bin/bash set -beEu -o pipefail export chunk=$1 cd /hive/data/genomes/ambMex1/bed/windowMasker/splitFa/${chunk} time (doWindowMasker.pl -buildDir=`pwd` -unmaskedSeq=`pwd`/${chunk}.2bit \ -stop=twobit -workhorse=hgwdev ambMex1) > do.log 2>&1 ' > runWM.sh chmod +x runWM.sh printf '#LOOP runWM.sh $(path1) #ENDLOOP ' > wm.template ls -d chunk[0-9] chunk1[0-9] > sm.list gensub2 wm.list single wm.template wm.jobList para create jobList para push # Completed: 16 of 16 jobs # CPU time in finished jobs: 3s 0.04m 0.00h 0.00d 0.000 y # IO & Wait Time: 173439s 2890.66m 48.18h 2.01d 0.005 y # Average job time: 10840s 180.67m 3.01h 0.13d # Longest finished job: 13596s 226.60m 3.78h 0.16d # Submission to last job: 13601s 226.68m 3.78h 0.16d # all results together: cat chunk*/*.sdust.bed > /dev/shm/ambMex1.sdust.bed $HOME/bin/x86_64/gnusort -S100G --parallel=32 -k1,1 -k2,2n \ /dev/shm/ambMex1.sdust.bed > ambMex1.windowmasker.sdust.bed # temporary fixed twoBitMask /cluster/home/hiram/kent/src/hg/utils/twoBitMask/twoBitMask \ /hive/data/genomes/ambMex1/ambMex1.unmasked.2bit \ ambMex1.windowmasker.sdust.bed ambMex1.wmsk.sdust.2bit twoBitToFa ambMex1.wmsk.sdust.2bit stdout | faSize stdin \ > faSize.ambMex1.wmsk.sdust.txt 2>&1 # Masking statistics cat faSize.ambMex1.cleanWMSdust.txt # 32393605577 bases (4026911109 N's 28366694468 real 18214135230 # upper 10152559238 lower) in 125724 sequences in 1 files # Total size: mean 257656.5 sd 973486.9 min 1033 (PGSH01113832v1) # max 21669615 (PGSH01077959v1) median 47471 # %31.34 masked total, %35.79 masked real hgLoadBed ambMex1 windowmaskerSdust ambMex1.windowmasker.sdust.bed featureBits -countGaps ambMex1 windowmaskerSdust \ > fb.ambMex1.windowmaskerSdust.beforeClean.txt 2>&1 featureBits ambMex1 -not gap -bed=notGap.bed featureBits ambMex1 windowmaskerSdust notGap.bed -bed=stdout \ | gzip -c > cleanWMask.bed.gz hgLoadBed ambMex1 windowmaskerSdust cleanWMask.bed.gz featureBits -countGaps ambMex1 windowmaskerSdust \ > fb.ambMex1.windowmaskerSdust.clean.txt 2>&1 zcat cleanWMask.bed.gz \ | /cluster/home/hiram/kent/src/hg/utils/twoBitMask/twoBitMask \ /hive/data/genomes/ambMex1/ambMex1.unmasked.2bit stdin \ -type=.bed ambMex1.cleanWMSdust.2bit twoBitToFa ambMex1.cleanWMSdust.2bit stdout \ | faSize stdin > faSize.ambMex1.cleanWMSdust.txt 2>&1 featureBits -countGaps ambMex1 rmsk windowmaskerSdust \ > fb.ambMex1.rmsk.windowmaskerSdust.txt 2>&1 cat fb.ambMex1.windowmaskerSdust.clean.txt # 10152559238 bases of 32393605577 (31.341%) in intersection # rmsk is so sparse, it doesn't even overlap any of this cat fb.ambMex1.rmsk.windowmaskerSdust.txt # 0 bases of 32393605577 (0.000%) in intersection ########################################################################## # cpgIslands - (DONE - 2018-07-08 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/cpgIslands cd /hive/data/genomes/ambMex1/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku ambMex1) > do.log 2>&1 # real 100m4.369s cat fb.ambMex1.cpgIslandExt.txt # 352400331 bases of 28366694468 (1.242%) in intersection ############################################################################## # ncbiRefSeq gene track (TBD - 2018-03-15 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/ncbiRefSeq cd /hive/data/genomes/ambMex1/bed/ncbiRefSeq time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl \ -buildDir=`pwd` -bigClusterHub=ku \ -fileServer=hgwdev -workhorse=hgwdev -smallClusterHub=ku -dbHost=hgwdev \ refseq vertebrate_mammalian Neomonachus_schauinslandi \ GCF_002201575.1_ASM220157v1 ambMex1) > do.log 2>&1 # real 4m37.395s cat fb.ncbiRefSeq.ambMex1.txt # 45781082 bases of 2400839308 (1.907%) in intersection ############################################################################## # genscan - (DONE - 2018-07-08 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/genscan cd /hive/data/genomes/ambMex1/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku ambMex1) > do.log 2>&1 # fifty-six were broken at window size of 2400000, # they all worked at 2000000 # real 162m46.438s # continuing: time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -continue=makeBed -bigClusterHub=ku ambMex1) > makeBed.log 2>&1 # real 75m28.038s cat fb.ambMex1.genscan.txt # 896393757 bases of 28366694468 (3.160%) in intersection cat fb.ambMex1.genscanSubopt.txt # 978384685 bases of 28366694468 (3.449%) in intersection ############################################################################# # augustus gene track (DONE - 2018-07-08 - Hiram) mkdir /hive/data/genomes/ambMex1/bed/augustus cd /hive/data/genomes/ambMex1/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=zebrafish -dbHost=hgwdev \ -workhorse=hgwdev ambMex1) > do.log 2>&1 # real 443m58.621s # continuing after removing some broken items: time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=zebrafish -dbHost=hgwdev \ -continue=load -workhorse=hgwdev ambMex1) > load.log 2>&1 cat fb.ambMex1.augustusGene.txt # 828245083 bases of 28366694468 (2.920%) in intersection ############################################################################# # Create kluster run files (DONE - 2018-07-10 - Hiram) # obtain 'real' numbers of this assembly: cd /hive/data/genomes/ambMex1 head -1 faSize.ambMex1.2bit.txt # 32393605577 bases (4026911109 N's 28366694468 real 18209209746 upper # 10157484722 lower) in 125724 sequences in 1 files # numerator is ambMex1 gapless bases "real" # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 28366694468 / 2861349177 \) \* 1024 # ( 28366694468 / 2861349177 ) * 1024 = 10151.677876 # tried repMatch=10000, but it didn't make many # ==> use -repMatch=10000 according to size scaled down from 1024 for human. # and rounded down to nearest 100 cd /hive/data/genomes/ambMex1 time blat ambMex1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/ambMex1.11.ooc \ -repMatch=10000 # Wrote 6435 overused 11-mers to jkStuff/ambMex1.11.ooc # real 9m44.626s # at repMatch=1000 it makes many more: # Wrote 1885128 overused 11-mers to jkStuff/ambMex1.11.ooc # at repMatch=2000 # real 9m18.200s # Wrote 740741 overused 11-mers to jkStuff/ambMex1.11.ooc # at repMatch=4000 # Wrote 125201 overused 11-mers to jkStuff/ambMex1.11.ooc # real 8m38.816s # going to use 3000 since this makes about 10X more than in human # at repMatch=3000 # Wrote 286064 overused 11-mers to jkStuff/ambMex1.11.ooc # real 9m7.973s # there are no non-bridged gaps since these are all fake gaps, # However, there are near 100,000 gaps over 10,000 bases, # going to use those gaps as 'non-bridged' gaps # Size survey: hgsql -N -e 'select size from gap;' ambMex1 | ave stdin # Q1 1701.000000 # median 3331.000000 # Q3 6422.000000 # average 5254.430036 # min 1.000000 # max 289789.000000 # count 766384 # total 4026911109.000000 # check non-bridged gaps to see what the typical size is: # hgsql -N \ # -e 'select * from gap where bridge="no" order by size;' ambMex1 \ # | sort -k7,7nr # minimum size is 10000, added allowBridged option to allow this ~/kent/src/hg/utils/gapToLift/gapToLift -allowBridged -verbose=2 \ -minGap=10000 ambMex1 \ jkStuff/ambMex1.10Kgaps.lft -bedFile=jkStuff/ambMex1.10Kgaps.bed # gapToLift -verbose=2 -minGap=50000 ambMex1 \ # jkStuff/ambMex1.nonBridged.lft -bedFile=jkStuff/ambMex1.nonBridged.bed ######################################################################### # lastz/chain/net swap from hg38 (DONE - 2018-08-09 - Hiram) # alignment to hg38: cd /hive/data/genomes/hg38/bed/lastzAmbMex1.2018-07-09 cat fb.hg38.chainAmbMex1Link.txt # 54520910 bases of 3049335806 (1.788%) in intersection cat fb.hg38.chainSynAmbMex1Link.txt # 3343407 bases of 3049335806 (0.110%) in intersection cat fb.hg38.chainRBest.AmbMex1.txt # 37383183 bases of 3049335806 (1.226%) in intersection # and for the swap: mkdir /hive/data/genomes/ambMex1/bed/blastz.hg38.swap cd /hive/data/genomes/ambMex1/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzAmbMex1.2018-07-09/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 105m38.443s cat fb.ambMex1.chainHg38Link.txt # 59846443 bases of 28366694468 (0.211%) in intersection cat fb.ambMex1.chainSynHg38Link.txt # 3456707 bases of 28366694468 (0.012%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` ambMex1 hg38) > rbest.log 2>&1 & # real 555m51.873s cat fb.ambMex1.chainRBest.Hg38.txt # 38573370 bases of 28366694468 (0.136%) in intersection ############################################################################# # lastz/chain/net swap from mm10 (DONE - 2018-06-10 - Hiram) # alignment to mm10 cd /hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09 cat fb.mm10.chainAmbMex1Link.txt # 52143617 bases of 2652783500 (1.966%) in intersection cat fb.mm10.chainSynAmbMex1Link.txt # 2686570 bases of 2652783500 (0.101%) in intersection cat fb.mm10.chainRBest.AmbMex1.txt # 36938030 bases of 2652783500 (1.392%) in intersection # and for the swap: mkdir /hive/data/genomes/ambMex1/bed/blastz.mm10.swap cd /hive/data/genomes/ambMex1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 39m28.757s cat fb.ambMex1.chainMm10Link.txt # 87124587 bases of 28366694468 (0.307%) in intersection cat fb.ambMex1.chainSynMm10Link.txt # 2893381 bases of 28366694468 (0.010%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` ambMex1 mm10) > rbest.log 2>&1 & # real 568m10.621s # something odd went haywire at the download step time (doRecipBest.pl -load -continue=download -workhorse=hgwdev -buildDir=`pwd` ambMex1 mm10) > download.log 2>&1 & # real 3m16.404s cat fb.ambMex1.chainRBest.Mm10.txt # 38584422 bases of 28366694468 (0.136%) in intersection ############################################################################## # GENBANK AUTO UPDATE (DONE - 2018-07-10 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # organism mrnaCnt estCnt refSeqCnt # Ambystoma mexicanum 7705 43326 0 # edit etc/genbank.conf to add ambMex1 just before nanPar1 # ambMex1 (Axolotl - Ambystoma mexicanum) 125724 scaffolds N50 3052786 - 30Gb total ambMex1.serverGenome = /hive/data/genomes/ambMex1/ambMex1.2bit ambMex1.clusterGenome = /hive/data/genomes/ambMex1/ambMex1.2bit ambMex1.ooc = /hive/data/genomes/ambMex1/jkStuff/ambMex1.11.ooc ambMex1.lift = /hive/data/genomes/ambMex1/jkStuff/ambMex1.10Kgaps.lft ambMex1.perChromTables = no ambMex1.downloadDir = ambMex1 ambMex1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} ambMex1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} ambMex1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} ambMex1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} ambMex1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} ambMex1.genbank.est.xeno.pslCDnaFilter = ${lowCover.genbank.est.xeno.pslCDnaFilter} # defaults yes: genbank.mrna.native.load genbank.mrna.native.loadDesc # yes: genbank.est.native.load refseq.mrna.native.load # yes: refseq.mrna.native.loadDesc refseq.mrna.xeno.load # yes: refseq.mrna.xeno.loadDesc # defaults no: genbank.mrna.xeno.load genbank.mrna.xeno.loadDesc # no: genbank.est.native.loadDesc genbank.est.xeno.load # no: genbank.est.xeno.loadDesc # DO NOT NEED genbank.mrna.xeno except for human, mouse # ambMex1.upstreamGeneTbl = ensGene # ambMex1.upstreamMaf = multiz6way /hive/data/genomes/ambMex1/bed/multiz6way/species.list # And edit src/lib/gbGenome.c to add new species # static char *ambMexNames[] = {"Ambystoma mexicanum", NULL}; git commit -m 'adding ambMex1 Axolotl - Ambystoma mexicanum' src/lib/gbGenome.c etc/genbank.conf git push # verify stated file paths do exist: grep ambMex1 etc/genbank.conf | egrep "Genome|ooc|lift" \ | awk '{print $NF}' | grep -v -w no | xargs ls -og -rw-rw-r-- 1 9104343295 Jul 8 19:32 /hive/data/genomes/ambMex1/ambMex1.2bit -rw-rw-r-- 1 11166819 Jul 10 10:45 /hive/data/genomes/ambMex1/jkStuff/ambMex1.10Kgaps.lft -rw-rw-r-- 1 1144264 Jul 10 10:00 /hive/data/genomes/ambMex1/jkStuff/ambMex1.11.ooc # update /cluster/data/genbank/: make etc-update make install-server # add ambMex1 to: # etc/align.dbs etc/hgwdev.dbs git commit -m 'adding ambMex1/axolotl refs #21715' \ etc/align.dbs etc/hgwdev.dbs git push # update /cluster/data/genbank/: make etc-update # XXX a few days later the genbank tables will be in the database ############################################################################## # BLATSERVERS ENTRY (TBD - 2017-10-06 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ambMex1", "blat1c", "17880", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ambMex1", "blat1c", "17881", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################## # set default position to the PAX3 gene (DONE - 2018-07-11 - Hiram) # as found via UCSC 'Other refseq' track hgsql -e \ 'update dbDb set defaultPos="PGSH01084259v1:1812985-2038932" where name="ambMex1";' \ hgcentraltest ############################################################################## # all.joiner update, downloads and in pushQ - (DONE - 2018-07-13 - Hiram) cd $HOME/kent/src/hg/makeDb/schema ~/kent/src/hg/utils/automation/verifyBrowser.pl ambMex1 # 40 tables in database ambMex1 - Axolotl, Ambystoma mexicanum # verified 40 tables in database ambMex1, 0 extra tables, 12 optional tables # chainNetRBestHg38 3 optional tables # chainNetRBestMm10 3 optional tables # chainNetSynHg38 3 optional tables # chainNetSynMm10 3 optional tables # ERROR: no genbank tables found # verified 28 required tables, 1 missing tables # 1 ucscToRefSeq - missing table # hg38 chainNet to ambMex1 found 3 required tables # mm10 chainNet to ambMex1 found 3 required tables # hg38 chainNet RBest and syntenic to ambMex1 found 6 optional tables # mm10 chainNet RBest and syntenic to ambMex1 found 3 optional tables # ERROR: blat server not found in hgcentraltest.blatServers ? # fixup all.joiner until this is a clean output joinerCheck -database=ambMex1 -tableCoverage all.joiner joinerCheck -database=ambMex1 -times all.joiner joinerCheck -database=ambMex1 -keys all.joiner # need to specify workhorse so it will not overload ku # and this procedure fails on ku due to limited memory for the # maskOutFa step cd /hive/data/genomes/ambMex1 time (makeDownloads.pl -workhorse=hgwdev ambMex1) > downloads.log 2>&1 # real 117m17.756s # continue after broken 'compress' step manually completed: time (makeDownloads.pl -continue=install -workhorse=hgwdev ambMex1) \ > downloadsInstall.log 2>&1 # real 0m4.786s # now ready for pushQ entry mkdir /hive/data/genomes/ambMex1/pushQ cd /hive/data/genomes/ambMex1/pushQ time (makePushQSql.pl -redmineList ambMex1) \ > ambMex1.pushQ.sql 2> stderr.out # real 4m11.149s # remove the tandemDups and gapOverlap from the listings: sed -i -e "/tandemDups/d" redmine.ambMex1.table.list sed -i -e "/Tandem Dups/d" redmine.ambMex1.releaseLog.txt sed -i -e "/gapOverlap/d" redmine.ambMex1.table.list sed -i -e "/Gap Overlaps/d" redmine.ambMex1.releaseLog.txt # check for errors in stderr.out, some are OK, e.g.: # writing redmine listings to # redmine.ambMex1.file.list # redmine.ambMex1.table.list # redmine.ambMex1.releaseLog.txt # WARNING: ambMex1 does not have seq # WARNING: ambMex1 does not have extFile # verify the file listings are valid, should be no output to stderr: cat redmine.ambMex1.file.list \ | while read L; do ls -ogL $L; done > /dev/null # to verify the database.table list is correct, should be the same # line count for these two commands: wc -l redmine.ambMex1.table.list # 41 redmine.ambMex1.table.list awk -F'.' '{ printf "hgsql -N -e \"show table status like '"'"'%s'"'"';\" %s\n", $2, $1 }' redmine.ambMex1.table.list | while read L; do eval $L; done | wc -l # 41 # add the path names to the listing files in the redmine issue # in the three appropriate entry boxes: ls `pwd`/redmine* /hive/data/genomes/ambMex1/pushQ/redmine.ambMex1.file.list /hive/data/genomes/ambMex1/pushQ/redmine.ambMex1.releaseLog.txt /hive/data/genomes/ambMex1/pushQ/redmine.ambMex1.table.list #########################################################################