# for emacs: -*- mode: sh; -*- # This file describes browser build for the felCat9 ######################################################################### # reuse photograph from felCat8 previous versions # (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9 cd /hive/data/genomes/felCat9 cp -p ../felCat8/photoReference.txt . cat photoReference.txt photoCreditURL http://www.genome.gov/pressDisplay.cfm?photoID=42 photoCreditName Dr. Kristina Narfstrom ######################################################################### # Initial steps (DONE - 2018-02-27 - Hiram) # To start this initialBuild.txt document, from a previous assembly document: mkdir ~/kent/src/hg/makeDb/doc/felCat9 cd ~/kent/src/hg/makeDb/doc/felCat9 # best to use a most recent document since it has the latest features and # procedures: sed -e 's/papAnu3/felCat9/g; s/PapAnu3/FelCat9/g; s/DONE/TBD/g;' ../papAnu3/initialBuild.txt > initialBuild.txt mkdir /hive/data/genomes/felCat9/refseq cd /hive/data/genomes/felCat9/refseq time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Felis_catus/all_assembly_versions/GCF_000181335.3_Felis_catus_9.0/ ./ # sent 3201 bytes received 3658828880 bytes 19307821.01 bytes/sec # total size is 3658370114 speedup is 1.00 # real 3m8.429s # check assembly size for later reference: faSize G*_9.0_genomic.fna.gz # 2521863845 bases (45410675 N's 2476453170 real 1649339622 upper # 827113548 lower) in 4508 sequences in 1 files # Total size: mean 559419.7 sd 9108161.9 min 1969 (NW_019369707.1) # max 242100913 (NC_018723.3) median 9988 # %32.80 masked total, %33.40 masked real # this information is from the top of # felCat9/refseq/GCF_000181335.3_Felis_catus_9.0_assembly_report.txt # Assembly name: Felis_catus_9.0 # Organism name: Felis catus (domestic cat) # Infraspecific name: breed=Abyssinian # Isolate: Cinnamon # Sex: female # Taxid: 9685 # BioSample: SAMN02953640 # BioProject: PRJNA16726 # Submitter: Genome Sequencing Center (GSC) at Washington University (WashU) School of Medicine # Date: 2017-11-20 # Assembly type: haploid # Release type: major # Assembly level: Chromosome # Genome representation: full # WGS project: AANG04 # Assembly method: WTDBG v. 1.2, Chromonomer v. 1.0.7 # Genome coverage: 72x # Sequencing technology: PacBio; 454 Titanium; Illumina; Sanger dideoxy sequencing # RefSeq category: Representative Genome # GenBank assembly accession: GCA_000181335.4 # RefSeq assembly accession: GCF_000181335.3 # RefSeq assembly and GenBank assemblies identical: no # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000181345.4 GCF_000181345.3 Primary Assembly ## GCF_000029335.1 non-nuclear ############################################################################# # establish config.ra file (DONE - Hiram - 2018-02-27) # arguments here are: cd /hive/data/genomes/felCat9 $HOME/kent/src/hg/utils/automation/prepConfig.pl felCat9 mammal \ cat ./refseq/*_assembly_report.txt > felCat9.config.ra # fixup common name to remain compatible with previous panAnu2 version # and therfore orderKey wasn't correct. Also fixup shortLabel # to check orderKey: hgsql -e 'select name,orderKey,organism from dbDb order by orderKey;' \ hgcentraltest | less # verify it looks sane # reset commonName Domestic cat to: commonName Cat # reset orderKey 4677 to: orderKey 3239 cat felCat9.config.ra # config parameters for makeGenomeDb.pl: db felCat9 clade mammal genomeCladePriority 35 scientificName Felis catus commonName Cat assemblyDate Nov. 2017 assemblyLabel Genome Sequencing Center (GSC) at Washington University (WashU) School of Medicine assemblyShortLabel Felis_catus_9.0 orderKey 3239 # mitochondrial sequence included in refseq release # mitoAcc NC_001700.1 mitoAcc none fastaFiles /hive/data/genomes/felCat9/ucsc/*.fa.gz agpFiles /hive/data/genomes/felCat9/ucsc/*.agp # qualFiles none dbDbSpeciesDir cat photoCreditURL http://www.genome.gov/pressDisplay.cfm?photoID=42 photoCreditName Dr. Kristina Narfstrom ncbiGenomeId 78 ncbiAssemblyId 1448961 ncbiAssemblyName Felis_catus_9.0 ncbiBioProject 16726 ncbiBioSample SAMN02953640 genBankAccessionID GCF_000181335.3 taxId 9685 ############################################################################# # setup UCSC named files (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/ucsc cd /hive/data/genomes/felCat9/ucsc # check for duplicate sequences: time faToTwoBit -noMask ../refseq/G*_9.0_genomic.fna.gz refseq.2bit # real 1m3.113s twoBitDup refseq.2bit # no output is a good result, otherwise, would have to eliminate duplicates # the scripts creating the fasta here will be using this refseq.2bit file ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \ ../refseq/G*_9.0_genomic.fna.gz \ ../refseq/G*_9.0_assembly_structure/Primary_Assembly # NC_018723.3 chrA1 # NC_018724.3 chrA2 # NC_018725.3 chrA3 # NC_018726.3 chrB1 # NC_018727.3 chrB2 # NC_018728.3 chrB3 # NC_018729.3 chrB4 # NC_018730.3 chrC1 # NC_018731.3 chrC2 # NC_018732.3 chrD1 # NC_018733.3 chrD2 # NC_018734.3 chrD3 # NC_018735.3 chrD4 # NC_018736.3 chrE1 # NC_018737.3 chrE2 # NC_018738.3 chrE3 # NC_018739.3 chrF1 # NC_018740.3 chrF2 # NC_018741.3 chrX # unplaced sequences time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \ ../refseq/*_assembly_structure/Primary_Assembly # processed 4142 sequences into chrUn.fa.gz # real 0m30.235s # unlocalized sequences time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \ ../refseq/*_assembly_structure/Primary_Assembly # D2 # D3 # A1 # B4 # A2 # E1 # C1 # F2 # B2 # E3 # C2 # B1 # X # D1 # D4 # A3 # B3 # E2 # F1 # processed 346 sequences into chr*_random.gz 19 files # real 0m5.891s # bash syntax here mitoAcc=`grep "^# mitoAcc" ../felCat9.config.ra | awk '{print $NF}'` printf "# mitoAcc %s\n" "$mitoAcc" # mitoAcc NC_001700.1 zcat \ ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \ | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp printf ">chrM\n" > chrM.fa twoBitToFa -noMask refseq.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa gzip chrM.fa # verify fasta and AGPs agree time faToTwoBit *.fa.gz test.2bit # real 1m10.270s time cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4 # All AGP and FASTA entries agree - both files are valid # real 0m9.332s # and no sequence lost from orginal: twoBitToFa test.2bit stdout | faSize stdin # 2521863845 bases (45410675 N's 2476453170 real 2476453170 upper 0 lower) # in 4508 sequences in 1 files # Total size: mean 559419.7 sd 9108161.9 min 1969 (chrUn_NW_019369707v1) # max 242100913 (chrA1) median 9988 # same numbers as above # 2521863845 bases (45410675 N's 2476453170 real 1649339622 upper # 827113548 lower) in 4508 sequences in 1 files # Total size: mean 559419.7 sd 9108161.9 min 1969 (NW_019369707.1) # max 242100913 (NC_018723.3) median 9988 # no longer need these temporary 2bit files rm refseq.2bit test.2bit ############################################################################# # Initial database build (DONE - 2018-02-27 - Hiram) cd /hive/data/genomes/felCat9 # verify sequence and AGP are OK: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp felCat9.config.ra) > agp.log 2>&1 # real 2m18.763s # verify there was no error in that step: tail agp.log # *** All done! (through the 'agp' step) # then finish it off: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db felCat9.config.ra) > db.log 2>&1 # real 20m30.661s # check in the trackDb files created in TemporaryTrackDbCheckout/ # and add felCat9 to trackDb/makefile # temporary symlink until masked sequence is available cd /hive/data/genomes/felCat9 ln -s `pwd`/felCat9.unmasked.2bit /gbdb/felCat9/felCat9.2bit ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/bed/cpgIslandsUnmasked cd /hive/data/genomes/felCat9/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/felCat9/felCat9.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku felCat9) > do.log 2>&1 # real 10m10.843s cat fb.felCat9.cpgIslandExtUnmasked.txt # 39762268 bases of 2893270787 (1.374%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/bed/cytoBand cd /hive/data/genomes/felCat9/bed/cytoBand makeCytoBandIdeo.csh felCat9 ############################################################################# # gapOverlap (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/bed/gapOverlap cd /hive/data/genomes/felCat9/bed/gapOverlap time (doGapOverlap.pl \ -twoBit=/hive/data/genomes/felCat9/felCat9.unmasked.2bit felCat9 ) \ > do.log 2>&1 # real 0m54.051s # actually failed because there were none of these items found: # -rw-rw-r-- 1 0 Feb 27 15:25 felCat9.gapOverlap.bed time (doGapOverlap.pl -continue=cleanup \ -twoBit=/hive/data/genomes/felCat9/felCat9.unmasked.2bit felCat9 ) \ > cleanup.log 2>&1 ############################################################################# # tandemDups (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/bed/tandemDups cd /hive/data/genomes/felCat9/bed/tandemDups time (~/kent/src/hg/utils/automation/doTandemDup.pl \ -twoBit=/hive/data/genomes/felCat9/felCat9.unmasked.2bit felCat9) \ > do.log 2>&1 & # real 122m33.710s # 53784656 bases of 2521863845 (2.133%) in intersection bigBedInfo felCat9.tandemDups.bb | sed -e 's/^/# /;' # version: 4 # fieldCount: 13 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 0 # itemCount: 785,057 # primaryDataSize: 19,206,976 # primaryIndexSize: 96,344 # zoomLevels: 9 # chromCount: 1399 # basesCovered: 1,394,080,174 # meanDepth (of bases covered): 3.940832 # minDepth: 1.000000 # maxDepth: 5370.000000 # std of depth: 17.389426 ############################################################################# # ucscToINSDC and ucscToRefSeq table/track (DONE - 2018-02-27 - Hiram) # the sequence here is working for a 'refseq' assembly # beware of a chrM situation may be specific depending upon what is # available in the assembly mkdir /hive/data/genomes/felCat9/bed/ucscToINSDC cd /hive/data/genomes/felCat9/bed/ucscToINSDC # find accession for chrM grep chrM ../../felCat9.agp # chrM 1 17009 1 O NC_001700.1 1 17009 + # find the genbank accession for NC_020006.2 at Entrez nucleotide # The NC_020006.2 name is the RefSeq name, the genbank name is: JX946196.2 # the assembly_report does not have this AY name since the chrM sequence # is not in the genbank assembly: grep NC_001700.1 ../../refseq/GCF*_9.0_assembly_report.txt # MT assembled-molecule MT Mitochondrion na <> NC_001700.1 non-nuclear 17009 chrM # lookup the genbank/INSDC accession name for chrM at: # https://www.ncbi.nlm.nih.gov/assembly/GCF_000181335.3 # if there is a chrM, use its INSDC name as a second argument: # this is a RefSeq assembly, use the chrM refSeq name: ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ ../../refseq/GCF_*structure/Primary_Assembly NC_001700.1 # this is actually ucscToRefSeq since this is a RefSeq assembly sort -k2 ucscToINSDC.txt > ucscToRefSeq.txt rm -f ucscToINSDC.txt awk '{printf "%s\t%s\n", $2, $1}' ucscToRefSeq.txt \ | sort > refSeqToUcsc.txt # chrM processing needs special help, fixup with the sed # extract the refseq vs. genbank names from the assembly_report # columns 5 and 7 are the INSDC and RefSeq names grep -v "^#" ../../refseq/GCF*_assembly_report.txt | cut -f5,7 \ | awk '{printf "%s\t%s\n", $2, $1}' | sed -e 's/na/U20753.1/' \ | sort > refseq.insdc.txt awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > ucsc.coordinate.tab join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 ucsc.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-4 \ > ucscToRefSeq.bed join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 ucsc.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \ > ucscToINSDC.bed # verify chrM is correct: grep chrM *.bed # ucscToINSDC.bed:chrM 0 17009 U20753.1 # ucscToRefSeq.bed:chrM 0 17009 NC_001700.1 # should be same line counts throughout: # in this case one is missing in the final result due to the duplicate # contig being removed wc -l * # 4508 refSeqToUcsc.txt # 4508 refseq.insdc.txt # 4508 ucsc.coordinate.tab # 4508 ucscToINSDC.bed # 4508 ucscToRefSeq.bed # 4508 ucscToRefSeq.txt export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 27 # use the $chrSize in this sed sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab felCat9 ucscToINSDC stdin ucscToINSDC.bed # should be the same for ucscToRefSeq: export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 27 sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab felCat9 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed # checkTableCoords should be silent checkTableCoords felCat9 # each should cover %100 entirely: featureBits -countGaps felCat9 ucscToINSDC # 2521863845 bases of 2521863845 (100.000%) in intersection featureBits -countGaps felCat9 ucscToRefSeq # 2521863845 bases of 2521863845 (100.000%) in intersection ######################################################################### # add chromAlias table (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/bed/chromAlias cd /hive/data/genomes/felCat9/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' felCat9 \ | sort -k1,1 > ucsc.refseq.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' felCat9 \ | sort -k1,1 > ucsc.genbank.tab ### Adding Ensembl alias with v95 release, after idKeys made: 2019-01-16 join -t$'\t' ../idKeys/felCat9.idKeys.txt \ ../../ens95/ensFelCat9.idKeys.txt | cut -f2- \ | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToEns.bed cut -f1,4 ucscToEns.bed | sort > ucsc.ensembl.tab wc -l *.tab # 4508 ucsc.ensembl.tab # 4508 ucsc.genbank.tab # 4508 ucsc.refseq.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > felCat9.chromAlias.tab for t in refseq genbank ensembl do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t felCat9.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 4508 =? 4508 OK # checking genbank: 4508 =? 4508 OK # checking ensembl: 4508 =? 4508 OK hgLoadSqlTab felCat9 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ felCat9.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2018-02-27 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/cat/felCat9 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" felCat9 \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c # 4989 AANG.1 # 80 AC.1 # 1 AC.2 # 8 AY.1 # 1 NC_.1 # implies a rule: '[AN][ACY][NG0-9_]+(\.[0-9]+)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" felCat9 | wc -l # 5079 hgsql -N -e "select frag from gold;" felCat9 \ | egrep -e '[AN][ACY][NG0-9_]+(\.[0-9]+)?' | wc -l # 5079 hgsql -N -e "select frag from gold;" felCat9 \ | egrep -v -e '[AN][ACY][NG0-9_]+(\.[0-9]+)?' | wc -l # 0 # hence, add to trackDb/chicken/felCat9/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][ACY][NG0-9_]+(\.[0-9]+)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box ########################################################################## # running repeat masker (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/bed/repeatMasker cd /hive/data/genomes/felCat9/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku felCat9) > do.log 2>&1 & # real 1035m8.587s # cluster difficulties during the cluster run, continuing: time (doRepeatMasker.pl -buildDir=`pwd` \ -continue=cat -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku felCat9) > cat.log 2>&1 & # bad record in the felCat9.fa.out, remove it with grep: # RepeatMasker bug?: Undefined id, line 3670031 of input: # 631 27.0 1.1 1.1 chrD2 87783169 87783348 (2403312) + L1ME2 LINE/L1 5991 6170 (9) mv felCat9.fa.out felCat9.fa.out.broken grep -v "631 27.0 1.1 1.1 chrD2 87783169 87783348" \ felCat9.fa.out.broken > felCat9.fa.out # and reconstruct the sorted file: mv felCat9.sorted.fa.out felCat9.sorted.fa.out.broken head -3 felCat9.fa.out > felCat9.sorted.fa.out tail -n +4 felCat9.fa.out | sort -k5,5 -k6,6n >> felCat9.sorted.fa.out # and the nested repeats remains the same without the error: /cluster/bin/scripts/extractNestedRepeats.pl felCat9.fa.out \ | sort -k1,1 -k2,2n > felCat9.nestedRepeats.bed # continuing: time (doRepeatMasker.pl -buildDir=`pwd` \ -continue=mask -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku felCat9) > mask.log 2>&1 & # real 22m39.249s cat faSize.rmsk.txt # 2521863845 bases (45410675 N's 2476453170 real 1408738694 upper 1067714476 # lower) in 4508 sequences in 1 files # Total size: mean 559419.7 sd 9108161.9 min 1969 (chrUn_NW_019369707v1) # max 242100913 (chrA1) median 9988 # %42.34 masked total, %43.11 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; * time featureBits -countGaps felCat9 rmsk # 1067715717 bases of 2521863845 (42.338%) in intersection # real 0m36.348s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result on high contig count assemblies: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' felCat9 \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" # total 1067715717.000000 # real 0m36.899s ########################################################################## # running simple repeat (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/bed/simpleRepeat cd /hive/data/genomes/felCat9/bed/simpleRepeat # using trf409 5 here guessing smaller genome (human == 6) time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409 5 felCat9) > do.log 2>&1 & # real 43m56.043s cat fb.simpleRepeat # 163817450 bases of 2893270787 (5.662%) in intersection # adding this trfMask to the other masking cd /hive/data/genomes/felCat9 # when using the Window Masker result: # twoBitMask bed/windowMasker/felCat9.cleanWMSdust.2bit \ # -add bed/simpleRepeat/trfMask.bed felCat9.2bit # you can safely ignore the warning about fields >= 13 # when using Rmsk results, add to rmsk after it is done: twoBitMask felCat9.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed felCat9.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa felCat9.2bit stdout | faSize stdin > faSize.felCat9.2bit.txt cat faSize.felCat9.2bit.txt # 2521863845 bases (45410675 N's 2476453170 real 1407138455 upper 1069314715 # lower) in 4508 sequences in 1 files # Total size: mean 559419.7 sd 9108161.9 min 1969 (chrUn_NW_019369707v1) # max 242100913 (chrA1) median 9988 # %42.40 masked total, %43.18 masked real # reset the symlink rm /gbdb/felCat9/felCat9.2bit ln -s `pwd`/felCat9.2bit /gbdb/felCat9/felCat9.2bit ######################################################################### # CREATE MICROSAT TRACK (DONE - 2018-02-27 - Hiram) ssh hgwdev mkdir /cluster/data/felCat9/bed/microsat cd /cluster/data/felCat9/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed felCat9 microsat microsat.bed # Read 105236 elements of size 4 from microsat.bed ########################################################################## ## WINDOWMASKER (DONE - 2018-02-28 - Hiram) mkdir /hive/data/genomes/felCat9/bed/windowMasker cd /hive/data/genomes/felCat9/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev felCat9) > do.log 2>&1 # real 237m57.966s # continuing after rmsk done featureBits -countGaps felCat9 rmsk windowmaskerSdust \ > fb.felCat9.rmsk.windowmaskerSdust.txt 2>&1 time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -continue=cleanup -dbHost=hgwdev felCat9) > cleanup.log 2>&1 # real 1m52.250s # Masking statistics cat faSize.felCat9.cleanWMSdust.txt # 2521863845 bases (45410675 N's 2476453170 real 1636004228 upper 840448942 # lower) in 4508 sequences in 1 files # Total size: mean 559419.7 sd 9108161.9 min 1969 (chrUn_NW_019369707v1) # max 242100913 (chrA1) median 9988 # %33.33 masked total, %33.94 masked real cat fb.felCat9.rmsk.windowmaskerSdust.txt # 588428058 bases of 2521863845 (23.333%) in intersection ########################################################################## # run up idKeys files for ncbiRefSeq (DONE - 2018-02-27 - Hiram) mkdir /hive/data/genomes/felCat9/bed/idKeys cd /hive/data/genomes/felCat9/bed/idKeys time (doIdKeys.pl -twoBit=/hive/data/genomes/felCat9/felCat9.unmasked.2bit -buildDir=`pwd` felCat9) > do.log 2>&1 & # real 3m8.497s cat felCat9.keySignature.txt # a563ebb4989e5d9dd8e1b57698664160 ########################################################################## # cpgIslands - (DONE - 2018-02-28 - Hiram) mkdir /hive/data/genomes/felCat9/bed/cpgIslands cd /hive/data/genomes/felCat9/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku felCat9) > do.log 2>&1 & # real 5m8.572s cat fb.felCat9.cpgIslandExt.txt # 49418190 bases of 2476453204 (1.996%) in intersection ############################################################################## # genscan - (DONE - 2018-02-28 - Hiram) mkdir /hive/data/genomes/felCat9/bed/genscan cd /hive/data/genomes/felCat9/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku felCat9) > do.log 2>&1 & # real 867m26.572s # one failed job, run with 2,000,000 window:: time ./runGsBig2M.csh chrE1 000 gtf/000/chrE1.gtf pep/000/chrE1.pep subopt/000/chrE1.bed # real 290m18.460s # continuing: time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -continue=makeBed -bigClusterHub=ku felCat9) > makeBed.log 2>&1 & # real 1m55.321s cat fb.felCat9.genscan.txt # 60893812 bases of 2476453204 (2.459%) in intersection cat fb.felCat9.genscanSubopt.txt # 51934230 bases of 2476453204 (2.097%) in intersection ############################################################################# # augustus gene track (DONE - 2018-02-28 - Hiram) mkdir /hive/data/genomes/felCat9/bed/augustus cd /hive/data/genomes/felCat9/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev -workhorse=hgwdev felCat9) > do.log 2>&1 & # real 187m51.379s cat fb.felCat9.augustusGene.txt # 51863692 bases of 2476453204 (2.094%) in intersection ############################################################################# # lastz/chain/net swap human/hg38 (DONE - 2018-03-14 - Hiram) # original alignment cd /hive/data/genomes/hg38/bed/lastzFelCat9.2018-03-14 cat fb.hg38.chainFelCat9Link.txt # 1579231929 bases of 3049335806 (51.789%) in intersection cat fb.hg38.chainSynFelCat9Link.txt # 1516804589 bases of 3049335806 (49.742%) in intersection cat fb.hg38.chainRBestFelCat9Link.txt # 1449222744 bases of 3049335806 (47.526%) in intersection # and for the swap: mkdir /hive/data/genomes/felCat9/bed/blastz.hg38.swap cd /hive/data/genomes/felCat9/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzFelCat9.2018-03-14/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 142m37.100s cat fb.felCat9.chainHg38Link.txt # 1486134443 bases of 2476453204 (60.011%) in intersection cat fb.felCat9.chainSynHg38Link.txt # 1452577988 bases of 2476453204 (58.656%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ felCat9 hg38) > rbest.log 2>&1 & # real 623m28.676s cat fb.felCat9.chainRBestHg38Link.txt # 1449521349 bases of 2476453204 (58.532%) in intersection ######################################################################### # lastz/chain/net swap mouse/mm10 (DONE - 2018-03-14 - Hiram) # alignment to mouse/mm10: cd /hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14 cat fb.mm10.chainFelCat9Link.txt # 801023018 bases of 2652783500 (30.196%) in intersection cat fb.mm10.chainRBestFelCat9Link.txt # 761411281 bases of 2652783500 (28.702%) in intersection # and for the swap mkdir /hive/data/genomes/felCat9/bed/blastz.mm10.swap cd /hive/data/genomes/felCat9/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 70m51.860s cat fb.felCat9.chainMm10Link.txt # 779862191 bases of 2476453204 (31.491%) in intersection cat fb.felCat9.chainSynMm10Link.txt # 754481540 bases of 2476453204 (30.466%) in intersection time (doRecipBest.pl -load felCat9 mm10 -buildDir=`pwd` \ -workhorse=hgwdev) > rbest.log 2>&1 & # real 375m4.937s cat fb.felCat9.chainRBestMm10Link.txt # 760753851 bases of 2476453204 (30.719%) in intersection ############################################################################## # Create kluster run files (DONE - 2018-02-28 - Hiram) cd /hive/data/genomes/felCat9 # numerator is felCat9 gapless bases "real" as reported by: featureBits -noRandom -noHap felCat9 gap # 45323781 bases of 2414945138 (1.877%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2414945138 / 2861349177 \) \* 1024 # ( 2414945138 / 2861349177 ) * 1024 = 864.243987 # ==> use -repMatch=800 same as felCat8 cd /hive/data/genomes/felCat9 blat felCat9.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/felCat9.11.ooc \ -repMatch=800 # Wrote 30315 overused 11-mers to jkStuff/felCat9.11.ooc # felCat8 at repMatch=800 was: # Wrote 36446 overused 11-mers to jkStuff/felCat8.11.ooc # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' felCat9 \ | sort -k7,7nr | ave -col=7 stdin # these are all centromere's at 2 and 3 million: # Q1 2000000.000000 # median 2000000.000000 # Q3 2000000.000000 # average 2052631.578947 # min 2000000.000000 # max 3000000.000000 # count 19 # total 39000000.000000 # standard deviation 229415.733871 # +----+-------+-----------+-----------+----+---+---------+------------+----+ # | 19 | chrA1 | 88880063 | 90880063 | 6 | N | 2000000 | centromere | no | # | 9 | chrF2 | 0 | 2000000 | 1 | N | 2000000 | centromere | no | # | 9 | chrF1 | 0 | 2000000 | 1 | N | 2000000 | centromere | no | # | 1 | chrE2 | 23929417 | 25929417 | 22 | N | 2000000 | centromere | no | # | 1 | chrE1 | 24862965 | 26862965 | 10 | N | 2000000 | centromere | no | # | 12 | chrD4 | 31525997 | 33525997 | 18 | N | 2000000 | centromere | no | # | 12 | chrD3 | 30780723 | 32780723 | 32 | N | 2000000 | centromere | no | # | 11 | chrD2 | 19812854 | 21812854 | 14 | N | 2000000 | centromere | no | # | 1 | chrD1 | 32184683 | 34184683 | 20 | N | 2000000 | centromere | no | # | 2 | chrC2 | 75225443 | 77225443 | 10 | N | 2000000 | centromere | no | # | 16 | chrA2 | 60080733 | 62080733 | 32 | N | 2000000 | centromere | no | # | 15 | chrA3 | 50856976 | 52856976 | 18 | N | 2000000 | centromere | no | # | 13 | chrB1 | 38294479 | 40294479 | 10 | N | 2000000 | centromere | no | # | 12 | chrB2 | 29165855 | 31165855 | 8 | N | 2000000 | centromere | no | # | 12 | chrB3 | 28663498 | 30663498 | 6 | N | 2000000 | centromere | no | # | 14 | chrB4 | 43808484 | 45808484 | 12 | N | 2000000 | centromere | no | # | 2 | chrC1 | 107907036 | 109907036 | 24 | N | 2000000 | centromere | no | # | 15 | chrX | 50521356 | 52521356 | 48 | N | 2000000 | centromere | no | # | 1 | chrE3 | 16733304 | 19733304 | 8 | N | 3000000 | centromere | no | # +----+-------+-----------+-----------+----+---+---------+------------+----+ # therefore, minimum gap size of 2000000 gapToLift -verbose=2 -minGap=2000000 felCat9 jkStuff/felCat9.nonBridged.lft \ -bedFile=jkStuff/felCat9.nonBridged.bed ############################################################################## # LIFTOVER TO felCat8 (DONE - 2018-02-28 - Hiram) ssh hgwdev mkdir /hive/data/genomes/felCat9/bed/blat.felCat8.2018-02-28 cd /hive/data/genomes/felCat9/bed/blat.felCat8.2018-02-28 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/felCat9/jkStuff/felCat9.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ felCat9 felCat8) > do.log 2>&1 # real 605m27.304s # verify the convert link on the test browser is now active from felCat9 to # felCat8 ############################################################################## # GENBANK AUTO UPDATE (DONE - 2018-02-28 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Felis catus 2446 921 419 # edit etc/genbank.conf to add felCat9 just before felCat8 # Nov 2017 (GSC) at Washington University (WashU) School of Medicine # Felis_catus_9.0 # felCat9 (Cat) felCat9.serverGenome = /hive/data/genomes/felCat9/felCat9.2bit felCat9.clusterGenome = /hive/data/genomes/felCat9/felCat9.2bit felCat9.ooc = /hive/data/genomes/felCat9/jkStuff/felCat9.11.ooc felCat9.lift = /hive/data/genomes/felCat9/jkStuff/felCat9.nonBridged.lft felCat9.perChromTables = no felCat9.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} felCat9.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} felCat9.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} felCat9.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} felCat9.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} felCat9.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} felCat9.genbank.mrna.xeno.load = yes felCat9.downloadDir = felCat9 felCat9.refseq.mrna.native.load = yes felCat9.refseq.mrna.xeno.load = yes felCat9.refseq.mrna.xeno.loadDesc = yes # felCat9.upstreamGeneTbl = refGene # felCat9.upstreamMaf = multiz6way # /hive/data/genomes/felCat9/bed/multiz6way/species.list # add felCat9 to: # etc/align.dbs etc/hgwdev.dbs git commit -m 'adding felCat9/cat refs #21043' \ etc/genbank.conf etc/align.dbs etc/hgwdev.dbs git push # update /cluster/data/genbank/: make etc-update # a few days later the genbank tables will be in the database ############################################################################# # ncbiRefSeq (DONE - 2018-03-13 - Hiram) mkdir /hive/data/genomes/felCat9/bed/ncbiRefSeq cd /hive/data/genomes/felCat9/bed/ncbiRefSeq time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Felis_catus \ GCF_000181335.3_Felis_catus_9.0 felCat9) > do.log 2>&1 # real 8m52.047s cat fb.ncbiRefSeq.felCat9.txt # 81659062 bases of 2476453204 (3.297%) in intersection ######################################################################### # BLATSERVERS ENTRY (DONE - 2018-08-13 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("felCat9", "blat1c", "17900", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("felCat9", "blat1c", "17901", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## reset default position to same as what felCat8 has ## (DONE - 2018-03-13 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrA2:53333898-53447862" where name="felCat9";' hgcentraltest ############################################################################ # LIFTOVER TO felCat8 (DONE - 2018-03-19 - Hiram) ssh hgwdev mkdir /hive/data/genomes/felCat9/bed/blat.felCat8.2018-03-19 cd /hive/data/genomes/felCat9/bed/blat.felCat8.2018-03-19 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/felCat9/jkStuff/felCat9.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ felCat9 felCat8) > do.log 2>&1 & # real 192m10.105s # verify the convert link on the test browser is now active from felCat9 to # felCat8 ############################################################################ # LIFTOVER TO felCat5 (DONE - 2018-03-19 - Hiram) ssh hgwdev mkdir /hive/data/genomes/felCat9/bed/blat.felCat5.2018-03-19 cd /hive/data/genomes/felCat9/bed/blat.felCat5.2018-03-19 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/felCat9/jkStuff/felCat9.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ felCat9 felCat5) > do.log 2>&1 & # real 138m37.419s # verify the convert link on the test browser is now active from felCat9 to # felCat5 ############################################################################ # all.joiner update, downloads and in pushQ - (DONE - 2018-03-19 - Hiram) cd $HOME/kent/src/hg/makeDb/schema ~/kent/src/hg/utils/automation/verifyBrowser.pl felCat9 # 65 tables in database felCat9 - Cat, Felis catus # verified 65 tables, 0 extra tables, 10 optional tables # 13 genbank tables found # verified 42 tables, 1 missing tables # 1 gapOverlap # liftOver to previous versions: 2, from previous versions: 2 # there were no gapOverlap items found # fixup all.joiner until this is a clean output joinerCheck -database=felCat9 -tableCoverage all.joiner joinerCheck -database=felCat9 -times all.joiner joinerCheck -database=felCat9 -keys all.joiner cd /hive/data/genomes/felCat9 rm -fr TemporaryTrackDbCheckout time (makeDownloads.pl -workhorse=hgwdev felCat9) > downloads.log 2>&1 # real 26m28.250s # now ready for pushQ entry mkdir /hive/data/genomes/felCat9/pushQ cd /hive/data/genomes/felCat9/pushQ time (makePushQSql.pl -redmineList felCat9) > felCat9.pushQ.sql 2> stderr.out # remove the tandemDups from the file list: sed -i -e "/tandemDups/d" redmine.felCat9.table.list sed -i -e "/Tandem Dups/d" redmine.felCat9.releaseLog.txt # check for errors in stderr.out, some are OK, e.g.: # WARNING: felCat9 does not have seq # WARNING: felCat9 does not have extFile # add the path names to the listing files in the redmine issue # in the three appropriate entry boxes: /hive/data/genomes/felCat9/pushQ/redmine.felCat9.file.list /hive/data/genomes/felCat9/pushQ/redmine.felCat9.releaseLog.txt /hive/data/genomes/felCat9/pushQ/redmine.felCat9.table.list #########################################################################