# for emacs: -*- mode: sh; -*- # This file describes browser build for the proCoq1 # Assembly name: Pcoq_1.0 # Organism name: Propithecus coquereli (Coquerel's sifaka) # Isolate: 6110/MARCELLA # Sex: female # Taxid: 379532 # BioSample: SAMN03121823 # BioProject: PRJNA281642 # Submitter: Baylor College of Medicine # Date: 2015-3-23 # Assembly type: haploid # Release type: major # Assembly level: Scaffold # Genome representation: full # WGS project: JZKE01 # Assembly method: AllPathsLG v. R43839; Atlas Link v. 1.0; Atlas Gapfill v. 2.2 # Genome coverage: 104.7x # Sequencing technology: Illumina # RefSeq category: Representative Genome # GenBank assembly accession: GCA_000956105.1 # RefSeq assembly accession: GCF_000956105.1 # RefSeq assembly and GenBank assemblies identical: no # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000956115.1 GCF_000956115.1 Primary Assembly ## GCF_000073495.1 non-nuclear ############################################################################# ## The mitochondrion sequence: NC_011053.1 has a genbank ID of AB286049.1 ############################################################################# ############################################################################# # fetch sequence from new style download directory (DONE - 2017-09-27 - Hiram) mkdir -p /hive/data/genomes/proCoq1/refseq cd /hive/data/genomes/proCoq1/refseq time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Propithecus_coquereli/all_assembly_versions/GCF_000956105.1_Pcoq_1.0/ ./ # sent 562 bytes received 2514962369 bytes 18028408.11 bytes/sec # total size is 2514652560 speedup is 1.00 # real 2m19.428s # measure what we have here: faSize G*0_genomic.fna.gz # 2798152141 bases (714387616 N's 2083764525 real 1506203595 upper # 577560930 lower) in 22539 sequences in 1 files # Total size: mean 124147.1 sd 911840.2 min 307 (NW_012144527.1) # max 29265399 (NW_012133857.1) median 6864 # %20.64 masked total, %27.72 masked real ############################################################################# # fixup to UCSC naming scheme (DONE - 2017-09-27 - Hiram) mkdir /hive/data/genomes/proCoq1/ucsc cd /hive/data/genomes/proCoq1/ucsc # verify no duplicate sequences: faToTwoBit ../refseq/G*0_genomic.fna.gz refseq.2bit twoBitDup refseq.2bit # should be silent # verify all names are .1: twoBitInfo refseq.2bit stdout | awk '{print $1}' \ | sed -e 's/_[0-9]\+//;' | sort | uniq -c # 1 NC.1 # 22538 NW.1 # since they are all .1, change the names to be v1: zcat ../refseq/G*0_assembly_structure/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | grep -v "^#" | sed -e 's/\.1/v1/;' > chrUn.proCoq1.agp zcat ../refseq/G*0_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz \ | sed -e 's/.1 Propithecus .*/v1/;' > chrUn.proCoq1.fa zcat ../refseq/G*0_assembly_structure/non-nuclear/assembled_chromosomes/AGP/chrMT.comp.agp.gz \ | grep -v "^#" | sed -e 's/^NC_[0-9.]\+/chrM/;' > chrM.proCoq1.agp zcat ../refseq/G*0_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz \ | sed -e 's/NC_.*/chrM/;' > chrM.proCoq1.fa time gzip *.fa # real 11m42.655s # verify these two files are compatible: faToTwoBit chrUn.proCoq1.fa.gz chrM.proCoq1.fa.gz test.2bit cat chr*.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail # All AGP and FASTA entries agree - both files are valid # no longer need these rm -f test.2bit refseq.2bit ############################################################################# # photo (DONE - 2017-02-14 - Hiram) mkdir /hive/data/genomes/proCoq1/photo cd /hive/data/genomes/proCoq1/photo wget -O photoFile "http://maxpixel.freegreatpicture.com/static/photo/1x/Coquerels-Sifaka-Sifaka-Propithecus-Coquereli-1809664.jpg" convert -sharpen 0 -normalize -geometry 400x400 -quality 80 photoFile Propithecus_coquereli.jpg cd /hive/data/genomes/proCoq1 printf "photoCreditURL http://maxpixel.freegreatpicture.com/ photoCreditName Max Pixel " > photoReference.txt # check that Propithecus_coquereli.jpg file into source tree # src/hg/htdocs/images/ and copy to /usr/local/apache/htdocs/images/ ############################################################################# # Initial database build (DONE - 2017-09-27 - Hiram) cd /hive/data/genomes/proCoq1 ~/kent/src/hg/utils/automation/prepConfig.pl proCoq1 mammal primate \ refseq/*_assembly_report.txt > proCoq1.config.ra # going to have trouble with that quote mark in Coquerel's name: # verify it looks sane: cat proCoq.config.ra # config parameters for makeGenomeDb.pl: db proCoq1 clade mammal genomeCladePriority 35 scientificName Propithecus coquereli commonName Coquerel's sifaka assemblyDate Mar. 2015 assemblyLabel Baylor College of Medicine assemblyShortLabel Pcoq_1.0 orderKey 3623 # mitochondrial sequence included in refseq release # mitoAcc NC_011053.1 mitoAcc none fastaFiles /hive/data/genomes/proCoq1/ucsc/*.fa.gz agpFiles /hive/data/genomes/proCoq1/ucsc/*.agp # qualFiles none dbDbSpeciesDir primate photoCreditURL http://maxpixel.freegreatpicture.com/ photoCreditName Max Pixel ncbiGenomeId 24390 ncbiAssemblyId 315741 ncbiAssemblyName Pcoq_1.0 ncbiBioProject 281642 ncbiBioSample SAMN03121823 genBankAccessionID GCF_000956105.1 taxId 379532 # verify sequence and AGP are OK: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp proCoq1.config.ra) > agp.log 2>&1 # *** All done! (through the 'agp' step) # real 2m35.718s # this is going to fail on the dbDb insert: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db proCoq1.config.ra) > db.log 2>&1 # real 17m6.039s # finish the insert statements: hgsql hgcentraltest -e "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"Coquerel's sifaka\", \"mammal\", 35)" hgsql hgcentraltest -e "INSERT INTO defaultDb (genome, name) VALUES (\"Coquerel's sifaka\", \"proCoq1\")" # then continuing: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=trackDb proCoq1.config.ra) > trackDb.log 2>&1 # real 0m8.564s # fixup the description.html for the e acute: # WikiMedia Commons: Clément Bardot # check in the trackDb files created and add to trackDb/makefile # then, clean up: rm -fr TemporaryTrackDbCheckout/ ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2017-09-27 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/cpgIslandsUnmasked cd /hive/data/genomes/proCoq1/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/proCoq1/proCoq1.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku proCoq1) > do.log 2>&1 # real 16m26.331s cat fb.proCoq1.cpgIslandExtUnmasked.txt # 44235482 bases of 2083764538 (2.123%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2017-09-27 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/cytoBand cd /hive/data/genomes/proCoq1/bed/cytoBand makeCytoBandIdeo.csh proCoq1 ############################################################################# # running repeat masker (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/repeatMasker cd /hive/data/genomes/proCoq1/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku proCoq1) > do.log 2>&1 # real 341m18.065s # has a failure: # RepeatMasker bug?: Undefined id, line 2112033 of input: # 392 25.1 3.5 0.0 NW_012149596v1 1472171 1472277 (936613) + L1ME1 LINE/L1 6054 6166 (278) # fix it, remove that line: grep -v "NW_012149596v1 1472171 1472277" proCoq1.fa.out \ > clean.proCoq1.fa.out grep -v "NW_012149596v1 1472171 1472277" proCoq1.sorted.fa.out \ > clean.proCoq1.sorted.fa.out mv proCoq1.fa.out broken.proCoq1.fa.out mv proCoq1.sorted.fa.out broken.proCoq1.sorted.fa.out mv clean.proCoq1.fa.out proCoq1.fa.out mv clean.proCoq1.sorted.fa.out proCoq1.sorted.fa.out # finish the 'cat' step: /cluster/bin/scripts/extractNestedRepeats.pl proCoq1.fa.out \ | sort -k1,1 -k2,2n > proCoq1.nestedRepeats.bed # continuing: time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -continue=mask -smallClusterHub=ku proCoq1) > mask.log 2>&1 # real 20m17.716s cat faSize.rmsk.txt # 2798152141 bases (714387616 N's 2083764525 real 1263522213 upper # 820242312 lower) in 22539 sequences in 1 files # Total size: mean 124147.1 sd 911840.2 min 307 (NW_012144527v1) # max 29265399 (NW_012133857v1) median 6864 # %29.31 masked total, %39.36 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; time featureBits -countGaps proCoq1 rmsk # 820905660 bases of 2798152141 (29.337%) in intersection # real 0m39.877s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the faSize count # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2017-09-27 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/simpleRepeat cd /hive/data/genomes/proCoq1/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409 5 proCoq1) > do.log 2>&1 # real 12m30.019s cat fb.simpleRepeat # 89300635 bases of 2721424086 (3.281%) in intersection # using the rmsk result cd /hive/data/genomes/proCoq1 twoBitMask bed/repeatMasker/proCoq1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed proCoq1.2bit # you can safely ignore the warning about fields >= 13 # if using windowMasker result: # twoBitMask bed/windowMasker/proCoq1.cleanWMSdust.2bit \ # -add bed/simpleRepeat/trfMask.bed proCoq1.2bit twoBitToFa proCoq1.2bit stdout | faSize stdin > faSize.proCoq1.2bit.txt cat faSize.proCoq1.2bit.txt # 2798152141 bases (714387616 N's 2083764525 real 1262916377 upper # 820848148 lower) in 22539 sequences in 1 files # Total size: mean 124147.1 sd 911840.2 min 307 (NW_012144527v1) # max 29265399 (NW_012133857v1) median 6864 # %29.34 masked total, %39.39 masked real # reset gbdb symlink rm /gbdb/proCoq1/proCoq1.2bit ln -s `pwd`/proCoq1.2bit /gbdb/proCoq1/proCoq1.2bit ########################################################################## # CREATE MICROSAT TRACK (DONE - 2017-09-28 - Hiram) ssh hgwdev mkdir /cluster/data/proCoq1/bed/microsat cd /cluster/data/proCoq1/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed proCoq1 microsat microsat.bed # Read 24078 elements of size 4 from microsat.bed ########################################################################## ## WINDOWMASKER (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/windowMasker cd /hive/data/genomes/proCoq1/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev proCoq1) > do.log 2>&1 # real 349m53.974s # Masking statistics cat faSize.proCoq1.cleanWMSdust.txt # 2798152141 bases (714387616 N's 2083764525 real 1494561914 upper # 589202611 lower) in 22539 sequences in 1 files # Total size: mean 124147.1 sd 911840.2 min 307 (NW_012144527v1) # max 29265399 (NW_012133857v1) median 6864 # %21.06 masked total, %28.28 masked real cat fb.proCoq1.rmsk.windowmaskerSdust.txt # 350354463 bases of 2798152141 (12.521%) in intersection ########################################################################## # run up idKeys files for ncbiRefSeq (DONE - 2017-12-14 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/idKeys cd /hive/data/genomes/proCoq1/bed/idKeys time (doIdKeys.pl -buildDir=`pwd` proCoq1) > do.log 2>&1 & # real 11m17.211s cat proCoq1.keySignature.txt # fbca0b1733d09e002f53e967677d2abc ########################################################################## # cpgIslands - (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/cpgIslands cd /hive/data/genomes/proCoq1/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku proCoq1) > do.log 2>&1 & # real 6m2.712s cat fb.proCoq1.cpgIslandExt.txt # 37852932 bases of 2083764538 (1.817%) in intersection ############################################################################## # augustus - (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/augustus cd /hive/data/genomes/proCoq1/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev -utr -workhorse=hgwdev proCoq1) \ > do.log 2>&1 # real 147m25.270s cat fb.proCoq1.augustusGene.txt # 50238200 bases of 2083764538 (2.411%) in intersection ######################################################################### # genscan - (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/genscan cd /hive/data/genomes/proCoq1/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku proCoq1) > do.log 2>&1 & # real 40m30.369s cat fb.proCoq1.genscan.txt # 56723464 bases of 2083764538 (2.722%) in intersection cat fb.proCoq1.genscanSubopt.txt # 57315334 bases of 2083764538 (2.751%) in intersection ######################################################################### # ucscToINSDC table/track (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/ucscToINSDC cd /hive/data/genomes/proCoq1/bed/ucscToINSDC # check for chrM accession: grep chrM ../../proCoq1.agp # chrM 1 17104 1 O NC_011053.1 1 17104 + # use that accession as an argument to this command ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ ../../refseq/*0_assembly_structure/Primary_Assembly NC_011053.1 # this is actually ucscToRefSeq since this is a RefSeq assembly: sort -k2 ucscToINSDC.txt > ucscToRefSeq.txt rm -f ucscToINSDC.txt awk '{printf "%s\t%s\n", $2, $1}' ucscToRefSeq.txt | sort > refseqToUcsc.txt # There is no INSDC name entry in the assembly_report for MT, # thus the sed to fixup the name: grep -v "^#" ../../refseq/GCF*_assembly_report.txt | cut -f5,7 \ | sed -e 's/na\b/AB286049.1/;' | awk '{printf "%s\t%s\n", $2, $1}' \ | sort > refseq.insdc.txt # the sed \b means to match word awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \ > ucscToINSDC.bed join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-4 \ > ucscToRefSeq.bed # verify all names are coming through, should be same line count: wc -l * # 22539 name.coordinate.tab # 22539 refseq.insdc.txt # 22539 refseqToUcsc.txt # 22539 ucscToINSDC.bed # 22539 ucscToRefSeq.bed # 22539 ucscToRefSeq.txt # verify chrM is correct: grep chrM *.bed # ucscToINSDC.bed:chrM 0 17104 AB286049.1 # ucscToRefSeq.bed:chrM 0 17104 NC_011053.1 cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 14 # use the 14 in this sed sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab proCoq1 ucscToINSDC stdin ucscToINSDC.bed cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1 # 14 # use the 14 in this sed sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab proCoq1 ucscToRefSeq stdin ucscToRefSeq.bed # checkTableCoords should be silent for no errors: checkTableCoords proCoq1 # should cover %100 entirely: featureBits -countGaps proCoq1 ucscToINSDC # 2798152141 bases of 2798152141 (100.000%) in intersection featureBits -countGaps proCoq1 ucscToRefSeq # 2798152141 bases of 2798152141 (100.000%) in intersection ######################################################################### # add chromAlias table (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/proCoq1/bed/chromAlias cd /hive/data/genomes/proCoq1/bed/chromAlias hgsql -N -e 'select chrom,name,"refseq" from ucscToRefSeq;' proCoq1 \ > ucsc.refseq.tab hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' proCoq1 \ > ucsc.genbank.tab # verify chrM is correct: grep chrM * ucsc.genbank.tab:chrM AB286049.1 genbank ucsc.refseq.tab:chrM NC_011053.1 refseq awk '{printf "%s\t%s\t%s\n", $2,$1,$3}' ucsc.genbank.tab ucsc.refseq.tab \ | sort > proCoq1.chromAlias.tab hgLoadSqlTab proCoq1 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ proCoq1.chromAlias.tab cd /hive/data/genomes/proCoq1/bed/chromAlias # add ensembl names 2017-12-14 mkdir previous mv *.tab previous join -t$'\t' ../idKeys/proCoq1.idKeys.txt \ ../../ensembl/ensemblProCoq1.idKeys.txt \ | cut -f2,3 | sort > ucsc.ensembl.tab cut -f1,2 previous/ucsc.refseq.tab > ucsc.refseq.tab cut -f1,2 previous/ucsc.genbank.tab > ucsc.genbank.tab ~/kent/src/hg/utils/automation/chromAlias.pl sort -o proCoq1.chromAlias.tab proCoq1.chromAlias.tab for t in refseq genbank ensembl do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t proCoq1.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 22539 =? 22539 OK # checking genbank: 22539 =? 22539 OK # checking ensembl: 22539 =? 22539 OK hgLoadSqlTab proCoq1 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ proCoq1.chromAlias.tab ######################################################################### # Create kluster run files (DONE - 2017-09-28 - Hiram) cd /hive/data/genomes/proCoq1 # numerator is proCoq1 gapless bases "real" as reported by: head -1 faSize.proCoq1.2bit.txt # 2798152141 bases (714387616 N's 2083764525 real 1262916377 upper # 820848148 lower) in 22539 sequences in 1 files # numerator is 'real' base count # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2083764525 / 2861349177 \) \* 1024 # ( 2083764525 / 2861349177 ) * 1024 = 745.723343 # ==> use -repMatch=700 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/proCoq1 time blat proCoq1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/proCoq1.11.ooc \ -repMatch=700 # Wrote 23741 overused 11-mers to jkStuff/proCoq1.11.ooc # real 0m47.660s # there are no non-bridged gaps, do not need to do this # check non-bridged gaps to see what the typical size is: # hgsql -N -e 'select * from gap where bridge="no" order by size;' proCoq1 # | ave -tableOut -col=7 stdin # # min Q1 median Q3 max mean N sum stddev # 50076 58368.8 70128 100495 1.07816e+07 178173 670 1.19376e+08 672006 # note the minimum non-bridged gap size is 50,076 # gapToLift -verbose=2 -minGap=50000 proCoq1 jkStuff/proCoq1.nonBridged.lft \ # -bedFile=jkStuff/proCoq1.nonBridged.bed # hgsql -N \ # -e 'select * from gap where bridge="no" order by size;' proCoq1 \ # | ave -col=7 stdin # not needed: # gapToLift -verbose=2 -minGap=100 bosTau7 jkStuff/nonBridged.lft \ # -bedFile=jkStuff/nonBridged.bed # survey sizes: n50.pl chrom.sizes # reading: chrom.sizes # contig count: 22539, total size: 2798152141, one half size: 1399076070 # cumulative N50 count contig contig size # 1396934527 148 NW_012139969v1 5607000 # 1399076070 one half size # 1402539436 149 NW_012143634v1 5604909 ############################################################################# # GENBANK AUTO UPDATE (DONE - 2017-09-28 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows one mRNA, three refSeq: # organism mrnaCnt estCnt refSeqCnt # Propithecus coquereli 7 0 0 # Propithecus verreauxi 1 0 0 # edit etc/genbank.conf to add proCoq1 just after neoSch1 # proCoq1 (Coquerel's sifaka - Propithecus coquereli) taxId 379532 proCoq1.serverGenome = /hive/data/genomes/proCoq1/proCoq1.2bit proCoq1.clusterGenome = /hive/data/genomes/proCoq1/proCoq1.2bit proCoq1.ooc = /hive/data/genomes/proCoq1/jkStuff/proCoq1.11.ooc proCoq1.lift = no proCoq1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} proCoq1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} proCoq1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} proCoq1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} proCoq1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} proCoq1.genbank.est.xeno.pslCDnaFilter = ${lowCover.genbank.est.xeno.pslCDnaFilter} proCoq1.downloadDir = proCoq1 proCoq1.refseq.mrna.native.load = yes proCoq1.refseq.mrna.xeno.load = yes # DO NOT NEED genbank.mrna.xeno except for human, mouse proCoq1.genbank.mrna.xeno.load = no proCoq1.genbank.mrna.native.load = no proCoq1.genbank.est.native.load = no proCoq1.perChromTables = no # And edit src/lib/gbGenome.c to add new species. Adding lines: # static char *proCoqNames[] = {"Propithecus coquereli", NULL}; # {"proCoq", proCoqNames}, git commit -m "Added proCoq1/Coquerel's sifaka; refs #20235" \ etc/genbank.conf src/lib/gbGenome.c git push # update /cluster/data/genbank/: make etc-update make install-server screen # control this business with a screen since it takes a while cd /cluster/data/genbank time ./bin/gbAlignStep -initial proCoq1 # logFile: var/build/logs/2017.09.28-13:48:33.proCoq1.initalign.log # real 36m52.874s tail -2 var/build/logs/2017.09.28-13:48:33.proCoq1.initalign.log # hgwdev 2017.09.28-14:22:56 proCoq1.initalign: Succeeded: proCoq1 # hgwdev 2017.09.28-14:25:25 proCoq1.initalign: finish # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.proCoq1 # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad proCoq1 # logFile: var/dbload/hgwdev/logs/2017.09.28-16:06:12.proCoq1.dbload.log # real 5m18.311s tail -1 var/dbload/hgwdev/logs/2017.09.28-16:06:12.proCoq1.dbload.log # hgwdev 2017.09.28-16:11:30 proCoq1.dbload: finish # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add proCoq1 to: # vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added proCoq1 - Coquerel's sifaka/Propithecus coquereli refs #20235" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2017-09-28 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/primate/proCoq1 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" proCoq1 \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c # 299069 JZKE.1 # 1 NC_.1 # implies a rule: '[JN][CZ][K_][E0-9]+(\.[0-9]+)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" proCoq1 | wc -l # 299070 hgsql -N -e "select frag from gold;" proCoq1 \ | egrep -e '[JN][CZ][K_][E0-9]+(\.[0-9]+)?' | wc -l # 299070 hgsql -N -e "select frag from gold;" proCoq1 \ | egrep -v -e '[JN][CZ][K_][E0-9]+(\.[0-9]+)?' | wc -l # 0 # hence, add to trackDb/chicken/proCoq1/trackDb.ra searchTable gold shortCircuit 1 termRegex [JN][CZ][K_][E0-9]+(\.[0-9]+)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box, check full accession name, # accession name without .1 # truncated accession name produces multiple results # and the two chrM accessions, with and without the .1 and partial name # use: accessionName:n-m to display locations n to m on that accession git commit -m 'add gold/assembly track search rule refs #20235' *.ra git push ######################################################################### # lastz/chain/net swap from hg38 (DONE - 2017-09-28 - Hiram) # alignment to hg38 cd /hive/data/genomes/hg38/bed/lastzProCoq1.2017-09-28 cat fb.hg38.chainProCoq1Link.txt # 990017370 bases of 3049335806 (32.467%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` hg38 proCoq1) \ > rbest.log 2>&1 & # real 330m51.520s # and for the swap: mkdir /hive/data/genomes/proCoq1/bed/blastz.hg38.swap cd /hive/data/genomes/proCoq1/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzProCoq1.2017-09-28/DEF \ -swap -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 44m29.295s cat fb.proCoq1.chainHg38Link.txt # 953092997 bases of 2083764538 (45.739%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` proCoq1 hg38) \ > rbest.log 2>&1 # real 334m45.550s ######################################################################### # lastz/chain/net swap from mm10 (DONE - 2017-09-28 - Hiram) # alignment on mm10 cd /hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28 cat fb.mm10.chainProCoq1Link.txt # 882327683 bases of 2652783500 (33.260%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 proCoq1) \ > rbest.log 2>&1 & # real 411m5.774s mkdir /hive/data/genomes/proCoq1/bed/blastz.mm10.swap cd /hive/data/genomes/proCoq1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 62m48.333s cat fb.proCoq1.chainMm10Link.txt # 863635783 bases of 2083764538 (41.446%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` proCoq1 mm10) \ > rbest.log 2>&1 # real 357m54.198s ############################################################################## # BLATSERVERS ENTRY (DONE - 2017-10-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev Starting trans gfServer for proCoq1 on host blat1c, port 17894 Starting untrans gfServer for proCoq1 on host blat1c, port 17895 hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("proCoq1", "blat1c", "17894", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("proCoq1", "blat1c", "17895", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## reset default position to similar location as hg38 default ## (DONE - 2017-10-04 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="NW_012153840v1:5167275-5184676" where name="proCoq1";' hgcentraltest ############################################################################ # all.joiner update, downloads and in pushQ - (DONE - 2017-10-05 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=proCoq1 -tableCoverage all.joiner joinerCheck -database=proCoq1 -times all.joiner joinerCheck -database=proCoq1 -keys all.joiner git commit -m "Added proCoq1 refs #20235" all.joiner # to get this installed, run a 'make alpha' in the hgTables directory # in a clean source tree that has been fully constructed cd /hive/data/genomes/proCoq1 time (makeDownloads.pl proCoq1) > downloads.log 2>&1 # real 15m45.307s # now ready for pushQ entry mkdir /hive/data/genomes/proCoq1/pushQ cd /hive/data/genomes/proCoq1/pushQ time (makePushQSql.pl -redmineList proCoq1) \ > proCoq1.pushQ.sql 2> stderr.out # real 3m48.126s # check for errors in stderr.out, some are OK, e.g.: # writing redmine listings to # redmine.proCoq1.file.list # redmine.proCoq1.table.list # redmine.proCoq1.releaseLog.txt # WARNING: proCoq1 does not have seq # WARNING: proCoq1 does not have extFile # WARNING: proCoq1 does not have estOrientInfo # WARNING: proCoq1 does not have mrnaOrientInfo # Enter the full path names of these listing files: # /hive/data/genomes/proCoq1/pushQ/redmine.proCoq1.file.list # /hive/data/genomes/proCoq1/pushQ/redmine.proCoq1.releaseLog.txt # /hive/data/genomes/proCoq1/pushQ/redmine.proCoq1.table.list # into the Redmine #20190 and set to QA Ready. #########################################################################