# for emacs: -*- mode: sh; -*- # Erinaceus europaeus 2.0 sequence: # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Erinaceus_europaeus/EriEur2.0/ # DATE: 30-May-2012 # ORGANISM: Erinaceus europaeus # TAXID: 9365 # ASSEMBLY LONG NAME: EriEur2.0 # ASSEMBLY SHORT NAME: EriEur2.0 # ASSEMBLY SUBMITTER: Broad Institute # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000296755.1 # FTP-RELEASE DATE: 22-Oct-2012 ########################################################################## # Download sequence (DONE - 2013-06-07 - Hiram) mkdir -p /hive/data/genomes/eriEur2/genbank cd /hive/data/genomes/eriEur2/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Erinaceus_europaeus/EriEur2.0/ ./ > fetch.log 2>&1 ########################################################################### # fixup to UCSC names (DONE - 2013-03-25 - Hiram) cd /hive/data/genomes/eriEur2 $HOME/kent/src/hg/utils/automation/unplacedScaffolds.pl # constructs the directory: /hive/data/genomes/eriEur2/ucsc # with files: cd /hive/data/genomes/eriEur2/ucsc ls -ogrt # -rwxrwxr-x 1 355 Jun 7 12:04 fetchChrM.sh # -rw-rw-r-- 1 17780 Jun 7 12:04 NC_002080.fa # -rw-rw-r-- 1 17704 Jun 7 12:04 chrM.fa # -rw-rw-r-- 1 37 Jun 7 12:04 chrM.agp # -rw-rw-r-- 1 25068539 Jun 7 12:37 eriEur2.ucsc.agp # -rw-rw-r-- 1 711064733 Jun 7 12:39 eriEur2.ucsc.fa.gz # NOTE: the chrM sequence was manually added to the fa.gz and .agp file # see the fetchChrM.sh script there # this script also constructs the eriEur2.unmasked.2bit file, but # this is not needed with the makeGenomeDb.pl script: rm -f /hive/data/genomes/eriEur2/eriEur2.unmasked.2bit ########################################################################### # Initial genome build (DONE - 2013-06-07 - Hiram) cd /hive/data/genomes/eriEur2 cat << '_EOF_' > eriEur2.config.ra # Config parameters for makeGenomeDb.pl: db eriEur2 clade mammal # genomeCladePriority 20.2 scientificName Erinaceus europaeus commonName Hedgehog assemblyDate May 2012 assemblyLabel Broad Institute EriEur2.0 assemblyShortLabel EriEur2.0 orderKey 2569 # chrM already included in agp and fa files mitoAcc none fastaFiles /cluster/data/eriEur2/ucsc/eriEur2.ucsc.fa.gz agpFiles /cluster/data/eriEur2/ucsc/eriEur2.ucsc.agp # qualFiles none dbDbSpeciesDir hedgehog photoCreditURL http://en.wikipedia.org/wiki/File:West_European_Hedgehog_%28Erinaceus_europaeus%292.jpg photoCreditName Papa Lima Whiskey, Wikipedia Commons ncbiGenomeId 227 ncbiAssemblyId 426148 ncbiAssemblyName EriEur2.0 ncbiBioProject 74585 genBankAccessionID GCA_000296755.1 taxId 9365 '_EOF_' # run step wise to confirm sequence and AGP files match each other time makeGenomeDb.pl -stop=agp eriEur2.config.ra > genomeDb.agp.log 2>&1 # real 4m15.127s # verify it is OK: tail -1 genomeDb.agp.log # *** All done! (through the 'agp' step) time nice -n +19 makeGenomeDb.pl -fileServer=hgwdev \ -workhorse=hgwdev -continue=db eriEur2.config.ra \ > genomeDb.db.log 2>&1 # real 17m22.508s # add the trackDb business to the source tree ########################################################################## # running repeat masker (DONE - 2013-06-07 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/repeatMasker cd /hive/data/genomes/eriEur2/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek eriEur2 > do.log 2>&1 & # real 770m28.305s # failing in nestedRepeats, e.g: # RepeatMasker bug?: Undefined id, line 3839311 of input: # 551 24.8 2.2 0.0 JH835619 1597561 1597571 (1160377) + LTR18_EE LTR/ERV1 1293 1343 (174) # ran that last job on hgwdev, then removed this single broken line and # finished off this one job, then continuing: time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -continue=mask -smallClusterHub=encodek eriEur2 > mask.log 2>&1 & # real 31m37.041s cat faSize.rmsk.txt # 2715720925 bases (382647390 N's 2333073535 real 1271539462 upper # 1061534073 lower) in 5803 sequences in 1 files # Total size: mean 467985.7 sd 1304803.0 min 1001 (AMDU01225431) # max 17429815 (JH835289) median 3704 # %39.09 masked total, %45.50 masked real egrep -i "versi|release" do.log # RepeatMasker version open-4.0.2 # April 29 2013 (open-4-0-2) version of RepeatMasker # CC RELEASE 20130422; ########################################################################## # running simple repeat (DONE - 2013-06-07 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/simpleRepeat cd /hive/data/genomes/eriEur2/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ eriEur2 > do.log 2>&1 & # real 14m53.481s cat fb.simpleRepeat # 62243136 bases of 2333073535 (2.668%) in intersection # using RMSK and TRF since RMSK is equivalent to the masking result of WM cd /hive/data/genomes/eriEur2 twoBitMask eriEur2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed eriEur2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa eriEur2.2bit stdout | faSize stdin > faSize.eriEur2.2bit.txt cat faSize.eriEur2.2bit.txt # 2715720925 bases (382647390 N's 2333073535 real 1269258231 upper # 1063815304 lower) in 5803 sequences in 1 files # Total size: mean 467985.7 sd 1304803.0 min 1001 (AMDU01225431) # max 17429815 (JH835289) median 3704 # %39.17 masked total, %45.60 masked real rm /gbdb/eriEur2/eriEur2.2bit ln -s `pwd`/eriEur2.2bit /gbdb/eriEur2/eriEur2.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2013-06-07 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/gap cd /hive/data/genomes/eriEur2/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../eriEur2.unmasked.2bit > findMotif.txt 2>&1 # real 0m37.313s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits eriEur2 -not gap -bed=notGap.bed # 2605196361 bases of 2605196361 (100.000%) in intersection # real 0m24.729s # used to do this featureBits, but it is really really slow if there # are a log of contigs time featureBits eriEur2 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 2715720925 (0.000%) in intersection # real 6m33.744s # this is much faster: awk '{print $3-$2,$0}' notGap.bed | sort -rn > notGap.sizes.txt # largest contiguous sequence: head -1 notGap.sizes.txt | awk '{print $1}' # 233448 # minimal coverage 1 base out of that largest sequence: echo 233448 | awk '{printf "%15.10f\n", 1/(2*$1)}' | sed -e 's/ //g' # 0.0000021418 time bedIntersect -minCoverage=0.0000021418 allGaps.bed notGap.bed \ test.new.gaps.bed # real 0m0.975s # number of bases in these new gaps, none: # -rw-rw-r-- 1 0 Jun 11 17:06 test.new.gaps.bed # when non-zero, to count them, e.g.: # awk '{print $3-$2}' test.new.gaps.bed | ave stdin | grep total # total 8314.000000 ######################################################################### # cytoBandIdeo - (DONE - 2013-06-12 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/cytoBand cd /hive/data/genomes/eriEur2/bed/cytoBand makeCytoBandIdeo.csh eriEur2 ########################################################################## ## WINDOWMASKER (DONE - 2013-05-30 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/windowMasker cd /hive/data/genomes/eriEur2/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev eriEur2 > do.log 2>&1 & # real 241m22.022s cat faSize.eriEur2.cleanWMSdust.txt # 2715720925 bases (382647390 N's 2333073535 real 1191999225 upper # 1141074310 lower) in 5803 sequences in 1 files # Total size: mean 467985.7 sd 1304803.0 min 1001 (AMDU01225431) # max 17429815 (JH835289) median 3704 # %42.02 masked total, %48.91 masked real # This is pretty good for WM, but RMSK isn't that bad either, # so, using the RMSK result to mask the genome featureBits -countGaps eriEur2 rmsk windowmaskerSdust > fb.eriEur2.rmsk.windowmaskerSdust.txt 2>&1 cat fb.eriEur2.rmsk.windowmaskerSdust.txt # 859433945 bases of 2715720925 (31.647%) in intersection ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-18 - Hiram) # Use -repMatch=850, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2333073535 / 2897316137 \) \* 1024 # ( 2333073535 / 2897316137 ) * 1024 = 824.579434 # round up to 850 # eriEur1 was: -repMatch=800 # Wrote 43503 overused 11-mers to eriEur1.11.ooc cd /hive/data/genomes/eriEur2 blat eriEur2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/eriEur2.11.ooc -repMatch=850 # Wrote 37160 overused 11-mers to jkStuff/eriEur2.11.ooc # there are *only* bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" eriEur2 | sort | uniq -c # 219764 yes ######################################################################### # AUTO UPDATE GENBANK (WORKING - 2013-06-06 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Erinaceus europaeus 10 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf # this appears that species has almost nothing ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add eriEur2 before echTel1 and commit to GIT # eriEur2 (European Hedgehog) eriEur2.serverGenome = /hive/data/genomes/eriEur2/eriEur2.2bit eriEur2.clusterGenome = /hive/data/genomes/eriEur2/eriEur2.2bit eriEur2.ooc = /hive/data/genomes/eriEur2/jkStuff/eriEur2.11.ooc eriEur2.lift = no eriEur2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} eriEur2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} eriEur2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} eriEur2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} eriEur2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} eriEur2.refseq.mrna.native.load = no eriEur2.refseq.mrna.xeno.load = yes eriEur2.genbank.mrna.xeno.load = no eriEur2.genbank.est.native.load = no eriEur2.downloadDir = eriEur2 eriEur2.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding eriEur2 hedgehog refs #9419" etc/genbank.conf git push make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # anoCar genome information, if this is a new species, need to add stuff # there ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial eriEur2 & # var/build/logs/2013.06.30-22:13:31.eriEur2.initalign.log # real 192m21.565s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad eriEur2 & # logFile: var/dbload/hgwdev/logs/2013.07.01-07:56:34.dbload.log # real 12m38.421s # enable daily alignment and update of hgwdev (TBD - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add eriEur2 to: etc/align.dbs etc/hgwdev.dbs vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added eriEur2 to daily hgwdev build refs #9419" etc/align.dbs etc/hgwdev.dbs git push make etc-update ########################################################################### # lastz alignment with Human/hg19 (DONE - 2013-07-09 - Hiram) # the original alignment cd /hive/data/genomes/hg19/bed/lastzEriEur2.2013-07-08 cat fb.hg19.chainEriEur2Link.txt # 757625719 bases of 2897316137 (26.149%) in intersection # running the swap mkdir /hive/data/genomes/eriEur2/bed/blastz.hg19.swap cd /hive/data/genomes/eriEur2/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/hg19/bed/lastzEriEur2.2013-07-08/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -syntenicNet > swap.log 2>&1 # real 98m18.554s cat fb.eriEur2.chainHg19Link.txt # 729081383 bases of 2333073535 (31.250%) in intersection cd /hive/data/genomes/eriEur2/bed ln -s blastz.hg19.swap lastz.hg19 ############################################################################ # After getting a blat server assigned by the Blat Server Gods, # (DONE - 2013-06-26 - Hiram) ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("eriEur2", "blat4b", "17848", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("eriEur2", "blat4b", "17849", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### ## Default position set to same as was eriEur1 (DONE - 2014-04-11 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="JH835582:2324009-2338336" where name="eriEur2";' hgcentraltest ############################################################################## # cpgIslands - (DONE - 2013-11-25 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/cpgIslands cd /hive/data/genomes/eriEur2/bed/cpgIslands doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ eriEur2 > run.log 2>&1 # Elapsed time: 7m27s cat fb.eriEur2.cpgIslandExt.txt # 16668591 bases of 2333073535 (0.714%) in intersection ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2014-04-11 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/cpgIslandsUnmasked cd /hive/data/genomes/eriEur2/bed/cpgIslandsUnmasked # run stepwise so the loading can be done in a different table time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -stop=makeBed \ -maskedSeq=/hive/data/genomes/eriEur2/eriEur2.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku eriEur2 > makeBed.log 2>&1 # real 3m6.463s # debug load step so it can be loaded into a separate table: time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -debug -continue=load \ -maskedSeq=/hive/data/genomes/eriEur2/eriEur2.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku eriEur2 # edit and change the table name to load: cpgIslandExtUnmasked time ./doLoadCpg.csh > load.log 2>&1 # Read 28528 elements of size 10 from cpgIsland.bed cat fb.eriEur2.cpgIslandExtUnmasked.txt # 19393537 bases of 2333073535 (0.831%) in intersection time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -continue=cleanup \ -maskedSeq=/hive/data/genomes/eriEur2/eriEur2.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku eriEur2 # real 0m22.827s ######################################################################### # create ucscToINSDC name mapping (DONE - 2014-04-11 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/ucscToINSDC cd /hive/data/genomes/eriEur2/bed/ucscToINSDC # this script has been maturing over time, it is close to complete. # to find a latest copy of it: # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh cp -p /hive/data/genomes/criGri1/bed/ucscToINSDC/translateNames.sh . ./translateNames.sh # it says: # need to find chrM accessions # so add this one: echo -e 'chrM\tNC_002080.2' >> ucscToINSDC.txt # needs to be sorted to work with join sort ucscToINSDC.txt > ucscToINSDC.tab awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \ > name.coordinate.tab join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 12 # use the 12 in this sed: sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab eriEur2 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords eriEur2 ucscToINSDC # should cover all bases featureBits -countGaps eriEur2 ucscToINSDC # 2715720925 bases of 2715720925 (100.000%) in intersection ############################################################################## # construct download files (DONE - 2014-04-11 - Hiram) # after db name has been added to all.joiner and # joinerCheck -database=eriEur2 -keys all.joiner # is clean cd /hive/data/genomes/eriEur2 time makeDownloads.pl -workhorse=hgwdev -dbHost=hgwdev eriEur2 \ > downloads.log 2>&1 # real 22m58.066s ############################################################################## # pushQ entry (DONE - 2014-04-11 - Hiram) mkdir /hive/data/genomes/eriEur2/pushQ cd /hive/data/genomes/eriEur2/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl eriEur2 2> stderr.txt | grep -v transMap > eriEur2.sql # real 1m55.792s scp -p eriEur2.sql qateam@hgwbeta:/tmp ssh qateam@hgwbeta './bin/x86_64/hgsql qapushq < /tmp/eriEur2.sql' ########################################################################### # genscan track (DONE - 2013-09-12 - Hiram) cd /hive/data/genomes/eriEur2/bed/genscan doGenscan.pl -buildDir=/hive/data/genomes/eriEur2/bed/genscan \ -bigClusterHub=ku -workhorse=hgwdev \ -dbHost=hgwdev eriEur2 > do.log 2>&1 # Elapsed time: 65m12s cat fb.eriEur2.genscan.txt # 45444338 bases of 2333073535 (1.948%) in intersection cat fb.eriEur2.genscanSubopt.txt # 44934986 bases of 2333073535 (1.926%) in intersection ############################################################################# # fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram) hgsql -N -e "select frag from gold;" eriEur2 | sort | head -1 AMDU01000001.1 hgsql -N -e "select frag from gold;" eriEur2 | sort | tail -2 AMDU01225566.1 NC_002080 # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" eriEur2 | wc -l # 225567 hgsql -N -e "select frag from gold;" eriEur2 | egrep -e '[AN][MC][D_][U0]0[0-9]+(\.1)?' | wc -l # 225567 hgsql -N -e "select frag from gold;" eriEur2 | egrep -v -e '[AN][MC][D_][U0]0[0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/hedgehog/eriEur2/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][MC][D_][U0]0[0-9]+(\.1)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ######################################################################### ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## ############################################################################## # GENEID GENE PREDICTIONS (DONE - 2015-06-26 - Hiram) ssh hgwdev mkdir /hive/data/genomes/eriEur2/bed/geneid cd /hive/data/genomes/eriEur2/bed/geneid wget --timestamping \ http://genome.crg.es/genepredictions/E.europaeus/EriEur2/geneid_v1.4/00README wget --timestamping \ http://genome.crg.es/genepredictions/E.europaeus/EriEur2/geneid_v1.4/eriEur2.geneid.gtf ldHgGene -gtf -genePredExt eriEur2 geneid eriEur2.geneid.gtf # Read 30632 transcripts in 236791 lines in 1 files # 30632 groups 5649 seqs 1 sources 3 feature types # 30632 gene predictions featureBits -enrichment eriEur2 augustusGene:CDS geneid # augustusGene:CDS 1.289%, geneid 1.393%, both 1.044%, cover 81.03%, # enrich 58.19x ########################################################################## # SGP GENES (DONE - 2015-07-30 - Hiram) mkdir /hive/data/genomes/eriEur2/bed/sgpGene cd /hive/data/genomes/eriEur2/bed/sgpGene wget --timestamping \ http://genome.crg.es/genepredictions/E.europaeus/EriEur2/SGP2/hg38/00README wget --timestamping \ http://genome.crg.es/genepredictions/E.europaeus/EriEur2/SGP2/hg38/eriEur2.sgp2.gtf wget --timestamping \ http://genome.crg.es/genepredictions/E.europaeus/EriEur2/SGP2/hg38/eriEur2.sgp2.gff3 ldHgGene -gtf -genePredExt eriEur2 sgpGene eriEur2.sgp2.gtf # Read 28927 transcripts in 249483 lines in 1 files # 28927 groups 2366 seqs 1 sources 3 feature types # 28927 gene predictions featureBits -enrichment eriEur2 augustusGene:CDS sgpGene # augustusGene:CDS 1.289%, sgpGene 1.359%, both 1.082%, cover 83.95%, # enrich 61.78x ###########################################################################