# for emacs: -*- mode: sh; -*- ############################################################################## # GRCh37 patch 13 build: build basic tracks on separate database for new # sequences only, then add to test database hg19. ############################################################################## ############################################################################## # download the patch release files (DONE - 2018-08-29 - Angie) # Note: newer assemblies use refseq releases instead of genbank, but hg19 uses genbank # so continue with that when building patches. mkdir -p /hive/data/genomes/grcH37P13/genbank cd /hive/data/genomes/grcH37P13/genbank # The files are no longer on the main ftp site, nor are they in /hive/data/outside/ncbi/genomes. # Do as Hiram did in hg19Patch13.txt: time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/ ./ # this is an entire release of everything, construct the convenient # single file as in modern releases: find . -type f | grep FASTA | egrep -v "rm.out.gz|/placed_scaffolds/" \ | xargs zcat | gzip -c > GCA_000001405.14_GRCh37.p13_genomic.fna.gz faSize GCA_000001405.14_GRCh37.p13_genomic.fna.gz #3234834689 bases (243146540 N's 2991688149 real 2991688149 upper 0 lower) in 297 sequences in 1 files #Total size: mean 10891699.3 sd 38623365.0 min 4262 (gi|224182996|gb|GL000207.1|) max 249250621 (gi|224384768|gb|CM000663.1|) median 196262 #N count: mean 818675.2 sd 3640667.9 ############################################################################## # Set up fasta and agp with UCSC names (DONE - 2018-09-28 - Angie) mkdir /hive/data/genomes/grcH37P13/ucsc cd /hive/data/genomes/grcH37P13/ucsc # identify sequences not in existing genome db faCount ../genbank/GCA_000001405.14_GRCh37.p13_genomic.fna.gz \ > faCount.GRCh37.p13.txt ~/kent/src/hg/makeDb/doc/hg19.scanAssemblyReport.pl \ /hive/data/genomes/hg19/chrom.sizes \ faCount.GRCh37.p13.txt ../genbank/GCA_000001405.14_GRCh37.p13_assembly_report.txt \ | grep -w new > new.sequences.list wc -l new.sequences.list #204 new.sequences.list # Extract UCSC-named FASTA for the new sequences cut -f3 new.sequences.list > extract.new.list awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed time (zcat ../genbank/GCA_000001405.14_GRCh37.p13_genomic.fna.gz \ | sed -re 's/^>gi\|[0-9]+\|gb\|([A-Z0-9.]+)\|.*/>\1/;' \ | faSomeRecords stdin extract.new.list stdout \ | sed -f genbankToUCSC.sed \ | gzip -c > grcH37P13.fa.gz) #real 2m50.420s #user 3m55.080s faSize grcH37P13.fa.gz #97673427 bases (3295737 N's 94377690 real 94377690 upper 0 lower) in 204 sequences in 1 files #Total size: mean 478791.3 sd 848169.0 min 22394 (chr9_jh806577_fix) max 7283150 (chr1_jh636052_fix) median 240775 #N count: mean 16155.6 sd 86668.4 # Compare faSize for whole GCA_000001405.14_GRCh37.p13_genomic.fna.gz (copied from above) -- #3234834689 bases (243146540 N's 2991688149 real 2991688149 upper 0 lower) in 297 sequences in 1 files #Total size: mean 10891699.3 sd 38623365.0 min 4262 (gi|224182996|gb|GL000207.1|) max 249250621 (gi|2#24384768|gb|CM000663.1|) median 196262 #N count: mean 818675.2 sd 3640667.9 # -- vs. concatenation of hg19 fasta and grcH37P13.fa.gz: twoBitToFa /hive/data/genomes/hg19/hg19.2bit stdout \ | faSize grcH37P13.fa.gz stdin #3234834691 bases (243146539 N's 2991688152 real 1524764949 upper 1466923203 lower) in 297 sequences in 2 files #Total size: mean 10891699.3 sd 38623365.0 min 4262 (chr18_gl000207_random) max 249250621 (chr1) median 196262 #N count: mean 818675.2 sd 3640667.9 # Doh! It's off by two bases -- but that's because we used the wrong mitochondrial # sequence for hg19 (16571 bases) while GenBank uses the rCRS (16569 bases). # So the new patch sequences are fine. # Make UCSC-named AGP: zcat ../genbank/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \ | sed -f genbankToUCSC.sed > grcH37P13.agp # construct 2bit file: cd /hive/data/genomes/grcH37P13 faToTwoBit ucsc/grcH37P13.fa.gz grcH37P13.unmasked.2bit twoBitInfo grcH37P13.unmasked.2bit stdout | sort -k2nr > chrom.sizes # take a look at chrom.sizes to verify it looks OK. # Make sure AGP and FASTA/2bit agree: checkAgpAndFa ucsc/grcH37P13.agp grcH37P13.unmasked.2bit | tail -1 #All AGP and FASTA entries agree - both files are valid ############################################################################## # establish config.ra file (DONE - Angie - 2018-08-29) cd /hive/data/genomes/grcH37P13 # Must make photoReference.txt first -- copy from hg38 cp /hive/data/genomes/hg38/photoReference.txt . # arguments here are: $HOME/kent/src/hg/utils/automation/prepConfig.pl grcH37P13 haplotypes \ GRCh37.p13 genbank/*_assembly_report.txt > grcH37P13.config.ra # going to need a mitoAcc ? # Edit grcH37P13.config.ra to avoid confusion with actual hg19 assemblyDate PATCH Jun. 2013 orderKey 2000 mitoAcc none # I don't know what's with the "51" on a line by itself... but the grcH38P* files have # "ncbiGenomeId 51" so I'll just go with that. sed -e 's/^/#/' grcH37P13.config.ra ## config parameters for makeGenomeDb.pl: #db grcH37P13 #clade haplotypes #genomeCladePriority 134 #scientificName Homo sapiens #commonName NotFound #assemblyDate PATCH Jun. 2013 #assemblyLabel Genome Reference Consortium #assemblyShortLabel GRCh37.p13 #orderKey 2000 #mitoAcc none #fastaFiles /hive/data/genomes/grcH37P13/ucsc/*.fa.gz #agpFiles /hive/data/genomes/grcH37P13/ucsc/*.agp ## qualFiles none #dbDbSpeciesDir GRCh37.p13 #photoCreditURL http://www.cbse.ucsc.edu/ #photoCreditName Graphic courtesy of CBSE #ncbiGenomeId 51 #ncbiAssemblyId 37871 #ncbiAssemblyName GRCh37.p13 #ncbiBioProject notFound #ncbiBioSample notFound #genBankAccessionID notFound #taxId 9606 ############################################################################## # Initial database build (DONE - 2018-09-28 - Angie) cd /hive/data/genomes/grcH37P13 # AGP and unmasked.2bit are already built and checked, so start at the db step: mkdir jkStuff $HOME/kent/src/hg/utils/automation/makeGenomeDb.pl grcH37P13.config.ra -debug #HgStepManager: executing from step 'seq' through step 'trackDb'. #HgStepManager: executing step 'seq' Fri Apr 13 14:23:27 2018. #seq: looks like this was run successfully already (/cluster/data/grcH37P13/chrom.sizes exists). Either run with -continue agp or some later step, or move aside/remove /cluster/data/grcH37P13/chrom.sizes and run again. # But we do need to make chromInfo.tab before the db step. mkdir -p bed/chromInfo awk '{print $1 "\t" $2 "\t/gbdb/grcH37P13/grcH37P13.2bit";}' chrom.sizes \ > bed/chromInfo/chromInfo.tab time ($HOME/kent/src/hg/utils/automation/makeGenomeDb.pl \ -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -continue=agp grcH37P13.config.ra) > do.log 2>&1 & tail -f do.log #real 0m45.569s # Ignore all the "NOTES -- STUFF THAT YOU WILL HAVE TO DO --" stuff because this is # going to be folded into hg19. # Now the gold, gap and gc5BaseBw tracks are built. ############################################################################# # RepeatMasker (DONE - 2018-09-28 - Angie) mkdir /hive/data/genomes/grcH37P13/bed/repeatMasker cd /hive/data/genomes/grcH37P13/bed/repeatMasker time ($HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku grcH37P13) > do.log 2>&1 & tail -f do.log # *** All done! - Elapsed time: 59m21s egrep "bases|Total|masked" faSize.rmsk.txt \ | sed -e 's/^/# /;' # 97673427 bases (3295737 N's 94377690 real 42827571 upper 51550119 lower) in 204 sequences in 1 files # Total size: mean 478791.3 sd 848169.0 min 22394 (chr9_jh806577_fix) max 7283150 (chr1_jh636052_fix) median 240775 # %52.78 masked total, %54.62 masked real egrep -i "versi|relea" do.log | sed -e 's/^/# /;' # RepeatMasker version open-4.0.7 # grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker # # February 01 2017 (open-4-0-7) 1.331 version of RepeatMasker # grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl # CC Dfam_Consensus RELEASE 20170127; * # CC RepBase RELEASE 20170127; * featureBits -countGaps grcH37P13 rmsk #51550580 bases of 97673427 (52.779%) in intersection ########################################################################## # running simple repeat (DONE - 2018-09-28 - Angie) mkdir /hive/data/genomes/grcH37P13/bed/simpleRepeat cd /hive/data/genomes/grcH37P13/bed/simpleRepeat # using trf409 6 here like hg19 time ($HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl -buildDir=`pwd` \ -dbHost=hgwdev -workhorse=hgwdev -bigClusterHub=ku -smallClusterHub=ku \ -trf409 6 grcH37P13) > do.log 2>&1 & tail -f do.log #real 9m55.230s # The cleanup stage failed due to an 'rm: no match' but the track was built OK. cat fb.simpleRepeat #4006366 bases of 94378040 (4.245%) in intersection # adding this trfMask to .rmsk.2bit cd /hive/data/genomes/grcH37P13 twoBitMask grcH37P13.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed grcH37P13.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa grcH37P13.2bit stdout | faSize stdin > faSize.grcH37P13.2bit.txt egrep "bases|Total|masked" faSize.grcH37P13.2bit.txt \ | sed -e 's/^/# /;' # 97673427 bases (3295737 N's 94377690 real 42661623 upper 51716067 lower) in 204 sequences in 1 files # Total size: mean 478791.3 sd 848169.0 min 22394 (chr9_jh806577_fix) max 7283150 (chr1_jh636052_fix) median 240775 # %52.95 masked total, %54.80 masked real # reset the symlink ln -sf `pwd`/grcH37P13.2bit /gbdb/grcH37P13/grcH37P13.2bit ########################################################################## # WINDOWMASKER (DONE - 2018-09-28 - Angie) mkdir /hive/data/genomes/grcH37P13/bed/windowMasker cd /hive/data/genomes/grcH37P13/bed/windowMasker time ($HOME/kent/src/hg/utils/automation/doWindowMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -dbHost=hgwdev grcH37P13) > do.log 2>&1 & tail -f do.log #real 3m10.679s featureBits -countGaps grcH37P13 rmsk windowmaskerSdust \ > fb.grcH37P13.rmsk.windowmaskerSdust.txt 2>&1 cat fb.grcH37P13.rmsk.windowmaskerSdust.txt #24560148 bases of 97673427 (25.145%) in intersection # Masking statistics egrep "bases|Total|masked" faSize.grcH37P13.cleanWMSdust.txt \ | sed -e 's/^/# /;' # 97673427 bases (3295737 N's 94377690 real 64868273 upper 29509417 lower) in 204 sequences in 1 files # Total size: mean 478791.3 sd 848169.0 min 22394 (chr9_jh806577_fix) max 7283150 (chr1_jh636052_fix) median 240775 # %30.21 masked total, %31.27 masked real ############################################################################# # cytoBandIdeo - (DONE - 2018-09-28 - Angie) # There is no cytoBand info for these (although... could we map??) # so just make a fake cytoBandIdeo to get a blank ideogram. mkdir /hive/data/genomes/grcH37P13/bed/cytoBand cd /hive/data/genomes/grcH37P13/bed/cytoBand makeCytoBandIdeo.csh grcH37P13 ############################################################################# # cpgIslands - (DONE - 2018-09-28 - Angie) mkdir /hive/data/genomes/grcH37P13/bed/cpgIslands cd /hive/data/genomes/grcH37P13/bed/cpgIslands time ($HOME/kent/src/hg/utils/automation/doCpgIslands.pl -dbHost=hgwdev \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku grcH37P13) > do.log 2>&1 & # *** All done ! Elapsed time: 0m45s cat fb.grcH37P13.cpgIslandExt.txt #1278964 bases of 94378040 (1.355%) in intersection ############################################################################## # genscan - (DONE - 2018-09-28 - Angie) mkdir /hive/data/genomes/grcH37P13/bed/genscan cd /hive/data/genomes/grcH37P13/bed/genscan time ($HOME/kent/src/hg/utils/automation/doGenscan.pl -buildDir=`pwd` \ -workhorse=hgwdev -dbHost=hgwdev -bigClusterHub=ku grcH37P13) > do.log 2>&1 & #real 15m2.782s cat fb.grcH37P13.genscan.txt #2946438 bases of 94378040 (3.122%) in intersection cat fb.grcH37P13.genscanSubopt.txt #2239445 bases of 94378040 (2.373%) in intersection ############################################################################# # augustus gene track (DONE - 2018-09-28 - Angie) mkdir /hive/data/genomes/grcH37P13/bed/augustus cd /hive/data/genomes/grcH37P13/bed/augustus time ($HOME/kent/src/hg/utils/automation/doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev -workhorse=hgwdev grcH37P13) > do.log 2>&1 & # *** All done ! Elapsed time: 34m34s cat fb.grcH37P13.augustusGene.txt #2865177 bases of 94378040 (3.036%) in intersection ############################################################################## # Download files (DONE - 2018-09-28 - Angie) cd /hive/data/genomes/grcH37P13 time ($HOME/kent/src/hg/utils/automation/makeDownloads.pl \ -workhorse=hgwdev grcH37P13) > downloads.log 2>&1 & # *** All done! #real 0m45.275s ##############################################################################