# for emacs: -*- mode: sh; -*- ############################################################################## # GRCh38 patch 14 build: build basic tracks on separate database for new # sequences only (relative to hg38). ############################################################################## # run this first, then it continues the patch in patchUpdate.14.txt ############################################################################## # download or rather ln -s the patch release files (DONE - 2022-09-13 - Galt) # Note: newer assemblies use refseq releases instead of genbank, but hg38 uses genbank # so continue with that when building patches. mkdir -p /hive/data/genomes/grcH38P14/genbank cd /hive/data/genomes/grcH38P14/genbank # Releases have already been downloaded to /hive/data/outside/ncbi/genomes/. ln -s /hive/data/outside/ncbi/genomes/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/* . ############################################################################## # Set up fasta and agp with UCSC names (DONE - 2022-09-23 - Galt) mkdir /hive/data/genomes/grcH38P14/ucsc cd /hive/data/genomes/grcH38P14/ucsc # identify sequences not in existing genome db faCount ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz \ > faCount.GRCh38.p14.txt ~/kent/src/hg/makeDb/doc/hg38/scanAssemblyReport.pl \ /hive/data/genomes/hg38/chrom.sizes \ faCount.GRCh38.p14.txt ../genbank/GCA_000001405.29_GRCh38.p14_assembly_report.txt \ | grep -w new > new.sequences.list wc -l new.sequences.list #71 new.sequences.list # Extract UCSC-named FASTA for the new sequences cut -f3 new.sequences.list > extract.new.list awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed faSomeRecords ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz extract.new.list stdout \ | sed -e 's/ .*//;' \ | sed -f genbankToUCSC.sed \ | gzip -c > grcH38P14.fa.gz faSize grcH38P14.fa.gz #27093089 bases (242788 N's 26850301 real 17087494 upper 9762807 lower) in 71 sequences in 1 files #Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 (chr5_MU273354v1_fix) median 301310 # Compare faSize results for whole GCA_000001405.29_GRCh38.p14_genomic.fna.gz # vs. concatenation of hg38 fasta and grcH38P14.fa.gz: faSize ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz #3298912062 bases (161611482 N's 3137300580 real 1945815910 upper 1191484670 lower) in 709 sequences in 1 files #Total size: mean 4652908.4 sd 25469887.9 min 970 (KI270394.1) max 248956422 (CM000663.2) median 171027 twoBitToFa /hive/data/genomes/hg38/hg38.2bit stdout \ | faSize grcH38P14.fa.gz stdin #3299210039 bases (161611482 N's 3137598557 real 1506729106 upper 1630869451 lower) in 711 sequences in 2 files #Total size: mean 4640239.2 sd 25435109.8 min 970 (chrUn_KI270394v1) max 248956422 (chr1) median 171027 NOTE that 709 != 711. 711 - 709 == 2, so our counts are off by 2. The reason that two fixes went missing is because they got a new subversion, and should apparently replace the original version. Two of the fix patches has changed from version1 to version2, and they have removed obsolete version1 from the genbank release. There are some notes in /hive/data/genomes/grcH38P14/ucsc/galt.NOTES # in p13 and older releases chr11_KQ759759v1_fix chr22_KQ759762v1_fix So, these just became updated as v2 in the new assembly? HG1311_HG2539_PATCH fix-patch 22 Chromosome KQ759762.2 <> na PATCHES 101040 na HG107_HG2565_PATCH fix-patch 11 Chromosome KQ759759.2 <> na PATCHES 204999 na and in the new one we have: chr11_KQ759759v2_fix chr22_KQ759762v2_fix [galt@hgwdev ucsc]$ egrep '(KQ759759|KQ759762)' new.sequences.list chr11_KQ759759v2_fix 204999 KQ759759.2 new chr22_KQ759762v2_fix 101040 KQ759762.2 new Since the removed sequences are in the main hg38.2bit, but not in the new patch14 sequences, we can just continue the GRCh38 patch 14 build which is not affected. We might delete those 2 with old versions in patchUpdate.14.txt which is run after this build doc. # Good, everything in GCA_000001405.29_GRCh38.p14_genomic.fna.gz is accounted for # between hg38.2bit and grcH38P14.fa.gz # Make UCSC-named AGP: zcat ../genbank/GCA_000001405.29_GRCh38.p14_assembly_structure/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \ | grep -Fwf extract.new.list \ | sed -f genbankToUCSC.sed > grcH38P14.agp # construct 2bit file: cd /hive/data/genomes/grcH38P14 faToTwoBit ucsc/grcH38P14.fa.gz grcH38P14.unmasked.2bit twoBitInfo grcH38P14.unmasked.2bit stdout | sort -k2nr > chrom.sizes # take a look at chrom.sizes to verify it looks OK. # Make sure AGP and FASTA/2bit agree: checkAgpAndFa ucsc/grcH38P14.agp grcH38P14.unmasked.2bit | tail -1 #All AGP and FASTA entries agree - both files are valid ############################################################################## # establish config.ra file (DONE - Galt - 2021-09-23) # arguments here are: cd /hive/data/genomes/grcH38P14 # Must make photoReference.txt first -- copy from hg38 cp /hive/data/genomes/hg38/photoReference.txt . $HOME/kent/src/hg/utils/automation/prepConfig.pl grcH38P14 haplotypes \ GRCh38.p14 genbank/*_assembly_report.txt > grcH38P14.config.ra # Edit grcH38P14.config.ra to avoid confusion with actual hg38 assemblyDate Feb. 2019 p14 orderKey 2001 # NOTE FROM GALT, a way to record it in the makedoc with comments so we know what values were used? sed -e 's/^/#/' grcH38P14.config.ra ## config parameters for makeGenomeDb.pl: #db grcH38P14 #clade haplotypes #scientificName Homo sapiens #commonName Human #assemblyDate Feb. 2019 p14 #assemblyLabel Genome Reference Consortium #assemblyShortLabel GRCh38.p14 #orderKey 2001 ## mitochondrial sequence included in refseq release ## mitoAcc J01415.2 #mitoAcc none #fastaFiles /hive/data/genomes/grcH38P14/ucsc/*.fa.gz #agpFiles /hive/data/genomes/grcH38P14/ucsc/*.agp ## qualFiles none #dbDbSpeciesDir GRCh38.p14 #photoCreditURL http://www.cbse.ucsc.edu/ #photoCreditName Graphic courtesy of CBSE #ncbiGenomeId 51 #ncbiAssemblyId 11968211 #ncbiAssemblyName GRCh38.p14 #ncbiBioProject 31257 #ncbiBioSample notFound #genBankAccessionID GCA_000001405.29 #taxId 9606 ############################################################################## # Initial database build (DONE - 2022-09-23 - Galt) cd /hive/data/genomes/grcH38P14 # AGP and unmasked.2bit are already built and checked, so start at the db step: mkdir jkStuff $HOME/kent/src/hg/utils/automation/makeGenomeDb.pl grcH38P14.config.ra -debug #HgStepManager: executing from step 'seq' through step 'trackDb'. #HgStepManager: executing step 'seq' Fri Sep 23 16:26:50 2022. #seq: looks like this was run successfully already (/cluster/data/grcH38P14/chrom.sizes exists). Either run with -continue agp or some later step, or move aside/remove /cluster/data/grcH38P14/chrom.sizes and run again. # Make chromInfo.tab. mkdir -p bed/chromInfo awk '{print $1 "\t" $2 "\t/gbdb/grcH38P14/grcH38P14.2bit";}' chrom.sizes \ > bed/chromInfo/chromInfo.tab # Make a link to the .agp file where makeGenomeDb.pl expects to find it. ln -s ucsc/grcH38P14.agp . # Skip 'seq' and 'agp' steps because those were done above. time ($HOME/kent/src/hg/utils/automation/makeGenomeDb.pl \ -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -continue=db grcH38P14.config.ra) > db.log 2>&1 & tail -f db.log #real 0m22.790s # actually, I had to add uniprot.ra to git archive step list in $HOME/kent/src/hg/utils/automation/makeGenomeDb.pl # then I restarted it at the dbDb step. # Ignore all the "NOTES -- STUFF THAT YOU WILL HAVE TO DO --" stuff because this is # going to be folded into hg38. # Now the gold, gap and gc5BaseBw tracks are built. ############################################################################# # RepeatMasker (DONE - 2022-09-23 - Galt) mkdir /hive/data/genomes/grcH38P14/bed/repeatMasker cd /hive/data/genomes/grcH38P14/bed/repeatMasker time ($HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku grcH38P14) > do.log 2>&1 & tail -f do.log #real 49m1.583s egrep "bases|Total|masked" faSize.rmsk.txt \ | fold -w 95 -s | sed -e 's/^/# /;' # 27093089 bases (242788 N's 26850301 real 13541432 upper 13308869 lower) in 71 sequences in 1 # files # Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 # (chr5_MU273354v1_fix) median 301310 # %49.12 masked total, %49.57 masked real egrep -i "versi|relea" do.log | sed -e 's/^/# /;' # VERSION:\n # /cluster/software/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. # grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl # RepeatMasker version 4.1.2-p1 # grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl # CC Artefacts RELEASE 20190301; * # CC Dfam RELEASE Dfam_3.1; featureBits -countGaps grcH38P14 rmsk #13308822 bases of 27093089 (49.123%) in intersection ########################################################################## # running simple repeat (DONE - 2022-09-23 - Galt) mkdir /hive/data/genomes/grcH38P14/bed/simpleRepeat cd /hive/data/genomes/grcH38P14/bed/simpleRepeat # using trf409 6 here like hg38 time ($HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl -buildDir=`pwd` \ -dbHost=hgwdev -workhorse=hgwdev -bigClusterHub=ku -smallClusterHub=ku \ -trf409 6 grcH38P14) > do.log 2>&1 & #real 1m37.386s cat fb.simpleRepeat #942219 bases of 26850301 (3.509%) in intersection # adding this trfMask to .rmsk.2bit cd /hive/data/genomes/grcH38P14 twoBitMask grcH38P14.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed grcH38P14.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa grcH38P14.2bit stdout | faSize stdin > faSize.grcH38P14.2bit.txt egrep "bases|Total|masked" faSize.grcH38P14.2bit.txt \ | fold -w 95 -s | sed -e 's/^/# /;' # 27093089 bases (242788 N's 26850301 real 13510632 upper 13339669 lower) in 71 sequences in 1 # files # Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 # (chr5_MU273354v1_fix) median 301310 # %49.24 masked total, %49.68 masked real # reset the symlink ln -sf `pwd`/grcH38P14.2bit /gbdb/grcH38P14/grcH38P14.2bit ########################################################################## ## WINDOWMASKER (DONE - 2022-09-26 - Galt) mkdir /hive/data/genomes/grcH38P14/bed/windowMasker cd /hive/data/genomes/grcH38P14/bed/windowMasker time ($HOME/kent/src/hg/utils/automation/doWindowMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -dbHost=hgwdev grcH38P14) > do.log 2>&1 # *** All done ! - Elapsed time: 0m47s featureBits -countGaps grcH38P14 rmsk windowmaskerSdust \ > fb.grcH38P14.rmsk.windowmaskerSdust.txt 2>&1 cat fb.grcH38P14.rmsk.windowmaskerSdust.txt #5549307 bases of 27093089 (20.482%) in intersection # Masking statistics egrep "bases|Total|masked" faSize.grcH38P14.cleanWMSdust.txt \ | fold -w 95 -s | sed -e 's/^/# /;' # 27093089 bases (242788 N's 26850301 real 19806463 upper 7043838 lower) in 71 sequences in 1 # files # Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 # (chr5_MU273354v1_fix) median 301310 # %26.00 masked total, %26.23 masked real ############################################################################# # cytoBandIdeo - (DONE - 2022-09-26 - Galt) mkdir /hive/data/genomes/grcH38P14/bed/cytoBand cd /hive/data/genomes/grcH38P14/bed/cytoBand makeCytoBandIdeo.csh grcH38P14 ############################################################################# # cpgIslands - (DONE - 2022-09-26 - Galt) mkdir /hive/data/genomes/grcH38P14/bed/cpgIslands cd /hive/data/genomes/grcH38P14/bed/cpgIslands time ($HOME/kent/src/hg/utils/automation/doCpgIslands.pl -dbHost=hgwdev \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku grcH38P14) > do.log 2>&1 & # *** All done ! Elapsed time: 1m10s # *** All done ! Elapsed time: 0m19s cat fb.grcH38P14.cpgIslandExt.txt #394646 bases of 26850301 (1.470%) in intersection ############################################################################## # genscan - (DONE - 2022-09-26 - Galt) mkdir /hive/data/genomes/grcH38P14/bed/genscan cd /hive/data/genomes/grcH38P14/bed/genscan time ($HOME/kent/src/hg/utils/automation/doGenscan.pl -buildDir=`pwd` \ -workhorse=hgwdev -dbHost=hgwdev -bigClusterHub=ku grcH38P14) > do.log 2>&1 & # *** All done ! Elapsed time: 2m44s cat fb.grcH38P14.genscan.txt #1046433 bases of 26850301 (3.897%) in intersection cat fb.grcH38P14.genscanSubopt.txt #844615 bases of 26850301 (3.146%) in intersection ############################################################################# # augustus gene track (DONE - 2022-09-26 - Galt) mkdir /hive/data/genomes/grcH38P14/bed/augustus cd /hive/data/genomes/grcH38P14/bed/augustus time ($HOME/kent/src/hg/utils/automation/doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev -workhorse=hgwdev grcH38P14) > do.log 2>&1 & # *** All done ! Elapsed time: 11m42s cat fb.grcH38P14.augustusGene.txt #972943 bases of 26850301 (3.624%) in intersection ############################################################################# # make the tracks in your browser using the temporary checkout: pushd /hive/data/genomes/grcH38P14/TemporaryTrackDbCheckout/kent/src/hg/src/makeDb/trackDb/ ./loadTracks trackDb_\${USER} hgFindSpec_\${USER} grcH38P14 popd # check out your browser https://hgwdev-${USER}.gi.ucsc.edu/cgi-bin/hgTracks?db=grcH38P14 ############################################################################## # Download files (DONE - 2022-09-26 - Galt) cd /hive/data/genomes/grcH38P14 time ($HOME/kent/src/hg/utils/automation/makeDownloads.pl -noChromFiles \ -workhorse=hgwdev grcH38P14) > downloads.log 2>&1 & # *** All done! #real 0m24.688s ############################################################################## # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2022-10-14 - Galt) mkdir /hive/data/genomes/grcH38P14/bed/linSpecRep cd /hive/data/genomes/grcH38P14/bed/linSpecRep # create individual .out files from the master record in ../repeatMasker # cluster job is perhaps overkill for just the patches, but it works mkdir splitOut cat <<'_EOF_' > split.csh #!/bin/csh -fe set C = $1 head -3 ../repeatMasker/grcH38P14.sorted.fa.out > splitOut/${C}.out grep "${C} " ../repeatMasker/grcH38P14.sorted.fa.out >> splitOut/${C}.out _EOF_ chmod +x split.csh cat << '_EOF_' > template #LOOP split.csh $(root1) {check out line+ splitOut/$(root1).out} #ENDLOOP _EOF_ # small ones first: cut -f1 ../../chrom.sizes | tac > chrom.list gensub2 chrom.list single template jobList para make jobList para time #Completed: 71 of 71 jobs #CPU time in finished jobs: 0s 0.01m 0.00h 0.00d 0.000 y #IO & Wait Time: 188s 3.13m 0.05h 0.00d 0.000 y #Average job time: 3s 0.04m 0.00h 0.00d #Longest finished job: 5s 0.08m 0.00h 0.00d #Submission to last job: 7s 0.12m 0.00h 0.00d #Estimated complete: 0s 0.00m 0.00h 0.00d # now, we can date and process each of those .out files # constructing the humanSpecific set of repeats # this means repeats found in human, and not in others # using mouse here for 'others' is good enough, a variety # of other species could be used (rat dog cow) where they all # produce the same result mkdir dateRepeats cd dateRepeats cat << '_EOF_' > mkLSR #!/bin/bash set -beEu -o pipefail seq=$1 rm -f $seq.out_mus-musculus ln -sf ../splitOut/$seq.out . /scratch/data/RepeatMasker/DateRepeats $seq.out -query human -comp 'mus musculus' rm $seq.out mkdir -p ../humanSpecific /cluster/bin/scripts/extractRepeats 1 $seq.out_mus-musculus \ > ../humanSpecific/$seq.out.spec _EOF_ chmod +x mkLSR cat << '_EOF_' > template #LOOP ./mkLSR $(path1) {check out line+ ../humanSpecific/$(path1).out.spec} #ENDLOOP _EOF_ # actually, I had to tweak the mkLSR script above to use the previous version of DateRepeats # since RM RM4.1.2 has a bug with new development to use hdf5 file format and libraries. # Since that bug should be fixed by the time the next person needs to DateRepeats again, # I did not include the change here. gensub2 ../chrom.list single template jobList para make jobList para time #Completed: 71 of 71 jobs #CPU time in finished jobs: 294s 4.90m 0.08h 0.00d 0.000 y #IO & Wait Time: 199s 3.32m 0.06h 0.00d 0.000 y #Average job time: 7s 0.12m 0.00h 0.00d #Longest finished job: 9s 0.15m 0.00h 0.00d #Submission to last job: 14s 0.23m 0.00h 0.00d # We also need the nibs for blastz runs with lineage specific repeats mkdir /hive/data/genomes/grcH38P14/bed/nibs cd /hive/data/genomes/grcH38P14/bed/nibs cut -f1 ../../chrom.sizes | while read C do twoBitToFa -seq=${C} ../../grcH38P14.2bit stdout \ | faToNib -softMask stdin ${C}.nib echo "${C} done" done # verify nothing lost cat ../../chrom.sizes \ | awk '{printf "nibFrag -masked %s.nib 0 %d + stdout\n", $1, $2}' \ | sh | faSize stdin #27093089 bases (242788 N's 26850301 real 13510632 upper 13339669 lower) in 71 sequences in 1 files #Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix.nib:0-34400) max 2101585 (chr5_MU273354v1_fix.nib:0-2101585) median 301310 #N count: mean 3419.5 sd 28467.7 #U count: mean 190290.6 sd 170338.0 #L count: mean 187882.7 sd 164027.6 #%49.24 masked total, %49.68 masked real mkdir -p /hive/data/staging/data/grcH38P14/nib rsync -a --progress ./ /hive/data/staging/data/grcH38P14/nib rsync -a --progress /hive/data/genomes/grcH38P14/{grcH38P14.2bit,chrom.sizes} \ /hive/data/staging/data/grcH38P14/ ##############################################################################