# for emacs: -*- mode: sh; -*- ############################################################################## # hg19 patch 13 build ############################################################################## ## download sequence, prepare files (DONE - 2018-05-23 - Hiram) ############################################################################## mkdir /hive/data/genomes/hg19/bed/hg19Patch13 cd /hive/data/genomes/hg19/bed/hg19Patch13 mkdir genbank cd genbank time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/ ./ # sent 14453 bytes received 4386760279 bytes 24037121.82 bytes/sec # total size is 4386176146 speedup is 1.00 # real 3m1.966s # this is an entire release of everything, construct the convenient # single file as in modern releases: find . -type f | grep FASTA | egrep -v "rm.out.gz|/placed_scaffolds/" \ | xargs zcat | gzip -c > GCA_000001405.14_GRCh37.p13_genomic.fna.gz faSize GCA_000001405.14_GRCh37.p13_genomic.fna.gz # 3234834689 bases (243146540 N's 2991688149 real 2991688149 upper 0 lower) # in 297 sequences in 1 files # Total size: mean 10891699.3 sd 38623365.0 # min 4262 (gi|224182996|gb|GL000207.1|) # max 249250621 (gi|224384768|gb|CM000663.1|) median 196262 # %0.00 masked total, %0.00 masked real # so the question is, what is new here compared to what we have in hg19 cd /hive/data/genomes/hg19/bed/hg19Patch13 time faCount genbank/GCA_000001405.14_GRCh37.p13_genomic.fna.gz \ > faCount.GRCH38.p13.txt # real 1m13.179s # had to alter the hg38/scanAssemblyReport.pl script for this hg19 # ~/kent/src/hg/makeDb/doc/hg38/scanAssemblyReport.pl # processing ./scanAssemblyReport.pl ../../chrom.sizes \ faCount.GRCH38.p13.txt genbank/GCA_000001405.14_GRCh37.p13_assembly_report.txt \ | grep -w new | sed -e 's/^/# /' # there are 204 new sequences, and one non-identical sequence (chrM) # verify == 204 = 297 (p13) - 93 (hg19 chrom.sizes) chr1_gl949741_fix 151551 GL949741.1 new chr1_jh636052_fix 7283150 JH636052.4 new chr1_jh636053_fix 1676126 JH636053.3 new chr1_jh806574_fix 22982 JH806574.2 new chr1_gl383518_alt 182439 GL383518.1 new chr1_gl383519_alt 110268 GL383519.1 new chr1_gl383520_alt 366579 GL383520.1 new chr1_gl383516_fix 49316 GL383516.1 new chr1_gl383517_fix 49352 GL383517.1 new chr1_jh636054_fix 758378 JH636054.1 new chr1_jh806573_fix 24680 JH806573.1 new chr1_jh806575_fix 47409 JH806575.1 new chr2_gl383521_alt 143390 GL383521.1 new chr2_gl877871_fix 389939 GL877871.1 new chr2_kb663603_fix 599580 KB663603.1 new chr2_gl383522_alt 123821 GL383522.1 new chr2_gl582966_alt 96131 GL582966.2 new chr2_gl877870_fix 66021 GL877870.2 new chr3_jh636055_alt 173151 JH636055.1 new chr3_gl383523_fix 171362 GL383523.1 new chr3_gl383524_fix 78793 GL383524.1 new chr3_gl383525_fix 65063 GL383525.1 new chr3_jh159131_fix 393769 JH159131.1 new chr3_jh159132_fix 100694 JH159132.1 new chr3_ke332495_fix 263861 KE332495.1 new chr3_gl383526_alt 180671 GL383526.1 new chr4_ke332496_fix 503215 KE332496.1 new chr4_gl383528_alt 376187 GL383528.1 new chr4_gl383529_alt 121345 GL383529.1 new chr4_gl582967_fix 248177 GL582967.1 new chr4_gl383527_alt 164536 GL383527.1 new chr4_gl877872_fix 297485 GL877872.1 new chr5_gl339449_alt 1612928 GL339449.2 new chr5_gl383530_alt 101241 GL383530.1 new chr5_gl383532_alt 82728 GL383532.1 new chr5_gl949742_alt 226852 GL949742.1 new chr5_jh159133_fix 266316 JH159133.1 new chr5_ke332497_fix 543325 KE332497.1 new chr5_gl383531_alt 173459 GL383531.1 new chr6_jh636057_fix 200195 JH636057.1 new chr6_jh806576_fix 273386 JH806576.1 new chr6_gl383533_alt 124736 GL383533.1 new chr6_jh636056_fix 262912 JH636056.1 new chr6_kb663604_fix 478993 KB663604.1 new chr6_ke332498_fix 149443 KE332498.1 new chr6_kb021644_alt 187824 KB021644.1 new chr7_gl582968_fix 356330 GL582968.1 new chr7_gl582969_fix 251823 GL582969.1 new chr7_gl582970_fix 354970 GL582970.1 new chr7_gl582972_fix 327774 GL582972.1 new chr7_jh159134_fix 3821770 JH159134.2 new chr7_jh636058_fix 716227 JH636058.1 new chr7_ke332499_fix 274521 KE332499.1 new chr7_gl383534_alt 119183 GL383534.2 new chr7_gl582971_fix 1284284 GL582971.1 new chr8_gl383535_fix 429806 GL383535.1 new chr8_gl383536_fix 203777 GL383536.1 new chr8_gl949743_fix 608579 GL949743.1 new chr8_jh159135_fix 102251 JH159135.2 new chr8_ke332500_fix 228602 KE332500.1 new chr9_gl383539_alt 162988 GL383539.1 new chr9_jh636059_fix 295379 JH636059.1 new chr9_gl383540_alt 71551 GL383540.1 new chr9_gl383541_alt 171286 GL383541.1 new chr9_gl383542_alt 60032 GL383542.1 new chr9_gl339450_fix 330164 GL339450.1 new chr9_gl383537_fix 62435 GL383537.1 new chr9_gl383538_fix 49281 GL383538.1 new chr9_jh806577_fix 22394 JH806577.1 new chr9_jh806578_fix 169437 JH806578.1 new chr9_jh806579_fix 211307 JH806579.1 new chr9_kb663605_fix 155926 KB663605.1 new chr10_gl383543_fix 392792 GL383543.1 new chr10_gl877873_fix 168465 GL877873.1 new chr10_jh636060_fix 437946 JH636060.1 new chr10_gl383545_alt 179254 GL383545.1 new chr10_gl383546_alt 309802 GL383546.1 new chr10_gl383544_fix 128378 GL383544.1 new chr10_jh591181_fix 2281126 JH591181.2 new chr10_jh591182_fix 196262 JH591182.1 new chr10_jh591183_fix 177920 JH591183.1 new chr10_jh806580_fix 93149 JH806580.1 new chr10_kb663606_fix 305900 KB663606.1 new chr10_ke332501_fix 1020827 KE332501.1 new chr11_jh591184_fix 462282 JH591184.1 new chr11_jh591185_fix 167437 JH591185.1 new chr11_gl383547_alt 154407 GL383547.1 new chr11_gl582973_fix 321004 GL582973.1 new chr11_jh159136_alt 200998 JH159136.1 new chr11_jh159137_alt 191409 JH159137.1 new chr11_gl949744_fix 276448 GL949744.1 new chr11_jh159138_fix 108875 JH159138.1 new chr11_jh159139_fix 120441 JH159139.1 new chr11_jh159140_fix 546435 JH159140.1 new chr11_jh159141_fix 240775 JH159141.2 new chr11_jh159142_fix 326647 JH159142.2 new chr11_jh159143_fix 191402 JH159143.1 new chr11_jh720443_fix 408430 JH720443.2 new chr11_jh806581_fix 872115 JH806581.1 new chr12_gl582974_fix 163298 GL582974.1 new chr12_gl877875_alt 167313 GL877875.1 new chr12_jh720444_fix 273128 JH720444.2 new chr12_gl383549_alt 120804 GL383549.1 new chr12_gl383550_alt 169178 GL383550.1 new chr12_gl383552_alt 138655 GL383552.1 new chr12_gl383553_alt 152874 GL383553.2 new chr12_gl877876_alt 408271 GL877876.1 new chr12_gl949745_alt 372609 GL949745.1 new chr12_kb663607_fix 334922 KB663607.2 new chr12_gl383551_alt 184319 GL383551.1 new chr12_gl383548_fix 165247 GL383548.1 new chr13_gl582975_fix 34662 GL582975.1 new chr14_kb021645_fix 1523386 KB021645.1 new chr15_gl383554_alt 296527 GL383554.1 new chr15_gl383555_alt 388773 GL383555.1 new chr15_jh720445_fix 170033 JH720445.1 new chr16_gl383556_alt 192462 GL383556.1 new chr16_jh720446_fix 97345 JH720446.1 new chr16_gl383557_alt 89672 GL383557.1 new chr17_jh806582_fix 342635 JH806582.2 new chr17_gl383563_alt 270261 GL383563.2 new chr17_gl383559_fix 338640 GL383559.2 new chr17_gl383560_fix 534288 GL383560.1 new chr17_gl383561_fix 644425 GL383561.2 new chr17_gl383562_fix 45551 GL383562.1 new chr17_jh159145_fix 194862 JH159145.1 new chr17_kb021646_fix 211416 KB021646.2 new chr17_ke332502_fix 341712 KE332502.1 new chr17_gl383564_alt 133151 GL383564.1 new chr17_jh159146_alt 278131 JH159146.1 new chr17_jh159147_alt 70345 JH159147.1 new chr17_jh159148_alt 88070 JH159148.1 new chr17_gl383558_fix 457041 GL383558.1 new chr17_gl582976_fix 412535 GL582976.1 new chr17_jh159144_fix 388340 JH159144.1 new chr17_jh720447_fix 454385 JH720447.1 new chr17_gl383565_alt 223995 GL383565.1 new chr17_gl383566_alt 90219 GL383566.1 new chr17_jh591186_fix 376223 JH591186.1 new chr17_jh636061_fix 186059 JH636061.1 new chr18_gl383567_alt 289831 GL383567.1 new chr18_gl383568_alt 104552 GL383568.1 new chr18_gl383569_alt 167950 GL383569.1 new chr18_gl383570_alt 164789 GL383570.1 new chr18_gl383571_alt 198278 GL383571.1 new chr18_gl383572_alt 159547 GL383572.1 new chr19_gl582977_fix 580393 GL582977.2 new chr19_jh159149_fix 245473 JH159149.1 new chr19_gl383573_alt 385657 GL383573.1 new chr19_gl383574_alt 155864 GL383574.1 new chr19_gl383575_alt 170222 GL383575.2 new chr19_gl383576_alt 188024 GL383576.1 new chr19_kb021647_fix 1058686 KB021647.1 new chr19_ke332505_fix 579598 KE332505.1 new chr19_gl949746_alt 987716 GL949746.1 new chr19_gl949747_alt 729519 GL949747.1 new chr19_gl949748_alt 1064303 GL949748.1 new chr19_gl949749_alt 1091840 GL949749.1 new chr19_gl949750_alt 1066389 GL949750.1 new chr19_gl949751_alt 1002682 GL949751.1 new chr19_gl949752_alt 987100 GL949752.1 new chr19_gl949753_alt 796478 GL949753.1 new chr20_gl383577_alt 128385 GL383577.1 new chr20_gl582979_fix 179899 GL582979.2 new chr20_jh720448_fix 70483 JH720448.1 new chr20_kb663608_fix 283551 KB663608.1 new chr21_gl383578_alt 63917 GL383578.1 new chr21_gl383579_alt 201198 GL383579.1 new chr21_gl383580_alt 74652 GL383580.1 new chr21_gl383581_alt 116690 GL383581.1 new chr21_ke332506_fix 307252 KE332506.1 new chr22_jh720449_fix 212298 JH720449.1 new chr22_jh806583_fix 167183 JH806583.1 new chr22_jh806584_fix 70876 JH806584.1 new chr22_jh806585_fix 73505 JH806585.1 new chr22_gl383582_alt 162811 GL383582.2 new chr22_gl383583_alt 96924 GL383583.1 new chr22_kb663609_alt 74013 KB663609.1 new chr22_jh806586_fix 43543 JH806586.1 new chrX_gl877877_fix 284527 GL877877.2 new chrX_jh720451_fix 898979 JH720451.1 new chrX_jh720452_fix 522319 JH720452.1 new chrX_jh806589_fix 270630 JH806589.1 new chrX_kb021648_fix 469972 KB021648.1 new chrX_jh806590_fix 2418393 JH806590.2 new chrX_jh806587_fix 4110759 JH806587.1 new chrX_jh806591_fix 882083 JH806591.1 new chrX_jh806592_fix 835911 JH806592.1 new chrX_jh720453_fix 1461188 JH720453.1 new chrX_jh720454_fix 752267 JH720454.3 new chrX_jh806593_fix 389631 JH806593.1 new chrX_jh806594_fix 390496 JH806594.1 new chrX_jh806595_fix 444074 JH806595.1 new chrX_jh720455_fix 65034 JH720455.1 new chrX_jh806588_fix 862483 JH806588.1 new chrX_jh806601_fix 1389764 JH806601.1 new chrX_jh806602_fix 713266 JH806602.1 new chrX_jh806603_fix 182949 JH806603.1 new chrX_jh806596_fix 413927 JH806596.1 new chrX_jh806597_fix 1045622 JH806597.1 new chrX_jh806598_fix 899320 JH806598.1 new chrX_jh806599_fix 1214327 JH806599.1 new chrX_jh806600_fix 6530008 JH806600.2 new chrX_jh159150_fix 3110903 JH159150.3 new # how much sequence: ./scanAssemblyReport.pl ../../chrom.sizes \ faCount.GRCH38.p13.txt genbank/GCA_000001405.14_GRCh37.p13_assembly_report.txt\ | grep -w new | awk '{sum += $2; printf "%d\t%s\n", sum, $0}' | tail # 97673427 chrX_jh159150_fix 3110903 JH159150.3 new # ^^^^^^^^ total new sequence # verify, vs. p13: 3234834689 hg19: 3209286105 # (3234834689 - 16569) - (3137161264 - 16571) = 97673427 # the 16569 and 16571 are the different chrMt sequences ./scanAssemblyReport.pl ../../chrom.sizes \ faCount.GRCH38.p13.txt genbank/GCA_000001405.14_GRCh37.p13_assembly_report.txt\ | grep -w new > new.sequences.list cut -f3 new.sequences.list > extract.new.list awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed ./scanAssemblyReport.pl ../../chrom.sizes \ faCount.GRCH38.p13.txt genbank/GCA_000001405.14_GRCh37.p13_assembly_report.txt\ | egrep -v -w "new|bad" > existing.sequences.list cut -f3 existing.sequences.list > extract.exist.list # the sed gets the names to be sane so they will match the # extract.new.list time zcat genbank/GCA_000001405.14_GRCh37.p13_genomic.fna.gz \ | sed -e 's/gi.[0-9]\+.gb.//; s/. Homo sap.*//;' \ | faSomeRecords stdin extract.new.list stdout | sed -e 's/ .*//;' | \ sed -f genbankToUCSC.sed | gzip -c > hg19Patch13.fa.gz # real 6m54.690s # verify we got everything: faSize hg19Patch13.fa.gz # 97673427 bases (3295737 N's 94377690 real 94377690 upper 0 lower) # in 204 sequences in 1 files # same total as above: # 97673427 chrX_jh159150_fix 3110903 JH159150.3 new time zcat genbank/GCA_000001405.14_GRCh37.p13_genomic.fna.gz \ | sed -e 's/gi.[0-9]\+.gb.//; s/. Homo sap.*//;' \ | faSomeRecords stdin extract.exist.list stdout \ | sed -e 's/ .*//;' | gzip -c > existing.fa.gz # real 16m13.182s # verify same amount of sequence here as hg19: # will be missing chrM size 16571 faSize existing.fa.gz # 3137144693 bases (239850802 N's 2897293891 real 2897293891 upper 0 lower) # in 92 sequences in 1 files # Total size: mean 34099398.8 sd 63732574.5 min 4262 (GL000207.1) # max 249250621 (CM000663.1) median 172545 calc 3137161264 - 3137144693 # 3137161264 - 3137144693 = 16571.000000 # verify correct amount of patch13 sequence here: faSize hg19Patch13.fa.gz # 97673427 bases (3295737 N's 94377690 real 94377690 upper 0 lower) # in 204 sequences in 1 files # same total as above: # 97673427 chrX_jh159150_fix 3110903 JH159150.3 new # and both together should equal the original full patch13 sequence # minus the patch13 chrM sequence 16569 time faSize existing.fa.gz hg19Patch13.fa.gz # 3234818120 bases (243146539 N's 2991671581 real 2991671581 upper 0 lower) # in 296 sequences in 2 files # Total size: mean 10928439.6 sd 38683573.9 min 4262 (GL000207.1) # max 249250621 (CM000663.1) median 198278 # real 1m3.115s calc 3234834689 - 3234818120 # 3234834689 - 3234818120 = 16569.000000 # construct locations file: ~/kent/src/hg/makeDb/doc/hg38/regionScan.pl extract.new.list \ genbank/GCA_000001405.14_GRCh37.p13_assembly_regions.txt \ > ncbi.patchLocations.bed # verify correct number of locations, measured 204 before: wc -l ncbi.patchLocations.bed # 204 ncbi.patchLocations.bed # separate haplotypes from fix patches for two tracks: grep -v fix ncbi.patchLocations.bed | sed -e 's/_alt//;' \ | sed -e 's/\tchr.*_/\t/;' | sed -e 's/v/./;' > hg19Patch13Haplotypes.bed grep fix ncbi.patchLocations.bed | sed -e 's/_fix//;' \ | sed -e 's/\tchr.*_/\t/;' | sed -e 's/v\([0-9]\)$/.\1/;' \ > hg19Patch13Patches.bed # for the kluster run below, need to have these names the same as # would be for UCSC, lower case accession, no .N extension: ~/kent/src/hg/makeDb/doc/hg38/regionScan.pl extract.new.list \ genbank/GCA_000001405.14_GRCh37.p13_assembly_regions.txt \ | tr 'A-Z' 'a-z' | sed -e 's/chrx/chrX/g; s/v[0-9]//;' > patchLocations.bed # verify nothing lost, total should be 204: wc -l hg19*.bed # 73 hg19Patch13Haplotypes.bed # 131 hg19Patch13Patches.bed # 204 total hgLoadBed -type=bed4 hg19 hg19Patch13Haplotypes hg19Patch13Haplotypes.bed # Read 73 elements of size 4 from hg19Patch13Haplotypes.bed hgLoadBed -type=bed4 hg19 hg19Patch13Patches hg19Patch13Patches.bed # Read 131 elements of size 4 from hg19Patch13Patches.bed # construct 2bit file: faToTwoBit hg19Patch13.fa.gz hg19Patch13.unmasked.2bit twoBitInfo hg19Patch13.unmasked.2bit stdout | sort -k2nr > hg19Patch13.chrom.sizes # take a look at that to verify it looks OK: cat hg19Patch13.chrom.sizes | sed -e 's/^/# /;' | head # chr1_jh636052_fix 7283150 # chrX_jh806600_fix 6530008 # chrX_jh806587_fix 4110759 # chr7_jh159134_fix 3821770 # chrX_jh159150_fix 3110903 # chrX_jh806590_fix 2418393 # chr10_jh591181_fix 2281126 # chr1_jh636053_fix 1676126 # chr5_gl339449_alt 1612928 # chr14_kb021645_fix 1523386 cat hg19Patch13.chrom.sizes | sed -e 's/^/# /;' | tail # chr1_gl383517_fix 49352 # chr1_gl383516_fix 49316 # chr9_gl383538_fix 49281 # chr1_jh806575_fix 47409 # chr17_gl383562_fix 45551 # chr22_jh806586_fix 43543 # chr13_gl582975_fix 34662 # chr1_jh806573_fix 24680 # chr1_jh806574_fix 22982 # chr9_jh806577_fix 22394 zcat genbank/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \ | sed -f genbankToUCSC.sed > hg19Patch13.agp checkAgpAndFa hg19Patch13.agp hg19Patch13.unmasked.2bit | tail -1 # All AGP and FASTA entries agree - both files are valid ############################################################################# # build hg19Patch13 database (DONE - 2017-08-09 - Hiram) # need this database for netClass operations during the chain/net # construction mkdir /hive/data/genomes/hg19Patch13 cd /hive/data/genomes/hg19Patch13 mkdir /gbdb/hg19Patch13 ln -s /hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit ./ ln -s /hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.agp ./ twoBitInfo hg19Patch13.unmasked.2bit stdout | sort -k2nr > chrom.sizes mkdir -p bed/chromInfo awk '{printf "%s\t%d\t/gbdb/hg19Patch13/hg19Patch13.2bit\n", $1, $2}' \ chrom.sizes > bed/chromInfo/chromInfo.tab hgsql -e 'create database hg19Patch13;' hg19 hgsql hg19Patch13 < $HOME/kent/src/hg/lib/grp.sql hgLoadSqlTab hg19Patch13 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql \ bed/chromInfo/chromInfo.tab hgGoldGapGl -noGl hg19Patch13 hg19Patch13.agp featureBits -or -countGaps hg19Patch13 gold gap # 97673427 bases of 97673427 (100.000%) in intersection hgsql hgcentraltest -e 'INSERT INTO dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, htmlPath, hgNearOk, hgPbOk, sourceName, taxId) VALUES ("hg19Patch13", "Jun. 2013", "/gbdb/hg19Patch13", "GRCh37.p13", "chr1_jh636052_fix:3500000-3700000", 1, 7756, "GRCh37.p13", "Homo sapiens", "/gbdb/hg19Patch13/html/description.html", 0, 0, "GRCh37 patch 13 Genome Reference Consortium Human Reference 37", 9606); INSERT INTO defaultDb (genome, name) VALUES ("GRCh37.p13", "hg19Patch13"); INSERT INTO genomeClade (genome, clade, priority) VALUES ("GRCh37.p13", "haplotypes", 136);' mkdir html # copy description.html from hg38Patch11/html/description.html # edit to update for Patch13 cp ../../hg38Patch11/html/description.html . mkdir -p /hive/data/genomes/hg19Patch13/bed/gc5Base cd /hive/data/genomes/hg19Patch13/bed/gc5Base time hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19Patch13 \ ../../hg19Patch13.unmasked.2bit | wigEncode stdin gc5Base.{wig,wib} # real 0m12.401s # Converted stdin, upper limit 100.00, lower limit 0.00 mkdir /gbdb/hg19Patch13/wib ln -s `pwd`/gc5Base.wib /gbdb/hg19Patch13/wib hgLoadWiggle -pathPrefix=/gbdb/hg19Patch13/wib hg19Patch13 gc5Base gc5Base.wig mkdir /hive/data/genomes/hg19Patch13/bed/repeatMasker cd /hive/data/genomes/hg19Patch13/bed/repeatMasker time (doRepeatMasker.pl -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg19Patch13) \ > do.log 2>&1 # real 144m51.181s cat faSize.rmsk.txt # 97673427 bases (3295737 N's 94377690 real 42827860 upper 51549830 lower) # in 204 sequences in 1 files # Total size: mean 478791.3 sd 848169.0 min 22394 (chr9_jh806577_fix) # max 7283150 (chr1_jh636052_fix) median 240775 # %52.78 masked total, %54.62 masked real mkdir /hive/data/genomes/hg19Patch13/bed/simpleRepeat cd /hive/data/genomes/hg19Patch13/bed/simpleRepeat time (doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \ -trf409=6 -smallClusterHub=ku -buildDir=`pwd` hg19Patch13) > do.log 2>&1 # real 12m56.760s cat fb.simpleRepeat # 4006366 bases of 94378040 (4.245%) in intersection # the simpleRepeat procedure fails in the cleanup step since there # is no TrfPart directory mkdir /hive/data/genomes/hg19Patch13/bed/microsat cd /hive/data/genomes/hg19Patch13/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed hg19Patch13 microsat microsat.bed # Read 10322 elements of size 4 from microsat.bed mkdir /hive/data/genomes/hg19Patch13/bed/windowMasker cd /hive/data/genomes/hg19Patch13/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev hg19Patch13) > do.log 2>&1 & # real 4m22.407s cat faSize.hg19Patch13.cleanWMSdust.txt # 97673427 bases (3295737 N's 94377690 real 64868273 upper 29509417 lower) # in 204 sequences in 1 files # Total size: mean 478791.3 sd 848169.0 min 22394 (chr9_jh806577_fix) # max 7283150 (chr1_jh636052_fix) median 240775 # %30.21 masked total, %31.27 masked real cat fb.hg19Patch13.rmsk.windowmaskerSdust.txt # 24560088 bases of 97673427 (25.145%) in intersection cd /hive/data/genomes/hg19Patch13 twoBitMask hg19Patch13.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed hg19Patch13.2bit twoBitToFa hg19Patch13.2bit stdout | faSize stdin # 97673427 bases (3295737 N's 94377690 real 42661912 upper 51715778 lower) in 204 sequences in 1 files # Total size: mean 478791.3 sd 848169.0 min 22394 (chr9_jh806577_fix) # max 7283150 (chr1_jh636052_fix) median 240775 # %52.95 masked total, %54.80 masked real # same size as above in original source: # 97673427 bases (3295737 N's 94377690 real 94377690 upper 0 lower) ln -s `pwd`/hg19Patch13.2bit /gbdb/hg19Patch13 ######################################################################### # run up idKeys files for ncbiRefSeq (DONE - 2017-08-09 - Hiram) mkdir /hive/data/genomes/hg19Patch13/bed/idKeys cd /hive/data/genomes/hg19Patch13/bed/idKeys time (doIdKeys.pl \ -twoBit=/hive/data/genomes/hg19Patch13/hg19Patch13.unmasked.2bit \ -buildDir=`pwd` hg19Patch13) > do.log 2>&1 # real 0m22.713s cat hg19Patch13.keySignature.txt # e0db188c467e4861a9b74fb4675f62da ######################################################################### # ncbiRefSeq Genes (TBD - 2016-05-19 - Hiram) mkdir /hive/data/genomes/hg19Patch13/bed/ncbiRefSeq cd /hive/data/genomes/hg19Patch13/bed/ncbiRefSeq # running step wise as this script is still under development time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ genbank vertebrate_mammalian Homo_sapiens \ GCA_000001405.33_GRCh37.p13 hg19Patch13) > download.log 2>&1 # real 11m43.551s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=process -bigClusterHub=ku -dbHost=hgwdev \ -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ genbank vertebrate_mammalian Homo_sapiens \ GCA_000001405.33_GRCh37.p13 hg19Patch13) > process.log 2>&1 # real 12m40.655s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=load -bigClusterHub=ku -dbHost=hgwdev \ -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ genbank vertebrate_mammalian Homo_sapiens \ GCA_000001405.33_GRCh37.p13 hg19Patch13) > load.log 2>&1 # real 0m13.277s cat fb.ncbiRefSeq.hg19Patch13.txt # 1727668 bases of 21862561 (7.902%) in intersection ######################################################################### # ucscToINSDC table/track (TBD - 2016-05-19 - Hiram) # the sequence here is working for a 'genbank' assembly with a chrM # situation may be specific depending upon what is available in the assembly mkdir /hive/data/genomes/hg19Patch13/bed/ucscToINSDC cd /hive/data/genomes/hg19Patch13/bed/ucscToINSDC # special script to get the names directly out of the # ncbi.patchLocations.bed file: ./mkTab.sh | sort > ucscToINSDC.txt awk '{printf "%s\t%s\n", $2, $1}' ucscToINSDC.txt | sort > insdcToUcsc.txt awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join -t$'\t' name.coordinate.tab ucscToINSDC.txt > ucscToINSDC.bed join insdc.genbank.txt insdcToUcsc.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \ > ucscToINSDC.bed # should be same line counts throughout (except for insdc.genbank.txt): wc -l * # 196 insdc.genbank.txt # 130 insdcToUcsc.txt # 130 name.coordinate.tab # 130 ucscToINSDC.bed # 130 ucscToINSDC.txt cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 20 # use the 20 in this sed sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab hg19Patch13 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords hg19Patch13 # should cover %100 entirely: featureBits -countGaps hg19Patch13 ucscToINSDC # 23260605 bases of 23260605 (100.000%) in intersection join -1 2 <(sort -k2 ucscToINSDC.txt) insdc.genbank.txt | tr '[ ]' '[\t]' \ | sort -k2 | join -2 2 name.coordinate.tab - | tr '[ ]' '[\t]' \ | cut -f1-4 > ucscToRefSeq.bed cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1 # 20 # use the 20 in this sed sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab hg19Patch13 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed checkTableCoords hg19Patch13 -table=ucscToRefSeq # should cover %100 all bases: featureBits -countGaps hg19Patch13 ucscToRefSeq # 23260605 bases of 23260605 (100.000%) in intersection ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2018-06-06 - Hiram) mkdir /hive/data/genomes/hg19Patch13/bed/cpgIslandsUnmasked cd /hive/data/genomes/hg19Patch13/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/hg19Patch13/hg19Patch13.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku hg19Patch13) > do.log 2>&1 # real 1m15.312s cat fb.hg19Patch13.cpgIslandExtUnmasked.txt # 1737836 bases of 94378040 (1.841%) in intersection ########################################################################## # cpgIslands - (DONE - 2018-06-06 - Hiram) mkdir /hive/data/genomes/hg19Patch13/bed/cpgIslands cd /hive/data/genomes/hg19Patch13/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku hg19Patch13) > do.log 2>&1 # real 2m0.923s cat fb.hg19Patch13.cpgIslandExt.txt # 1278964 bases of 94378040 (1.355%) in intersection ############################################################################## # cytoBandIdeo - (DONE - 2018-06-06 - Hiram) mkdir /hive/data/genomes/hg19Patch13/bed/cytoBand cd /hive/data/genomes/hg19Patch13/bed/cytoBand makeCytoBandIdeo.csh hg19Patch13 ############################################################################# # lastz alignments to hg19 (DONE - 2018-06-07 - Hiram) ############################################################################# mkdir /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06 cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06 # prepare bits of hg19 sequence to lastz align to the patches, # this is selecting out the specific section of hg19 where the patch # is supposed to match, and setting up lastz parameters rm -fr hg19Bits run.blastz run.blastz/tParts run.blastz/qParts psl \ hg19Bits.lift mkdir -p hg19Bits run.blastz run.blastz/tParts run.blastz/qParts psl cut -f4 ../patchLocations.bed | while read FIX do printf "# working: ${FIX}\n" 1>&2 chr=`grep "${FIX}" ../patchLocations.bed | cut -f1` start=`grep "${FIX}" ../patchLocations.bed | cut -f2` end=`grep "${FIX}" ../patchLocations.bed | cut -f3` bitSize=`echo ${end} ${start} | awk '{printf "%d", $1-$2}'` chrSize=`grep -w "${chr}" ../../../chrom.sizes | cut -f2` fixSize=`grep "${FIX}" ../hg19Patch13.chrom.sizes | cut -f2` printf "${chr}:${start}-${end} vs. ${FIX}:0-${fixSize}\n" 1>&2 twoBitToFa /hive/data/genomes/hg19/hg19.unmasked.2bit:${chr}:${start}-${end} stdout \ | sed -e "s/${chr}:/${FIX}_/g" > hg19Bits/${FIX}.fa fixName=`head -1 hg19Bits/${FIX}.fa | sed -e 's/>//;'` printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" 1>&2 printf "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}\n" >> hg19Bits.lift printf "/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/hg19Bits.2bit:${fixName}:0-${bitSize}\n" 1>&2 printf "/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/hg19Bits.2bit:${fixName}:0-${bitSize}\n" > run.blastz/qParts/${fixName}.lst printf "/hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit:${FIX}:0-${fixSize}\n" > run.blastz/tParts/${fixName}.lst printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${fixName}.lst qParts/${fixName}.lst ../DEF {check out exists ../psl/${fixName}.psl}\n" 1>&2 printf "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${fixName}.lst qParts/${fixName}.lst ../DEF {check out exists ../psl/${fixName}.psl}\n" >> run.blastz/jobList done faToTwoBit hg19Bits/*.fa hg19Bits.2bit twoBitInfo hg19Bits.2bit stdout | sort -k2nr > hg19Bits.chrom.sizes printf 'BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # human vs human # maximum M allowed with lastz is only 254 BLASTZ_M=254 # lastz does not like the O= and E= lines in the matrix file BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from hg18 vs venter1 lastz on advice from Webb BLASTZ_K=10000 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg19Patch13 SEQ1_DIR=/hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit SEQ1_LEN=/hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 SEQ1_LIMIT=1 # QUERY: Human Hg19 #SEQ2_DIR=/scratch/data/hg19/hg19.2bit SEQ2_DIR=/hive/data/genomes/hg19/hg19.unmasked.2bit SEQ2_LEN=/scratch/data/hg19/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/hg19Bits.2bit SEQ2_CTGLEN=/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/hg19Bits.chrom.sizes SEQ2_LIFT=/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/hg19Bits.lift SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06 TMPDIR=/dev/shm ' > DEF ssh ku cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/run.blastz para create jobList para try ... check ... push ... etc para time > run.time # Completed: 204 of 204 jobs # CPU time in finished jobs: 9873s 164.54m 2.74h 0.11d 0.000 y # IO & Wait Time: 729s 12.16m 0.20h 0.01d 0.000 y # Average job time: 52s 0.87m 0.01h 0.00d # Longest finished job: 4859s 80.98m 1.35h 0.06d # Submission to last job: 4891s 81.52m 1.36h 0.06d # put together the individual results cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06 mkdir pslParts cat psl/chr*.psl | gzip -c > pslParts/hg19Patch13.hg19.psl.gz # constructing a chain from those results mkdir -p /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/axtChain/run cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/axtChain/run time zcat ../../pslParts/hg19Patch13.hg19.psl.gz \ | axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \ /hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit \ /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/hg19Bits.2bit \ stdout \ | chainAntiRepeat /hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit \ /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/hg19Bits.2bit \ stdin hg19Patch13.hg19.preLift.chain # real 14m25.476s liftUp -chainQ hg19Patch13.hg19.lifted.chain \ ../../hg19Bits.lift carry hg19Patch13.hg19.preLift.chain # constructing the net files: cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19.2018-06-06/axtChain chainMergeSort run/hg19Patch13.hg19.lifted.chain \ | gzip -c > hg19Patch13.hg19.all.chain.gz chainSplit chain hg19Patch13.hg19.all.chain.gz # Make nets ("noClass", i.e. without rmsk/class stats which are added later): time chainPreNet hg19Patch13.hg19.all.chain.gz \ ../../hg19Patch13.chrom.sizes \ /hive/data/genomes/hg19/chrom.sizes stdout \ | chainNet stdin -minSpace=1 ../../hg19Patch13.chrom.sizes \ /hive/data/genomes/hg19/chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # real 0m5.751s hgLoadChain -tIndex hg19Patch13 chainHg19 hg19Patch13.hg19.all.chain.gz # Loading 1291769 chains into hg19Patch13.chainHg19 featureBits hg19Patch13 chainHg19Link # 90517278 bases of 94378040 (95.909%) in intersection netClass -verbose=0 -noAr noClass.net hg19Patch13 hg19 hg19Patch13.hg19.net netFilter -minGap=10 hg19Patch13.hg19.net \ | hgLoadNet -verbose=0 hg19Patch13 netHg19 stdin # Make liftOver chains: netChainSubset -verbose=0 noClass.net hg19Patch13.hg19.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg19Patch13.hg19.over.chain.gz # Make axtNet for download: one .axt per hg19Patch13 seq. netSplit noClass.net net cd .. mkdir -p axtNet # beware, tcsh scripting here: foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ ../hg19Patch13.unmasked.2bit \ /hive/data/genomes/hg19/hg19.unmasked.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.hg19Patch13.hg19.net.axt.gz end # Make mafNet for multiz: one .maf per hg19Patch13 seq. mkdir -p mafNet # beware, tcsh scripting here: foreach f (axtNet/*.hg19Patch13.hg19.net.axt.gz) axtToMaf -tPrefix=hg19Patch13. -qPrefix=hg19. $f \ ../hg19Patch13.chrom.sizes \ /hive/data/genomes/hg19/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end ############################################################################# # run this same business with hg19 as target, Patch13 sequence as query # (DONE - 2018-06-07 - Hiram) mkdir /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07 cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07 rm -fr hg19Bits run.blastz psl hg19Bits.lift mkdir -p hg19Bits run.blastz/tParts run.blastz/qParts psl cut -f4 ../patchLocations.bed | while read FIX do chr=`grep "${FIX}" ../patchLocations.bed | cut -f1` start=`grep "${FIX}" ../patchLocations.bed | cut -f2` end=`grep "${FIX}" ../patchLocations.bed | cut -f3` bitSize=`echo ${end} ${start} | awk '{printf "%d", $1-$2}'` chrSize=`grep -w "${chr}" ../../../chrom.sizes | cut -f2` fixSize=`grep "${FIX}" ../hg19Patch13.chrom.sizes | cut -f2` echo ${chr}:${start}-${end} vs. ${FIX}:0-${fixSize} 1>&2 twoBitToFa /hive/data/genomes/hg19/hg19.unmasked.2bit:${chr}:${start}-${end} stdout \ | sed -e "s/${chr}:/${FIX}_/g" > hg19Bits/${FIX}.fa fixName=`head -1 hg19Bits/${FIX}.fa | sed -e 's/>//;'` echo -e "${start}\t${fixName}\t${fixSize}\t${chr}\t${chrSize}" >> hg19Bits.lift echo -e "/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/hg19Bits.2bit:${fixName}:0-${bitSize}" > run.blastz/tParts/${FIX}.lst echo -e "/hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit:${FIX}:0-${fixSize}" > run.blastz/qParts/${FIX}.lst echo -e "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/${FIX}.lst qParts/${FIX}.lst ../DEF {check out exists ../psl/${FIX}.psl}" >> run.blastz/jobList done faToTwoBit hg19Bits/*.fa hg19Bits.2bit twoBitInfo hg19Bits.2bit stdout | sort -k2n > hg19Bits.chrom.sizes printf 'BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # human vs human # maximum M allowed with lastz is only 254 BLASTZ_M=254 # lastz does not like the O= and E= lines in the matrix file BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from hg18 vs venter1 lastz on advice from Webb BLASTZ_K=10000 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg19 SEQ1_DIR=/hive/data/genomes/hg19/hg19.unmasked.2bit SEQ1_LEN=/scratch/data/hg19/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/hg19Bits.2bit SEQ1_CTGLEN=/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/hg19Bits.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/hg19Bits.lift SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 SEQ1_LIMIT=1 # QUERY: Human Hg19Patch13 SEQ2_DIR=/hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit SEQ2_LEN=/hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07 TMPDIR=/dev/shm ' > DEF ssh ku cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/run.blastz para create jobList para try ... check ... push ... etc para time > run.time sed -e 's/^/# /;' run.time # Completed: 204 of 204 jobs # CPU time in finished jobs: 10063s 167.72m 2.80h 0.12d 0.000 y # IO & Wait Time: 779s 12.98m 0.22h 0.01d 0.000 y # Average job time: 53s 0.89m 0.01h 0.00d # Longest finished job: 4902s 81.70m 1.36h 0.06d # Submission to last job: 4952s 82.53m 1.38h 0.06d # put together the individual results cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07 mkdir pslParts cat psl/chr*.psl | gzip -c > pslParts/hg19.hg19Patch13.psl.gz # constructing a chain from those results mkdir -p /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/axtChain/run cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/axtChain/run time zcat ../../pslParts/hg19.hg19Patch13.psl.gz \ | axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \ /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/hg19Bits.2bit \ /hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit \ stdout \ | chainAntiRepeat /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/hg19Bits.2bit \ /hive/data/genomes/hg19/bed/hg19Patch13/hg19Patch13.unmasked.2bit \ stdin hg19.hg19Patch13.preLift.chain # real 8m36.023s liftUp hg19.hg19Patch13.lifted.chain \ ../../hg19Bits.lift carry hg19.hg19Patch13.preLift.chain # constructing the net files: cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/axtChain chainMergeSort run/hg19.hg19Patch13.lifted.chain \ | gzip -c > hg19.hg19Patch13.all.chain.gz hgLoadChain -tIndex hg19 chainHg19Patch13 hg19.hg19Patch13.all.chain.gz # Loading 1297891 chains into hg19.chainHg19Patch13 featureBits hg19 chainHg19Patch13Link # 86001495 bases of 2897316137 (2.968%) in intersection chainSplit chain hg19.hg19Patch13.all.chain.gz # Make nets ("noClass", i.e. without rmsk/class stats which are added later): time chainPreNet hg19.hg19Patch13.all.chain.gz \ /hive/data/genomes/hg19/chrom.sizes \ ../../hg19Patch13.chrom.sizes stdout \ | chainNet stdin -minSpace=1 /hive/data/genomes/hg19/chrom.sizes \ ../../hg19Patch13.chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # real 0m4.868s time netClass -verbose=0 -noAr noClass.net hg19 hg19Patch13 hg19.hg19Patch13.net # real 0m27.676s time netFilter -minGap=10 hg19.hg19Patch13.net \ | hgLoadNet -verbose=0 hg19 netHg19Patch13 stdin # real 0m0.222s # Make liftOver chains: netChainSubset -verbose=0 noClass.net hg19.hg19Patch13.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg19.hg19Patch13.over.chain.gz # Make axtNet for download: one .axt per hg19Patch13 seq. netSplit noClass.net net cd .. mkdir -p axtNet # more tcsh here foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /hive/data/genomes/hg19/hg19.unmasked.2bit \ ../hg19Patch13.unmasked.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.hg19.hg19Patch13.net.axt.gz end # Make mafNet for multiz: one .maf per hg19Patch13 seq. mkdir -p mafNet # tcsh loop again foreach f (axtNet/*.hg19.hg19Patch13.net.axt.gz) axtToMaf -tPrefix=hg19. -qPrefix=hg19Patch13. $f \ /hive/data/genomes/hg19/chrom.sizes \ ../hg19Patch13.chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end cd /hive/data/genomes/hg19/bed/hg19Patch13/lastzHg19Patch13.2018-06-07/axtChain mkdir -p queryChains chainSplit -q queryChains hg19.hg19Patch13.all.chain.gz # then run a 'lift over' chain/net on each single one mkdir -p singleLiftOver for F in queryChains/*.chain do C=`basename ${F}` B=`echo ${C} | sed -e "s/.chain//"` chainPreNet -inclHap ${F} /hive/data/genomes/hg19/chrom.sizes \ ../../hg19Patch13.chrom.sizes stdout \ | chainNet -inclHap stdin -minSpace=1 /hive/data/genomes/hg19/chrom.sizes \ ../../hg19Patch13.chrom.sizes singleLiftOver/${B}.raw.net \ /dev/null netSyntenic singleLiftOver/${B}.raw.net singleLiftOver/${B}.noClass.net netFilter -chimpSyn singleLiftOver/${B}.noClass.net > singleLiftOver/${B}.chimpSyn.net netChainSubset -verbose=0 singleLiftOver/${B}.noClass.net \ ${F} stdout \ | chainStitchId stdin stdout > singleLiftOver/${C} printf "${F} -> singleLiftOver/${C}\n" 1>&2 done # put the chains together into one file chainMergeSort singleLiftOver/chr*.chain | gzip -c \ > hg19.hg19Patch13.single.over.chain.gz # -rw-rw-r-- 1 550369 Jun 7 14:35 hg19.hg19Patch13.single.over.chain.gz # construct psl files from those chains chainToPsl hg19.hg19Patch13.single.over.chain.gz \ /hive/data/genomes/hg19/chrom.sizes \ ../../hg19Patch13.chrom.sizes \ /hive/data/genomes/hg19/hg19.unmasked.2bit \ ../../hg19Patch13.unmasked.2bit \ hg19.hg19Patch13.over.psl # pslCheck reports no errors pslCheck -db=hg19 hg19.hg19Patch13.over.psl # checked: 10890 failed: 0 errors: 0 # pslRecalcMatch doesn't change anything pslRecalcMatch hg19.hg19Patch13.over.psl \ /hive/data/genomes/hg19/hg19.unmasked.2bit \ ../../hg19Patch13.unmasked.2bit hg19.hg19Patch13.recalc.over.psl sum *.psl 07674 2809 hg19.hg19Patch13.over.psl 07674 2809 hg19.hg19Patch13.recalc.over.psl # load this PSL track # this table name prefix altSeqLiftOverPsl is recognized in hgc clicks hgLoadPsl hg19 -table=altSeqLiftOverPslP13 hg19.hg19Patch13.over.psl mkdir /hive/data/genomes/hg19/bed/hg19Patch13/seqExt cd /hive/data/genomes/hg19/bed/hg19Patch13/seqExt twoBitToFa ../hg19Patch13.unmasked.2bit hg19Patch13.fa mkdir -p /gbdb/hg19/hg19Patch13 hg19Patch13 faSplit byname hg19Patch13.fa ./hg19Patch13/ ln -s `pwd`/hg19Patch13/*.fa /gbdb/hg19/hg19Patch13 hgLoadSeq -drop -seqTbl=seqHg19Patch13 -extFileTbl=extHg19Patch13 hg19 \ /gbdb/hg19/hg19Patch13/*.fa ############################################################################# # wrap up scripts and file editing (trackDb, hgdownload) # (DONE - 2018-06-07 - Hiram) # edit ${HOME}/kent/src/hg/makeDb/doc/hg19PatchDescr.pl to reflect this most # recent patch version. cd /hive/data/genomes/hg19/bed/hg19Patch13 # edit hg19PatchDescr.pl to update to this patch level ${HOME}/kent/src/hg/makeDb/doc/hg19PatchDescr.pl \ > ${HOME}/kent/src/hg/makeDb/trackDb/human/hg19/hg19Patch13.html # copy README.txt from previous patch and edit as needed - replace the # patch description lines with the output of running cd /hive/data/genomes/hg19/bed/hg19Patch13 cp -p /hive/data/genomes/hg38/bed/hg38Patch11/readmeLines.pl . ./readmeLines.pl genbank/GCA_000001405.14_GRCh37.p13_assembly_report.txt \ genbank/GCA_000001405.14_GRCh37.p13_assembly_regions.txt \ patchLocations.bed new.sequences.list > add.to.README.txt mkdir /hive/data/genomes/hg19/bed/hg19Patch13/downloads cd /hive/data/genomes/hg19/bed/hg19Patch13/downloads #This README.txt file needs to be edited, and add the lines ../add.to.README.txt cp -p /hive/data/genomes/hg38/bed/hg38Patch11/downloads/README.txt . ln -s ../hg19Patch13.unmasked.2bit hg19Patch13.2bit ln -s ../hg19Patch13.agp ln -s ../hg19Patch13.fa.gz md5sum * > md5sum.txt mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/hg19Patch13 ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/hg19Patch13/ #############################################################################