# for emacs: -*- mode: sh; -*- # This file describes how grcHhh38 was extended with patch sequences and annotations from grcH38P12, # after having been extended with grcH38P11 (see patchUpdate.11.txt). ############################################################################## # Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2018-06-26 - Angie) cd /hive/data/genomes/grcHhh38 # main 2bit time faToTwoBit <(twoBitToFa grcHhh38.2bit stdout) \ <(twoBitToFa /hive/data/genomes/grcH38P12/grcH38P12.2bit stdout) \ grcHhh38.p12.2bit #real 4m9.009s mv grcHhh38.2bit grcHhh38.pre.p12.2bit ln -s grcHhh38.p12.2bit grcHhh38.2bit twoBitInfo grcHhh38.2bit stdout | wc -l #595 # unmasked 2bit twoBitMask -type=.bed grcHhh38.2bit /dev/null grcHhh38.p12.unmasked.2bit mv grcHhh38.unmasked.2bit grcHhh38.pre.p12.unmasked.2bit ln -s grcHhh38.p12.unmasked.2bit grcHhh38.unmasked.2bit # chrom.sizes sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcH38P12/chrom.sizes > chrom.sizes.p12 mv chrom.sizes chrom.sizes.pre.p12 ln -s chrom.sizes.p12 chrom.sizes wc -l chrom.sizes #595 chrom.sizes # chromInfo cd /hive/data/genomes/grcHhh38/bed/chromInfo awk '{print $1 "\t" $2 "\t/gbdb/grcHhh38/grcHhh38.2bit";}' ../../chrom.sizes.p12 \ > chromInfo.p12.tab wc -l chromInfo.p12.tab # 595 chromInfo.p12.tab hgLoadSqlTab grcHhh38 chromInfo chromInfo.sql chromInfo.p12.tab ############################################################################## # Extend main database tables for fileless tracks (DONE - 2018-06-26 - Angie) # Just add the patch table rows to the main database tables for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do echo $table hgsql grcHhh38 -e "insert into grcHhh38.$table select * from grcH38P12.$table" done ############################################################################## # Extend main database gc5BaseBw.bw (DONE - 2018-06-26 - Angie) cd /hive/data/genomes/grcHhh38/bed/gc5Base/ # Concatenate original assembly results with grcH38P12 results time (zcat grcHhh38.gc5Base.wigVarStep.gz \ /hive/data/genomes/grcH38P12/bed/gc5Base/grcH38P12.gc5Base.wigVarStep.gz \ | gzip -c \ > grcHhh38.p12.gc5Base.wigVarStep.gz) #real 8m6.509s mv grcHhh38.gc5Base.wigVarStep.gz grcHhh38.pre.p12.gc5Base.wigVarStep.gz ln -s grcHhh38.p12.gc5Base.wigVarStep.gz grcHhh38.gc5Base.wigVarStep.gz # Make a new gc5BaseBw.bw time wigToBigWig grcHhh38.p12.gc5Base.wigVarStep.gz ../../chrom.sizes.p12 \ grcHhh38.p12.gc5Base.bw #real 20m44.483s bigWigInfo grcHhh38.p12.gc5Base.bw | grep chromCount #chromCount: 595 mv grcHhh38.gc5Base.bw grcHhh38.pre.p12.gc5Base.bw ln -s grcHhh38.p12.gc5Base.bw grcHhh38.gc5Base.bw ############################################################################## # Extend main database download files (DONE - 2018-06-27 - Angie) cd /hive/data/genomes/grcHhh38/goldenPath/bigZips # grcH38P12.2bit and grcH38P12.chrom.sizes were already extended above # AGP: zcat grcHhh38.agp.gz \ /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.agp.gz \ | grep -v ^# \ | gzip -c > grcHhh38.p12.agp.gz mv grcHhh38.agp.gz grcHhh38.pre.p12.agp.gz && ln -s grcHhh38.p12.agp.gz grcHhh38.agp.gz zcat grcHhh38.agp.gz | cut -f 1 | sort -u | wc -l #595 # FASTA (from already-extended 2bit): twoBitToFa grcHhh38.2bit stdout \ | gzip -c > grcHhh38.p12.fa.gz mv grcHhh38.fa.gz grcHhh38.pre.p12.fa.gz && ln -s grcHhh38.p12.fa.gz grcHhh38.fa.gz faSize grcHhh38.fa.gz #3257347282 bases (161368694 N's 3095978588 real 1483113183 upper 1612865405 lower) in 595 sequences in 1 files #Total size: mean 5474533.2 sd 27729929.3 min 970 (chrUn_KI270394v1) max 248956422 (chr1) median 166200 twoBitToFa grcHhh38.2bit stdout \ | maskOutFa stdin hard stdout \ | gzip -c > grcHhh38.p12.fa.masked.gz mv grcHhh38.fa.masked.gz grcHhh38.pre.p12.fa.masked.gz ln -s grcHhh38.p12.fa.masked.gz grcHhh38.fa.masked.gz # RepeatMasker (don't include header of patch file): cat <(zcat grcHhh38.fa.out.gz) \ <(zcat /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.out.gz | tail -n +4) \ | gzip -c > grcHhh38.p12.fa.out.gz zcat grcHhh38.p12.fa.out.gz | tail -n +4 | awk '{print $5;}' | sort -u | wc -l #595 mv grcHhh38.fa.out.gz grcHhh38.pre.p12.fa.out.gz ln -s grcHhh38.p12.fa.out.gz grcHhh38.fa.out.gz # SimpleRepeats/TRF: zcat grcHhh38.trf.bed.gz \ /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.trf.bed.gz \ | gzip -c > grcHhh38.p12.trf.bed.gz # We don't expect a complete set of chroms to have simpleRepeats, but at least an increase: zcat grcHhh38.trf.bed.gz | cut -f 1 | uniq | wc -l #485 zcat grcHhh38.p12.trf.bed.gz | cut -f 1 | uniq | wc -l #502 mv grcHhh38.trf.bed.gz grcHhh38.pre.p12.trf.bed.gz ln -s grcHhh38.p12.trf.bed.gz grcHhh38.trf.bed.gz # hg38 files that are not built by makeDownloads.pl because hg38 is treated as 'scaffold-based': # Per-chrom soft-masked FASTA: rm -rf chroms tar xvzf grcHhh38.chromFa.tar.gz faSplit byname /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.gz chroms/ ls -1 chroms | wc -l #595 tar cvzf grcHhh38.p12.chromFa.tar.gz ./chroms mv grcHhh38.chromFa.tar.gz grcHhh38.pre.p12.chromFa.tar.gz ln -s grcHhh38.p12.chromFa.tar.gz grcHhh38.chromFa.tar.gz rm -rf chroms # Per-chrom hard-masked FASTA: rm -rf maskedChroms tar xvzf grcHhh38.chromFaMasked.tar.gz faSplit byname /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.masked.gz \ maskedChroms/ ls -1 maskedChroms | wc -l #595 tar cvzf grcHhh38.p12.chromFaMasked.tar.gz ./maskedChroms mv grcHhh38.chromFaMasked.tar.gz grcHhh38.pre.p12.chromFaMasked.tar.gz ln -s grcHhh38.p12.chromFaMasked.tar.gz grcHhh38.chromFaMasked.tar.gz rm -rf maskedChroms # RepeatMasker .align files: zcat grcHhh38.fa.align.gz /hive/data/genomes/grcH38P12/bed/repeatMasker/grcH38P12.fa.align.gz \ | gzip -c > grcHhh38.p12.fa.align.gz mv grcHhh38.fa.align.gz grcHhh38.pre.p12.fa.align.gz ln -s grcHhh38.p12.fa.align.gz grcHhh38.fa.align.gz # Update md5sum.txt md5sum grcHhh38.2bit grcHhh38.agp.gz grcHhh38.chrom.sizes grcHhh38.chromFa.tar.gz \ grcHhh38.chromFaMasked.tar.gz grcHhh38.fa.align.gz grcHhh38.fa.gz grcHhh38.fa.masked.gz \ grcHhh38.fa.out.gz grcHhh38.trf.bed.gz \ > md5sum.txt ######################################################################### # Regenerate idKeys with extended grcHhh38 (DONE - 2018-06-27 - Angie) mv /hive/data/genomes/grcHhh38/bed/idKeys{,.pre.p12} mkdir /hive/data/genomes/grcHhh38/bed/idKeys cd /hive/data/genomes/grcHhh38/bed/idKeys time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \ -twoBit=/hive/data/genomes/grcHhh38/grcHhh38.unmasked.2bit \ -buildDir=`pwd` grcHhh38) > do.log 2>&1 & tail -f do.log #real 1m47.076s cat grcHhh38.keySignature.txt #c9c5d621a52f96886fa9cd785c99248f ######################################################################### # ncbiRefSeq.p12 Genes (DONE - 2018-06-27 - Angie) mkdir /hive/data/genomes/grcHhh38/bed/ncbiRefSeq.p12.2018-06-27 cd /hive/data/genomes/grcHhh38/bed/ncbiRefSeq.p12.2018-06-27 # Adding the -toGpWarnOnly flag because there are a handful of cases of CDS extending # beyond exon coordinates. Terence Murphy says they'll eventually fix it but not soon. # So, make sure to check do.log for warnings from gff3ToGenePred: time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -toGpWarnOnly \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.38_GRCh38.p12 grcHhh38) > do.log 2>&1 & tail -f do.log # gff3ToGenePred warnings: #Warning: skipping: no exon in id1912382 contains CDS 555851-556197 #Warning: skipping: no exon in id1790907 contains CDS 22922593-22922913 #Warning: skipping: no exon in id1790877 contains CDS 22906341-22906661 #Warning: skipping: no exon in id1790824 contains CDS 22822981-22823289 #Warning: skipping: no exon in id1365744 contains CDS 106088082-106088428 #5 warnings converting GFF3 file: stdin # *** All done ! Elapsed time: 23m16s #real 23m16.023s cat fb.ncbiRefSeq.grcHhh38.txt #134109466 bases of 3095998939 (4.332%) in intersection ############################################################################## # main database haplotypes, patches (DONE - 2018-06-27 - Angie) # NOTE FOR NEXT TIME -- instead of using extract.new.list, extract all (cumulative) # patch release sequences since the initial assembly release. Then we won't need # the hgsql commands at the end. # Also, the patches themselves should have patch track entries that point back # to their main chrom locations. mkdir /hive/data/genomes/grcHhh38/bed/hg38Patch12Locations cd /hive/data/genomes/grcHhh38/bed/hg38Patch12Locations # construct locations file: ~/kent/src/hg/makeDb/doc/hg38/regionScan.pl \ /hive/data/genomes/grcH38P12/ucsc/extract.new.list \ /hive/data/genomes/grcH38P12/genbank/GCA_000001405.27_GRCh38.p12_assembly_regions.txt \ > patchLocations.bed # verify correct number of locations: wc -l patchLocations.bed #17 patchLocations.bed # separate haplotypes from fix patches for two tracks: grep -v fix patchLocations.bed \ | sed -e 's/_alt//; s/\tchr.*_/\t/; s/v/./;' \ > hg38Patch12Haplotypes.bed grep fix patchLocations.bed \ | sed -e 's/_fix//; s/\tchr.*_/\t/;' | sed -e 's/v\([0-9]\)$/.\1/;' \ > hg38Patch12Patches.bed # verify nothing lost, should be 17: wc -l hg38*.bed # 11 hg38Patch12Haplotypes.bed # 6 hg38Patch12Patches.bed # 17 total hgLoadBed -type=bed4 grcHhh38 hg38Patch12Haplotypes hg38Patch12Haplotypes.bed #Read 11 elements of size 4 from hg38Patch12Haplotypes.bed hgLoadBed -type=bed4 grcHhh38 hg38Patch12Patches hg38Patch12Patches.bed #Read 6 elements of size 4 from hg38Patch12Patches.bed # Actually we want those to be cumulative... pull in rows from the corresponding Patch11 tables hgsql grcHhh38 -e 'insert into hg38Patch12Haplotypes select * from hg38Patch11Haplotypes;' hgsql grcHhh38 -e 'insert into hg38Patch12Patches select * from hg38Patch11Patches;' ############################################################################ # altLocations and patchLocations (DONE - 2018-06-27 - Angie) # indicate corresponding locations between haplotypes and reference mkdir /hive/data/genomes/grcHhh38/bed/altLocations.p12 cd /hive/data/genomes/grcHhh38/bed/altLocations.p12 ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \ /hive/data/genomes/grcH38P12/genbank/GCA_000001405.27_GRCh38.p12_assembly_structure/{ALT_*,PATCHES}/alt_scaffolds/alt_scaffold_placement.txt \ | sort -k1,1 -k2n,2n \ > altAndFixLocations.bed wc -l altAndFixLocations.bed #802 altAndFixLocations.bed grep _alt altAndFixLocations.bed > altLocations.bed grep _fix altAndFixLocations.bed > fixLocations.bed hgLoadBed grcHhh38 altLocations{,.bed} #Read 664 elements of size 4 from altLocations.bed hgLoadBed grcHhh38 fixLocations{,.bed} #Read 140 elements of size 4 from fixLocations.bed featureBits -countGaps grcHhh38 altLocations #200094386 bases of 3257347282 (6.143%) in intersection featureBits -countGaps grcHhh38 fixLocations #63654955 bases of 3257347282 (1.954%) in intersection ############################################################################# # Check for new chrX alts/patches to add to par (DONE 2018-06-27 angie) # Thanks to Hiram for pointing out that intersecting chrX positions in # altLocations and par shows whether a chrX alt overlaps a PAR. mkdir /hive/data/genomes/grcHhh38/bed/par cd /hive/data/genomes/grcHhh38/bed/par hgsql grcHhh38 -e 'select * from altLocations where chrom like "chrX%"' #+-----+---------------------+------------+----------+------------------------+ #| bin | chrom | chromStart | chromEnd | name | #+-----+---------------------+------------+----------+------------------------+ #| 73 | chrX | 319337 | 601516 | chrX_KI270880v1_alt | #| 73 | chrX | 326487 | 601516 | chrX_KI270913v1_alt | #| 77 | chrX | 4950956 | 5129468 | chrX_KV766199v1_alt | #| 149 | chrX | 79965153 | 80097082 | chrX_KI270881v1_alt | #| 73 | chrX_KI270880v1_alt | 0 | 284869 | chrX:319338-601516 | #| 73 | chrX_KI270881v1_alt | 0 | 144206 | chrX:79965154-80097082 | #| 73 | chrX_KI270913v1_alt | 0 | 274009 | chrX:326488-601516 | #| 73 | chrX_KV766199v1_alt | 0 | 188004 | chrX:4950957-5129468 | #+-----+---------------------+------------+----------+------------------------+ hgsql grcHhh38 -e 'select * from par where chrom like "chrX%"' #+-----+---------------------+------------+-----------+------+ #| bin | chrom | chromStart | chromEnd | name | #+-----+---------------------+------------+-----------+------+ #| 9 | chrX | 10000 | 2781479 | PAR1 | #| 221 | chrX | 155701382 | 156030895 | PAR2 | #| 73 | chrX_KI270880v1_alt | 0 | 284869 | PAR1 | #| 73 | chrX_KI270913v1_alt | 0 | 274009 | PAR1 | #+-----+---------------------+------------+-----------+------+ # chrX_KI270881v1_alt and chrX_KV766199v1_alt are not in either PAR. # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1 -- # and are already in the PAR table, so nothing to add. ############################################################################# # adding ucscToINSDC, ucscToRefSeq and chromAlias (DONE - Angie - 2018-06-27) # need to have idKeys for the genbank and refseq assemblies: mkdir -p /hive/data/genomes/grcHhh38/bed/ucscToINSDC/genbankP12 cd /hive/data/genomes/grcHhh38/bed/ucscToINSDC/genbankP12 ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p12/GCA_000001405.27_GRCh38.p12_genomic.fna.gz . faToTwoBit GCA_000001405.27_GRCh38.p12_genomic.fna.gz genbankP12.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP12.2bit genbankP12) \ > do.log 2>&1 #real 1m50.109s mkdir /hive/data/genomes/grcHhh38/bed/ucscToINSDC/refseqP12 cd /hive/data/genomes/grcHhh38/bed/ucscToINSDC/refseqP12 ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz ./ faToTwoBit GCF_000001405.38_GRCh38.p12_genomic.fna.gz refseqP12.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP12.2bit refseqP12) \ > do.log 2>&1 #real 1m47.878s # with the three idKeys available, join them to make the table bed files: cd /hive/data/genomes/grcHhh38/bed/ucscToINSDC join -t$'\t' ../idKeys/grcHhh38.idKeys.txt genbankP12/genbankP12.idKeys.txt \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.p12.bed join -t$'\t' ../idKeys/grcHhh38.idKeys.txt refseqP12/refseqP12.idKeys.txt \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToRefSeq.p12.bed # loading tables: export db=grcHhh38 export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p12.bed export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p12.bed # must be exactly 100% coverage featureBits -countGaps ${db} ucscToINSDC #3257347282 bases of 3257347282 (100.000%) in intersection featureBits -countGaps ${db} ucscToRefSeq #3257319537 bases of 3257347282 (99.999%) in intersection # uh-oh! not 100% featureBits -countGaps ${db} \!ucscToRefSeq -bed=stdout #chrUn_KI270752v1 0 27745 chrUn_KI270752v1.1 grep KI270752 \ /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_assembly_report.txt #HSCHRUN_RANDOM_CTG29 unplaced-scaffold na na KI270752.1 <> na Primary Assembly 27745 chrUn_KI270752v1 # Yep, no RefSeq accession there. Guess it was dropped from the RefSeq p12 assembly??? # Will ask Hiram and probably Terence. # construct chromAlias: cd /hive/data/genomes/grcHhh38/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \ | sort -k1,1 > ucsc.refseq.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \ | sort -k1,1 > ucsc.genbank.tab # add NCBI sequence names from assembly report grep -v ^# \ /hive/data/genomes/grcH38P12/genbank/GCA_000001405.27_GRCh38.p12_assembly_report.txt \ | tawk '{print $5, $1;}' | sort \ > genbankToAssembly.txt tawk '{print $2, $1;}' ucsc.genbank.tab | sort \ | join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \ | sort -k1,1 > ucsc.assembly.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > ${db}.chromAlias.tab # verify all there: for t in refseq genbank assembly do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t grcHhh38.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 594 =? 594 OK # checking genbank: 595 =? 595 OK # checking assembly: 595 =? 595 OK # Note how there's one fewer refseq, consistent with featureBits above. hgLoadSqlTab grcHhh38 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.tab ############################################################################## # altSeqLiftOver (DONE 18-06-27 Angie) mkdir /hive/data/genomes/grcHhh38/bed/altSeqLiftOver.p12 cd /hive/data/genomes/grcHhh38/bed/altSeqLiftOver.p12 # Eventually these will be under the /hive/data/genomes/.../genbank/... directory # that points to /hive/data/outside/ncbi/genomes/... but at the moment the contents # of the alignments/ directories are not included in the sync. So for now, # manually download them here. # Original alts: # NOTE FOR NEXT TIME: don't redownload these, link to prior build dir's initialAlts mkdir initialAlts cd initialAlts for d in /hive/data/genomes/grcH38P12/genbank/GCA_000001405.27_GRCh38.p12_assembly_structure/ALT*/alt_scaffolds/alignments ; do subdir=`echo $d | sed -re 's@^/hive/data/genomes/grcH38P12/genbank/@@;'` wget --timestamping --no-verbose \ ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p12/$subdir/\*.gff done # New alts and patches too: mkdir ../patches cd ../patches wget --timestamping --no-verbose\ ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p12/GCA_000001405.27_GRCh38.p12_assembly_structure/PATCHES/alt_scaffolds/alignments/\*.gff cd .. # Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names hgsql grcHhh38 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \ | awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed cp /dev/null altToChrom.noScore.psl for f in initialAlts/*.gff patches/*.gff; do e=`basename $f .gff | sed -e 's/_/|/g;'` s=`grep -E $e gbToUcsc.sed` sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \ | pslPosTarget stdin stdout \ >> altToChrom.noScore.psl done pslCheck altToChrom.noScore.psl #checked: 421 failed: 0 errors: 0 pslRecalcMatch altToChrom.noScore.psl ../../grcHhh38.2bit{,} altToChrom.psl #202.461u 1.836s 3:24.46 99.9% 0+0k 0+0io 0pf+0w pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \ > altAndPatches.psl grep _alt altAndPatches.psl > altSeqLiftOver.psl grep _fix altAndPatches.psl > fixSeqLiftOver.psl # Load tables hgLoadPsl grcHhh38 -table=altSeqLiftOverPsl altSeqLiftOver.psl hgLoadPsl grcHhh38 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl # Make chrom-to-alt PSL file for genbank process. ln -f -s `pwd`/chromToAlt.psl \ /hive/data/genomes/grcHhh38/jkStuff/grcHhh38.p12.alt.psl # Make a liftOver chain file for mapping annotations on main chroms to new patch sequences # exclude alts that were already in grcHhh38 before p12 cut -f 1 ../../chrom.sizes.pre.p12 | grep _ \ | grep -vwf - chromToAlt.psl \ | pslToChain stdin stdout \ | chainScore stdin ../../grcHhh38.2bit{,} ../../jkStuff/grcHhh38.mainToPatch.p12.over.chain #52.068u 1.626s 0:54.43 98.6% 0+0k 15952+0io 2pf+0w # Make bigPsl so we don't have to bother with seq* and ext* tables? takes many hours... twoBitToFa /hive/data/genomes/grcHhh38/grcHhh38.2bit stdout \ | pslToBigPsl -fa=stdin altSeqLiftOver.psl stdout \ | sort -k1,1 -k2n,2n > altSeqLiftOver.bigPslInput #122.847u 338.099s 8:50.12 86.9% 0+0k 67303264+247358392io 1pf+0w bedToBigBed -type=bed12+13 -tab -as=$HOME/kent/src/hg/lib/bigPsl.as \ altSeqLiftOver.bigPslInput /hive/data/genomes/grcHhh38/chrom.sizes \ altSeqLiftOver.bb #14435.151u 103.362s 4:03:04.77 99.6% 0+0k 8+0io 0pf+0w # Yikes, four hours!! ln -sf `pwd`/altSeqLiftOver.bb /gbdb/grcHhh38/bbi/altSeqLiftOver.bb # Clean up giant intermediate file rm altSeqLiftOver.bigPslInput ############################################################################## # Extend wgEncodeReg bigWig tracks (DONE 18-06-28 angie) #NOTE: this has not been liftOver'd to original alts! # Use existing /gbdb/ bigWig files (which have already had p11 added): for dir in /gbdb/grcHhh38/bbi/wgEncodeReg/{*Mark*,*Txn}; do composite=`basename $dir` echo $composite mkdir -p /hive/data/genomes/grcHhh38/bed/$composite cd /hive/data/genomes/grcHhh38/bed/$composite for f in $dir/wg*.bigWig; do track=`basename $f .bigWig` ~/kent/src/hg/utils/liftOverBigWigToPatches $f \ /hive/data/genomes/grcHhh38/jkStuff/grcHhh38.mainToPatch.p12.over.chain \ /hive/data/genomes/grcHhh38/chrom.sizes \ $track.plusP12.bigWig & done wait mkdir -p /gbdb/grcHhh38/bbi/wgEncodeReg/$composite for f in $dir/wg*.bigWig; do track=`basename $f .bigWig` ln -sf `pwd`/$track.plusP12.bigWig \ /gbdb/grcHhh38/bbi/wgEncodeReg/$composite/$track.bigWig done done ############################################################################## # Extend wgEncodeRegDnase (DONE 18-06-28 angie) #NOTE: this has not been liftOver'd to original alts! cd /hive/data/genomes/grcHhh38/bed/wgEncodeRegDnase origFile=/hive/data/genomes/hg38/bed/wgEncodeRegDnase/clusters/uwEnc2DnaseClustered.bed liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/grcHhh38/jkStuff/grcHhh38.mainToPatch.p12.over.chain \ wgEncodeRegDnaseClustered.p12.bed /dev/null hgLoadBed -oldTable -type=bed5+ grcHhh38 wgEncodeRegDnaseClustered \ wgEncodeRegDnaseClustered.p12.bed ############################################################################## # Extend wgEncodeRegTfbsClusteredV3 (DONE 18-06-28 angie) #NOTE: this has not been liftOver'd to original alts! cd /hive/data/genomes/grcHhh38/bed/wgEncodeRegTfbsClusteredV3 origFile=/hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3/hg38.wgEncodeRegClusteredV3.bed liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/grcHhh38/jkStuff/grcHhh38.mainToPatch.p12.over.chain \ wgEncodeRegTfbsClusteredV3.p12.bed /dev/null hgLoadBed -oldTable -type=bed5+ grcHhh38 wgEncodeRegTfbsClusteredV3 \ wgEncodeRegTfbsClusteredV3.p12.bed ############################################################################## # Extend GTEX GENE (DONE 18-06-28 angie) # I'm not really sure what file(s) are the true source of the latest hg38 GTEX Gene tables, # so I'll just work from the tables. mkdir /hive/data/genomes/grcHhh38/bed/gtex.p12 cd /hive/data/genomes/grcHhh38/bed/gtex.p12 # There is actually no bin column in gtexGene. hgsql grcHhh38 -NBe 'select * from gtexGene' > gtexGene.main.bed liftOver -multiple -bedPlus=6 -noSerial gtexGene.main.bed \ /hive/data/genomes/grcHhh38/jkStuff/grcHhh38.mainToPatch.p12.over.chain \ gtexGene.p12.bed /dev/null sort -k1,1 -k2n,2n gtexGene.main.bed gtexGene.p12.bed \ | hgLoadBed -noBin -type=bed6+ -sqlTable=$HOME/kent/src/hg/lib/gtexGeneBed.sql -renameSqlTable \ grcHhh38 gtexGene stdin # gtexGeneModel does have a bin. hgsql grcHhh38 -NBe 'select * from gtexGeneModel' | cut -f 2- > gtexGeneModel.main.gp liftOver -multiple -genePred gtexGeneModel.main.gp \ /hive/data/genomes/grcHhh38/jkStuff/grcHhh38.mainToPatch.p12.over.chain \ gtexGeneModel.p12.gp /dev/null sort -k2,2 -k3n,3n gtexGeneModel.main.gp gtexGeneModel.p12.gp \ | hgLoadGenePred grcHhh38 gtexGeneModel stdin ############################################################################# # Extend cytoBand{,Ideo} (DONE 18-06-28 angie) mkdir /hive/data/genomes/grcHhh38/bed/cytoBand.p12 cd /hive/data/genomes/grcHhh38/bed/cytoBand.p12 tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcH38P12/chrom.sizes \ > cytoBand.p12.tab hgLoadSqlTab -oldTable grcHhh38 cytoBand - cytoBand.p12.tab hgLoadSqlTab -oldTable grcHhh38 cytoBandIdeo - cytoBand.p12.tab ############################################################################## # OMIM tracks (TODO 18-06-? angie) # the otto process builds the omim* tables; edit otto/omim/buildOmimTracks.sh to make sure # the most recent dbSNP version is listed for the db. After the snpNNN table is updated to # include patch sequences, the next otto update will include patches. # omimGene2 is still using refGene, but I think it would be better if it used ncbiRefSeqCurated # if it exists. At least, we would get more mappings :) e.g. KAT6B is mapped to fix by # ncbi but not blat. # TODO: OMIM Genes needs liftOver to new alts and fixes (or redo from ncbiRefSeq). # OMIM Phenotypes needs liftOvers to all alts and fixes. Sometimes it spans a region larger # than an alt/fix, so maybe lower the percentage that has to map? ############################################################################# # DBSNP B151 / SNP151 (TODO 18-? angie) # b151 is for GRCh38.p7 (orgDir human_9606_b151_GRCh38p7) so it doesn't have # sequences adding in p8 and later. # Make a liftOver chain from main chromosomes to patches added after p7. # Do the liftOver for all positional snp151* tables, load with -oldTable #############################################################################