# for emacs: -*- mode: sh; -*- # This file describes browser build for the bosTau8 # Bos taurus -- UMD 3.1.1 (Dec. 2009) # Center for Bioinformatics and Computational Biology, University of Maryland # UMD 3.1.1 ############################################################################# # fetch sequence from new style download directory (DONE - 2014-09-05 - Steve) # NCBI has redesigned their FTP download site, new type of address # and naming scheme mkdir -p /hive/data/genomes/bosTau8/genbank cd /hive/data/genomes/bosTau8/genbank rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCA_000003055.4_Bos_taurus_UMD_3.1.1 ./ # measure what we have here: faSize GCA_000003055.4_Bos_taurus_UMD_3.1.1/GCA_000003055.4_Bos_taurus_UMD_3.1.1_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/*.fna.gz GCA_000003055.4_Bos_taurus_UMD_3.1.1/GCA_000003055.4_Bos_taurus_UMD_3.1.1_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/*.fna.gz # 2670028162 bases (20740269 N's 2649287893 real 2649287893 upper 0 lower) # in 3178 sequences in 31 files # Total size: mean 840159.9 sd 9115217.1 min 224 (GJ060330.1) # max 158337067 (GK000001.2) median 1356 # note that these totals do not include chrM since the GenBank ftp directory # did not include a non-nuclear directory ############################################################################# # fixup to UCSC naming scheme (DONE - 2014-09-05 - Steve) mkdir /hive/data/genomes/bosTau8/ucsc cd /hive/data/genomes/bosTau8/ucsc # evolving these scripts over time, could become source tree scripts cat << '_EOF_' > ucscCompositeAgp.pl #!/bin/env perl use strict; use warnings; my %accToChr; my $primary="../genbank/GCA_000003055.4_Bos_taurus_UMD_3.1.1/GCA_000003055.4_Bos_taurus_UMD_3.1.1_assembly_structure/Primary_Assembly"; open (FH, "<$primary/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat $primary/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat $primary/assembled_chromosomes/FASTA/chr${chrN}.fna.gz|") or die "can not read chr${chrN}.fna.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $primary="../genbank/GCA_000003055.4_Bos_taurus_UMD_3.1.1/GCA_000003055.4_Bos_taurus_UMD_3.1.1_assembly_structure/Primary_Assembly"; my $agpFile = "$primary/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "$primary/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chrUn.agp") or die "can not write to chrUn.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\./v/; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chrUn.fa") or die "can not write to chrUn.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/ .*//; $line =~ s/\./v/; $line =~ s/>//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x ucscCompositeAgp.pl unplaced.pl ./ucscCompositeAgp.pl ./unplaced.pl # download the mitochondrial sequence export mitoAcc=NC_006853.1 wget -O ${mitoAcc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc" echo ">chrM" > chrM.fa grep -v "^>" ${mitoAcc}.fa >> chrM.fa export mSize=`faCount chrM.fa | grep total | awk '{print $2}'` /bin/echo -e "chrM\t1\t$mSize\t1\tF\t$mitoAcc\t1\t$mSize\t+" > chrM.agp # verify this sequence is still the same as it was before # these translations faSize *.fa # 2670044500 bases (20740269 N's 2649304231 real 2649304231 upper 0 lower) # in 3179 sequences in 32 files # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1) # max 158337067 (chr1) median 1356 faSize chrM.fa # 16338 bases (0 N's 16338 real 16338 upper 0 lower) # in 1 sequences in 1 files ############################################################################# # Initial database build (DONE - 2014-09-10 - Steve) cd /hive/data/genomes/bosTau8 cat << '_EOF_' > bosTau8.config.ra # Config parameters for makeGenomeDb.pl: db bosTau8 clade mammal genomeCladePriority 31 scientificName Bos taurus commonName Cow assemblyDate Dec. 2009 assemblyLabel University of Maryland Bos_taurus_UMD_3.1.1 assemblyShortLabel UMD_3.1.1 orderKey 2378 mitoAcc none # mito included in assembly NC_006853.1 fastaFiles /hive/data/genomes/bosTau8/ucsc/chr*.fa agpFiles /hive/data/genomes/bosTau8/ucsc/chr*.agp # qualFiles none dbDbSpeciesDir cow photoCreditURL http://www.genome.gov/10005141 photoCreditName NHGRI Press Photos ncbiGenomeId 82 ncbiAssemblyId 189361 ncbiAssemblyName Bos_taurus_UMD_3.1.1 ncbiBioProject 33843 genBankAccessionID GCA_000003055.4 taxId 9913 '_EOF_' # << happy emacs # verify sequence and AGP are OK: time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp bosTau8.config.ra > agp.log 2>&1 # real 2m22.582s # then finish it off: time nice -n +19 makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db bosTau8.config.ra > db.log 2>&1 ########################################################################## # running repeat masker (DONE - 2014-09-10 - Steve) mkdir /hive/data/genomes/bosTau8/bed/repeatMasker cd /hive/data/genomes/bosTau8/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku bosTau8 > do.log 2>&1 # real 599m0.131s cat faSize.rmsk.txt # 2670044500 bases (20740269 N's 2649304231 real 1360109237 upper # 1289194994 lower) in 3179 sequences in 1 files # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1) # max 158337067 (chr1) median 1356 # %48.28 masked total, %48.66 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; featureBits -countGaps bosTau8 rmsk # 1289639153 bases of 2670044500 (48.300%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE 2014-09-12 - Steve) mkdir /hive/data/genomes/bosTau8/bed/simpleRepeat cd /hive/data/genomes/bosTau8/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ bosTau8 > do.log 2>&1 # real 5m50.579s cat fb.simpleRepeat # 33445177 bases of 2649307237 (1.262%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/bosTau8 twoBitMask bosTau8.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed bosTau8.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa bosTau8.2bit stdout | faSize stdin > faSize.bosTau8.2bit.txt cat faSize.bosTau8.2bit.txt # 2670044500 bases (20740269 N's 2649304231 real 1359475670 upper # 1289828561 lower) in 3179 sequences in 1 files # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1) # max 158337067 (chr1) median 1356 # %48.31 masked total, %48.69 masked real rm /gbdb/bosTau8/bosTau8.2bit ln -s `pwd`/bosTau8.2bit /gbdb/bosTau8/bosTau8.2bit ########################################################################## ## WINDOWMASKER (DONE - 2014-09-12 - Steve) mkdir /hive/data/genomes/bosTau8/bed/windowMasker cd /hive/data/genomes/bosTau8/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev bosTau8 > do.log 2>&1 # real 165m31.022s # Masking statistics cat faSize.bosTau8.cleanWMSdust.txt # 2670044500 bases (20740269 N's 2649304231 real 1560587113 upper # 1088717118 lower) in 3179 sequences in 1 files # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1) # max 158337067 (chr1) median 1356 # %40.78 masked total, %41.09 masked real cat fb.bosTau8.rmsk.windowmaskerSdust.txt # 862982888 bases of 2670044500 (32.321%) in intersection ########################################################################## # cpgIslands - (DONE - 2014-09-12 - Steve) mkdir /hive/data/genomes/bosTau8/bed/cpgIslands cd /hive/data/genomes/bosTau8/bed/cpgIslands time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku bosTau8 > do.log 2>&1 # real 14m36.983s cat fb.bosTau8.cpgIslandExt.txt # 24127795 bases of 2649307237 (0.911%) in intersection ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2014-09-12 - Steve) mkdir /hive/data/genomes/bosTau8/bed/cpgIslandsUnmasked cd /hive/data/genomes/bosTau8/bed/cpgIslandsUnmasked # run stepwise so the loading can be done in a different table time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/bosTau8/bosTau8.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku bosTau8 > do.log 2>&1 # real 36m46.811s cat fb.bosTau8.cpgIslandExtUnmasked.txt # 32916929 bases of 2649307237 (1.242%) in intersection ############################################################################# # cytoBandIdeo - (NEED more work 2014-09-12 - Steve) mkdir /hive/data/genomes/bosTau8/bed/cytoBand cd /hive/data/genomes/bosTau8/bed/cytoBand makeCytoBandIdeo.csh bosTau8 ######################################################################### # genscan - (DONE 2014-09-12 - Steve) mkdir /hive/data/genomes/bosTau8/bed/genscan cd /hive/data/genomes/bosTau8/bed/genscan time doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku bosTau8 > do.log 2>&1 # real 65m53.319s cat fb.bosTau8.genscan.txt # 54695662 bases of 2649307237 (2.065%) in intersection cat fb.bosTau8.genscanSubopt.txt # 50492196 bases of 2649307237 (1.906%) in intersection ######################################################################## # Create kluster run files (DONE - 2014-10-10 - Steve) # numerator is bosTau8 gapless bases "real" as reported by: featureBits -noRandom -noHap bosTau8 gap # 20737263 bases of 2640185480 (0.785%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2640185480 / 2861349177 \) \* 1024 ( 2640185480 / 2861349177 ) * 1024 = 944.851454 # ==> use -repMatch=900 according to size scaled down from 1024 for human. # and rounded down to nearest 50 (runup 899.666140 to 900 in this case) cd /hive/data/genomes/bosTau8 blat bosTau8.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/bosTau8.11.ooc \ -repMatch=900 & # Wrote 33613 overused 11-mers to jkStuff/bosTau8.11.ooc # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' bosTau8 \ | sort -k7,7nr # most non-bridged gaps have size = 5000, follow pig's example (most # 100, but use 5000) # decide on a minimum gap for this break, use either 100 or 5000 will # generate 13387 liftOver rows, but if use 6000, only got 11703 rows. # so use 100 here to get more liftOver row. gapToLift -verbose=2 -minGap=100 bosTau7 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed ######################################################################## # GENBANK AUTO UPDATE (DONE - 2014-10-10 - Steve) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Bos taurus 19983 1559521 13243 # edit etc/genbank.conf to add bosTau8 just after bosTau7 # bosTau8 (Cow) bosTau8.serverGenome = /hive/data/genomes/bosTau8/bosTau8.2bit bosTau8.clusterGenome = /hive/data/genomes/bosTau8/bosTau8.2bit bosTau8.ooc = /hive/data/genomes/bosTau8/bosTau8.11.ooc bosTau8.lift = no bosTau8.perChromTables = no bosTau8.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} bosTau8.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} bosTau8.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} bosTau8.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} bosTau8.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} bosTau8.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} bosTau8.downloadDir = bosTau8 bosTau8.refseq.mrna.native.load = yes bosTau8.refseq.mrna.xeno.load = yes bosTau8.refseq.mrna.xeno.loadDesc = yes bosTau8.upstreamGeneTbl = refGene git add etc/genbank.conf git commit -m "Added bosTau8; refs #13852" git push # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. Skipped screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial bosTau8 # logFile: var/build/logs/2014.10.13-13:00:37.bosTau8.initalign.log # real 362m8.057s # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.bosTau8 # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau8 # logFile: var/dbload/hgwdev/logs/2014.10.14-09:00:05.bosTau8.dbload.log # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add bosTau8 to: # etc/align.dbs # etc/hgwdev.dbs git add etc/align.dbs git add etc/hgwdev.dbs git commit -m "Added bosTau8 - Cow" etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # SWAP hg38 LASTZ (DONE - 2014-10-15 - Steve) # original alignment done at hg38.txt cd /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15 cat fb.hg38.chainBosTau8Link.txt # 1401921010 bases of 3049335806 (45.975%) in intersection # Create link cd /hive/data/genomes/hg38/bed ln -s lastzBosTau8.2014-10-15 lastz.bosTau8 # running the swap mkdir /hive/data/genomes/bosTau8/bed/blastz.hg38.swap cd /hive/data/genomes/bosTau8/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 116m32.121s cat fb.bosTau8.chainHg38Link.txt # 1336307377 bases of 2649307237 (50.440%) in intersection cd /hive/data/genomes/bosTau8/bed ln -s blastz.hg38.swap lastz.hg38 ######################################################################### # SWAP mm10 LASTZ (DONE - 2014-10-15 - Steve) # original alignment done at mm10.txt cd /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15 cat fb.mm10.chainBosTau8Link.txt # 698722925 bases of 2652783500 (26.339%) in intersection # Create link cd /hive/data/genomes/mm10/bed ln -s lastzBosTau8.2014-10-15 lastz.bosTau8 # and the swap mkdir /hive/data/genomes/bosTau8/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau8/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 58m4.272s cat fb.bosTau8.chainMm10Link.txt # 687270584 bases of 2649307237 (25.942%) in intersection # Create link cd /hive/data/genomes/bosTau8/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LIFTOVER TO bosTau6 (DONE - 2014-10-21 - Steve) ssh hgwdev cd /hive/data/genomes/bosTau8 ln -s jkStuff/bosTau8.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ bosTau8 bosTau6 > doLiftOverToBosTau6.log 2>&1 # real 82m56.388s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau8/liftOver/ md5sum bosTau8ToBosTau6.over.chain.gz >> md5sum.txt popd ######################################################################### # LIFTOVER TO bosTau7 (DONE - 2014-10-21 - Steve) ssh hgwdev cd /hive/data/genomes/bosTau8 ln -s jkStuff/bosTau8.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ bosTau8 bosTau7 > doLiftOverToBosTau7.log 2>&1 # real 105m34.448s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau8/liftOver/ md5sum bosTau8ToBosTau7.over.chain.gz >> md5sum.txt popd ######################################################################### # all.joiner update, downloads and in pushQ - (DONE 2014-10-21 - Steve) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=bosTau8 -keys all.joiner joinerCheck -database=bosTau8 -tableCoverage all.joiner joinerCheck -database=bosTau8 -times all.joiner cd /hive/data/genomes/bosTau8 makeDownloads.pl bosTau8 > do.log 2>&1 # now ready for pushQ entry mkdir /hive/data/genomes/bosTau8/pushQ cd /hive/data/genomes/bosTau8/pushQ makePushQSql.pl bosTau8 > bosTau8.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/bosTau8/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/bosTau8/wib/quality.wib # WARNING: hgwdev does not have /gbdb/bosTau8/bbi/quality.bw # WARNING: bosTau8 does not have seq # WARNING: bosTau8 does not have extFile # copy it to hgwbeta scp -p bosTau8.pushQ.sql qateam@hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < bosTau8.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly ######################################################################### # uscsToINSDC table/track (DONE - 2014-10-21 - Hiram) # fixup the translateNames.sh file from rn6 work to use the new # genbank structure (copied over from rn6/bed/ucscToINSDC) ./translateNames.sh NC_006853.1 awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \ > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 16 # use the 16 in this sed sed -e "s/21/16/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab bosTau8 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords bosTau8 # should cover %100 entirely: featureBits -countGaps bosTau8 ucscToINSDC # 2670044500 bases of 2670044500 (100.000%) in intersection ######################################################################### # lastz Horse equCab2 (DONE - 2014-10-27 - Hiram) # a tuning run to determine lastz parameters, this process should move # out of this directory to a bosTau8/bed/lastz... directory: mkdir /hive/users/hiram/multiz/100way/bosTau8.equCab2 cd /hive/users/hiram/multiz/100way/bosTau8.equCab2 zcat /hive/data/genomes/equCab2/bed/genscan/genscan.pep.gz \ > equCab2.genscan.pep zcat /hive/data/genomes/equCab2/bed/genscan/genscan.gp.gz \ > equCab2.genscan.gp zcat /hive/data/genomes/bosTau8/bed/genscan/genscan.pep.gz \ > bosTau8.genscan.pep zcat /hive/data/genomes/bosTau8/bed/genscan/genscan.gp.gz \ > bosTau8.genscan.gp # determine a correspondence between genes in the two sequences: time (blat -prot -oneOff=1 bosTau8.genscan.pep equCab2.genscan.pep \ -out=maf bosTau8.equCab2.oneOff.maf) > blat.log 2>&1 # real 282m40.628s ../mafScoreSizeScan.pl bosTau8.equCab2.oneOff.maf > mafScoreSizeScan.list ave mafScoreSizeScan.list | grep "^Q3" | awk '{print $2}' \ | sed -e 's/.000000//' > mafScoreSizeScan.Q3 # with those regions identified, extract corresponding sequence to # use in the lastz tuning operation. Filter all the scores for the # top quartile: Q3=`cat mafScoreSizeScan.Q3` awk '$1 > '$Q3' && $2 > 100 && $7 > 80 && $8 > 80' mafScoreSizeScan.list \ > top.all.list cut -f3,4 top.all.list | sort > selected.topAll.list topCount=`cat selected.topAll.list | wc -l` echo "number of top scorers: $topCount" # number of top scorers: 3240 for N in 100 200 300 400 do cat top.all.list | randomLines -decomment stdin $N top.$N.list cut -f3,4 top.$N.list | sort > selected.top$N.list $TOP/selectedFasta.sh $target $query top$N ./tune.top$N.sh & /cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz_D \ --inferonly=/hive/users/hiram/multiz/100way/create_scores_file.control \ bosTau8.top${N}.fa equCab2.top${N}.fa \ | /hive/users/hiram/multiz/100way/expand_scores_file.py \ > bosTau8.equCab2.tuning.top${N}.txt 2>&1 done # scan the four result files to see the range in the matrix scores # hopefully not too wildly different: ../matrixSummary.pl # read 4 .txt files bosTau8.equCab2 A C G T averages 4 files bosTau8.equCab2 A 81 -119 -30 -133 C -119 100 -95 -30 G -30 -95 100 -119 T -133 -30 -119 81 A C G T ranges 4 files bosTau8.equCab2 A 12 13 4 11 C 13 0 31 4 G 4 31 0 13 T 11 4 13 12 A C G T ranges percent 4 files bosTau8.equCab2 A 14.7 10.9 13.0 8.2 C 10.9 0.0 32.6 13.0 G 13.0 32.6 -0.0 10.9 T 8.2 13.0 10.9 14.7 # looks good, using the results from the top400: mkdir /hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11 cd /hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11 cp -p /hive/users/hiram/multiz/100way/bosTau8.equCab2/bosTau8.equCab2.tuning.top400.txt \ ./bosTau8.equCab2.tuning.Q.txt cat << '_EOF_' > DEF # Horse pig vs Horse # parameters obtained from a tuning run of lastz_D # /hive/users/hiram/multiz/100way/bosTau8.equCab2/bosTau8.equCab2.tuning.top400.txt BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_T=2 BLASTZ_O=400 BLASTZ_E=30 BLASTZ_M=50 BLASTZ_X=880 BLASTZ_Y=3400 BLASTZ_Q=/hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11/bosTau8.equCab2.tuning.Q.txt # A C G T # A 88 -111 -31 -128 # C -111 100 -75 -31 # G -31 -75 100 -111 # T -128 -31 -111 88 # TARGET: Cow bosTau8 SEQ1_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit SEQ1_LEN=/hive/data/genomes/bosTau8/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LIMIT=40 SEQ1_LAP=10000 # QUERY: Horse equCab2 SEQ2_DIR=/hive/data/genomes/equCab2/equCab2.2bit SEQ2_LEN=/hive/data/genomes/equCab2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11 TMPDIR=/dev/shm '_EOF_' # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 365m26.314s cat fb.bosTau8.chainEquCab2Link.txt # 1463493515 bases of 2649307237 (55.241%) in intersection time (doRecipBest.pl bosTau8 equCab2 -buildDir=`pwd` -workhorse=hgwdev) \ > best.log 2>&1 & # real 59m58.918s # and the swap mkdir /hive/data/genomes/equCab2/bed/blastz.bosTau8.swap cd /hive/data/genomes/equCab2/bed/blastz.bosTau8.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 139m36.654s cat fb.equCab2.chainBosTau8Link.txt # 1497036779 bases of 2428790173 (61.637%) in intersection time (doRecipBest.pl equCab2 bosTau8 -buildDir=`pwd` -workhorse=hgwdev) \ > best.log 2>&1 # real 75m13.344s ######################################################################### ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## ############################################################################## # LASTZ mouse/bosTau8 sheep/oviAri3 - (DONE - 2015-01-20 - Hiram) mkdir /hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20 cd /hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20 cp -p \ /hive/users/hiram/multiz/100way/bosTau8.oviAri3/bosTau8.oviAri3.tuning.top400.txt \ ./bosTau8.oviAri3.tuning.Q.txt cat << '_EOF_' > DEF # Horse vs. Sheep # parameters obtained from a tuning run of lastz_D # /hive/users/hiram/multiz/100way/bosTau8.oviAri3/bosTau8.oviAri3.tuning.top400.txt BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_O=400 BLASTZ_E=30 BLASTZ_M=50 BLASTZ_X=1000 BLASTZ_Y=3400 BLASTZ_K=3000 BLASTZ_L=3000 BLASTZ_Q=/hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20/bosTau8.oviAri3.tuning.Q.txt # A C G T # A 100 -172 -55 -173 # C -172 95 -212 -55 # G -55 -212 95 -172 # T -173 -55 -172 100 # TARGET: Horse bosTau8 SEQ1_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit SEQ1_LEN=/hive/data/genomes/bosTau8/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Sheep oviAri3 SEQ2_DIR=/hive/data/genomes/oviAri3/oviAri3.2bit SEQ2_LEN=/hive/data/genomes/oviAri3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=40 SEQ2_LAP=0 BASE=/hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20 TMPDIR=/dev/shm '_EOF_' # << happy emacs time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 1312m54.887s cat fb.bosTau8.chainOviAri3Link.txt # 2236790379 bases of 2649307237 (84.429%) in intersection time (doRecipBest.pl -buildDir=`pwd` bosTau8 oviAri3) > rbest.log 2>&1 & # real 38m9.755s # and for the swap: mkdir /hive/data/genomes/oviAri3/bed/blastz.bosTau8.swap cd /hive/data/genomes/oviAri3/bed/blastz.bosTau8.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 477m36.613s cat fb.oviAri3.chainBosTau8Link.txt # 2217800123 bases of 2534335866 (87.510%) in intersection time (doRecipBest.pl -buildDir=`pwd` oviAri3 bosTau8) > rbest.log 2>&1 # real 44m43.976s ######################################################################### # GENEID GENE PREDICTIONS (DONE - 2015-06-26 - Hiram) ssh hgwdev mkdir /hive/data/genomes/bosTau8/bed/geneid cd /hive/data/genomes/bosTau8/bed/geneid wget --timestamping \ http://genome.crg.es/genepredictions/B.taurus/bosTau8/geneid_v1.4/bosTau8.geneid.prot wget --timestamping \ http://genome.crg.es/genepredictions/B.taurus/bosTau8/geneid_v1.4/bosTau8.geneid.gtf ldHgGene -gtf -genePredExt bosTau8 geneid bosTau8.geneid.gtf # Read 33831 transcripts in 268467 lines in 1 files # 33831 groups 1612 seqs 1 sources 3 feature types # 33831 gene predictions featureBits -countGaps bosTau8 geneid # 37012407 bases of 2670044500 (1.386%) in intersection featureBits -countGaps bosTau3 geneid # 44623578 bases of 2902896901 (1.537%) in intersection ########################################################################## # SGP GENE PREDICTIONS (DONE - 2015-07-20 - Hiram) ssh hgwdev mkdir /hive/data/genomes/bosTau8/bed/sgpGene cd /hive/data/genomes/bosTau8/bed/sgpGene for F in 00README bosTau8.Hs-Mm.sgp2 bosTau8.Hs-Mm.sgp2.cds \ bosTau8.Hs-Mm.sgp2.gff bosTau8.Hs-Mm.sgp2.gff3 bosTau8.Hs-Mm.sgp2.gtf \ bosTau8.Hs-Mm.sgp2.prot do wget --timestamping \ http://genome.crg.es/genepredictions/B.taurus/bosTau8/SGP2/hg38/${F} done ldHgGene -gtf -genePredExt bosTau8 sgpGene bosTau8.Hs-Mm.sgp2.gtf # Read 35519 transcripts in 297667 lines in 1 files # 35519 groups 364 seqs 1 sources 3 feature types # 35519 gene predictions featureBits -countGaps bosTau8 sgpGene # 36198709 bases of 2670044500 (1.356%) in intersection featureBits -countGaps bosTau3 sgpGene # 40658543 bases of 2902896901 (1.401%) in intersection ########################################################################## # LASTZ cow/bosTau8 killer whale/orcOrc1 - (DONE - 2016-06-01,02 - Hiram) mkdir /hive/data/genomes/bosTau8/bed/lastzOrcOrc1.2016-06-01 cd /hive/data/genomes/bosTau8/bed/lastzOrcOrc1.2016-06-01 printf '# cow vs. killer whale BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: cow bosTau8 SEQ1_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit SEQ1_LEN=/hive/data/genomes/bosTau8/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: killer whale orcOrc1 SEQ2_DIR=/hive/data/genomes/orcOrc1/orcOrc1.2bit SEQ2_LEN=/hive/data/genomes/orcOrc1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/bosTau8/bed/lastzOrcOrc1.2016-06-01 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 1439m45.464s cat fb.bosTau8.chainOrcOrc1Link.txt # 1670894199 bases of 2649307237 (63.069%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` bosTau8 orcOrc1) \ > rbest.log 2>&1 & # real 590m0.045s # and for the swap: mkdir /hive/data/genomes/orcOrc1/bed/blastz.bosTau8.swap cd /hive/data/genomes/orcOrc1/bed/blastz.bosTau8.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau8/bed/lastzOrcOrc1.2016-06-01/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 411m32.032s cat fb.orcOrc1.chainBosTau8Link.txt # 1659691513 bases of 2249582125 (73.778%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` orcOrc1 bosTau8) \ > rbest.log 2>&1 # real 515m19.362s ############################################################################ # DBSNP B148 / SNP148 (DONE 8/1/16 jcasper) # RedMine #16846 screen mkdir -p /hive/data/outside/dbSNP/148/cow_bosTau8 cd /hive/data/outside/dbSNP/148/cow_bosTau8 # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (cow_9913 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # Note that there are two cow entries - 9913 and 30522. 30522, however, # has B128 as its most recent build. # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. # Had to fetch # ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000003055.6.assembly.txt # manually and install it in the config file because NCBI's FTP server # appears to be having some problems with providing the right header # information (curl and wget on our server complain about it, anyway). cat > config.ra <& do.log & tail -f do.log # Some trial and error is always required to get the config.ra just right. # First stop: #*** 2 lines of b148_ContigInfo.bcp.gz contained contig names that #*** could not be mapped to chrom.size via their GenBank contig mappings; see #*** /hive/data/outside/dbSNP/148/cow_bosTau8/cantLiftUpSeqNames.txt . # #*** You must account for all 6337 contig_acc values in config.ra, #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output). #*** Check the auto-generated suggested.lft to see if it covers all #*** 6337 contigs; if it does, add 'liftUp suggested.lft' to config.ra. #*** Then run again with -continue=loadDbSnp . cat cantLiftUpSeqNames.txt #unplaced-scaffold (no chrUn_GJ058435v1 in chrom.sizes) NW_003099180 na #unplaced-scaffold (no chrUn_GJ058447v1 in chrom.sizes) NW_003099192 na # These GenBank contigs (GJ058435.1 and GJ058447.1) were added in the most # recent version of the Bos_taurus_UMD_3.1.1 assembly - the one with accession # number GCF_000003055.6. Our bosTau8 is build from GCF_000003055.5. # Handling this by dropping those two contigs from the build. tawk '{print $2}' cantLiftUpSeqNames.txt > ignoreDbSnpContigs.txt echo 'ignoreDbSnpContigsFile ignoreDbSnpContigs.txt' >> config.ra cat >> config.ra <> do.log 2>&1 & tail -f do.log # *** All done! zcat snp148.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 120755 #Common: 10469 #Flagged: 0 #leftover: 99100955 # 10,469 SNPs out of ~100M that meet the threshold for Common... that's # pretty sparse. Not sure it's worth pushing out as a separate subtrack. ############################################################################## # LIFTOVER TO bosTau9 (DONE - 2018-11-07 - Hiram) ssh hgwdev mkdir /hive/data/genomes/bosTau8/bed/blat.bosTau9.2018-11-07 cd /hive/data/genomes/bosTau8/bed/blat.bosTau9.2018-11-07 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/bosTau8/jkStuff/bosTau8.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ bosTau8 bosTau9) > do.log 2>&1 & # real 933m14.111s # verify the convert link on the test browser is now # active from bosTau8 to bosTau9 ##############################################################################