# for emacs: -*- mode: sh; -*- # http://www.ncbi.nlm.nih.gov/genome/82 # http://www.ncbi.nlm.nih.gov/bioproject/12555 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAFC01 # http://www.ncbi.nlm.nih.gov/genome/assembly/313728/ # file template copied from bosTau6.txt # Assembly Info: # DATE: 25-Oct-2011 # ORGANISM: Bos taurus # TAXID: 9913 # ASSEMBLY LONG NAME: Btau_4.6.1 # ASSEMBLY SHORT NAME: Btau_4.6.1 # ASSEMBLY SUBMITTER: Bovine Genome Sequencing Consortium # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000003205.4 # # FTP-RELEASE DATE: 03-Nov-2011 # Bos taurus (NCBI Project ID: 12555, Accession: GCA_000003205.4) # by Bovine Genome Sequencing and Analysis Consortiumn # assembly] sequence: # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Bos_taurus/Btau_4.6.1/ # Project 12555 # url: # http://www.ncbi.nlm.nih.gov/nuccore/AAFC00000000 # chrM: AY526085 # http://www.ncbi.nlm.nih.gov/nuccore/AY526085.1 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAFC00 # 7X WGS coverage ########################################################################## # Download sequence (DONE - 2011-05-02 Chin) mkdir /hive/data/genomes/bosTau7 cd /hive/data/genomes/bosTau7 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Bos_taurus/Btau_4.6.1/*" # FINISHED --2011-12-16 15:36:00-- # Downloaded: 244 files, 2.1G in 30m 24s (1.17 MB/s) # Read ASSEMBLY_INFO # measure total sequence in this assembly faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa \ Primary_Assembly/unplaced_scaffolds/FASTA/un*.fa.gz # 2981103241 bases (176446405 N's 2804656836 real 2804656836 upper # 0 lower) in 11691 sequences in 32 files # Total size: mean 254991.3 sd 4719241.6 min 152 (gi|346076751|gb|JH125284.1|) # max 161428367 (gi|346683426|gb|CM000177.5|) median 8572 # N count: mean 15092.5 sd 280692.5 # U count: mean 239898.8 sd 4441763.7 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real mkdir ucscChr # stay at genbank directory # fixup the accession names to become UCSC chrom names export S=Primary_Assembly/assembled_chromosomes cat ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" zcat ${S}/AGP/chr${C}.comp.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz done export S=Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC cat ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" echo ">chr${C}" > ucscChr/chr${C}.fa zcat ${S}/FASTA/chr${C}.fa.gz | grep -v "^>" >> ucscChr/chr${C}.fa gzip ucscChr/chr${C}.fa & done # Check them with faSize faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz # 2673141463 bases (159233025 N's 2513908438 real 2513908438 upper # 0 lower) in 31 sequences in 31 files faSize ucscChr/chr*.fa.gz # 2660906405 bases (20740230 N's 2640166175 real 2640166175 upper # 0 lower) in 30 sequences in 30 files # For unplaced scalfolds, named them as chrUn_xxxxxxxx # where xxxxxx is the original access id as: chrUn_JH121286 # The ".1" at the end (if any) need to be filter out since # MySQL does not allow "." as part of the table name and # will casue problems in genbank task step later export S=Primary_Assembly/unplaced_scaffolds zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > ucscChr/chrUn.agp # append the gap records zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \ | sed -e "s/^/chrUn_/" -e "s/\.1//" >> ucscChr/chrUn.agp gzip ucscChr/chrUn.agp & zcat ${S}/FASTA/unplaced.scaf.fa.gz \ | sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##" -e "s/\.1//" \ | gzip > ucscChr/chrUn.fa.gz # about 11660 sequences in the unplaced zcat ucscChr/chrUn.fa.gz | grep "^>" | wc # 11660 11660 46640 # Check them with faSize faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 307961778 bases (17213380 N's 290748398 real 290748398 upper # 0 lower) in 11660 sequences in 1 files faSize ucscChr/chrUn.fa.gz # 307961778 bases (17213380 N's 290748398 real 290748398 upper # 0 lower) in 11660 sequences in 1 files # N50 cd /hive/data/genomes/bosTau7/ n50.pl chrom.sizes # reading: chrom.sizes # contig count: 11692, total size: 2981119579, one half size: 1490559789 # cumulative N50 count contig contig size # 1444820354 12 chrX 88654062 # 1490559789 one half size # 1529939826 13 chr12 85119472 cat << '_EOF_' > bosTau7.config.ra # Config parameters for makeGenomeDb.pl: db bosTau7 clade mammal genomeCladePriority 31 scientificName Bos taurus commonName Cow assemblyDate OCt. 2011 assemblyLabel Baylor Btau_4.6.1 assemblyShortLabel Baylor Btau_4.6.1 orderKey 236 mitoAcc AY526085 fastaFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.fa.gz agpFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.agp.gz # qualFiles none dbDbSpeciesDir cow photoCreditURL http://www.genome.gov/10005141 photoCreditName NHGRI Press Photos ncbiGenomeId 82 ncbiAssemblyId 313728 ncbiAssemblyName Baylor Btau_4.6.1 ncbiBioProject 12555 genBankAccessionID GCA_000003205.4 taxId 9913 '_EOF_' ######################################################################### # Initial database build (DONE - 2012-01-03 - Chin) cd /hive/data/genomes/bosTau7 cat << '_EOF_' > bosTau7.config.ra # Config parameters for makeGenomeDb.pl: db bosTau7 clade mammal genomeCladePriority 31 scientificName Bos taurus commonName Cow assemblyDate OCt. 2011 assemblyLabel Baylor Btau_4.6.1 assemblyShortLabel Btau_4.6.1 orderKey 236 mitoAcc AY526085 fastaFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.fa.gz agpFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.agp.gz # qualFiles none dbDbSpeciesDir cow photoCreditURL http://www.genome.gov/10005141 photoCreditName NHGRI Press Photos ncbiGenomeId 82 ncbiAssemblyId 313728 ncbiAssemblyName Baylor Btau_4.6.1 ncbiBioProject 12555 genBankAccessionID GCA_000003205.4 taxId 9913 '_EOF_' # << happy emacs time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev bosTau7.config.ra \ > makeGenomeDb.log 2>&1 & # real 23m34.828s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/bosTau7.unmasked.2bit /gbdb/bosTau7/bosTau7.2bit # Per instructions in makeGenomeDb.log: # - replace *** use actual url for project etc.. in all html files #Search for '***' notes in each file in and make corrections (sometimes the #files used for a previous assembly might make a better template): # description.html /cluster/data/bosTau7/html/{trackDb.ra,gap.html,gold.html} # #Then copy these files to your ~/kent/src/hg/makeDb/trackDb/cow/bosTau7 # - cd ~/kent/src/hg/makeDb/trackDb # - edit makefile to add bosTau7 to DBS. # - git add cow/bosTau7/*.{ra,html} # - git commit -m "Added bosTau7 to DBS." makefile # - git commit -m "Initial descriptions for bosTau7." cow/bosTau7/*.{ra,html} # - git pull; git push # - Run make update DBS=bosTau7 and make alpha when done. # - (optional) Clean up /cluster/data/bosTau7/TemporaryTrackDbCheckout # re-generated description html using update makeGenomeDb.pl and config.ra cd /hive/data/genomes/bosTau7 makeGenomeDb.pl -forceDescription -continue=trackDb bosTau7.config.ra ######################################################################### # RepeatMasker (DONE 2012-01-04 - Chin) mkdir /hive/data/genomes/bosTau7/bed/repeatMasker cd /hive/data/genomes/bosTau7/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -bigClusterHub=swarm -noSplit bosTau7 > do.log 2>&1 & # real 472m4.476s cat faSize.rmsk.txt # 2981119579 bases (176446405 N's 2804673174 real 1415188620 upper # 1389484554 lower) in 11692 sequences in 1 files # %46.61 masked total, %49.54 masked real ######################################################################### # simpleRepeats (DONE - 2012-01-05 - Chin) mkdir /hive/data/genomes/bosTau7/bed/simpleRepeat cd /hive/data/genomes/bosTau7/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev -bigClusterHub=swarm -smallClusterHub=memk \ bosTau7 > do.log 2>&1 & # real 39m2.413s cat fb.simpleRepeat # 40432561 bases of 2804801325 (1.442%) in intersection # add to the repeatMasker cd /hive/data/genomes/bosTau7 twoBitMask bosTau7.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau7.2bit # safe to ignore warnings about >=13 fields twoBitToFa bosTau7.2bit stdout | faSize stdin > bosTau7.2bit.faSize.txt cat bosTau7.2bit.faSize.txt # 2981119579 bases (176446405 N's 2804673174 real 1414674956 upper # 1389998218 lower) in 11692 sequences in 1 files # %46.63 masked total, %49.56 masked real # double check with featureBits featureBits -countGaps bosTau7 gap # 176318254 bases of 2981119579 (5.914%) in intersection rm /gbdb/bosTau7/bosTau7.2bit ln -s `pwd`/bosTau7.2bit /gbdb/bosTau7/bosTau7.2bit ######################################################################### # Marking *all* gaps - they are not all in the AGP file # (DONE - 2012-01-05 - Chin) mkdir /hive/data/genomes/bosTau7/bed/allGaps cd /hive/data/genomes/bosTau7/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../bosTau7.unmasked.2bit > findMotif.txt 2>&1 # real 0m42.321s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits bosTau7 -not gap -bed=notGap.bed # 2804801325 bases of 2804801325 (100.000%) in intersection featureBits bosTau7 allGaps.bed notGap.bed -bed=new.gaps.bed # 128151 bases of 2804801325 (0.005%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" bosTau7 | sort -n | tail -1 # 10242 # use tcsh and ctrl-c to create the here doc cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" bosTau7 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits bosTau7 other.bed # 128151 bases of 2804801325 (0.005%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad bosTau7 otherGap other.bed # Loaded 12612 elements of size 8 # adding this many: wc -l bed.tab # 12612 bed.tab # starting with this many hgsql -e "select count(*) from gap;" bosTau7 # 69725 hgsql bosTau7 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" bosTau7 # 82337 # == 69725 + 12612 ########################################################################## ## WINDOWMASKER (DONE 2012-01-05 Chin) mkdir /hive/data/genomes/bosTau7/bed/windowMasker cd /hive/data/genomes/bosTau7/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev bosTau7 > do.log 2>&1 & # real 191m59.940s # Masking statistics twoBitToFa bosTau7.wmsk.2bit stdout | faSize stdin # 2981119579 bases (176446405 N's 2804673174 real 1660551615 upper # 1144121559 lower) in 11692 sequences in 1 files # %38.38 masked total, %40.79 masked real twoBitToFa bosTau7.wmsk.sdust.2bit stdout | faSize stdin # 2981119579 bases (176446405 N's 2804673174 real 1645487915 upper # 1159185259 lower) in 11692 sequences in 1 files # %38.88 masked total, %41.33 masked real hgLoadBed bosTau7 windowmaskerSdust windowmasker.sdust.bed.gz # Reading windowmasker.sdust.bed.gz # Read 13173670 elements of size 3 from windowmasker.sdust.bed.gz # Sorted # Creating table definition for windowmaskerSdust # Saving bed.tab # Loading bosTau7 featureBits -countGaps bosTau7 windowmaskerSdust # 1335621689 bases of 2981119579 (44.803%) in intersection # eliminate the gaps from the masking featureBits bosTau7 -not gap -bed=notGap.bed # 2804673174 bases of 2804673174 (100.000%) in intersection time nice -n +19 featureBits bosTau7 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # real 4m52.720s # 1159185259 bases of 2804673174 (41.330%) in intersection # reload track to get it clean hgLoadBed bosTau7 windowmaskerSdust cleanWMask.bed.gz # Read 13181547 elements of size 4 from cleanWMask.bed.gz # Loading bosTau7 time featureBits -countGaps bosTau7 windowmaskerSdust # real 1m11.488s # 1159185259 bases of 2981119579 (38.884%) in intersection zcat cleanWMask.bed.gz \ | twoBitMask ../../bosTau7.unmasked.2bit stdin \ -type=.bed bosTau7.cleanWMSdust.2bit twoBitToFa bosTau7.cleanWMSdust.2bit stdout | faSize stdin \ > bosTau7.cleanWMSdust.faSize.txt cat bosTau7.cleanWMSdust.faSize.txt # 2981119579 bases (176446405 N's 2804673174 real 1645487915 upper # 1159185259 lower) in 11692 sequences in 1 files # %38.88 masked total, %41.33 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps bosTau7 rmsk windowmaskerSdust # 934428494 bases of 2981119579 (31.345%) in intersection ######################################################################## # Create kluster run files (DONE - 2012-01-05 - Chin) # numerator is bosTau7 gapless bases "real" as reported by: featureBits -noRandom -noHap bosTau7 gap # 159233025 bases of 2513924776 (6.334%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2513924776 / 2861349177 \) \* 1024 ( 2513924776 / 2861349177 ) * 1024 = 899.666140 # ==> use -repMatch=900 according to size scaled down from 1024 for human. # and rounded down to nearest 50 (runup 899.666140 to 900 in this case) cd /hive/data/genomes/bosTau7 blat bosTau7.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/bosTau7.11.ooc \ -repMatch=900 & # Wrote 37068 overused 11-mers to jkStuff/bosTau7.11.ooc mkdir /hive/data/staging/data/bosTau7 cp -p bosTau7.2bit jkStuff/bosTau7.11.ooc /hive/data/staging/data/bosTau7 cp -p chrom.sizes /hive/data/staging/data/bosTau7 # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' bosTau7 \ | sort -k7,7nr # most non-bridges gaps have size = 5000, follow pig's example (most # 100, but use 5000) # decide on a minimum gap for this break, use either 100 or 5000 will # generate 13387 liftOver rows, but if use 6000, only got 11703 rows. # so use 100 here to get more liftOver row. gapToLift -verbose=2 -minGap=100 bosTau7 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed cp -p jkStuff/nonBridged.lft \ /hive/data/staging/data/bosTau7/bosTau7.nonBridged.lft # ask cluster-admin to copy (evry time if any file changed) # /hive/data/staging/data/bosTau7 directory to # /scratch/data/bosTau7 on cluster nodes # before porceed to genbank step ############################################################################ # set this as defaultDb (DONE - 2012-01-06 - Chin) # and make this the default genome for Cow hgsql -e 'update defaultDb set name="bosTau7" where name="bosTau6";' \ hgcentraltest ######################################################################## # GENBANK AUTO UPDATE (DONE - 2012-01-09 - Chin) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Bos taurus 18945 1559562 13153 # edit etc/genbank.conf to add bosTau7 just before susScr2 # bosTau7 (Cow) bosTau7.serverGenome = /hive/data/genomes/bosTau7/bosTau7.2bit bosTau7.clusterGenome = /scratch/data/bosTau7/bosTau7.2bit bosTau7.ooc = /scratch/data/bosTau7/bosTau7.11.ooc bosTau7.lift = no bosTau7.perChromTables = no bosTau7.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} bosTau7.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} bosTau7.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} bosTau7.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} bosTau7.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} bosTau7.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} bosTau7.downloadDir = bosTau7 bosTau7.refseq.mrna.native.load = yes bosTau7.refseq.mrna.xeno.load = yes bosTau7.refseq.mrna.xeno.loadDesc = yes bosTau7.upstreamGeneTbl = refGene git add etc/genbank.conf git commit -m "Added bosTau7" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. Skipped screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial bosTau7 & # logFile: var/build/logs/2012.01.11-09:14:36.bosTau7.initalign.log # real 1258m43.818s # To re-do, rm the dir first: # /cluster/data/genbank/data/aligned/genbank.187.0/bosTau7 # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau7 & # logFile: var/dbload/hgwdev/logs/2012.01.12-10:33:25.dbload.log # real 58m49.817s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add bosTau7 to: # etc/align.dbs # etc/hgwdev.dbs git add etc/align.dbs git add etc/hgwdev.dbs git commit -m "Added bosTau7 - Cow" etc/align.dbs etc/hgwdev.dbs git push make etc-update ########################################################################## # BLATSERVERS ENTRY (DONE 2012-01-13 - Chin) # After getting a blat server assigned by the Blat Server Gods, # # bosTau7 (trans) on blatx port 17842 # bosTau7 (untrans) on blatx port 17843 hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("bosTau7", "blatx", "17842", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("bosTau7", "blatx", "17843", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # reset default position (DONE - 2012-01-13 - Chin) # Reset default position to an area contains RHO and a number of # other genes hgsql -e \ 'update dbDb set defaultPos="chr22:56,647,062-58,290,728" where name="bosTau7";' \ hgcentraltest ############################################################################ # ctgPos2 track - showing clone sequence locations on chromosomes # (DONE 2012-01-17 - Chin) # NOTE - create bosTau7 entry in all.joiner since this is a new species mkdir /hive/data/genomes/bosTau7/bed/ctgPos2 cd /hive/data/genomes/bosTau7/bed/ctgPos2 cat << '_EOF_' > agpToCtgPos2.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n"; exit 255; } my $agpFile = shift; open (FH, "<$agpFile") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my @a = split('\s+', $line); next if ($a[4] =~ m/^U$/); next if ($a[4] =~ m/^N$/); my $chrSize = $a[2]-$a[1]+1; my $ctgSize = $a[7]-$a[6]+1; die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize); printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4]; } close (FH); '_EOF_' # << happy emacs chmod 775 agpToCtgPos2.pl export S=../../genbank/Primary_Assembly/assembled_chromosomes cat ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` zcat ${S}/AGP/chr${C}.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab hgLoadSqlTab bosTau7 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab # add the track ctgPos2 to src/hg/makeDb/trackDb/cow/bosTau7/trackDb.ra # at src/makeDb/trackdb do "make update DBS=bosTau7" or/and "make alpha" # based on result of # hgsql -N -e "select type from ctgPos2;" bosTau7 | sort | uniq # W # prepare the src/hg/makeDb/trackDb/cow/bosTau7/ctgPos2.html ############################################################################ # running cpgIsland business (DONE -2012-01-17 - Chin) mkdir /hive/data/genomes/bosTau7/bed/cpgIsland cd /hive/data/genomes/bosTau7/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../bosTau7.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/bosTau7/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod 775 runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time para problems para status # then, kick it with para push # check it with plb # when all are done, para time shows: # Checking finished jobs # Completed: 11692 of 11692 jobs # CPU time in finished jobs: 201s 3.35m 0.06h 0.00d 0.000 y # IO & Wait Time: 31839s 530.65m 8.84h 0.37d 0.001 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 17s 0.28m 0.00h 0.00d # Submission to last job: 111s 1.85m 0.03h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed ssh hgwdev cd /hive/data/genomes/bosTau7/bed/cpgIsland hgLoadBed bosTau7 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Read 38217 elements of size 10 from cpgIsland.bed # Sorted # Creating table definition for cpgIslandExt # Saving bed.tab # Loading bosTau7 # cleanup rm -fr hardMaskedFa ########################################################################### # ornAna1 Platypus LASTZ/CHAIN/NET (DONE - 2012-01-18 - Chin) screen # use screen to control this job mkdir /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18 cd /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18 cat << '_EOF_' > DEF # Cow vs. Platypus BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Cow bosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/cluster/data/bosTau7/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=300 SEQ1_CHUNK=20000000 SEQ1_LAP=0 # QUERY: Platypus ornAna1 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -verbose=2 -chainLinearGap=loose > do.log 2>&1 & # real 1780m16.857s *** All done ! Elapsed time: 1780m17s *** Make sure that goldenPath/bosTau7/vsOrnAna1/README.txt is accurate. *** Add {chain,net}OrnAna1 tracks to trackDb.ra if necessary. # filter with doRecipBest.pl time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ bosTau7 ornAna1 > rbest.log 2>&1 # real 12m55.429s cat fb.bosTau7.chainOrnAna1Link.txt # 204121295 bases of 2804673174 (7.278%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastzOrnAna1.2012-01-18 lastz.ornAna1 mkdir /cluster/data/ornAna1/bed/blastz.bosTau7.swap cd /cluster/data/ornAna1/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18/DEF \ -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \ -swap -syntenicNet \ -chainLinearGap=loose -bigClusterHub=swarm > swap.log 2>&1 & # real 58m6.038s cat fb.ornAna1.chainBosTau7Link.txt # 190094136 bases of 1842236818 (10.319%) in intersection *** All done ! Elapsed time: 58m6s *** Make sure that goldenPath/ornAna1/vsBosTau7/README.txt is accurate. *** Add {chain,net}BosTau7 tracks to trackDb.ra if necessary. cd /cluster/data/ornAna1/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ##################################################################### # susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2012-01-20 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20 cd /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20 cat << '_EOF_' > DEF # Pig vs. Cow BLASTZ_M=50 # TARGET: Cow BosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Pig SusScr2 SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit SEQ2_LEN=/scratch/data/susScr2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1624m48.439s cat fb.bosTau7.chainSusScr2Link.txt # 1342940418 bases of 2804673174 (47.882%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s lastzSusScr2.2012-01-20 lastz.susScr2 # then swap mkdir /hive/data/genomes/susScr2/bed/blastz.bosTau7.swap cd /hive/data/genomes/susScr2/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 209m45.776s cat fb.susScr2.chainBosTau7Link.txt # 1443560932 bases of 2231298548 (64.696%) in intersection ######################################################################### # SWAP mm9 LASTZ (DONE - 2012-01-22 - Chin) # original alignment done at mm9.txt cd /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22 cat fb.mm9.chainBosTau7Link.txt # 695010371 bases of 2620346127 (26.524%) in intersection # Create link cd /hive/data/genomes/mm9/bed ln -s lastzBosTau7.2012-01-22 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.mm9.swap cd /hive/data/genomes/bosTau7/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 51m44.505s cat fb.bosTau7.chainMm9Link.txt # 711305079 bases of 2804673174 (25.361%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.mm9.swap lastz.mm9 ############################################################################ # SWAP hg19 LASTZ (DONE 2012-01-23 - Chin) # original alignment cd /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23 cat fb.hg19.chainBosTau7Link.txt # 1360887008 bases of 2897316137 (46.971%) in intersection # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.hg19.swap cd /hive/data/genomes/bosTau7/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 95m9.611s cat fb.bosTau7.chainHg19Link.txt # 1388551419 bases of 2804673174 (49.508%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.hg19.swap lastz.hg19 ############################################################################ # SWAP canFam2 LASTZ (DONE 2012-01-24 - Chin) # original alignment cd /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-2 cat fb.canFam2.chainBosTau7Link.txt # 1381869475 bases of 2384996543 (57.940%) in intersection # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap cd /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 86m11.258s cat fb.bosTau7.chainCanFam2Link.txt # 1458512906 bases of 2804673174 (52.003%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.canFam2.swap lastz.canFam2 ############################################################################ # SWAP rn4 LASTZ (DONE 2012-01-24 - Chin) cd /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24 cat fb.rn4.chainBosTau7Link.txt # 656136973 bases of 2571531505 (25.515%) in intersection # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.rn4.swap cd /hive/data/genomes/bosTau7/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 48m51.145 cat fb.bosTau7.chainRn4Link.txt # 668440773 bases of 2804673174 (23.833%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.rn4.swap lastz.rn4 ############################################################################ # SWAP monDom5 LASTZ (DONE 2012-01-26 - Chin) cd /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26 cat fb.monDom5.chainBosTau7Link.txt # 206928725 bases of 3501660299 (5.909%) in intersection # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.monDom5.swap cd /hive/data/genomes/bosTau7/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 59m33.272s cat fb.bosTau7.chainMonDom5Link.txt # 207518037 bases of 2804673174 (7.399%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.monDom5.swap lastz.monDom5 ######################################################################### # GENSCAN GENE PREDICTIONS (DONE 2012-01-31 - Chin) mkdir /hive/data/genomes/bosTau7/bed/genscan cd /hive/data/genomes/bosTau7/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/genscanlinux # create hard masked .fa files mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../bosTau7.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done # Generate a list file, genome.list, of all the hard-masked contig # chunks: find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list wc -l genome.list # 11692 genome.list # Run on small cluster (more mem than big cluster). ssh swarm cd /hive/data/genomes/bosTau7/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Create template file, template, for gensub2. For example (3-line # file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try para check ... etc... para time # Completed: 11690 of 11692 jobs # Crashed: 2 jobs # CPU time in finished jobs: 66494s 1108.23m 18.47h 0.77d 0.002 y # IO & Wait Time: 33892s 564.87m 9.41h 0.39d 0.001 y # Average job time: 9s 0.14m 0.00h 0.00d # Longest finished job: 3664s 61.07m 1.02h 0.04d # Submission to last job: 4007s 66.78m 1.11h 0.05d # Two jobs (chr19 and chr26) failed, re-run them: /cluster/bin/x86_64/gsBig hardMaskedFa/chr26.fa gtf/chr26.gtf -trans=pep/chr26.pep -subopt=subopt/chr26.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=200000 /cluster/bin/x86_64/gsBig hardMaskedFa/chr19.fa gtf/chr19.gtf -trans=pep/chr19.pep -subopt=subopt/chr19.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2200000 # this did not work, runs out of file handles ? find ./gtf -type f | xargs -n 256 endsInLf -zeroOk # Concatenate results: cd /hive/data/genomes/bosTau7/bed/genscan find ./gtf -type f | xargs cat > genscan.gtf find ./pep -type f | xargs cat > genscan.pep find ./subopt -type f | xargs cat > genscanSubopt.bed # Load into the database (without -genePredExt because no frame # info): # Don't load the Pep anymore -- redundant since it's from genomic. ssh hgwdev cd /hive/data/genomes/bosTau7/bed/genscan # to construct a local file with the genePred business: gtfToGenePred genscan.gtf genscan.gp # this produces exactly the same thing and loads the table: ldHgGene -gtf bosTau7 genscan genscan.gtf # Reading genscan.gtf # Read 47871 transcripts in 344386 lines in 1 files # 47871 groups 4673 seqs 1 sources 1 feature types # 47871 gene predictions hgLoadBed bosTau7 genscanSubopt genscanSubopt.bed # Read 518687 elements of size 6 from genscanSubopt.bed featureBits bosTau7 genscan # 58056267 bases of 2804673174 (2.070%) in intersection # previously: featureBits bosTau4 genscan # 58224594 bases of 2731830700 (2.131%) in intersection featureBits bosTau3 genscan # 59251085 bases of 2731807384 (2.169%) in intersection ############################################################################# # LIFTOVER TO bosTau4 (DONE - 2012-05-11 - Chin) ssh hgwdev cd /hive/data/genomes/bosTau7 ln -s jkStuff/bosTau7.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ bosTau7 bosTau4 > doLiftOverToBosTau4.log 2>&1 & # real 1567m22.050s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/ md5sum bosTau7ToBosTau4.over.chain.gz >> md5sum.txt popd ############################################################################# # LIFTOVER TO bosTau5 (DONE - 2012-05-11 - Chin) ssh hgwdev cd /hive/data/genomes/bosTau7 ln -s jkStuff/bosTau7.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ bosTau7 bosTau5 > doLiftOverToBosTau5.log 2>&1 & # real 1570m4.968s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/ md5sum bosTau7ToBosTau5.over.chain.gz >> md5sum.txt popd ############################################################################# # LIFTOVER TO bosTau6 (DONE - 2012-05-11 - Chin) ssh hgwdev cd /hive/data/genomes/bosTau7 ln -s jkStuff/bosTau7.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ bosTau7 bosTau6 > doLiftOverToBosTau6.log 2>&1 & # real 1570m6.579s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/ md5sum bosTau7ToBosTau6.over.chain.gz >> md5sum.txt popd ############################################################################# # LIFTOVER TO bosTau8 (DONE - 2014-10-17 - Steve) ssh hgwdev cd /hive/data/genomes/bosTau7 ln -s jkStuff/bosTau7.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ bosTau7 bosTau8 > doLiftOverToBosTau8.log 2>&1 # real 85m39.000s pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/ md5sum bosTau7ToBosTau8.over.chain.gz >> md5sum.txt popd ######################################################################### # all.joiner update, downloads and in pushQ - (DONE 2012-05-16 - Chin) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=bosTau7 -all all.joiner mkdir /hive/data/genomes/bosTau7/goldenPath cd /hive/data/genomes/bosTau7/goldenPath makeDownloads.pl bosTau7 > do.log 2>&1 # now ready for pushQ entry mkdir /hive/data/genomes/bosTau7/pushQ cd /hive/data/genomes/bosTau7/pushQ makePushQSql.pl bosTau7 > bosTau7.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/bosTau7/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/bosTau7/wib/quality.wib # WARNING: hgwdev does not have /gbdb/bosTau7/bbi/quality.bw # WARNING: bosTau7 does not have seq # WARNING: bosTau7 does not have extFile # # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # tableList # copy it to hgwbeta scp -p bosTau7.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < bosTau7.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly ######################################################################### # SWAP mm10 LASTZ (DONE - 2012-06-19 - Chin) # original alignment done at mm10.txt cd /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 cat fb.mm10.chainBosTau7Link.txt # 696498363 bases of 2652783500 (26.255%) in intersection # Create link cd /hive/data/genomes/mm10/bed ln -s lastzBosTau7.2012-03-21 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau7/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 77m58.759s cat fb.bosTau7.chainMm10Link.txt # 711923052 bases of 2804673174 (25.383%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ alpaca vicPac1 (DONE - 2012-06-19 - Chin) # establish a screen to control this job with a name to indicate # what it is screen -S bosTau7VicPac1 mkdir /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 cd /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # alpaca vs cow BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: cow BosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: alpaca VicPac1 SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=4000 BASE=/hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3089m45.519s cat fb.bosTau7.chainVicPac1Link.txt # 1259182233 bases of 2804673174 (44.896%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s lastzVicPac1.2012-06-19 lastz.vicPac1 # better to have reciprocal best for this one since it is low # coverage: cd /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 time doRecipBest.pl bosTau7 vicPac1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 110m1.192s # swap mkdir /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap cd /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 717m22.295s cat fb.vicPac1.chainBosTau7Link.txt # 1309266730 bases of 1922910435 (68.088%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/vicPac1/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ######################################################################### # SWAP canFam3 LASTZ (DONE - 2012-06-23 - Chin) # original alignment done at canFam3.txt cd /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23 cat fb.canFam3.chainBosTau7Link.txt # 1381966556 bases of 2392715236 (57.757%) in intersection # Create link cd /hive/data/genomes/canFam3/bed ln -s lastzBosTau7.2012-06-23 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap cd /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 121m44.022s cat fb.bosTau7.chainCanFam3Link.txt # 1456104306 bases of 2804673174 (51.917%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.canFam3.swap lastz.canFam3 ############################################################################## # LASTZ dolphin turTru2 (DONE 2012-06-19 - Chin) # establish a screen to control this job with a name to indicate # what it is screen -S bosTau7TurTru2 mkdir /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 cd /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dolphin vs cow BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Cow BosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dolphin TurTru2 SEQ2_DIR=/hive/data/genomes/turTru2/turTru2.2bit SEQ2_LEN=/hive/data/genomes/turTru2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=5000 BASE=/hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 7776m13.787s cat fb.bosTau7.chainTurTru2Link.txt # 1696154184 bases of 2804673174 (60.476%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s lastzTurTru2.2012-06-19 lastz.turTru2 # better to have reciprocal best for this one since it is low # coverage: cd /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 time doRecipBest.pl bosTau7 turTru2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # 65m6.795s # then swap mkdir /hive/data/genomes/turTru2/bed/blastz.bosTau7.swap cd /hive/data/genomes/turTru2/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 532m53.448s cat fb.turTru2.chainBosTau7Link.txt # 1625706583 bases of 2332402443 (69.701%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru2/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ############################################################################ # lastz White Rhino cerSim1 (DONE - 2012-10-24 - Hiram) screen -S bosTau7CerSim1 mkdir /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24 cd /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24 cat << '_EOF_' > DEF # Cow vs White Rhino BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_M=50 # TARGET: Cow BosTau7 SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LIMIT=50 SEQ1_LAP=10000 # QUERY: White Rhino CerSim1 SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl \ `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \ -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > do.log 2>&1 & # real 7139m16.881s cat fb.bosTau7.chainCerSim1Link.txt # 1628320914 bases of 2804673174 (58.057%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s lastzCerSim1.2012-10-24 lastz.cerSim1 # and the swap mkdir /hive/data/genomes/cerSim1/bed/blastz.bosTau7.swap cd /hive/data/genomes/cerSim1/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl \ /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24/DEF \ -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \ -swap -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > swap.log 2>&1 & # real 120m35.871s cat fb.cerSim1.chainBosTau7Link.txt # 1592093865 bases of 2366858012 (67.266%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cerSim1/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ############################################################################## # create ucscToINSDC name mapping (DONE - 2013-08-16 - Hiram) mkdir /hive/data/genomes/bosTau7/bed/ucscToINSDC cd /hive/data/genomes/bosTau7/bed/ucscToINSDC # copying these scripts from the previous load and improving them # with each instance ./translateNames.sh ./verifyAll.sh ./join.sh # verify the track link to INSDC functions ############################################################################## # Farm Animal QTL track (IN PROGRESS - 2014-01-07 - Pauline) mkdir /hive/data/genomes/bosTau7/bed/animalQtl cd /hive/data/genomes/bosTau7/bed/animalQtl wget http://www.animalgenome.org/QTLdb/tmp/QTL_Btau_4.6.ubed.txt.gz zcat QTL_Btau_4.6.ubed.txt.gz >> bosTau7.animalQtl.bed cut -f1-4 bosTau7.animalQtl.bed > bosTau7.animalQtl.bed4 hgLoadBed bosTau7 animalQtl bosTau7.animalQtl.bed4 checkTableCoords bosTau7 animalQtl # no results - yay! #added entries in trackDb for track and search #script to determine what to include in search rule: cat > findNames < #/tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' bosTau7 > /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' galGal4 >> /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' susScr3 >> /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' oviAri3 >> /tmp/bosTau7.animalQtl.name.txt export maxLen=`awk '{print length($0)}' /tmp/bosTau7.animalQtl.name.txt | sort -rn | head -1` echo "scan to column: $maxLen" export C=1 while [ $C -le $maxLen ]; do echo -n " $C: " sort -u /tmp/bosTau7.animalQtl.name.txt | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g' C=`echo $C | awk '{print $1+1}'` done #rm -f /tmp/bosTau7.animalQtl.name.txt mv /tmp/bosTau7.animalQtl.name.txt . EOF #script output: 1: 12346ABCDEFGHIJKLMNOPQRSTUVWYZcdfmpu 2: -/0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYacdeimort 3: %-01234568:ABCDEFGHIKLMNOPRSTUVWXYZ_abcdefnost 4: %-/0123456789:ABCDEFGHIKLMNOPRSTUVWYZ_abdehilnqrt 5: -0123456789:ABCDEFGHIJKLMNOPQRSTUVWYZ_abcdiklnorstw 6: %-.0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_bcdhimnoprstw 7: -0123456789:ABCDEFGHIKLMNOPRSTVW_abcent 8: 0123456789:ABCDEFGHILMNPRSTV_abeflo 9: 0123456789:BCEFMOPTW_adeklnt 10: 0123456789:CHLcdfirt 11: -0123456789:aln 12: 0123456789git 13: 0123456789: 14: 0123456789 15: 0123456789 16: 0123456789 17: 012346 18: 023456 #rule constructed based on script output - set termregex to this in trackDb #search stanza ^[a-zA-Z0-9]+[a-zA-Z0-9:/-]+[a-zA-Z0-9:/-_%]+ 614,23 Bot ############################################################################## # Farm Animal Cytoband track (IN PROGRESS - 2014-02-04 - Pauline) mkdir /hive/data/genomes/bosTau7/bed/cytoBand cd /hive/data/genomes/bosTau7/bed/cytoBand wget http://www.animalgenome.org/QTLdb/tmp/QTL_Btau_4.6.ubed.txt.gz zcat QTL_Btau_4.6.ubed.txt.gz >> bosTau7.animalQtl.bed ############################################################################ # DBSNP B138 / SNP138 (DONE 1/31/14 angie) # RedMine #12490 screen mkdir -p /hive/data/outside/dbSNP/138/cow cd /hive/data/outside/dbSNP/138/cow # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (cow_9913 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. cat > config.ra <& do.log & tail -f do.log # Some trial and error is always required to get the config.ra just right. # First stop: # *** This release contains more than one assembly label. # *** Please examine this list in case we need to exclude any of these: # #Bos_taurus_UMD_3.1 #Btau_4.6.1 # *** Add refAssemblyLabel to config.ra. If keeping all labels, it will # *** look like this: # #refAssemblyLabel Bos_taurus_UMD_3.1,Btau_4.6.1 # # *** Edit out any of those that are not included in bosTau7 (e.g. Celera). # *** Then restart this script with -continue=loadDbSnp . # bosTau7 is Btau_4.6.1 (UMD_3.1 is bosTau6): cat >> config.ra <>& do.log & tail -f do.log # Next stop: #*** 1 lines of b138_ContigInfo.bcp.gz contained contig names that #*** could not be mapped to chrom.size via their GenBank contig mappings; see #*** /hive/data/outside/dbSNP/138/cow/cantLiftUpSeqNames.txt . # #*** You must account for all 13387 contig_acc values in config.ra, #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output). #*** Check the auto-generated suggested.lft to see if it covers all #*** 13387 contigs; if it does, add 'liftUp suggested.lft' to config.ra. #*** Then run again with -continue=loadDbSnp . cat cantLiftUpSeqNames.txt #unlocalized-scaffold (no chr23_GL629717_random in chrom.sizes) NT_182078 chr23 # Let's look at the assembly we got from GenBank: pushd /hive/data/genomes/bosTau7/genbank/Primary_Assembly/ head unlocalized_scaffolds/unlocalized.chr2scaf ##Chromosome Unlocalized Scaffold (Accession.version) #23 GL629717.2 # -- and that's it! Apparently GL629717 is the only unlocalized contig. # Now check the first section in this file... seqeuences were picked up from # assembled_chromosomes and unplaced_scaffolds -- but not from unlocalized_scaffolds. # So I believe we have an orphan contig, and need to ignore dbSNP's mappings to it. popd cut -f 2 cant* > ignoreContig.txt cat >> config.ra <>& do.log & tail -f do.log # The script halted because of a malformed fasta header (note the use of ""..."" # instead of "..." in the alleles keyword): #Parse error line 18636997: #>gnl|dbSNP|rs137508789 rs=137508789|pos=51|len=101|taxid=9913|mol="genomic"|class=2|alleles=""N-,C/T, B-, R-,J-,C/T""|build=133|suspect=? # Tweak doDbSnp.pl to handle that, re-run with -debug to regenerate addToDbSnp.csh: ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=addToDbSnp -stop=addToDbSnp -debug # Now edit addToDbSnp.csh to wrap "if(0)" around the parts that completed, # and run the rest of the commands: ./addToDbSnp.csh >>& do.log & tail -f do.log # Now continue with the next step after addToDbSnp.csh, bigJoin: ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=bigJoin >>& do.log & tail -f do.log # Next stop: the same strangely formatted observed alleles above caused an # abort in snpNcbiToUcsc: #Error line 18001467 of ucscNcbiSnp.bed: Encountered something that doesn't fit observedIndelFormat: N-,C/T, B-, R-,J-,C/T #Command failed: #ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev nice /hive/data/outside/dbSNP/138/cow/translate.csh # Changed snpNcbiToUcsc to assign an ObservedWrongFormat exception instead of aborting. # Try again: ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=translate >>& do.log & tail -f do.log #Skipped 172559 snp mappings due to errors -- see snp138Errors.bed # That's a lot of errors... take a look and ask dbSNP? All are 'Missing observed'. # Spot-checking... rs210944221 turns up in rs_chNotOn.fas.gz which we exclude... # I think it's a bug for it to be there. It's on chr1 in Btau_4.6.1 (bosTau6) according to # http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs210944221 # Could it be in rs_chNotOn because it's not placed in UMD/bosTau7?? # Maybe excluding rs_chNotOn is not the right thing to do. # Yep, all spot-checks imply we should keep rs_chNotOn. # OK, fixed doDbSnp.pl; re-run from addToDbSnp on... mkdir -p `cat workingDir` ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=addToDbSnp >>& do.log & tail -f do.log # Here's the next place that strangely formatted observed alleles gum things up: #hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd bosTau7 snp138 -sqlTable=snp138.sql snp138.bed.gz #Reading snp138.bed.gz #Read 22370480 elements of size 25 from snp138.bed.gz #Sorted #Creating table definition for snp138 #Saving /data/tmp/loadBed_hgwdev_8e4_c23ae0.tab #ERROR: expecting r,g,b specification, found: 'N-,C/T, B-, R-,J-,C/T' # The observed column does not contain rgb values. Up to this point it didn't contain # commas either I guess... better add -type=bed6+ to the hgLoadBed command. # And better make hgLoadBed not assume itemRgb if bedN < 9! ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=load >>& do.log & tail -f do.log # *** All done! ############################################################################## # FILTER SNP138 (DONE 2/3/14 angie) cd /hive/data/outside/dbSNP/138/cow zcat snp138.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 2460543 #Common: 10223 #Flagged: 0 #leftover: 19899714 # 10,461 SNPs out of > 22M that meet the threshold for Common... enough to warrant a # Common SNPs track? Maybe; easy enough to drop it later if we decide it's too sparse. foreach f ({Mult,Common}.bed.gz) mv $f snp138$f end # Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ bosTau7 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz end ############################################################################## # DBSNP CODING ANNOTATIONS (138) (DONE 2/27/14 angie) # Originally done 2/3/14, but Brian L found hgc crash caused by script bug... cd /hive/data/outside/dbSNP/138/cow # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiFuncAnnotationsFixed.txt wc -l ncbiFuncAnnotationsFixed.txt #243649 ncbiFuncAnnotationsFixed.txt # How many & what kinds of function types? cut -f 6 ncbiFuncAnnotationsFixed.txt \ | sort -n | uniq -c # 55145 3 (coding-synon) # 119906 8 (cds-reference) # 1037 41 (nonsense) # 63668 42 (missense) # 146 43 (stop-loss) # 572 44 (frameshift) # 3175 45 (cds-indel) # 2/27/14: Re-ran from here after fixing bug in script: # Gather up multiple annotation lines into one line per {snp, gene, frame}: ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \ | liftUp snp138CodingDbSnp.bed suggested.lft warn stdin hgLoadBed bosTau7 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp138CodingDbSnp.bed #Read 123098 elements of size 11 from snp138CodingDbSnp.bed ############################################################################## # LIFTOVER TO bosTau9 (DONE - 2018-11-07 - Hiram) ssh hgwdev mkdir /hive/data/genomes/bosTau7/bed/blat.bosTau9.2018-11-07 cd /hive/data/genomes/bosTau7/bed/blat.bosTau9.2018-11-07 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/bosTau7/jkStuff/bosTau7.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ bosTau7 bosTau9) > do.log 2>&1 & # real 562m47.638s # verify the convert link on the test browser is now # active from bosTau7 to bosTau9 ##############################################################################