# for emacs: -*- mode: sh; -*- # DATE: 24-Aug-2011 # ORGANISM: Sus scrofa # TAXID: 9823 # ASSEMBLY LONG NAME: Sscrofa10.2 # ASSEMBLY SHORT NAME: Sscrofa10.2 # ASSEMBLY SUBMITTER: Swine Genome Sequencing Consortium # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # Assembly Accession: GCA_000003025.4 # FTP-RELEASE DATE: 08-Sep-2011 # http://www.ncbi.nlm.nih.gov/genome/84 # http://www.ncbi.nlm.nih.gov/genome/assembly/304498/ # http://www.ncbi.nlm.nih.gov/bioproject/13421 # http://www.ncbi.nlm.nih.gov/bioproject/28993 - chrMt NC_000845.1 # included in Sscrofa10 # http://www.ncbi.nlm.nih.gov/nuccore/326195957 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AEMK01 # Genome Coverage : 24x # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9823 # A different chrMt: # http://www.ncbi.nlm.nih.gov/bioproject/34563 - chrMt - NC_012095.1 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sus_scrofa/Sscrofa10.2/ ./ ########################################################################## # Download sequence (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/susScr3 cd /hive/data/genomes/susScr3 mkdir genbank cd genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sus_scrofa/Sscrofa10.2/ ./ # verify the size of the sequence here: faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/placed_scaffolds/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz # 5138998834 bases (296506722 N's 4842492112 real 4842492112 upper # 0 lower) in 9925 sequences in 41 files # Total size: mean 517783.3 sd 6498082.1 # min 100 (gi|333451020|gb|GL895433.1|) # max 315321322 (gi|345632377|gb|CM000812.4|) median 146507 ########################################################################## # fixup names for UCSC standards (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/susScr3/ucsc cd /hive/data/genomes/susScr3/ucsc ######################## Assembled Chromosomes cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, "|gzip -c >chr${chrN}.agp.gz") or die "can not write to chr${chrN}.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, "|gzip -c >chr${chrN}.fa.gz") or die "can not write to chr${chrN}.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl time ./toUcsc.pl # real 11m13.537s ######################## Unplaced scaffolds # verify we don't have any .acc numbers different from .1 zcat \ ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | cut -f1 | egrep "^GL|^AEMK" \ | sed -e 's/^GL[0-9][0-9]*//; s/^AEMK[0-9][0-9]*//' | sort | uniq -c # 111316 .1 # 130799 .2 # in that case, since they are not all .1, we need to use the full name # modified to remove the . # this is like the unplaced.pl script in other assemblies except it # does not add chrUn_ to the names since they are all just scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, "|gzip -c > unplaced.agp.gz") or die "can not write to unplaced.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\./-/; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, "|gzip -c > unplaced.fa.gz") or die "can not write to unplaced.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\|.*//; $line =~ s/\./-/; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 1m2.948s # verify nothing lost from original: time faSize *.fa.gz # 2808509378 bases (289538800 N's 2518970578 real 2518970578 upper # 0 lower) in 4582 sequences in 21 files # Total size: mean 612944.0 sd 9554154.6 min 100 (GL895433-1) # max 315321322 (chr1) median 25294 # real 1m3.821s # make sure none of the names got to be over 31 characers long: zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \ | sort -rn | uniq -c | head # 332154 10 # 3879 5 # 6787 4 ########################################################################## # Initial makeGenomeDb.pl (DONE - 2012-04-11 - Hiram) cd /hive/data/genomes/susScr3 cat << '_EOF_' > susScr3.config.ra # Config parameters for makeGenomeDb.pl: db susScr3 clade mammal genomeCladePriority 35 scientificName Sus scrofa commonName Pig assemblyDate Aug. 2011 assemblyLabel SGSC Sscrofa10.2 (NCBI project 13421, GCA_000003025.4, WGS AEMK01) assemblyShortLabel SGSC Sscrofa10.2 ncbiAssemblyName SGSC Sscrofa10.2 ncbiAssemblyId 304498 orderKey 2349 mitoAcc NC_000845 fastaFiles /hive/data/genomes/susScr3/ucsc/*.fa.gz agpFiles /hive/data/genomes/susScr3/ucsc/*.agp.gz # qualFiles none dbDbSpeciesDir pig taxId 9823 '_EOF_' # << happy emacs # verify sequence and agp are OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp susScr3.config.ra > agp.log 2>&1 # real 8m23.506s # verify OK: tail -1 agp.log # *** All done! (through the 'agp' step) # finish it off time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \ -dbHost=hgwdev susScr3.config.ra > db.log 2>&1 # real 24m50.869s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/susScr3.unmasked.2bit /gbdb/susScr3/susScr3.2bit # browser should function now ######################################################################### # running repeat masker (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/susScr3/bed/repeatMasker cd /hive/data/genomes/susScr3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek susScr3 > do.log 2>&1 & # real 990m24.340s cat faSize.rmsk.txt # 2808525991 bases (289538800 N's 2518987191 real 1404190109 upper # 1114797082 lower) in 4583 sequences in 1 files # Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1) # max 315321322 (chr1) median 25278 # %39.69 masked total, %44.26 masked real egrep -i "versi|relea" do.log # April 26 2011 (open-3-3-0) version of RepeatMasker # CC RELEASE 20110920; # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ featureBits -countGaps susScr3 rmsk # 29525418 bases of 2525996391 (1.169%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/susScr3/bed/simpleRepeat cd /hive/data/genomes/susScr3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ susScr3 > do.log 2>&1 & # real 9m41.939s cat fb.simpleRepeat # 29525418 bases of 2525996391 (1.169%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/susScr3 twoBitMask susScr3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed susScr3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa susScr3.2bit stdout | faSize stdin > faSize.susScr3.2bit.txt cat faSize.susScr3.2bit.txt # 2808525991 bases (289538800 N's 2518987191 real 1403003553 upper # 1115983638 lower) in 4583 sequences in 1 files # Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1) # max 315321322 (chr1) median 25278 # %39.74 masked total, %44.30 masked real rm /gbdb/susScr3/susScr3.2bit ln -s `pwd`/susScr3.2bit /gbdb/susScr3/susScr3.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/susScr3/bed/gap cd /hive/data/genomes/susScr3/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../susScr3.unmasked.2bit > findMotif.txt 2>&1 # real 0m4.977s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits susScr3 -not gap -bed=notGap.bed # 2525996391 bases of 2525996391 (100.000%) in intersection time featureBits susScr3 allGaps.bed notGap.bed -bed=new.gaps.bed wc -l new.gaps.bed # 14502 new.gaps.bed # 39382069 bases of 2403678276 (1.638%) in intersection # real 2m51.724s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" susScr3 | sort -n | tail -1 # 1382 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" susScr3 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps susScr3 other.bed # 702334 bases of 2808525991 (0.025%) in intersection wc -l other.bed # 14502 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad susScr3 otherGap other.bed # Read 14502 elements of size 8 from other.bed # starting with this many hgsql -e "select count(*) from gap;" susScr3 # 169119 hgsql susScr3 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" susScr3 # 183621 # == 14502 + 169119 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 susScr3 nonBridged.lift -bedFile=nonBridged.bed # one of these new ones is adjacent to an existing gap: # WARNING: overlapping gap at chr1:16262949-16312949(fragment) and chr1:16312949-16312957(other) # see example in danRer7.txt # are there non-bridged gaps here: hgsql -N -e "select bridge from gap;" susScr3 | sort | uniq -c # 169119 no # 14502 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/susScr3/bed/windowMasker cd /hive/data/genomes/susScr3/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev susScr3 > do.log 2>&1 & # real 224m8.502s # Masking statistics twoBitToFa susScr3.wmsk.2bit stdout | faSize stdin # 2808525991 bases (289538800 N's 2518987191 real 1636028761 upper # 882958430 lower) in 4583 sequences in 1 files # Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1) # max 315321322 (chr1) median 25278 # %31.44 masked total, %35.05 masked real twoBitToFa susScr3.wmsk.sdust.2bit stdout | faSize stdin # 2808525991 bases (289538800 N's 2518987191 real 1622625959 upper # 896361232 lower) in 4583 sequences in 1 files # Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1) # max 315321322 (chr1) median 25278 # %31.92 masked total, %35.58 masked real hgLoadBed susScr3 windowmaskerSdust windowmasker.sdust.bed.gz # Read 13770155 elements of size 3 from windowmasker.sdust.bed.gz featureBits -countGaps susScr3 windowmaskerSdust # 1185785055 bases of 2808525991 (42.221%) in intersection # eliminate the gaps from the masking featureBits susScr3 -not gap -bed=notGap.bed # 2525294057 bases of 2525294057 (100.000%) in intersection time nice -n +19 featureBits susScr3 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 902560289 bases of 2525294057 (35.741%) in intersection # real 3m46.190s # reload track to get it clean hgLoadBed susScr3 windowmaskerSdust cleanWMask.bed.gz # Read 13751189 elements of size 4 from cleanWMask.bed.gz featureBits -countGaps susScr3 windowmaskerSdust # 902560289 bases of 2808525991 (32.136%) in intersection zcat cleanWMask.bed.gz \ | twoBitMask ../../susScr3.unmasked.2bit stdin \ -type=.bed susScr3.cleanWMSdust.2bit twoBitToFa susScr3.cleanWMSdust.2bit stdout | faSize stdin \ > susScr3.cleanWMSdust.faSize.txt cat susScr3.cleanWMSdust.faSize.txt # 2808525991 bases (289538800 N's 2518987191 real 1622625959 upper # 896361232 lower) in 4583 sequences in 1 files # Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1) # max 315321322 (chr1) median 25278 # %31.92 masked total, %35.58 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps susScr3 rmsk windowmaskerSdust # 660954442 bases of 2808525991 (23.534%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-11 - Hiram) # since rmsk masks so very little of the genome, use the clean WM mask # on the genome sequence # cd /hive/data/genomes/susScr3 # twoBitMask -add bed/windowMasker/susScr3.cleanWMSdust.2bit \ # bed/simpleRepeat/trfMask.bed susScr3.2bit # safe to ignore the warnings about BED file with >=13 fields # twoBitToFa susScr3.2bit stdout | faSize stdin > faSize.susScr3.txt # cat faSize.susScr3.txt # 927696114 bases (111611440 N's 816084674 real 607935500 upper # 208149174 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %22.44 masked total, %25.51 masked real # create symlink to gbdb # rm /gbdb/susScr3/susScr3.2bit # ln -s `pwd`/susScr3.2bit /gbdb/susScr3/susScr3.2bit ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/susScr3/bed/cpgIslands cd /hive/data/genomes/susScr3/bed/cpgIslands time doCpgIslands.pl susScr3 > do.log 2>&1 # real 16m36.325s cat fb.susScr3.cpgIslandExt.txt # 28059061 bases of 2525294057 (1.111%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/susScr3/bed/genscan cd /hive/data/genomes/susScr3/bed/genscan time doGenscan.pl susScr3 > do.log 2>&1 # real 526m12.532s # one failing job: (as found via 'para problems' on swarm) ./runGsBig.csh chr1 000 gtf/000/chr1.gtf pep/000/chr1.pep subopt/000/chr1.bed # rerun at a window size of 2000000 worked out OK # real 116m26.816s time doGenscan.pl -continue=makeBed susScr3 > makeBed.log 2>&1 # real 22m49.789s cat fb.susScr3.genscan.txt # 54701861 bases of 2525294057 (2.166%) in intersection cat fb.susScr3.genscanSubopt.txt # 53271634 bases of 2525294057 (2.110%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2518987191 / 2897316137 \) \* 1024 # ( 2518987191 / 2897316137 ) * 1024 = 890.286997 # round up to 900 (susScr2 was 750) cd /hive/data/genomes/susScr3 time blat susScr3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/susScr3.11.ooc -repMatch=900 # Wrote 27633 overused 11-mers to jkStuff/susScr3.11.ooc # susScr2 had: Wrote 31605 overused 11-mers to jkStuff/susScr2.11.ooc # real 1m33.117s # there are non-bridged gaps, create lift file needed for genbank hgsql -N -e "select bridge from gap;" susScr3 | sort | uniq -c # 169119 no # 14502 yes cd /hive/data/genomes/susScr3/jkStuff gapToLift susScr3 susScr3.nonBridged.lift -bedFile=susScr3.nonBridged.bed # this assembly has gaps abutting each other which produces warnings # from this gapToLift program. # largest non-bridged contig: awk '{print $3-$2,$0}' susScr3.nonBridged.bed | sort -nr | head # 3862550 chr13 35251702 39114252 chr13.72 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Sus scrofa 39994 1669350 3943 # to decide which "native" mrna or ests you want to specify in genbank.conf # ORGANISM: Sus scrofa ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add: # susScr3 (pig) susScr3.serverGenome = /hive/data/genomes/susScr3/susScr3.2bit susScr3.clusterGenome = /hive/data/genomes/susScr3/susScr3.2bit susScr3.ooc = /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc susScr3.lift = /hive/data/genomes/susScr3/jkStuff/susScr3.nonBridged.lift susScr3.perChromTables = no susScr3.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} susScr3.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} susScr3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} susScr3.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} susScr3.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} susScr3.genbank.est.xeno.pslCDnaFilter = ${lowCover.genbank.est.xeno.pslCDnaFilter} susScr3.downloadDir = susScr3 susScr3.refseq.mrna.native.load = yes susScr3.refseq.mrna.xeno.load = yes susScr3.genbank.mrna.xeno.load = yes susScr3.genbank.mrna.xeno.loadDesc = yes susScr3.genbank.est.native.load = yes # susScr3.upstreamGeneTbl = ensGene # susScr3.upstreamMaf = multiz12way # /hive/data/genomes/susScr3/bed/multiz12way/species.list # end of section added to etc/genbank.conf git commit -m "adding susScr3 pig" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S susScr3 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial susScr3 & # var/build/logs/2012.05.04-17:32:17.susScr3.initalign.log # real 1612m51.055s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad susScr3 & # logFile: var/dbload/hgwdev/logs/2012.05.08-11:06:15.dbload.log # real 100m45.660s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add susScr3 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added susScr3." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-26 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chr18:20905375-20909537" where name="susScr3";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/susScr3/pushQ cd /hive/data/genomes/susScr3/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl susScr3 2> stderr.txt | grep -v transMap > susScr3.sql # real 3m45.401s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/susScr3/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/susScr3/wib/quality.wib # WARNING: hgwdev does not have /gbdb/susScr3/bbi/quality.bw # WARNING: susScr3 does not have seq # WARNING: susScr3 does not have extFile # WARNING: susScr3 does not have estOrientInfo scp -p susScr3.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/susScr3.sql" ############################################################################ # LIFTOVER TO susScr2 (DONE - 2012-08-28 - Hiram ) mkdir /hive/data/genomes/susScr3/bed/blat.susScr2.2012-08-28 cd /hive/data/genomes/susScr3/bed/blat.susScr2.2012-08-28 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc susScr3 susScr2 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ susScr3 susScr2 > do.log 2>&1 # real 78m46.791s ############################################################################# # create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram) mkdir /hive/data/genomes/susScr3/bed/ucscToINSDC cd /hive/data/genomes/susScr3/bed/ucscToINSDC # copying these scripts from the previous load and improving them # with each instance ./translateNames.sh NC_000845.1 ./verifyAll.sh ./join.sh sed -e "s/21/10/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab susScr3 ucscToINSDC stdin ucscToINSDC.tab checkTableCoords susScr3 ucscToINSDC featureBits -countGaps susScr3 ucscToINSDC # 2808525991 bases of 2808525991 (100.000%) in intersection # verify the track link to INSDC functions ############################################################################## # Farm Animal QTL track (IN PROGRESS - 2014-01-06 - Pauline) mkdir /hive/data/genomes/susScr3/bed/animalQtl cd /hive/data/genomes/susScr3/bed/animalQtl wget http://www.animalgenome.org/hu/43874/QTL_SS_10.2.ubed.txt zcat QTL_SS_10.2.ubed.txt >> susScr3.animalQtl.bed cut -f1-4 susScr3.animalQtl.bed > susScr3.animalQtl.bed4 hgLoadBed susScr3 animalQtl susScr3.animalQtl.bed4 checkTableCoords susScr3 animalQtl # no results - yay! #had to reload to pickup new identifier syntax cut -f1-4 QTL_OAR_3.1.ubed.txt > susScr3.animalQtlTEST.bed4 hgLoadBed susScr3 animalQtl susScr3.animalQtlTEST.bed4 #added entries in trackDb for track and search #script to determine what to include in search rule: cat > findNames < /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' galGal4 >> /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' susScr3 >> /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' oviAri3 >> /tmp/bosTau7.animalQtl.name.txt export maxLen=`awk '{print length($0)}' /tmp/bosTau7.animalQtl.name.txt | sort -rn | head -1` echo "scan to column: $maxLen" export C=1 while [ $C -le $maxLen ]; do echo -n " $C: " sort -u /tmp/bosTau7.animalQtl.name.txt | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g' C=`echo $C | awk '{print $1+1}'` done #rm -f /tmp/bosTau7.animalQtl.name.txt mv /tmp/bosTau7.animalQtl.name.txt . EOF #script output: 1: 12346ABCDEFGHIJKLMNOPQRSTUVWYZcdfmpu 2: -/0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYacdeimort 3: %-01234568:ABCDEFGHIKLMNOPRSTUVWXYZ_abcdefnost 4: %-/0123456789:ABCDEFGHIKLMNOPRSTUVWYZ_abdehilnqrt 5: -0123456789:ABCDEFGHIJKLMNOPQRSTUVWYZ_abcdiklnorstw 6: %-.0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_bcdhimnoprstw 7: -0123456789:ABCDEFGHIKLMNOPRSTVW_abcent 8: 0123456789:ABCDEFGHILMNPRSTV_abeflo 9: 0123456789:BCEFMOPTW_adeklnt 10: 0123456789:CHLcdfirt 11: -0123456789:aln 12: 0123456789git 13: 0123456789: 14: 0123456789 15: 0123456789 16: 0123456789 17: 012346 18: 023456 #rule constructed based on script output - set termregex to this in trackDb #search stanza ^[a-zA-Z0-9]+[a-zA-Z0-9:/-]+[a-zA-Z0-9:/-_%]+ ############################################################################ # DBSNP B138 / SNP138 (DONE 1/29/14 angie) # RedMine #12490 screen mkdir -p /hive/data/outside/dbSNP/138/pig cd /hive/data/outside/dbSNP/138/pig # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (pig_9823 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. cat > config.ra <& do.log & tail -f do.log # Some trial and error is always required to get the config.ra just right. # First stop: #*** 4562 lines of b138_ContigInfo.bcp.gz contained contig names that #*** could not be mapped to chrom.size via their GenBank contig mappings; see #*** /hive/data/outside/dbSNP/138/pig/cantLiftUpSeqNames.txt . # #*** You must account for all 9906 contig_acc values in config.ra, #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output). #*** Check the auto-generated suggested.lft to see if it covers all #*** 9906 contigs; if it does, add 'liftUp suggested.lft' to config.ra. #*** Then run again with -continue=loadDbSnp . head -5 cantLiftUpSeqNames.txt #unplaced-scaffold (no chrUn_GL892100 in chrom.sizes) NW_003536876 na #unplaced-scaffold (no chrUn_GL892101 in chrom.sizes) NW_003536877 na #unplaced-scaffold (no chrUn_GL892102 in chrom.sizes) NW_003536878 na #unplaced-scaffold (no chrUn_GL892103 in chrom.sizes) NW_003536879 na #unplaced-scaffold (no chrUn_GL892104 in chrom.sizes) NW_003536880 na grep GL892100 /hive/data/genomes/susScr3/chrom.sizes | head #GL892100-1 100 # Uh-oh -- a naming convention unfamiliar to doDbSnpContig's automatic # mapping of RefSeq contig IDs to GenBank contig IDs to chrom.sizes IDs. # Adding a special case for susScr3 to doDbSnp.pl, trying again: ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & tail -f do.log # Stopped at same point -- but now it has generated a complete suggested.lft, # so add that to config.ra and proceed: cat >> config.ra <>& do.log & tail -f do.log # Lost mysql connection during a big load -- modified doDbSnp.pl to be more # efficient in the way it removes duplicates from snp138Seq.tab. # Regenerate load.csh using -debug: ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=load -debug # Now edit load.csh to wrap "if(0)" around the parts that already completed, # and run the remaining steps: ./load.csh >>& do.log & tail -f do.log # All done! ############################################################################## # FILTER SNP138 (DONE 1/31/14 angie) cd /hive/data/outside/dbSNP/138/pig zcat snp138.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 129889 #Common: 187 #Flagged: 0 #leftover: 28592315 # 187 SNPs out of > 28M that meet the threshold for Common... not enough to warrant a # Common SNPs track. foreach f ({Mult}.bed.gz) mv $f snp138$f end # Load tables foreach subset (Mult) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ susScr3 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz end ############################################################################## # DBSNP CODING ANNOTATIONS (138) (DONE 2/27/14 angie) # Originally done 1/31/14, but Brian L found hgc crash caused by script bug... cd /hive/data/outside/dbSNP/138/pig # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiFuncAnnotationsFixed.txt wc -l ncbiFuncAnnotationsFixed.txt #415603 ncbiFuncAnnotationsFixed.txt # How many & what kinds of function types? cut -f 6 ncbiFuncAnnotationsFixed.txt \ | sort -n | uniq -c # 129680 3 (coding-synon) # 207072 8 (cds-reference) # 1041 41 (nonsense) # 77449 42 (missense) # 161 43 (stop-loss) # 107 44 (frameshift) # 93 45 (cds-indel) # 2/27/14: Re-ran from here after fixing bug in script: # Gather up multiple annotation lines into one line per {snp, gene, frame}: ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \ | liftUp snp138CodingDbSnp.bed suggested.lft warn stdin hgLoadBed susScr3 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp138CodingDbSnp.bed #Read 207193 elements of size 11 from snp138CodingDbSnp.bed ############################################################################## ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## # LASTZ pig/susScr3 sheep/oviAri3 - (DONE - 2014-05-19 - Hiram) mkdir /hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19 cd /hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19 cp -p \ /hive/users/hiram/multiz/100way/susScr3.oviAri3/susScr3.oviAri3.tuning.top400.txt \ ./susScr3.oviAri3.tuning.Q.txt cat << '_EOF_' > DEF # Pig vs. Sheep # parameters obtained from a tuning run of lastz_D # /hive/users/hiram/multiz/100way/susScr3.oviAri3/susScr3.oviAri3.tuning.top400.txt BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_O=400 BLASTZ_E=30 BLASTZ_M=50 BLASTZ_X=800 BLASTZ_Y=3400 BLASTZ_K=3000 BLASTZ_L=3000 BLASTZ_Q=/hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19/susScr3.oviAri3.tuning.Q.txt # A C G T # A 80 -115 -29 -130 # C -115 100 -93 -29 # G -29 -93 100 -115 # T -130 -29 -115 80 # TARGET: Pig susScr3 SEQ1_DIR=/hive/data/genomes/susScr3/susScr3.2bit SEQ1_LEN=/hive/data/genomes/susScr3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Sheep oviAri3 SEQ2_DIR=/hive/data/genomes/oviAri3/oviAri3.2bit SEQ2_LEN=/hive/data/genomes/oviAri3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19 TMPDIR=/dev/shm '_EOF_' # << happy emacs time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 1170m6.492s cat fb.susScr3.chainOviAri3Link.txt # 1453348100 bases of 2525294057 (57.552%) in intersection time (doRecipBest.pl -buildDir=`pwd` susScr3 oviAri3) > rbest.log 2>&1 & # real 109m38.929s # and for the swap: mkdir /hive/data/genomes/oviAri3/bed/blastz.susScr3.swap cd /hive/data/genomes/oviAri3/bed/blastz.susScr3.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 155m31.793s cat fb.oviAri3.chainSusScr3Link.txt # 1311099603 bases of 2534335866 (51.733%) in intersection time (doRecipBest.pl -buildDir=`pwd` oviAri3 susScr3) > rbest.log 2>&1 # real 42m24.943s ######################################################################### # LIFTOVER TO susScr11 (DONE - 2017-07027 - Hiram ) mkdir /hive/data/genomes/susScr3/bed/blat.susScr11.2017-07-27 cd /hive/data/genomes/susScr3/bed/blat.susScr11.2017-07-27 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc susScr3 susScr11 # Real run: time (doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ susScr3 susScr11) > do.log 2>&1 & # real 273m49.891s # verify browser can run a view in other genomes to susScr11 from susScr3 #############################################################################