# for emacs: -*- mode: sh; -*- # This file describes browser build for the galGal4 # Chicken Gallus Gallus genome: Nov. 2011 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AADN00 # 12X coverage via a variety of methods # http://www.ncbi.nlm.nih.gov/genome/111 # http://www.ncbi.nlm.nih.gov/genome/assembly/317958/ # http://www.ncbi.nlm.nih.gov/bioproject/13342 - WashU # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AADN00 # http://www.ncbi.nlm.nih.gov/bioproject/10808 - Montreal - chrMt ############################################################################# # Fetch sequence from genbank (DONE - 2011-12-22 - Hiram) mkdir -p /hive/data/genomes/galGal4/genbank cd /hive/data/genomes/galGal4/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Gallus_gallus/Gallus_gallus-4.0/*" # real 11m57.658s # measure sequence to be used here faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz \ Primary_Assembly/unlocalized_scaffolds/FASTA//*.fa.gz # 1046915324 bases (14077289 N's 1032838035 real 1032838035 upper 0 lower) in 15931 sequences in 65 files # Total size: mean 65715.6 sd 2473072.9 min 253 (gi|354532582|gb|AADN03009880.1|) max 195276750 (gi|357973830|gb|CM000093.3|) median 1206 ############################################################################# # process into UCSC naming scheme (DONE - 2011-02-17 - Hiram) mkdir /hive/data/genomes/galGal4/ucsc cd /hive/data/genomes/galGal4/ucsc cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl cat << '_EOF_' > unlocalized.pl #!/bin/env perl use strict; use warnings; my %accToChr; my %chrNames; open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; $chrNames{$chrN} += 1; } close (FH); foreach my $chrN (keys %chrNames) { my $agpFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { chomp $line; my (@a) = split('\t', $line); my $acc = $a[0]; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; # these names became too long, limit is 31 in browser: if ($chrN =~ m/LGE22C19W28_E50C23/) { my $before = $ucscName; $ucscName =~ s/_E50C23//; $ucscName =~ s/AAD//; printf STDERR "shorter: $before -> $ucscName\n"; } printf UC "%s", $ucscName; for (my $i = 1; $i < scalar(@a); ++$i) { printf UC "\t%s", $a[$i]; } printf UC "\n"; } } close (FH); close (UC); printf "chr%s\n", $chrN; open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; my $acc = $line; $acc =~ s/.*gb\|//; $acc =~ s/\|.*//; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; # these names became too long, limit is 31 in browser: if ($chrN =~ m/LGE22C19W28_E50C23/) { $ucscName =~ s/_E50C23//; $ucscName =~ s/AAD//; } printf UC ">$ucscName\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x unlocalized.pl time ./toUcsc.pl # real 0m30.708s ./unlocalized.pl # real 0m1.397s ./unplaced.pl # real 0m1.611s gzip *.fa *.agp # verify nothing lost in the translation, should be the same as above # except for the name translations faSize *.fa # 1046915324 bases (14077289 N's 1032838035 real 1032838035 upper 0 lower) in 15931 sequences in 65 files # Total size: mean 65715.6 sd 2473072.9 min 253 (chr2_AADN03009880_random) max 195276750 (chr1) median 1206 ############################################################################# # Initial browser build (DONE - 2012-01-03 - Hiram) cd /hive/data/genomes/galGal4 cat << '_EOF_' > galGal4.config.ra # Config parameters for makeGenomeDb.pl: db galGal4 clade vertebrate genomeCladePriority 50 scientificName Gallus gallus commonName Chicken assemblyDate Nov. 2011 assemblyLabel ICGSC Gallus_gallus-4.0 (GCA_000002315.2) assemblyShortLabel ICGSC Gallus_gallus-4.0 orderKey 434 mitoAcc NC_001323 fastaFiles /hive/data/genomes/galGal4/ucsc/*.fa.gz agpFiles /hive/data/genomes/galGal4/ucsc/*.agp.gz dbDbSpeciesDir chicken taxId 9031 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp galGal4.config.ra > agp.log 2>&1 # real 1m37.730s # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db galGal4.config.ra > db.log 2>&1 # real 8m20.996s # the first attempted failed in the: # featureBits -or -countGaps galGal4 gold gap # due to the long chrom names, e.g.: # chrLGE22C19W28_E50C23_AADN03011267_random # buffer overflow, size 32, format: %s, buffer: 'chrLGE22C19W28_E50C23_JH375242_' # the unlocalized.pl script was reworked a bit to shorten that one # chr random name. ############################################################################# # running repeat masker (DONE - 2012-01-04 - Hiram) mkdir /hive/data/genomes/galGal4/bed/repeatMasker cd /hive/data/genomes/galGal4/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk galGal4 > do.log 2>&1 & # real 115m21.264s cat faSize.rmsk.txt # 1046932099 bases (14077289 N's 1032854810 real 921370201 upper # 111484609 lower) in 15932 sequences in 1 files # %10.65 masked total, %10.79 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps galGal4 rmsk # 111540299 bases of 1046932099 (10.654%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-01-04 - Hiram) mkdir /hive/data/genomes/galGal4/bed/simpleRepeat cd /hive/data/genomes/galGal4/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ galGal4 > do.log 2>&1 & # about 1 hour cat fb.simpleRepeat # 10508414 bases of 1032854810 (1.017%) in intersection XXX - ready for this when rmsk is done # not going to add to rmsk here, using the window masker instead since # it masks more sequence cd /hive/data/genomes/galGal4 twoBitMask galGal4.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed galGal4.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa galGal4.2bit stdout | faSize stdin > faSize.galGal4.2bit.txt cat faSize.galGal4.2bit.txt # 1511735326 bases (153400444 N's 1358334882 real 1024824487 upper # 333510395 lower) in 19550 sequences in 1 files # %22.06 masked total, %24.55 masked real rm /gbdb/galGal4/galGal4.2bit ln -s `pwd`/galGal4.2bit /gbdb/galGal4/galGal4.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-01-04 - Hiram) mkdir /hive/data/genomes/galGal4/bed/gap cd /hive/data/genomes/galGal4/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../galGal4.unmasked.2bit > findMotif.txt 2>&1 # real 0m17.839s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits galGal4 -not gap -bed=notGap.bed # 1043113087 bases of 1043113087 (100.000%) in intersection # real 0m8.581s time featureBits galGal4 allGaps.bed notGap.bed -bed=new.gaps.bed # 10258277 bases of 1043113087 (0.983%) in intersection # real 1m29.109s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" galGal4 | sort -n | tail -1 # 446 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" galGal4 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed wc -l other.bed # 11035 featureBits -countGaps galGal4 other.bed # 10258277 bases of 1046932099 (0.980%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad galGal4 otherGap other.bed # verify no overlap: time featureBits -countGaps galGal4 gap other.bed # 0 bases of 1046932099 (0.000%) in intersection # real 0m29.669s # verify no errors before adding to the table: time gapToLift -minGap=1 galGal4 nonBridged.before.lift \ -bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1 & # real 0m7.205s # check for warnings in before.gapToLift.txt, should be empty: # -rw-rw-r-- 1 0 Jan 4 15:13 before.gapToLift.txt # starting with this many: hgsql -e "select count(*) from gap;" galGal4 # 2863 hgsql galGal4 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" galGal4 # 13898 # == 2863 + 11035 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 galGal4 nonBridged.lift -bedFile=nonBridged.bed # there should be no errors or other output, checked bridged gaps: hgsql -N -e "select bridge from gap;" galGal4 | sort | uniq -c # 915 no # 12983 yes ########################################################################## ## WINDOWMASKER (DONE - 2011-09-08 - Hiram) mkdir /hive/data/genomes/galGal4/bed/windowMasker cd /hive/data/genomes/galGal4/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev galGal4 > do.log 2>&1 & # about 45 minutes # Masking statistics twoBitToFa galGal4.wmsk.2bit stdout | faSize stdin # 1046932099 bases (14077289 N's 1032854810 real 828377808 upper # 204477002 lower) in 15932 sequences in 1 files # %19.53 masked total, %19.80 masked real twoBitToFa galGal4.wmsk.sdust.2bit stdout | faSize stdin # 1046932099 bases (14077289 N's 1032854810 real 822407863 upper # 210446947 lower) in 15932 sequences in 1 files # %20.10 masked total, %20.38 masked real hgLoadBed galGal4 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 5996190 elements of size 3 featureBits -countGaps galGal4 windowmaskerSdust # 224522674 bases of 1046932099 (21.446%) in intersection # eliminate the gaps from the masking featureBits galGal4 -not gap -bed=notGap.bed # 1032854810 bases of 1032854810 (100.000%) in intersection time nice -n +19 featureBits galGal4 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 210446947 bases of 1032854810 (20.375%) in intersection # real 2m3.610s # reload track to get it clean hgLoadBed galGal4 windowmaskerSdust cleanWMask.bed.gz # Loaded 5994965 elements of size 4 time featureBits -countGaps galGal4 windowmaskerSdust # real 0m40.505s # 210446947 bases of 1046932099 (20.101%) in intersection # mask with this clean result zcat cleanWMask.bed.gz \ | twoBitMask ../../galGal4.unmasked.2bit stdin \ -type=.bed galGal4.cleanWMSdust.2bit twoBitToFa galGal4.cleanWMSdust.2bit stdout | faSize stdin \ > galGal4.cleanWMSdust.faSize.txt cat galGal4.cleanWMSdust.faSize.txt # 1046932099 bases (14077289 N's 1032854810 real 822407863 upper # 210446947 lower) in 15932 sequences in 1 files # %20.10 masked total, %20.38 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps galGal4 rmsk windowmaskerSdust # 71066487 bases of 1046932099 (6.788%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-01-05 - Hiram) cd /hive/data/genomes/galGal4 twoBitMask -add bed/windowMasker/galGal4.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed galGal4.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa galGal4.2bit stdout | faSize stdin > faSize.galGal4.txt cat faSize.galGal4.txt # 1046932099 bases (14077289 N's 1032854810 real 822061190 upper # 210793620 lower) in 15932 sequences in 1 files # %20.13 masked total, %20.41 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/galGal4/galGal4.2bit ln -s `pwd`/galGal4.2bit /gbdb/galGal4/galGal4.2bit # what happens with all masks: twoBitMask -add galGal4.2bit bed/repeatMasker/galGal4.sorted.fa.out \ galGal4.wm.trf.rmsk.2bit twoBitToFa galGal4.wm.trf.rmsk.2bit stdout | faSize stdin # 1046932099 bases (14077289 N's 1032854810 real 781762152 upper # 251092658 lower) in 15932 sequences in 1 files # %23.98 masked total, %24.31 masked real # rmsk covers another 40 million bases 251092658-210793620 = 40299038 ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/galGal4/bed/cpgIslands cd /hive/data/genomes/galGal4/bed/cpgIslands time doCpgIslands.pl galGal4 > do.log 2>&1 # Elapsed time: 58m41s cat fb.galGal4.cpgIslandExt.txt # 14619450 bases of 1032854810 (1.415%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/galGal4/bed/genscan cd /hive/data/genomes/galGal4/bed/genscan time doGenscan.pl galGal4 > do.log 2>&1 # real 29m37.528s # one broken job: ./runGsBig.csh chr8 000 gtf/000/chr8.gtf pep/000/chr8.pep subopt/000/chr8.bed # rerunning with window size of 2000000 # about 10 minutes # continuing: time doGenscan.pl -continue=makeBed galGal4 > makeBed.log 2>&1 # real 12m15.506s cat fb.galGal4.genscan.txt # 22904400 bases of 1032854810 (2.218%) in intersection cat fb.galGal4.genscanSubopt.txt # 24186038 bases of 1032854810 (2.342%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1032838035 / 2897316137 \) \* 1024 # ( 1032838035 / 2897316137 ) * 1024 = 365.036502 # round up to 380 (galGal3 was 380) cd /hive/data/genomes/galGal4 time blat galGal4.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/galGal4.11.ooc -repMatch=380 # Wrote 12593 overused 11-mers to jkStuff/galGal4.11.ooc # galGal3 had: Wrote 13061 overused 11-mers to /cluster/bluearc/galGal3/11.ooc # real 0m25.395s # there are non-bridged gaps, create lift file needed for genbank hgsql -N -e "select bridge from gap;" galGal4 | sort | uniq -c # 915 no # 12983 yes cd /hive/data/genomes/galGal4/jkStuff gapToLift galGal4 galGal4.nonBridged.lift -bedFile=galGal4.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' galGal4.nonBridged.bed | sort -nr | head # 44546952 chr3 11581655 56128607 chr3.03 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Felis catus 1081 919 354 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add: # galGal4 (chicken) galGal4.serverGenome = /hive/data/genomes/galGal4/galGal4.2bit galGal4.clusterGenome = /hive/data/genomes/galGal4/galGal4.2bit galGal4.ooc = /hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc galGal4.lift = /hive/data/genomes/galGal4/jkStuff/galGal4.nonBridged.lift galGal4.perChromTables = no galGal4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} galGal4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} galGal4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} galGal4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} galGal4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} galGal4.refseq.mrna.native.load = yes galGal4.refseq.mrna.xeno.load = yes galGal4.genbank.mrna.xeno.load = yes galGal4.downloadDir = galGal4 # galGal4.upstreamGeneTbl = refGene # galGal4.upstreamMaf = multiz7way # /hive/data/genomes/galGal3/bed/multiz7way/species.lst # end of section added to etc/genbank.conf git commit -m "adding galGal4 chicken" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S galGal4 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial galGal4 & # var/build/logs/2012.05.04-15:58:41.galGal4.initalign.log # real 1604m15.676s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad galGal4 & # logFile: var/dbload/hgwdev/logs/2012.05.08-08:48:35.dbload.log # real 48m18.467s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add galGal4 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added galGal4." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # create downloads and pushQ entry (DONE - 2012-05-24 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema time joinerCheck -database=galGal4 -keys all.joiner # real 2m30.542s cd /hive/data/genomes/galGal4 time makeDownloads.pl -workhorse hgwdev galGal4 > download.log 2>&1 # real 17m56.594s mkdir /hive/data/genomes/galGal4/pushQ cd /hive/data/genomes/galGal4/pushQ time makePushQSql.pl galGal4 > galGal4.sql 2> stderr.out # real 3m28.689s # check stderr.out for no significant problems, it is common to see: WARNING: hgwdev does not have /gbdb/galGal4/wib/gc5Base.wib WARNING: hgwdev does not have /gbdb/galGal4/wib/quality.wib WARNING: hgwdev does not have /gbdb/galGal4/bbi/quality.bw WARNING: galGal4 does not have seq WARNING: galGal4 does not have extFile scp -p galGal4.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < galGal4.sql ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2012-06-07 - Hiram) hgsql -N -e "select frag from gold;" galGal4 | sort | head -1 AADN03009256.1 hgsql -N -e "select frag from gold;" galGal4 | sort | tail -2 JH375140.1 JH393278.1 NC_001323 # to test the rule: hgsql -N -e "select frag from gold;" galGal4 | wc -l # 18795 # should find all: hgsql -N -e "select frag from gold;" galGal4 \ | egrep -e '[AJN][AHC][D3_][N079][0-9]+(\.1)?' | wc -l # 18795 # or exclude all: hgsql -N -e "select frag from gold;" galGal4 \ | egrep -v -e '[AJN][AHC][D3_][N079][0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/chicken/galGal4/trackDb.ra searchTable gold shortCircuit 1 termRegex [AJN][AHC][D3_][N079][0-9]+(\.1)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ######################################################################### # LASTZ Medium Ground Finch geoFor1 (DONE - 2012-07-27 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S geoFor1 mkdir /hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27 cd /hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # Chicken vs. Medium Ground Finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: chicken galGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Medium Ground Finch GeoFor1 SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 # the chaining step ran for several days, the longest chain job: # Longest finished job: 213567s 3559.45m 59.32h 2.47d # Elapsed time: 4572m44s cat fb.galGal4.chainGeoFor1Link.txt # 744451026 bases of 1032854810 (72.077%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/galGal4/bed ln -s lastzGeoFor1.2012-07-27 lastz.geoFor1 mkdir /hive/data/genomes/geoFor1/bed/blastz.galGal4.swap cd /hive/data/genomes/geoFor1/bed/blastz.galGal4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 81m0.123s cat fb.geoFor1.chainHg19Link.txt # 736380222 bases of 1041286029 (70.718%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/geoFor1/bed ln -s blastz.galGal4.swap lastz.galGal4 ############################################################################## # LASTZ petMar2 Lamprey (DONE - 2012-10-23 - Hiram) screen -S galGal4PetMar2 # use a screen to control this multi-day job mkdir /cluster/data/galGal4/bed/lastzPetMar2.2012-10-23 cd /cluster/data/galGal4/bed/lastzPetMar2.2012-10-23 cat << '_EOF_' > DEF # Chicken vs Lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Lamprey petMar2 SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/galGal4/bed/lastzPetMar2.2012-10-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # why is memk used here for big hub ? Should have been swarm time nice -n +19 doBlastzChainNet.pl \ `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=memk -verbose=2 > do.log 2>&1 & # shut this down on memk after awhile and finished it on swarm # real 4136m52.553s # continuing after finishing on swarm: time nice -n +19 doBlastzChainNet.pl \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > cat.log 2>&1 & # Elapsed time: 49m2s cat fb.galGal4.chainPetMar2Link.txt # 18440677 bases of 1032854810 (1.785%) in intersection # and the swap mkdir /cluster/data/petMar2/bed/blastz.galGal4.swap cd /cluster/data/petMar2/bed/blastz.galGal4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal4/bed/lastzPetMar2.2012-10-23/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=swarm -verbose=2 > swap.log 2>&1 & # real 6m41.091s cat fb.petMar2.chainGalGal4Link.txt # 18576173 bases of 647368134 (2.869%) in intersection ############################################################################ # construct liftOver to galGal3 (DONE - 2013-04-30 - Hiram) screen -S galGal4 # manage this longish running job in a screen mkdir /hive/data/genomes/galGal4/bed/blat.galGal3.2013-04-30 cd /hive/data/genomes/galGal4/bed/blat.galGal3.2013-04-30 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev galGal4 galGal3 > do.log 2>&1 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \ -dbHost=hgwdev -workhorse=hgwdev galGal4 galGal3 > do.log 2>&1 # real 24m2.551s # verify this file exists: # /gbdb/galGal4/liftOver/galGal4ToGalGal3.over.chain.gz # and try out the conversion on genome-test from galGal4 to galGal3 ############################################################################ # construct ucscToEnsembl name translation table (DONE - 2013-05-01 - Hiram) mkdir /hive/data/genomes/galGal4/ensembl cd /hive/data/genomes/galGal4/ensembl rsync -a -P rsync://ftp.ensembl.org/ensembl/pub/mnt2/release-71/fasta/gallus_gallus/dna/Gallus_gallus.Galgal4.71.dna.toplevel.fa.gz ./ faCount Gallus_gallus.Galgal4.71.dna.toplevel.fa.gz > ens71.faCount.txt cut -f1,2 ens71.faCount.txt \ | egrep -v "^total|^#seq" | sort -k2nr > ens71.chrom.sizes faToTwoBit Gallus_gallus.Galgal4.71.dna.toplevel.fa.gz ens71.galGal4.2bit twoBitDup ens71.galGal4.2bit cat > ucscToEns.pl << '_EOF_' #!/usr/bin/env perl use strict; use warnings; my %ucscChromSizes; my %contigToChr; open (FH, "<../chrom.sizes") or die "can not read ../chrom.sizes"; while (my $line = ) { chomp $line; my ($chr, $size) = split('\s+', $line); my $ucscChr = $chr; $ucscChromSizes{$chr} = $size; if ($chr =~ m/chrLGE22C19W28_N03011.*_random/) { $chr =~ s/chrLGE22C19W28_N03011/chrLGE22C19W28_AADN03011/; } if ($chr =~ m/_random/) { $chr =~ s/_random//; $chr =~ s/.*_//; } $chr =~ s/^chr[0-9]+_//; $chr =~ s/^chrUn_//; $chr =~ s/^chr//; $contigToChr{$chr} = $ucscChr; } close (FH); open (NF, ">not.found.txt") or die "can not write to not.found.txt"; open (FH, ") { chomp $line; my ($chr, $size) = split('\s+', $line); $chr = "M" if ($chr =~ m/MT/); my $ucscContig = $chr; $chr = "MT" if ($chr =~ m/M/); $ucscContig =~ s/\.1$//; my $ucscChr = "unknown"; my $ucscSize = 0; if (exists($contigToChr{$ucscContig})) { $ucscChr = $contigToChr{$ucscContig}; $ucscSize = $ucscChromSizes{$ucscChr}; die "sizes not equal ? $chr=$size $ucscChr=$ucscSize" if ($size != $ucscSize); printf "%s\t%s\n", $ucscChr, $chr; } else { $ucscContig = "chr${ucscContig}"; if (exists($contigToChr{$ucscContig})) { $ucscChr = $contigToChr{$ucscContig}; $ucscSize = $ucscChromSizes{$ucscChr}; die "sizes not equal ? $chr=$size $ucscChr=$ucscSize" if ($size != $ucscSize); printf "%s\t%s\n", $ucscChr, $chr; } else { printf STDERR "can not find contig '$chr'\n"; printf NF "%s\t%d\n", $chr, $size; } } } close (FH); close (NF); '_EOF_' # << happy emacs chmod +x ucscToEns.pl ./ucscToEns.pl > ucscToEnsembl.tab mkdir /hive/data/genomes/galGal4/bed/ucscToEnsembl cd /hive/data/genomes/galGal4/bed/ucscToEnsembl ln -s ../../ensembl/ucscToEnsembl.tab cat > ucscToEnsembl.sql << '_EOF_' # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(30)) ); '_EOF_' # << happy emacs hgsql galGal4 -e "drop table ucscToEnsembl;" hgsql galGal4 < ucscToEnsembl.sql hgsql galGal4 \ -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl' # verify blue navigation bar will now go to Ensembl ############################################################################ # create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram) mkdir /hive/data/genomes/galGal4/bed/ucscToINSDC cd /hive/data/genomes/galGal4/bed/ucscToINSDC # copying these scripts from the previous load and improving them # with each instance ./translateNames.sh # this one needed some editing to ucscToINSDC.txt and sorting of that # to truncate some of the names as were done in the original build # since the names went beyond 31 characters ./verifyAll.sh ./join.sh # this didn't work for some odd reason. Did a manual load of the table: # sed -e "s/21/31/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ # | hgLoadSqlTab galGal4 ucscToINSDC stdin ucscToINSDC.tab sed -e "s/21/31/" $HOME/kent/src/hg/lib/ucscToINSDC.sql > ucscToINSDC.sql hgsql galGal4 < ucscToINSDC.sql hgsql -e \ 'load data local infile "ucscToINSDC.tab" into TABLE ucscToINSDC;' galGal4 checkTableCoords galGal4 ucscToINSDC featureBits -countGaps galGal4 ucscToINSDC # 1046932099 bases of 1046932099 (100.000%) in intersection # verify the track link to INSDC functions ############################################################################## # BLASTZ/CHAIN/NET Turkey vs Chicken (DONE - 2013-10-18,25 - Hiram) # use a screen to manage this multi-day job screen -S melGal1 mkdir /hive/data/genomes/galGal4/bed/lastzMelGal1.2013-10-18 cd /hive/data/genomes/galGal4/bed/lastzMelGal1.2013-10-18 cat << '_EOF_' > DEF # Chicken vs. Turkey BLASTZ_M=50 # TARGET: Chicken GalGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: Turkey MelGal1 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/galGal4/bed/lastzMelGal1.2013-10-18 TMPDIR=/dev/shm '_EOF_' # << this line keeps emacs coloring happy # screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 3091m50.056s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=chainMerge -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 # real 103m43.211s cat fb.galGal4.chainMelGal1Link.txt # 925109523 bases of 1032854810 (89.568%) in intersection cd /hive/data/genomes/galGal4/bed ln -s lastzMelGal1.2013-10-18 lastz.melGal1 # and then swap mkdir /hive/data/genomes/melGal1/bed/blastz.galGal4.swap cd /hive/data/genomes/melGal1/bed/blastz.galGal4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal4/bed/lastzMelGal1.2013-10-18/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 106m45.413s cat fb.melGal1.chainGalGal4Link.txt # 889739529 bases of 935922386 (95.066%) in intersection cd /hive/data/genomes/melGal1/bed ln -s blastz.galGal4.swap lastz.galGal4 ############################################################################ # BLASTZ/CHAIN/NET Zebra finch vs Chicken (DONE - 2013-10-18,25 - Hiram) # use a screen to manage this multi-day job screen -S taeGut2 mkdir /hive/data/genomes/galGal4/bed/lastzTaeGut2.2013-10-18 cd /hive/data/genomes/galGal4/bed/lastzTaeGut2.2013-10-18 cat << '_EOF_' > DEF # Chicken vs. Zebra finch BLASTZ_M=50 # TARGET: Chicken GalGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: Zebra finch TaeGut2 SEQ2_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit SEQ2_LEN=/hive/data/genomes/taeGut2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/hive/data/genomes/galGal4/bed/lastzTaeGut2.2013-10-18 TMPDIR=/dev/shm '_EOF_' # << this line keeps emacs coloring happy # screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 3418m20.002s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=chainMerge `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > merge.log 2>&1 # real 50m6.651s cat fb.galGal4.chainTaeGut2Link.txt # 705616239 bases of 1032854810 (68.317%) in intersection cd /hive/data/genomes/galGal4/bed ln -s lastzTaeGut2.2013-10-18 lastz.taeGut2 # and then swap mkdir /hive/data/genomes/taeGut2/bed/blastz.galGal4.swap cd /hive/data/genomes/taeGut2/bed/blastz.galGal4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal4/bed/lastzTaeGut2.2013-10-18/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 93m43.072s cat fb.taeGut2.chainGalGal4Link.txt # 814628366 bases of 1222864691 (66.616%) in intersection cd /hive/data/genomes/taeGut2/bed ln -s blastz.galGal4.swap lastz.galGal4 ############################################################################ # BLASTZ/CHAIN/NET Ostrich vs Chicken (DONE - 2013-11-06 - Hiram) # use a screen to manage this multi-day job screen -S strCam0 mkdir /hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06 cd /hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06 cat << '_EOF_' > DEF # Chicken vs. Ostrich BLASTZ_M=50 # TARGET: Chicken GalGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: Ostrich StrCam0 SEQ2_DIR=/hive/data/genomes/strCam0/strCam0.2bit SEQ2_LEN=/hive/data/genomes/strCam0/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06 TMPDIR=/dev/shm '_EOF_' # << this line keeps emacs coloring happy # screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 4093m32.252s cat fb.galGal4.chainStrCam0Link.txt # 807682594 bases of 1032854810 (78.199%) in intersection cd /hive/data/genomes/galGal4/bed ln -s lastzStrCam0.2013-11-06 lastz.strCam0 cd /hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06 time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ galGal4 strCam0 > rbest.log 2>&1 & # real 36m13.961s # and then swap, can be done while Rbest is working: mkdir /hive/data/genomes/strCam0/bed/blastz.galGal4.swap cd /hive/data/genomes/strCam0/bed/blastz.galGal4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06/DEF \ -swap \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 58m45.359s cat fb.strCam0.chainGalGal4Link.txt # 798315880 bases of 1187883366 (67.205%) in intersection cd /hive/data/genomes/strCam0/bed ln -s blastz.galGal4.swap lastz.galGal4 time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ strCam0 galGal4 > rbest.log 2>&1 # real 40m28.417s ############################################################################ # BLASTZ/CHAIN/NET Falco peregrinus vs Chicken (DONE - 2013-11-06 - Hiram) # use a screen to manage this multi-day job screen -S falPer1 mkdir /hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06 cd /hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06 cat << '_EOF_' > DEF # Chicken vs. Falco peregrinus BLASTZ_M=50 # TARGET: Chicken GalGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Falco peregrinus FalPer1 SEQ2_DIR=/hive/data/genomes/falPer1/falPer1.2bit SEQ2_LEN=/hive/data/genomes/falPer1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06 TMPDIR=/dev/shm '_EOF_' # << this line keeps emacs coloring happy # screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 4115m9.767s cat fb.galGal4.chainFalPer1Link.txt # 848910764 bases of 1032854810 (82.191%) in intersection cd /hive/data/genomes/galGal4/bed ln -s lastzFalPer1.2013-11-06 lastz.falPer1 cd /hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06 time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ galGal4 falPer1 > rbest.log 2>&1 & # real 37m52.361s # and then swap mkdir /hive/data/genomes/falPer1/bed/blastz.galGal4.swap cd /hive/data/genomes/falPer1/bed/blastz.galGal4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06/DEF \ -swap \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 69m9.773s cat fb.falPer1.chainGalGal4Link.txt # 831664485 bases of 1153404357 (72.105%) in intersection cd /hive/data/genomes/falPer1/bed ln -s blastz.galGal4.swap lastz.galGal4 time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ falPer1 galGal4 > rbest.log 2>&1 # real 45m55.767s ############################################################################ # (IN PROGRESS - 2014-01-07 - Pauline) mkdir /hive/data/genomes/galGal4/bed/animalQtl cd /hive/data/genomes/galGal4/bed/animalQtl wget http://www.animalgenome.org/QTLdb/tmp/QTL_GG_4.0.ubed.txt.gz zcat QTL_GG_4.0.ubed.txt.gz >> galGal4.animalQtl.bed cut -f1-4 galGal4.animalQtl.bed > galGal4.animalQtl.bed4 hgLoadBed galGal4 animalQtl galGal4.animalQtl.bed4 checkTableCoords galGal4 animalQtl #no results - yay! # originally used commands for intial version of file - ids were # name:idNumber, contributor reloaded these as numbers so skipped sed step # and processed as above. # trim redundant columns to the bare minimum, alter name to be # QTLname:idNumber to allow use of $p in external URL for idNumber cut -f1-4 galGal4.animalQtl.bed | sed -e 's/-/:/;' > galGal4.animalQtl.bed4 hgLoadBed galGal4 animalQtl galGal4.animalQtl.bed4 #script to determine what to include in search rule: cat > findNames < #/tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' bosTau7 > /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' galGal4 >> /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' susScr3 >> /tmp/bosTau7.animalQtl.name.txt hgsql -N -e 'select name from animalQtl;' oviAri3 >> /tmp/bosTau7.animalQtl.name.txt export maxLen=`awk '{print length($0)}' /tmp/bosTau7.animalQtl.name.txt | sort -rn | head -1` echo "scan to column: $maxLen" export C=1 while [ $C -le $maxLen ]; do echo -n " $C: " sort -u /tmp/bosTau7.animalQtl.name.txt | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g' C=`echo $C | awk '{print $1+1}'` done #rm -f /tmp/bosTau7.animalQtl.name.txt mv /tmp/bosTau7.animalQtl.name.txt . EOF #script output: 1: 12346ABCDEFGHIJKLMNOPQRSTUVWYZcdfmpu 2: -/0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYacdeimort 3: %-01234568:ABCDEFGHIKLMNOPRSTUVWXYZ_abcdefnost 4: %-/0123456789:ABCDEFGHIKLMNOPRSTUVWYZ_abdehilnqrt 5: -0123456789:ABCDEFGHIJKLMNOPQRSTUVWYZ_abcdiklnorstw 6: %-.0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_bcdhimnoprstw 7: -0123456789:ABCDEFGHIKLMNOPRSTVW_abcent 8: 0123456789:ABCDEFGHILMNPRSTV_abeflo 9: 0123456789:BCEFMOPTW_adeklnt 10: 0123456789:CHLcdfirt 11: -0123456789:aln 12: 0123456789git 13: 0123456789: 14: 0123456789 15: 0123456789 16: 0123456789 17: 012346 18: 023456 #rule constructed based on script output - set termregex to this in trackDb #search stanza ^[a-zA-Z0-9]+[a-zA-Z0-9:/-]+[a-zA-Z0-9:/-_%]+ ############################################################################ # DBSNP B138 / SNP138 (DONE 1/24/14 angie) # RedMine #12490 screen mkdir -p /hive/data/outside/dbSNP/138/chicken cd /hive/data/outside/dbSNP/138/chicken # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (chicken_9031 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. cat > config.ra <& do.log & tail -f do.log # Some trial and error is always required to get the config.ra just right. # First stop: #*** 34 lines of b138_ContigInfo.bcp.gz contained contig names that #*** could not be mapped to chrom.size via their GenBank contig mappings; see #*** /hive/data/outside/dbSNP/138/chicken/cantLiftUpSeqNames.txt . # #*** You must account for all 16847 contig_acc values in config.ra, #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output). #*** Check the auto-generated suggested.lft to see if it covers all #*** 16847 contigs; if it does, add 'liftUp suggested.lft' to config.ra. #*** Then run again with -continue=loadDbSnp . head -5 cantLiftUpSeqNames.txt #unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011211_random in chrom.sizes) NW_003766145 chrLGE22C19W28_E50C23 #unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011213_random in chrom.sizes) NW_003766138 chrLGE22C19W28_E50C23 #unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011215_random in chrom.sizes) NW_003766135 chrLGE22C19W28_E50C23 #unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011216_random in chrom.sizes) NW_003766131 chrLGE22C19W28_E50C23 #unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011217_random in chrom.sizes) NW_003766128 chrLGE22C19W28_E50C23 grep chrLGE22C19W28_E50C23 /hive/data/genomes/galGal4/chrom.sizes #chrLGE22C19W28_E50C23 965146 # In our assembly, chrLGE22C19W28_E50C23 is an assembled unit, but dbSNP has mappings # to individual NW_* contigs. # GCF_000002315.3.assembly.txt, downloaded by doDbSnp.pl, has mappings of those NW_* contigs # to AADN* GenBank contig IDs like this: grep NW_003766145 GCF_000002315.3.assembly.txt #ChrE22C19W28_E50C23_random_7180000979879 unlocalized-scaffold LGE22C19W28_E50C23 Linkage Group AADN03011211.1 = NW_003766145.1 Primary Assembly # Now, see if galGal4/genbank/ has any info about the mapping of those AADN*'s to the # assembled LGE22C19W28_E50C23: pushd /hive/data/genomes/galGal4/genbank/Primary_Assembly find . -type f | xargs grep AADN03011213 #./unlocalized_scaffolds/unlocalized.chr2scaf:LGE22C19W28_E50C23 AADN03011213.1 #./component_localID2acc:ctg7180000923959 AADN03011213.1 #./scaffold_localID2acc:ChrE22C19W28_E50C23_random_7180000977649 AADN03011213.1 # OK, not as hopeful as I thought (no coord offsets). # But, since AADN03011213 is associated with LGE22C19W28_E50C23 in unlocalized_scaffolds, # it was in the genbank assembly. Why not in chrom.sizes? # -- because Hiram found that sequence names like chrLGE22C19W28_E50C23_AADN03011213_random # would be too long (search above for "these names became too long, limit is 31 in browser"). # So Hiram shortened them. Now that we know what they look like in chrom.sizes, we can # generate liftUp mappings to add to suggested.lft. popd # Map too-big sequence name to sequence size, sort on too-big name: egrep 'chrLGE22C19W28_(N0|JH)' /hive/data/genomes/galGal4/chrom.sizes \ | sed -e 's/chrLGE22C19W28_/chrLGE22C19W28_E50C23_/; s/_N0/_AADN0/;' \ | sort > bigNameToSize # Map too-big sequence name to RefSeq NW_* accession, sort on too-big name: awk '{print $3 "\t" $6;}' cantLiftUpSeqNames.txt | sort > bigNameToNW # Use join and sed to make some liftUp entries (and re-shorten the big names): join -o 2.2,1.2,1.1,1.2 bigNameToSize bigNameToNW \ | sed -re 's/^/0\t/; s/ /\t/g; s/_E50C23//; s/AAD//;' \ > bigName.lft wc -l bigName.lft #34 bigName.lft # Now add our tweaked-name mappings to the auto-generated suggested.lft: cat suggested.lft bigName.lft > suggestedPlus.lft cat >> config.ra <>& do.log & tail -f do.log # *** All done! ############################################################################## # FILTER SNP138 (DONE 1/27/14 angie) cd /hive/data/outside/dbSNP/138/chicken zcat snp138.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 189438 #Common: 20461 #Flagged: 0 #leftover: 9173531 # 20,461 SNPs out of > 9M that meet the threshold for Common... enough to warrant a # Common SNPs track? Maybe; easy enough to drop it later if we decide it's too sparse. foreach f ({Mult,Common}.bed.gz) mv $f snp138$f end # Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ galGal4 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz end ############################################################################## # DBSNP CODING ANNOTATIONS (138) (DONE 2/27/14 angie) # Originally done 1/27/14, but Brian L found hgc crash caused by script bug... cd /hive/data/outside/dbSNP/138/chicken # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiFuncAnnotationsFixed.txt wc -l ncbiFuncAnnotationsFixed.txt #281194 ncbiFuncAnnotationsFixed.txt # How many & what kinds of function types? cut -f 6 ncbiFuncAnnotationsFixed.txt \ | sort -n | uniq -c # 84191 3 (coding-synon) # 125292 8 (cds-reference) # 1230 41 (nonsense) # 46722 42 (missense) # 67 43 (stop-loss) # 4017 44 (frameshift) # 19675 45 (cds-indel) # 2/27/14: Re-ran from here after fixing bug in script: # Gather up multiple annotation lines into one line per {snp, gene, frame}: ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \ | liftUp snp138CodingDbSnp.bed suggested.lft warn stdin hgLoadBed galGal4 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp138CodingDbSnp.bed #Read 145135 elements of size 11 from snp138CodingDbSnp.bed ############################################################################## # Farm Animal Cytoband track (IN PROGRESS - 2014-02-04 - Pauline) mkdir /hive/data/genomes/galGal4/bed/cytoBand cd /hive/data/genomes/galGal4/bed/cytoBand wget http://www.animalgenome.org/hu/share/GbandCoord.tar.gz ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## ############################################################################## # LIFTOVER TO galGal5 (DONE - 2016-04-15 - Hiram) ssh hgwdev mkdir /hive/data/genomes/galGal4/bed/blat.galGal5.2016-04-15 cd /hive/data/genomes/galGal4/bed/blat.galGal5.2016-04-15 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \ galGal4 galGal5 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \ galGal4 galGal5) > doLiftOverToGalGal5.log 2>&1 # real 155m34.324s # see if the liftOver menus function in the browser from galGal5 to galGal4 ############################################################################## # LASTZ chicken/galGal4 vs. Chinese softshell turtle - (DONE - 2016-05-17 - Hiram) mkdir /hive/data/genomes/galGal4/bed/lastzPelSin1.2016-05-17 cd /hive/data/genomes/galGal4/bed/lastzPelSin1.2016-05-17 printf '# chicken vs. Chinese softshell turtle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: chicken galGal4 SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LIMIT=100 SEQ1_LAP=10000 # QUERY: Chinese softshell turtle pelSin1 SEQ2_DIR=/hive/data/genomes/pelSin1/pelSin1.2bit SEQ2_LEN=/hive/data/genomes/pelSin1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE= /hive/data/genomes/galGal4/bed/lastzPelSin1.2016-05-17 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 85m6.637s cat fb.galGal4.chainPelSin1Link.txt # 253365632 bases of 1032854810 (24.531%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` galGal4 pelSin1) \ > rbest.log 2>&1 & # real 110m52.863s # and for the swap: mkdir /hive/data/genomes/pelSin1/bed/blastz.galGal4.swap cd /hive/data/genomes/pelSin1/bed/blastz.galGal4.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal4/bed/lastzPelSin1.2016-05-17/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 50m44.030s cat fb.pelSin1.chainGalGal4Link.txt # 269390769 bases of 2106639384 (12.788%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` pelSin1 galGal4) \ > rbest.log 2>&1 # real 168m45.457s ######################################################################### # LIFTOVER TO galGal6 (DONE - 2016-10-12 - Hiram) ssh hgwdev mkdir /hive/data/genomes/galGal4/bed/blat.galGal6.2018-10-12 cd /hive/data/genomes/galGal4/bed/blat.galGal6.2018-10-12 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \ galGal4 galGal6 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \ galGal4 galGal6) > doLiftOverToGalGal6.log 2>&1 # real 47m3.714s # see if the liftOver menus function in the browser from galGal4 to galGal6 #########################################################################