# for emacs: -*- mode: sh; -*- # This file describes browser build for the chrPic2 # Western Painted Turtle - Chrysemys picta bellii - Jan 2012 # http://www.ncbi.nlm.nih.gov/assembly/49541 # http://www.ncbi.nlm.nih.gov/genome/24706 # http://www.ncbi.nlm.nih.gov/bioproject/PRJNA78657 # http://www.ncbi.nlm.nih.gov/bioproject/PRJNA210179 - chrMt NC_002073.3 # http://www.ncbi.nlm.nih.gov/nuccore/591771112/ # ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Chrysemys_picta/Chrysemys_picta_bellii-3.0.3/ # DATE: 31-Mar-2014 # ORGANISM: Chrysemys picta bellii # TAXID: 8478 # ASSEMBLY LONG NAME: Chrysemys_picta_bellii-3.0.3 # ASSEMBLY SHORT NAME: Chrysemys_picta_bellii-3.0.3 # ASSEMBLY SUBMITTER: Painted turtle genome sequencing consortium # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000241765.2 # FTP-RELEASE DATE: 01-Apr-2014 # Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; Coelomata; # Deuterostomia; Chordata; Craniata; Vertebrata; Gnathostomata; # Teleostomi; Euteleostomi; Sarcopterygii; Tetrapoda; Amniota; # Sauropsida; Testudines; Cryptodira; Testudinoidea; Emydidae; # Chrysemys; Chrysemys picta ############################################################################# # Fetch sequence from genbank (DONE - 2014-04-07 - Hiram) mkdir -p /hive/data/genomes/chrPic2/genbank cd /hive/data/genomes/chrPic2/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Chrysemys_picta/Chrysemys_picta_bellii-3.0.3/ ./ faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2365749696 bases (192562473 N's 2173187223 real 2173187223 upper # 0 lower) in 78594 sequences in 19 files # Total size: mean 30100.9 sd 710121.5 # min 294 (gi|591641262|gb|AHGY02129839.1|) # max 77392008 (gi|602278193|gb|CM002655.1|) median 754 mkdir -p /hive/data/genomes/chrPic2/ucsc cd /hive/data/genomes/chrPic2/ucsc export mitoAcc=NC_002073.3 wget -O ${mitoAcc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc" echo ">chrM" > chrM.fa grep -v "^>" ${mitoAcc}.fa >> chrM.fa export mSize=`faCount chrM.fa | grep total | awk '{print $2}'` /bin/echo -e "chrM\t1\t$mSize\t1\tF\t$mitoAcc\t1\t$mSize\t+" > chrM.agp faCount chrM.fa # #seq len A C G T N cpg # chrM 16866 5810 4376 2165 4515 0 331 ############################################################################# # process into UCSC naming scheme (DONE - 2012-01-12 - Hiram) mkdir /hive/data/genomes/chrPic2/ucsc cd /hive/data/genomes/chrPic2/ucsc cat << '_EOF_' > ucscCompositeAgp.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x ucscCompositeAgp.pl time ./ucscCompositeAgp.pl #### construct chrUn .fa and .agp cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chrUn.agp") or die "can not write to chrUn.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\./v/; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chrUn.fa") or die "can not write to chrUn.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/. Chrysemys.*//; $line =~ s/\./v/; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # -rw-rw-r-- 1 23867717 Apr 8 15:07 chrUn.agp # -rw-rw-r-- 1 1932948955 Apr 8 15:07 chrUn.fa gzip *.fa *.agp # verify nothing lost in the translation, should be the same as above # except for the name translations and the addition of chrM faSize chr*.fa.gz # 2365766562 bases (192562473 N's 2173204089 real 2173204089 upper # 0 lower) in 78595 sequences in 20 files # Total size: mean 30100.7 sd 710117.0 min 294 (chrUn_AHGY02129839v1) # max 77392008 (chr1) median 754 ############################################################################# # Initial browser build (DONE - 2012-03-27 - Hiram) cd /hive/data/genomes/chrPic2 cat << '_EOF_' > chrPic2.config.ra # Config parameters for makeGenomeDb.pl: db chrPic2 clade vertebrate genomeCladePriority 50 scientificName Chrysemys picta bellii commonName Painted Turtle assemblyDate Dec. 2011 assemblyLabel International Painted Turtle Genome Sequencing Consortium (GCA_000241765.1) assemblyShortLabel v3.0.1 ncbiAssemblyName Chrysemys_picta_bellii-3.0.1 ncbiAssemblyId 326468 orderKey 4000 # chrM already included mitoAcc none fastaFiles /hive/data/genomes/chrPic2/ucsc/*.fa.gz agpFiles /hive/data/genomes/chrPic2/ucsc/*.agp.gz dbDbSpeciesDir turtle taxId 8478 '_EOF_' # << happy emacs # the makeGenomeDb.pl script wasn't used to build this assembly. # The steps in quickAssembly.txt were performed to get the database # established. The steps for makeGenomeDb.pl were gone through # in a debug mode and therefore verified they have been completed. time makeGenomeDb.pl -workhorse=hgwdev -stop=agp chrPic2.config.ra \ > agp.log 2>&1 # real 2m2.893s # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db chrPic2.config.ra > db.log 2>&1 # real 17m30.996s ############################################################################# # running repeat masker (DONE - 2014-04-09 - Hiram) mkdir /hive/data/genomes/chrPic2/bed/repeatMasker cd /hive/data/genomes/chrPic2/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku chrPic2 > do.log 2>&1 & # real 152m29.224s cat faSize.rmsk.txt # 2365766562 bases (192562473 N's 2173204089 real 1959124115 upper # 214079974 lower) in 78595 sequences in 1 files # Total size: mean 30100.7 sd 710117.0 min 294 (chrUn_AHGY02129839v1) # max 77392008 (chr1) median 754 # %9.05 masked total, %9.85 masked real grep -i versi do.log # RepeatMasker version open-4.0.3 # June 20 2013 (open-4-0-3) version of RepeatMasker featureBits -countGaps chrPic2 rmsk # 214213191 bases of 2365766562 (9.055%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, this featureBits count # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2014-04-09 - Hiram) mkdir /hive/data/genomes/chrPic2/bed/simpleRepeat cd /hive/data/genomes/chrPic2/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ chrPic2 > do.log 2>&1 & # real 43m42.360s cat fb.simpleRepeat # 21563230 bases of 2158293365 (0.999%) in intersection # not going to add to rmsk here, using the window masker instead since # it masks more sequence ######################################################################### # CREATE MICROSAT TRACK (DONE - 2017-05-08 - Hiram) ssh hgwdev mkdir /cluster/data/chrPic2/bed/microsat cd /cluster/data/chrPic2/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed chrPic2 microsat microsat.bed # Read 24434 elements of size 4 from microsat.bed ########################################################################## # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-03-27 - Hiram) mkdir /hive/data/genomes/chrPic2/bed/gap cd /hive/data/genomes/chrPic2/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../chrPic2.unmasked.2bit > findMotif.txt 2>&1 # real 0m23.801s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits -countGaps chrPic2 -not gap -bed=notGap.bed # 2158293365 bases of 2589745704 (83.340%) in intersection # real 0m21.340s time featureBits -countGaps chrPic2 allGaps.bed notGap.bed -bed=new.gaps.bed # 3619 bases of 2589745704 (0.000%) in intersection # real 216m26.278s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" chrPic2 | sort -n | tail -1 # 8964 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" chrPic2 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps chrPic2 other.bed # 3619 bases of 2589745704 (0.000%) in intersection wc -l other.bed # 3589 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad chrPic2 otherGap other.bed # Read 3589 elements of size 8 from other.bed # starting with this many hgsql -e "select count(*) from gap;" chrPic2 # 470729 hgsql chrPic2 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" chrPic2 # 474318 # == 470729 + 3589 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 chrPic2 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" chrPic2 | sort | uniq -c # 474318 yes ########################################################################## ## WINDOWMASKER (DONE - 2014-04-10 - Hiram) mkdir /hive/data/genomes/chrPic2/bed/windowMasker cd /hive/data/genomes/chrPic2/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev chrPic2 > do.log 2>&1 & # real 225m32.368s # Masking statistics cat faSize.chrPic2.cleanWMSdust.txt # 2365766562 bases (192562473 N's 2173204089 real 1488430331 upper # 684773758 lower) in 78595 sequences in 1 files # Total size: mean 30100.7 sd 710117.0 min 294 (chrUn_AHGY02129839v1) # max 77392008 (chr1) median 754 # %28.95 masked total, %31.51 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps chrPic2 rmsk windowmaskerSdust # 140761128 bases of 2365766562 (5.950%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2014-04-10 - Hiram) cd /hive/data/genomes/chrPic2 twoBitMask -add bed/windowMasker/chrPic2.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed chrPic2.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa chrPic2.2bit stdout | faSize stdin > faSize.chrPic2.txt cat faSize.chrPic2.txt # 2365766562 bases (192562473 N's 2173204089 real 1488302752 upper # 684901337 lower) in 78595 sequences in 1 files # Total size: mean 30100.7 sd 710117.0 min 294 (chrUn_AHGY02129839v1) # max 77392008 (chr1) median 754 # %28.95 masked total, %31.52 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/chrPic2/chrPic2.2bit ln -s `pwd`/chrPic2.2bit /gbdb/chrPic2/chrPic2.2bit ######################################################################### # cpgIslands - (DONE - 2014-04-16 - Hiram) mkdir /hive/data/genomes/chrPic2/bed/cpgIslands cd /hive/data/genomes/chrPic2/bed/cpgIslands time doCpgIslands.pl chrPic2 > do.log 2>&1 # Elapsed time: 114m58s cat fb.chrPic2.cpgIslandExt.txt # 16307343 bases of 2173204089 (0.750%) in intersection ######################################################################### # genscan - (DONE - 2014-04-16 - Hiram) mkdir /hive/data/genomes/chrPic2/bed/genscan cd /hive/data/genomes/chrPic2/bed/genscan time doGenscan.pl chrPic2 > do.log 2>&1 # Elapsed time: 534m29s cat fb.chrPic2.genscan.txt # 41715267 bases of 2173204089 (1.920%) in intersection cat fb.chrPic2.genscanSubopt.txt # 49704463 bases of 2173204089 (2.287%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2014-04-29 - Hiram) # Use -repMatch=800, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2173204089 / 2897316137 \) \* 1024 # ( 2173204089 / 2897316137 ) * 1024 = 768.076690 # round up to 800 cd /hive/data/genomes/chrPic2 time blat chrPic2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/chrPic2.11.ooc -repMatch=800 # Wrote 14175 overused 11-mers to jkStuff/chrPic2.11.ooc # real 0m40.913s # there are non-bridged gaps, create lift file for genbank hgsql -N -e "select bridge from gap;" chrPic2 | sort | uniq -c # 56 no # 183695 yes cd /hive/data/genomes/chrPic2/jkStuff gapToLift chrPic2 chrPic2.nonBridged.lift -bedFile=chrPic2.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' chrPic2.nonBridged.bed | sort -nr | head # 29146328 chr3 33862197 63008525 chr3.03 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2014-04-29 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Chrysemys picta 50 0 8 # Chrysemys picta marginata 1 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add chrPic2 just before chrPic1 # chrPic2 (green painted turtle) chrPic2.serverGenome = /hive/data/genomes/chrPic2/chrPic2.2bit chrPic2.clusterGenome = /hive/data/genomes/chrPic2/chrPic2.2bit chrPic2.ooc = /hive/data/genomes/chrPic2/jkStuff/chrPic2.11.ooc chrPic2.lift = /hive/data/genomes/chrPic2/jkStuff/chrPic2.nonBridged.lift chrPic2.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} chrPic2.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} chrPic2.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} chrPic2.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} chrPic2.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} chrPic2.refseq.mrna.native.load = no chrPic2.refseq.mrna.xeno.load = yes chrPic2.genbank.mrna.xeno.load = yes chrPic2.genbank.est.native.load = no chrPic2.downloadDir = chrPic2 chrPic2.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding chrPic2 painted turtle refs #13166" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S chrPic2 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial chrPic2 & # var/build/logs/2014.04.29-14:57:04.chrPic2.initalign.log # real 992m15.247s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad chrPic2 & # logFile: # var/dbload/hgwdev/logs/2014.04.30-16:46:08.chrPic2.dbload.log # real 51m42.197s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add chrPic2 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added chrPic2 refs #13166" etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram) # XXX - redone correctly 2017-05-08 see later instructions XXX ##### mkdir /hive/data/genomes/chrPic2/bed/ucscToINSDC cd /hive/data/genomes/chrPic2/bed/ucscToINSDC # copying these scripts from the previous load and improving them cp -p ../../../sorAra2/bed/ucscToINSDC/translateNames.sh . # with each instance ./translateNames.sh NC_002073.3 # needs to be sorted to work with join sort ucscToINSDC.txt > ucscToINSDC.tab awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \ > name.coordinate.tab join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 20 # use the 20 in this sed: sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab chrPic2 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords chrPic2 ucscToINSDC # should cover all bases featureBits -countGaps chrPic2 ucscToINSDC # 2365766562 bases of 2365766562 (100.000%) in intersection # verify the track link to INSDC functions ############################################################################## # cytoBandIdeo - (DONE - 2017-05-08 - Hiram) mkdir /hive/data/genomes/chrPic2/bed/cytoBand cd /hive/data/genomes/chrPic2/bed/cytoBand makeCytoBandIdeo.csh chrPic2 ############################################################################# # fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram) hgsql -N -e "select frag from gold;" chrPic2 | sort | head -1 AHGY02000001.1 hgsql -N -e "select frag from gold;" chrPic2 | sort | tail -2 AHGY02262325.1 NC_002073.3 # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" chrPic2 | wc -l # 262326 hgsql -N -e "select frag from gold;" chrPic2 | egrep -e '[AN][HC][G_][Y0]0[0-9]+(\.1)?' | wc -l # 262326 hgsql -N -e "select frag from gold;" chrPic2 | egrep -v -e '[AN][HC][G_][Y0]0[0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/turtle/chrPic2/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][HC][G_][Y0]0[0-9]+(\.1)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # note: this rule is already in trackDb/turtle/trackDb.ra ######################################################################### # lastz chainNet swap mm10/mouse (DONE - 2017-04-06 - Hiram) # original alignment cd /hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05 cat fb.mm10.chainChrPic2Link.txt # 112560591 bases of 2652783500 (4.243%) in intersection # and for the swap mkdir /hive/data/genomes/chrPic2/bed/blastz.mm10.swap cd /hive/data/genomes/chrPic2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 12m2.676s cat fb.chrPic2.chainMm10Link.txt # 106063993 bases of 2173204089 (4.881%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` chrPic2 mm10) \ > rbest.log 2>&1 & # real 110m9.546s ######################################################################### # ucscToINSDC and ucscToRefSeq table/track (DONE - 2017-05-08 - Hiram) # the sequence here is working for a 'refseq' assembly # beware of a chrM situation may be specific depending upon what is # available in the assembly # normal procedure did not work here since our names were different # than the genbank(INSDC) names mkdir /hive/data/genomes/chrPic2/bed/ucscToRefSeq cd /hive/data/genomes/chrPic2/bed/ucscToRefSeq # run up idKeys for the refseq release mkdir refseqIdKeys cd refseqIdKeys doIdKeys.pl -buildDir=`pwd` -twoBit=refseq.2bit refseqChrPic303 # that can be joined to UCSC to get that reference join refseqChrPic303.idKeys.txt ../../idKeys/chrPic2.idKeys.txt \ | awk '{printf "%s\t%s\n", $3,$2 }' | sort -k2,2 > ../ucscToRefSeq.txt # find accession for chrM grep chrM ../../chrPic2.agp # chrM 1 16866 1 F NC_002073.3 1 16866 + # find the genbank accession for NC_002073.3 at Entrez nucleotide # in this case, this NC_002073.3 is not used in either the genbank or # the refseq assembly. From Entrez, the equivalent genbank name is # AF069423.1 # if there is a chrM, use its RefSeq name as a second argument: # this is a RefSeq assembly, use the chrM refSeq name: # ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ # ../../refseq/GCF_*structure/Primary_Assembly NC_002073.3 # this is actually ucscToRefSeq since this is a RefSeq assembly # sort -k2 ucscToINSDC.txt > ucscToRefSeq.txt # rm -f ucscToINSDC.txt awk '{printf "%s\t%s\n", $2, $1}' ucscToRefSeq.txt \ | sort > refSeqToUcsc.txt # chrM processing needs special help, fixup with the sed # extract the refseq vs. genbank names from the assembly_report # columns 5 and 7 are the INSDC and RefSeq names grep -v "^#" ../../refseq/GCF*_assembly_report.txt | cut -f5,7 \ | awk '{printf "%s\t%s\n", $2, $1}' | sed -e 's/na/AF069423.1/' \ | sed -e 's/NC_023890.1/NC_002073.3/;' | sort > refseq.insdc.txt awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > ucsc.coordinate.tab join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 ucsc.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-4 \ > ucscToRefSeq.bed printf "chrM\t0\t16866\tNC_002073.3\n" >> ucscToRefSeq.bed join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 ucsc.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \ > ucscToINSDC.bed printf "chrM\t0\t16866\tAF069423.1\n" >> ucscToINSDC.bed # verify chrM is correct: grep chrM *.bed # ucscToINSDC.bed:chrM 0 16866 AF069423.1 # ucscToRefSeq.bed:chrM 0 16866 NC_002073.3 # should be same line counts throughout: (except for chrM sometimes) wc -l * # 78594 refSeqToUcsc.txt # 78595 refseq.insdc.txt # 78595 ucsc.coordinate.tab # 78595 ucscToINSDC.bed # 78595 ucscToRefSeq.bed # 78594 ucscToRefSeq.txt export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 20 # use the 25 in this sed sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab chrPic2 ucscToINSDC stdin ucscToINSDC.bed # should be the same for ucscToRefSeq: export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 20 sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab chrPic2 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed # checkTableCoords should be silent checkTableCoords chrPic2 # each should cover %100 entirely: featureBits -countGaps chrPic2 ucscToINSDC # 1440398454 bases of 1440398454 (100.000%) in intersection featureBits -countGaps chrPic2 ucscToRefSeq # 1440398454 bases of 1440398454 (100.000%) in intersection ######################################################################### # add chromAlias table (DONE - 2017-03-27 - Hiram) mkdir /hive/data/genomes/chrPic2/bed/chromAlias cd /hive/data/genomes/chrPic2/bed/chromAlias hgsql -N -e 'select chrom,name,"refseq" from ucscToRefSeq;' chrPic2 \ > ucsc.refseq.tab hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' chrPic2 \ > ucsc.genbank.tab awk '{printf "%s\t%s\t%s\n", $2,$1,$3}' ucsc.genbank.tab ucsc.refseq.tab \ | sort > chrPic2.chromAlias.tab hgLoadSqlTab chrPic2 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ chrPic2.chromAlias.tab ######################################################################### # create pushQ entry (DONE - 2017-05-08 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema joinerCheck -database=chrPic2 -keys all.joiner mkdir /hive/data/genomes/chrPic2/pushQ cd /hive/data/genomes/chrPic2/pushQ makePushQSql.pl chrPic2 > chrPic2.sql 2> stderr.out # check stderr.out for no significant problems, it is common to see: # WARNING: hgwdev does not have /gbdb/chrPic2/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/chrPic2/wib/quality.wib # WARNING: hgwdev does not have /gbdb/chrPic2/bbi/quality.bw # WARNING: chrPic2 does not have seq # WARNING: chrPic2 does not have extFile # which are no real problem # if some tables are not identified: # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # # put them in manually after loading the pushQ entry scp -p chrPic2.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < chrPic2.sql ######################################################################### # all.joiner update, downloads and in pushQ - (DONE - 2017-05-08 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=chrPic2 -tableCoverage all.joiner joinerCheck -database=chrPic2 -times all.joiner joinerCheck -database=chrPic2 -keys all.joiner cd /hive/data/genomes/chrPic2 time (makeDownloads.pl -workhorse=hgwdev chrPic2) > downloads.log 2>&1 # real 22m35.669s # now ready for pushQ entry mkdir /hive/data/genomes/chrPic2/pushQ cd /hive/data/genomes/chrPic2/pushQ time (makePushQSql.pl -redmineList chrPic2) > chrPic2.pushQ.sql 2> stderr.out # real 7m18.635s # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/chrPic2/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/chrPic2/wib/quality.wib # WARNING: hgwdev does not have /gbdb/chrPic2/bbi/quality.bw # WARNING: chrPic2 does not have seq # WARNING: chrPic2 does not have extFile #########################################################################