# for emacs: -*- mode: sh; -*- # This file describes how to construct a UCSC genome browser for # any genome sequence. Enter your genome information in the source tree # file: kent/src/hg/utils/phyloTrees/dbCommonScientificCladeOrderkey.tab # for example (note tab separated columns): # calMil1 Elephant shark Callorhinchus milii vertebrate chondrichthyes 4796 # 1. find your sequence at: # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes # for example: # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Callorhinchus_milii/Callorhinchus_milii-6.1.3/ # establish your DB name to use in the following commands: export DB=calMil1 ############################################################################# # create directory and download sequence mkdir -p /hive/data/genomes/${DB}/genbank cd /hive/data/genomes/${DB}/genbank # Verify this FTP directory appears to be correct: grep `echo $DB | sed -e 's/[0-9][0-9]*//'` $HOME/kent/src/hg/utils/phyloTrees/db.ncbiTaxId.tab | sort | cut -f5 | tail -1 | sed -e 's#/ASSEMB.*##' # if that looks good, then let it be used: grep `echo $DB | sed -e 's/[0-9][0-9]*//'` $HOME/kent/src/hg/utils/phyloTrees/db.ncbiTaxId.tab | sort | cut -f5 | tail -1 | sed -e 's#/ASSEMB.*##' | while read ftpDir do time rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/${ftpDir}/ ./ done ############################################################################# # establish UCSC naming scheme # setup the scientific name: cd /hive/data/genomes/${DB} grep ORGANISM: genbank/ASSEMBLY_INFO | cut -f2 > species.name.txt # When the assembly is composed only of unplaced scaffolds: time ~/kent/src/hg/utils/automation/unplacedScaffolds.pl ${DB} > step1.log 2>&1 # this constructs the files and directory: # -rw-rw-r-- 1 244682665 Mar 4 12:30 calMil1.unmasked.2bit # drwxrwxr-x 2 512 Mar 4 12:30 ucsc # -rw-rw-r-- 1 369313 Mar 4 12:30 chrom.sizes # check for a mitochondrion sequence if it isn't included in the download # query NCBI Nucleotide search for # " mitochondrion complete genome" # note the accession number, and fetch it: cd /hive/data/genomes/${DB}/ucsc export mitoAcc=NC_xxxxxx.x wget -O ${mitoAcc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc" echo ">chrM" > chrM.fa grep -v "^>" ${mitoAcc}.fa >> chrM.fa export mSize=`faCount chrM.fa | grep total | awk '{print $2}'` /bin/echo -e "chrM\t1\t$mSize\t1\tF\t$mitoAcc\t1\t$mSize\t+" > chrM.agp # then, combine this chrM.agp and chrM.fa sequence with the unplaced # agp and fa files mv ${DB}.ucsc.fa.gz ${DB}.ucsc.fa.0.gz (zcat ${DB}.ucsc.fa.0.gz; cat chrM.fa) | gzip -c > ${DB}.ucsc.fa.gz mv ${DB}.ucsc.agp ${DB}.ucsc.0.agp cat ${DB}.ucsc.0.agp chrM.agp > ${DB}.ucsc.agp # faToTwoBit on the new sequence, checkAgpAndFa with the new agp # replace the ${DB}.unmasked.2bit and ${DB}.agp at the top faToTwoBit ${DB}.ucsc.fa.gz ../${DB}.unmasked.2bit twoBitInfo ../${DB}.unmasked.2bit stdout | sort -k2nr > ../chrom.sizes checkAgpAndFa ${DB}.ucsc.agp ../${DB}.unmasked.2bit 2>&1 | tail # if OK, remove the old ones: rm ${DB}.ucsc.fa.0.gz ${DB}.ucsc.0.agp cp -p ucsc/${DB}.ucsc.agp ./${DB}.agp ############################################################################# # construct minimal database mkdir -p /hive/data/genomes/${DB}/bed/chromInfo cd /hive/data/genomes/${DB} awk '{print $1 "\t" $2 "\t/gbdb/'$DB/$DB'.2bit";}' chrom.sizes \ > bed/chromInfo/chromInfo.tab hgsql -e "create database $DB;" test hgGoldGapGl -noGl ${DB} ${DB}.agp cut -f1 bed/chromInfo/chromInfo.tab | awk '{print length($0)}' \ | sort -nr > bed/chromInfo/t.chrSize export chrSize=`head -1 bed/chromInfo/t.chrSize` sed -e "s/chrom(16)/chrom($chrSize)/" \ ${HOME}/kent/src/hg/lib/chromInfo.sql > bed/chromInfo/chromInfo.sql rm -f bed/chromInfo/t.chrSize hgLoadSqlTab ${DB} chromInfo bed/chromInfo/chromInfo.sql \ bed/chromInfo/chromInfo.tab hgsql $DB < $HOME/kent/src/hg/lib/grp.sql mkdir -p /gbdb/${DB} ln -s /hive/data/genomes/${DB}/${DB}.unmasked.2bit /gbdb/${DB}/${DB}.2bit mkdir $HOME/kent/src/hg/makeDb/trackDb/assemblyHubs/${DB} cd $HOME/kent/src/hg/makeDb/trackDb make EXTRA="-strict" DBS=$DB make GIT=echo EXTRA="-strict" DBS=$DB alpha rm -f /gbdb/$DB/trackDb.ix /gbdb/$DB/trackDb.ixx ############################################################################# # add dbDb/clade/genomeClade entries to add browser to selection menus # on genome-test mkdir /hive/data/genomes/${DB}/dbDb cd /hive/data/genomes/${DB}/dbDb export asmInfoPath="/hive/data/genomes/${DB}/genbank/ASSEMBLY_INFO" export dbDbInfo="${HOME}/kent/src/hg/utils/phyloTrees/dbCommonScientificCladeOrderkey.tab" export chromSizes="/hive/data/genomes/${DB}/chrom.sizes" export asmDate=`grep "^DATE:" ${asmInfoPath} | cut -f2 | sed -e 's/^[0-9]+-//; s/-/ /g;'` export asmShortLabel=`grep "^ASSEMBLY SHORT NAME:" ${asmInfoPath} | cut -f2`; export asmSciName=`grep "^ORGANISM:" ${asmInfoPath} | cut -f2`; export asmTaxID=`grep "^TAXID:" ${asmInfoPath} | cut -f2`; export asmSourceName=`grep "^ASSEMBLY SUBMITTER:" ${asmInfoPath} | cut -f2`; export asmAccession=`grep "^ASSEMBLY ACCESSION:" ${asmInfoPath} | cut -f2`; export orderKey=`grep "^$DB" $dbDbInfo | cut -f6`; export clade=`grep "^$DB" $dbDbInfo | cut -f4`; export commonName=`grep "^$DB" $dbDbInfo | cut -f2`; export chrName=`head -1 $chromSizes | cut -f1`; export chrLength=`head -1 $chromSizes | cut -f2`; export chrStart=`echo $chrLength | awk '{printf "%d", ($1/2)-($1/10)}'` export chrEnd=`echo $chrLength | awk '{printf "%d", ($1/2)+($1/10)}'` export defaultPos="${chrName}:${chrStart}-${chrEnd}" echo "DELETE from dbDb where name = \"$DB\"; INSERT INTO dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, htmlPath, hgNearOk, hgPbOk, sourceName, taxId) VALUES (\"$DB\", \"$asmDate ($asmShortLabel/$DB)\", \"/gbdb/$DB\", \"$commonName\", \"$defaultPos\", 1, $orderKey, \"$commonName\", \"$asmSciName\", \"/gbdb/$DB/html/description.html\", 0, 0, \"$asmSourceName $asmAccession\", $asmTaxID);" > ${DB}.dbDb.sql echo \ "INSERT INTO defaultDb (genome, name) VALUES (\"${commonName}\", \"$DB\");" \ > ${DB}.defaultDb.sql case $clade in "primate") echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"16\");" ;; "mammal") echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"35\");" ;; "rodent") echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"40\");" ;; "vertebrate") echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"70\");" ;; "insect") echo "INSERT INTO genomeClade (genome, clade, priority) VALUES (\"${commonName}\", \"$clade\", \"70\");" ;; esac > ${DB}.genomeClade.sql # take a look at these to verify they are sane: cat ${DB}.*.sql # If they appear to be OK, load them into dbDb: cat ${DB}.*.sql | hgsql hgcentraltest ############################################################################# # the three repeat procedures can all start at the same time. # the windowMasker may end with an error if it finishes before # repeatMasker since it tries to compare its result with the RM result # but this isn't a fatal flaw, it is just a featureBits that doesn't finish ############################################################################# # running repeat masker: mkdir -p /hive/data/genomes/${DB}/bed/repeatMasker cd /hive/data/genomes/${DB}/bed/repeatMasker export sciName=`cat ../../species.name.txt` $ the useRMBlastn is the faster alignment engine time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -species "$sciName" -useRMBlastn \ -smallClusterHub=ku ${DB}) > do.log 2>&1 & ############################################################################# # running simple repeats: mkdir /hive/data/genomes/${DB}/bed/simpleRepeat cd /hive/data/genomes/${DB}/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` \ -smallClusterHub=ku -workhorse=hgwdev ${DB}) > do.log 2>&1 & ############################################################################# # running window masker: mkdir /hive/data/genomes/${DB}/bed/windowMasker cd /hive/data/genomes/${DB}/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev ${DB}) > do.log 2>&1 & ############################################################################# # check gaps mkdir /hive/data/genomes/${DB}/bed/gap cd /hive/data/genomes/${DB}/bed/gap findMotif -motif=gattaca -verbose=4 -strand=+ ../../${DB}.unmasked.2bit > findMotif.txt 2>&1 grep "#GAP " findMotif.txt | sed -e "s/^#GAP //" | sort -k1,1 -k2,2n > allGaps.bed awk '{printf "%s\t%d\t'${DB}'.2bit\n", $1, $2}' ../../chrom.sizes > chromInfo.tab featureBits test -chromSize=chromInfo.tab -not -bed=notGap.bed allGaps.bed 2> /dev/null awk '{print $3-$2}' allGaps.bed | ave stdin > gap.stats.txt ############################################################################# # decide to use WindowMasker or RepeatMasker result # usually on whichever is the greater mask, but use RM when # the results are similar cd /hive/data/genomes/${DB} # to use Repeat Masker result: twoBitMask bed/repeatMasker/${DB}.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed ${DB}.2bit # the warning about the bed file >=13 fields is OK # to use Window Masker result: twoBitMask bed/windowMasker/${DB}.cleanWMSdust.2bit \ -add bed/simpleRepeat/trfMask.bed ${DB}.2bit # measure the results from either masking: twoBitToFa ${DB}.2bit stdout | faSize stdin > faSize.${DB}.2bit.txt # and reset the symlink rm /gbdb/${DB}/${DB}.2bit ln -s /hive/data/genomes/${DB}/${DB}.2bit /gbdb/${DB}/${DB}.2bit ############################################################################# # after the masked sequence is established, the following tracks can be created ############################################################################# # genscan track mkdir /hive/data/genomes/${DB}/bed/genscan cd /hive/data/genomes/${DB}/bed/genscan time (doGenscan.pl -buildDir=`pwd` -bigClusterHub=ku -workhorse=hgwdev \ -dbHost=hgwdev $DB) > do.log 2>&1 ############################################################################# # CPG Islands track mkdir /hive/data/genomes/${DB}/bed/cpgIslands cd /hive/data/genomes/${DB}/bed/cpgIslands time (doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -smallClusterHub=ku -workhorse=hgwdev $DB) > do.log 2>&1 ############################################################################# # CPG Islands Unmasked track mkdir /hive/data/genomes/${DB}/bed/cpgIslandsUnmasked cd /hive/data/genomes/${DB}/bed/cpgIslandsUnmasked time (doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \ -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \ -workhorse=hgwdev \ -maskedSeq=/hive/data/genomes/${DB}/${DB}.unmasked.2bit $DB) > do.log 2>&1 ############################################################################# # GC 5-base track mkdir /hive/data/genomes/${DB}/bed/gc5Base cd /hive/data/genomes/${DB}/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 ${DB} \ /hive/data/genomes/${DB}/${DB}.unmasked.2bit \ | gzip -c > ${DB}.gc5Base.wigVarStep.gz wigToBigWig ${DB}.gc5Base.wigVarStep.gz ../../chrom.sizes ${DB}.gc5Base.bw mkdir -p /gbdb/${DB}/bbi/gc5BaseBw rm -f /gbdb/${DB}/bbi/gc5BaseBw/gc5Base.bw ln -s `pwd`/${DB}.gc5Base.bw /gbdb/${DB}/bbi/gc5BaseBw/gc5Base.bw hgsql ${DB} -e "DROP TABLE IF EXISTS gc5BaseBw; CREATE TABLE gc5BaseBw (fileName varchar(255) not null); INSERT INTO gc5BaseBw VALUES (\"/gbdb/${DB}/bbi/gc5BaseBw/gc5Base.bw\");" #############################################################################