#!/bin/tcsh -efx
# Script to create a relational version of UniProt database.  Should be run on
# hgwdev.
# Uses stripEvidence, spToDb, spDbAddVarSplice, makeTableDescriptions
#
# Interpreter flags: -e exit when a command fails, -f do not source the
# startup file, -x echo each command before running it.

# Release being built, the previous release (used only for the row-count
# comparison below), and the name of the database that will be created.
set DBDATE=170510
set OLDDBDATE=160229
set DB=sp$DBDATE

# Set up working directory
mkdir -p /hive/data/outside/uniProt/$DBDATE/build

# Download uniProt. This will take about 12 hours
cd /hive/data/outside/uniProt/$DBDATE/build
wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
#	549,072,194  955K/s   in 5m 42s
wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.dat.gz
#	51,786,464,623 9.50M/s   in 2h 43m
wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz
#	8,122,751   1.40M/s   in 6.5s

# Turn flat file into relational tab-separated files.
# strip out evidence tags
zcat *.dat.gz \
    | /cluster/home/jcasper/bin/x86_64/stripEvidence stdin stdout \
    | /cluster/home/jcasper/bin/x86_64/spToDb stdin ../tabFiles

# Record "<file> <lineCount>" for every tab file, sorted so it can be joined
# against the same listing from the previous release.
cd ../tabFiles
wc -l *.txt | awk '{print $2,$1}' | sort > counts

# Sanity check: print each table whose line count changed by more than a
# factor of two (new/old ratio > 2 or < 0.5) relative to $OLDDBDATE.
cd ../..
join $OLDDBDATE/tabFiles/counts $DBDATE/tabFiles/counts \
    | awk '{print $1, $3/$2}' \
    | awk '{if (($2 > 2) || ($2 < 0.5)) print}'
# feature.txt 2.33141
# featureId.txt 3.20125

# Create the database.
# (mm10 is only the database hgsql connects to in order to issue the statement.)
hgsql mm10 -e "create database $DB"

# Load it up with table definitions from source directory
hgsql $DB < ~/kent/src/hg/protein/spToDb/spDb.sql

# Load up the data from tab files.  This takes about an hour.
set startTime=`date +%s`
cd /hive/data/outside/uniProt/$DBDATE/tabFiles
foreach i (*.txt)
    hgsqlimport --local $DB $i
end
# NOTES:
# sp170510.extDbRef: Records: 1030353439  Deleted: 0  Skipped: 0  Warnings: 53250
# sp170510.reference: Records: 390385  Deleted: 0  Skipped: 0  Warnings: 1
set endTime=`date +%s`
# Elapsed load time in seconds.
expr $endTime - $startTime
# 38301

# Add varsplice info
# (the trailing "." makes spDbAddVarSplice write into the current directory,
# tabFiles)
zcat ../build/uniprot_sprot_varsplic.fasta.gz | spDbAddVarSplice $DB stdin .
# Append the var*.txt tab files to the existing tables of $DB (set at the top
# of this script to sp$DBDATE).  /dev/null is passed where hgLoadSqlTab takes
# its .sql argument; presumably no table definition is needed because -append
# adds rows to tables that already exist -- confirm against hgLoadSqlTab usage.
# NOTE(review): varProtein.txt is loaded into both varProtein and protein;
# this looks intentional, confirm.
hgLoadSqlTab $DB -notOnServer -append varProtein /dev/null varProtein.txt
hgLoadSqlTab $DB -notOnServer -append protein /dev/null varProtein.txt
hgLoadSqlTab $DB -notOnServer -append varAcc /dev/null varAcc.txt
hgLoadSqlTab $DB -notOnServer -append displayId /dev/null varDisplayId.txt
hgLoadSqlTab $DB -notOnServer -append accToTaxon /dev/null varAccToTaxon.txt
hgLoadSqlTab $DB -notOnServer -append geneLogic /dev/null varGeneLogic.txt
# varGene.txt gets one extra trailing column with the value "0" before it is
# loaded into the gene table.
tawk '{$(NF+1) = "0"; print}' varGene.txt \
    | hgLoadSqlTab $DB -notOnServer -append gene /dev/null stdin
hgLoadSqlTab $DB -notOnServer -append description /dev/null varDescription.txt

# Add table descriptions
makeTableDescriptions $DB ~/kent/src/hg/protein/spToDb/spDbTables.as

# Zip up tab files for people who prefer them to database.
gzip *.txt

# Don't forget to ask the admins to update the database softlink - "uniProt"
# should go to this newest version of the sp* database.

# Don't forget to update the proteins database next (see ../proteins/*.txt).