#!/bin/tcsh -efx
# Script to create a relational version of UniProt database.  Should be run on
# hgwdev.
#
# tcsh flags on the line above: -e exits on the first failing command,
# -f skips the startup files, -x echoes each command before it runs.

# Date stamp of the release being built, and the database it is loaded into.
set DBDATE=150225
set DB=sp$DBDATE
# Date stamp of the earlier build whose row counts are compared against below.
set PREVDATE=140122

# Set up working directory
mkdir -p /hive/data/outside/uniProt/$DBDATE/build

# Download uniProt. This will take about 12 hours
# NOTE(review): the transfer summaries recorded under each wget below add up
# to a little over an hour for this release, so the 12-hour figure looks stale.
cd /hive/data/outside/uniProt/$DBDATE/build
wget ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
#  523,470,864 16.0M/s   in 41s
wget ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.dat.gz
#  50,308,178,357 12.4M/s   in 67m 44s
wget ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz
#  7,860,658   1.95M/s   in 4.2s

# Turn flat file into relational tab-separated files.
# strip out evidence tags
zcat *.dat.gz \
    | /cluster/home/braney/bin/x86_64/stripEvidence stdin stdout \
    | /cluster/home/braney/bin/x86_64/spToDb stdin ../tabFiles

# Record "<file> <lineCount>" for every tab file, sorted by file name so the
# result can be fed to join.
cd ../tabFiles
wc -l *.txt | awk '{print $2,$1}' | sort > counts

# Sanity check against the previous build: print every table whose row count
# grew by more than 2x or shrank to less than half (ratio = new / previous).
cd ../..
join $PREVDATE/tabFiles/counts $DBDATE/tabFiles/counts \
    | awk '{print $1, $3/$2}' \
    | awk '{if (($2 > 2) || ($2 < 0.5)) print}'
# Output recorded for this build:
# accToKeyword.txt 2.17571
# accToTaxon.txt 3.43881
# commonName.txt 0.136933
# taxon.txt 0.0283836

# Create the database.
# (mm10 is only the database hgsql connects through to issue the statement.)
hgsql mm10 -e "create database $DB"

# Load it up with table definitions from source directory
hgsql $DB < ~/kent/src/hg/protein/spToDb/spDb.sql

# Load up the data from tab files. This takes about an hour.
# NOTE(review): the elapsed time recorded below is 23604 seconds (~6.5 hours),
# so the one-hour estimate looks stale.
set startTime=`date +%s`
cd /hive/data/outside/uniProt/$DBDATE/tabFiles
foreach tabFile (*.txt)
    hgsqlimport --local $DB $tabFile
end
set endTime=`date +%s`
# Print the elapsed wall-clock seconds for the import loop.
expr $endTime - $startTime
# 23604

# Add varsplice info
zcat ../build/uniprot_sprot_varsplic.fasta.gz | spDbAddVarSplice $DB stdin .
# Append the variant-splice tab files to the tables in $DB (sp$DBDATE).
# The var*.txt inputs are presumably the files spDbAddVarSplice wrote into the
# current directory in the preceding step -- confirm.
# /dev/null is passed in the .sql-definition position; with -append the rows
# are added to tables that are assumed to exist already -- confirm against
# hgLoadSqlTab usage.
# NOTE(review): varProtein.txt is loaded twice as written, once into
# varProtein and once into protein; kept exactly as in the original.
hgLoadSqlTab $DB -notOnServer -append varProtein /dev/null varProtein.txt
hgLoadSqlTab $DB -notOnServer -append protein /dev/null varProtein.txt
hgLoadSqlTab $DB -notOnServer -append varAcc /dev/null varAcc.txt
hgLoadSqlTab $DB -notOnServer -append displayId /dev/null varDisplayId.txt
hgLoadSqlTab $DB -notOnServer -append accToTaxon /dev/null varAccToTaxon.txt
hgLoadSqlTab $DB -notOnServer -append geneLogic /dev/null varGeneLogic.txt
hgLoadSqlTab $DB -notOnServer -append gene /dev/null varGene.txt
hgLoadSqlTab $DB -notOnServer -append description /dev/null varDescription.txt

# Add table descriptions
makeTableDescriptions $DB ~/kent/src/hg/protein/spToDb/spDbTables.as

# Zip up tab files for people who prefer them to database.
gzip *.txt