# for emacs: -*- mode: sh; -*-

###########################################################################
# Build notes for eboVir3: download the GenBank records for the Ebola
# genome accessions, summarize their descriptions, write one UCSC-named
# fasta file per accession, and check the result for duplicate sequences.
#
# accession names obtained from Entrez with the query:
#
#    (ebola[title] OR ebolavirus[title]) AND genome[title]
#
# as of 2014-09-16 shows 158 sequences
#
###########################################################################
# fetch both fasta and gbk records: (DONE - 2014-09-16 - Hiram)

# stop here if the work directory is unavailable, rather than downloading
# into whatever directory happens to be current
mkdir -p /hive/data/genomes/eboVir3/genbank
cd /hive/data/genomes/eboVir3/genbank || exit 1

# save the 158 accession list from the Entrez query into the file:
#    158.strain.list

mkdir -p fasta gbk

# one accession per line; 'read -r' keeps any backslash literal and blank
# lines are skipped so they do not produce requests with an empty id
while read -r acc
do
  [ -n "$acc" ] || continue
  # wget -O leaves an empty/partial file behind on failure, so report it
  wget -O "fasta/${acc}.fa" \
    "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=${acc}" \
    || printf 'WARNING: fasta download failed for %s\n' "$acc" >&2
  wget -O "gbk/${acc}.gbk" \
    "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=gb&sendto=on&id=${acc}" \
    || printf 'WARNING: gbk download failed for %s\n' "$acc" >&2
done < 158.strain.list

###########################################################################
# scan descriptions to make a list (DONE - 2014-09-16 - Hiram)
#
cd /hive/data/genomes/eboVir3/genbank || exit 1

# using the perl script from the source tree, and removing
#    some of the extra verbiage:
# (the substitutions are applied in the order listed; the variant with
#  the leading blank must run before the one without it)

~/kent/src/hg/makeDb/doc/eboVir3/gbkDefinition.pl gbk/*.gbk \
  | sed -e 's# Zaire ebolavirus isolate Ebola virus H.sapiens-wt##' \
        -e 's#/SLE/2014/##' \
        -e 's#Zaire ebolavirus isolate Ebola virus H.sapiens-wt##' \
        -e 's#, partial genome.##' \
        -e 's#, complete genome.##' \
        -e 's#ebolavirus##' \
        -e 's#Ebola virus##'

# the output of that was used to construct:
#    src/hg/makeDb/doc/eboVir2/158.strain.descriptions.txt
#
# with a bit of editing to fill in some missing bits
#    src/hg/makeDb/doc/eboVir2/158.strain.descriptions.txt
#
###########################################################################
# construct UCSC names (DONE - 2014-09-16 - Hiram)

mkdir -p /hive/data/genomes/eboVir3/ucsc
cd /hive/data/genomes/eboVir3/ucsc || exit 1

for F in ../genbank/fasta/*.fa
do
  # an unmatched glob stays as the literal pattern; skip it
  [ -e "$F" ] || continue

  # the name is the fourth '|'-separated field of the first fasta header
  # line (presumably accession.version, e.g. KC242795.1, judging from the
  # names reported by twoBitDup below)
  gbName=$(awk -F'|' '/^>/ {print $4; exit}' "$F")
  if [ -z "$gbName" ]; then
    # e.g. an empty file from a failed download: do not create '.fa'
    printf 'WARNING: no accession in header of %s, skipping\n' "$F" >&2
    continue
  fi

  # UCSC name: first '.' becomes 'v' (KC242795.1 -> KC242795v1)
  ucscName=$(printf '%s\n' "$gbName" | sed -e 's/\./v/')
  printf '%s %s\n' "$gbName" "$ucscName"

  # rewrite the record with the UCSC name as its only header, then compress
  printf '>%s\n' "$ucscName" > "${ucscName}.fa"
  grep -v "^>" "$F" >> "${ucscName}.fa"
  gzip -f "${ucscName}.fa"
done

# check for duplicates:
faToTwoBit *.fa.gz t.2bit
twoBitDup t.2bit
# KC242795v1 and KC242793v1 are identical
# KC242797v1 and KC242793v1 are identical
# KC242798v1 and KC242794v1 are identical
# KM233068v1 and KM233065v1 are identical
# KM233078v1 and KM233073v1 are identical
# KM233094v1 and KM233081v1 are identical
# KM233108v1 and KM233037v1 are identical
# NC_002549v1 and AF086833v2 are identical
# NC_004161v1 and AF522874v1 are identical
# NC_006432v1 and AY729654v1 are identical
# NC_014372v1 and FJ217162v1 are identical
# NC_014373v1 and FJ217161v1 are identical

# convert that to strain names:
twoBitDup t.2bit \
  | sed -f ~/kent/src/hg/makeDb/doc/eboVir2/ucscName.strainName.sed
# 1Mbie_Gabon_1996 and 1Eko_1996 are identical
# 1Oba_Gabon_1996 and 1Eko_1996 are identical
# 1Ikot_Gabon_1996 and 2Nza_1996 are identical
# ManoR_G3769v4_2014 and ManoR_G3769v1_2014 are identical
# ManoR_G3796_2014 and ManoR_G3786_2014 are identical
# ManoR_G3820_2014 and ManoR_G3800_2014 are identical
# ManoR_G3845_2014 and ManoR_EM110_2014 are identical
# zaire_AF086833v2_1976 and zaire_AF086833v2_1976 are identical
# Pennsylvania_1990 and Reston_PA_1990 are identical
# Gulu_2000 and Gulu_Uganda_2000 are identical
# Cote_dIvoire_1994 and Cote_dIvoire_CIEBOV_1994 are identical
# Bundibugyo_2007 and Bundibugyo_Uganda_2007 are identical

# some of the sequences really are duplicates, just different
# names for the same thing, e.g.:
# NC_002549v1 and AF086833v2 are identical
# NC_004161v1 and AF522874v1 are identical
# NC_006432v1 and AY729654v1 are identical
# NC_014372v1 and FJ217162v1 are identical
# NC_014373v1 and FJ217161v1 are identical

# I'm not sure what's up with the other things

###########################################################################