# These new sequences are found at NCBI Entriz in the Nucleotide search: # (ebola[title] or ebolavirus[title]) and genome # # download the resulting list of accessions and compare to the existing # list to see what it new to be done. # adding new sequences to the eboVir3 browser in a format similar # to the multiple alignment # this procedure can be repeated to add new sequences mkdir /hive/data/genomes/eboVir3/bed/newSequences cd /hive/data/genomes/eboVir3/bed/newSequences # added 2015-01-30 # KP342330.1 # KP658432.1 # added 2015-01-06 # KP260799.1 # KP260800.1 # KP260801.1 # KP260802.1 # KP271018.1 # KP271019.1 # KP271020.1 # added 2014-12-03 # KP178538.1 # KP184503.1 # DI389180.1-does not work, this is the vaccine, no protein sequence to annotate # DI389182.1-does not work, this is the vaccine, no protein sequence to annotate # added 2014-11-18: # KP096420.1 # KP096421.1 # KP096422.1 # KP120616.1 # add new sequence genbank accession identifiers to new.acc.list # as of 2014-11-10: # KM519951.1 # KM655246.1 # NC_016144.1 ####################################################################### # obtain sequences and genbank records from NCBI: mkdir -p fasta gbk ucsc 2bit cat new.acc.list | while read acc do export accV=`echo $acc | sed -e 's/\./v/;'` if [ ! -s fasta/${acc}.fa ]; then echo "# fetch: ${acc} -> ${accV}" wget -O fasta/${acc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$acc" echo ">${accV}" > ucsc/${accV}.fa grep -v '^>' fasta/${acc}.fa >> ucsc/${accV}.fa faToTwoBit ucsc/${accV}.fa 2bit/${accV}.2bit fi if [ ! -s gbk/${accV}.gbk ]; then wget -O gbk/${accV}.gbk \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=gb&sendto=on&id=$acc" fi done # verify UCSC names are reasonable: faCount ucsc/*.fa # #seq len A C G T N cpg # KM519951v1 18953 6051 4063 3743 5096 0 490 # KM655246v1 18797 6003 4006 3723 5065 0 477 # KP096420v1 18958 6055 4050 3753 5100 0 477 # KP096421v1 18958 6054 4053 3753 5097 1 477 # KP096422v1 18958 6050 4050 3758 5100 0 477 # KP120616v1 18920 6034 4047 3748 5091 0 474 # NC_016144v1 18927 5842 4831 3874 4380 0 514 # ... etc ... ####################################################################### # construct the genePred records: mkdir -p gp for F in gbk/*.gbk do B=`basename $F | sed -e 's/.gbk//;'` if [ ${F} -nt gp/${B}.gp ]; then echo "gbkToGp.pl ${F} > gp/${B}.gp" ~/kent/src/hg/makeDb/doc/eboVir3/gbkToGp.pl ${F} > gp/${B}.gp fi done ####################################################################### # running faAlign on all the sequences mkdir -p /hive/data/genomes/eboVir3/bed/newSequences/faAlign cd /hive/data/genomes/eboVir3/bed/newSequences/faAlign mkdir -p sizes axt psl maf # this was done once at the beginning, doesn't need to repeat twoBitToFa ../../../eboVir3.2bit stdout > KM034562v1.target.fa ln -s ../../../chrom.sizes ./KM034562v1.sizes ls ../ucsc/*.fa | egrep -v "DI389180|DI389182" \ | sed -e 's/.fa//; s#../ucsc/##;' | while read A do if [ ../ucsc/${A}.fa -nt sizes/${A}.sizes ]; then faToTwoBit ../ucsc/${A}.fa stdout | twoBitInfo stdin sizes/${A}.sizes echo "faAlign KM034562v1 ${A}.fa axt/${A}.axt" if [ ! -s axt/${A}.axt ]; then faAlign KM034562v1.target.fa ../ucsc/${A}.fa axt/${A}.axt fi axtToPsl axt/${A}.axt KM034562v1.sizes sizes/${A}.sizes psl/${A}.psl axtToMaf axt/${A}.axt KM034562v1.sizes sizes/${A}.sizes stdout \ | sed -e 's/^s \([A-Za-z0-9_]*\)/s \1.\1/;' \ | sed -e 's/^s KM034562v1.KM034562v1/s eboVir3.KM034562v1/;' > maf/${A}.maf fi done # examine pslScore ranges: pslScore psl/* | sort -k5n | sed -e 's/^/# /;' # KM034562v1 6038 8065 DI389182v1:0-2026 614 65.90 # KM034562v1 5967 8270 DI389180v1:1-2300 2080 95.40 # KM034562v1 4 18336 NC_016144v1:0-18925 3194 58.90 # KM034562v1 176 18936 KP271019v1:0-18760 15340 96.70 # KM034562v1 53 18914 KP271020v1:0-18861 17635 96.80 # KM034562v1 45 18841 KM655246v1:0-18796 17682 97.10 # KM034562v1 16 18955 KP271018v1:0-18939 17713 96.80 # KM034562v1 3 18956 KM519951v1:0-18953 17741 96.90 # KM034562v1 29 18957 KP658432v1:0-18928 18894 100.00 # KM034562v1 36 18956 KP120616v1:0-18920 18898 100.00 # KM034562v1 0 18957 KP260802v1:0-18957 18923 100.00 # KM034562v1 0 18957 KP260801v1:0-18957 18925 100.00 # KM034562v1 0 18957 KP260800v1:0-18957 18927 100.00 # KM034562v1 1 18957 KP184503v1:0-18956 18932 100.00 # KM034562v1 0 18957 KP260799v1:0-18957 18933 100.00 # KM034562v1 0 18957 KP096420v1:0-18957 18935 100.00 # KM034562v1 0 18957 KP096421v1:0-18957 18937 100.00 # KM034562v1 0 18957 KP096422v1:0-18957 18941 100.00 # KM034562v1 0 18957 KP178538v1:0-18957 18941 100.00 # KM034562v1 0 18957 KP342330v1:0-18957 18941 100.00 ####################################################################### # construct gene frames cd /hive/data/genomes/eboVir3/bed/newSequences mkdir frames singleCover # this was done once at the beginning, doesn't need to repeat genePredSingleCover ../ncbiGene/correct.txStart.ncbiGene.gp \ singleCover/KM034562v1.gp ls faAlign/maf/*.maf | egrep -v "DI389180|DI389182" | sed -e 's/.maf//g; s#faAlign/##g;' | while read M do if [ gp/${M}.gp -nt singleCover/${M}.gp ]; then genePredSingleCover gp/${M}.gp singleCover/${M}.gp genePredToMafFrames eboVir3 faAlign/maf/${M}.maf frames/${M}.bed \ eboVir3 singleCover/KM034562v1.gp ${M} singleCover/${M}.gp echo "singleCover/${M}.gp" 1>&2 fi done ls frames/*.bed | egrep -v "DI389180|DI389182" | xargs sort -u \ | sort -k1,1 -k2,2n > newSequencesFrames.bed hgLoadMafFrames eboVir3 newSequencesFrames newSequencesFrames.bed ####################################################################### # add iRows cd /hive/data/genomes/eboVir3/bed/newSequences mkdir iRows ls faAlign/maf/*.maf | egrep -v "DI389180|DI389182" \ | sed -e 's/.maf//g; s#faAlign/##g;' | while read M do if [ faAlign/maf/${M}.maf -nt iRows/${M}.nBeds ]; then echo "${M}.bed" > iRows/${M}.nBeds if [ ! -s 2bit/${M}.2bit ]; then echo "ERROR: missing 2bit/${M}.2bit" else twoBitInfo -nBed "2bit/${M}.2bit" iRows/${M}.bed twoBitInfo "2bit/${M}.2bit" iRows/${M}.len echo "${M}.len" > iRows/${M}.sizes echo "mafAddIRows -nBeds=${M}.bed faAlign/maf/${M}.maf /gbdb/eboVir3/eboVir3.2bit" echo "iRows/${M}.bed" > iRows/bedList.txt mafAddIRows -nBeds=iRows/bedList.txt faAlign/maf/${M}.maf /gbdb/eboVir3/eboVir3.2bit \ iRows/${M}.maf rm -f /gbdb/eboVir3/newSequences/${M}.maf ln -s `pwd`/iRows/${M}.maf /gbdb/eboVir3/newSequences/${M}.maf fi fi done # load all files in /gbdb/eboVir3/newSequences hgLoadMaf eboVir3 newSequences ####################################################################### # construct SNP display bed files cd /hive/data/genomes/eboVir3/bed/newSequences ./mkSnpView.sh rm -fr /hive/data/genomes/eboVir3/bed/newSequences/snpView mkdir -p /hive/data/genomes/eboVir3/bed/newSequences/snpView cd /hive/data/genomes/eboVir3/bed/newSequences/snpView for M in ../faAlign/maf/*.maf do awk '/^s/ {print $2}' ${M} | sed 's/\..*//' done | sort -u > species.list