# epitopes predicted by IEDB, Julia Ponomarenko, received by email # mapped with BLAST + pslMap (Done Max 2014-11-25) # cd /hive/data/genomes/eboVir3/bed/iedbPred/ # got three supplemental files via email, Supplemental_file_{1,2,3}.txt # BUILD PSL file for PSLMAP # get proteins and their nucleotide-sizes less Supplemental_file_1.txt | grep Zaire-2014 | cut -f5 | tabUniq -rs | cut -f1 | cut -d\| -f4 > prot.lst echo AIG96296.1 >> prot.lst # is only in suppl file 3 retrEutils prot.lst prot1.fa -f fasta -o --db protein --field 3 faSize -detailed prot1.fa | tawk '{$2*=3;print}'> prot1.sizes # blast and convert to nucl-space alignment on query side twoBitToFa ../../eboVir3.2bit eboVir3.fa /cluster/bin/blast/x86_64/blast-2.2.16/bin/formatdb -i eboVir3.fa -p F # XX Why is the coverage so low? If minCover is increased, only 9 proteins match.. /cluster/bin/blast/x86_64/blast-2.2.16/bin/blastall -p tblastn -F F -d eboVir3.fa -i prot1.fa | blastToPsl stdin stdout | pslCDnaFilter stdin stdout -minId=0.90 -minCover=0.55 | pslProtCnv > prot1.psl # suppl file 1 # create bed file from IEDB Zaire 2014 annotations less Supplemental_file_1.txt | grep Zaire-2014 | cut -f9,11,12,1 | tawk '{print $2,($3-1)*3,(($4)*3),$1}' | grep -v NCBI > supp1.bed # lift the bed file bedToPsl prot1.sizes supp1.bed stdout | pslMap stdin prot1.psl stdout | pslToBed stdin supp1.lifted.bed # remove some columns from main table into a tab cat Supplemental_file_1.txt | cut -f-20 | egrep 'Strain-short|Zaire-2014'| tabRemoveCol -k2,4,7,9,5 > supp1.tab # join bed and tab file bedAppend supp1.lifted.bed supp1.tab 0 supp1.join.bed supp1.as # reformat name and score fields cat supp1.join.bed | tawk '{split($14, a, "-"); $4=a[2]; print}' | tawk '{split ($21, a, "."); $5=1000-a[1];print}' > supp1.final.bed # split by name, convert to bb and make links bedSplitOnName supp1.final.bed supp1/ -p iedb for i in supp1/*.bed; do bedToBigBed $i /hive/data/genomes/eboVir3/chrom.sizes supp1/`basename $i .bed`.bb -tab -type=bed12+ -as=supp1.as; done for i in supp1/*.bb; do ln -s `pwd`/$i /gbdb/eboVir3/bbi/`basename $i`; done for i in supp1/*.bb; do hgBbiDbLink eboVir3 `basename $i .bb` /gbdb/eboVir3/bbi/`basename $i`; done # same for supp 2 less Supplemental_file_2.txt | grep Zaire-2014 | cut -f9,11,12,1 | tawk '{print $2,($3-1)*3,$4*3,$1}' | grep -v NCBI > supp2.bed bedToPsl prot1.sizes supp2.bed stdout | pslMap stdin prot1.psl stdout | pslToBed stdin supp2.lifted.bed cat Supplemental_file_2.txt | cut -f-14 | egrep 'Strain-short|Zaire-2014'| tabRemoveCol -k2,4,7,9,5 > supp2.tab bedAppend supp2.lifted.bed supp2.tab 0 supp2.join.bed supp2.as cat supp2.join.bed | tawk '{split($14, a, "-"); $4=a[2]; print}' | tawk '{$5=int(1000-(1000*$21));print}' > supp2.final.bed bedSplitOnName supp2.final.bed supp2/ -p iedb for i in supp2/*.bed; do bedToBigBed $i /hive/data/genomes/eboVir3/chrom.sizes supp2/`basename $i .bed`.bb -tab -type=bed12+ -as=supp2.as; done for i in supp2/*.bb; do ln -s `pwd`/$i /gbdb/eboVir3/bbi/`basename $i`; done for i in supp2/*.bb; do hgBbiDbLink eboVir3 `basename $i .bb` /gbdb/eboVir3/bbi/`basename $i`; done # similar for supp 3, but don't split into subtracks less Supplemental_file_3.txt | grep Zaire-2014 | cut -f8,10,11,1 | tawk '{print $2,$3,$4,$1}' | grep -v NCBI > supp3.bed bedToPsl prot1.sizes supp3.bed stdout | pslProtCnv | pslMap stdin prot1.psl stdout | pslToBed stdin supp3.lifted.bed cat Supplemental_file_3.txt | cut -f-19 | egrep 'Strain-short|Zaire-2014'| cut -f1,3,6,8,10- > supp3.tab bedAppend supp3.lifted.bed supp3.tab 0 supp3.join.bed supp3.as cat supp3.join.bed | tawk '{$4=$19; print}' | tawk '{split ($20, a, "."); $5=1000-(10*a[1]);print}' > supp3.final.bed # add a prot L feature (created manually) to indicate that L is not covered cat protLsupp3.bed >> supp3.final.bed bedToBigBed supp3.final.bed /hive/data/genomes/eboVir3/chrom.sizes iedbSupp3.bb -tab -type=bed12+ -as=supp3.as ln -s `pwd`/iedbSupp3.bb /gbdb/eboVir3/bbi/iedbSupp3.bb hgBbiDbLink eboVir3 iedbSupp3 /gbdb/eboVir3/bbi/iedbSupp3.bb # change all names to sequences, do not show the HLA types in the name field # anymore for i in supp1/*.bed; do echo $i; cat $i | tawk '{$4=$20; print}' > temp.txt; mv temp.txt $i; done for i in supp2/*.bed; do echo $i; cat $i | tawk '{$4=$20; print}' > temp.txt; mv temp.txt $i; done for i in supp1/*.bed; do bedToBigBed $i /hive/data/genomes/eboVir3/chrom.sizes supp1/`basename $i .bed`.bb -tab -type=bed12+ -as=supp1.as; done for i in supp2/*.bed; do bedToBigBed $i /hive/data/genomes/eboVir3/chrom.sizes supp2/`basename $i .bed`.bb -tab -type=bed12+ -as=supp2.as; done bedToBigBed protL.bed /hive/data/genomes/eboVir3/chrom.sizes protL.bb -tab -type=bed4+1 -as=protL.as ln -s `pwd`/protL.bb /gbdb/eboVir3/bbi/protL.bb hgBbiDbLink eboVir3 protL2 /gbdb/eboVir3/bbi/protL.bb