# DONE braney 2018-09-06
# Scrape the English Wikipedia pages listed by PetScan and build a table that
# maps gene symbols (taken from the genenames.org links on each page) to the
# Wikipedia page title.
#
# This procedure is not perfect. Lots of suspicious stuff in the results,
# probably because the source pages don't follow the same format.

mkdir -p /hive/groups/browser/wikipediaScrape
# Stop if the work directory is unavailable, so nothing below writes elsewhere.
cd /hive/groups/browser/wikipediaScrape || exit 1

# Get all the pages in the category "human genes" (PetScan query, depth 10,
# TSV output). The output file keeps its historical name humanProteins.lst.
wget "https://petscan.wmflabs.org/?format=tsv&language=en&project=wikipedia&depth=10&categories=human%20genes%0D%0A&interface_language=en&&doit=" -O humanProteins.lst

mkdir -p pages

# Download every page named in column 2 of the list, one request every 2 s.
# NOTE(review): if the PetScan TSV starts with a header row, its column-2 value
# is fetched as if it were a page title -- confirm and filter if needed.
tawk '{print $2}' humanProteins.lst |
while IFS= read -r title; do
    # Skip blank lines rather than fetching the bare /wiki/ URL.
    [ -n "$title" ] || continue
    printf '%s\n' "$title"

    # Deal with slashes in names: a title is used as a file name, so "/" is
    # replaced by a placeholder that the mapping step turns back into "/".
    f=$(printf '%s\n' "$title" | sed 's?/?TWTWTWTW?g')

    wget "https://en.wikipedia.org/wiki/$title" -O "pages/$f"
    sleep 2
done

# Create a mapping of symbols to pages: one "symbol<TAB>page" line per symbol.
for page in pages/*; do
    # An empty pages/ directory leaves the glob unexpanded; skip that case.
    [ -e "$page" ] || continue

    # Undo the slash placeholder to recover the original page title.
    name=$(basename "$page" | sed 's?TWTWTWTW?/?g')

    # Keep lines mentioning genenames, strip everything through the
    # hgnc_id=NNN"> link opening, normalise " ;" to ";", drop the trailing
    # markup, then emit each ";"-separated symbol alongside the page title.
    grep genenames "$page" |
        sed -e 's/.*hgnc_id=[0-9]*">//' -e 's? ;?;?' -e 's/<.*//' |
        awk -v name="$name" 'BEGIN {OFS="\t"} {
            n = split($0, a, ";")
            for (ii = 1; ii <= n; ii++)
                print a[ii], name
        }'
done | tr -d ' ' | sort > symbolToPage.txt