# for emacs: -*- mode: sh; -*- ############################################################################# # ENCODE Analysis Working Group (AWG) tracks on hg19 # # Data in these tracks are generated by uniform processing pipelines that merge replicates # and apply consistent analysis across different data production labs, so they are # of greater interest to most users (and easier to use) than the ENCODE lab tracks. # # Wiki pages for AWG: ############################################################################# # Uniform DNase peaks (2013-1-23 DONE kate) # # Note - this work started in conjunction with update of ENCODE Regulation super-track # as it is needed for the DNase Clusters track. # # Bob Thurman at UW ENCODE (Stam lab) generated this data by running the UW HotSpot # pipeline on UW and Duke datasets (aligned reads). # It is currently hosted on the ENCODE Analysis Hub. # Note: DNase signal tracks from AWG are also available on the hub -- these # were generated from aligned reads using the wiggler pipeline, # without any replicate or lab merging. cd /hive/data/genomes/hg19/bed mkdir -p wgEncodeAwgDnaseUniform # Track composite will be wgEncodeAwgDnaseUniform # get track description, trackDb, and files from EBI site that hosts analysis hub wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt cp uniformDnase.html ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeAwgDnaseUniform.html sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh mkdir bb narrowPeak cd bb csh ../wget.csh >&! ../wget.out & # 125 files (.bb), dated Nov 22, 2011 # Sample case had ~150K peaks (by bigBedInfo) # These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats #$ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head chr1 10180 10330 . 0 . 26.000000 -1 -1 -1 # rename to UCSC style and convert to bed: wgEncodeAwgDnaseUniPk.narrowPeak ls *.fdr01peaks.hg19.bb | \ perl -wpe 's^(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)^bigBedToBed $1$2$3$4$5 ../narrowPeak/$1Awg$3\u\L$2\E\u\L$4\EUniPk.narrowPeak^' > ../unbig.csh # convert to bed and rename csh ../unbig.csh >&! ../unbig.out & # unbig.out is empty, all went OK cd narrowPeak cat > ../loadDnase.csh << 'EOF' set b = wgEncodeAwgDnase set v = ${b}Uniform set t = ../$b.ra set m = ../$b.metaDb.ra rm -f $t $m #gunzip *.gz foreach f (${b}*.narrowPeak) set table = $f:r hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f gzip $f set md5sum = `md5sum $f.gz | awk '{print $1}'` set lab = `echo $table | perl -wpe 's/(wgEncodeAwgDnase)([A-Z][a-z]+)(.*)UniPk/$2/'` set cell = `echo $table | perl -wpe 's/(wgEncodeAwgDnase)([A-Z][a-z]+)(.*)UniPk/$3/'` echo " track $table" >> $t echo " type narrowPeak" >> $t echo " parent $v" >> $t echo " shortLabel $cell DNase" >> $t echo " longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t echo " subGroups cellType=$cell treatment=None lab=$lab view=Peaks" >> $t echo "" >> $t echo " metaObject $table" >> $m echo " objType table" >> $m echo " cell $cell" >> $m echo " treatment None" >> $m echo " composite $b" >> $m echo " dataType DnaseSeq" >> $m echo " dataVersion ENCODE Jan 2011 Freeze" >> $m echo " dccAccession X" >> $m echo " project wgEncode" >> $m echo " tableName $table" >> $m echo " fileName $f.gz" >> $m echo " md5sum $md5sum" >> $m echo " view Peaks" >> $m echo "" >> $m end 'EOF' csh ../loadDnase.csh >&! ../loadDnase.out & # NOTE: error loading wgEncodeAwgDnaseDukeHuh7.5UniPk due to presence of '.' # I didn't catch this till much later... renaming to *Huh75* hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 wgEncodeAwgDnaseDukeHuh75UniPk wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak # Reading wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak # Read 182986 elements of size 10 from wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak # Sorted # Creating table definition for wgEncodeAwgDnaseDukeHuh75UniPk # Saving bed.tab # Loading hg19 set trackDb = ~/kent/src/hg/makeDb/trackDb/human/hg19 cp wgEncodeAwgDnaseUniform.ra $trackDb # merge in composite settings from analysis hub, edit labels for length, treatment # note: raToTab and google spreadsheet greatly help with 17-char shortLabel adjustment # We could use a tabToRa too... cp wgEncodeAwgDnase.metaDb.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.ra pushd $trackDb/human/hg19/metaDb/alpha # add composite settings (as in ENCODE UW Dnase track) # edit cells, treatments to match CV. mdbUpdate hg19 -test wgEncodeAwgDnaseUniform.ra mdbUpdate hg19 wgEncodeAwgDnaseUniform.ra popd # obtain lab and DCC accession from metaDb rm -f accession.metaDb.ra mdbPrint hg19 -vars="lab=UW replicate=1 dataType=DnaseSeq view=Peaks" | egrep 'metaObject|objType|lab |cell|treatment|dccAccession|dataVersion|^$' | sed -e 's/wgEncodeUwDnase/wgEncodeAwgDnaseUw/' -e 's/PkRep1/UniPk/' > accession.metaDb.ra mdbPrint hg19 -vars="lab=Duke dataType=DnaseSeq view=Peaks" | egrep 'metaObject|objType|lab |cell|treatment|dccAccession|dataVersion|^$' | sed -e 's/wgEncodeOpenChromDnase/wgEncodeAwgDnaseDuke/' -e 's/Pk/UniPk/' >> accession.metaDb.ra raMerge accession.metaDb.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.ra > metaDb.new.ra mv metaDb.new.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.new.ra # note -- could have used above to get CV cell type as well (I did this manually) # Add object for composite (copy from UW DNase) # edit to remove orphan stanzas, create wgEncodeAwgDnaseUniform.ra # load and print with magic number pushd $trackDb; make; popd mdbPrint hg19 -vars="composite=wgEncodeAwgDnaseUniform" > wgEncodeAwgDnaseUniform.ra # check in to source control # set up file downloads in expected locale mkdir downloads set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19 # must use name of composite mkdir -p $dir/wgEncodeAwgDnaseUniform cd $dir/wgEncodeAwgDnaseUniform # setup downloads area in ENCODE 2 standard fashion (as on encodewiki 'Data Wrangler HOWTO') mkdir release1 ln -s release1 beta ln -s release1 releaseLatest cd /hive/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeaks cat << 'EOF' > link.csh foreach f (*.gz) echo $f ln $f /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgDnaseUniform/releaseLatest end 'EOF' csh link.csh cd $dir/wgEncodeAwgDnaseUniform/releaseLatest # add files.txt, md5sum.txt encodeMkFilesList hg19 ln md5sum.txt releaseLatest # add README sed 's/wgEncodeUwDnase/wgEncodeAwgDnaseUniform/' $dir/wgEncodeUwDnase/README.txt > $dir/wgEncodeAwgDnaseUniform cp $dir/wgEncodeUwDnase/README.txt $dir/wgEncodeAwgDnaseUniform/releaseLatest # get file list for pushQ cd ~/kent/src/hg/makeDb/doc/encodeDccHg19 /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgDnaseUniform 1 > wgEncodeAwgDnaseUniform.release1.notes # QA preview (vs. redmine issue) encodeQaInit hg19 wgEncodeAwgDnaseUniform 1 9850 -t ############################################################################# # Uniform TFBS (2013-1-24 IN PROGRESS kate) # # This track is based on data from the March 2012 ENCODE data freeze, processed by # Anshul Kundaje's pipeline (https://sites.google.com/site/anshulkundaje/projects/idr). # This round of uniform processing has a slightly more relaxed threshold (2% FDR vs 1% based on # input Anshul received from users than used for the Jan 2011 data freeze (Analysis pubs and # and UCSC Txn Factor ChIP-seq V2). And there are more data sets (708 vs. 495) # # Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering # (as in previous track). cd /hive/data/genomes/hg19/bed mkdir -p wgEncodeAwgTfbsUniform cd wgEncodeAwgTfbsUniform set build = `pwd` mkdir logs mkdir ebi; cd ebi # set user = encode-box-01 # set pwd = enc*deDOWN wget -r -nd -np --ftp-user=$user --ftp-password=$pwd ftp://ftp-private.ebi.ac.uk/byDataType/peaks/june2012/spp/optimal # using the user name and password in the ENCODE wiki at: # http://encodewiki.ucsc.edu/EncodeDCC/index.php/Notes_on_using_the_Aspera_client_(and_EBI_ftp) # These are also available at: # http://www.ebi.ac.uk/~anshul/public/encodeRawData/peaks_spp/mar2012/distinct/idrOptimalBlackListFilt # NOTE: quality metrics are available here: # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7 # -> Check with Anshul for list of data sets to exclude from clustering based on quality. # For now, use all. # sample filename: # spp.idrOptimal.bf.wgEncodeSydhTfbsHct116Pol2UcdAlnRep0.bam_VS_wgEncodeSydhTfbsHct116InputUcdAlnRep1.bam.narrowPeak.gz ls -l filelist.txt # 708 filelist.txt cd .. # extract UCSC object names (experiment and control) from peaks file name ls ebi/*.narrowPeak.gz > peak.lst regClusterMakeTableOfTables ans02 peak.lst partial.table -verbose=3 >&! logs/regTable.out & # attach metadata, by antibody and by target regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table -verbose=3 regClusterAttachMetadataToTableOfTables -antibodyTarget hg19 partial.fixed.table \ withDupes.target.table -verbose=3 # NOTE: partial.fixed.table corrected HAIB GM12878 Egr-1 to Rep3, but lacks controls # NOTE next time rename files immediately, and use UCSC names in scripting # 91 cell types, 201 antibodies (177 targets), 28 treatments, 9 labs ###################### # unzip and rename to UCSC style : wgEncodeAwgTfbs[]UniPk.narrowPeak # load table # create trackDb and metaDb entries # zip and md5sum # save cellType and antibody targets for subGroups at the end cat > nameTable.csh << 'EOF' # create table name for an experiment object set base = $1 set exp = $2 set vars = `echo $exp | \ sed -e 's/Control.*AlnRep1//' -e 's/StdAlnRep1//' -e 's/AlnRep1//' -e 's/wgEncode//' \ -e 's/AlnRep3//' -e 's/Tfbs//' -e 's/BroadHistone/Broad/' -e 's/OpenChromChip/Uta/'` set table = ${base}${vars}UniPk echo $table 'EOF' cat > loadOne.csh << 'EOF' # load table set infile = $1 set exp = $2 set outdir = $3 set base = $4 set table = `csh nameTable.csh $base $exp` hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \ -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $infile # return value 'EOF' cat > makeTag.csh << 'EOF' echo $1 | perl -wpe 's/[^a-z0-9]+//ig; s/(.*)/\U$1/' 'EOF' cat > targetToFactor.csh << 'EOF' set factor = $1 if ($factor =~ eGFP-*) then echo $factor | sed 's/^eGFP-//' else echo $factor endif 'EOF' cat > trackOne.csh << 'EOF' #create track set exp = $1 set composite = $2 set base = $3 set cellFile = $4 set factorFile = $5 # get essential metadata set meta = `mdbPrint hg19 -obj=$exp | \ raToTab stdin -cols=lab,cell,antibody,treatment stdout` if ($status != 0) then echo "ERROR: $exp" continue endif set lab = $meta[1] set cell = $meta[2] set tier = `encodeCvTerm cell $cell tier` # color subtracks with tier 1 and 2 cells if ($tier != "3") then set color = `encodeCvTerm cell $cell color` if ($status != 0) then unset color endif endif set cellTag = `csh makeTag.csh $cell` set antibody = $meta[3] set factor = `encodeCvTerm antibody $antibody target` # special handling of UChicago GFP tagged targets -- strip prefix to get factor set factor = `csh targetToFactor.csh $factor` if ($#meta == 4) then set treatment = $meta[4] else set treatment = "None" endif echo -n "$exp $lab $cell $tier $antibody $factor $treatment" set factorTag = `csh makeTag.csh $factor` echo "$factorTag=$factor" >> $factorFile if ($tier != "3") then set cellTag = "a${tier}${cellTag}" echo "$cellTag=$cell (Tier_${tier})" >> $cellFile else echo "$cellTag=$cell" >> $cellFile endif set table = `csh nameTable.csh $base $exp` set trackDb = ${composite}.ra echo " track $table" >> $trackDb # first cut at default tracks on if ($tier != "3" && \ ($factor == "CTCF" || $factor == "JUN" || $factor == "E2F4" || $factor == "MYC" )) then echo " parent $composite" >> $trackDb else echo " parent $composite off" >> $trackDb endif if ($?treatment && $treatment != "None") then set treatmentLabel = " ($treatment)" else set treatmentLabel = "" endif echo " shortLabel ${cell}${treatmentLabel} $factor" >> $trackDb echo " longLabel $cell${treatmentLabel} TFBS Uniform Peaks of ${antibody} from ENCODE/${lab}/Analysis" >> $trackDb echo " subGroups tier=a${tier}0 cellType=$cellTag factor=$factorTag lab=$lab">> $trackDb if ($?color) then echo " color $color" >> $trackDb endif echo "" >> $trackDb 'EOF' cat > metaOne.csh << 'EOF' # create metadata file set exp = $1 set composite = $2 set base = $3 set table = `csh nameTable.csh $base $exp` set metaDb = ${composite}.metaDb.ra # get essential metadata set meta = `mdbPrint hg19 -obj=$exp | \ raToTab stdin -cols=lab,cell,antibody,treatment stdout` if ($status != 0) then echo "ERROR: $exp" continue endif set lab = $meta[1] set cell = $meta[2] set antibody = $meta[3] set factor = `encodeCvTerm antibody $antibody target` # special handling of UChicago GFP tagged targets -- strip prefix to get factor set factor = `csh targetToFactor.csh $factor` if ($#meta == 4) then set treatment = $meta[4] else set treatment = None endif # now the control info (very inconsistent) set meta = `mdbPrint hg19 -obj=$exp | \ raToTab stdin -cols=controlId,control stdout` if ($status != 0) then echo "ERROR: $exp" continue endif set controlId = $meta[1] # the 'control' term is used In SYDH, but not HAIB if ($#meta == 2) then set control = $meta[2] endif # finally, the DCC accession set accession = "X" set meta = `mdbPrint hg19 -obj=$exp | \ raToTab stdin -cols=dccAccession stdout` if ($status == 0) then set accession = $meta endif echo "$table : $exp $lab $cell $antibody $treatment $controlId" echo "metaObject $table" >> $metaDb echo "objType table" >> $metaDb echo "cell $cell" >> $metaDb echo "target $factor" >> $metaDb echo "antibody $antibody" >> $metaDb if ($?control) then echo "control $control" >> $metaDb endif echo "controlId $controlId" >> $metaDb echo "lab $lab" >> $metaDb echo "treatment $treatment" >> $metaDb echo "composite $composite" >> $metaDb echo "dataType ChipSeq" >> $metaDb echo "dataVersion ENCODE Mar 2012 Freeze" >> $metaDb echo "dccAccession $accession" >> $metaDb echo "project wgEncode" >> $metaDb echo "tableName $table" >> $metaDb echo "fileName $table.narrowPeak.gz" >> $metaDb echo "view Peaks" >> $metaDb echo "" >> $metaDb 'EOF' cat > renameOne.csh << 'EOF' # rename files based on generated table names set exp = $1 set base = $2 set infile = $3 set outdir = $4 set table = `csh nameTable.csh $base $exp` cp -p $infile $outdir/$table.narrowPeak.gz 'EOF' cat > loadTfbs.csh << 'EOF' # parse filename and metaObjects for experiment and control from output # of regClusterMakeTableOfTables set fileList = $1 set base = wgEncodeAwgTfbs set composite = ${base}Uniform set outdir = downloads mkdir -p $outdir @ ct = 1 foreach x (`sed 's/\t/|/g' $fileList`) set obj = `echo $x | awk -F\| '{print $1, $7, $8}'` set infile = $obj[1] set exp = $obj[2] set ctrl = $obj[3] echo "${ct} Track $exp composite=$composite base=$base infile=$infile ctrl=$ctrl" @ ct++ # csh loadOne.csh $infile $exp $outdir $base # csh trackOne.csh $exp $composite $base cells.txt factors.txt # csh metaOne.csh $exp $composite $base #csh renameOne.csh $exp $base $infile $outdir end 'EOF' #ERROR: wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1 # Seems there is only a Rep3 for this prefix. There exist Rep1 and Rep2 for a different protocol #V0416101 ( vs Pcr2x). And the two protocols have different dccAccessions... We will # consider the Rep3 was used as it was later data and Anshul's filename has Pcr2x # (and try to verify with Anshul). # -> handle this one manually # NOTE: instructions for this patch may be rough # wgEncodeAwgTfbsHaibGm12878Egr1Pcr2xUniPk.narrowPeak.gz is an orphaned file, there is no matching met adata # add in missing dataset cd $build grep Gm12878Egr1 partial.table > haibFix.table # edit to change to Rep3 csh loadTfbs.csh haibFix.table >&! logs/loadHaib.out cp wgEncodeAwgTfbsUniform.metaDb.ra haibFix.metaDb.ra # edit out all but new entry mdbUpdate hg19 haibFix.metaDb.ra mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.metaDb.new.ra diff wgEncodeAwgTfbsUniform.metaDb.new.ra $trackDb/human/hg19/metaDb/alpha # just the HAIB fix pushd $trackDb/human/hg19/metaDb/alpha mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.ra cd $dir/wgEncodeAwgTfbsUniform/releaseLatest csh loadTfbs.csh partial.table >&! logs/loadTfbs.out & sort factors.txt | uniq | sed 's/$/ \\/' > factors.uniq.txt sort cells.txt | uniq | sed 's/$/ \\/' > cells.uniq.txt # insert above as subGroups in trackDb grep track wgEncodeAwgTfbsUniform.ra grep metaObject wgEncodeAwgTfbsUniform.metaDb.ra | wc -l # 707 wc -l partial.table # 708 grep metaObject *.metaDb.ra | grep -v Uchi | wc -l # 694 # NOTE: lost one here -- need to investigate. And remove UChicago from metadata. # The lost file (from below): wgEncodeAwgTfbsHaibGm12878Egr1Pcr2xUniPk # It is a single replicate, Rep3 (?!), so couldn't be located in metadata as Rep1 set trackDb = ~/kent/src/hg/makeDb/trackDb cp wgEncodeAwgTfbsUniform.ra human/hg19/$trackDb # add composite settings (as in ENCODE SYDH TFBS track) # merge in composite settings from analysis hub, edit labels for length, treatment # note: raToTab and google spreadsheet greatly help with 17-char shortLabel adjustment # We could use a tabToRa too... # INVALID cv lookup: control = 'n/a' not found. in obj: wgEncodeAwgTfbsUtaK562CtcfUniPk # 274 of these # INVALID cv lookup: antibody = 'Pol2(b)' DEPRECATED. in obj: wgEncodeAwgTfbsBroadHelas3Pol2bUniPk # 4 of these pushd $trackDb/human/hg19/metaDb/alpha mdbUpdate -delete hg19 -composite=wgEncodeAwgTfbsUniform.ra mdbUpdate hg19 wgEncodeAwgTfbsUniform.ra # 707 (note -- need to remove UChicago) mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.ra git add wgEncodeAwgTfbsUniform.ra # commit # grab track description from previous version on the Analysis hub # - edit for conformance to browser style # - update with new counts for cell types, etc. # - have Anshul update (done) mkdir html cd html wget -nd -np http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/uniformTfbs.html wget -nd -np http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/trackDb.txt cp uniformTfbs.html ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeAwgTfbsUniform.html cd .. # fix up labels # short (<=17) hgsql hg19 -e 'select shortLabel from trackDb_kate where tableName like "wgEncodeAwgTfbs%" and length(shortLabel) > 17' > badShorts.txt # use + # to distinguish same experiment diff labs, add single char lab id: # h=HudsonAlpha, s=Stanford, t=UTexas-Austin, w=UWashington, c=USC, v=Harvard, y=Yale, b=Broad, i=UChicago (Illinois?) # Add '2, e.g. 'h2, if 2 experiments from same lab (e.g. different control or antibody) hgsql hg19 -e 'select longLabel from trackDb_kate where tableName like "wgEncodeAwgTfbs%" and length(longLabel) > 80' > badLongs.txt # Shorten by abbreviating treatments, and excise 'from' # Add quality scores from Anshul's manual curation : # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=9 # add metadata term 'quality'=['good','caution'] for integrated metric [1,0] # drop failed datasets (quality == -1) # add passing U Chicago (total 5) cd $build mkdir quality cd quality # download .csv from google spreadsheet above to wc -l tfbsQualityReport.csv # 712 #### # rescue good Chicago datasets grep Chicago tfbsQualityReport.csv | awk -F',' '$2 != -1 {print $3}' #wgEncodeUchicagoTfbsK562Ehdac8ControlAlnRep0 #wgEncodeUchicagoTfbsK562EjundControlAlnRep0 #wgEncodeUchicagoTfbsK562EjunbControlAlnRep0 #wgEncodeUchicagoTfbsK562Egata2ControlAlnRep0 #wgEncodeUchicagoTfbsK562EfosControlAlnRep0 grep Uchicago partial.table > chicago.table # edit out all but good datasets, listed above -> chicago.good.table csh loadTfbs.csh chicago.good.table >&! logs/loadTfbs.chicago.out & #### # remove failed datasets awk -F',' '$2 == -1 {print $3}' tfbsQualityReport.csv | grep -v chicago > bad.table #wgEncodeSydhTfbsGm12878Spt20StdAlnRep0 #wgEncodeSydhTfbsGm12878Gcn5StdAlnRep0 #wgEncodeHaibTfbsPfsk1Pol24h8V0416101AlnRep0 #wgEncodeHaibTfbsA549Sp1V0422111Etoh02AlnRep0 # NOTE: this one not in Anshul's file directory #wgEncodeSydhTfbsHepg2CebpzIggrabAlnRep0 #wgEncodeSydhTfbsHelas3Gcn5StdAlnRep0 #wgEncodeSydhTfbsGm12878Irf3IggmusAlnRep0 #wgEncodeSydhTfbsK562Xrcc4StdAlnRep0 #wgEncodeSydhTfbsHepg2Srebp2PravastStdAlnRep0 #wgEncodeSydhTfbsHepg2Srebp1PravastStdAlnRep0 #wgEncodeSydhTfbsHepg2Pol2PravastStdAlnRep0 # 11 total # update trackDb: add stanzas, uniquify short labels, add 1 new lab, 2 new factors to subgroups # tweak cell type tags for tiers>3 # update track description # update metaDb # was 695. Added 5, removed 10 -> -5. Should be 690. # Read 691 metadata objects from hg19 # Yes! (includes composite object) # TODO: Histogram of score distributions (may want to normalize to >1std) # Ref JK suggestion during code review 3/26/12 # TODO: Add 5 chicago and remove 10 baddies links from downloads dir (and releaseLatest) # add quality metric from Anshul's spreadsheet cd quality cat > getQuality.pl << 'EOF' while (<>) { ($id, $qual, $obj, $junk) = split(","); if ($qual eq "0") { $qual = "caution"; } else if ($qual eq "1") { $qual = "good"; } else { next; } $obj =~ s/Rep0/Rep1/; print "$obj.$qual\n"; } 'EOF' perl getQuality.pl < tfbsQualityReport.csv > quality.lst wc -l quality.lst # 690 cat > addQuality.csh << 'EOF' set qualList = $1 foreach q (`cat $qualList`) set obj = $q:r set qual = $q:e set obj = `csh ../nameTable.csh wgEncodeAwgTfbs $obj` echo "metaObject $obj" echo "quality $qual" echo "" end 'EOF' csh addQuality.csh quality.lst < quality.ra mdbUpdate hg19 quality.ra #Affected 689 row(s) in hg19.metaDb_kate # ugh, lost 1 mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.ra # NOTE: add quality typeOfTerm and the two values (caution, good) to cv.ra # hand-tweak 3 HudsonAlpha that have problematic V02, etc. in filenames #wgEncodeAwgTfbsHaibK562CtcfcPcr1xUniPk - peak count matches: wgEncodeHaibTfbsK562CtcfcPcr1xAlnRep0V2 (quality 1) #wgEncodeAwgTfbsHaibU87NrsfPcr2xUniPk - peak count matches:wgEncodeHaibTfbsU87NrsfPcr2xAlnRep0V2 (quality 1) #wgEncodeAwgTfbsHaibU87Pol24h8V0416101UniPk - peak count matches: wgEncodeHaibTfbsU87Pol24h8V0416101AlnRep0V2 (quality 1) # Redo scores in tables and files. Files had no scores -- we will assign using 'reg' scoring # method (after evaluating multiple methods with 'bedScore'). Tables had encode pipeline # scoring method -- these will be replaced. # Tried all 4 scoring methods -- reviewing graphs with Jim, it was decided to use 'reg' scoring, # transformed with single normalization value. cat > scoreNarrowPeaks.csh << 'EOF' set method = $1 set peakDir = $2 set uniformDir = $peakDir/${method}.uniform rm -fr $dir $uniformDir mkdir $dir $uniformDir bedScore -col=7 -verbose=2 -method=$method peaks/*.narrowPeak $dir bedScore -uniform -col=7 -verbose=2 -method=$method peaks/*.narrowPeak $uniformDir 'EOF' cat > doScorePeaks.csh << 'EOF' set outDir = scoredPeaks mkdir -p $outDir $outDir/logs foreach method (encode reg std2 asinh) echo "Scoring by $method" csh scoreNarrowPeaks.csh $method $outDir >&! $outDir/logs/$method.log end 'EOF' csh doScorePeaks.csh >&! doScoredPeaks.log # We will use files in scoredPeaks/reg.uniform # reload tables cat > loadScored.csh << 'EOF' cd scoredPeaks/reg.uniform foreach path (*.narrowPeak) set table = $path:r echo $table /cluster/bin/x86_64/hgLoadBed -noNameIx -trimSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \ -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $path end 'EOF' csh loadScored.csh >&! logs/loadScored.out & # replace downloads mv downloads downloads.orig cat > downloadScored.csh << 'EOF' mkdir -p downloads cd scoredPeaks/reg.uniform foreach file (*.narrowPeak) echo $file gzip -c $file > ../../downloads/$file.gz end 'EOF' csh downloadScored.csh >&! logs/downloadScored.out & # set up file downloads in expected locale set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19 # must use name of composite mkdir -p $dir/wgEncodeAwgTfbsUniform cd $dir/wgEncodeAwgTfbsUniform # link files from build area cd /hive/data/genomes/hg19/bed/wgEncodeAwgTfbsUniform/downloads cat << 'EOF' > link.csh foreach f (*.gz) echo $f ln $f /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgTfbsUniform end 'EOF' csh link.csh # rescue dataset missing from peaks/ so not rescored cd $build csh renameOne.csh wgEncodeSydhTfbsHepg2Srebp1InslnStdAlnRep1 wgEncodeAwgTfbs ebi/spp.idrOptimal.bf.wgEncodeSydhTfbsHepg2Srebp1InslnStdAlnRep0.bam_VS_wgEncodeSydhTfbsHepg2InputInslnStdAlnRep0.bam.narrowPeak.gz downloads.orig set f = wgEncodeAwgTfbsSydhHepg2Srebp1InslnUniPk.narrowPeak zcat downloads.orig/$f.gz > peaks/$f bedScore -col=7 -verbose=2 -method=reg -uniform peaks/$f scoredPeaks/reg.uniform/$f set table = $f:r /cluster/bin/x86_64/hgLoadBed -noNameIx -trimSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \ -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table scoredPeaks/reg.uniform/$f cd scoredPeaks/reg.uniform gzip -c $f > ../../downloads/$f.gz cd /hive/data/genomes/hg19/bed/wgEncodeAwgTfbsUniform/downloads ln $f.gz /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgTfbsUniform # add files.txt, md5sum.txt cd /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgTfbsUniform encodeMkFilesList hg19 -md5 -cv=/cluster/home/kate/kent/src/hg/makeDb/trackDb/cv/alpha # wgEncodeAwgTfbsHaibPfsk1Pol24h8V0416101UniPk.narrowPeak.gz is an orphaned file, there is no matching metadata rm wgEncodeAwgTfbsHaibPfsk1Pol24h8V0416101UniPk.narrowPeak.gz # setup downloads area in ENCODE 2 standard fashion (as on encodewiki 'Data Wrangler HOWTO') mkdir release1 ln -s release1 beta ln -s release1 releaseLatest # add composite object to metaDb (use wgEncodeAwgDnaseUniform as example) # add README sed 's/wgEncodeHaibTfbs/wgEncodeAwgTfbsUniform/' $dir/wgEncodeHaibTfbs/README.txt > $dir/wgEncodeAwgTfbsUniform/README.txt # edit cp $dir/wgEncodeTfbsUniform/README.txt $dir/wgEncodeAwgTfbsUniform/releaseLatest # get file list for pushQ cd ~/kent/src/hg/makeDb/doc/encodeDccHg19 /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgTfbsUniform 1 > wgEncodeAwgTfbsUniform.release1.notes # check in notes file # QA preview (vs. redmine issue) encodeQaInit hg19 wgEncodeAwgTfbsUniform 1 10042 -t ############################################### # Master list of 2.9M DNase sites from Analysis (in progress kate) # (Chromatin landscape Nature paper, Sep 6 2012) # Filename is in Supplementary data PDF, p. 49, # Supplementary Methods section 1.2, "DHS Master List and its annotation" cd /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseMasterSites set track = wgEncodeAwgDnaseMasterSites mkdir ebi; cd ebi wget ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/byDataType/openchrom/jan2011/combined_peaks/multi-tissue.master.ntypes.simple.hg19.bed wc -l * # 2890742 multi-tissue.master.ntypes.simple.hg19.bed head -3 * chr1 10120 10270 MCV-37 39.232800 chr1 10440 10590 MCV-4 36.369500 chr1 16140 16290 MCV-5 18.768900 ave -col=5 ebi/*.bed # Q1 6.028710 # median 9.877880 # Q3 34.639700 # average 37.963365 # min 4.215660 # max 5445.570000 # count 2890742 # total 109742293.141479 # standard deviation 69.927595 # These are z-scores (by my reading of the Chromatin landscape supplementary inf) # histogram recommends log transform when filling in score # Truncate to reasonable significant digits (%.2f) # From Bob Thurman, UW, a file with cell sources for each DHS site: # paste this on to bed multi-tissue.master.ntypes.simple.names.hg19.txt # create source table: id name #tr ';' '\n' < multi*.txt | sort | uniq > cells.txt #wc -l cells.txt #125 cells.txt # Format to use: BED5 + # # chr start end name score floatScore sourceCount sources (cellId1, cellId2...) autoSql -addBin -dbLink bed5Sources.as bed5Sources # Paste together bed and sources, trim MCV- from item name to leave just the count # of cell types (Bob Thurman suggestion), replace semi-colons in source list with commas, # and add score field, log transformed and scaled to 100-1000 range: # m = 900/(ln(max)-ln(min)) # -> 125.6 # b = ln(max) * m - 1000 # -> 80 # score = floatScore * m + b awk '{sub(/MCV-/,"",$4); printf("%s\t%u\t%u\t%u\t%d\t%.2f\n", $1,$2,$3,$4,125.6*log($5)-80,$5)}' \ ebi/multi-tissue.master.ntypes.simple.hg19.bed > dhs.bed sed -e 's/;/,/g' multi-tissue.master.ntypes.simple.names.hg19.txt | \ paste dhs.bed - > dhs.sources.bed # Create files for loading database tables: track table (bed5Sources) # and sourceTable (idName). Use new tool to split above format (bed5 + floatScore + sources) # Name field has count of cell types having DHS overlapping the master site hgBedSources dhs.sources.bed wgEncodeAwgDnaseMaster # cat checklinks.csh << 'EOF' foreach t (`awk '{print $2}' < wgEncodeAwgDnaseMasterSources.tab`) echo -n "$t :" curl -s "http://genome.ucsc.edu/cgi-bin/hgEncodeVocab?ra=encode/cv.ra&deprecated=true&term=$t" \ | grep -c 'argument not found' end 'EOF' # manually map to CV to create maplinks.txt csh maplinks.csh wgEncodeAwgDnaseMasterSources.tab > sources.fixed.tab mv sources.fixed.tab wgEncodeAwgDnaseMasterSources.unfixed.tab csh maplinks.csh wgEncodeAwgDnaseMasterSources.unfixed.tab > wgEncodeAwgDnaseMasterSources.tab $ csh checklinks.csh wgEncodeDnaseMasterSources.tab| grep -v ':0' # ignore those with + CD14+ :1 CD20+ :1 CD34+_Mobilized :1 # fix these (add hyphen) HMVEC-dBlAd :1 HMVEC-dBlNeo :1 HMVEC-dLyAd :1 HMVEC-dLyNeo :1 # needs 7.5 Huh-75 :1 # live with these (treated) UrotheliaUt189 :1 WI-38_TAM :1 HeLa-S3s3Ifna4h :1 # edit maplinks2.csh mv wgEncodeAwgDnaseMasterSources.tab wgEncodeAwgDnaseMasterSources.unfixed2.tab csh maplinks2.csh wgEncodeAwgDnaseMasterSources.unfixed2.tab > wgEncodeAwgDnaseMasterSources.tab # Load sources table autoSql $HOME/kent/src/hg/lib/idName.as idName hgLoadSqlTab hg19 wgEncodeAwgDnaseMasterSources idName.sql wgEncodeAwgDnaseMasterSources.tab # BED table mv wgEncodeAwgDnaseMaster.bed wgEncodeAwgDnaseMasterSites.bed hgLoadBed -noNameIx -type=bed5+3 \ -sqlTable=$HOME/kent/src/hg/lib/bed5Sources.sql \ -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5Sources.as \ hg19 wgEncodeAwgDnaseMasterSites wgEncodeAwgDnaseMasterSites.bed # Read 2890742 elements of size 8 from wgEncodeAwgDnaseMasterSites.bed # Downloads mkdir downloads cp wgEncodeAwgDnaseMasterSources.tab wgEncodeAwgDnaseMasterSites.bed downloads cd downloads gzip wgEncodeAwgDnaseMasterSites.bed pushd set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19 mkdir -p $dir/wgEncodeAwgDnaseMasterSites cd $dir/wgEncodeAwgDnaseMasterSites mkdir release1 ln -s release1 beta ln -s release1 releaseLatest popd ln wgEncodeAwgDnaseMasterSites.bed.gz wgEncodeAwgDnaseMasterSources.tab \ $dir/wgEncodeAwgDnaseMasterSites cd $dir/wgEncodeAwgDnaseMasterSites # add metadata file trackDb/human/hg19/metaDb/alpha/wgEncodeAwgDnaseMasterSites.ra encodeMkFilesList hg19 -md5 -cv=/cluster/home/kate/kent/src/hg/makeDb/trackDb/cv/alpha # Add README.txt # Notes file for QA cd ~/kent/src/hg/makeDb/doc/encodeDccHg19 /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgDnaseMasterSites 1 > wgEncodeAwgDnaseMasterSites.release1.notes # QA preview (vs. redmine issue) encodeQaInit hg19 wgEncodeAwgDnaseMasterSites 1 13230 -t ############################################### # DNAseI Fingerprinting (in progress Kate) #From SeqAnswers! cd /hive/data/genomes/hg19/bed/wgEncodeAwg/dnase/footprints set track = wgEncodeAwgDnaseFootprint ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/supplementary/integration_data_jan2011/byDataType/footprints/jan2011/ mkdir ebi; cd ebi set dir = ftp.ebi.ac.uk/pub/databases/ensembl/encode/supplementary/integration_data_jan2011/byDataType/footprints/jan2011/ wget ftp://$dir/README wget ftp://$dir/combined.fps.gz wget ftp://$dir/all.footprints.gz gunzip combined.fps.gz wc -l ebi/combined.fps # 8374968 ebi/combined.fps # big file -> use bigBed mkdir /gbdb/hg19/$track set table = wgEncodeAwgDnaseCombinedFootprint bedToBigBed combined.fps /cluster/data/hg19/chrom.sizes $table.bb ln -s `pwd`/$table.bb /gbdb/hg19/$track/$table.bb hgBbiDbLink hg19 $table /gbdb/hg19/$track/$table.bb ############################################################################# # Segmentations (2014-2-18 kate) # There are 3 segmentations on the AWG hub: # 1. ChromHMM # 2. Segway # 3. Combined # UCSC already has a ChromHMM track (currently in the Broad Histone supertrack, but # this is a newer segmentation using that method (10-state vs. 15)). # The segmentations will be formatted similarly to existing track (BED 9) # Docs and data will be pulled from the AWG hub. cd /hive/data/genomes/hg19/bed mkdir -p wgEncodeAwgGenomeSegmentation cd wgEncodeAwgGenomeSegmentation mkdir ebi cd ebi # get track description, trackDb, and files from EBI site that hosts analysis hub wget http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/trackDb.txt wget http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/genomeSegmentation.html cp genomeSegmentation.html ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeAwgSegmentation.html grep segmentation trackDb.txt | grep bigDataUrl | sed 's/.*\///' > files.txt wc -l files.txt # 18 (6 each for the 3 methods) cat > getFiles.csh << 'EOF' foreach f (`cat files.txt`) echo $f wget http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/byDataType/segmentations/jan2011/hub/$f end 'EOF' # NOTE: bigBedInfo shows Segway with roughly 10x item count of ChromHMM (12M vs. 1.2M) # ChromHMM: 1.1M - 1.4M range # Segway: 12.7M - 15.8M # Combined: .7M - 3.2M # Compare hub ChromHMM files with current UCSC track cd .. zcat /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeBroadHmm/wgEncodeBroadHmmGm12878HMM.bed.gz > \ bed/gm12878.broad.bed bigBedToBed ebi/gm12878.ChromHMM.bb bed/gm123878.chromHmm.bed #These look very different -- confirming they were redone for Sep 2012 papers. cd ebi cat > expandFiles.csh << 'EOF' foreach f (`cat files.txt`) echo $f set b = $f:r set cell = `echo $b:r | perl -wpe 's/(.)(.*)/\u$1\L$2/'` set method = `echo $b:e | perl -wpe 's/(.)(.*)/\u$1\L$2/'` set out = wgEncodeAwgSegmentation${method}${cell} bigBedToBed $f ../bed/$out.bed end 'EOF' csh expandFiles.csh >&! expandFiles.out & # Load into tables cd bed cat > loadFiles.csh << 'EOF' foreach f (*.bed) hgLoadBed -tab hg19 $f:r $f end 'EOF' csh loadFiles.csh >&! loadFiles.out & cd .. # Add metadata # add+edit trackDb/human/hg19/metaDb/alpha/wgEncodeAwgSegmentation.ra pushd ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha mdbUpdate hg19 -test wgEncodeAwgSegmentation.ra mdbUpdate hg19 wgEncodeAwgSegmentation.ra mdbPrint hg19 -composite=wgEncodeAwgSegmentation > wgEncodeAwgSegmentation.ra # add to makefile # commit to git popd # Create downloads cd bed foreach f (*.bed) echo $f gzip -c $f > ../downloads/$f.gz end cd .. # set up file downloads in expected locale set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgSegmentation mkdir -p $dir ln downloads/*.gz $dir # release1, releaseLatest ? ln -s release1 beta # make alpha in trackDb (install metaDb) # add files.txt, md5sum.txt cd $dir encodeMkFilesList hg19 md5sum *.gz > md5sum.txt # create README file /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgSegmentation 1 > wgEncodeAwgSegmentation.release1.notes encodeQaInit hg19 wgEncodeAwgSegmentation 1 12259 -t # Responses to QA # Fixes to track description from Steve Wilder (kate 4/11/14) cd /hive/data/genomes/hg19/bed/wgEncodeAwgGenomeSegmentation/ebi cp genomeSegmentation.html genomeSegmentation.html.old wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHubdevel/hg19/genomeSegmentation.html diff genomeSegmentation.html genomeSegmentation.html.old