In [1]:
import basepair
import modisco
Using TensorFlow backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'/usr/bin/ld: cannot find -lcudnn\ncollect2: error: ld returned 1 exit status\n'
Mapped name None to device cuda: GeForce GTX TITAN X (0000:09:00.0)
In [2]:
from keras.models import Model, load_model
from basepair.losses import twochannel_multinomial_nll
In [3]:
# Use gpus 3, 5
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3, 5"
In [4]:
model = load_model("model.h5", custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
WARNING:tensorflow:From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-23 20:01:46,195 [WARNING] From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-23 20:01:54,618 [WARNING] From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [5]:
from basepair.utils import read_pkl
# Load the (train, valid, test) data splits; each split is indexed below as
# split[0] = inputs (one-hot sequences) and split[1] = targets.
# NOTE(review): hardcoded absolute path into another user's workspace —
# consider a configurable DATA_DIR.
train,valid,test = read_pkl("/users/avsec/workspace/basepair-workflow/models/0/data.pkl")
In [6]:
from basepair.eval import evaluate
# Sanity-check: per-head losses on the validation split (profile + counts
# for each of the four TFs, per the Out[6] dict below).
evaluate(model, valid[0], valid[1])
Out[6]:
{'loss': 953.9045610385498,
 'profile/Oct4_loss': 271.63486264911495,
 'profile/Sox2_loss': 162.5192502521077,
 'profile/Klf4_loss': 228.8282409395277,
 'profile/Nanog_loss': 265.30063137148727,
 'counts/Oct4_loss': 0.612707477322428,
 'counts/Sox2_loss': 0.5658578240037332,
 'counts/Klf4_loss': 0.8421652589296793,
 'counts/Nanog_loss': 0.5414273575545541}
In [7]:
# Output heads of the model, ordered to match model.outputs:
# the four profile heads followed by the four total-count heads.
_tf_names = ["Oct4", "Sox2", "Klf4", "Nanog"]
task_names = ["%s/%s" % (head, tf)
              for head in ("profile", "counts")
              for tf in _tf_names]
In [8]:
import keras.backend as K
inp = model.inputs[0]
# Build per-task gradient functions w.r.t. the one-hot input, one per output
# channel (index 0 / index 1 — presumably the two strands; confirm against
# the model's output layout).
fn_pos = {}
fn_neg = {}
for task_id, task_name in enumerate(task_names):
    if "counts" in task_name:
        # Count heads emit one scalar per channel: differentiate it directly.
        fn_pos[task_name] = K.function([inp], K.gradients(model.outputs[task_id][:, 0], inp))
        fn_neg[task_name] = K.function([inp], K.gradients(model.outputs[task_id][:, 1], inp))
    else:
        # Profile heads emit per-position logits. Differentiate the
        # softmax-probability-weighted sum of the logits, with stop_gradient
        # on the softmax so the weights act as constants (only the logits
        # contribute gradient).
        fn_pos[task_name] = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(
            model.outputs[task_id][:, :, 0])) * model.outputs[task_id][:, :, 0], axis=-1), inp))
        fn_neg[task_name] = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(
            model.outputs[task_id][:, :, 1])) * model.outputs[task_id][:, :, 1], axis=-1), inp))
In [9]:
import numpy as np
from basepair.data import numpy_minibatch

grads_pos = {}
grads_neg = {}

def _stack_grads(grad_fn, seqs):
    """Evaluate grad_fn over minibatches of 512 and concatenate the squeezed results."""
    return np.concatenate([np.array(grad_fn([chunk])).squeeze()
                           for chunk in numpy_minibatch(seqs, 512)])

# Input gradients for both channels of every task, on the *test* split.
for name in task_names:
    grads_pos[name] = _stack_grads(fn_pos[name], test[0])
    grads_neg[name] = _stack_grads(fn_neg[name], test[0])
2018-08-23 20:02:17,662 [WARNING] git-lfs not installed
2018-08-23 20:02:17,868 [WARNING] git-lfs not installed
In [10]:
# Setup different scores
hyp_scores = {}
scores = {}
for task_name in task_names:
    # Total input gradient = sum over the two output channels.
    hyp_scores[task_name] = grads_pos[task_name] + grads_neg[task_name]
    # Mean-center across the base axis -> "hypothetical" contribution scores.
    hyp_scores[task_name] = hyp_scores[task_name] - hyp_scores[task_name].mean(-1, keepdims=True)
    # Mask by the observed one-hot sequence -> actual contribution scores.
    scores[task_name] = hyp_scores[task_name] * test[0]
In [11]:
from concise.utils.plot import seqlogo_fig, seqlogo
import matplotlib.pyplot as plt

# Visual sanity-check: actual vs. hypothetical contribution logos for the
# first test example of the Oct4 profile task.
fig, axes = plt.subplots(2, 1, sharex=True, figsize=(20, 6))
panels = [("scores", scores["profile/Oct4"][0]),
          ("hyp_scores", hyp_scores["profile/Oct4"][0])]
for ax, (title, track) in zip(axes, panels):
    ax.set_title(title)
    seqlogo(track, ax=ax)
In [12]:
# Generic aliases expected by the modisco code below
# (currently pointing at the *test* split).
onehot_data, task_to_scores, task_to_hyp_scores = test[0], scores, hyp_scores
In [13]:
from collections import OrderedDict, Counter

#Compute the contributions at each position, for each task
# (sum over the base axis; axis=2 of the (example, position, base) arrays).
# NOTE(review): this variable is rebound in a later cell to a per-example
# list-of-arrays variant before it is visibly read — confirm this cell is
# still needed.
per_position_contrib_scores = OrderedDict([
                                (x, np.sum(task_to_scores[x],axis=2))
                                for x in task_names])
In [14]:
from modisco import coordproducers
from importlib import reload
# Pick up local edits to the module without restarting the kernel
# (the Out[14] path below shows a locally checked-out tfmodisco).
reload(coordproducers)
Out[14]:
<module 'modisco.coordproducers' from '/users/amr1/tfmodisco/modisco/coordproducers.py'>
In [15]:
from modisco import core

def _revcomp(tracks):
    """Reverse-complement each (position, base) matrix: flip both axes."""
    return [t[::-1, ::-1] for t in tracks]

# One contribution-score track and one hypothetical-contribution track per
# task, plus the one-hot sequence track, bundled into a TrackSet.
contrib_scores_tracks = [
    core.DataTrack(name=task + "_contrib_scores",
                   fwd_tracks=task_to_scores[task],
                   rev_tracks=_revcomp(task_to_scores[task]),
                   has_pos_axis=True)
    for task in task_names]
hypothetical_contribs_tracks = [
    core.DataTrack(name=task + "_hypothetical_contribs",
                   fwd_tracks=task_to_hyp_scores[task],
                   rev_tracks=_revcomp(task_to_hyp_scores[task]),
                   has_pos_axis=True)
    for task in task_names]
onehot_track = core.DataTrack(name="sequence",
                              fwd_tracks=onehot_data,
                              rev_tracks=_revcomp(onehot_data),
                              has_pos_axis=True)
track_set = core.TrackSet(data_tracks=(contrib_scores_tracks
                                       + hypothetical_contribs_tracks
                                       + [onehot_track]))
In [16]:
# load modisco object
In [17]:
grads_pos_valid = {}
grads_neg_valid = {}

def _batch_grads(grad_fn, seqs):
    """Run grad_fn in minibatches of 512 and stitch the squeezed outputs together."""
    return np.concatenate([np.array(grad_fn([chunk])).squeeze()
                           for chunk in numpy_minibatch(seqs, 512)])

# Same gradient computation as for the test split, here on validation data.
for name in task_names:
    grads_pos_valid[name] = _batch_grads(fn_pos[name], valid[0])
    grads_neg_valid[name] = _batch_grads(fn_neg[name], valid[0])
In [18]:
# Setup different scores
# Same post-processing as the test split: sum channel gradients,
# mean-center across bases, mask by the one-hot sequence.
hyp_scores_valid = {}
scores_valid = {}
for task_name in task_names:
    hyp_scores_valid[task_name] = grads_pos_valid[task_name] + grads_neg_valid[task_name]
    hyp_scores_valid[task_name] = hyp_scores_valid[task_name] - hyp_scores_valid[task_name].mean(-1, keepdims=True)
    scores_valid[task_name] = hyp_scores_valid[task_name] * valid[0]
In [19]:
# Validation-split aliases mirroring the generic names used for the test split.
onehot_data_valid = valid[0]
task_to_scores_valid, task_to_hyp_scores_valid = scores_valid, hyp_scores_valid
In [20]:
import h5py
import modisco
import modisco.util
import modisco.core
import modisco.tfmodisco_workflow.seqlets_to_patterns
from modisco.tfmodisco_workflow import workflow

# Rebuild a TrackSet over the *validation* split, since the saved modisco
# results were computed on validation data.
track_set_valid = modisco.tfmodisco_workflow.workflow.prep_track_set(
    task_names=task_names,
    contrib_scores=task_to_scores_valid,
    hypothetical_contribs=task_to_hyp_scores_valid,
    one_hot=onehot_data_valid)

# Context manager guarantees the HDF5 handle is closed even if
# deserialization raises (the manual open/close left it dangling on error).
with h5py.File("modisco_results_on_valid.hdf5", "r") as grp:
    loaded_tfmodisco_results = \
        workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set_valid)
In [21]:
# get seqlets from test set
In [22]:
# Per-example list of per-position contribution totals (sum over bases);
# the seqlet creator expects one 1-D score track per example.
per_position_contrib_scores = OrderedDict([
    (x, [np.sum(s,axis=1) for s in task_to_scores[x]]) for x in task_names])
# Call seqlets on the *test*-split track_set, re-using the thresholds that
# were fit on the validation set so both splits are thresholded consistently.
seqlets_from_data =(
    loaded_tfmodisco_results.multitask_seqlet_creation_results.multitask_seqlet_creator(
        task_name_to_score_track=per_position_contrib_scores,
        track_set=track_set,
        task_name_to_thresholding_results=
            loaded_tfmodisco_results
             .multitask_seqlet_creation_results
             .task_name_to_thresholding_results)).final_seqlets
On task profile/Oct4
Computing windowed sums
Got 5002 coords
On task profile/Sox2
Computing windowed sums
Got 2403 coords
On task profile/Klf4
Computing windowed sums
Got 702 coords
On task profile/Nanog
Computing windowed sums
Got 219 coords
On task counts/Oct4
Computing windowed sums
Got 7031 coords
On task counts/Sox2
Computing windowed sums
Got 2705 coords
On task counts/Klf4
Computing windowed sums
Got 176 coords
On task counts/Nanog
Computing windowed sums
Got 249 coords
After resolving overlaps, got 9251 seqlets
In [23]:
from modisco import affinitymat
# Reload submodules before the parent package so the package picks up the
# fresh submodule objects.
reload(affinitymat.core)
reload(affinitymat)
from modisco import hit_scoring
reload(hit_scoring.fast_hit_scoring)
reload(hit_scoring)
from collections import OrderedDict
In [24]:
from modisco import aggregator
In [25]:
trim_pattern = True
In [26]:
def trim_pssm_idx(pssm, frac=0.05):
    """Find the high-signal span of a PSSM by trimming low-weight flanks.

    Parameters
    ----------
    pssm : np.ndarray, shape (length, n_bases)
        Per-position scores (sign is ignored; absolute values are used).
    frac : float
        Threshold = frac * (maximum per-position absolute sum).

    Returns
    -------
    (i, j) : int tuple
        First and last position indices whose absolute sum exceeds the
        threshold. Note the span is inclusive of both ends, so its length
        is j - i + 1. If no position passes (e.g. an all-zero pssm),
        returns (len(pssm) - 1, 0), matching the fall-through behaviour of
        the original loop implementation.
    """
    position_sums = np.abs(pssm).sum(axis=-1)
    threshold = position_sums.max() * frac
    passing = np.flatnonzero(position_sums > threshold)
    if passing.size == 0:
        # Legacy fall-through: both scans exhausted without a break.
        return len(pssm) - 1, 0
    return int(passing[0]), int(passing[-1])
In [27]:
def ic_scale(x, background=(0.27, 0.23, 0.23, 0.27)):
    """Scale a (length, 4) base-frequency matrix by information content.

    Parameters
    ----------
    x : np.ndarray, shape (length, 4)
        Per-position base frequencies (A, C, G, T order assumed by modisco).
    background : sequence of 4 floats
        Background base frequencies. The default keeps the previous
        hard-coded values (GC-poor genome background — presumably mouse;
        confirm against the data source).

    Returns
    -------
    np.ndarray — the IC-scaled matrix from modisco's viz_sequence.
    """
    from modisco.visualization import viz_sequence
    return viz_sequence.ic_scale(x, background=np.array(background))
In [28]:
# Max seqlet window (bp) used when trimming seqlets for similarity scoring.
seqlet_size_to_score_with = 25
metacluster_idx_to_scorer = OrderedDict()

all_pattern_scorers = []
all_pattern_names = []

# loop through the metaclusters
# NOTE: sorted() on the string keys is lexicographic, so e.g.
# "metacluster_10" comes before "metacluster_2" (visible in the
# "working on metacluster" output order below).
for metacluster_name in sorted(loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results.keys()):
    submetacluster_results = (loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results[metacluster_name])

    activity_pattern = submetacluster_results.activity_pattern

    # Tasks with a nonzero entry in the activity pattern are the ones this
    # metacluster is associated with; only their tracks are compared.
    relevant_task_names = [task_name for (task_name, x) in
                           zip(task_names, activity_pattern) if np.abs(x) != 0]

    if trim_pattern:
        # Trim every pattern to its high-information window (per trim_pssm_idx
        # on the IC-scaled sequence PSSM).
        # NOTE(review): trim_pssm_idx returns inclusive endpoints, so the
        # span length is t2 - t1 + 1; using t2 - t1 as window_size drops one
        # position — confirm whether this off-by-one is intended.
        trim_sizes = {}
        trimmed_patterns = []
        for pattern_idx, pattern in\
            enumerate(submetacluster_results.
                      seqlets_to_patterns_result.patterns):
            pssm = ic_scale(pattern["sequence"].fwd)
            t1, t2 = trim_pssm_idx(pssm)
            trim_sizes[pattern_idx] = t2 - t1
            trimmer = aggregator.TrimToBestWindow(
                        window_size=trim_sizes[pattern_idx],
                        track_names=([x+"_contrib_scores" for x in relevant_task_names]
                                     +[x+"_hypothetical_contribs" for x in relevant_task_names]))
            trimmed_patterns.extend(trimmer([pattern]))

        # In-place mutation: the loaded results object now holds the trimmed
        # patterns, and later cells index into this trimmed list.
        submetacluster_results.seqlets_to_patterns_result.patterns = trimmed_patterns


    pattern_comparison_settings = affinitymat.core.PatternComparisonSettings(
        track_names=([x + "_contrib_scores" for x in relevant_task_names] +
                     [x + "_hypothetical_contribs" for x in relevant_task_names]),  # only compare across relevant tasks
        track_transformer=affinitymat.L1Normalizer(),
        min_overlap=0.7)

    # Default similarity computer shared by all patterns in this metacluster;
    # overridden per-pattern below when trim_pattern is on.
    pattern_to_seqlets_sim_computer = hit_scoring.PatternsToSeqletsSimComputer(
        pattern_comparison_settings=pattern_comparison_settings,
        cross_metric_computer=affinitymat.core.ParallelCpuCrossMetricOnNNpairs(
            n_cores=10,
            cross_metric_single_region=affinitymat.core.CrossContinJaccardSingleRegionWithArgmax(),
            verbose=False),
        seqlet_trimmer=modisco.hit_scoring.SeqletTrimToBestWindow(
            window_size=seqlet_size_to_score_with,
            track_names=[x + "_contrib_scores" for x
                         in relevant_task_names])
    )

    # Get a list of scorers for all the patterns in the metacluster
    metacluster_pattern_scorers = []

    if submetacluster_results.seqlets_to_patterns_result.patterns is None or \
        len(submetacluster_results.seqlets_to_patterns_result.patterns) == 0:
        # metacluster has no patterns
        # don't append anything
        continue

    for pattern_idx, pattern in\
        enumerate(submetacluster_results.
                   seqlets_to_patterns_result.patterns):
        metacluster_idx = int(metacluster_name.split("_")[1])
        all_pattern_names.append("metacluster_"+str(metacluster_idx)
                             +",pattern_"+str(pattern_idx))
        if trim_pattern:
            # Per-pattern similarity computer: cap the seqlet window at the
            # pattern's own trimmed size so short patterns aren't compared
            # over a wider window than they span.
            pattern_to_seqlets_sim_computer = hit_scoring.PatternsToSeqletsSimComputer(
                pattern_comparison_settings=pattern_comparison_settings,
                cross_metric_computer=affinitymat.core.ParallelCpuCrossMetricOnNNpairs(
                    n_cores=10,
                    cross_metric_single_region=affinitymat.core.CrossContinJaccardSingleRegionWithArgmax(),
                    verbose=False),
                seqlet_trimmer=modisco.hit_scoring.SeqletTrimToBestWindow(
                    window_size=min(seqlet_size_to_score_with, trim_sizes[pattern_idx]),
                    track_names=[x + "_contrib_scores" for x
                                 in relevant_task_names])
            )
        pattern_scorer = hit_scoring.RankBasedPatternScorer(
                aggseqlets=pattern,
                patterns_to_seqlets_sim_computer=
                    pattern_to_seqlets_sim_computer)
        metacluster_pattern_scorers.append(pattern_scorer)
        all_pattern_scorers.append(pattern_scorer)
    #This is the final scorer for the metacluster;
    # it takes the maximum score produced by all the
    # individual scorers
    max_rank_based_pattern_scorer = hit_scoring.MaxRankBasedPatternScorer(
                                        pattern_scorers=metacluster_pattern_scorers)
    # metacluster_idx was set inside the pattern loop; the empty-pattern
    # `continue` above guarantees it is defined here.
    metacluster_idx_to_scorer[metacluster_idx] = max_rank_based_pattern_scorer
# Scorer spanning every pattern from every metacluster.
cross_metacluster_scorer = hit_scoring.MaxRankBasedPatternScorer(
                                pattern_scorers=all_pattern_scorers)
In [29]:
from modisco.visualization import viz_sequence
from collections import defaultdict, OrderedDict

# Assign each test-set seqlet to a metacluster using the metaclusterer
# fit on the validation results, then bucket seqlets by metacluster index.
metacluster_indices = (
    loaded_tfmodisco_results.metaclustering_results.metaclusterer
    .transform(seqlets_from_data).metacluster_indices)

metacluster_idx_to_seqlets = defaultdict(list)
for seqlet, cluster_idx in zip(seqlets_from_data, metacluster_indices):
    metacluster_idx_to_seqlets[cluster_idx].append(seqlet)
In [30]:
#map to patterns within each metacluster
metacluster_idx_to_localized_and_trimmed_seqlets = OrderedDict()
metacluster_to_seqlet_pattern_matches = OrderedDict()
for metacluster_idx in metacluster_idx_to_scorer:
    print("working on metacluster ", metacluster_idx)
    # Score every seqlet in this metacluster against the metacluster's
    # patterns; each result carries pattern_idx, offset and revcomp
    # (presumably the best-matching pattern and its alignment — confirm
    # against hit_scoring.MaxRankBasedPatternScorer).
    metacluster_to_seqlet_pattern_matches[metacluster_idx] =\
        metacluster_idx_to_scorer[metacluster_idx](metacluster_idx_to_seqlets[metacluster_idx])
    metacluster_idx_to_localized_and_trimmed_seqlets[metacluster_idx] = []
    submetacluster_results = (loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results['metacluster_'+str(metacluster_idx)])
    for score_result, seqlet in \
        zip(metacluster_to_seqlet_pattern_matches[metacluster_idx], metacluster_idx_to_seqlets[metacluster_idx]):
        # Translate the match back to original sequence coordinates:
        # shift the seqlet start by the match offset, span the matched
        # pattern's length.
        pattern_start = seqlet.coor.start + score_result.offset
        pattern_end = pattern_start + len(submetacluster_results.seqlets_to_patterns_result.patterns[score_result.pattern_idx])
        metacluster_idx_to_localized_and_trimmed_seqlets[metacluster_idx].append((seqlet.coor.example_idx, pattern_start, pattern_end, score_result.revcomp))
working on metacluster  0
working on metacluster  1
working on metacluster  10
working on metacluster  11
working on metacluster  2
working on metacluster  3
working on metacluster  4
working on metacluster  5
working on metacluster  6
working on metacluster  8
working on metacluster  9