TODO

  • [ ] look at some of the examples for some motifs to debug it further
  • [ ] explicitly exclude some pairwise distances (align trimmed motif to the reference)

Requires

  • {modisco_dir}/modisco.h5
  • {modisco_dir}/instances.parq
  • {output_dir}/patterns.pkl

Produces

  • {output_dir}/profile/pattern-table.TE.contrib-cluster.html
  • {output_dir}/profile/pattern-table.TE.contrib-cluster.csv
In [27]:
%matplotlib inline
from uuid import uuid4
from collections import OrderedDict
from kipoi.utils import unique_list
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_long
from basepair.imports import *


# Paths: `ddir` and `Path` come from the `basepair.imports` star-import above
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"  # holds modisco.h5 and instances.parq
output_dir = modisco_dir  # results are written next to the modisco output
In [2]:
# Open the TF-MoDISco result file and derive the task (TF) names
# from the part of each task string before the '/'
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = [x.split("/")[0] for x in mr.tasks()]
In [3]:
# Pickled list of Pattern objects produced by an upstream notebook
patterns = read_pkl(output_dir / 'patterns.pkl')

Add the fraction of other motifs co-occurring with each pattern

  • Use the following information to remove duplicates: 'align': {'use_rc': False, 'offset': 7},
In [4]:
import holoviews as hv
hv.extension('bokeh')
In [5]:
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
In [6]:
# specify motifs to use in the analysis
# specify motifs to use in the analysis
# mapping: readable motif name -> short modisco pattern id (m<metacluster>_p<pattern>)
motifs = OrderedDict([
    ("Oct4-Sox2", "m0_p0"),
    ("Oct4-Sox2-deg", "m6_p8"),
    ("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    ("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
])
In [8]:
# Load instances
#
# Load instances
#
# NOTE(review): dedup=False keeps duplicate/overlapping instances — confirm
# the downstream co-occurrence counting expects this
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs, dedup=False)
dfi.set_index("example_idx", inplace=True)  # index by example for later joins with seqlets
In [9]:
# load seqlets
# load seqlets; trim_frac=0.08 trims low-information flanks of each seqlet
df_seqlets = mr.seqlet_df_instances(trim_frac=0.08)
# Align the index name with dfi ('example_idx') so the two frames can be merged
df_seqlets = df_seqlets.rename(columns={"seqname": "example_idx"}).set_index('example_idx')
TF-MoDISco is using the TensorFlow backend.
In [10]:
# Duplicate 'strand' under the name used when merging with the seqlet table
dfi['pattern_strand'] = dfi['strand']
In [11]:
dfi.head()
Out[11]:
pattern pattern_start pattern_end strand pattern_len pattern_center match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Klf4 match/Nanog match/Oct4 match/Sox2 imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 example_chrom example_start example_end example_strand example_interval_from_task pattern_short pattern_name pattern_start_abs pattern_end_abs pattern_strand
example_idx
0 metacluster_0/pattern_0 104 119 + 15 111 0.2469 0.0010 low 0.4484 Oct4 0.0365 NaN NaN 0.0524 Oct4 3.9825 0.0085 low 0.2654 -0.3593 0.4484 0.3725 0.0235 0.0246 0.0524 0.0313 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482648 143482663 +
0 metacluster_0/pattern_0 263 278 - 15 270 0.2518 0.0011 low 0.2893 Oct4 0.0217 NaN NaN 0.0342 Nanog 1.1066 0.0007 low 0.1618 0.2170 0.2893 0.2803 0.0143 0.0342 0.0200 0.0203 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482807 143482822 -
0 metacluster_0/pattern_0 282 297 + 15 289 0.3466 0.0069 low 0.3870 Sox2 0.0272 NaN NaN 0.0296 Sox2 5.0100 0.0188 low 0.2257 0.3454 0.3723 0.3870 0.0233 0.0258 0.0279 0.0296 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482826 143482841 +
0 metacluster_0/pattern_0 445 460 + 15 452 0.2399 0.0008 low 0.2469 Sox2 0.1852 0.0008 high 0.4145 Nanog 1.7775 0.0012 low 0.2458 0.2408 0.2314 0.2469 0.1234 0.4145 0.0949 0.1896 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482989 143483004 +
0 metacluster_0/pattern_0 475 490 - 15 482 0.2903 0.0025 low 0.3322 Oct4 0.2889 0.0179 high 0.5196 Nanog 3.3390 0.0057 low 0.2092 0.2681 0.3322 0.2991 0.1613 0.5196 0.2049 0.3241 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143483019 143483034 -
In [12]:
df_seqlets.head()
Out[12]:
start end name strand pattern center
example_idx
2 498 513 + metacluster_0/pattern_0 505.5
49274 496 511 + metacluster_0/pattern_0 503.5
31492 488 503 + metacluster_0/pattern_0 495.5
31190 461 476 + metacluster_0/pattern_0 468.5
31157 489 504 + metacluster_0/pattern_0 496.5

Assumptions

  • two motifs can't be at the same place (not even as a reverse complement). Hence the exclusion still holds
In [13]:
# create a table with all possible pairs
In [14]:
# Trim each pattern by sequence information content and pad back to width 70
raw_patterns = [mr.get_pattern(pn).trim_seq_ic(0.08).pad(70) for pn in mr.patterns()]

# Get the major motifs (the curated subset listed in `motifs`)
major_motifs = [p for p in raw_patterns if p.name in [longer_pattern(x) for x in motifs.values()]]

def get_offset(d):
    """Return the alignment offset from an 'align' attrs dict.

    The offset sign is flipped when the alignment used the reverse
    complement (d['use_rc'] is True).
    """
    return -d['offset'] if d['use_rc'] else d['offset']

# align seqlet -> pattern and get the diff
shifts = pd.DataFrame([{"seqlet_pattern": p_seqlet.name,
                        "pattern": p_instance.name,
                        "offset": get_offset(p_seqlet.align(p_instance).attrs['align'])}
                       for p_seqlet in raw_patterns 
                       for p_instance in major_motifs])
In [16]:
# Running integer id per seqlet (re-created inside seqlet_minstance_coocurrence as well)
df_seqlets['seqlet_idx'] = np.arange(len(df_seqlets))
In [18]:
df_seqlets.head()
Out[18]:
start end name strand pattern center seqlet_idx
example_idx
2 498 513 + metacluster_0/pattern_0 505.5 0
49274 496 511 + metacluster_0/pattern_0 503.5 1
31492 488 503 + metacluster_0/pattern_0 495.5 2
31190 461 476 + metacluster_0/pattern_0 468.5 3
31157 489 504 + metacluster_0/pattern_0 496.5 4
In [19]:
def seqlet_minstance_coocurrence(df_seqlets, dfi, min_dist=15, max_dist=150, random_state=None):
    """Compute the co-occurrence of the seqlets with the motif instances.

    For each seqlet, counts whether a motif instance of each `pattern_name`
    lies strictly within (min_dist, max_dist) bp of the alignment-shifted
    seqlet center, then aggregates per seqlet pattern. The same quantity is
    computed after randomly permuting the motif labels as an empirical
    background.

    Requires the module-level `shifts` DataFrame (seqlet_pattern/pattern
    alignment offsets). NOTE: mutates `df_seqlets` in place by (re)writing
    its 'seqlet_idx' column.

    Args:
      df_seqlets: pd.DataFrame returned by mr.seqlet_df_instances(trim_frac=0.08)
        with example_idx index
      dfi: pd.DataFrame loaded from the parquet file with example_idx index
      min_dist: minimum distance (exclusive) between the seqlet and minstance
      max_dist: maximum distance (exclusive) between the seqlet and minstance
      random_state: optional int seed for the label permutation used by the
        random background; default None keeps the previous non-deterministic
        behaviour

    Returns:
      (c, cr): two pd.DataFrames indexed by seqlet pattern name ('pattern')
        with motif 'pattern_name' columns - observed and label-shuffled
        co-occurrence counts
    """
    assert df_seqlets.index.name == 'example_idx'
    assert dfi.index.name == 'example_idx'

    df_seqlets['seqlet_idx'] = np.arange(len(df_seqlets))

    # Cross-join seqlets with motif instances of the same example
    dfi_crossp = pd.merge(df_seqlets[['center', 'pattern', 'strand', 'seqlet_idx']].rename(columns=dict(pattern="seqlet_pattern", 
                                                                                      strand='seqlet_strand',
                                                                                      center='seqlet_center')), 
                          dfi[["pattern_center", "pattern", "pattern_name", 'pattern_strand']], 
                          how='outer', left_index=True, right_index=True).reset_index()
    # Note that this table can have None-values where one of both is not found

    dfi_crossp = pd.merge(dfi_crossp, shifts, on=['seqlet_pattern', 'pattern'], how='left')

    # shift the seqlet accordingly (offset sign flips on the '-' strand)
    dfi_crossp['shifted_center'] = dfi_crossp.seqlet_center - dfi_crossp.offset * dfi_crossp.seqlet_strand.map({"+": 1, "-": -1})

    # TODO - this is wrong - we ditch seqlet instances that don't exist

    # consider only valid pairs (strictly between min_dist and max_dist);
    # .copy() avoids SettingWithCopyWarning on the column assignment below
    dist = np.abs(dfi_crossp.shifted_center - dfi_crossp.pattern_center)
    dfi_crossp = dfi_crossp[(dist > min_dist) & (dist < max_dist)].copy()

    # 1 if the (seqlet, motif) pair co-occurs at least once, else 0
    # (equivalent to the original int(np.sum(x.isnull()==False) > 0), but faster)
    counts = dfi_crossp.pivot_table(index=['seqlet_idx', 'seqlet_pattern'], 
                                    columns='pattern_name', 
                                    values='shifted_center', 
                                    aggfunc=lambda x: int(x.notnull().any()), 
                                    fill_value=0)

    # Random background: permute the motif labels across all valid pairs
    dfi_crossp['pattern_name_random'] = dfi_crossp.pattern_name.sample(frac=1, random_state=random_state).values

    counts_random = dfi_crossp.pivot_table(index=['seqlet_idx', 'seqlet_pattern'], 
                                           columns='pattern_name_random', 
                                           values='shifted_center', 
                                           aggfunc=lambda x: int(x.notnull().any()), 
                                           fill_value=0)

    # Aggregate over seqlets -> one row per seqlet pattern
    c = counts.groupby(level=1).sum()
    c.columns.name = None
    c.index.name = 'pattern'

    cr = counts_random.groupby(level=1).sum()
    cr.columns.name = None
    cr.index.name = 'pattern'

    return c, cr
In [20]:
# Patterns whose 'pattern_group' attr is 'nte' — TODO confirm what 'nte' stands for
nte_patterns = [p.name for p in patterns if p.attrs['pattern_group'] == 'nte']

match_weighted_cat!="low", imp_weighted_cat=="high", [15, 80]

Question: does pattern occur significantly more often with seqlet_pattern or not?

In [21]:
# Observed vs label-shuffled co-occurrence counts for confidently matched,
# high-importance instances within (15, 80) bp
c, cr = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                    min_dist=15, max_dist=80)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)
In [22]:
from scipy.stats import binom_test

# Per-pattern number of seqlets — denominator for the frequencies below
n_seqlets = df_seqlets.groupby("pattern").size()
p0 = cr.divide(n_seqlets, axis='rows')  # expected (label-shuffled) co-occurrence frequency
p1 = c.divide(n_seqlets, axis='rows')   # observed co-occurrence frequency

# One-sided binomial test: is the observed count higher than expected under p0?
dfp = pd.DataFrame([{tf: binom_test(c.loc[pattern, tf], n_seqlets.loc[pattern], p0.loc[pattern, tf], alternative='greater')
     for tf in c.columns}
    for pattern in c.index], index=c.index)

# Control for multiple testing (Bonferroni): multiply p-values by the number
# of tests, capped at 1. The previous `dfp / dfp.size` shrank the p-values,
# which is anti-conservative (the opposite of a correction).
dfp = (dfp * dfp.size).clip(upper=1)
In [23]:
# Use the observed co-occurrence frequency for the heatmaps below
dfco = p1
In [24]:
# sort by metacluster and pattern
# extract_name_long adds 'metacluster'/'pattern' helper columns, used only
# for sorting and dropped right after
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
In [25]:
signif_threshold = 1e-5
signif = dfp.loc[nte_patterns] < signif_threshold
# Annotation matrix: '*' marks significant cells
a = np.zeros_like(signif).astype(str)
a[signif] = "*"
a[~signif] = ""
# NOTE(review): the matrix is (n patterns x n motifs) and generally not
# square — fill_diagonal only blanks the top-left diagonal entries; confirm
# this is the intended self-comparison masking
np.fill_diagonal(a, '')
fig, ax = plt.subplots(figsize=(6, 22))
# Plot the raw odds-ratio p1/(p0 + eps); the previous title claimed a log2
# transform that was never applied
sns.heatmap((p1 / (p0 + 0.001)).loc[nte_patterns], annot=a, fmt="", vmin=0, vmax=4, center=1, cmap='RdBu_r', ax=ax)
plt.title(f"Odds-ratio p1/(p0+1e-3). (*: p<{signif_threshold})");

In [50]:
# NOTE(review): the function returns a (c, cr) tuple — `dfi_crossp` is a
# misleadingly named tuple here, not the cross-product DataFrame
dfi_crossp = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                    min_dist=15, max_dist=150)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)
In [21]:
# NOTE(review): stale cell — this run was interrupted (traceback below) and
# the unpacking matches an older version of the function that returned
# (mean, sum); the current function returns (counts, random counts)
dfco, dfco_sum = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                    min_dist=15, max_dist=150)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-21-e74024e71b6f> in <module>()
      1 dfco, dfco_sum = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
----> 2                                     min_dist=15, max_dist=150)
      3 # standardize
      4 #dfco = (dfco- dfco.mean(0)) / dfco.std(0)

<ipython-input-19-6b2505df5129> in seqlet_minstance_coocurrence(df_seqlets, dfi, min_dist, max_dist)
     35                                     values='shifted_center',
     36                                     aggfunc=lambda x: int(np.sum(x.isnull()==False) > 0),
---> 37                                     fill_value=0)
     38     c = counts.groupby(level=1).mean()
     39     cs = counts.groupby(level=1).sum()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in pivot_table(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)
   5301                            aggfunc=aggfunc, fill_value=fill_value,
   5302                            margins=margins, dropna=dropna,
-> 5303                            margins_name=margins_name)
   5304 
   5305     def stack(self, level=-1, dropna=True):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)
     85     # if we have a categorical
     86     grouped = data.groupby(keys, observed=False)
---> 87     agged = grouped.agg(aggfunc)
     88     if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
     89         agged = agged.dropna(how='all')

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in aggregate(self, arg, *args, **kwargs)
   4654         axis=''))
   4655     def aggregate(self, arg, *args, **kwargs):
-> 4656         return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
   4657 
   4658     agg = aggregate

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in aggregate(self, arg, *args, **kwargs)
   4093             # grouper specific aggregations
   4094             if self.grouper.nkeys > 1:
-> 4095                 return self._python_agg_general(arg, *args, **kwargs)
   4096             else:
   4097 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _python_agg_general(self, func, *args, **kwargs)
   1066         for name, obj in self._iterate_slices():
   1067             try:
-> 1068                 result, counts = self.grouper.agg_series(obj, f)
   1069                 output[name] = self._try_cast(result, obj, numeric_only=True)
   1070             except TypeError:

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in agg_series(self, obj, func)
   2668     def agg_series(self, obj, func):
   2669         try:
-> 2670             return self._aggregate_series_fast(obj, func)
   2671         except Exception:
   2672             return self._aggregate_series_pure_python(obj, func)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _aggregate_series_fast(self, obj, func)
   2688         grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups,
   2689                                           dummy)
-> 2690         result, counts = grouper.get_result()
   2691         return result, counts
   2692 

pandas/_libs/reduction.pyx in pandas._libs.reduction.SeriesGrouper.get_result()

pandas/_libs/reduction.pyx in pandas._libs.reduction.SeriesGrouper.get_result()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in <lambda>(x)
   1060     def _python_agg_general(self, func, *args, **kwargs):
   1061         func = self._is_builtin_func(func)
-> 1062         f = lambda x: func(x, *args, **kwargs)
   1063 
   1064         # iterate through "columns" ex exclusions to populate output dict

<ipython-input-19-6b2505df5129> in <lambda>(x)
     34                                     columns='pattern_name',
     35                                     values='shifted_center',
---> 36                                     aggfunc=lambda x: int(np.sum(x.isnull()==False) > 0),
     37                                     fill_value=0)
     38     c = counts.groupby(level=1).mean()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/series.py in isnull(self)
   3802     @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
   3803     def isnull(self):
-> 3804         return super(Series, self).isnull()
   3805 
   3806     @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py in isnull(self)
   6224     @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
   6225     def isnull(self):
-> 6226         return isna(self).__finalize__(self)
   6227 
   6228     _shared_docs['notna'] = """

KeyboardInterrupt: 
In [ ]:
 

Sort by metacluster

In [17]:
# sort by metacluster and pattern
# NOTE(review): this sort cell is copy-pasted many times below — consider a helper
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
In [18]:
# Interactive co-occurrence heatmap for the nte patterns (holoviews/bokeh)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[18]:

Sort by motif grouping

In [19]:
# Same heatmap, rows ordered by the nte_patterns list (motif grouping)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[19]:

match_weighted_cat!="low", imp_weighted_cat=="high", [15, 50]

In [20]:
# NOTE(review): stale — seqlet_minstance_coocurrence now returns a (c, cr)
# tuple, so `dfco` is bound to a tuple and the cells below will fail on re-run
dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                    min_dist=15, max_dist=50)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)

Sort by metacluster

In [21]:
# sort by metacluster and pattern (copy-pasted cell — see note above the first occurrence)
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
In [22]:
# Co-occurrence heatmap, metacluster order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[22]:

Sort by motif grouping

In [23]:
# Co-occurrence heatmap, motif-grouping order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[23]:

match_weighted_cat!="low", imp_weighted_cat!="low", [15, 150]

In [24]:
# NOTE(review): stale — the function now returns a (c, cr) tuple (see note above)
dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat!="low"'),
                                    min_dist=15, max_dist=150)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)

Sort by metacluster

In [25]:
# sort by metacluster and pattern (copy-pasted cell)
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
In [26]:
# Co-occurrence heatmap, metacluster order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[26]:

Sort by motif grouping

In [27]:
# Co-occurrence heatmap, motif-grouping order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[27]:

match_weighted_cat=="high", imp_weighted_cat!="low", [15, 150]

In [28]:
# NOTE(review): stale — the function now returns a (c, cr) tuple (see note above)
dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat=="high"').query('imp_weighted_cat!="low"'),
                                    min_dist=15, max_dist=150)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)

Sort by metacluster

In [29]:
# sort by metacluster and pattern (copy-pasted cell)
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
In [30]:
# Co-occurrence heatmap, metacluster order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[30]:

Sort by motif grouping

In [31]:
# Co-occurrence heatmap, motif-grouping order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[31]:

match_weighted_cat=="low", imp_weighted_cat=="high", [15, 150]

In [53]:
# NOTE(review): stale — the function now returns a (c, cr) tuple (see note above)
dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat=="low"').query('imp_weighted_cat=="high"'),
                                    min_dist=15, max_dist=150)

Sort by metacluster

In [54]:
# sort by metacluster and pattern (copy-pasted cell)
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
In [55]:
# Co-occurrence heatmap, metacluster order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[55]:

match_weighted_cat=="low", imp_weighted_cat=="high", [15, 50]

In [56]:
# NOTE(review): stale — the function now returns a (c, cr) tuple (see note above)
dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat=="low"').query('imp_weighted_cat=="high"'),
                                    min_dist=15, max_dist=50)

Sort by metacluster

In [57]:
# sort by metacluster and pattern (copy-pasted cell)
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
In [58]:
# Co-occurrence heatmap, metacluster order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[58]:

Sort by motif grouping

In [59]:
# Co-occurrence heatmap, motif-grouping order (copy-pasted cell)
%opts HeatMap [xrotation=90] (cmap='Blues') 
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
Out[59]:

Append to patterns

In [39]:
# NOTE(review): stale — the function now returns a (c, cr) tuple (see note above)
dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                    min_dist=15, max_dist=50)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)
In [40]:
patterns[0].attrs['motif_freq']
Out[40]:
{'Essrb': 0.0,
 'Klf4': 0.3333333333333333,
 'Nanog': 0.1111111111111111,
 'Nanog-periodic': 0.0,
 'Oct4': 0.1111111111111111,
 'Oct4-Sox2': 0.2222222222222222,
 'Oct4-Sox2-deg': 0.2222222222222222,
 'Sox2': 0.3333333333333333}
In [268]:
def get_values(dfco, pname):
    """Return row `pname` of `dfco` as a dict, or a zero-filled dict
    (one key per column) when the row is absent."""
    if pname in dfco.index:
        return dict(dfco.loc[pname])
    return {c: 0 for c in dfco.columns}
In [315]:
# Attach the per-motif odds-ratio (observed / shuffled frequency) to each pattern
patterns = [p.add_attr('motif_odds', get_values(p1 / (p0 + 0.001), p.name)) for p in patterns]
In [316]:
# Attach the corresponding co-occurrence p-values
patterns = [p.add_attr('motif_odds_p', get_values(dfp, p.name)) for p in patterns]
In [271]:
# Write back the pickle file
# NOTE(review): this overwrites the input patterns.pkl in place
write_pkl(patterns, output_dir / 'patterns.pkl')

Visualize

Convert patterns -> table

In [317]:
# split the patterns back into the groups
# (pattern_group attribute was set by an upstream notebook)
patterns_nte_clustered = [x for x in patterns if x.attrs['pattern_group'] == 'nte']
patterns_te_clustered = [x for x in patterns if x.attrs['pattern_group'] == 'te']
In [318]:
from basepair.modisco.motif_clustering import *
In [320]:
# NOTE(review): out-of-order execution — this cell (In[320]) inspects
# pattern_table_nte_seq before it is created in the next cell (In[321]);
# it will NameError on a fresh Restart & Run All
pattern_table_nte_seq.columns
Out[320]:
Index(['Essrb/lod', 'Essrb/lod_p', 'Klf4 footprint counts',
       'Klf4 footprint entropydiff', 'Klf4 footprint max',
       'Klf4 footprint strandcor', 'Klf4 imp counts', 'Klf4 imp profile',
       'Klf4 periodicity 10bp', 'Klf4 pos meandiff', 'Klf4 pos std',
       'Klf4 pos unimodal', 'Klf4 region counts', 'Klf4/d', 'Klf4/f',
       'Klf4/lod', 'Klf4/lod_p', 'Nanog footprint counts',
       'Nanog footprint entropydiff', 'Nanog footprint max',
       'Nanog footprint strandcor', 'Nanog imp counts', 'Nanog imp profile',
       'Nanog periodicity 10bp', 'Nanog pos meandiff', 'Nanog pos std',
       'Nanog pos unimodal', 'Nanog region counts', 'Nanog-periodic/lod',
       'Nanog-periodic/lod_p', 'Nanog/d', 'Nanog/f', 'Nanog/lod',
       'Nanog/lod_p', 'Oct4 footprint counts', 'Oct4 footprint entropydiff',
       'Oct4 footprint max', 'Oct4 footprint strandcor', 'Oct4 imp counts',
       'Oct4 imp profile', 'Oct4 periodicity 10bp', 'Oct4 pos meandiff',
       'Oct4 pos std', 'Oct4 pos unimodal', 'Oct4 region counts',
       'Oct4-Sox2-deg/lod', 'Oct4-Sox2-deg/lod_p', 'Oct4-Sox2/lod',
       'Oct4-Sox2/lod_p', 'Oct4/d', 'Oct4/f', 'Oct4/lod', 'Oct4/lod_p',
       'Sox2 footprint counts', 'Sox2 footprint entropydiff',
       'Sox2 footprint max', 'Sox2 footprint strandcor', 'Sox2 imp counts',
       'Sox2 imp profile', 'Sox2 periodicity 10bp', 'Sox2 pos meandiff',
       'Sox2 pos std', 'Sox2 pos unimodal', 'Sox2 region counts', 'Sox2/d',
       'Sox2/f', 'Sox2/lod', 'Sox2/lod_p', 'cluster', 'consensus',
       'ic pwm mean', 'idx', 'logo_imp', 'logo_seq', 'n seqlets', 'pattern',
       'Klf4/d_p', 'Nanog/d_p', 'Oct4/d_p', 'Sox2/d_p'],
      dtype='object')
In [321]:
# Render the pattern summary table (logos, footprints) for the nte group;
# n_jobs=20 parallelizes the rendering
pattern_table_nte_seq = create_pattern_table(patterns_nte_clustered,
                                             logo_len=50, 
                                             seqlogo_kwargs=dict(width=420),
                                             n_jobs=20,
                                             footprint_width=120,
                                             footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 96/96 [00:04<00:00, 20.56it/s]
In [322]:
# Same table for the te group (wider logos: logo_len=70)
pattern_table_te_seq = create_pattern_table(patterns_te_clustered,
                                            logo_len=70, 
                                            seqlogo_kwargs=dict(width=420),
                                            n_jobs=20,
                                            footprint_width=120,
                                            footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 46/46 [00:01<00:00, 26.07it/s]

Focus only on the things to visualize

In [323]:
background_motifs = ['Essrb', 'Klf4', 'Nanog','Oct4','Oct4-Sox2', 'Sox2']

# NOTE(review): the column listing printed above shows '<motif>/lod' columns,
# not '<motif>/odds' — verify these '/odds' columns exist before indexing
# with colorder (possible KeyError in the df2html cells below)
colorder = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/f' for task in tasks] + [t + '/d_p' for t in tasks] + [m+'/odds' for m in background_motifs]
In [324]:
# Ensure the output directory exists (idempotent thanks to exist_ok=True)
(output_dir / 'motif_clustering').mkdir(exist_ok=True)
In [329]:
# Columns to drop from the CSV export (per-task fractions and logo images)
remove = [task+'/f' for task in tasks] + ['logo_imp', 'logo_seq']
In [326]:
# Export the nte table to HTML; uuid4 gives the table a unique DOM id
html_str_contrib = df2html(pattern_table_nte_seq[colorder], "table_contrib" + str(uuid4()), "")
with open(output_dir / 'motif_clustering/smaller.seq-cluster.v3.html', 'w') as fo:
    fo.write(html_str_contrib)
# HTML(html_str_contrib)
In [327]:
# Export the te table to HTML (same pattern as the nte export above)
html_str_te_contrib = df2html(pattern_table_te_seq[colorder], "table_seq" + str(uuid4()), "")
with open(output_dir / 'motif_clustering/te-cluster.v3.html', 'w') as fo:
    fo.write(html_str_te_contrib)
# html_str_seq
In [337]:
# Overwrite 'pattern' with short ids (m<k>_p<j>)
# NOTE(review): relies on the table's row order matching patterns_nte_clustered —
# confirm create_pattern_table preserves input order
pattern_table_nte_seq['pattern'] = [shorten_pattern(p.name) for p in patterns_nte_clustered]
In [338]:
# Write the CSV without the logo/per-task columns listed in `remove`
pattern_table_nte_seq.iloc[:, ~pattern_table_nte_seq.columns.isin(remove)].to_csv(output_dir / 'motif_clustering/smaller.seq-cluster.v3.csv')