%matplotlib inline
from uuid import uuid4
from collections import OrderedDict
from kipoi.utils import unique_list
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_long
from basepair.imports import *
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
output_dir = modisco_dir
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = [x.split("/")[0] for x in mr.tasks()]
patterns = read_pkl(output_dir / 'patterns.pkl')
'align': {'use_rc': False, 'offset': 7},import holoviews as hv
hv.extension('bokeh')
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
# specify motifs to use in the analysis (display name -> short pattern id)
motifs = OrderedDict([
    ("Oct4-Sox2", "m0_p0"),
    ("Oct4-Sox2-deg", "m6_p8"),
    ("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    ("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
])

# Load motif instances (one row per called instance), indexed by example_idx
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs, dedup=False)
dfi.set_index("example_idx", inplace=True)

# load seqlets discovered by modisco and index them the same way
# (the 'seqname' column holds the example index here)
df_seqlets = mr.seqlet_df_instances(trim_frac=0.08)
df_seqlets = df_seqlets.rename(columns={"seqname": "example_idx"}).set_index('example_idx')
dfi['pattern_strand'] = dfi['strand']
dfi.head()
df_seqlets.head()

# create a table with all possible pairs: trim each pattern by sequence
# information content and pad to a common width so they can be aligned
raw_patterns = [mr.get_pattern(pn).trim_seq_ic(0.08).pad(70) for pn in mr.patterns()]
# Get the major motifs (the subset named in `motifs` above)
major_motifs = [p for p in raw_patterns if p.name in [longer_pattern(x) for x in motifs.values()]]
def get_offset(d):
    """Return the alignment offset from an 'align' attrs dict,
    sign-flipped when the alignment used the reverse complement.

    Args:
        d: dict with a boolean 'use_rc' and a numeric 'offset' key
    """
    sign = -1 if d['use_rc'] else 1
    return sign * d['offset']
# align seqlet -> pattern and get the diff: for every (seqlet pattern, major
# motif) pair, record the alignment offset (sign-flipped for reverse-complement
# alignments, see get_offset)
shifts = pd.DataFrame([{"seqlet_pattern": p_seqlet.name,
                        "pattern": p_instance.name,
                        "offset": get_offset(p_seqlet.align(p_instance).attrs['align'])}
                       for p_seqlet in raw_patterns
                       for p_instance in major_motifs])
# running integer id per seqlet row
df_seqlets['seqlet_idx'] = np.arange(len(df_seqlets))
df_seqlets.head()
def seqlet_minstance_coocurrence(df_seqlets, dfi, min_dist=15, max_dist=150):
    """Compute the co-occurence of the seqlets with the motif instances.

    Pairs every seqlet with every motif instance in the same example, shifts
    the seqlet center by the pattern-alignment offset (module-level `shifts`
    table), keeps pairs whose center distance lies in (min_dist, max_dist),
    and counts, per seqlet pattern, how many seqlets co-occur with each
    motif. A shuffled-label version of the counts serves as the null.

    Args:
        df_seqlets: pd.DataFrame returned by mr.seqlet_df_instances(trim_frac=0.08) with example_idx index.
            NOTE: mutated in place (a 'seqlet_idx' column is (re-)assigned).
        dfi: pd.DataFrame loaded from the parquet file with example_idx index
        min_dist: minimum distance between the seqlet and motif instance (exclusive)
        max_dist: maximum distance between the seqlet and motif instance (exclusive)

    Returns:
        tuple (c, cr) of pd.DataFrames indexed by seqlet pattern name with one
        column per motif 'pattern_name': observed counts and shuffled (null) counts
    """
    assert df_seqlets.index.name == 'example_idx'
    assert dfi.index.name == 'example_idx'
    df_seqlets['seqlet_idx'] = np.arange(len(df_seqlets))
    # cross-join seqlets x motif instances within each example (outer merge on index)
    dfi_crossp = pd.merge(df_seqlets[['center', 'pattern', 'strand', 'seqlet_idx']].rename(columns=dict(pattern="seqlet_pattern",
                                                                                                        strand='seqlet_strand',
                                                                                                        center='seqlet_center')),
                          dfi[["pattern_center", "pattern", "pattern_name", 'pattern_strand']],
                          how='outer', left_index=True, right_index=True).reset_index()
    # Note that this table can have None-values where one of both is not found
    # attach the alignment offset for each (seqlet_pattern, pattern) pair
    dfi_crossp = pd.merge(dfi_crossp, shifts, on=['seqlet_pattern', 'pattern'], how='left')
    # shift the seqlet accordingly (offset direction depends on seqlet strand)
    dfi_crossp['shifted_center'] = dfi_crossp.seqlet_center - dfi_crossp.offset * dfi_crossp.seqlet_strand.map({"+": 1, "-": -1})
    # TODO - this is wrong - we ditch seqlet instances that don't exist
    # consider only valid pairs (center distance strictly within (min_dist, max_dist))
    dfi_crossp = dfi_crossp[(np.abs(dfi_crossp.shifted_center - dfi_crossp.pattern_center) > min_dist) &
                            (np.abs(dfi_crossp.shifted_center - dfi_crossp.pattern_center) < max_dist)]
    # binary indicator per (seqlet, motif): 1 iff at least one non-null pair exists
    counts = dfi_crossp.pivot_table(index=['seqlet_idx', 'seqlet_pattern'],
                                    columns='pattern_name',
                                    values='shifted_center',
                                    aggfunc=lambda x: int(np.sum(x.isnull()==False) > 0),
                                    fill_value=0)
    # NOTE(review): unseeded shuffle -> the null counts differ between runs
    dfi_crossp['pattern_name_random'] = dfi_crossp.pattern_name.sample(frac=1).values
    # TODO - index should be seqlet_idx not example_idx
    counts_random = dfi_crossp.pivot_table(index=['seqlet_idx', 'seqlet_pattern'],
                                           columns='pattern_name_random',
                                           values='shifted_center',
                                           aggfunc=lambda x: int(np.sum(x.isnull()==False) > 0),
                                           fill_value=0)
    # return counts, counts_random
    # aggregate the per-seqlet indicators by seqlet pattern (index level 1)
    c = counts.groupby(level=1).sum()
    c.columns.name = None
    c.index.name = 'pattern'
    cr = counts_random.groupby(level=1).sum()
    cr.columns.name = None
    cr.index.name = 'pattern'
    return c, cr
# patterns in the 'nte' (non transposable-element) group
nte_patterns = [p.name for p in patterns if p.attrs['pattern_group'] == 'nte']

# match_weighted_cat != "low", imp_weighted_cat == "high", distance range [15, 80]
# Question: does pattern occur significantly more often with seqlet_pattern or not?
# (the two lines above were a markdown cell fused into the code by the notebook export)
c, cr = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                     min_dist=15, max_dist=80)
# standardize
# dfco = (dfco - dfco.mean(0)) / dfco.std(0)
from scipy.stats import binom_test

# number of seqlets per pattern -> turn counts into co-occurrence frequencies
n_seqlets = df_seqlets.groupby("pattern").size()
p0 = cr.divide(n_seqlets, axis='rows')  # null (shuffled-label) frequency
p1 = c.divide(n_seqlets, axis='rows')   # observed frequency

# one-sided binomial test: is the observed count larger than expected under
# the shuffled-label frequency p0?
dfp = pd.DataFrame([{tf: binom_test(c.loc[pattern, tf], n_seqlets.loc[pattern], p0.loc[pattern, tf], alternative='greater')
                     for tf in c.columns}
                    for pattern in c.index], index=c.index)
# Control for multiple testing (Bonferroni).
# NOTE(review): the original divided by the number of tests (dfp / dfp.size),
# which makes p-values *smaller*; Bonferroni multiplies by the test count.
dfp = (dfp * dfp.size).clip(upper=1)
dfco = p1
# sort by metacluster and pattern (extract_name_long yields those two columns)
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']

# annotate the heatmap cells that pass the significance threshold with '*'
signif_threshold = 1e-5
signif = dfp.loc[nte_patterns] < signif_threshold
a = np.zeros_like(signif).astype(str)
a[signif] = "*"
a[~signif] = ""
np.fill_diagonal(a, '')  # don't star self-co-occurrence
fig, ax = plt.subplots(figsize=(6, 22))
# log_odds = np.log10(dfco / (p0 + 1e-6))
# log_odds = dfco / (p0 + 1e-3)
# NOTE(review): this plots the plain ratio p1/p0 (pseudo-count 0.001), not its log2
sns.heatmap((p1 / (p0 + 0.001)).loc[nte_patterns], annot=a, fmt="", vmin=0, vmax=4, center=1, cmap='RdBu_r')
plt.title(f"Log2 odds-ratio. (*: p<{signif_threshold})");
# NOTE(review): returns a (counts, counts_random) tuple which is assigned to a
# misleadingly-named variable and never used again below - candidate for removal
dfi_crossp = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                          min_dist=15, max_dist=150)
# standardize
# dfco = (dfco- dfco.mean(0)) / dfco.std(0)
# observed / shuffled co-occurrence counts for the [15, 150] distance range
dfco, dfco_sum = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                              min_dist=15, max_dist=150)
# standardize
# dfco = (dfco- dfco.mean(0)) / dfco.std(0)
# sort by metacluster and pattern (extract_name_long yields those two columns)
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
# heatmap restricted to 'nte' patterns (isin keeps the sorted order)
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
# same data, ordered by nte_patterns instead
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
match_weighted_cat!="low", imp_weighted_cat=="high", [15, 50]¶dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
min_dist=15, max_dist=50)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)
# sort by metacluster and pattern
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
match_weighted_cat!="low", imp_weighted_cat!="low", [15, 150]¶dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat!="low"'),
min_dist=15, max_dist=150)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)
# sort by metacluster and pattern
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
match_weighted_cat=="high", imp_weighted_cat!="low", [15, 150]¶dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat=="high"').query('imp_weighted_cat!="low"'),
min_dist=15, max_dist=150)
# standardize
#dfco = (dfco- dfco.mean(0)) / dfco.std(0)
# sort by metacluster and pattern
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
match_weighted_cat=="low", imp_weighted_cat=="high", [15, 150]¶dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat=="low"').query('imp_weighted_cat=="high"'),
min_dist=15, max_dist=150)
# sort by metacluster and pattern
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
match_weighted_cat=="low", imp_weighted_cat=="high", [15, 50]¶dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat=="low"').query('imp_weighted_cat=="high"'),
min_dist=15, max_dist=50)
# sort by metacluster and pattern
dfco = pd.concat([dfco, pd.DataFrame(pd.Series(dfco.index).apply(extract_name_long).tolist(), index=dfco.index)], axis=1).sort_values(['metacluster', 'pattern'])
del dfco['metacluster']
del dfco['pattern']
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco[dfco.index.isin(nte_patterns)].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
%opts HeatMap [xrotation=90] (cmap='Blues')
dfco.loc[nte_patterns].reset_index().melt(id_vars='pattern').hvplot.heatmap(x='variable', y='pattern', C='value', width=600, height=800, )
# NOTE(review): returns a (counts, counts_random) tuple assigned to a single
# variable; `dfco` is not used again below - candidate for removal
dfco = seqlet_minstance_coocurrence(df_seqlets, dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'),
                                    min_dist=15, max_dist=50)
# standardize
# dfco = (dfco- dfco.mean(0)) / dfco.std(0)
# inspect the stored motif-frequency attribute of the first pattern
patterns[0].attrs['motif_freq']
def get_values(dfco, pname):
    """Return row `pname` of `dfco` as a {column: value} dict.

    Falls back to an all-zero dict when the row is absent, so every
    pattern gets a value for every column.
    """
    if pname in dfco.index:
        return dict(dfco.loc[pname])
    return dict.fromkeys(dfco.columns, 0)
# Attach the co-occurrence odds ratios and their p-values as pattern attributes
patterns = [p.add_attr('motif_odds', get_values(p1 / (p0 + 0.001), p.name)) for p in patterns]
patterns = [p.add_attr('motif_odds_p', get_values(dfp, p.name)) for p in patterns]
# Write back the pickle file
write_pkl(patterns, output_dir / 'patterns.pkl')
# split the patterns back into the groups
patterns_nte_clustered = [x for x in patterns if x.attrs['pattern_group'] == 'nte']
patterns_te_clustered = [x for x in patterns if x.attrs['pattern_group'] == 'te']
from basepair.modisco.motif_clustering import *

# NOTE(review): the original inspected `pattern_table_nte_seq.columns` here,
# *before* the table is created below (out-of-order notebook cell); removed,
# since it would raise NameError when run top-to-bottom.

# Render the HTML tables (logos + footprints) for each pattern group
pattern_table_nte_seq = create_pattern_table(patterns_nte_clustered,
                                             logo_len=50,
                                             seqlogo_kwargs=dict(width=420),
                                             n_jobs=20,
                                             footprint_width=120,
                                             footprint_kwargs=dict(figsize=(3, 1.5)))
pattern_table_te_seq = create_pattern_table(patterns_te_clustered,
                                            logo_len=70,
                                            seqlogo_kwargs=dict(width=420),
                                            n_jobs=20,
                                            footprint_width=120,
                                            footprint_kwargs=dict(figsize=(3, 1.5)))
# motifs whose '/odds' columns are included in the exported tables
background_motifs = ['Essrb', 'Klf4', 'Nanog','Oct4','Oct4-Sox2', 'Sox2']
# column order: pattern info, logos, per-task footprints and distances, odds
colorder = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/f' for task in tasks] + [t + '/d_p' for t in tasks] + [m+'/odds' for m in background_motifs]
(output_dir / 'motif_clustering').mkdir(exist_ok=True)
# columns dropped from the CSV export (plot/logo columns don't serialize to csv)
remove = [task+'/f' for task in tasks] + ['logo_imp', 'logo_seq']
# export the non-TE cluster table as HTML
html_str_contrib = df2html(pattern_table_nte_seq[colorder], "table_contrib" + str(uuid4()), "")
with open(output_dir / 'motif_clustering/smaller.seq-cluster.v3.html', 'w') as fo:
    fo.write(html_str_contrib)
# HTML(html_str_contrib)
# export the TE cluster table as HTML
html_str_te_contrib = df2html(pattern_table_te_seq[colorder], "table_seq" + str(uuid4()), "")
with open(output_dir / 'motif_clustering/te-cluster.v3.html', 'w') as fo:
    fo.write(html_str_te_contrib)
# html_str_seq
# use short pattern names and write the CSV (without the plot columns)
pattern_table_nte_seq['pattern'] = [shorten_pattern(p.name) for p in patterns_nte_clustered]
pattern_table_nte_seq.iloc[:, ~pattern_table_nte_seq.columns.isin(remove)].to_csv(output_dir / 'motif_clustering/smaller.seq-cluster.v3.csv')