In [1]:
from basepair.config import get_data_dir
ddir = get_data_dir()
modisco_subdir = "modisco/all/deeplift/profile/"
model_dir = f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = os.path.join(model_dir, modisco_subdir)
output_dir = modisco_dir
Using TensorFlow backend.

Cluster the motifs by their sequence similarity

Requires

  • {modisco_dir}/modisco.h5
  • {output_dir}/pattern_table.csv
  • {output_dir}/footprints.pkl

Produces

  • {output_dir}/patterns.pkl
In [33]:
print(plt.style.available)
['seaborn-poster', 'seaborn-ticks', 'ggplot', '_classic_test', 'bmh', 'seaborn-darkgrid', 'dark_background', 'tableau-colorblind10', 'seaborn-deep', 'fast', 'seaborn-talk', 'seaborn-bright', 'seaborn-notebook', 'seaborn-dark', 'grayscale', 'seaborn', 'fivethirtyeight', 'seaborn-pastel', 'seaborn-dark-palette', 'seaborn-paper', 'seaborn-colorblind', 'classic', 'Solarize_Light2', 'seaborn-muted', 'seaborn-whitegrid', 'seaborn-white']
In [6]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *

model_dir = Path(model_dir)
modisco_dir = Path(modisco_dir)
In [10]:
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
patterns = [mr.get_pattern(p) for p in mr.patterns()]

# patterns = [mr.get_pattern(p) for p in mr.patterns()]
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")

tasks = [x.split("/")[0] for x in mr.tasks()]

footprints = read_pkl(output_dir / 'footprints.pkl')

# Add profile to patterns
patterns = [p.add_profile(footprints[p.name]) for p in patterns]

# Add features
patterns = [p.add_attr('features', OrderedDict(pattern_table[pattern_table.pattern == shorten_pattern(p.name)].iloc[0])) for p in patterns]

# check that the pattern names match
assert patterns[4].attrs['features']['pattern'] == shorten_pattern(patterns[4].name)

In [11]:
patterns[:2]
Out[11]:
[Pattern('metacluster_0/pattern_0', 'AAAAAATTAGATATTAAGTGTCGTAATTTGCATAACAAAGGACACCATAGGATCATATCTTTGAATGTTT' ...),
 Pattern('metacluster_0/pattern_1', 'ACCTAGGGTGGAGCCTTCCGACCTAGATGACTCCATTGTTCTGCCACTTCCCCCGTTTCTTTTCAAAAAA' ...)]

1. split into two groups: {TE, non-TE}

In [12]:
p = patterns[0]
In [13]:
df = patterns_to_df(patterns, properties=['short_name', 'seq_info_content'])
In [14]:
df.head(2)
Out[14]:
short_name seq_info_content
0 m0_p0 15.3199
1 m0_p1 16.9463
In [15]:
# example borderline motif (ic == 21.498123)
patterns[28].plot(kind='seq');
In [16]:
# one can also plot the profile of the Pattern
patterns[0].plot(kind='profile', rotate_y=0);
In [61]:
TE_cutoff = 34
In [62]:
df.seq_info_content.plot.hist(bins=50)
plt.xlabel("Sequence information content");
plt.axvline(TE_cutoff, color='orange', linewidth=2);
In [47]:
patterns_te = [p for p in patterns if p.seq_info_content > TE_cuttof]
patterns_nte = [p for p in patterns if p.seq_info_content <= TE_cuttof]
assert len(patterns_te) + len(patterns_nte) == len(patterns)
In [48]:
sizes = np.array([p.seq_info_content for p in patterns_te])
In [63]:
p = patterns_te[0]
In [64]:
 
Out[64]:
'TTTGGGGAAGTCCAGGTACTCCATTCAATTGCAAATTAGCAACGATCAAGCTCAGACTCTGACCTCCCGG'
In [ ]:
 
In [68]:
for p in patterns_nte:
    p.plot('seq_ic');
    plt.title(f"{p.name} {p.seq_info_content:.2f}")
In [66]:
for p in patterns_nte:
    print(f">{p.name}")
    print(p.get_consensus())
>metacluster_0/pattern_0
AAAAAATTAGATATTAAGTGTCGTAATTTGCATAACAAAGGACACCATAGGATCATATCTTTGAATGTTT
>metacluster_0/pattern_1
ACCTAGGGTGGAGCCTTCCGACCTAGATGACTCCATTGTTCTGCCACTTCCCCCGTTTCTTTTCAAAAAA
>metacluster_0/pattern_2
CCATGAATAATTTATTAAGGGCCTTAAAAAGAGCCATCAAGATCCAATTAGCGCACTTTAGGACTCTTTA
>metacluster_0/pattern_4
ATCCCAATATATAGGAAGAAAAGGAGGCAGAATTAGAAGTTCAAGGTCAGCCTCGGCTACATAGTAAGTT
>metacluster_0/pattern_5
ATTTAGTTTAAAGGTAACTGTGATGGTTTGCATATGCAAGGCCCAGGGAGTGGCACTATTAGCAGGTGTG
>metacluster_0/pattern_7
TGGGTTAAGTTTGTGCTCTGTGTATTCAAATGCTGATTTCTGTGCCCAGAGATCTGGTTTCAGTCCAGAC
>metacluster_0/pattern_8
ATTTACTCCTTTCTGCCTACATTTGCATAACAGAAGCCGCCCCTAGCCGCTTTTAAGAATGACAATTATC
>metacluster_0/pattern_11
ATTTTAATTCTTATAAAACTTTGTTATTGTTATTCAAATTACTCATCAGCTGAATCATTATTTTTTATGT
>metacluster_0/pattern_12
GGAAGGCTCCACCCCAGGTGGTCTCCTTCTCAGTGGCAGCAAGGTCAAAGTCAGCCTAAGCCAGGCTCAG
>metacluster_0/pattern_17
AATACCAGCACTCAAAAGGCGGAGGCAGGAGGATCACAAGTTCAAGGTCAGCCTGGGCTACATAGTAAGT
>metacluster_0/pattern_26
TTACAATGGATCCTCATTTTCATCTCAATCGCCGCCATTCTGATGCAAATGCACACAGTTTGTATTACAA
>metacluster_0/pattern_27
GCCGCGGCGCCGCGGGGGCGGGGGGGGGAGAACAAAGGGCCGCGCCCGCCCGCGCGGCCGGCCCGCGCCC
>metacluster_1/pattern_0
ATTGGTAAAAAAAAAGATGTAGAGCTGGGTGTGGCCGCGCACGCCTTTAATCCCAGCACTTGGGAGGCAG
>metacluster_1/pattern_1
CCCCCCCGCCGGGGGGGGGGGCGGCCGGGGCCCCGCCCTGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
>metacluster_1/pattern_2
GGGGCCGACGCTCTCTCTAGCCCAGGCTGACCTTGAACTCCGGACCCTCCCCCCCCCTGCCCCCACCTGC
>metacluster_1/pattern_4
AATCGCCCCCGCTGCGGACAGGCTCAAGGGCGTGGCCTTCCTTGCTTGAAAAACAAAGTCGAGAGCGTGA
>metacluster_1/pattern_5
TAGGAGTAGGTGTGGCCTTGTTGGAGTAGGTGTGTCACTGTGGGTGTGGGCTTTAAGACCCTCATCCTAG
>metacluster_1/pattern_6
AGCAGGAACTTGGTGGTATCAGGGTGTGGTCAGGACAGCGGTGATGACTTAGGGCTGGAACATCCCGTGG
>metacluster_1/pattern_7
GCGTGGGAACACGTGAAGAGGGATGCGGGGCCGGCCCCGCCCCACCCTGCCCCCAGGGGATTTTATTGAT
>metacluster_1/pattern_8
GAAGAAAAAGCGGAAGAAATTGTAAGAGGGAAGGGTGGGGCAGTAGAATTAGTGCCCTTGTCAATGACAC
>metacluster_1/pattern_10
ACACCATGAGCACTTTCAGAACGGAGGGTGTGGGTGGGTGTGGCCGCTCCCCTCTTTTTCTCAAGGTTTG
>metacluster_1/pattern_11
GTGAGAATTTGAATGTGCGGGGCCCAGGGTGTGGCACTAGGGGGTGGGGCCGCGTTGGGGTAGGGGTTGC
>metacluster_1/pattern_12
AGGCTGGGGGGGGGGTGGCCGAGGAGTGGGCGTGGCCACCGGGTGGGCGTGGCCCCCGGGGGGGGGGCGG
>metacluster_1/pattern_13
TCAGTTACGATAGGAAAGAAAATTGCTGGGTGGGGCCCCACCCAGCTATCTTATTTTCAGTGCTTAAGGA
>metacluster_1/pattern_16
CCGGGGCGGGGCGGGGGCCGCGCGCGGCAGCCTGCGCGGGAGCGCCGCCGGCGGCGGCCGCCCGGGGCCC
>metacluster_2/pattern_0
AATAAAAAATAAAAAAAAAAAAAAAAAAGAGCCATCAAAACAATTAAAAAAAAATAAATATAAAAAATAA
>metacluster_2/pattern_1
ACATTAGATTTTTATACATTATTAGAGGTCCTAATGAGGTGTTAATTGCTCCCTATTGAGGCTTAACTGT
>metacluster_2/pattern_2
TATATACTACTATTAATTGATATTTAAGTACCCATTAATAAAAACAATGGAAGCAATTAGATAACAATTA
>metacluster_2/pattern_3
AATTAGTTTTTTAATCTTCAGTTAATGGGCCATCAACAGCAAATTAAAAATAATAAAGTAAATATTGATG
>metacluster_2/pattern_4
AAATTAGCTATTAAGTAACTTAATTTGCATAACAATAGACCCATCAAAAAGCTAATGTATCATTTATTAA
>metacluster_2/pattern_5
TTAAATAGAAAGGATCCTCCTGCTTCAGCCTCCTGAGTGCTGGGATTAAAGGCATGTACCAGCATGACCA
>metacluster_2/pattern_6
TCGTAAAGAGTGTATGATTGTTGATTGCTGATTGCTTTCTATTAGTCTTCTTACTTGTTGTTATTGCTGT
>metacluster_2/pattern_7
GACCTGAGGTTGAAAAAGACATAAAAAGATATAATTAATCACTGATTGCTCTTATTATTAAAAAACAGGA
>metacluster_2/pattern_8
TAAAATGGTGATAATAATCATGATGATTCTGATGATTCTTATTATGAATTTGATTTTTGTATTATTTTAA
>metacluster_2/pattern_9
AAAAGACTTCAAGTATGCTTTTGATTGTCCTAATAATCAGCATTATTAGCATGAAATCCTTTAATTTTGA
>metacluster_2/pattern_10
CCGTGTATTCTCAACTCTCTCTATTTTATACTTGATGGCTTTAGCTCCCCTGTAAATGATCGTTTGGACT
>metacluster_2/pattern_11
GAACTCAGGACCTCTGGAAGAGCAATCAGTGCTCTTAACCGCTGAGCCATCTCTCCAGCCCTCAATTTAT
>metacluster_2/pattern_12
AGTAGTTTTTGATCTTTGTTTCTTGATTGCTCCTTGATTGCTTTTCTATTACACTTATATTTTTTTTATA
>metacluster_2/pattern_13
GCTAAGAAATAATAATAATAAGTAAAAAAAAAAATCAATGGAGCCATCAAGGGGCTATGTGTATATTGAT
>metacluster_2/pattern_14
TTTAGGGAAGTCAAAGTAATCCATTCAATTGCAAATTAGCAACAATCAGGCTCAGACTTAGACCTCCAAT
>metacluster_2/pattern_17
AAAAAAAAAGGGGGCTGGAGAGATGGCTCAGCGGTTAAGAGCACTTACTGCTCTTCCAAAGGTCCTGAGT
>metacluster_2/pattern_18
AAAGAACTTTTTTTGCATGTTCTTGTTGATTGCTTGATTGCTTGATTGATTGTATTTCTTTTATGAATTT
>metacluster_2/pattern_20
AGTATCTTCAAGAGAAAAAGTCATTGAGAAAGTCAAAAAGAGCCATCAAGATACAAAGGATCCACCCTAG
>metacluster_2/pattern_21
ATTGAAGATAAGGTGAAACTAATTTTGCATAACAATGAACTCAGTCAGGGTAAATCAAGCCCAGTCTTTA
>metacluster_2/pattern_22
CACCTTTCCCACTCAGGGAGTACCTAGTTTGAATTTGATCGATGATAATTTTCAACTAAATGAAGTAACT
>metacluster_3/pattern_0
GAAAAAAAAATAATTAATTAGAACAATGGAAAATTTTATCCAGAAAATGAAATTCAAAAATGCCTAGTTA
>metacluster_3/pattern_1
TTTTAATTCTTAATATGTTTTTGATGGCTTTTTTTTTATTTTTTTTTAGTTTATATTTTTTTTTTTTGTT
>metacluster_3/pattern_2
TAAAAAGAGTTAAATGCGGGCCATTGTTCCCCCAATTTTACCTTTTTATATTTCCTTATTTATTTATTTT
>metacluster_3/pattern_3
TAAAAAAAATTTTAACTGATTTTAATGCCCCCAATGGGCTCTTAATGGCTCTTTTTAGGAGGAAAATAAA
>metacluster_3/pattern_4
ATTGCATTAAAAAATTTAAGTTTTAAAAAACTTAATTTGCATAACAATGGAGTACAAAAAAATCTTCAAA
>metacluster_3/pattern_6
TTCTTCGGAGAATACAGAAGCAGGTGGCAGAACAATAGAGCCATCTAGGTCGGAAGGCTCCACCCTAGGT
>metacluster_3/pattern_8
GAGGCATGTCTCAAAAAAGAGAACAATCAAGCTGTCTCAATTGTTTTCTGTATCTACACTTATGAGAAAA
>metacluster_4/pattern_0
TAGTAGAGACAGTGAAACTAATTTTGCATAACAAAGGACAGAACTAGAATATCTAATGGAAAGTTTGAAA
>metacluster_4/pattern_1
ATTTCTCTTTAATTAGCTTCATTTGCATATCAATAAAAAATACAATAATGATATTATAAATAAAAATATA
>metacluster_4/pattern_2
TTTTATTAAAGTTTTTTTTTTAATTATTTTATGCAAAATTTGTCCTTTGTTCTGCAAATAGACCAATTTA
>metacluster_4/pattern_3
TGAAATCAATCTCAAAAATTTTACATTTTTCTTTTCTAATGCAAATTAATTTATTCAAAAAATTTTTTTT
>metacluster_4/pattern_6
ATATAAGCTCCCTAAAATCCCATTGATGTGTTGATTGCTCTAATTAGAAATCCAATTTTTTCTAATCGAG
>metacluster_4/pattern_7
TTGTTTTGGGTGTTGCTTGTCTATTGTCTTAATTGATCCCTCTGTATACTAGTCCTGATCTTTTCTAGCT
>metacluster_4/pattern_8
TCTAGGACTAAGAGGGATTTCTCACGTTTACAAGCTTTTGTTGTGCAAACCTTGTTCCTTAATTTAAAAC
>metacluster_4/pattern_9
AAGTATATATAACAAGAAGACAGGTGATGGTTCATTAAAATAACAATACCAAAGTCCAGCCCAGAATTAG
>metacluster_4/pattern_10
CTTACTGATTCAAATATTTCCTTGTTATCCTCGTCATATGCATATGCATAGGCCTACTATGTGTACTATA
>metacluster_4/pattern_11
TAAGAAACACCAAGTTACATTATTTTTATCACAATAGAATTGAAATGATGATCTAATTAGGGGTTGATTG
>metacluster_5/pattern_0
ACGCTCTTGTGGACCTCTACCACCCAAAACCGCCCCATTGTTCTCCCACAGCGCCAGTTCCTTGTCTAAA
>metacluster_5/pattern_1
TCCTCCTGCCTTGAGCTGCCCAAATAGGCGTTGATGGCTCCTTTTCAGTTGCTGTAATTGTCTCTTGAAG
>metacluster_5/pattern_4
CAAGCATGTTGATCCAGACTTTGACCTTGAACTTAATAAGATATTATTTACCGAAAAAAAATGCTATTTA
>metacluster_5/pattern_5
TCTTCCTTTGTAGCCAAGGATGACCTTGAACTCTCGATCTTCCTGCCTCTACTCCTCCATTTCCTGGACA
>metacluster_5/pattern_6
CCCGCGGCCCCCGCGGGGGCGCCCCCGCCGGCCTTTGTCCCCCCCCCGCCGCCGGCCGCGCCCGCCCCCC
>metacluster_5/pattern_10
CAGGTTGGGCCTCAGGCAGGTAGTGGGACATCAAAGGAGAGGGACGGGAGGCGGCACAGCAGATGGCCCT
>metacluster_6/pattern_0
GGCAGGCGACGCGGGCGCGGGCCGGAAGGGAGCCATCAGCGCCGCCCCGCGGCCCACGGGAGCCATCAGC
>metacluster_6/pattern_1
TCGTAATAATTAGGAAGAGGAGGCAGGAGGATCATGAGTTCAAGGTCAGCCTGGGCTACATAGTGAGTTA
>metacluster_6/pattern_2
GCCTCCCGAGTGCTGGGATTAAAGGCATGCGCCGCCACACCCAGCTTTTTTTTTTTTTTTTGTTTCATGG
>metacluster_6/pattern_3
ACGTTGCCCCTAGGCCATTGTAAATTGGTGTTGATGGGTGTGGCCCCGCTTTCTATAGATCTAAGTATTT
>metacluster_6/pattern_4
CTGCTACCTGCTGAGAGGCCCCTGAGCTATTCTGGAGACTACACCCTATCCTGCATGTCGTCACCTGCAG
>metacluster_6/pattern_8
GGGGGAGGGGAGCGGCCGCGGGAGAGGGTGGGGCCTGCGGGCTCCCCCGGCGGCGGGGGGTCGGGGGGCG
>metacluster_6/pattern_9
GGCCCAGCTGGCGGGGAGGCCACCCTCGGCCCCCGGCAAGGGGGCGCTGAGGAGGCGGTGGCGGGGCCGG
>metacluster_6/pattern_12
CCCCCGGGGCGAGGCGCCGCGGGTGCGAGGCCCGGGCGGCCCCTGAGCCGGCCCGCGCGCGCCCCCAGCC
>metacluster_7/pattern_0
AAATAATTTAATATTTACTTTTTGTGTCCTTTGTTATGCAAATTTAGTTACTTAATATTAAACTTTTGTA
>metacluster_7/pattern_1
CTTATAAAGAAATACATATATATATGCATATGCATATAAGTGTACATCCAAATAAATATATTCTCACATG
In [65]:
for p in patterns_te:
    print(f">{p.name}")
    print(p.get_consensus())
>metacluster_0/pattern_3
TTTGGGGAAGTCCAGGTACTCCATTCAATTGCAAATTAGCAACGATCAAGCTCAGACTCTGACCTCCCGG
>metacluster_0/pattern_6
TAGGTAAGTGTTGTGGATGGCCCTGGTGCTGTTTGTATTTTGATGCTAATTCTGCTTTCCCAGGAGGGGC
>metacluster_0/pattern_9
GTCTTGAAGAGAACAGGGGCAGGTGGCAGAACAATAGAGCCATCTAGGTCGGAAGGCTCCACCCTAGGTG
>metacluster_0/pattern_10
GGGAAAACCCACAGACCCATCCTTTGTTTCAGCTGGATTGAGTTGCAAAGCAAGCTCAGCAGGCAGTCAA
>metacluster_0/pattern_13
AGAACCTGGAGGCCAGGTCTGCTTTGATATGTAAAATAGGCACCTCAGCCCTTTGTCCTGGGTTTGAAAC
>metacluster_0/pattern_14
GTCTCAGAGAAGAGCAGCGGTAGTTGACATAACAATAGAGCCATCTAAGTCGGAAGGCTACACCCCAGGT
>metacluster_0/pattern_15
TCTTAAAGAACAAAGCCCAGTCTTTTATTTACATATCAATTCAGTCATTTACTCCAGGTTCCCTTTTGAT
>metacluster_0/pattern_16
CAGTATCTTGAAGAGGAACAGCGGCAGTGGCTGAACAATAGAGTGATCTAGGGAGGAAGGCTCCACCCTA
>metacluster_0/pattern_18
CTTGTTTGGGTGAGACTCGGCTATTGTCCTAAGTAATCACTATGCAGACTAGCCCTGAGCTATTCTAGCT
>metacluster_0/pattern_19
CAAACTTGAGACAGGAGATTCTTGGCATAAGCATTACAATGCCATGTCTGGATTGAAACAAGATATGAGG
>metacluster_0/pattern_20
CAGGTTGGGCCTCAGGCAGGTAGTGGCACATCAAAGGAGAAGCATGGGAGGCGGCACAGCAGATGGCTCG
>metacluster_0/pattern_21
CATTGCAAAAGAATTCAGAGATCTGTCTGCCTTTGTTAAGCACAGATAATAAACTAGTTGGGTGGGACCC
>metacluster_0/pattern_22
GAGTCTTGGAGAAGAGCAGCGGTAGTTGACATAACAATATAGCCATCTAAGTGAGAAGGCTACACCCCAG
>metacluster_0/pattern_23
TTGTTGGCCCGAGCTGACCTTTTCATTCCCCCCTTTAGAAATGGTAATTAGCCTCTCCCTTCCAGAGAGG
>metacluster_0/pattern_24
AGCGGCAGTGGCTGAACAATAGAGTGATCTAGGGAGGAAGGCTCCACCCTAGGTAATCTCCTTAGTGGCA
>metacluster_0/pattern_25
GTCCCCACCCGCAGGTCGGTTCCCACCCTGTGGTCTGAGAAATGCTAATTAGCCTCTCCCTTCCGGAGGG
>metacluster_0/pattern_28
GTGTCTTGGAGAAGAGCAGCGGCAGGTGACAGAACAATAGAGCCATCTAAGTCGGAAGGCTCCACCCCAG
>metacluster_0/pattern_29
ATTCACTAGGCCATTGTGAATTCCCTTGTCTGGGTAAGACTGGCTATTGTCCCAAGTAAACACTCTGCAG
>metacluster_1/pattern_3
ACCCCCACCCACAGTGACACACTTCCTCCAACAAGGCCACACCTACTCCAACAAGGCCACACCTCCTAAT
>metacluster_1/pattern_9
TGCCTCCCAAGTGCTGGGATTAAAGGCGTGGGCCACCACCGCCCGGCTTTTTTTTTTTTTTTAAAGAGTG
>metacluster_1/pattern_14
AGGAGGTGTGGCCTTGTTGGAGTGGGTGTGTCACTGTGGGTGTGGGCTTTAAGACCCTACTCCTAGTTGC
>metacluster_1/pattern_15
AATGTTAACAACTGGGATCAAAAGCAGCCCCACCTAAGGTCAGCTTAATCTTAGAAGCCAGGGGCAAGGG
>metacluster_1/pattern_17
GTTCTAATCACCAATTGATGGGGGAGGGCCCCGCCCACTGTGGGTGGGGCCATCCCTAGTCAGGTGGTCC
>metacluster_2/pattern_15
TAAACCTGCTTCCCACCTTAATTTGTAAAATAAAGGAGAGCTGATGATTGGGCAGATAAAAGGGAAGGTG
>metacluster_2/pattern_16
GTTGTTTGGGTGAGACTGGCTACTGTCCTAAGTAATCACTCTGCAGACTAGCCCTGAGCTATTCTAGCTC
>metacluster_2/pattern_19
AGGTGGGAAGCAGGTTTACAGGAAATCACCTGAGTGCTGACTCATTCCTTGTTCACAGCCACTCACAGGA
>metacluster_3/pattern_5
AGCTAGAAAAGCTCAGGGCTAGTCTGCATAGTAATTACTTAGGACAATAGCCAAGTCACACCCAAACACG
>metacluster_3/pattern_7
GAACAAGCAAGACAATAAACACAAAGCCCCATGATCACTCTGGTGATAGCTGTTTCTAAGGATGATCAAG
>metacluster_3/pattern_9
TTTGTTTGGGTAAGACTGGCTACTGTCCTAAGTAATCACTCTGCAGACTAGCCCTGAGCTATTCTAGCTC
>metacluster_3/pattern_10
AAGTAGCCAGTCTCACCCAAACAAGGGAATACACAATGGCCTAGCGAATAGGTGGGTTTACCATGAAAAA
>metacluster_4/pattern_4
CCAGGAGCTCAAAGTCTAAGCTTGATCGTTGCTAATTTGCAATTGAATGGAGTACCTGGACTTCCCCAAA
>metacluster_4/pattern_5
TTAAAAGGGAACCTGGAGTTAATGACTGAATTGATATGTAAATTAAAAACTGGGCTTTAGTTTTTAAAAA
>metacluster_4/pattern_12
TAGGATATTGTTGTGGATTGCCCTGGTGTTGTTTGTATTTTGATGCTAATTCTGCTTTCCCAGGAGGGGA
>metacluster_5/pattern_2
AGTATCTTGAAGAGGAACAGCGGCAGTGGCAGAACAATAGAGTGATCTAGGGAGGAAGGCTCCACCCTAG
>metacluster_5/pattern_3
CGAAGGAGATTACCTAGGGTGGAGCCTTCCTACCTAGATGGCTCTATTGTTCTGCCACCTGCCCTTTTTC
>metacluster_5/pattern_7
TCACTAGGCCATTGTGAATTCCCTTGTCTGGGTAAGACTGGCTATTGTCCCAAGTAAACACTCTGCAGAC
>metacluster_5/pattern_8
TCGGAAGGCTCCACCCTAGGTAATCTCCTTCTCAGTGGCAGCAAGGTCAAAGTCTGGATCAGCCTGGTTC
>metacluster_5/pattern_9
AGTCTGCATAGTAATTACTTAGGACAATAGCCGAGTCACACCCAAACACAGGAATACACAATGGCCTAGG
>metacluster_5/pattern_11
CAGGGCTAGTCTGCATAGTAATTACTTAGGACAATAGCCGAGTCACACCCAAACACAGGAATACACAATG
>metacluster_6/pattern_5
AGAAGGAGATTACCTAGGGTGGAGCCTTCCGACCTAGATGGCTCTATTGTTCTGTCACCTGCCCCTGCTC
>metacluster_6/pattern_6
GCCACTAAGGAGATTACCTAGGGTGGAGCCTTCCTCCCTAGATCACTCTATTGTTCAGCCACTGCCCCTG
>metacluster_6/pattern_7
ACTCAACCCTTTCAAATAGTGCCACTCCCTGGTGACCAAGCATTCAAATTTTTGAGCCTATGGGGGCCAT
>metacluster_6/pattern_10
CCGGAATAGCTCAGGGGCCTCTCAGCAGGTAGCAGTATCTTGAAGAGGAACAGCGACAGTGGCTGAACAA
>metacluster_6/pattern_11
TCTGCCTCCTAAGTGCTGGGATTAAAGGCGTGCGCCACCACCCCCCGCTGAATTTAGTTTTTTTAAATTT
In [49]:
for i in np.argsort([p.seq_info_content for p in patterns_te]):
    p = patterns_te[i]
    p.plot('seq_ic');
    plt.title(f"{p.name} {p.seq_info_content:.2f}")

Pattern len distribution

In [20]:
ns = pattern_table['n seqlets']
In [21]:
np.log10(ns).plot.hist(30)
plt.xlabel("log10(n seqlets)");
In [22]:
ns.quantile(0.1)
Out[22]:
89.0
In [69]:
print("# long motifs: ", len(patterns_te))
print("# short motifs:", len(patterns_nte))
# long motifs:  44
# short motifs: 78

2. cluster by seq-similarty, freeze groups

In [70]:
# Whole pipeline from this notebook
def cluster_patterns(patterns, n_clusters=9, cluster_track='seq_ic'):
    """Cluster patterns
    """
    sim = similarity_matrix(patterns, track=cluster_track)
    
    # cluster
    lm_nte_seq = linkage(1-sim_nte_seq[iu1], 'ward', optimal_ordering=True)
    cluster = cut_tree(lm_nte_seq, n_clusters=n_clusters)[:,0]
    
    cluster_order = np.argsort(leaves_list(lm_nte_seq))
    pattern_table_nte_seq = create_pattern_table(patterns_nte, cluster_order, cluster, 
                                                 align_track='contrib/mean',
                                                 logo_len=70, 
                                                 seqlogo_kwargs=dict(width=320), 
                                                 footprint_width=320,
                                                 footprint_kwargs=dict())
    return sim, lm_nte_seq, cluster, cluster_order, pattern_table_nte_seq
In [71]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
In [72]:
sim_nte_seq = similarity_matrix(patterns_nte, track='seq_ic')
sim_te_seq = similarity_matrix(patterns_te, track='seq_ic')
100%|██████████| 78/78 [00:24<00:00,  3.17it/s]
100%|██████████| 44/44 [00:08<00:00,  4.86it/s]
In [104]:
%tqdm_restart
In [105]:
sim_all_seq = similarity_matrix(patterns, track='seq_ic')
100%|██████████| 122/122 [01:00<00:00,  2.07it/s]

non-TE

In [125]:
iu1 = np.triu_indices(len(sim_nte_seq), 1)
plt.hist(sim_nte_seq[iu1], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (non-TE)");

TE

In [74]:
iu2 = np.triu_indices(len(sim_te_seq), 1)
plt.hist(sim_te_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (TE)");

All

In [123]:
iu_all = np.triu_indices(len(sim_all_seq), 1)
plt.hist(sim_all_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (all)");
In [75]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

Cluster the motifs

Short motifs

In [126]:
# Seq IC-based clustering
iu_nte = np.triu_indices(len(sim_nte_seq), 1)
lm_nte_seq = linkage(1-sim_nte_seq[iu_nte], 'ward', optimal_ordering=True)
clusters_nte_seq = cut_tree(lm_nte_seq, n_clusters=9)[:,0]
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte_seq),
                            row_linkage=lm_nte_seq,
                            col_linkage=lm_nte_seq,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
                            cmap='Blues'
                           ).fig.suptitle('Seq-ic sim');

Long motifs

In [128]:
iu_te = np.triu_indices(len(sim_te_seq), 1)
lm_te_seq = linkage(1-sim_te_seq[iu_te], 'ward', optimal_ordering=True)
clusters_te_seq = cut_tree(lm_te_seq, n_clusters=9)[:,0]
cm_te = sns.clustermap(sim_te_seq,
                        row_linkage=lm_te_seq,
                        col_linkage=lm_te_seq,
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');

All motifs

In [134]:
iu_all = np.triu_indices(len(sim_all_seq), 1)
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
cm_all = sns.clustermap(sim_all_seq,
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');

Cluster by profile

In [24]:
# sim_nte = similarity_matrix(patterns_nte, track='profile/mean')
In [25]:
# Contrib-score based clustering
# lm_nte_contrib = linkage(1-sim_nte[iu1], 'ward', optimal_ordering=True)
# clusters_nte_contrib = cut_tree(lm_nte_contrib, n_clusters=9)[:,0]
# cm_nte_contrib = sns.clustermap(pd.DataFrame(sim_nte),
#                                 row_linkage=lm_nte_contrib,
#                                 col_linkage=lm_nte_contrib,
#                                 col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
#                                 cmap='Blues',
#                                )

3. Visualize the clusters

In [79]:
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
In [80]:
cluster_order = np.argsort(leaves_list(lm_nte_seq))
cluster = clusters_nte_seq

patterns_nte_clustered = align_clustered_patterns(patterns_nte, cluster_order, cluster, 
                                                  align_track='seq_ic',
                                                  # don't shit the major patterns 
                                                  # by more than 15 when aligning
                                                  max_shift=15)
# add the major motif group
patterns_nte_clustered = [x.add_attr("pattern_group", 'nte') for x in patterns_nte_clustered]
100%|██████████| 78/78 [00:00<00:00, 208.95it/s]
In [81]:
cluster_order = np.argsort(leaves_list(lm_te_seq))
cluster = clusters_te_seq

patterns_te_clustered = align_clustered_patterns(patterns_te, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 # don't shit the major patterns 
                                                 # by more than 15 when aligning                                             
                                                 max_shift=15)
patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]
100%|██████████| 44/44 [00:00<00:00, 221.81it/s]
In [135]:
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq

patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 # don't shit the major patterns 
                                                 # by more than 15 when aligning                                             
                                                 max_shift=15)
# patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]
100%|██████████| 122/122 [00:00<00:00, 215.39it/s]
In [82]:
write_pkl(patterns_nte_clustered + patterns_te_clustered, output_dir / 'patterns.pkl')

Write a simple table

In [92]:
patterns = read_pkl(modisco_dir / 'patterns.pkl')
In [317]:
# split the patterns back into the groups
patterns_nte_clustered = [x for x in patterns if x.attrs['pattern_group'] == 'nte']
patterns_te_clustered = [x for x in patterns if x.attrs['pattern_group'] == 'te']
In [88]:
pattern_table_nte_seq = create_pattern_table(patterns_nte_clustered,
                                             logo_len=50, 
                                             seqlogo_kwargs=dict(width=420),
                                             n_jobs=20,
                                             footprint_width=120,
                                             footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 78/78 [00:03<00:00, 18.10it/s]
In [87]:
pattern_table_te_seq = create_pattern_table(patterns_te_clustered,
                                            logo_len=70, 
                                            seqlogo_kwargs=dict(width=420),
                                            n_jobs=20,
                                            footprint_width=120,
                                            footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 44/44 [00:02<00:00, 20.16it/s]
In [204]:
pattern_table_all = create_pattern_table(patterns,
                                            logo_len=70, 
                                            seqlogo_kwargs=dict(width=420),
                                            n_jobs=20,
                                            footprint_width=120,
                                            footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 122/122 [00:16<00:00,  7.29it/s]
In [140]:
background_motifs = ['Essrb', 'Klf4', 'Nanog','Oct4','Oct4-Sox2', 'Sox2']

first_columns = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/f' for task in tasks] + [t + '/d_p' for t in tasks] # + [m+'/odds' for m in background_motifs]
remaining_columns = [c for c in pattern_table_nte_seq.columns if c not in first_columns]
colorder = first_columns + remaining_columns
(output_dir / 'motif_clustering').mkdir(exist_ok=True)
In [178]:
from basepair.modisco.table import write_modisco_table
In [177]:
remove = [task + '/f' for task in tasks] + ['logo_imp', 'logo_seq']
In [210]:
pattern_table_nte_seq['i'] = pattern_table_nte_seq.index.astype(int)
pattern_table_nte_seq = pattern_table_nte_seq[['i'] + colorder]
write_modisco_table(pattern_table_nte_seq, output_dir / 'motif_clustering', report_url='../results.html', prefix='patterns_short', exclude_when_writing=remove)
In [211]:
pattern_table_te_seq['i'] = pattern_table_te_seq.index.astype(int)
pattern_table_te_seq = pattern_table_te_seq[['i'] + colorder]
write_modisco_table(pattern_table_te_seq, output_dir / 'motif_clustering', report_url='../results.html', prefix='patterns_long', exclude_when_writing=remove)
In [207]:
pattern_table_all['i'] = pattern_table_all.index.astype(int)
pattern_table_all = pattern_table_all[['i'] + colorder]
write_modisco_table(pattern_table_all, output_dir, report_url='../results.html', prefix='patterns_all.datatable', exclude_when_writing=remove)

TODO

  • [ ] integrate this notebook into the pipeline
    • [ ] re-run the pipeline
  • [ ] For the paper, separate out TE's and non-Te's by using the seqlet locations with repbase
    • [ ] do all very long motifs match the repbase?