Goal¶

get the motif spacing information

Questions¶

- [x] How often do motifs co-occur in the sequences?
- [ ] What is their distance distribution?
- [ ] Are there signs of nearby weak binding sites in the presence of one major motif?
  - [ ] Does modisco actually flag those weak binding sites?

TODO¶

- [x] Make a Venn-like plot using UpSetR
- [ ] Get the motif spacing information
- [ ] For a particular combination of motifs, plot the distance distribution

TODO¶

add number of sequences

Conclusions¶

-

Motif spacing¶

import pandas as pd
import numpy as np
from basepair.modisco import ModiscoResult
from basepair.config import get_data_dir

from basepair.datasets import sox2_oct4_peaks_sox2

# Load the three dataset splits. Further down each split is indexed as
# split[0] (passed to plot_profiles as the sequences; valid[0] has shape
# (1884, 201, 4)) and split[1]['sox2'] / split[1]['oct4'] (per-factor tracks).
train, valid, test = sox2_oct4_peaks_sox2()

# Root data directory; every motif/modisco path below is built from it.
ddir = get_data_dir()

incl.sum()

780

How often do motifs co-occur in the sequences?¶

def plot_modisco_results(pattern, split):
    """Print summary statistics and draw the plots for one modisco run.

    Args:
        pattern: Path template of the modisco HDF5 result file containing a
            ``{split}`` placeholder, e.g. ``".../multi-task.0.{split}.h5"``.
        split: Name of the module-level dataset variable to plot against
            (e.g. ``"train"``, ``"valid"`` or ``"test"``). The sequences
            (``data[0]``) and the Sox2/Oct4 tracks (``data[1]``) are both
            taken from this split.

    Raises:
        ValueError: If no module-level variable named ``split`` exists.
    """
    # Resolve the split by name with a plain lookup instead of ``eval`` so
    # that an arbitrary string can never be executed as code.
    try:
        data = globals()[split]
    except KeyError:
        raise ValueError(f"Unknown data split: {split!r}") from None
    seqs, tracks = data[0], data[1]

    fpath = pattern.format(split=split)
    mr = ModiscoResult(fpath)
    try:
        # Selection of examples stored next to the result file; presumably a
        # boolean mask over the split (its sum is reported as the number of
        # sequences) -- TODO confirm.
        incl = np.load(fpath + f".{split}.npy")
        print(f"# sequences: {incl.sum()}")
        mr.stats()
        # Sequences and tracks now come from the same split. Previously the
        # sequences were always taken from ``valid`` regardless of ``split``.
        # Tuning kwargs tried earlier and left disabled: rc_vec, start_vec,
        # width, ylim, seq_height.
        mr.plot_profiles(seqs[incl],
                         {"Sox2": tracks['sox2'][incl],
                          "Oct4": tracks['oct4'][incl]},
                         legend=False,
                         n_bootstrap=100,
                         fpath_template=None,
                         figsize=(6, 2.5))
        mr.plot_seqlet_upset()
    finally:
        # Always release the result handle, even if a plotting call fails.
        mr.close()

# Directory holding the per-factor modisco results (sox2/..., oct4/...) used by the calls below.
mdir = f"{ddir}/processed/chipnexus/motifs"

Sox2 - signal¶

# Sox2 signal: modisco result file multi-task.0, plotted against the validation split.
# The doubled braces keep "{split}" literal in the f-string so that
# plot_modisco_results can substitute the split name itself.
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.0.{{split}}.h5",
                    "valid")

# sequences: 780
# seqlets assigned to patterns: 738 / 1396 (53%)
# Dataset size:

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Oct4 - signal¶

# Oct4 signal: modisco result file multi-task.1, plotted against the validation split.
# The doubled braces keep "{split}" literal in the f-string so that
# plot_modisco_results can substitute the split name itself.
plot_modisco_results(f"{mdir}/oct4/modisco/multi-task.1.{{split}}.h5",
                    "valid")

# sequences: 402
# seqlets assigned to patterns: 370 / 1383 (27%)
# Dataset size:

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Sox2 - counts¶

# Sox2 counts: modisco result file multi-task.2, plotted against the validation split.
# The doubled braces keep "{split}" literal in the f-string so that
# plot_modisco_results can substitute the split name itself.
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.2.{{split}}.h5",
                    "valid")

# sequences: 1883
# seqlets assigned to patterns: 1458 / 5479 (27%)

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

# NOTE(review): same arguments as the Sox2 counts call directly above
# (multi-task.2, validation split); this looks like a re-run of that cell.
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.2.{{split}}.h5",
                    "valid")

# sequences: 1883
# seqlets assigned to patterns: 1458 / 5479 (27%)
# Dataset size:

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Oct4 - counts¶

# Oct4 counts: modisco result file multi-task.3, plotted against the validation split.
# The doubled braces keep "{split}" literal in the f-string so that
# plot_modisco_results can substitute the split name itself.
plot_modisco_results(f"{mdir}/oct4/modisco/multi-task.3.{{split}}.h5",
                    "valid")

# sequences: 1884
# seqlets assigned to patterns: 1268 / 5453 (23%)
# Dataset size:

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

mr.f.ls()

[('/metacluster_idx_to_submetacluster_results/metacluster0/activity_pattern',
  <HDF5 dataset "activity_pattern": shape (1,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets',
  <HDF5 dataset "seqlets": shape (1247,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/affmat',
  <HDF5 dataset "affmat": shape (884, 884), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/cluster_results/cluster_indices',
  <HDF5 dataset "cluster_indices": shape (884,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/all_pattern_names',
  <HDF5 dataset "all_pattern_names": shape (2,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/seqlets_and_alnmts/alnmts',
  <HDF5 dataset "alnmts": shape (443,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/seqlets_and_alnmts/seqlets',
  <HDF5 dataset "seqlets": shape (443,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/sequence/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/sequence/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_contrib_scores/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_contrib_scores/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_hypothetical_contribs/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_hypothetical_contribs/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/seqlets_and_alnmts/alnmts',
  <HDF5 dataset "alnmts": shape (295,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/seqlets_and_alnmts/seqlets',
  <HDF5 dataset "seqlets": shape (295,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/sequence/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/sequence/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_contrib_scores/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_contrib_scores/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_hypothetical_contribs/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_hypothetical_contribs/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster1/activity_pattern',
  <HDF5 dataset "activity_pattern": shape (1,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster1/seqlets',
  <HDF5 dataset "seqlets": shape (149,), type "|O">),
 ('/metaclustering_results/all_metacluster_names',
  <HDF5 dataset "all_metacluster_names": shape (2,), type "|O">),
 ('/metaclustering_results/metacluster_indices',
  <HDF5 dataset "metacluster_indices": shape (1396,), type "<i8">),
 ('/multitask_seqlet_creation_results/final_seqlets',
  <HDF5 dataset "final_seqlets": shape (1396,), type "|O">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/coords',
  <HDF5 dataset "coords": shape (1396,), type "|O">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/thresholding_results/densities',
  <HDF5 dataset "densities": shape (100,), type "<f8">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/vals_to_threshold',
  <HDF5 dataset "vals_to_threshold": shape (4825,), type "<f8">),
 ('/task_names', <HDF5 dataset "task_names": shape (1,), type "|O">)]

mr.f.f['/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/vals_to_threshold'][:4]

array([12.83460903, 18.52757454, 17.23535347, 13.16116619])

valid[0].shape

(1884, 201, 4)

d.shape

(1396,)

# Seqlets grouped by pattern name; the cells below iterate this as
# {pattern_name: [seqlet dict, ...]} where each dict carries
# 'example', 'start', 'end' and 'rc'.
# NOTE(review): in the visible code `mr` is only assigned inside
# plot_modisco_results, so a module-level ModiscoResult must have been
# created in a cell that is not shown here.
sl = mr.seqlets()

## TODO - add no patterns to the plot

# Output directory for the per-pattern seqlet BED files.
bdir = f"{ddir}/processed/chipnexus/motifs/sox2-oct4/modisco/bed"

# Create the directory from Python rather than through the IPython `mkdir`
# automagic: this is valid Python, needs no shell, and is a no-op when the
# directory already exists (same effect as `mkdir -p`).
import os
os.makedirs(bdir, exist_ok=True)

# Writes one BED file per pattern using this prefix (e.g. seqlet.pattern_0.bed).
mr.export_seqlets_bed(f"{bdir}/seqlet", True)

!head {bdir}/seqlet.pattern_0.bed

140	16	86	pattern_0	0	0
311	17	87	pattern_0	0	0
273	29	99	pattern_0	0	0
378	50	120	pattern_0	0	0
695	65	135	pattern_0	0	+
543	30	100	pattern_0	0	0
332	96	166	pattern_0	0	0
306	72	142	pattern_0	0	0
238	83	153	pattern_0	0	0
408	34	104	pattern_0	0	0

import pandas as pd

# Flatten the {pattern: [seqlet, ...]} mapping into one list of seqlet dicts,
# tagging every seqlet (in place) with the name of the pattern it belongs to.
instances = []
for pattern_name, pattern_seqlets in sl.items():
    for hit in pattern_seqlets:
        hit["id"] = pattern_name
        instances.append(hit)

# One row per seqlet; 'n' is the number of seqlets found in the same example.
dfi = pd.DataFrame(instances)
seqlets_per_example = dfi.groupby('example').size()
dfi['n'] = dfi['example'].map(seqlets_per_example)
# Histogram of how many seqlets each example contains.
seqlets_per_example.plot.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7eff47d8b0b8>

# Group the seqlets by the example they were found in:
# {example index: [seqlet dict, ...]}, each seqlet tagged with its pattern name.
examples = {}
for pattern_name, pattern_seqlets in sl.items():
    for hit in pattern_seqlets:
        hit["id"] = pattern_name
        examples.setdefault(hit['example'], []).append(hit)

# Restrict to examples with exactly two seqlets:
#   examples2  -> the two seqlet dicts themselves
#   examples20 -> the alphabetically sorted pair of their pattern names
examples2 = {}
examples20 = {}
for example_idx, hits in examples.items():
    if len(hits) != 2:
        continue
    examples2[example_idx] = hits
    examples20[example_idx] = sorted(hit['id'] for hit in hits)

# Tally how often each (pattern, pattern) combination occurs among the
# two-seqlet examples; keys are tuples so they can be used in a dict.
c = {}
for pattern_pair in map(tuple, examples20.values()):
    c[pattern_pair] = c.get(pattern_pair, 0) + 1

c

{('pattern_0', 'pattern_1'): 43,
 ('pattern_0', 'pattern_0'): 21,
 ('pattern_1', 'pattern_1'): 9}

# Example indices whose two seqlets are one pattern_0 and one pattern_1.
ids = [
    example_idx
    for example_idx, pattern_pair in examples20.items()
    if pattern_pair == ['pattern_0', 'pattern_1']
]

len(ids)

43

# Signed center-to-center offset between the two seqlets of every
# pattern_0/pattern_1 example: center of the first stored seqlet minus center
# of the second (in the examples2 dump the pattern_0 seqlet comes first).
examples2_01 = []
for example_idx, (first, second) in examples2.items():
    if example_idx not in ids:
        continue
    first_center = (first['end'] + first['start']) / 2
    second_center = (second['end'] + second['start']) / 2
    examples2_01.append(first_center - second_center)

import matplotlib.pyplot as plt

# Distribution of the signed center-to-center offsets in examples2_01
# (pattern_0/pattern_1 examples only). The trailing semicolon suppresses the
# notebook echo of the return value.
plt.hist(examples2_01, bins=30);

# Distribution of seqlet centers, (start + end) / 2, over all seqlets in dfi.
plt.hist((dfi.end + dfi.start)/2, 30);
plt.xlabel("Seqlet position distribution")

Text(0.5,0,'Seqlet position distribution')

len(examples2)

73

examples2

{756: [{'example': 756,
   'start': 41,
   'end': 111,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 756, 'start': 103, 'end': 173, 'rc': True, 'id': 'pattern_1'}],
 485: [{'example': 485,
   'start': 63,
   'end': 133,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 485, 'start': 18, 'end': 88, 'rc': True, 'id': 'pattern_1'}],
 653: [{'example': 653,
   'start': 35,
   'end': 105,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 653, 'start': 77, 'end': 147, 'rc': True, 'id': 'pattern_1'}],
 98: [{'example': 98, 'start': 47, 'end': 117, 'rc': False, 'id': 'pattern_0'},
  {'example': 98, 'start': 16, 'end': 86, 'rc': True, 'id': 'pattern_0'}],
 623: [{'example': 623,
   'start': 61,
   'end': 131,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 623, 'start': 30, 'end': 100, 'rc': True, 'id': 'pattern_0'}],
 276: [{'example': 276,
   'start': 73,
   'end': 143,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 276, 'start': 18, 'end': 88, 'rc': True, 'id': 'pattern_1'}],
 558: [{'example': 558,
   'start': 69,
   'end': 139,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 558, 'start': 102, 'end': 172, 'rc': True, 'id': 'pattern_0'}],
 375: [{'example': 375,
   'start': 20,
   'end': 90,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 375, 'start': 108, 'end': 178, 'rc': True, 'id': 'pattern_1'}],
 494: [{'example': 494,
   'start': 80,
   'end': 150,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 494, 'start': 25, 'end': 95, 'rc': True, 'id': 'pattern_1'}],
 719: [{'example': 719,
   'start': 54,
   'end': 124,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 719, 'start': 97, 'end': 167, 'rc': False, 'id': 'pattern_0'}],
 175: [{'example': 175, 'start': 3, 'end': 73, 'rc': False, 'id': 'pattern_0'},
  {'example': 175, 'start': 58, 'end': 128, 'rc': False, 'id': 'pattern_1'}],
 522: [{'example': 522,
   'start': 43,
   'end': 113,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 522, 'start': 89, 'end': 159, 'rc': True, 'id': 'pattern_1'}],
 462: [{'example': 462,
   'start': 54,
   'end': 124,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 462, 'start': 98, 'end': 168, 'rc': True, 'id': 'pattern_0'}],
 576: [{'example': 576,
   'start': 66,
   'end': 136,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 576, 'start': 104, 'end': 174, 'rc': True, 'id': 'pattern_1'}],
 663: [{'example': 663,
   'start': 22,
   'end': 92,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 663, 'start': 67, 'end': 137, 'rc': False, 'id': 'pattern_0'}],
 487: [{'example': 487,
   'start': 85,
   'end': 155,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 487, 'start': 53, 'end': 123, 'rc': True, 'id': 'pattern_1'}],
 407: [{'example': 407,
   'start': 79,
   'end': 149,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 407, 'start': 52, 'end': 122, 'rc': True, 'id': 'pattern_0'}],
 747: [{'example': 747, 'start': 18, 'end': 88, 'rc': True, 'id': 'pattern_0'},
  {'example': 747, 'start': 47, 'end': 117, 'rc': True, 'id': 'pattern_0'}],
 732: [{'example': 732,
   'start': 94,
   'end': 164,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 732, 'start': 65, 'end': 135, 'rc': True, 'id': 'pattern_0'}],
 536: [{'example': 536,
   'start': 62,
   'end': 132,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 536, 'start': 31, 'end': 101, 'rc': True, 'id': 'pattern_0'}],
 155: [{'example': 155,
   'start': 23,
   'end': 93,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 155, 'start': 67, 'end': 137, 'rc': True, 'id': 'pattern_1'}],
 99: [{'example': 99, 'start': 76, 'end': 146, 'rc': True, 'id': 'pattern_0'},
  {'example': 99, 'start': 106, 'end': 176, 'rc': True, 'id': 'pattern_0'}],
 624: [{'example': 624,
   'start': 53,
   'end': 123,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 624, 'start': 87, 'end': 157, 'rc': True, 'id': 'pattern_0'}],
 541: [{'example': 541,
   'start': 60,
   'end': 130,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 541, 'start': 94, 'end': 164, 'rc': True, 'id': 'pattern_0'}],
 601: [{'example': 601,
   'start': 97,
   'end': 167,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 601, 'start': 54, 'end': 124, 'rc': True, 'id': 'pattern_0'}],
 552: [{'example': 552,
   'start': 86,
   'end': 156,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 552, 'start': 36, 'end': 106, 'rc': True, 'id': 'pattern_0'}],
 766: [{'example': 766,
   'start': 52,
   'end': 122,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 766, 'start': 93, 'end': 163, 'rc': False, 'id': 'pattern_0'}],
 659: [{'example': 659,
   'start': 101,
   'end': 171,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 659, 'start': 52, 'end': 122, 'rc': False, 'id': 'pattern_1'}],
 625: [{'example': 625,
   'start': 69,
   'end': 139,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 625, 'start': 107, 'end': 177, 'rc': False, 'id': 'pattern_1'}],
 376: [{'example': 376,
   'start': 100,
   'end': 170,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 376, 'start': 69, 'end': 139, 'rc': False, 'id': 'pattern_0'}],
 769: [{'example': 769,
   'start': 89,
   'end': 159,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 769, 'start': 13, 'end': 83, 'rc': False, 'id': 'pattern_1'}],
 122: [{'example': 122,
   'start': 44,
   'end': 114,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 122, 'start': 79, 'end': 149, 'rc': False, 'id': 'pattern_1'}],
 512: [{'example': 512, 'start': 12, 'end': 82, 'rc': True, 'id': 'pattern_0'},
  {'example': 512, 'start': 42, 'end': 112, 'rc': False, 'id': 'pattern_1'}],
 772: [{'example': 772,
   'start': 97,
   'end': 167,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 772, 'start': 53, 'end': 123, 'rc': False, 'id': 'pattern_1'}],
 190: [{'example': 190,
   'start': 116,
   'end': 186,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 190, 'start': 69, 'end': 139, 'rc': False, 'id': 'pattern_0'}],
 546: [{'example': 546,
   'start': 106,
   'end': 176,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 546, 'start': 66, 'end': 136, 'rc': False, 'id': 'pattern_1'}],
 60: [{'example': 60, 'start': 87, 'end': 157, 'rc': True, 'id': 'pattern_0'},
  {'example': 60, 'start': 45, 'end': 115, 'rc': True, 'id': 'pattern_0'}],
 763: [{'example': 763,
   'start': 41,
   'end': 111,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 763, 'start': 91, 'end': 161, 'rc': True, 'id': 'pattern_0'}],
 221: [{'example': 221,
   'start': 92,
   'end': 162,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 221, 'start': 93, 'end': 163, 'rc': False, 'id': 'pattern_1'}],
 317: [{'example': 317,
   'start': 88,
   'end': 158,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 317, 'start': 24, 'end': 94, 'rc': True, 'id': 'pattern_1'}],
 61: [{'example': 61, 'start': 52, 'end': 122, 'rc': False, 'id': 'pattern_0'},
  {'example': 61, 'start': 100, 'end': 170, 'rc': True, 'id': 'pattern_1'}],
 291: [{'example': 291, 'start': 10, 'end': 80, 'rc': True, 'id': 'pattern_0'},
  {'example': 291, 'start': 74, 'end': 144, 'rc': True, 'id': 'pattern_1'}],
 39: [{'example': 39, 'start': 100, 'end': 170, 'rc': True, 'id': 'pattern_0'},
  {'example': 39, 'start': 67, 'end': 137, 'rc': True, 'id': 'pattern_1'}],
 142: [{'example': 142,
   'start': 59,
   'end': 129,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 142, 'start': 113, 'end': 183, 'rc': True, 'id': 'pattern_1'}],
 737: [{'example': 737,
   'start': 80,
   'end': 150,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 737, 'start': 44, 'end': 114, 'rc': True, 'id': 'pattern_1'}],
 713: [{'example': 713, 'start': 25, 'end': 95, 'rc': True, 'id': 'pattern_0'},
  {'example': 713, 'start': 75, 'end': 145, 'rc': True, 'id': 'pattern_1'}],
 572: [{'example': 572,
   'start': 123,
   'end': 193,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 572, 'start': 55, 'end': 125, 'rc': True, 'id': 'pattern_1'}],
 185: [{'example': 185,
   'start': 127,
   'end': 197,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 185, 'start': 83, 'end': 153, 'rc': False, 'id': 'pattern_1'}],
 587: [{'example': 587,
   'start': 74,
   'end': 144,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 587, 'start': 39, 'end': 109, 'rc': False, 'id': 'pattern_1'}],
 497: [{'example': 497,
   'start': 119,
   'end': 189,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 497, 'start': 35, 'end': 105, 'rc': False, 'id': 'pattern_1'}],
 643: [{'example': 643,
   'start': 70,
   'end': 140,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 643, 'start': 100, 'end': 170, 'rc': True, 'id': 'pattern_1'}],
 179: [{'example': 179,
   'start': 91,
   'end': 161,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 179, 'start': 45, 'end': 115, 'rc': False, 'id': 'pattern_1'}],
 496: [{'example': 496,
   'start': 62,
   'end': 132,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 496, 'start': 20, 'end': 90, 'rc': False, 'id': 'pattern_1'}],
 440: [{'example': 440,
   'start': 103,
   'end': 173,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 440, 'start': 42, 'end': 112, 'rc': False, 'id': 'pattern_1'}],
 710: [{'example': 710,
   'start': 52,
   'end': 122,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 710, 'start': 82, 'end': 152, 'rc': True, 'id': 'pattern_1'}],
 759: [{'example': 759,
   'start': 79,
   'end': 149,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 759, 'start': 35, 'end': 105, 'rc': True, 'id': 'pattern_1'}],
 518: [{'example': 518,
   'start': 58,
   'end': 128,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 518, 'start': 106, 'end': 176, 'rc': True, 'id': 'pattern_1'}],
 51: [{'example': 51,
   'start': 121,
   'end': 191,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 51, 'start': 73, 'end': 143, 'rc': False, 'id': 'pattern_1'}],
 330: [{'example': 330,
   'start': 69,
   'end': 139,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 330, 'start': 16, 'end': 86, 'rc': False, 'id': 'pattern_0'}],
 506: [{'example': 506,
   'start': 111,
   'end': 181,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 506, 'start': 58, 'end': 128, 'rc': False, 'id': 'pattern_1'}],
 577: [{'example': 577,
   'start': 59,
   'end': 129,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 577, 'start': 107, 'end': 177, 'rc': True, 'id': 'pattern_1'}],
 47: [{'example': 47, 'start': 62, 'end': 132, 'rc': True, 'id': 'pattern_0'},
  {'example': 47, 'start': 8, 'end': 78, 'rc': False, 'id': 'pattern_1'}],
 187: [{'example': 187,
   'start': 98,
   'end': 168,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 187, 'start': 68, 'end': 138, 'rc': True, 'id': 'pattern_1'}],
 614: [{'example': 614,
   'start': 40,
   'end': 110,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 614, 'start': 76, 'end': 146, 'rc': False, 'id': 'pattern_1'}],
 556: [{'example': 556,
   'start': 89,
   'end': 159,
   'rc': False,
   'id': 'pattern_1'},
  {'example': 556, 'start': 26, 'end': 96, 'rc': False, 'id': 'pattern_1'}],
 585: [{'example': 585,
   'start': 88,
   'end': 158,
   'rc': True,
   'id': 'pattern_1'},
  {'example': 585, 'start': 48, 'end': 118, 'rc': False, 'id': 'pattern_1'}],
 613: [{'example': 613,
   'start': 103,
   'end': 173,
   'rc': False,
   'id': 'pattern_1'},
  {'example': 613, 'start': 32, 'end': 102, 'rc': False, 'id': 'pattern_1'}],
 561: [{'example': 561, 'start': 15, 'end': 85, 'rc': True, 'id': 'pattern_1'},
  {'example': 561, 'start': 56, 'end': 126, 'rc': True, 'id': 'pattern_1'}],
 557: [{'example': 557,
   'start': 53,
   'end': 123,
   'rc': False,
   'id': 'pattern_1'},
  {'example': 557, 'start': 111, 'end': 181, 'rc': True, 'id': 'pattern_1'}],
 699: [{'example': 699,
   'start': 31,
   'end': 101,
   'rc': False,
   'id': 'pattern_1'},
  {'example': 699, 'start': 115, 'end': 185, 'rc': True, 'id': 'pattern_1'}],
 739: [{'example': 739,
   'start': 54,
   'end': 124,
   'rc': True,
   'id': 'pattern_1'},
  {'example': 739, 'start': 95, 'end': 165, 'rc': True, 'id': 'pattern_1'}],
 704: [{'example': 704, 'start': 9, 'end': 79, 'rc': True, 'id': 'pattern_1'},
  {'example': 704, 'start': 50, 'end': 120, 'rc': True, 'id': 'pattern_1'}],
 200: [{'example': 200,
   'start': 46,
   'end': 116,
   'rc': True,
   'id': 'pattern_1'},
  {'example': 200, 'start': 94, 'end': 164, 'rc': False, 'id': 'pattern_1'}]}

dfi.head()

# Per-example pattern counts for the examples that contain exactly two seqlets
# (rows: example, columns: pattern id, values: number of seqlets).
# Replaces the scratch calls `pd.pivot()` (raises TypeError without arguments)
# and `df[...].pivot(...)` (`df` is undefined here, and `pivot` cannot handle
# an example whose two seqlets share the same pattern). A cross-tabulation
# counts duplicates instead of failing on them.
two_hits = dfi[dfi.n == 2]
pd.crosstab(two_hits["example"], two_hits["id"])

dfi[dfi.n==2]

dfi[dfi.n==2].groupby('example')['id'].value_counts()

example  id       
39       pattern_0    1
         pattern_1    1
47       pattern_0    1
         pattern_1    1
51       pattern_0    1
         pattern_1    1
60       pattern_0    2
61       pattern_0    1
         pattern_1    1
98       pattern_0    2
99       pattern_0    2
122      pattern_0    1
         pattern_1    1
142      pattern_0    1
         pattern_1    1
155      pattern_0    1
         pattern_1    1
175      pattern_0    1
         pattern_1    1
179      pattern_0    1
         pattern_1    1
185      pattern_0    1
         pattern_1    1
187      pattern_0    1
         pattern_1    1
190      pattern_0    2
200      pattern_1    2
221      pattern_0    1
         pattern_1    1
276      pattern_0    1
                     ..
625      pattern_1    1
643      pattern_0    1
         pattern_1    1
653      pattern_0    1
         pattern_1    1
659      pattern_0    1
         pattern_1    1
663      pattern_0    2
699      pattern_1    2
704      pattern_1    2
710      pattern_0    1
         pattern_1    1
713      pattern_0    1
         pattern_1    1
719      pattern_0    2
732      pattern_0    2
737      pattern_0    1
         pattern_1    1
739      pattern_1    2
747      pattern_0    2
756      pattern_0    1
         pattern_1    1
759      pattern_0    1
         pattern_1    1
763      pattern_0    2
766      pattern_0    2
769      pattern_0    1
         pattern_1    1
772      pattern_0    1
         pattern_1    1
Name: id, Length: 116, dtype: int64

n_examples.index[n_examples==2]

Int64Index([ 39,  47,  51,  60,  61,  98,  99, 122, 142, 155, 175, 179, 185,
            187, 190, 200, 221, 276, 291, 317, 330, 375, 376, 407, 440, 462,
            485, 487, 494, 496, 497, 506, 512, 518, 522, 536, 541, 546, 552,
            556, 557, 558, 561, 572, 576, 577, 585, 587, 601, 613, 614, 623,
            624, 625, 643, 653, 659, 663, 699, 704, 710, 713, 719, 732, 737,
            739, 747, 756, 759, 763, 766, 769, 772],
           dtype='int64', name='example')

# Index dfi by example. set_index removes the "example" column, so running
# this cell a second time used to fail with KeyError: 'example'; only convert
# while the column is still present so the cell can be re-run safely.
if "example" in dfi.columns:
    dfi.set_index("example", inplace=True)

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2524             try:
-> 2525                 return self._engine.get_loc(key)
   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'example'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-52-a44ddf197554> in <module>()
----> 1 dfi.set_index("example", inplace=True)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
   3144                 names.append(None)
   3145             else:
-> 3146                 level = frame[col]._values
   3147                 names.append(col)
   3148                 if drop:

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2140 
   2141     def _getitem_column(self, key):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2147 
   2148         # duplicate columns & possible reduce dimensionality

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3841 
   3842             if not isna(item):
-> 3843                 loc = self.items.get_loc(item)
   3844             else:
   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2525                 return self._engine.get_loc(key)
   2526             except KeyError:
-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2528 
   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'example'

n_examples[n_examples==2]

example
39     2
47     2
51     2
60     2
61     2
98     2
99     2
122    2
142    2
155    2
175    2
179    2
185    2
187    2
190    2
200    2
221    2
276    2
291    2
317    2
330    2
375    2
376    2
407    2
440    2
462    2
485    2
487    2
494    2
496    2
      ..
572    2
576    2
577    2
585    2
587    2
601    2
613    2
614    2
623    2
624    2
625    2
643    2
653    2
659    2
663    2
699    2
704    2
710    2
713    2
719    2
732    2
737    2
739    2
747    2
756    2
759    2
763    2
766    2
769    2
772    2
Length: 73, dtype: int64

dfi.loc[n_examples.index[n_examples==2]]

	end	example	id	rc	start
0	86	140	pattern_0	False	16
1	87	311	pattern_0	False	17
2	99	273	pattern_0	False	29
3	120	378	pattern_0	False	50
4	135	695	pattern_0	True	65

	end	id	rc	start
example
39	170	pattern_0	True	100
39	137	NaN	True	67
47	132	pattern_0	True	62
47	78	NaN	False	8
51	191	pattern_0	False	121
51	143	NaN	False	73
60	157	pattern_0	True	87
60	115	NaN	True	45
61	122	pattern_0	False	52
61	170	NaN	True	100
98	117	pattern_0	False	47
98	86	NaN	True	16
99	146	pattern_0	True	76
99	176	NaN	True	106
122	114	pattern_0	True	44
122	149	NaN	False	79
142	129	pattern_0	True	59
142	183	NaN	True	113
155	93	pattern_0	False	23
155	137	NaN	True	67
175	73	pattern_0	False	3
175	128	NaN	False	58
179	161	pattern_0	True	91
179	115	NaN	False	45
185	197	pattern_0	True	127
185	153	NaN	False	83
187	168	pattern_0	True	98
187	138	NaN	True	68
190	186	pattern_0	True	116
190	139	NaN	False	69
...	...	...	...	...
699	101	pattern_1	False	31
699	185	NaN	True	115
704	79	pattern_1	True	9
704	120	NaN	True	50
710	122	pattern_0	False	52
710	152	NaN	True	82
713	95	pattern_0	True	25
713	145	NaN	True	75
719	124	pattern_0	False	54
719	167	NaN	False	97
732	164	pattern_0	False	94
732	135	NaN	True	65
737	150	pattern_0	True	80
737	114	NaN	True	44
739	124	pattern_1	True	54
739	165	NaN	True	95
747	88	pattern_0	True	18
747	117	NaN	True	47
756	111	pattern_0	False	41
756	173	NaN	True	103
759	149	pattern_0	False	79
759	105	NaN	True	35
763	111	pattern_0	False	41
763	161	NaN	True	91
766	122	pattern_0	True	52
766	163	NaN	False	93
769	159	pattern_0	True	89
769	83	NaN	False	13
772	167	pattern_0	True	97
772	123	NaN	False	53