Goal

  • get the motif spacing information

Questions

  • [x] How often do motifs co-occur in the sequences?
  • [ ] What is their distance distribution?
  • [ ] Are there signs of nearby weak binding sites in the presence of one major motif?
    • [ ] Does modisco actually flag those weak binding sites?

TODO

  • [x] make venn-like plot using upsetr
  • [ ] get the motif spacing information
  • [ ] for a particular combination of motifs, plot the distance distribution

TODO

  • add number of sequences

Conclusions

-

Motif spacing

In [65]:
import pandas as pd
import numpy as np
from basepair.modisco import ModiscoResult
from basepair.config import get_data_dir
In [66]:
from basepair.datasets import sox2_oct4_peaks_sox2
In [67]:
train, valid, test = sox2_oct4_peaks_sox2()
In [68]:
ddir = get_data_dir()
In [84]:
incl.sum()
Out[84]:
780

How often do motifs co-occur in the sequences?

In [108]:
def plot_modisco_results(pattern, split):
    fpath = pattern.format(split=split)
    mr = ModiscoResult(fpath)
    incl = np.load(fpath + f".{split}.npy")
    print(f"# sequences: {incl.sum()}")
    mr.stats()
    mr.plot_profiles(valid[0][incl], {"Sox2": eval(split)[1]['sox2'][incl],
                                      "Oct4": eval(split)[1]['oct4'][incl]},
                    #rc_vec=[False, True],
                    #start_vec = [15, 15],
                    #width=40,
                    legend=False,
                    #ylim=[0, 7.45],
                    #seq_height=1.5,
                     n_bootstrap=100,
                    fpath_template=None,
                    figsize=(6,2.5))
    mr.plot_seqlet_upset()
    mr.close()
In [98]:
mdir = f"{ddir}/processed/chipnexus/motifs"

Sox2 - signal

In [99]:
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.0.{{split}}.h5",
                    "valid")
# sequences: 780
# seqlets assigned to patterns: 738 / 1396 (53%)
# Dataset size: 
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Oct4 - signal

In [100]:
plot_modisco_results(f"{mdir}/oct4/modisco/multi-task.1.{{split}}.h5",
                    "valid")
# sequences: 402
# seqlets assigned to patterns: 370 / 1383 (27%)
# Dataset size: 
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Sox2 - counts

In [109]:
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.2.{{split}}.h5",
                    "valid")
# sequences: 1883
# seqlets assigned to patterns: 1458 / 5479 (27%)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [106]:
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.2.{{split}}.h5",
                    "valid")
# sequences: 1883
# seqlets assigned to patterns: 1458 / 5479 (27%)
# Dataset size: 
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Oct4 - counts

In [107]:
plot_modisco_results(f"{mdir}/oct4/modisco/multi-task.3.{{split}}.h5",
                    "valid")
# sequences: 1884
# seqlets assigned to patterns: 1268 / 5453 (23%)
# Dataset size: 
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [42]:
mr.f.ls()
Out[42]:
[('/metacluster_idx_to_submetacluster_results/metacluster0/activity_pattern',
  <HDF5 dataset "activity_pattern": shape (1,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets',
  <HDF5 dataset "seqlets": shape (1247,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/affmat',
  <HDF5 dataset "affmat": shape (884, 884), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/cluster_results/cluster_indices',
  <HDF5 dataset "cluster_indices": shape (884,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/all_pattern_names',
  <HDF5 dataset "all_pattern_names": shape (2,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/seqlets_and_alnmts/alnmts',
  <HDF5 dataset "alnmts": shape (443,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/seqlets_and_alnmts/seqlets',
  <HDF5 dataset "seqlets": shape (443,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/sequence/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/sequence/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_contrib_scores/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_contrib_scores/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_hypothetical_contribs/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_0/task0_hypothetical_contribs/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/seqlets_and_alnmts/alnmts',
  <HDF5 dataset "alnmts": shape (295,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/seqlets_and_alnmts/seqlets',
  <HDF5 dataset "seqlets": shape (295,), type "|O">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/sequence/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/sequence/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_contrib_scores/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_contrib_scores/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_hypothetical_contribs/fwd',
  <HDF5 dataset "fwd": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster0/seqlets_to_patterns_result/patterns/pattern_1/task0_hypothetical_contribs/rev',
  <HDF5 dataset "rev": shape (70, 4), type "<f8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster1/activity_pattern',
  <HDF5 dataset "activity_pattern": shape (1,), type "<i8">),
 ('/metacluster_idx_to_submetacluster_results/metacluster1/seqlets',
  <HDF5 dataset "seqlets": shape (149,), type "|O">),
 ('/metaclustering_results/all_metacluster_names',
  <HDF5 dataset "all_metacluster_names": shape (2,), type "|O">),
 ('/metaclustering_results/metacluster_indices',
  <HDF5 dataset "metacluster_indices": shape (1396,), type "<i8">),
 ('/multitask_seqlet_creation_results/final_seqlets',
  <HDF5 dataset "final_seqlets": shape (1396,), type "|O">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/coords',
  <HDF5 dataset "coords": shape (1396,), type "|O">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/thresholding_results/densities',
  <HDF5 dataset "densities": shape (100,), type "<f8">),
 ('/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/vals_to_threshold',
  <HDF5 dataset "vals_to_threshold": shape (4825,), type "<f8">),
 ('/task_names', <HDF5 dataset "task_names": shape (1,), type "|O">)]
In [79]:
mr.f.f['/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/vals_to_threshold'][:4]
Out[79]:
array([12.83460903, 18.52757454, 17.23535347, 13.16116619])
In [83]:
valid[0].shape
Out[83]:
(1884, 201, 4)
In [50]:
d.shape
Out[50]:
(1396,)
In [38]:
sl = mr.seqlets()
In [15]:
## TODO - add no patterns to the plot
In [136]:
bdir = f"{ddir}/processed/chipnexus/motifs/sox2-oct4/modisco/bed"
In [137]:
mkdir -p {bdir}
In [138]:
mr.export_seqlets_bed(f"{bdir}/seqlet", True)
In [141]:
!head {bdir}/seqlet.pattern_0.bed 
140	16	86	pattern_0	0	0
311	17	87	pattern_0	0	0
273	29	99	pattern_0	0	0
378	50	120	pattern_0	0	0
695	65	135	pattern_0	0	+
543	30	100	pattern_0	0	0
332	96	166	pattern_0	0	0
306	72	142	pattern_0	0	0
238	83	153	pattern_0	0	0
408	34	104	pattern_0	0	0
In [25]:
import pandas as pd
In [68]:
instances = []
for p, seqlets in sl.items():
    for seqlet in seqlets:
        seqlet["id"] = p
        instances.append(seqlet)
In [85]:
dfi = pd.DataFrame(instances)
dfi['n'] = dfi.groupby('example').example.transform('size')
dfi.groupby('example').size().plot.hist()
Out[85]:
<matplotlib.axes._subplots.AxesSubplot at 0x7eff47d8b0b8>
In [80]:
examples = {}
for p, seqlets in sl.items():
    for seqlet in seqlets:
        seqlet["id"] = p
        if seqlet['example'] in examples:
            examples[seqlet['example']].append(seqlet)
        else:
            examples[seqlet['example']] = [seqlet]
In [83]:
examples2 = {e: l for e,l in examples.items() if len(l) == 2}
In [89]:
examples20 = {e: sorted([x['id'] for x in l]) for e,l in examples.items() if len(l) == 2}
In [92]:
c = {}
for v in examples20.values():
    if tuple(v) not in c:
        c[tuple(v)] = 1
    else:
        c[tuple(v)] +=1
In [93]:
c
Out[93]:
{('pattern_0', 'pattern_1'): 43,
 ('pattern_0', 'pattern_0'): 21,
 ('pattern_1', 'pattern_1'): 9}
In [96]:
ids = [k for k,v in examples20.items() if v == ['pattern_0', 'pattern_1']]
In [99]:
len(ids)
Out[99]:
43
In [109]:
examples2_01 = [(l[0]['end'] + l[0]['start'])/2 - (l[1]['end'] + l[1]['start'])/2 for e,l in examples2.items() if e in ids]
In [112]:
import matplotlib.pyplot as plt
In [124]:
plt.hist(examples2_01, bins=30);
In [122]:
plt.hist((dfi.end + dfi.start)/2, 30);
plt.xlabel("Seqlet position distribution")
Out[122]:
Text(0.5,0,'Seqlet position distribution')
In [86]:
len(examples2)
Out[86]:
73
In [87]:
examples2
Out[87]:
{756: [{'example': 756,
   'start': 41,
   'end': 111,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 756, 'start': 103, 'end': 173, 'rc': True, 'id': 'pattern_1'}],
 485: [{'example': 485,
   'start': 63,
   'end': 133,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 485, 'start': 18, 'end': 88, 'rc': True, 'id': 'pattern_1'}],
 653: [{'example': 653,
   'start': 35,
   'end': 105,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 653, 'start': 77, 'end': 147, 'rc': True, 'id': 'pattern_1'}],
 98: [{'example': 98, 'start': 47, 'end': 117, 'rc': False, 'id': 'pattern_0'},
  {'example': 98, 'start': 16, 'end': 86, 'rc': True, 'id': 'pattern_0'}],
 623: [{'example': 623,
   'start': 61,
   'end': 131,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 623, 'start': 30, 'end': 100, 'rc': True, 'id': 'pattern_0'}],
 276: [{'example': 276,
   'start': 73,
   'end': 143,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 276, 'start': 18, 'end': 88, 'rc': True, 'id': 'pattern_1'}],
 558: [{'example': 558,
   'start': 69,
   'end': 139,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 558, 'start': 102, 'end': 172, 'rc': True, 'id': 'pattern_0'}],
 375: [{'example': 375,
   'start': 20,
   'end': 90,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 375, 'start': 108, 'end': 178, 'rc': True, 'id': 'pattern_1'}],
 494: [{'example': 494,
   'start': 80,
   'end': 150,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 494, 'start': 25, 'end': 95, 'rc': True, 'id': 'pattern_1'}],
 719: [{'example': 719,
   'start': 54,
   'end': 124,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 719, 'start': 97, 'end': 167, 'rc': False, 'id': 'pattern_0'}],
 175: [{'example': 175, 'start': 3, 'end': 73, 'rc': False, 'id': 'pattern_0'},
  {'example': 175, 'start': 58, 'end': 128, 'rc': False, 'id': 'pattern_1'}],
 522: [{'example': 522,
   'start': 43,
   'end': 113,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 522, 'start': 89, 'end': 159, 'rc': True, 'id': 'pattern_1'}],
 462: [{'example': 462,
   'start': 54,
   'end': 124,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 462, 'start': 98, 'end': 168, 'rc': True, 'id': 'pattern_0'}],
 576: [{'example': 576,
   'start': 66,
   'end': 136,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 576, 'start': 104, 'end': 174, 'rc': True, 'id': 'pattern_1'}],
 663: [{'example': 663,
   'start': 22,
   'end': 92,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 663, 'start': 67, 'end': 137, 'rc': False, 'id': 'pattern_0'}],
 487: [{'example': 487,
   'start': 85,
   'end': 155,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 487, 'start': 53, 'end': 123, 'rc': True, 'id': 'pattern_1'}],
 407: [{'example': 407,
   'start': 79,
   'end': 149,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 407, 'start': 52, 'end': 122, 'rc': True, 'id': 'pattern_0'}],
 747: [{'example': 747, 'start': 18, 'end': 88, 'rc': True, 'id': 'pattern_0'},
  {'example': 747, 'start': 47, 'end': 117, 'rc': True, 'id': 'pattern_0'}],
 732: [{'example': 732,
   'start': 94,
   'end': 164,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 732, 'start': 65, 'end': 135, 'rc': True, 'id': 'pattern_0'}],
 536: [{'example': 536,
   'start': 62,
   'end': 132,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 536, 'start': 31, 'end': 101, 'rc': True, 'id': 'pattern_0'}],
 155: [{'example': 155,
   'start': 23,
   'end': 93,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 155, 'start': 67, 'end': 137, 'rc': True, 'id': 'pattern_1'}],
 99: [{'example': 99, 'start': 76, 'end': 146, 'rc': True, 'id': 'pattern_0'},
  {'example': 99, 'start': 106, 'end': 176, 'rc': True, 'id': 'pattern_0'}],
 624: [{'example': 624,
   'start': 53,
   'end': 123,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 624, 'start': 87, 'end': 157, 'rc': True, 'id': 'pattern_0'}],
 541: [{'example': 541,
   'start': 60,
   'end': 130,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 541, 'start': 94, 'end': 164, 'rc': True, 'id': 'pattern_0'}],
 601: [{'example': 601,
   'start': 97,
   'end': 167,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 601, 'start': 54, 'end': 124, 'rc': True, 'id': 'pattern_0'}],
 552: [{'example': 552,
   'start': 86,
   'end': 156,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 552, 'start': 36, 'end': 106, 'rc': True, 'id': 'pattern_0'}],
 766: [{'example': 766,
   'start': 52,
   'end': 122,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 766, 'start': 93, 'end': 163, 'rc': False, 'id': 'pattern_0'}],
 659: [{'example': 659,
   'start': 101,
   'end': 171,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 659, 'start': 52, 'end': 122, 'rc': False, 'id': 'pattern_1'}],
 625: [{'example': 625,
   'start': 69,
   'end': 139,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 625, 'start': 107, 'end': 177, 'rc': False, 'id': 'pattern_1'}],
 376: [{'example': 376,
   'start': 100,
   'end': 170,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 376, 'start': 69, 'end': 139, 'rc': False, 'id': 'pattern_0'}],
 769: [{'example': 769,
   'start': 89,
   'end': 159,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 769, 'start': 13, 'end': 83, 'rc': False, 'id': 'pattern_1'}],
 122: [{'example': 122,
   'start': 44,
   'end': 114,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 122, 'start': 79, 'end': 149, 'rc': False, 'id': 'pattern_1'}],
 512: [{'example': 512, 'start': 12, 'end': 82, 'rc': True, 'id': 'pattern_0'},
  {'example': 512, 'start': 42, 'end': 112, 'rc': False, 'id': 'pattern_1'}],
 772: [{'example': 772,
   'start': 97,
   'end': 167,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 772, 'start': 53, 'end': 123, 'rc': False, 'id': 'pattern_1'}],
 190: [{'example': 190,
   'start': 116,
   'end': 186,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 190, 'start': 69, 'end': 139, 'rc': False, 'id': 'pattern_0'}],
 546: [{'example': 546,
   'start': 106,
   'end': 176,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 546, 'start': 66, 'end': 136, 'rc': False, 'id': 'pattern_1'}],
 60: [{'example': 60, 'start': 87, 'end': 157, 'rc': True, 'id': 'pattern_0'},
  {'example': 60, 'start': 45, 'end': 115, 'rc': True, 'id': 'pattern_0'}],
 763: [{'example': 763,
   'start': 41,
   'end': 111,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 763, 'start': 91, 'end': 161, 'rc': True, 'id': 'pattern_0'}],
 221: [{'example': 221,
   'start': 92,
   'end': 162,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 221, 'start': 93, 'end': 163, 'rc': False, 'id': 'pattern_1'}],
 317: [{'example': 317,
   'start': 88,
   'end': 158,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 317, 'start': 24, 'end': 94, 'rc': True, 'id': 'pattern_1'}],
 61: [{'example': 61, 'start': 52, 'end': 122, 'rc': False, 'id': 'pattern_0'},
  {'example': 61, 'start': 100, 'end': 170, 'rc': True, 'id': 'pattern_1'}],
 291: [{'example': 291, 'start': 10, 'end': 80, 'rc': True, 'id': 'pattern_0'},
  {'example': 291, 'start': 74, 'end': 144, 'rc': True, 'id': 'pattern_1'}],
 39: [{'example': 39, 'start': 100, 'end': 170, 'rc': True, 'id': 'pattern_0'},
  {'example': 39, 'start': 67, 'end': 137, 'rc': True, 'id': 'pattern_1'}],
 142: [{'example': 142,
   'start': 59,
   'end': 129,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 142, 'start': 113, 'end': 183, 'rc': True, 'id': 'pattern_1'}],
 737: [{'example': 737,
   'start': 80,
   'end': 150,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 737, 'start': 44, 'end': 114, 'rc': True, 'id': 'pattern_1'}],
 713: [{'example': 713, 'start': 25, 'end': 95, 'rc': True, 'id': 'pattern_0'},
  {'example': 713, 'start': 75, 'end': 145, 'rc': True, 'id': 'pattern_1'}],
 572: [{'example': 572,
   'start': 123,
   'end': 193,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 572, 'start': 55, 'end': 125, 'rc': True, 'id': 'pattern_1'}],
 185: [{'example': 185,
   'start': 127,
   'end': 197,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 185, 'start': 83, 'end': 153, 'rc': False, 'id': 'pattern_1'}],
 587: [{'example': 587,
   'start': 74,
   'end': 144,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 587, 'start': 39, 'end': 109, 'rc': False, 'id': 'pattern_1'}],
 497: [{'example': 497,
   'start': 119,
   'end': 189,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 497, 'start': 35, 'end': 105, 'rc': False, 'id': 'pattern_1'}],
 643: [{'example': 643,
   'start': 70,
   'end': 140,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 643, 'start': 100, 'end': 170, 'rc': True, 'id': 'pattern_1'}],
 179: [{'example': 179,
   'start': 91,
   'end': 161,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 179, 'start': 45, 'end': 115, 'rc': False, 'id': 'pattern_1'}],
 496: [{'example': 496,
   'start': 62,
   'end': 132,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 496, 'start': 20, 'end': 90, 'rc': False, 'id': 'pattern_1'}],
 440: [{'example': 440,
   'start': 103,
   'end': 173,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 440, 'start': 42, 'end': 112, 'rc': False, 'id': 'pattern_1'}],
 710: [{'example': 710,
   'start': 52,
   'end': 122,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 710, 'start': 82, 'end': 152, 'rc': True, 'id': 'pattern_1'}],
 759: [{'example': 759,
   'start': 79,
   'end': 149,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 759, 'start': 35, 'end': 105, 'rc': True, 'id': 'pattern_1'}],
 518: [{'example': 518,
   'start': 58,
   'end': 128,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 518, 'start': 106, 'end': 176, 'rc': True, 'id': 'pattern_1'}],
 51: [{'example': 51,
   'start': 121,
   'end': 191,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 51, 'start': 73, 'end': 143, 'rc': False, 'id': 'pattern_1'}],
 330: [{'example': 330,
   'start': 69,
   'end': 139,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 330, 'start': 16, 'end': 86, 'rc': False, 'id': 'pattern_0'}],
 506: [{'example': 506,
   'start': 111,
   'end': 181,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 506, 'start': 58, 'end': 128, 'rc': False, 'id': 'pattern_1'}],
 577: [{'example': 577,
   'start': 59,
   'end': 129,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 577, 'start': 107, 'end': 177, 'rc': True, 'id': 'pattern_1'}],
 47: [{'example': 47, 'start': 62, 'end': 132, 'rc': True, 'id': 'pattern_0'},
  {'example': 47, 'start': 8, 'end': 78, 'rc': False, 'id': 'pattern_1'}],
 187: [{'example': 187,
   'start': 98,
   'end': 168,
   'rc': True,
   'id': 'pattern_0'},
  {'example': 187, 'start': 68, 'end': 138, 'rc': True, 'id': 'pattern_1'}],
 614: [{'example': 614,
   'start': 40,
   'end': 110,
   'rc': False,
   'id': 'pattern_0'},
  {'example': 614, 'start': 76, 'end': 146, 'rc': False, 'id': 'pattern_1'}],
 556: [{'example': 556,
   'start': 89,
   'end': 159,
   'rc': False,
   'id': 'pattern_1'},
  {'example': 556, 'start': 26, 'end': 96, 'rc': False, 'id': 'pattern_1'}],
 585: [{'example': 585,
   'start': 88,
   'end': 158,
   'rc': True,
   'id': 'pattern_1'},
  {'example': 585, 'start': 48, 'end': 118, 'rc': False, 'id': 'pattern_1'}],
 613: [{'example': 613,
   'start': 103,
   'end': 173,
   'rc': False,
   'id': 'pattern_1'},
  {'example': 613, 'start': 32, 'end': 102, 'rc': False, 'id': 'pattern_1'}],
 561: [{'example': 561, 'start': 15, 'end': 85, 'rc': True, 'id': 'pattern_1'},
  {'example': 561, 'start': 56, 'end': 126, 'rc': True, 'id': 'pattern_1'}],
 557: [{'example': 557,
   'start': 53,
   'end': 123,
   'rc': False,
   'id': 'pattern_1'},
  {'example': 557, 'start': 111, 'end': 181, 'rc': True, 'id': 'pattern_1'}],
 699: [{'example': 699,
   'start': 31,
   'end': 101,
   'rc': False,
   'id': 'pattern_1'},
  {'example': 699, 'start': 115, 'end': 185, 'rc': True, 'id': 'pattern_1'}],
 739: [{'example': 739,
   'start': 54,
   'end': 124,
   'rc': True,
   'id': 'pattern_1'},
  {'example': 739, 'start': 95, 'end': 165, 'rc': True, 'id': 'pattern_1'}],
 704: [{'example': 704, 'start': 9, 'end': 79, 'rc': True, 'id': 'pattern_1'},
  {'example': 704, 'start': 50, 'end': 120, 'rc': True, 'id': 'pattern_1'}],
 200: [{'example': 200,
   'start': 46,
   'end': 116,
   'rc': True,
   'id': 'pattern_1'},
  {'example': 200, 'start': 94, 'end': 164, 'rc': False, 'id': 'pattern_1'}]}
In [70]:
dfi.head()
Out[70]:
end example id rc start
0 86 140 pattern_0 False 16
1 87 311 pattern_0 False 17
2 99 273 pattern_0 False 29
3 120 378 pattern_0 False 50
4 135 695 pattern_0 True 65
In [ ]:
pd.pivot()
In [ ]:
df[dfi.n==2].pivot("example", "id", "n")
In [79]:
dfi[dfi.n==2]
Out[79]:
end example id rc start n
16 111 756 pattern_0 False 41 2
25 133 485 pattern_0 False 63 2
40 105 653 pattern_0 False 35 2
42 117 98 pattern_0 False 47 2
46 131 623 pattern_0 False 61 2
50 143 276 pattern_0 False 73 2
52 139 558 pattern_0 False 69 2
66 90 375 pattern_0 False 20 2
72 150 494 pattern_0 False 80 2
81 124 719 pattern_0 False 54 2
82 73 175 pattern_0 False 3 2
83 113 522 pattern_0 False 43 2
88 124 462 pattern_0 False 54 2
93 136 576 pattern_0 False 66 2
94 92 663 pattern_0 False 22 2
100 155 487 pattern_0 False 85 2
105 149 407 pattern_0 False 79 2
106 88 747 pattern_0 True 18 2
107 86 98 pattern_0 True 16 2
108 100 623 pattern_0 True 30 2
109 164 732 pattern_0 False 94 2
110 132 536 pattern_0 False 62 2
111 93 155 pattern_0 False 23 2
112 146 99 pattern_0 True 76 2
113 123 624 pattern_0 True 53 2
114 130 541 pattern_0 True 60 2
117 167 601 pattern_0 False 97 2
118 167 719 pattern_0 False 97 2
119 156 552 pattern_0 True 86 2
132 164 541 pattern_0 True 94 2
... ... ... ... ... ... ...
658 164 200 pattern_1 False 94 2
659 114 737 pattern_1 True 44 2
660 159 522 pattern_1 True 89 2
661 123 487 pattern_1 True 53 2
663 177 577 pattern_1 True 107 2
664 90 496 pattern_1 False 20 2
665 88 485 pattern_1 True 18 2
666 105 759 pattern_1 True 35 2
667 170 61 pattern_1 True 100 2
668 88 276 pattern_1 True 18 2
670 125 572 pattern_1 True 55 2
671 165 739 pattern_1 True 95 2
672 78 47 pattern_1 False 8 2
673 120 704 pattern_1 True 50 2
674 94 317 pattern_1 True 24 2
675 95 494 pattern_1 True 25 2
676 183 142 pattern_1 True 113 2
678 178 375 pattern_1 True 108 2
679 176 518 pattern_1 True 106 2
680 144 291 pattern_1 True 74 2
687 181 557 pattern_1 True 111 2
707 152 710 pattern_1 True 82 2
708 170 643 pattern_1 True 100 2
712 174 576 pattern_1 True 104 2
713 147 653 pattern_1 True 77 2
714 177 625 pattern_1 False 107 2
716 115 179 pattern_1 False 45 2
718 128 175 pattern_1 False 58 2
726 146 614 pattern_1 False 76 2
737 118 585 pattern_1 False 48 2

146 rows × 6 columns

In [76]:
dfi[dfi.n==2].groupby('example')['id'].value_counts()
Out[76]:
example  id       
39       pattern_0    1
         pattern_1    1
47       pattern_0    1
         pattern_1    1
51       pattern_0    1
         pattern_1    1
60       pattern_0    2
61       pattern_0    1
         pattern_1    1
98       pattern_0    2
99       pattern_0    2
122      pattern_0    1
         pattern_1    1
142      pattern_0    1
         pattern_1    1
155      pattern_0    1
         pattern_1    1
175      pattern_0    1
         pattern_1    1
179      pattern_0    1
         pattern_1    1
185      pattern_0    1
         pattern_1    1
187      pattern_0    1
         pattern_1    1
190      pattern_0    2
200      pattern_1    2
221      pattern_0    1
         pattern_1    1
276      pattern_0    1
                     ..
625      pattern_1    1
643      pattern_0    1
         pattern_1    1
653      pattern_0    1
         pattern_1    1
659      pattern_0    1
         pattern_1    1
663      pattern_0    2
699      pattern_1    2
704      pattern_1    2
710      pattern_0    1
         pattern_1    1
713      pattern_0    1
         pattern_1    1
719      pattern_0    2
732      pattern_0    2
737      pattern_0    1
         pattern_1    1
739      pattern_1    2
747      pattern_0    2
756      pattern_0    1
         pattern_1    1
759      pattern_0    1
         pattern_1    1
763      pattern_0    2
766      pattern_0    2
769      pattern_0    1
         pattern_1    1
772      pattern_0    1
         pattern_1    1
Name: id, Length: 116, dtype: int64
In [49]:
n_examples.index[n_examples==2]
Out[49]:
Int64Index([ 39,  47,  51,  60,  61,  98,  99, 122, 142, 155, 175, 179, 185,
            187, 190, 200, 221, 276, 291, 317, 330, 375, 376, 407, 440, 462,
            485, 487, 494, 496, 497, 506, 512, 518, 522, 536, 541, 546, 552,
            556, 557, 558, 561, 572, 576, 577, 585, 587, 601, 613, 614, 623,
            624, 625, 643, 653, 659, 663, 699, 704, 710, 713, 719, 732, 737,
            739, 747, 756, 759, 763, 766, 769, 772],
           dtype='int64', name='example')
In [52]:
dfi.set_index("example", inplace=True)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2524             try:
-> 2525                 return self._engine.get_loc(key)
   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'example'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-52-a44ddf197554> in <module>()
----> 1 dfi.set_index("example", inplace=True)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
   3144                 names.append(None)
   3145             else:
-> 3146                 level = frame[col]._values
   3147                 names.append(col)
   3148                 if drop:

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2140 
   2141     def _getitem_column(self, key):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2147 
   2148         # duplicate columns & possible reduce dimensionality

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3841 
   3842             if not isna(item):
-> 3843                 loc = self.items.get_loc(item)
   3844             else:
   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2525                 return self._engine.get_loc(key)
   2526             except KeyError:
-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2528 
   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'example'
In [54]:
n_examples[n_examples==2]
Out[54]:
example
39     2
47     2
51     2
60     2
61     2
98     2
99     2
122    2
142    2
155    2
175    2
179    2
185    2
187    2
190    2
200    2
221    2
276    2
291    2
317    2
330    2
375    2
376    2
407    2
440    2
462    2
485    2
487    2
494    2
496    2
      ..
572    2
576    2
577    2
585    2
587    2
601    2
613    2
614    2
623    2
624    2
625    2
643    2
653    2
659    2
663    2
699    2
704    2
710    2
713    2
719    2
732    2
737    2
739    2
747    2
756    2
759    2
763    2
766    2
769    2
772    2
Length: 73, dtype: int64
In [51]:
dfi.loc[n_examples.index[n_examples==2]]
Out[51]:
end id rc start
example
39 170 pattern_0 True 100
39 137 NaN True 67
47 132 pattern_0 True 62
47 78 NaN False 8
51 191 pattern_0 False 121
51 143 NaN False 73
60 157 pattern_0 True 87
60 115 NaN True 45
61 122 pattern_0 False 52
61 170 NaN True 100
98 117 pattern_0 False 47
98 86 NaN True 16
99 146 pattern_0 True 76
99 176 NaN True 106
122 114 pattern_0 True 44
122 149 NaN False 79
142 129 pattern_0 True 59
142 183 NaN True 113
155 93 pattern_0 False 23
155 137 NaN True 67
175 73 pattern_0 False 3
175 128 NaN False 58
179 161 pattern_0 True 91
179 115 NaN False 45
185 197 pattern_0 True 127
185 153 NaN False 83
187 168 pattern_0 True 98
187 138 NaN True 68
190 186 pattern_0 True 116
190 139 NaN False 69
... ... ... ... ...
699 101 pattern_1 False 31
699 185 NaN True 115
704 79 pattern_1 True 9
704 120 NaN True 50
710 122 pattern_0 False 52
710 152 NaN True 82
713 95 pattern_0 True 25
713 145 NaN True 75
719 124 pattern_0 False 54
719 167 NaN False 97
732 164 pattern_0 False 94
732 135 NaN True 65
737 150 pattern_0 True 80
737 114 NaN True 44
739 124 pattern_1 True 54
739 165 NaN True 95
747 88 pattern_0 True 18
747 117 NaN True 47
756 111 pattern_0 False 41
756 173 NaN True 103
759 149 pattern_0 False 79
759 105 NaN True 35
763 111 pattern_0 False 41
763 161 NaN True 91
766 122 pattern_0 True 52
766 163 NaN False 93
769 159 pattern_0 True 89
769 83 NaN False 13
772 167 pattern_0 True 97
772 123 NaN False 53

146 rows × 4 columns