-
import pandas as pd
import numpy as np
from basepair.modisco import ModiscoResult
from basepair.config import get_data_dir
from basepair.datasets import sox2_oct4_peaks_sox2
train, valid, test = sox2_oct4_peaks_sox2()
ddir = get_data_dir()
incl.sum()
def plot_modisco_results(pattern, split):
fpath = pattern.format(split=split)
mr = ModiscoResult(fpath)
incl = np.load(fpath + f".{split}.npy")
print(f"# sequences: {incl.sum()}")
mr.stats()
mr.plot_profiles(valid[0][incl], {"Sox2": eval(split)[1]['sox2'][incl],
"Oct4": eval(split)[1]['oct4'][incl]},
#rc_vec=[False, True],
#start_vec = [15, 15],
#width=40,
legend=False,
#ylim=[0, 7.45],
#seq_height=1.5,
n_bootstrap=100,
fpath_template=None,
figsize=(6,2.5))
mr.plot_seqlet_upset()
mr.close()
mdir = f"{ddir}/processed/chipnexus/motifs"
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.0.{{split}}.h5",
"valid")
plot_modisco_results(f"{mdir}/oct4/modisco/multi-task.1.{{split}}.h5",
"valid")
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.2.{{split}}.h5",
"valid")
plot_modisco_results(f"{mdir}/sox2/modisco/multi-task.2.{{split}}.h5",
"valid")
plot_modisco_results(f"{mdir}/oct4/modisco/multi-task.3.{{split}}.h5",
"valid")
mr.f.ls()
mr.f.f['/multitask_seqlet_creation_results/task_name_to_coord_producer_results/task0/vals_to_threshold'][:4]
valid[0].shape
d.shape
sl = mr.seqlets()
## TODO - add no patterns to the plot
bdir = f"{ddir}/processed/chipnexus/motifs/sox2-oct4/modisco/bed"
mkdir -p {bdir}
mr.export_seqlets_bed(f"{bdir}/seqlet", True)
!head {bdir}/seqlet.pattern_0.bed
import pandas as pd
instances = []
for p, seqlets in sl.items():
for seqlet in seqlets:
seqlet["id"] = p
instances.append(seqlet)
dfi = pd.DataFrame(instances)
dfi['n'] = dfi.groupby('example').example.transform('size')
dfi.groupby('example').size().plot.hist()
examples = {}
for p, seqlets in sl.items():
for seqlet in seqlets:
seqlet["id"] = p
if seqlet['example'] in examples:
examples[seqlet['example']].append(seqlet)
else:
examples[seqlet['example']] = [seqlet]
examples2 = {e: l for e,l in examples.items() if len(l) == 2}
examples20 = {e: sorted([x['id'] for x in l]) for e,l in examples.items() if len(l) == 2}
c = {}
for v in examples20.values():
if tuple(v) not in c:
c[tuple(v)] = 1
else:
c[tuple(v)] +=1
c
ids = [k for k,v in examples20.items() if v == ['pattern_0', 'pattern_1']]
len(ids)
examples2_01 = [(l[0]['end'] + l[0]['start'])/2 - (l[1]['end'] + l[1]['start'])/2 for e,l in examples2.items() if e in ids]
import matplotlib.pyplot as plt
plt.hist(examples2_01, bins=30);
plt.hist((dfi.end + dfi.start)/2, 30);
plt.xlabel("Seqlet position distribution")
len(examples2)
examples2
dfi.head()
pd.pivot()
df[dfi.n==2].pivot("example", "id", "n")
dfi[dfi.n==2]
dfi[dfi.n==2].groupby('example')['id'].value_counts()
n_examples.index[n_examples==2]
dfi.set_index("example", inplace=True)
n_examples[n_examples==2]
dfi.loc[n_examples.index[n_examples==2]]