# Imports
from basepair.imports import *
# Inspect the motif-instance table via PyRanges to quantify how many
# genomic positions are duplicated across instances.
# NOTE(review): assumes dfi_subset has PyRanges-compatible columns
# (Chromosome/Start/End) — confirmed indirectly by bed_cols usage below.
dfi_subset_pr = pr.PyRanges(dfi_subset)
# duplicated number of instances
len(dfi_subset_pr)
# non-duplicated number of instances
len(dfi_subset_pr.drop_duplicate_positions())
# Peek at the raw table (REPL echo)
dfi_subset.head()
# Instance count per motif name, as a plain dict (REPL echo)
dict(dfi_subset.pattern_name.value_counts())
# Bar chart: total number of detected instances per motif.
fig, ax = plt.subplots(figsize=get_figsize(0.23))
motif_counts = dfi_subset.pattern_name.value_counts()
motif_counts.plot.bar(width=.9, ax=ax)
ax.set_ylabel("Motif instances")
ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.pdf')
# Number of regions with a motif (total = 150k. Only 14k lack a motif)
len(dfi_subset.groupby('example_idx').size())
dfi_subset.groupby('example_idx').size().value_counts()
# NOTE(review): `ranges` is assigned further down in this export
# (ranges = isf.get_ranges()); the notebook cells ran out of file order.
total_regions = len(ranges)
fig, ax = plt.subplots(figsize=get_figsize(0.4))
motifs_per_region = dfi_subset.groupby('example_idx').size()
# Regions with no motif instance are absent from the groupby result;
# pad with zeros so they land in the 0-bin of the histogram.
n_without_motif = total_regions - len(motifs_per_region)
motifs_per_region = pd.concat([motifs_per_region, pd.Series(np.zeros(n_without_motif))])
motifs_per_region.plot.hist(bins=np.arange(11), rwidth=0.9, align='left', ax=ax)
ax.set_xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.pdf')
# Peek at the table again (REPL echo)
dfi_subset.head()
# Scratch arithmetic — presumably converting measured figure/axis extents
# into fractional sizes; the constants' provenance is not recorded.
# TODO(review): confirm what 212.63/196.32/19.535/3.505 refer to.
19.535 / (212.63-196.32) * 0.3
3.505 / (212.63-196.32) * 0.3
# Instance count per (motif, region) pair (REPL echo)
dfi_subset.groupby(['pattern_name', 'example_idx']).size().reset_index()
dfi_subset.head()
# One histogram row per task: distribution of motifs-per-region,
# zero-padded with peaks that contain no motif instance.
fig, axes = plt.subplots(len(tasks), 1,
                         figsize=get_figsize(0.23*1.4, aspect=len(tasks)* 0.618/1.4),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
for row, task in enumerate(tasks):
    ax = axes[row]
    task_counts = dfi_subset[dfi_subset.example_interval_from_task == task].groupby('example_idx').size()
    # Peaks of this task without any instance are missing from the groupby
    # result; append that many zeros so they count in the 0-bin.
    n_empty = peaks_per_task[task] - task_counts.index.nunique()
    task_counts = pd.concat([task_counts, pd.Series(np.zeros(n_empty))])
    task_counts.plot.hist(bins=np.arange(10), rwidth=0.9, align='left', ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
    ax.set_xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.per-task.pdf')
# One bar-chart row per task: total motif instance counts, reindexed onto
# a common motif axis so the rows are directly comparable.
fig, axes = plt.subplots(len(tasks), 1,
                         figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for row, task in enumerate(tasks):
    ax = axes[row]
    if row == 0:
        ax.set_title("Total number of motifs")
    per_motif = (dfi_subset[dfi_subset.example_interval_from_task == task]
                 .pattern_name.value_counts()
                 .reindex(pattern_names, fill_value=0))
    per_motif.plot.bar(width=.9, ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
    ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.per-task.pdf')
# Same layout as above, but counts are normalized by the number of peaks
# per task, giving the average number of motifs per peak.
fig, axes = plt.subplots(len(tasks), 1,
                         figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for row, task in enumerate(tasks):
    ax = axes[row]
    if row == 0:
        ax.set_title("Average number of motifs per peak")
    per_motif = (dfi_subset[dfi_subset.example_interval_from_task == task]
                 .pattern_name.value_counts()
                 .reindex(pattern_names, fill_value=0))
    per_motif = per_motif / peaks_per_task[task]
    per_motif.plot.bar(width=.9, ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
    ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.per-task.avg-motifs-per-peak.pdf')
# How many regions contain at least 3 motif instances (REPL echo)
np.sum(dfi_subset.groupby('example_idx').size() >= 3)
# Same, as a fraction of all regions.
# NOTE(review): 150908 is hard-coded — presumably total_regions; verify it
# matches len(ranges) used earlier.
np.sum(dfi_subset.groupby('example_idx').size() >= 3) / 150908
# Regions with at least 5 motif instances
np.sum(dfi_subset.groupby('example_idx').size() >= 5)
# Scratch ratio — 72696 presumably a count from above; TODO confirm.
72696 / 150000
# Per-region instance counts (REPL echo)
dfi_subset.groupby('example_idx').size()
# Sequence information content (IC) across all discovered patterns.
# NOTE(review): mr.get_all_patterns() is called repeatedly below; assumed
# deterministic/pure — confirm before consolidating the calls.
patterns = [p for p in mr.get_all_patterns()]
# IC restricted to well-supported patterns (>100 seqlets)
ic = [p.seq_info_content for p in mr.get_all_patterns() if mr.n_seqlets(p.name) > 100]
n_seqlets = [mr.n_seqlets(p.name) for p in mr.get_all_patterns()]
len(ic)
# Overwrites the filtered list above: final histogram uses ALL patterns.
ic = [p.seq_info_content for p in mr.get_all_patterns()]
fig, ax = plt.subplots(figsize=get_figsize(0.4))
plt.hist(ic, bins=np.arange(0, 110, step=8), rwidth=0.9);
# plt.hist(ic, bins=15, rwidth=0.9, align='left');
plt.xlabel("IC")
plt.ylabel("Motifs")
fig.savefig(fdir / '../seq-IC.hist.all.pdf')
# Spot-check a single pattern's IC (REPL echo)
p = patterns[0]
p.seq_info_content
# Export de-duplicated motif instances as a browser track (BED).
# Lines starting with '!' are IPython shell magics — this file is a
# notebook export and is not plain-Python runnable.
tracks_dir = Path(ddir) / 'www/paper/data/tracks'
!mkdir -p {tracks_dir}
bed_cols = ['Chromosome', 'Start', 'End', 'pattern_name', 'match_weighted_p', 'strand', 'imp_weighted_p', 'seq_match_p']
# Rows identical in (Chromosome, Start, End, pattern_name) are duplicates
duplicates = dfi_subset[bed_cols[:4]].duplicated() # Drop duplicates
# Sort by (Chromosome, Start) and write headerless tab-separated BED
dfi_subset[~duplicates][bed_cols].sort_values(bed_cols[:2]).to_csv(tracks_dir / "motif-instances.bed", sep='\t', header=False, index=False)
!head {tracks_dir}/motif-instances.bed
!wc -l {tracks_dir}/motif-instances.bed
# write out the region
# NOTE(review): `ranges` is consumed much earlier in this export
# (total_regions = len(ranges)) — notebook cells ran out of file order.
ranges = isf.get_ranges()
ranges[['chrom', 'start', 'end', 'interval_from_task']].to_csv(tracks_dir / "regions.bed", sep='\t', header=False, index=False)
!wc -l {tracks_dir}/regions.bed
# IPython 'ls' magics — directory listing of the model directory
ls {model_dir}/
model_dir
ls {model_dir}/log