import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
import util
import motif.read_motifs as read_motifs
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from feature.util import one_hot_to_seq
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import matplotlib.patches as patches
import tempfile
import tqdm
tqdm.tqdm_notebook(range(1))

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`

<tqdm.notebook.tqdm_notebook at 0x7fa2ab6cf890>


# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold",
    "svg.fonttype": "none"
}
plt.rcParams.update(plot_params)

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.


out_path = "/users/amtseng/tfmodisco/figures/ZNF248_pwm_vs_cwm"
os.makedirs(out_path, exist_ok=True)


tfm_motifs_path = os.path.join(
    "/users/amtseng/tfmodisco/results/reports/tfmodisco_results/cache/",
    "ZNF248",
    "ZNF248_ChIPexo",
    "ZNF248_ChIPexo_count",
    "all_motifs.h5"
)

homer25_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/homer_25"
homer50_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/homer_50"
dichipmunk25_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/dichipmunk_25"
dichipmunk50_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/dichipmunk_50"
meme_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/memechip/meme_out"

b1h_motif_path = "/users/amtseng/tfmodisco/data/processed/B1H/ZNF248_B1Hcode.txt"

moods_hits_dir = "/users/amtseng/tfmodisco/results/moods/ZNF248/"
reference_fasta_path = "/users/amtseng/genomes/hg38.fasta"
peaks_bed_path = "/users/amtseng/tfmodisco/data/processed/ZNF248/labels/ChIPexo/ZNF248_Exo_HEK239T_all_peakints.bed.gz"

true_profiles_hdf5_path = "/users/amtseng/tfmodisco/data/processed/ZNF248/labels/ChIPexo/ZNF248_profiles.h5"
pred_profiles_hdf5_path = "/users/amtseng/tfmodisco/results/peak_predictions/ZNF248/ZNF248_ChIPexo_pred.h5"
imp_scores_hdf5_path = "/users/amtseng/tfmodisco/results/importance_scores/ZNF248/ZNF248_ChIPexo_shap_scores.h5"
hyp_score_key = "count_hyp_scores"

repeats_bed_path = "/users/amtseng/tfmodisco/data/processed/ZNF248/ZNF248_ChIPexo_peak_repeats.bed.gz"


# Import and show TF-MoDISco motifs
with h5py.File(tfm_motifs_path, "r") as f:
    tfm_pfms, tfm_cwms = {}, {}
    tfm_motif_keys = []
    for key in f.keys():
        tfm_pfms[key] = f[key]["pfm_trimmed"][:]
        tfm_cwms[key] = f[key]["cwm_trimmed"][:]
        if not np.all(tfm_pfms[key] == util.purine_rich_motif(tfm_pfms[key])):
            tfm_pfms[key] = util.purine_rich_motif(tfm_pfms[key])
            tfm_cwms[key] = util.purine_rich_motif(tfm_cwms[key])
        tfm_motif_keys.append(key)
        
for key in tfm_motif_keys:
    viz_sequence.plot_weights(read_motifs.pfm_to_pwm(tfm_pfms[key]))


# Import and show HOMER 25 motifs
homer25_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_homer_pfms(homer25_motifs_path)[0]]

for pfm in homer25_pfms:
    viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))


# Import and show HOMER 50 motifs
homer50_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_homer_pfms(homer50_motifs_path)[0]]

for pfm in homer50_pfms:
    viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))


# Import and show MEME motifs
meme_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_meme_pfms(meme_motifs_path)[0]]

for pfm in meme_pfms:
    viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))


# Import and show DiChIPMunk 25 motifs
dichipmunk25_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_dichipmunk_pfms(dichipmunk25_motifs_path)[0]]

for pfm in dichipmunk25_pfms:
    viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))


# Import and show DiChIPMunk 50 motifs
dichipmunk50_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_dichipmunk_pfms(dichipmunk50_motifs_path)[0]]

for pfm in dichipmunk50_pfms:
    viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))


# Import B1H recognition code
with open(b1h_motif_path, "r") as f:
    line = next(f)
    while not line.startswith("Pos"):
        line = next(f)
    rows = []
    for line in f:
        tokens = line.strip().split("\t")[1:]
        if len(tokens) != 4:
            continue
        rows.append(np.array([float(x) for x in tokens]))
b1h_code = np.stack(rows)


# Align the B1H recognition to the TF-MoDISco motifs
b1h_code_aligned = np.zeros((50, 4))
b1h_code_aligned[20 : 20 + 21] = b1h_code


# Align the benchmark motifs to the TF-MoDISco motifs
homer25_aligned = np.zeros((50, 4))
homer25_aligned[22:22 + 25] = homer25_pfms[0]

homer50_aligned = np.zeros((50, 4))
homer50_aligned[12:] = homer50_pfms[0][:50 - 12]

meme_aligned = np.zeros((50, 4))
meme_aligned[18:18 + len(meme_pfms[0])] = meme_pfms[0]

dichipmunk25_aligned = np.zeros((50, 4))
dichipmunk25_aligned[13:13 + 26] = dichipmunk25_pfms[0]

dichipmunk50_aligned = np.zeros((50, 4))
dichipmunk50_aligned[6:] = dichipmunk50_pfms[0][:50 - 6]


viz_sequence.plot_weights(read_motifs.pfm_to_pwm(homer25_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
    os.path.join(out_path, "ZNF248_homer25_pwm.svg"),
    format="svg"
)

viz_sequence.plot_weights(read_motifs.pfm_to_pwm(homer50_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
    os.path.join(out_path, "ZNF248_homer50_pwm.svg"),
    format="svg"
)

viz_sequence.plot_weights(read_motifs.pfm_to_pwm(meme_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
    os.path.join(out_path, "ZNF248_meme_pwm.svg"),
    format="svg"
)

viz_sequence.plot_weights(read_motifs.pfm_to_pwm(dichipmunk25_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
    os.path.join(out_path, "ZNF248_dichipmunk25_pwm.svg"),
    format="svg"
)

viz_sequence.plot_weights(read_motifs.pfm_to_pwm(dichipmunk50_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
    os.path.join(out_path, "ZNF248_dichipmunk50_pwm.svg"),
    format="svg"
)

viz_sequence.plot_weights(read_motifs.pfm_to_pwm(tfm_pfms["0_0"]), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
    os.path.join(out_path, "ZNF248_tfm_pwm.svg"),
    format="svg"
)
viz_sequence.plot_weights(tfm_cwms["0_0"], subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
    os.path.join(out_path, "ZNF248_tfm_cwm.svg"),
    format="svg"
)

viz_sequence.plot_weights(read_motifs.pfm_to_pwm(b1h_code_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
    os.path.join(out_path, "ZNF248_B1H.svg"),
    format="svg"
)


# Construct and export motifs in MOODS-ready format
full_pfm = tfm_pfms["0_0"]
core_pfm = full_pfm[21:41]

viz_sequence.plot_weights(full_pfm)
viz_sequence.plot_weights(core_pfm)

moods.export_motifs({"core": core_pfm, "full": full_pfm}, moods_hits_dir)


hits_bed_path = os.path.join(moods_hits_dir, "moods_out_filtered.bed")
if not os.path.exists(hits_bed_path):
    moods.run_moods(moods_hits_dir, reference_fasta_path)
    moods.moods_hits_to_bed(
        os.path.join(moods_hits_dir, "moods_out.csv"), os.path.join(moods_hits_dir, "moods_out.bed")
    )
    
    # Create peak bed expanded to 400 base pairs
    center_cut_size = 400
    peaks_table = pd.read_csv(
        peaks_bed_path, sep="\t", header=None, index_col=False,
        usecols=[0, 1, 2, 9],
        names=["chrom", "start", "end", "summit_offset"]
    )
    peaks_table["start"] = \
        (peaks_table["start"] + peaks_table["summit_offset"]) - \
        (center_cut_size // 2)
    peaks_table["end"] = peaks_table["start"] + center_cut_size
    # Make sure nothing is negative
    peaks_table["min"] = 0
    peaks_table["start"] = peaks_table[["start", "min"]].max(axis=1)
    del peaks_table["min"]
    peaks_table[["chrom", "start", "end"]].to_csv(
        os.path.join(moods_hits_dir, "peaks_cut.bed"), sep="\t",
        header=False, index=False
    )
    filter_peak_bed_path = os.path.join(moods_hits_dir, "peaks_cut.bed")
    
    # Filter for peaks
    moods.filter_hits_for_peaks(
        os.path.join(moods_hits_dir, "moods_out.bed"), hits_bed_path,
        filter_peak_bed_path
    )


def import_hit_table(path):
    return pd.read_csv(
        path, sep="\t", header=None, index_col=False,
        names=["chrom", "start", "end", "key", "strand", "score", "peak_index"]
    )


# Import hits table
hit_table = import_hit_table(hits_bed_path)


core_hits_table = hit_table[hit_table["key"] == "core"]
full_hits_table = hit_table[hit_table["key"] == "full"]


min_score = 12

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(core_hits_table["score"], bins=100, color="blue", label="core", density=True, alpha=0.3)
ax.hist(full_hits_table["score"], bins=100, color="red", label="full", density=True, alpha=0.3)
ax.axvline(min_score, color="gray")
ax.set_title("Histogram of motif instance match score")
ax.set_xlabel("MOODS match score")
plt.legend()
plt.show()


# Limit to high-scoring motif hits
core_hits_table = core_hits_table[core_hits_table["score"] >= min_score]
full_hits_table = full_hits_table[full_hits_table["score"] >= min_score]


core_peaks, full_peaks = np.unique(core_hits_table["peak_index"]), np.unique(full_hits_table["peak_index"])
core_shared_peaks = np.intersect1d(core_peaks, full_peaks)
core_unique_peaks, full_unique_peaks = np.setdiff1d(core_peaks, full_peaks), np.setdiff1d(full_peaks, core_peaks)
print("Number of core peaks: %d" % len(core_peaks))
print("Number of full peaks: %d" % len(full_peaks))
print("Number of shared peaks: %d" % len(core_shared_peaks))
print("Number of unique core peaks: %d" % len(core_unique_peaks))
print("Number of unique full peaks: %d" % len(full_unique_peaks))

Number of core peaks: 4689
Number of full peaks: 4687
Number of shared peaks: 4532
Number of unique core peaks: 157
Number of unique full peaks: 155


# Get set of core and full hits that overlap, unique core, and unique full hits
core_f = tempfile.NamedTemporaryFile("w")
full_f = tempfile.NamedTemporaryFile("w")

core_shared_f = tempfile.NamedTemporaryFile("w")
core_unique_f = tempfile.NamedTemporaryFile("w")
full_unique_f = tempfile.NamedTemporaryFile("w")

core_hits_table.to_csv(core_f, header=False, sep="\t", index=False)
full_hits_table.to_csv(full_f, header=False, sep="\t", index=False)

!bedtools intersect -a {core_f.name} -b {full_f.name} -u > {core_shared_f.name}
!bedtools intersect -a {core_f.name} -b {full_f.name} -v > {core_unique_f.name}
!bedtools intersect -b {core_f.name} -a {full_f.name} -v > {full_unique_f.name}

core_shared_hits_table = import_hit_table(core_shared_f.name)
core_unique_hits_table = import_hit_table(core_unique_f.name)
full_unique_hits_table = import_hit_table(full_unique_f.name)

for f in (core_f, full_f, core_shared_f, core_unique_f, full_unique_f):
    f.close()
    
print("Number of core hits: %d" % len(core_hits_table))
print("Number of full hits: %d" % len(full_hits_table))
print("Number of shared hits: %d" % len(core_shared_hits_table))
print("Number of unique core hits: %d" % len(core_unique_hits_table))
print("Number of unique full hits: %d" % len(full_unique_hits_table))

/bin/bash: bedtools: command not found
/bin/bash: bedtools: command not found
/bin/bash: bedtools: command not found
Number of core hits: 4717
Number of full hits: 4735
Number of shared hits: 0
Number of unique core hits: 0
Number of unique full hits: 0


bar_labels = ["Peaks with motif instances", "Motif instances"]
segment_labels = ["Unique core motifs", "Shared motifs", "Unique full motifs"]
core_unique = np.array([len(core_unique_peaks), len(core_unique_hits_table)])
core_shared = np.array([len(core_shared_peaks), len(core_shared_hits_table)])
full_unique = np.array([len(full_unique_peaks), len(full_unique_hits_table)])

total = core_unique + core_shared + full_unique
core_unique = core_unique / total
core_shared = core_shared / total
full_unique = full_unique / total

fig, ax = plt.subplots(figsize=(4, 8))
ax.bar(bar_labels, core_unique, label=segment_labels[0], width=0.25)
ax.bar(bar_labels, core_shared, bottom=core_unique, label=segment_labels[1], width=0.25)
ax.bar(bar_labels, full_unique, bottom=(core_unique + core_shared), label=segment_labels[2], width=0.25)
ax.set_title("Prevalence of core and full motifs")
ax.set_ylabel("Proportion")

ax.legend()

plt.savefig(
    os.path.join(out_path, "ZNF248_core_full_prevalence.svg"), format="svg"
)
plt.show()

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:8: RuntimeWarning: invalid value encountered in true_divide
  
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:9: RuntimeWarning: invalid value encountered in true_divide
  if __name__ == '__main__':
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.


# Import peaks
peaks_table = pd.read_csv(
    peaks_bed_path, sep="\t", header=None, index_col=False,
    usecols=[0, 1, 2, 9],
    names=["chrom", "start", "end", "summit_offset"]
)
peaks_table["summit"] = peaks_table["start"] + peaks_table["summit_offset"]


# Import repeats in the peaks
repeats_table = pd.read_csv(
    repeats_bed_path, sep="\t", index_col=False,
    names=[
        "genoName", "genoStart", "genoEnd", "genoLeft", "strand",
        "repName", "repClass", "repFamily", "repStart", "repEnd",
        "repLeft", "id"
    ]
)

# Compute match coordinates
repeats_table["match_start"] = 0
repeats_table["match_end"] = 0
pos_mask = repeats_table["strand"] == "+"
neg_mask = ~pos_mask
repeats_table["match_start"].loc[pos_mask] = repeats_table["genoStart"][pos_mask] + repeats_table["repStart"][pos_mask]
repeats_table["match_end"].loc[pos_mask] = repeats_table["genoStart"][pos_mask] + repeats_table["repEnd"][pos_mask]
repeats_table["match_start"].loc[neg_mask] = repeats_table["genoStart"][neg_mask] + repeats_table["repLeft"][neg_mask]
repeats_table["match_end"].loc[neg_mask] = repeats_table["genoStart"][neg_mask] + repeats_table["repEnd"][neg_mask]

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


# Peaks that overlap repeats
rep_inds = []
for _, row in tqdm.notebook.tqdm(peaks_table.iterrows(), total=len(peaks_table)):
    matches = repeats_table[
        (repeats_table["genoName"] == row["chrom"]) &
        (repeats_table["genoStart"] <= row["summit"]) &
        (repeats_table["genoEnd"] > row["summit"])
    ]
    if matches.empty:
        rep_inds.append(-1)
    else:
        rep_inds.append(matches.index[0])

peaks_table["repeat_index"] = np.array(rep_inds)


# Obtain distribution of repeat classes 
repeat_class_names = ["LINE", "SINE", "LTR", "DNA", "Other", "None"]
repeat_class_counts = {name : 0 for name in repeat_class_names}

rep_inds = peaks_table["repeat_index"]
repeat_class_counts["None"] = np.sum(rep_inds == -1)

matches = repeats_table.loc[rep_inds[rep_inds != -1]]
matched_classes = matches["repClass"]
for c in matched_classes:
    if c in repeat_class_counts:
        repeat_class_counts[c] += 1
    else:
        repeat_class_counts["Other"] += 1
        
all_line_subclasses = np.array(matches[matches["repClass"] == "LINE"]["repName"])
unique_subclasses, subclass_counts = np.unique(all_line_subclasses, return_counts=True)
inds = np.flip(np.argsort(subclass_counts))
unique_subclasses, subclass_counts = unique_subclasses[inds], subclass_counts[inds]
line_subclasses = np.concatenate([unique_subclasses[:10], ["Other LINE"]])
line_subclass_counts = np.concatenate([subclass_counts[:10], [np.sum(subclass_counts[5:])]])


# Plot distribution of repeat classes
repeat_class_labels = [
    "LINE", "SINE", "LTR", "DNA repeat element", "Other", "None"
]

# Plot pie chart
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    [repeat_class_counts[name] for name in repeat_class_names],
    labels=repeat_class_labels, autopct='%1.1f%%', shadow=False, startangle=90
)
ax.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle.

if out_path:
    plt.savefig(
        os.path.join(out_path, "ZNF248_repeat_prevalence.svg"), format="svg"
    )
plt.show()

# Plot bar chart to highlight LINEs
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(repeat_class_labels, [repeat_class_counts[name] for name in repeat_class_names])
bottom = 0
for subclass, count in zip(line_subclasses, line_subclass_counts):
    ax.bar(["LINE"], count, bottom=bottom, label=subclass, color=("gray" if subclass == "Other LINE" else None))
    bottom += count
ax.set_xticklabels(repeat_class_labels, rotation=45)
ax.legend()
ax.set_ylabel("Number of peaks")
ax.set_title("Repeat elements underlying ZNF248 peaks")

if out_path:
    plt.savefig(
        os.path.join(out_path, "ZNF248_repeat_prevalence_bar.svg"), format="svg"
    )
plt.show()

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:27: UserWarning: FixedFormatter should only be used together with FixedLocator


def plot_example_hit(chrom, start, end, prof_center_size=1000, score_center_size=50, save_path=None):
    """
    For a given putative motif hit, plots the ChIP-nexus and ChIP-seq
    profiles, as well as the importance scores for the task around the
    hit.
    """
    mid = (start + end) // 2
    prof_start = mid - (prof_center_size // 2)
    prof_end = prof_start + prof_center_size
    score_start = mid - (score_center_size // 2)
    score_end = score_start + score_center_size
    
    with h5py.File(true_profiles_hdf5_path, "r") as f:
        true_profs = f[chrom][prof_start:prof_end][:, 0]
    with h5py.File(pred_profiles_hdf5_path, "r") as f:
        # Need to use the coordinates of the profiles themselves
        prof_len = f["predictions"]["log_pred_profs"].shape[2]
        prof_coords_chrom = f["coords"]["coords_chrom"][:].astype(str)
        prof_coords_start = f["coords"]["coords_start"][:]
        prof_coords_end = f["coords"]["coords_end"][:]
        mid = (prof_coords_start + prof_coords_end) // 2
        prof_coords_start = mid - (prof_len // 2)
        prof_coords_end = prof_coords_start + prof_len
        match_inds = np.where(
            (prof_coords_chrom == chrom) &
            (prof_coords_start <= prof_start) &
            (prof_coords_end >= prof_end)
        )[0]
        if not match_inds.size:
            print("Warning: did not find sufficiently large prediction track for %s:%d-%s" % (chrom, prof_start, prof_end))
            return
        
        match_ind = match_inds[0]
        
        coord_start, coord_end = prof_coords_start[match_ind], prof_coords_end[match_ind]
        cut_start = prof_start - coord_start
        cut_end = cut_start + prof_center_size
        pred_profs = np.exp(f["predictions"]["log_pred_profs"][match_ind][0][cut_start:cut_end])
    with h5py.File(imp_scores_hdf5_path, "r") as f:
        match_inds = np.where(
            (f["coords_chrom"][:].astype(str) == chrom) &
            (f["coords_start"][:] <= score_start) &
            (f["coords_end"][:] >= score_end)
        )[0]
        if not match_inds.size:
            print("Warning: did not find sufficiently large importance score track for %s:%d-%s" % (chrom, score_start, score_end))
            return
        
        match_ind = match_inds[0]
        
        coord_start, coord_end = f["coords_start"][match_ind], f["coords_end"][match_ind]
        hyp_scores = f[hyp_score_key][match_ind]
        one_hot_seq = f["input_seqs"][match_ind]
        
        cut_start = score_start - coord_start
        cut_end = cut_start + score_center_size
        hyp_scores = hyp_scores[cut_start:cut_end]
        one_hot_seq = one_hot_seq[cut_start:cut_end]
        
    repeats_matches = repeats_table[
        (repeats_table["genoName"] == chrom) &
        (repeats_table["genoStart"] < prof_end) &
        (repeats_table["genoEnd"] > prof_start)
    ]
        
    prof_fig, ax = plt.subplots(nrows=3, sharex=True, figsize=(20, 10))
    # Draw profiles
    ax[0].plot(true_profs[:, 0], color="darkslateblue")
    ax[0].plot(-true_profs[:, 1], color="darkorange")
    ax[0].set_title("True ChIP-exo profiles")
    ax[1].plot(pred_profs[:, 0], color="darkslateblue")
    ax[1].plot(-pred_profs[:, 1], color="darkorange")
    ax[1].set_title("Predicted ChIP-exo profiles")
    
    # Draw repeat annotation(s)
    for _, row in repeats_matches.iterrows():
        start_pos = max(row["genoStart"] - prof_start, 0)
        end_pos = min(row["genoEnd"] - prof_start, prof_center_size)
        ax[2].add_patch(patches.Rectangle(
            xy=(start_pos, 0), width=(end_pos - start_pos), height=1, color="red", fill=False
        ))
        ax[2].text(
            x=((end_pos + start_pos) // 2), y=0.5, s=row["repName"]
        )
    
    # Draw vertical lines that denote the portion with importance scores
    for i in range(3):
        ax[i].axvline(score_start - prof_start, color="gray")
        ax[i].axvline(score_end - prof_start, color="gray")
    if save_path:
        plt.savefig(
            os.path.join(save_path + "_profiles.svg"), format="svg"
        )
    plt.show()

    score_fig = viz_sequence.plot_weights(
        hyp_scores * one_hot_seq, figsize=(20, 4), subticks_frequency=score_center_size, return_fig=True
    )
    score_fig.tight_layout()
    if save_path:
        plt.savefig(
            os.path.join(save_path + "_impscores.svg"), format="svg"
        )
    plt.show()
    print(one_hot_to_seq(one_hot_seq))


num_to_take = 10
hits = full_hits_table.sort_values(by="score", ascending=False).head(10)

for index, row in hits.iterrows():
    chrom, start, end = row["chrom"], row["start"], row["end"]
    print("%s:%d-%d (index %d)" % (chrom, start, end, index))
    
    if index == 1615:
        save_path = os.path.join(out_path, "ZNF248_hit_example_%d" % index)
        plot_example_hit(chrom, start, end, save_path=save_path)
    else:
        plot_example_hit(chrom, start, end)
    print("")

chr1:231143722-231143772 (index 1615)

AGACACAAACAAATGGAAAGATATCCCATGTTCATGGATTGGAAGAATTA

chr8:81567152-81567202 (index 8571)
Warning: did not find sufficiently large prediction track for chr8:81566677-81567677

chr20:13145487-13145537 (index 15136)
Warning: did not find sufficiently large prediction track for chr20:13145012-13146012

chr2:183198893-183198943 (index 2845)
Warning: did not find sufficiently large prediction track for chr2:183198418-183199418

chr10:90950927-90950977 (index 10146)
Warning: did not find sufficiently large prediction track for chr10:90950452-90951452

chrX:129398031-129398081 (index 16549)
Warning: did not find sufficiently large prediction track for chrX:129397556-129398556

chr2:22724066-22724116 (index 2370)
Warning: did not find sufficiently large prediction track for chr2:22723591-22724591

chr3:22515758-22515808 (index 3628)

AGACACCAAAAAATGGAAAGATATCCCATGTTCATGGATTGGAAGAATCA

chr6:32258710-32258760 (index 6617)
Warning: did not find sufficiently large prediction track for chr6:32258235-32259235

chr14:35839483-35839533 (index 12643)
Warning: did not find sufficiently large prediction track for chr14:35839008-35840008

Define constants and paths¶

PWM vs CWM vs B1H¶

Distribution of core motifs and TE via MOODS¶

Distribution of core motifs and TE via RepeatMasker¶

Show an example of a TE motif instance¶