import matplotlib.pyplot as plt
import basepair
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from pathlib import Path
import pandas as pd
import numpy as np
from basepair.config import create_tf_session, get_data_dir
ddir = get_data_dir()
mkdir -p {ddir}/raw/dnase/6mer
!wget https://raw.githubusercontent.com/jpiper/pyDNase/master/pyDNase/data/IMR90_6mer.pickle -O {ddir}/raw/dnase/6mer/IMR90_6mer.pickle
from basepair.utils import read_pkl
# Load the kmer file
kmer_model = read_pkl(f"{ddir}/raw/dnase/6mer/IMR90_6mer.pickle")
# dict -> pd.DataFrame
df = pd.DataFrame([{"motif": k, **v} for k,v in kmer_model.items()])
df['forward'] = df['forward'].astype(float)
df['reverse'] = df['reverse'].astype(float)
kmer_model['AAAAAA']
len(df) # same as 4**6
df.head()
from basepair.utils import reverse_complement
df['rc_motif'] = df.motif.map(reverse_complement)
df.head()
df.forward.plot.hist(bins=50);
plt.xlabel("Forward");
df.plot.scatter('forward', 'reverse', alpha=0.3);
Why are there forward and reverse strands?
dfm = pd.merge(df[['motif', 'forward']], df[['rc_motif', 'reverse']], left_on='motif', right_on='rc_motif')
dfm.plot.scatter('forward', 'reverse', alpha=0.3);
np.allclose(dfm.forward, dfm.reverse)
Both forward and reverse scores are symmetrically stored
from basepair.exp.dnase.models import KmerBiasModel
kmb.counts.get("asd", 0)
kmb = KmerBiasModel()
len("ACGTAGTCGAG")
kmb.fit_on_batch(encodeDNA(["ACGTAGTCGAG"]*10), np.ones((10,11)))
kmb.normalize(False)
kmb.d
kmb.normalize()
kmb.d
{k:v for k,v in kmb.counts.items()}
sum(kmb.counts.values())
d = {k: float(v['forward']) for k,v in kmer_model.items()}
km = KmerBiasModel(d)
km.predict_seq("ACGTAGTCGAG", pad=True)
km.predict_seq("ACGTAGTCGAG", pad=False)
km.predict(encodeDNA(["ACGTAGTCGAG"])[0], pad=False)
km.predict_on_batch(encodeDNA(["ACGTAGTCGAG"]), pad=False)
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")
SEQ_WIDTH = 1000
ds = DataSpec(
task_specs={"DNase": TaskSpec(task="DNase",
pos_counts=dpath / "IMR90_Naked_DNase.pos.bw",
neg_counts=dpath / "IMR90_Naked_DNase.neg.bw",
peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
fasta_file=dpath / "GRCh38.genome.fa",
)
valid_chr=['chr2', 'chr3', 'chr4']
test_chr=['chr1', 'chr8', 'chr9']
from basepair.datasets import StrandedProfile
class AppendCounts:
def transform(self, x):
counts = {k.replace("profile/", "counts/"): np.log(1 + x[k].sum(0))
for k in x}
return {**x, **counts}
SEQ_WIDTH=1000
dl = StrandedProfile(ds,
excl_chromosomes=valid_chr+test_chr,
peak_width=SEQ_WIDTH, shuffle=True,
target_transformer=AppendCounts())
dl_valid = StrandedProfile(ds,
incl_chromosomes=valid_chr,
peak_width=SEQ_WIDTH,
shuffle=True,
target_transformer=AppendCounts())
dl_test = StrandedProfile(ds,
incl_chromosomes=test_chr,
peak_width=SEQ_WIDTH,
shuffle=True,
target_transformer=AppendCounts())
from kipoi.data_utils import numpy_collate_concat
def touch_file(file, verbose=True):
if verbose:
add = "v"
else:
add = ""
!vmtouch -{add}tf {file}
def touch_all_files(ds, verbose=True):
touch_file(ds.fasta_file, verbose)
for ts in ds.task_specs.values():
touch_file(ts.pos_counts, verbose)
touch_file(ts.neg_counts, verbose)
#touch_file(ts.peaks)
touch_all_files(ds)
from tqdm import tqdm
from kipoi.writers import HDF5BatchWriter
!mkdir -p {ddir}/processed/dnase/6mer
#!rm {ddir}/processed/dnase/6mer/IMR90_6mer.preds.h5
dl_test[0]['targets']['profile/DNase'].shape
%load_ext line_profiler
from numba import jit
new_kmer_model = KmerBiasModel(k=6)
%lprun -f new_kmer_model.fit_on_batch new_kmer_model.fit_on_batch(batch['inputs'], batch['targets']['profile/DNase'].sum(axis=-1))
from kipoi.readers import HDF5Reader
ls {ddir}/processed/dnase/6mer/
r = HDF5Reader(f"{ddir}/processed/dnase/6mer/IMR90_6mer.preds.h5")
r = HDF5Reader(f"{ddir}/processed/dnase/6mer/retrained.IMR90_6mer.w200.normalized.preds.h5")
r.open()
r.ls()
preds = r.f['/preds'][:]
targets = r.f['/targets/profile/DNase'][:].sum(axis=-1)
y_true = targets[:,3:(1000-2)]
#y_true = targets[:,3:(200-1)]
assert preds.shape == y_true.shape
y_true.shape
preds.shape
y_true_l = np.ravel(y_true)
y_pred_l = np.ravel(preds)
y_true_l[:5]
y_true_l.max()
import random
idx = np.unique(random.sample(range(0, len(y_pred_l)), 1000000))
plt.scatter(y_true_l[idx], y_pred_l[idx], alpha=0.1)
plt.xlabel("true values")
plt.ylabel("predictions")
from basepair.plots import regression_eval
yt_counts = np.log(1+y_true.sum(axis=-1))
yp_counts = np.log(1+preds.sum(axis=-1))
regression_eval(yt_counts, yp_counts, alpha=0.1)
regression_eval(yt_counts, yp_counts, alpha=0.1)
regression_eval(yt_counts, yp_counts, alpha=0.1)
plt.scatter(yt_counts, yp_counts, alpha=0.1)
plt.xlabel("true values")
plt.ylabel("predictions")
from scipy.stats import pearsonr, spearmanr
yt = y_true_l
yp =y_pred_l
rp = pearsonr(yt, yp)[0]
rs = spearmanr(yt, yp)[0]
print(f"pearsonr: {rp}")
print(f"spearmanr: {rs}")
They don't match for some reason. Check again how they compute the vector.
from basepair.utils import read_pkl
km_new = read_pkl(f"{ddir}/processed/dnase/6mer/new_model.train.pkl")
ls {ddir}/processed/dnase/6mer/
km_new_pos3 = read_pkl(f"{ddir}/processed/dnase/6mer/new_model.train-pos3.w200.normalized.pkl")
from basepair.exp.dnase.models import KmerBiasModel
kmer_model = read_pkl(f"{ddir}/raw/dnase/6mer/IMR90_6mer.pickle")
km_old = KmerBiasModel({k: float(v['forward']) for k,v in kmer_model.items()})
df_new = pd.DataFrame([{"kmer": k, "value": v} for k,v in km_new.d.items()])
df_new_pos3 = pd.DataFrame([{"kmer": k, "value": v} for k,v in km_new_pos3.d.items()])
df_old = pd.DataFrame([{"kmer": k, "value": v} for k,v in km_old.d.items()])
dfeval = pd.merge(df_new, df_old, on='kmer', suffixes=('_new', '_old'))
dfeval2 = pd.merge(df_new_pos3, df_old, on='kmer', suffixes=('_new', '_old'))
plt.scatter(dfeval.value_new*100, dfeval.value_old, alpha=0.1)
#plt.xlim([0, 0.3])
#plt.ylim([0, 0.3])
plt.xlabel("value_new")
plt.ylabel("value_old")
plt.scatter(np.log(1+dfeval2.value_new*100), np.log(1+dfeval.value_old), alpha=0.1)
plt.xlabel("value_new")
plt.ylabel("value_old")
plt.scatter(dfeval2.value_new*100, dfeval.value_old, alpha=0.1)
#plt.xlim([0, 0.3])
#plt.ylim([0, 0.3])
plt.xlabel("value_new")
plt.ylabel("value_old")
plt.scatter(dfeval.value_new*100, dfeval.kmer.map({k: float(v['forward']) for k,v in kmer_model.items()}), alpha=0.1)
#plt.xlim([0, 0.3])
#plt.ylim([0, 0.3])
plt.xlabel("value_new")
plt.ylabel("value_old")
plt.scatter(dfeval.value_new*100, dfeval.kmer.map({k: float(v['reverse']) for k,v in kmer_model.items()}), alpha=0.1)
#plt.xlim([0, 0.3])
#plt.ylim([0, 0.3])
plt.xlabel("value_new")
plt.ylabel("value_old")
dfeval.value_new.min()