from basepair.config import valid_chr, test_chr
from basepair.imports import *
" ".join(test_chr)
" ".join(valid_chr)
ls /srv/scratch/avsec/workspace/basepair/chip-seq/Oct4_12/out/peak/idr/optimal_set/
ls /srv/scratch/avsec/workspace/basepair/chip-seq/Sox2_1/out/peak/idr/optimal_set/
sox_chipseq = "/srv/scratch/avsec/workspace/basepair/chip-seq/Sox2_1/out/peak/idr/optimal_set/Sox2_1_rep1-pr.IDR0.05.filt.narrowPeak.gz"
oct_chipseq = "/srv/scratch/avsec/workspace/basepair/chip-seq/Oct4_12/out/peak/idr/optimal_set/Oct4_12_ppr.IDR0.05.filt.narrowPeak.gz"
!zcat {sox_chipseq} | wc -l
!zcat {oct_chipseq} | wc -l
!zcat /srv/scratch/avsec/workspace/basepair/chip-seq/Sox2_1/out/peak/idr/optimal_set/Sox2_1_rep1-pr.IDR0.05.filt.narrowPeak.gz | wc -l
cat {ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml
oct_chipnexus = "/users/avsec/workspace/basepair-workflow/data/Oct4/peaks.bed.gz"
sox_chipnexus = "/users/avsec/workspace/basepair-workflow/data/Sox2/peaks.bed.gz"
ls /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/idr/conservative_set/
!zcat /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/idr/conservative_set/Oct4_rep2-rep3.IDR0.05.filt.12-col.bed.gz | wc -l
!zcat /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/pooled_rep/Oct4_2c_matched_barcode.filt_filtered_pooled.pval0.01.500K.narrowPeak.gz | wc -l
!ls /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/
!zcat /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/pooled_rep/Oct4_2c_matched_barcode.filt_filtered_pooled.pval0.01.500K.narrowPeak.gz | head
ls /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/idr/optimal_set
!zcat /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/overlap/pooled_pseudo_reps/Oct4_ppr.naive_overlap.filt.narrowPeak.gz | wc -l
oct_chipnexus_sorted = "/tmp/oct4.chipnexus.sorted.bed"
sox_chipnexus_sorted = "/tmp/sox2.chipnexus.sorted.bed"
oct_chipseq_sorted = "/tmp/oct4.chipseq.sorted.bed"
sox_chipseq_sorted = "/tmp/sox2.chipseq.sorted.bed"
!zcat {oct_chipnexus} | wc -l
!zcat {sox_chipnexus} | wc -l
!bedtools intersect -a {oct_chipnexus} -b {oct_chipseq} -wa -u | wc -l
genome_file = "/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes"
!bedtools sort -i {oct_chipseq} > {oct_chipseq_sorted}
!bedtools sort -i {sox_chipseq} > {sox_chipseq_sorted}
!bedtools sort -i {oct_chipnexus} | bedtools slop -b 100 -g {genome_file}> {oct_chipnexus_sorted}
!bedtools sort -i {sox_chipnexus} | bedtools slop -b 100 -g {genome_file}> {sox_chipnexus_sorted}
oct_chipseq_sorted
!bedtools jaccard -a {oct_chipnexus_sorted} -b {oct_chipseq_sorted}
!bedtools jaccard -a {sox_chipnexus_sorted} -b {oct_chipseq_sorted}
13971 / 21841
sox_chipseq_sorted
!bedtools intersect -a {sox_chipnexus} -b {sox_chipseq} -wa -u | wc -l
!bedtools intersect -a {sox_chipseq} -b {sox_chipnexus} -wa -u | wc -l
3192/9396
3192/4073
~60% of the peaks overlap with ChIP-seq and ChIP-nexus
!bedtools intersect -e -f 0.5 -F 0.5
oct_sox_binary_intervals = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipseq/labels/oct4-sox2.Oct4.intervals_file.tsv.gz"
fasta_file = "/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta"
!zcat {oct_sox_binary_intervals} | head -n 2
df = pd.read_csv(oct_sox_binary_intervals, sep='\t', header=None)
df.columns = ['chr', 'start', 'end', 'Oct4', 'Sox2']
len(df)
df['Oct4'].value_counts() / len(df['Oct4'])
df['Sox2'].value_counts() / len(df)
df['Oct4'].value_counts()
df['Sox2'].value_counts()
from basepair.datasets import *
tsv = TsvReader(oct_sox_binary_intervals, label_dtype=int, mask_ambigous=-1)
np.all(tsv.df.iloc[:, 3:] == -1, axis=1).sum()
from basepair.config import test_chr
ds_seq = SeqClassification(oct_sox_binary_intervals, fasta_file, incl_chromosomes=test_chr)
ds_seq.tsv.df[0].unique()
ds_seq.tsv.df.head()
len(ds_seq)
ds_seq[0]
create_tf_session(0)
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# Load the model
model = load_model(model_dir / "model.h5")
from kipoi.data_utils import numpy_collate_concat
lpreds = []
llabels = []
for inputs, targets in tqdm(ds_seq.batch_train_iter(cycle=False, num_workers=10, batch_size=256)):
lpreds.append(model.predict(inputs))
# TODO - discard the targets
llabels.append(targets)
preds = numpy_collate_concat(lpreds)
labels = numpy_collate_concat(llabels)
del llabels
del lpreds
a=1
oct4_total = preds[4].sum(axis=1)
plt.hist(oct4_total, 100);
import seaborn as sns
labels.max()
dfl = pd.DataFrame(dict(label=labels[:,1], counts=sox2_counts.sum(axis=-1)))
auprc(dfl.label, dfl.counts)
dfl = pd.DataFrame(dict(label=labels[:,0], counts=oct4_counts.sum(axis=-1)))
auprc(dfl.label, dfl.counts)
preds[0].shape
sox2_profile = preds[1]
oct4_profile = preds[0]
oct4_counts = np.exp(preds[4])-1
sox2_counts = np.exp(preds[5])-1
del preds
dfl = pd.DataFrame(dict(label=labels[:,0], counts=oct4_counts.mean(axis=-1)*oct4_profile.max(axis=1).mean(axis=1)))
auprc(dfl.label, dfl.counts)
np.mean(oct4_profile[:, 400:600].sum(axis=1) * oct4_counts, axis=1)
# only sum counts for the central 200 bp
dfl = pd.DataFrame(dict(label=labels[:,0], counts=np.mean(oct4_profile[:, 400:600].sum(axis=1) * oct4_counts, axis=1)))
auprc(dfl.label, dfl.counts)
dfl.groupby("label").mean()
auprc(labels[:,0], oct4_total)
dfl.groupby("label").std()
dfl.label.value_counts()
from concise.eval_metrics import auprc, auc
from plotnine import *
ggplot(aes(x="label", y="counts"), dfl) + geom_boxplot()
sns.boxplot()
preds[4].shape