%load_ext autoreload
%autoreload 2

import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
#from basepair.modisco.results import ModiscoResult
from matlas.modisco_results import ModiscoResult


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"

load data from labcluster

Using TensorFlow backend.
TF-MoDISco is using the TensorFlow backend.

from matlas.dlutils import load_deeplift_data
from matlas.fimo_hits import (dichotomize_chipseq_for_gold_standard, 
                              assign_significance_on_fimo_hits,
                              retain_fimo_hits_within_nb,
                              optimize_performance_info,
                              check_performance_competition,
                              collect_motif_df,
                              plot_various_instance_scores,
                              decide_optimum_threshold,
                              collect_distance_info,
                              compute_distances_from_peaks,
                              plot_positional_information
                             )



def prepare_verification_data_per_cell(
    task_idx=273,
    tf_biosample = "MEL cell line",
    the_k=0.05627951420283896,
    atac_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits",
    instance_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits",
    deeplift_root="/mnt/lab_data2/msharmin/oc-atlas/deeplift2019_h5",
    background_files_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats"
):

    tf_targets = sorted(set(CHIP_PEAK_BEDS[tf_biosample].keys()).intersection(set(CHIP_TO_MOTIF.keys())))
    motifs = [CHIP_TO_MOTIF[tf] for tf in tf_targets]
    
    # divide chipseq peaks into gold_standard and nongold for each tf under a cell type
    fimodir = "{0}/task_{1}".format(atac_root, task_idx)
    atacpeaks = "{}/peaks.bed".format(fimodir)
    #print(fimodir)
    #print(atacpeaks)
    for tf, motif in zip(tf_targets, motifs):
        print(tf_biosample, tf, motif)
        chippeaks = "{0}/{1}".format(CHIP_PEAK_BED_ROOT, CHIP_PEAK_BEDS[tf_biosample][tf])
        instancedir="{}/task_{}/{}".format(instance_root, task_idx, motif)
        #print(chippeaks)
        #print(instancedir)
        #print()

        best_key = decide_optimum_threshold(instancedir)
        distances = collect_distance_info(best_key=best_key, instancedir=instancedir,
                                          null_key='deeplift_cdf', key_to_split='deeplift_cdf',
                                          fimobed="{}/{}.bed.gz".format(fimodir, motif),
                                          verbose=False
                                         )
        plot_positional_information(distances, motif)

    
        # compute_joint_chip_matrix and plot_joint_chip_matrix
        
        
    return None
from ast import literal_eval

def prepare_verification_data():
    total_zeros = 0
    for key, tf_biosamples in OC_TO_TF_SAMPLES.items():
        task_idx, sample_id, oc_sample = literal_eval(key)
        
        print(task_idx, oc_sample, tf_biosamples, len(tf_biosamples))
        total_zeros = prepare_verification_data_per_cell(task_idx=task_idx, 
                                           tf_biosample=tf_biosamples[0],
                                           the_k=PER_BASE_BACKGROUND_IMPORTANCES[task_idx]
                                          )
        #break
    return None


prepare_verification_data()

140 Bone marrow derived pre-B cell ['bone marrow'] 1
bone marrow CTCF-mouse Ctcf

142 bone marrow-derived macrophages (Unstimulated) ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf

143 bone marrow-derived macrophages(Lipid A stimulation 120min) ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf

177 Cortical excitatory neurons ['cortical plate'] 1
cortical plate CTCF-mouse Ctcf

180 Thymus MECs (Aire+/+) ['thymus'] 1
thymus CTCF-mouse Ctcf

189 Megakaryocyte primary cell ['megakaryocyte'] 1
megakaryocyte FLI1-mouse Fli1

192 Erythroblast primary cell ['erythroblast'] 1
erythroblast GATA1-mouse Gata1

193 Megakaryocyte progenitor cell ['megakaryocyte'] 1
megakaryocyte FLI1-mouse Fli1

202 Liver (Adult 8 weeks) ['liver'] 1
liver CTCF-mouse Ctcf

liver EP300-mouse Ep300

# def prepare_verification_data():
#     total_zeros = 0
#     for key, tf_biosamples in OC_TO_TF_SAMPLES.items():
#         task_idx, sample_id, oc_sample = literal_eval(key)
        
#         print(task_idx, oc_sample, tf_biosamples, len(tf_biosamples))
#         total_zeros = prepare_verification_data_per_cell(task_idx=task_idx, 
#                                            tf_biosample=tf_biosamples[0],
#                                            the_k=PER_BASE_BACKGROUND_IMPORTANCES[task_idx]
#                                           )
#         #break
#     return None


# prepare_verification_data()

from matlas.fimo_hits import compute_distances_from_peaks
#mdf = compute_distances_from_peaks()
mdf

bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/gold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Fimo.bed -wb > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Fimo.tp.r.bed

# determine the threshold and send it to both display as params

instancedir = "/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_140/Ctcf"
tmp = np.load("{}/cdf_cdf.npz".format(instancedir), allow_pickle=True)
dl_performdict, nondl_performdict, performdict = tmp.f.a.tolist(), tmp.f.b.tolist(), tmp.f.c.tolist()
performdict

OrderedDict([('Fimo',
                 Balanced Accuracy  F1-score  Precision    Recall  auPRC_labeled  \
              0            0.74047  0.424785   0.284553  0.837535       0.534617   
              
                 auPRC_scored  N_instance_passing_k  N_atacs_passing_k  Imbalance  
              0      0.523323                200814              73877   0.144816  ),
             ('Fimo50',
                 Balanced Accuracy  F1-score  Precision    Recall  auPRC_labeled  \
              0           0.713836  0.494412   0.460747  0.533384       0.485627   
              
                 auPRC_scored  N_instance_passing_k  N_atacs_passing_k  Imbalance  
              0      0.411968                 83013              31050   0.144816  )])

precisions = [(v)['Precision'].values[0] for v in dl_performdict.values()]
df = pd.DataFrame({'keys': list(dl_performdict.keys()),
              'Precision': [(v)['Precision'].values[0] for v in dl_performdict.values()],
              'Recall': [(v)['Recall'].values[0] for v in dl_performdict.values()],
                   'auPRC_scored': [(v)['auPRC_scored'].values[0] for v in dl_performdict.values()],
                   'F1-score': [(v)['F1-score'].values[0] for v in dl_performdict.values()],
                   'Balanced Accuracy': [(v)['Balanced Accuracy'].values[0] for v in dl_performdict.values()]
              
             })
nondf = pd.DataFrame({'keys': list(nondl_performdict.keys()),
              'Precision': [(v)['Precision'].values[0] for v in nondl_performdict.values()],
              'Recall': [(v)['Recall'].values[0] for v in nondl_performdict.values()],
                      'auPRC_scored': [(v)['auPRC_scored'].values[0] for v in nondl_performdict.values()],
                      'F1-score': [(v)['F1-score'].values[0] for v in nondl_performdict.values()],
                   'Balanced Accuracy': [(v)['Balanced Accuracy'].values[0] for v in nondl_performdict.values()]
              
             })
precision_cond = df['Precision']>=performdict['Fimo50']['Precision'].values[0]
df = df[precision_cond]
nondf = nondf[precision_cond]

recall_cond = df['Recall']>=nondf['Recall']
df = df[recall_cond]
nondf = nondf[recall_cond]
pd.concat([df, nondf], axis=1)
recall_cond

0    True
1    True
2    True
3    True
4    True
5    True
Name: Recall, dtype: bool

from scipy.stats import *
from scipy import stats
from matlas.fimo_hits import collect_motif_df, PEAK_TYPES

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot as plt


from matlas.fimo_hits import compute_instance_calling_performance

perform_df, truth_df = compute_instance_calling_performance(
            bedname="Dl_k50.0.bed",
            key_to_optimize="deeplift_cdf",
            instancedir="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4",
            atac_bed="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits/task_224/peaks.bed",
            verbose=True
        )

# fimo_df, truth_df = compute_peak_calling_performance(key_to_optimize='mean_deeplift_0')
# fimo_df.T

bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/gold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -wb > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.tp.r.bed
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/nongold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -wb > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.fp.r.bed
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/gold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -v > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.fn.bed
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/nongold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -v > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.tn.bed
possible bug set()
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits/task_224/peaks.bed -wb > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.tp.atac.bed
sanity checks for atac overlap 8430 8430
sanity checks for area 0.1400144203157756 0.11710181851089989 0.1400144203157756
saniti checks for f1 0.08964103388672268 0.08964103388672268
saniti checks for recall 0.06303352365877099 [5.65323082e-05]

            true peaks  false peaks   total
pred_true         1115         6073    7188
pred_false       16574       142907  159481
total            17689       148980  166669

dlk = fimo_df[fimo_df['deeplift_cdf']>0.9]
nondlk = fimo_df[fimo_df['deeplift_cdf']<=0.9]

from matplotlib import pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(16,6))
ax_pdf = axes[0]
ax_cdf = axes[1]

key_to_show = 'sum_deeplift_0'
ax_pdf.hist(dlk[key_to_show].values, density=True, histtype='stepfilled',
        alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_pdf.hist(nondlk[key_to_show].values, density=True, histtype='stepfilled',
        alpha=0.6, bins=60, label='low')
ax_pdf.legend(loc='best', frameon=False)
ax_pdf.legend(loc='best', frameon=False)

ax_cdf.hist(dlk[key_to_show].values,  cumulative=True, density=True, histtype='bar',
        alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_cdf.hist(nondlk[key_to_show].values,  cumulative=True, density=True, histtype='bar',
        alpha=0.6, bins=60, label='low')
ax_cdf.legend(loc='best', frameon=False)
plt.show()

# revise performance measurements with changing the probability metric

# revise fimo50
# revise positional distribution and cdfs for a bed


# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a ../../../atac_specific_hits/task_140/peaks.bed -b stdin -wb -wa -v > nongold3.bed



# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b ../../../atac_specific_hits/task_140/peaks.bed -wb -wa > gold.bed
# zcat /mnt/lab_data2/msharmin/tf-mouse-atlas/chip_idrs/ENCFF806PDR.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b atac_specific_hits/task_140/peaks.bed -wb > atac_various_hits/task_140/Ctcf/gold.bed

b = np.sum(null_scores, axis=2)
a = np.sum(deeplift_scores, axis=2)
a.shape, b.shape

((174993, 1000), (174993, 200))

np.argmin(cdf_scores[:,5]), np.min(cdf_scores[:,5])

(171195, 5.979713769975963e-05)

i = [0, 1]
c = b[0].flatten()
print(c.shape)
shape, mean, stdev = t.fit(c)
kstat, kval = stats.kstest(c, 't', args=[shape, mean, stdev])
# mean, stdev = norm.fit(c)
# kstat, kval = stats.kstest(c, 'norm', args=[ mean, stdev])
# shape, mean, stdev = gamma.fit(c)
# kstat, kval = stats.kstest(c, 'gamma', args=[shape,  mean, stdev])

print(kstat, kval)

(200,)
0.045556525408073756 0.8008351657723536

from matlas.fimo_hits import plot_fitted_dist_for_importance
plot_fitted_dist_for_importance(c, a[0], shape, mean, stdev)

# scores_ = [np.sum(j) for j in scores[0][300:700]]
# shape, mean, stdev = t.fit(scores_)
# kstat, kval = stats.kstest(scores_, 't', args=[shape, mean, stdev])
# print('kstat:', kstat, ' kval:', kval)

from scipy import stats

from scipy.stats import *
shape, mean, stdev = t.fit(b)
print(shape, mean, stdev)
kstat, kval = stats.kstest(b, 't', args=[shape, mean, stdev])
print('t sitribution', kstat, kval)
kstat, kval = stats.kstest(b, 'norm')
print('normal sitribution', kstat, kval)

1.5123294107168623 0.0008743328517088086 0.005880160282513114
t sitribution 0.06499847605991883 0.0
normal sitribution 0.48375382933815286 0.0

#get the scores for all the windows, take per_base_imp distribution
# nonagnostic_out = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/Ctcf.bed.gz"
# df = pd.read_csv(nonagnostic_out, sep="\t")
# selected_peaks = np.sort(np.array(list(set(df['seqno'].values))))
# selected_null_scores = hyp_scores[selected_peaks,:,:]
# selected_null_scores.shape
# atacdf = pd.read_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks.bed", header=None, sep="\t")
# #selected_atacs = atacdf.loc[selected_peaks,:]
# #selected_atacs.to_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks_Ctcf.bed", 
#                       #header=None, index=None, sep="\t")

# from matlas.render_report import render_cell_tf_reports_in_parallel

# render_cell_tf_reports_in_parallel()

# call dichotomize_chipseq_for_gold_standard for each tf under MEL
# call get_deeplift_passed_fimo_hits for each tf under MEL
# call get_peaks_within_nb for each tf under MEL
# for a tf: collect_distance_info, plot_distances_from_peaks
# for a tf: collect_performance_info, plot_performance_info
# for a tf: compute_joint_chip_matrix, plot_joint_chip_heatmap
# for a tf: load the png file

from matlas.fimo_hits import combine_motif_dfs

logdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf"

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_{}".format(273)
nonagnostic_out = "{0}/{1}.bed.gz".format(outdir, "Adnp2")
df_dict = combine_motif_dfs(logdir, outdir, 
                      combine_func="collect_motif_df",
                      )
df = pd.concat([df_dict[key] for key in df_dict.keys()])

#nonz_dfs = combine_motif_dfs(logdir, outdir, zscored=False)

from matlas.fimo_hits import plot_motif_scores

motifs = ['Spi1', 'Ctcf', 'Gata1']
turn = 0
fig, axes = None, None
for key in motifs:
    fig, axes = plot_motif_scores(df_dict[key], title=key, turn=turn, axes=axes, fig=fig, kind='scatter')
    turn = (turn+1)%4

import seaborn as sns
combined_df = pd.concat([df_dict[key] for key in df_dict.keys()])
sns.jointplot(x="avg_deeplift", y="fimo", data=combined_df, s=1, marginal_kws=dict(bins=100));

df_dict['Spi1'][:5]

from matlas.render_report import render_test_plots
render_test_plots(nbname="FimoVsDeepliftReport")

gencode_file = '/mnt/data/annotations/by_release/mm10/GENCODE_ann/gencode.vM6.annotation.gtf'
gencode = pd.read_table(gencode_file, sep='\t', header=None, comment='#')
genes = gencode[(gencode[0]!='chrM') & (gencode[2]=='gene')&(np.array(['protein_coding' in i for i in gencode[8].values]))]
gene_df = genes.loc[:, [0,3,4,8,5,6]]
gene_df = gene_df.sort_values([0,3])
gene_df.to_csv("{0}/genecode.bed.gz".format(modisco_dir), header=False, index=False, compression='gzip', sep="\t")
genes[:5]

	peakchr	peakstart	peakend	n	point	chr	istart	iend	iname	iscore	istrand	mean_deeplift	mean_deeplift_0	deeplift_cdf	imidpoint	summit	distance
0	chr19	3353359	3353360	13	69	chr19	3353359	3353360	138701	14.0000	-	0.000009	0.000009	0.506270	3353359	3353428	-69
1	chr19	3353359	3353361	13	69	chr19	3353359	3353361	138702	15.2241	+	0.000020	0.000020	0.506379	3353359	3353428	-69
2	chr9	92211224	92211235	92	180	chr9	92211224	92211235	281510	15.0690	-	0.061769	0.062867	0.947707	92211224	92211404	-180
3	chr9	92211225	92211235	92	180	chr9	92211225	92211235	281511	14.0000	+	0.069153	0.069153	0.963151	92211225	92211405	-180
4	chr3	97876403	97876413	133	180	chr3	97876403	97876413	178395	13.7121	-	0.018780	0.021136	0.890439	97876403	97876583	-180
5	chr3	97876403	97876414	133	180	chr3	97876403	97876414	178396	15.2069	+	0.015547	0.019214	0.807427	97876403	97876583	-180
6	chr4	57839659	57839671	143	180	chr4	57839659	57839671	192577	13.8991	+	-0.006044	0.003068	0.483946	57839659	57839839	-180
7	chr2	64070549	64070561	155	180	chr2	64070549	64070561	152834	13.9358	-	-0.001180	0.007046	0.504524	64070549	64070729	-180
8	chr2	64070730	64070742	155	180	chr2	64070730	64070742	152835	15.6239	+	-0.003105	0.000162	0.421774	64070730	64070910	-180
9	chr2	36236976	36236986	164	180	chr2	36236976	36236986	149381	13.5455	-	0.006439	0.011360	0.501202	36236976	36237156	-180
10	chr5	142402937	142402938	170	180	chr5	142402937	142402948	219735	14.8966	+	-0.001932	0.002020	0.373960	142402937	142403117	-180
11	chr9	63717308	63717320	188	180	chr9	63717308	63717320	278183	14.1009	-	-0.000969	0.005634	0.519866	63717308	63717488	-180
12	chr1	138328353	138328363	220	180	chr1	138328353	138328363	16653	13.6515	+	0.025300	0.033887	0.708505	138328353	138328533	-180
13	chr7	89368032	89368044	228	180	chr7	89368032	89368044	249373	15.6239	+	-0.007330	0.002958	0.286235	89368032	89368212	-180
14	chr14	31433515	31433526	229	0	chr14	31433515	31433526	83360	14.5690	+	-0.001226	0.002116	0.498010	31433515	31433515	0
15	chr11	88719079	88719083	237	180	chr11	88719071	88719083	48572	14.9450	+	0.010161	0.015953	0.451415	88719071	88719259	-188
16	chr17	71204789	71204801	249	180	chr17	71204789	71204801	125370	14.9633	+	-0.005869	0.001158	0.305810	71204789	71204969	-180
17	chr17	87055304	87055316	284	180	chr17	87055304	87055316	127151	14.6330	+	-0.003392	0.002511	0.414718	87055304	87055484	-180
18	chr4	40723434	40723446	295	180	chr4	40723434	40723446	190625	14.3486	-	0.004248	0.009394	0.424002	40723434	40723614	-180
19	chrX	13347156	13347164	324	180	chrX	13347154	13347164	286367	13.6515	+	0.020639	0.027710	0.625309	13347154	13347336	-182
20	chr18	84914855	84914866	458	180	chr18	84914855	84914866	137955	14.5690	-	0.043586	0.046236	0.912073	84914855	84915035	-180
21	chr18	84914856	84914866	458	180	chr18	84914856	84914866	137956	13.6515	+	0.042969	0.045883	0.855519	84914856	84915036	-180
22	chr9	118506345	118506357	510	180	chr9	118506345	118506357	284463	14.0550	+	0.010347	0.012425	0.657760	118506345	118506525	-180
23	chr4	33189448	33189452	512	180	chr4	33189442	33189452	189632	13.6515	-	0.059905	0.059905	0.861427	33189442	33189628	-186
24	chr4	33189448	33189453	512	180	chr4	33189442	33189453	189633	14.5690	+	0.065274	0.065274	0.881431	33189442	33189628	-186
25	chr19	31891617	31891628	520	180	chr19	31891617	31891628	142109	14.6034	-	-0.007655	0.000175	0.431695	31891617	31891797	-180
26	chr19	31891618	31891628	520	180	chr19	31891618	31891628	142110	13.7121	+	-0.007866	0.000193	0.429193	31891618	31891798	-180
27	chr1	138328353	138328363	521	180	chr1	138328353	138328363	16653	13.6515	+	0.025300	0.033887	0.708505	138328353	138328533	-180
28	chr5	129002962	129002972	533	180	chr5	129002962	129002972	218381	14.5909	-	0.033325	0.037000	0.891563	129002962	129003142	-180
29	chr5	129002962	129002973	533	180	chr5	129002962	129002973	218382	14.8276	+	0.030694	0.034035	0.872494	129002962	129003142	-180
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1985	chr12	54460378	54460388	23988	165	chr12	54460378	54460388	58019	14.6364	+	0.041227	0.043190	0.883674	54460378	54460543	-165
1986	chr2	4917864	4917875	24004	141	chr2	4917864	4917875	145812	14.4483	-	0.026546	0.031428	0.898278	4917864	4918005	-141
1987	chr16	95861399	95861410	24007	191	chr16	95861399	95861410	117106	15.1552	+	0.090691	0.090691	0.969027	95861399	95861590	-191
1988	chr16	95861569	95861581	24007	191	chr16	95861569	95861581	117107	13.9358	+	-0.006067	0.009372	0.381054	95861569	95861760	-191
1989	chr19	21914271	21914283	24008	141	chr19	21914271	21914283	140970	14.9633	-	-0.008817	0.003724	0.572142	21914271	21914412	-141
1990	chr14	31404144	31404155	24010	133	chr14	31404144	31404155	83354	14.4483	+	0.136148	0.137778	0.974815	31404144	31404277	-133
1991	chr3	96580268	96580278	24011	141	chr3	96580268	96580278	178259	13.5455	-	0.029698	0.033939	0.717987	96580268	96580409	-141
1992	chr18	74740003	74740014	24018	134	chr18	74740003	74740014	136858	14.5172	-	0.082407	0.087873	0.957729	74740003	74740137	-134
1993	chr3	145798887	145798897	24020	167	chr3	145798887	145798897	184009	14.5909	-	0.015289	0.019580	0.802202	145798887	145799054	-167
1994	chr3	145798887	145798898	24020	167	chr3	145798887	145798898	184010	15.2759	+	0.014038	0.017939	0.773085	145798887	145799054	-167
1995	chr3	101507498	101507509	24028	174	chr3	101507498	101507509	178842	14.4483	+	0.035862	0.036364	0.894372	101507498	101507672	-174
1996	chr2	119251032	119251043	24029	148	chr2	119251032	119251043	159427	15.1552	+	-0.000419	0.012172	0.530666	119251032	119251180	-148
1997	chr15	85774617	85774627	24045	161	chr15	85774617	85774627	104220	14.6364	-	0.022734	0.022762	0.858381	85774617	85774778	-161
1998	chr15	85774617	85774628	24045	161	chr15	85774617	85774628	104221	15.5862	+	0.022096	0.022122	0.798481	85774617	85774778	-161
1999	chr16	95636821	95636831	24048	118	chr16	95636821	95636831	117091	13.7576	+	0.132582	0.135230	0.963767	95636821	95636939	-118
2000	chr11	117336375	117336385	24049	139	chr11	117336375	117336385	51614	14.0455	-	0.057277	0.057277	0.856385	117336375	117336514	-139
2001	chr11	117336375	117336386	24049	139	chr11	117336375	117336386	51615	15.9828	+	0.050633	0.052070	0.833818	117336375	117336514	-139
2002	chr11	117336420	117336431	24049	139	chr11	117336420	117336431	51616	15.3793	-	0.030477	0.034232	0.811284	117336420	117336559	-139
2003	chr11	117336421	117336431	24049	139	chr11	117336421	117336431	51617	14.0455	+	0.032955	0.037086	0.861280	117336421	117336560	-139
2004	chr9	21265174	21265185	24050	129	chr9	21265174	21265185	273094	14.4483	-	0.029732	0.038658	0.703107	21265174	21265303	-129
2005	chrX	103287148	103287160	24053	137	chrX	103287148	103287160	298244	14.9633	-	0.028596	0.031282	0.796116	103287148	103287285	-137
2006	chr9	23453429	23453440	24057	154	chr9	23453429	23453440	273328	14.6034	-	0.021790	0.025639	0.663244	23453429	23453583	-154
2007	chr9	23453430	23453440	24057	154	chr9	23453430	23453440	273329	13.7121	+	0.025294	0.028203	0.685594	23453430	23453584	-154
2008	chr11	86962179	86962189	24059	136	chr11	86962179	86962189	48397	13.5455	-	0.043214	0.045640	0.796347	86962179	86962315	-136
2009	chr15	89476382	89476392	24067	187	chr15	89476382	89476392	104606	13.7576	-	0.032280	0.032305	0.688963	89476382	89476569	-187
2010	chr15	89476382	89476393	24067	187	chr15	89476382	89476393	104607	15.0690	+	0.033723	0.033746	0.705416	89476382	89476569	-187
2011	chr18	38670122	38670133	24070	223	chr18	38670122	38670133	132619	14.4483	-	0.013610	0.015090	0.613286	38670122	38670345	-223
2012	chr19	40297635	40297645	24080	179	chr19	40297635	40297645	143127	13.7576	+	0.091819	0.091819	0.970569	40297635	40297814	-179
2013	chr10	69219198	69219209	24081	160	chr10	69219198	69219209	31563	15.6724	-	0.078809	0.078850	0.911700	69219198	69219358	-160
2014	chr10	69219199	69219209	24081	160	chr10	69219199	69219209	31564	14.0000	+	0.085234	0.085279	0.928227	69219199	69219359	-160

	motif	fimo	deeplift	avg_deeplift
0	Spi1	13.9878	0.587939	0.053449
1	Spi1	15.0000	1.235800	0.065042
2	Spi1	14.7753	0.065585	0.003858
3	Spi1	14.2121	0.042971	0.002686
4	Spi1	15.8902	0.103061	0.009369

	0	1	2	3	4	5	6	7	8
6	chr1	HAVANA	gene	3205901	3671498	.	-	.	gene_id "ENSMUSG00000051951.5"; gene_type "pro...
74	chr1	HAVANA	gene	4290846	4409241	.	-	.	gene_id "ENSMUSG00000025900.8"; gene_type "pro...
104	chr1	HAVANA	gene	4490931	4497354	.	-	.	gene_id "ENSMUSG00000025902.11"; gene_type "pr...
227	chr1	HAVANA	gene	4773206	4785739	.	-	.	gene_id "ENSMUSG00000033845.11"; gene_type "pr...
298	chr1	HAVANA	gene	4807788	4848410	.	+	.	gene_id "ENSMUSG00000025903.12"; gene_type "pr...