{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#load  tutorial utilities \n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ingesting data into tileDB "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from seqdataloader.dbingest import * "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The header of the input task file should contain (one or more) of the following fields: \n",
    "    * dataset (this one's required -- it's a unique label for your dataset) \n",
    "    * pval_bigwig \n",
    "    * fc_bigwig \n",
    "    * count_bigwig_plus_5p \n",
    "    * count_bigwig_minux_5p\n",
    "    * idr_peak\n",
    "    * overlap_peak \n",
    "    * ambig_peak \n",
    "    \n",
    "The file paths can be either local or web-based URL's. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cat: tasks.dbingest.tsv: No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "!cat tasks.dbingest.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can run the ingest code as a python function: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loaded tiledb metadata\n",
      "loaded chrom sizes\n",
      "tiledb group already exists\n",
      "got data dict\n",
      "parsed pool inputs\n",
      "made pool!\n",
      "warning: the array: hepg2_dnase_encode/ENCFF209DJG.chr21 already exists. You provided the --overwrite flag, so it will be updated/overwritten\n",
      "here\n",
      "store_summits:True\n",
      "summit_indicator:2\n",
      "got:idr_peak for chrom:chr21\n",
      "store_summits:False\n",
      "summit_indicator:None\n",
      "got:fc_bigwig for chrom:chr21\n",
      "store_summits:False\n",
      "summit_indicator:None\n",
      "got:ambig_peak for chrom:chr21\n",
      "starting to write output\n",
      "got cur vals\n",
      "idr_peak\n",
      "dict_to_write[key].shape:(46709983,)\n",
      "fc_bigwig\n",
      "dict_to_write[key].shape:(46709983,)\n",
      "ambig_peak\n",
      "dict_to_write[key].shape:(46709983,)\n",
      "updated data dict for writing\n",
      "finalizing the write\n",
      "0\n",
      "1000000\n",
      "2000000\n",
      "3000000\n",
      "4000000\n",
      "5000000\n",
      "6000000\n",
      "7000000\n",
      "8000000\n",
      "9000000\n",
      "10000000\n",
      "11000000\n",
      "12000000\n",
      "13000000\n",
      "14000000\n",
      "15000000\n",
      "16000000\n",
      "17000000\n",
      "18000000\n",
      "19000000\n",
      "20000000\n",
      "21000000\n",
      "22000000\n",
      "23000000\n",
      "24000000\n",
      "25000000\n",
      "26000000\n",
      "27000000\n",
      "28000000\n",
      "29000000\n",
      "30000000\n",
      "31000000\n",
      "32000000\n",
      "33000000\n",
      "34000000\n",
      "35000000\n",
      "36000000\n",
      "37000000\n",
      "38000000\n",
      "39000000\n",
      "40000000\n",
      "41000000\n",
      "42000000\n",
      "43000000\n",
      "44000000\n",
      "45000000\n",
      "46000000\n",
      "length of pool inputs:48\n",
      "made pool\n",
      "start:0, end:1000000\n",
      "start:1000000, end:2000000\n",
      "start:2000000, end:3000000\n",
      "start:3000000, end:4000000\n",
      "start:4000000, end:5000000\n",
      "start:5000000, end:6000000\n",
      "start:6000000, end:7000000\n",
      "start:7000000, end:8000000\n",
      "start:8000000, end:9000000\n",
      "start:9000000, end:10000000\n",
      "start:10000000, end:11000000\n",
      "start:11000000, end:12000000\n",
      "start:12000000, end:13000000\n",
      "start:13000000, end:14000000\n",
      "start:14000000, end:15000000\n",
      "start:15000000, end:16000000\n",
      "start:16000000, end:17000000\n",
      "start:17000000, end:18000000\n",
      "start:18000000, end:19000000\n",
      "start:19000000, end:20000000\n",
      "start:20000000, end:21000000\n",
      "start:21000000, end:22000000\n",
      "start:22000000, end:23000000\n",
      "start:23000000, end:24000000\n",
      "start:24000000, end:25000000\n",
      "start:25000000, end:26000000\n",
      "start:26000000, end:27000000\n",
      "start:27000000, end:28000000\n",
      "start:28000000, end:29000000\n",
      "start:29000000, end:30000000\n",
      "start:30000000, end:31000000\n",
      "start:31000000, end:32000000\n",
      "start:32000000, end:33000000\n",
      "start:33000000, end:34000000\n",
      "start:34000000, end:35000000\n",
      "start:35000000, end:36000000\n",
      "start:36000000, end:37000000\n",
      "start:37000000, end:38000000\n",
      "start:38000000, end:39000000\n",
      "start:39000000, end:40000000\n",
      "start:40000000, end:41000000\n",
      "start:41000000, end:42000000\n",
      "start:42000000, end:43000000\n",
      "start:43000000, end:44000000\n",
      "start:44000000, end:45000000\n",
      "start:45000000, end:46000000\n",
      "start:46000000, end:47000000\n",
      "start:47000000, end:46709983\n",
      "done writing\n",
      "wrote array to disk for dataset:hepg2_dnase_encode/ENCFF209DJG.chr21\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'done'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "args={\"tiledb_metadata\":\"tasks.dbingest.tsv\",\n",
    "      \"tiledb_group\":\"hepg2_dnase_encode\",\n",
    "     \"overwrite\":True,\n",
    "     \"chrom_sizes\":\"hg38.chrom21.sizes\",\n",
    "     \"chrom_threads\":1,\n",
    "     \"task_threads\":1,\n",
    "     \"write_threads\":1}\n",
    "\n",
    "ingest(args)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or you can run the code as a script: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cat: tasks.dbingest.local.tsv: No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "!cat ~/seqdataltasks.dbingest.local.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!db_ingest --tiledb_metadata tasks.dbingest.local.tsv \\\n",
    "    --tiledb_group hepg2_dnase_encode \\\n",
    "    --overwrite \\\n",
    "    --chrom_sizes hg38.chrom.sizes \\\n",
    "    --chrom_threads 25 \\\n",
    "    --attribute_config encode_pipeline \\\n",
    "    --tile_size 9000 \\\n",
    "    --batch_size 1000000\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "odict_keys(['pval_bigwig', 'fc_bigwig', 'count_bigwig_plus_5p', 'count_bigwig_minux_5p', 'idr_peak', 'overlap_peak', 'ambig_peak'])\n",
      "odict_keys(['pval_bigwig', 'fc_bigwig', 'count_bigwig_plus_5p', 'count_bigwig_minux_5p', 'idr_peak', 'overlap_peak', 'ambig_peak'])\n"
     ]
    }
   ],
   "source": [
    "#we can examine the array \n",
    "import tiledb \n",
    "data=tiledb.DenseArray(\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY.chr1\",'r')\n",
    "subset=data[30000000:31000000]\n",
    "print(subset.keys())\n",
    "data=tiledb.DenseArray(\"/mnt/data/tiledb/encode/dnase/ENCSR000EOY.chr21\",'r')\n",
    "subset=data[30000000:31000000]\n",
    "print(subset.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00303804, 0.00303804, 0.00303804, 0.00303804, 0.00303804,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00607608, 0.00607608, 0.00607608, 0.00607608, 0.00607608,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.00911412, 0.00911412, 0.00911412, 0.00911412, 0.00911412,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ,\n",
       "       0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 , 0.0121522 ],\n",
       "      dtype=float32)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset['fc_bigwig'][0:1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset['idr_peak'][0:1000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Genomewide classification labels "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from seqdataloader.labelgen import *\n",
    "classification_params={\n",
    "    'task_list':\"tasks.labelgen.tsv\",\n",
    "    'outf':\"classificationlabels.SummitWithin200bpCenter.tsv.gz\",\n",
    "    'output_type':'gzip',\n",
    "    'chrom_sizes':'hg38.chrom.sizes',\n",
    "    'chroms_to_keep':['chr21'],\n",
    "    \"store_positives_only\":True,\n",
    "    'bin_stride':50,\n",
    "    'left_flank':400,\n",
    "    'right_flank':400,\n",
    "    'bin_size':200,\n",
    "    'task_threads':10,\n",
    "    'chrom_threads':4,\n",
    "    'allow_ambiguous':True,\n",
    "    'labeling_approach':'peak_summit_in_bin_classification'\n",
    "    }\n",
    "genomewide_labels(classification_params)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Genomewide regression labels "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regression_params={\n",
    "    'task_list':\"tasks.labelgen.tsv\",\n",
    "    'outf':\"regressionlabels.all_genome_bins_regression.hdf5\",\n",
    "    'output_type':'hdf5',\n",
    "    'chrom_sizes':'hg38.chrom.sizes',\n",
    "    'store_values_above_thresh': 0,\n",
    "    'chroms_to_keep':['chr21'],\n",
    "    'bin_stride':50,\n",
    "    'left_flank':400,\n",
    "    'right_flank':400,\n",
    "    'bin_size':200,\n",
    "    'threads':10,\n",
    "    'subthreads':4,\n",
    "    'labeling_approach':'all_genome_bins_regression'\n",
    "    }\n",
    "genomewide_labels(regression_params)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "let's examine the output dataframe for the regression case: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regression_data=pd.read_hdf(\"regressionlabels.all_genome_bins_regression.hdf5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regression_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regression_negatives=pd.read_hdf(\"universal_negatives.regressionlabels.all_genome_bins_regression.hdf5\")\n",
    "regression_negatives.head"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "for the classification case, we specified \"store_positives_only\", so the script generated two dataframes: \n",
    "    * Universal negatives \n",
    "    * Dataframe where each bin is >0 for at least one task "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classification_pos=pd.read_csv(\"classificationlabels.SummitWithin200bpCenter.tsv.gz\",sep='\\t',header=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classification_pos.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classification_neg=pd.read_csv(\"universal_negatives.classificationlabels.SummitWithin200bpCenter.tsv.gz\",sep='\\t',header=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classification_neg.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
