Goal

  • Generate the activity table for Julia.

Tasks

  • [ ]

Required files

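  • instances.parq (in the modisco directory, .../modisco/all/deeplift/profile/)
  • perturbation-analysis/pair.total_counts.csv.gz (in the same modisco directory)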

In [1]:
# Imports
# NOTE(review): the star import hides where names come from. Later cells use
# `pd`, `np`, `hv`, `Path` and `ddir` without importing them, so they are
# presumably provided here — TODO confirm and replace with explicit imports.
from basepair.imports import *
# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Select the bokeh plotting backend (assumes `hv` is holoviews — confirm).
hv.extension('bokeh')
Using TensorFlow backend.
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/concise/utils/plot.py:115: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/concise/utils/plot.py:116: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)
In [8]:
# Common paths
# `ddir` (data root) and `Path` must already be in scope; they are presumably
# supplied by the star import in the first cell — TODO confirm.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# Location of the modisco results below the model directory.
modisco_dir = model_dir.joinpath("modisco/all/deeplift/profile/")
# Directory holding the perturbation-analysis files read by the cells below.
dataset_dir = modisco_dir.joinpath('perturbation-analysis')
In [6]:
# Project helpers for motif instances, paper configuration and motif-pair spacing.
# NOTE(review): these imports live outside the top "# Imports" cell; consider
# moving them there so the notebook's dependencies are visible in one place.
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, align_instance_center
from basepair.exp.paper.config import motifs, profile_mapping
from basepair.utils import flatten
from kipoi.writers import HDF5BatchWriter
from basepair.exp.chipnexus.spacing import motif_pair_dfi, plot_spacing, get_motif_pairs

# Motif pairs derived from the configured `motifs`; the exact pairing rule is
# defined in get_motif_pairs and is not visible in this notebook.
pairs = get_motif_pairs(motifs)
In [10]:
# Read pair.total_counts.csv.gz from the perturbation-analysis directory.
pair_counts_path = dataset_dir / 'pair.total_counts.csv.gz'
dfs = pd.read_csv(pair_counts_path)
In [12]:
# List the files in the perturbation-analysis directory (IPython `ls` automagic;
# `{dataset_dir}` is interpolated from the Python variable).
ls {dataset_dir}
dfab.csv.gz        dfs.csv.gz                        pair.total_counts.csv.gz
dfabf.csv.gz       double_mut.h5                     ref.h5
dfabf.feather      motif_pair_lpdata.incl-whole.pkl  single_mut.h5
dfabf.parq         motif_pair_lpdata.pkl
dfi_subset.csv.gz  pair.total_counts.csv
In [11]:
# Preview the first rows of the pair total-counts table loaded into `dfs`.
dfs.head()
Out[11]:
Wt_obs Wt dA dB dAB motif_pair task center_diff strand_combination
0 16198.0 58234.414 28042.203 38972.977 23831.658 Oct4-Sox2<>Oct4-Sox2 Oct4 21.0 ++
1 16198.0 58234.414 28042.203 17461.543 9037.368 Oct4-Sox2<>Oct4-Sox2 Oct4 62.0 ++
2 16198.0 58234.414 28042.203 37178.540 22600.363 Oct4-Sox2<>Oct4-Sox2 Oct4 83.0 ++
3 16198.0 58234.414 28042.203 23178.790 13922.051 Oct4-Sox2<>Oct4-Sox2 Oct4 104.0 ++
4 16198.0 58234.414 38972.977 17461.543 9234.978 Oct4-Sox2<>Oct4-Sox2 Oct4 41.0 ++
In [7]:
# Load the motif instances from instances.parq (dedup disabled) and pass them
# through filter_nonoverlapping_intervals; the filtered frame is kept as `dfi`.
# NOTE: load_instances reads the file with pd.read_parquet(engine='fastparquet'),
# which requires fastparquet >= 0.2.1 to be installed in the environment.
instances_path = modisco_dir / 'instances.parq'
dfi_all = load_instances(instances_path, motifs=motifs, dedup=False)
dfi = filter_nonoverlapping_intervals(dfi_all)
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-7-2ce577a949aa> in <module>
----> 1 dfi = load_instances(modisco_dir / 'instances.parq', motifs=motifs, dedup=False)
      2 dfi = filter_nonoverlapping_intervals(dfi)

~/workspace/basepair/basepair/modisco/pattern_instances.py in load_instances(parq_file, motifs, dedup)
     27         dfi = parq_file
     28     else:
---> 29         dfi = pd.read_parquet(str(parq_file), engine='fastparquet')
     30 
     31 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, **kwargs)
    279     """
    280 
--> 281     impl = get_engine(engine)
    282     return impl.read(path, columns=columns, **kwargs)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/io/parquet.py in get_engine(engine)
     41         return PyArrowImpl()
     42     elif engine == 'fastparquet':
---> 43         return FastParquetImpl()
     44 
     45 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/io/parquet.py in __init__(self)
    154         if LooseVersion(fastparquet.__version__) < '0.2.1':
    155             raise ImportError(
--> 156                 "fastparquet >= 0.2.1 is required for parquet "
    157                 "support\n\n"
    158                 "you can install via conda\n"

ImportError: fastparquet >= 0.2.1 is required for parquet support

you can install via conda
conda install fastparquet -c conda-forge

or via pip
pip install -U fastparquet
In [ ]:
# create_tf_session(0)
In [12]:
# Get the interesting motif locations: instances with match_weighted_p > 0.2
# and imp_weighted_p > 0.
# `row_idx` (0..n-1 position within the subset) is added with .assign() inside
# the chain instead of by item assignment on the filtered frame; assigning a
# column to a frame derived from `dfi` via .query() triggers pandas'
# SettingWithCopyWarning. .assign() returns a new frame and leaves `dfi` untouched.
dfi_subset = (dfi.query('match_weighted_p > 0.2')
                 .query('imp_weighted_p > 0')
                 .assign(row_idx=lambda df: np.arange(len(df), dtype=int)))