Goal

  • debug modisco-table
In [1]:
from basepair.imports import *
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-10-16 14:44:11,326 [WARNING] doc empty for the `info:` field
In [27]:
output_dir = '/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile'
modisco_dir = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile'
imp_scores =  '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/grad.all.h5'
In [28]:
report_url='http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html'
In [29]:
from basepair.modisco.table import ModiscoData, modisco_table, write_modisco_table
from basepair.modisco.motif_clustering import hirearchically_reorder_table
assert os.path.exists(output_dir)
In [10]:
data = ModiscoData.load(modisco_dir, imp_scores)
In [ ]:
data
In [15]:
data.mr.fpath
Out[15]:
PosixPath('/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/modisco.h5')
In [17]:
!cat /users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/kwargs.json
{"imp_scores": "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/grad.all.h5", "grad_type": "weighted", "output_dir": ".", "hparams": "modisco.yaml", "filter_npy": null, "skip_dist_filter": false, "use_all_seqlets": false, "max_strand_distance": 0.1, "gpu": "1"}
In [19]:
len(data.d['metadata']['interval_from_task'])
Out[19]:
98428
In [23]:
pd.Series(data.d['metadata']['interval_from_task'].count_values()
  File "<ipython-input-23-6afb8cfaaaa2>", line 1
    pd.Series(data.d['metadata']['interval_from_task'].count_values()
                                                                     ^
SyntaxError: unexpected EOF while parsing
In [26]:
df = modisco_table(data)
  1%|▏         | 2/142 [00:20<24:05, 10.33s/it]Exception ignored in: <bound method tqdm.__del__ of   0%|          | 0/142 [01:15<?, ?it/s]>
Traceback (most recent call last):
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tqdm/_tqdm.py", line 879, in __del__
    self.close()
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tqdm/_tqdm.py", line 1098, in close
    self._decr_instances(self)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tqdm/_tqdm.py", line 438, in _decr_instances
    cls._instances.remove(instance)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/_weakrefset.py", line 109, in remove
    self.data.remove(ref(item))
KeyError: <weakref at 0x7fa43c0225e8; to 'tqdm' at 0x7fa43e71acf8>
100%|██████████| 142/142 [21:35<00:00,  9.12s/it]
In [30]:
output_dir
Out[30]:
'/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile'
In [31]:
report_url
Out[31]:
'http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html'
In [32]:
data.tasks
Out[32]:
['Klf4', 'Nanog', 'Oct4', 'Sox2']
In [34]:
output_dir
Out[34]:
'/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile'
In [33]:
print("Writing the results")
write_modisco_table(df, output_dir, report_url, 'pattern_table')

print("Writing clustered table")
write_modisco_table(hirearchically_reorder_table(df, data.tasks), 
                    output_dir, report_url, 'pattern_table.sorted')
print("Done!")
Writing the results
Writing clustered table
Done!

Dump profiles as well

In [ ]:
data.get_peak_task_idx
In [38]:
profiles = OrderedDict([(pattern, {task: data.get_profile_wide(pattern, task).mean(axis=0)
                                   for task in data.tasks})
                        for pattern in data.mr.patterns()])
write_pkl(profiles, Path(output_dir) / 'footprints.pkl')