from basepair.imports import *
# get NRLB PSAMs
import csv
import numpy as np
# Parse the NRLB config CSV: one row per fitted model, holding the consensus
# sequence (column 24) and a flat run of PSAM coefficients starting at
# column 26 and ending at the 'NSB>' sentinel column.
oct4_selex_psams = []
consensus_seqs = []
with open('POU5F1-NRLBConfig.csv', 'r') as csv_file:
    reader = csv.reader(csv_file)
    next(reader, None)  # drop the header row
    for row in reader:
        consensus_seqs.append(row[24])
        col = 26
        flat_coeffs = []
        while row[col] != 'NSB>':
            flat_coeffs.append(float(row[col]))
            col += 1
        # 4 coefficients per motif position -> (positions, A/C/G/T) matrix
        psam = np.array(flat_coeffs).reshape((col - 26) // 4, 4)
        oct4_selex_psams.append(psam)
oct4_selex_psams = np.array(oct4_selex_psams)
# one hot encode full peak
import os
def one_hot_encode_along_channel_axis(sequence):
    """One-hot encode a DNA string into a (len(sequence), 4) int8 array.

    Channels along axis 1 are A, C, G, T (see seq_to_one_hot_fill_in_array);
    'N' positions are left as all zeros.
    """
    encoded = np.zeros((len(sequence), 4), dtype=np.int8)
    seq_to_one_hot_fill_in_array(zeros_array=encoded,
                                 sequence=sequence,
                                 one_hot_axis=1)
    return encoded
def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write a one-hot encoding of `sequence` into `zeros_array` in place.

    Channel order is A=0, C=1, G=2, T=3 (case-insensitive). 'N'/'n'
    positions are skipped, leaving that position all zeros. Any other
    character raises RuntimeError. `one_hot_axis` selects whether the
    4-channel axis is axis 0 (shape (4, len)) or axis 1 (shape (len, 4)).
    """
    assert one_hot_axis in (0, 1)
    # the non-channel axis must match the sequence length
    assert zeros_array.shape[1 - one_hot_axis] == len(sequence)
    base_to_channel = {"A": 0, "C": 1, "G": 2, "T": 3}
    for pos, base in enumerate(sequence):
        upper = base.upper()
        if upper == "N":
            continue  # ambiguous base: leave that position as all 0's
        channel = base_to_channel.get(upper)
        if channel is None:
            raise RuntimeError("Unsupported character: "+str(base))
        if one_hot_axis == 0:
            zeros_array[channel, pos] = 1
        else:
            zeros_array[pos, channel] = 1
# One-hot encode every full peak sequence, skipping FASTA header lines.
with open("oct4_peaks.fa.out") as peaks_file:
    full_peaks = [one_hot_encode_along_channel_axis(record.strip())
                  for record in peaks_file
                  if ">" not in record]
# One-hot encode the modisco seqlets for each pattern of interest.
seqlets_dir = "oct4_seqlets/"
patterns = ["metacluster_0_pattern_18",
            "metacluster_3_pattern_10",
            "metacluster_12_pattern_2",
            "metacluster_9_pattern_4"]
# Links to the modisco report entry for each pattern above (same order).
pattern_info = ["http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html#metacluster_0/pattern_18",
                "http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html#metacluster_3/pattern_10",
                "http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html#metacluster_12/pattern_2",
                "http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html#metacluster_9/pattern_4"]
patterns_to_seqlets = {}
for pattern in patterns:
    with open(seqlets_dir + pattern + ".fa.out") as seq_f:
        # Skip FASTA headers; encode each sequence line.
        patterns_to_seqlets[pattern] = [
            one_hot_encode_along_channel_axis(record.strip())
            for record in seq_f
            if ">" not in record
        ]
# they see me rollin
def rolling_window(a, window):
    """Return a zero-copy view of `a` with sliding windows over the last axis.

    For input shape (..., n) the result has shape (..., n - window + 1, window),
    where result[..., i, :] is a[..., i:i + window]. Built with as_strided, so
    the view aliases `a`'s memory — read-only use is expected.
    """
    n_windows = a.shape[-1] - window + 1
    view_shape = a.shape[:-1] + (n_windows, window)
    view_strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=view_shape,
                                           strides=view_strides)
# plots
import seaborn as sns
import matplotlib.pyplot as plt
from concise.utils.plot import seqlogo_fig, seqlogo
from modisco.visualization import viz_sequence
def _viz_masked_subseqlets(seqlet_indices, seqlets, match_starts,
                           match_orientations, psam):
    """Plot the PSAM-weighted best-matching window of each chosen seqlet.

    Shared by the lowest/highest-affinity visualizations below (the two
    loops were previously duplicated verbatim). For each index: cut out the
    len(psam)-long window where the PSAM matched best, reverse-complement it
    if the reverse orientation won, then plot the window multiplied
    elementwise by the PSAM.
    """
    for seqlet_idx in seqlet_indices:
        the_seqlet = seqlets[seqlet_idx]
        start = match_starts[seqlet_idx]
        best_matching_subseqlet = the_seqlet[start:start + len(psam)]
        if match_orientations[seqlet_idx] == 1:
            # [::-1, ::-1] flips both the position and channel axes of the
            # one-hot window, i.e. takes its reverse complement
            best_matching_subseqlet = best_matching_subseqlet[::-1, ::-1]
        masked_best_matching_subseqlet = best_matching_subseqlet * psam
        print("PSAM masked by best matching subseqlet")
        viz_sequence.plot_weights(masked_best_matching_subseqlet,
                                  figsize=(10, 3))

n_to_viz = 3  # seqlets to show from each end of the affinity ranking
for idx, psam in enumerate(oct4_selex_psams):
    print("PSAM for Consensus: "+consensus_seqs[idx])
    viz_sequence.plot_weights(psam, figsize=(10, 3))
    # Forward-strand PSAM affinity at every offset of every full peak:
    # rolling_window slides a len(psam)-wide window along the sequence and
    # summing over channel + window position scores each placement.
    peak_affinities = []
    for peak in full_peaks:
        peak_affinities.append(
            np.sum(np.multiply(rolling_window(peak.T, len(psam)),
                               psam.T[:, None, :]), axis=(0, 2)))
    peak_affinities = np.array(peak_affinities).flatten()
    for idy, pattern in enumerate(patterns):
        print("Modisco "+pattern+": "+pattern_info[idy])
        seqlet_affinities = []
        seqlet_affinities_argmax = []
        seqlet_affinities_orientation = []
        for seqlet in patterns_to_seqlets[pattern]:
            # Score the seqlet against the PSAM on both strands;
            # psam[::-1, ::-1] is the reverse-complement PSAM.
            per_position_affinities_fwd = np.sum(
                np.multiply(rolling_window(seqlet.T, len(psam)),
                            psam.T[:, None, :]), axis=(0, 2))
            per_position_affinities_rev = np.sum(
                np.multiply(rolling_window(seqlet.T, len(psam)),
                            psam[::-1, ::-1].T[:, None, :]), axis=(0, 2))
            stacked = np.array([per_position_affinities_fwd,
                                per_position_affinities_rev])
            per_position_affinities = np.max(stacked, axis=0)
            # 0 where forward wins (ties included), 1 where reverse wins
            per_position_orientation = np.argmax(stacked, axis=0)
            argmax_pos = np.argmax(per_position_affinities)
            seqlet_affinities_argmax.append(argmax_pos)
            seqlet_affinities_orientation.append(
                per_position_orientation[argmax_pos])
            seqlet_affinities.append(per_position_affinities[argmax_pos])
        seqlet_affinities = np.array(seqlet_affinities).flatten()
        # Rank seqlets by best affinity so we can inspect both extremes.
        sorted_seqlet_indices = [
            i for i, _ in sorted(enumerate(seqlet_affinities),
                                 key=lambda x: x[1])]
        print("lowest affinities seqlets incoming")
        _viz_masked_subseqlets(sorted_seqlet_indices[:n_to_viz],
                               patterns_to_seqlets[pattern],
                               seqlet_affinities_argmax,
                               seqlet_affinities_orientation, psam)
        print("highest affinities seqlets incoming")
        _viz_masked_subseqlets(sorted_seqlet_indices[-n_to_viz:],
                               patterns_to_seqlets[pattern],
                               seqlet_affinities_argmax,
                               seqlet_affinities_orientation, psam)
        print("histogram incoming")
        sns.distplot(peak_affinities, bins='auto', label='full_peak_affinities')
        sns.distplot(seqlet_affinities, bins='auto', label='seqlet_affinities')
        plt.legend(loc='upper left')
        plt.title("histogram of affinities and seqlet for pattern: "+pattern)
        plt.show()
        print("*********************************************************************************************************************")
# Paths to the modisco results and the importance (gradient) scores.
# (Plain strings: the previous f-string prefixes had no placeholders.)
modisco_dir = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/"
imp_scores = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/grad.all.h5"
import h5py
# Open read-only: calling h5py.File without an explicit mode is deprecated,
# and old h5py defaulted to append mode ('a'), which takes a write lock on a
# shared file. Reuse imp_scores instead of duplicating the path literal.
f = h5py.File(imp_scores, "r")
# Notebook-style exploration of the HDF5 layout (bare expressions display
# their value in a Jupyter cell; as a plain script they are no-ops).
list(f.keys())
list(f['metadata'].keys())
len(list(f['metadata']['range']['chr'][:]))
#(1) print out the list of intervals corresponding to the gradient arrays
#(2) Do bedtools intersect with the seqlet intervals to figure out which larger interval each seqlet interval originates from
#(3) Create a dictionary that maps the larger interval to the gradient array, and another dictionary that maps the seqlet interval to the larger interval; link those two
# NOTE(review): the line below is IPython shell magic, not valid Python —
# this file is a notebook export and only runs inside an IPython/Jupyter
# session.
!ls