from basepair.imports import *
# get NRLB PSAMs
import csv
import numpy as np
# Parse the NRLB config CSV: one row per fitted model, holding the consensus
# sequence (column 24) and a flat run of PSAM coefficients starting at
# column 26 and ending at the 'NSB>' sentinel column.
oct4_selex_psams = []
consensus_seqs = []
with open('POU5F1-NRLBConfig.csv', 'r') as csv_file:
    reader = csv.reader(csv_file)
    next(reader, None)  # drop the header row
    for row in reader:
        consensus_seqs.append(row[24])
        col = 26
        flat_coeffs = []
        while row[col] != 'NSB>':
            flat_coeffs.append(float(row[col]))
            col += 1
        # 4 coefficients per motif position -> (positions, A/C/G/T) matrix
        psam = np.array(flat_coeffs).reshape((col - 26) // 4, 4)
        oct4_selex_psams.append(psam)
oct4_selex_psams = np.array(oct4_selex_psams)
# one hot encode full peak
import os
def one_hot_encode_along_channel_axis(sequence):
    """One-hot encode a DNA string into a (len(sequence), 4) int8 array.

    Channels along axis 1 are A, C, G, T (see seq_to_one_hot_fill_in_array);
    'N' positions are left as all zeros.
    """
    encoded = np.zeros((len(sequence), 4), dtype=np.int8)
    seq_to_one_hot_fill_in_array(zeros_array=encoded,
                                 sequence=sequence,
                                 one_hot_axis=1)
    return encoded
def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write a one-hot encoding of `sequence` into `zeros_array` in place.

    Channel order is A=0, C=1, G=2, T=3 (case-insensitive). 'N'/'n'
    positions are skipped, leaving that position all zeros. Any other
    character raises RuntimeError. `one_hot_axis` selects whether the
    4-channel axis is axis 0 (shape (4, len)) or axis 1 (shape (len, 4)).
    """
    assert one_hot_axis in (0, 1)
    # the non-channel axis must match the sequence length
    assert zeros_array.shape[1 - one_hot_axis] == len(sequence)
    base_to_channel = {"A": 0, "C": 1, "G": 2, "T": 3}
    for pos, base in enumerate(sequence):
        upper = base.upper()
        if upper == "N":
            continue  # ambiguous base: leave that position as all 0's
        channel = base_to_channel.get(upper)
        if channel is None:
            raise RuntimeError("Unsupported character: "+str(base))
        if one_hot_axis == 0:
            zeros_array[channel, pos] = 1
        else:
            zeros_array[pos, channel] = 1
# One-hot encode every full peak sequence, skipping FASTA header lines.
with open("oct4_peaks.fa.out") as peaks_file:
    full_peaks = [one_hot_encode_along_channel_axis(record.strip())
                  for record in peaks_file
                  if ">" not in record]
# One-hot encode the modisco seqlets for each pattern of interest.
seqlets_dir = "oct4_seqlets/"
patterns = ["metacluster_0_pattern_18",
            "metacluster_3_pattern_10",
            "metacluster_12_pattern_2",
            "metacluster_9_pattern_4"]
# Links to the modisco report entry for each pattern above (same order).
pattern_info = ["http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html#metacluster_0/pattern_18",
                "http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html#metacluster_3/pattern_10",
                "http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html#metacluster_12/pattern_2",
                "http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/results.html#metacluster_9/pattern_4"]
patterns_to_seqlets = {}
for pattern in patterns:
    with open(seqlets_dir + pattern + ".fa.out") as seq_f:
        # Skip FASTA headers; encode each sequence line.
        patterns_to_seqlets[pattern] = [
            one_hot_encode_along_channel_axis(record.strip())
            for record in seq_f
            if ">" not in record
        ]
# they see me rollin
def rolling_window(a, window):
    """Return a zero-copy view of `a` with sliding windows over the last axis.

    For input shape (..., n) the result has shape (..., n - window + 1, window),
    where result[..., i, :] is a[..., i:i + window]. Built with as_strided, so
    the view aliases `a`'s memory — read-only use is expected.
    """
    n_windows = a.shape[-1] - window + 1
    view_shape = a.shape[:-1] + (n_windows, window)
    view_strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=view_shape,
                                           strides=view_strides)
# plots
import seaborn as sns
import matplotlib.pyplot as plt
from concise.utils.plot import seqlogo_fig, seqlogo
from modisco.visualization import viz_sequence
def _viz_masked_subseqlets(seqlet_indices, seqlets, match_starts,
                           match_orientations, psam):
    """Plot the PSAM-weighted best-matching window of each chosen seqlet.

    Shared by the lowest/highest-affinity visualizations below (the two
    loops were previously duplicated verbatim). For each index: cut out the
    len(psam)-long window where the PSAM matched best, reverse-complement it
    if the reverse orientation won, then plot the window multiplied
    elementwise by the PSAM.
    """
    for seqlet_idx in seqlet_indices:
        the_seqlet = seqlets[seqlet_idx]
        start = match_starts[seqlet_idx]
        best_matching_subseqlet = the_seqlet[start:start + len(psam)]
        if match_orientations[seqlet_idx] == 1:
            # [::-1, ::-1] flips both the position and channel axes of the
            # one-hot window, i.e. takes its reverse complement
            best_matching_subseqlet = best_matching_subseqlet[::-1, ::-1]
        masked_best_matching_subseqlet = best_matching_subseqlet * psam
        print("PSAM masked by best matching subseqlet")
        viz_sequence.plot_weights(masked_best_matching_subseqlet,
                                  figsize=(10, 3))

n_to_viz = 3  # seqlets to show from each end of the affinity ranking
for idx, psam in enumerate(oct4_selex_psams):
    print("PSAM for Consensus: "+consensus_seqs[idx])
    viz_sequence.plot_weights(psam, figsize=(10, 3))
    # Forward-strand PSAM affinity at every offset of every full peak:
    # rolling_window slides a len(psam)-wide window along the sequence and
    # summing over channel + window position scores each placement.
    peak_affinities = []
    for peak in full_peaks:
        peak_affinities.append(
            np.sum(np.multiply(rolling_window(peak.T, len(psam)),
                               psam.T[:, None, :]), axis=(0, 2)))
    peak_affinities = np.array(peak_affinities).flatten()
    for idy, pattern in enumerate(patterns):
        print("Modisco "+pattern+": "+pattern_info[idy])
        seqlet_affinities = []
        seqlet_affinities_argmax = []
        seqlet_affinities_orientation = []
        for seqlet in patterns_to_seqlets[pattern]:
            # Score the seqlet against the PSAM on both strands;
            # psam[::-1, ::-1] is the reverse-complement PSAM.
            per_position_affinities_fwd = np.sum(
                np.multiply(rolling_window(seqlet.T, len(psam)),
                            psam.T[:, None, :]), axis=(0, 2))
            per_position_affinities_rev = np.sum(
                np.multiply(rolling_window(seqlet.T, len(psam)),
                            psam[::-1, ::-1].T[:, None, :]), axis=(0, 2))
            stacked = np.array([per_position_affinities_fwd,
                                per_position_affinities_rev])
            per_position_affinities = np.max(stacked, axis=0)
            # 0 where forward wins (ties included), 1 where reverse wins
            per_position_orientation = np.argmax(stacked, axis=0)
            argmax_pos = np.argmax(per_position_affinities)
            seqlet_affinities_argmax.append(argmax_pos)
            seqlet_affinities_orientation.append(
                per_position_orientation[argmax_pos])
            seqlet_affinities.append(per_position_affinities[argmax_pos])
        seqlet_affinities = np.array(seqlet_affinities).flatten()
        # Rank seqlets by best affinity so we can inspect both extremes.
        sorted_seqlet_indices = [
            i for i, _ in sorted(enumerate(seqlet_affinities),
                                 key=lambda x: x[1])]
        print("lowest affinities seqlets incoming")
        _viz_masked_subseqlets(sorted_seqlet_indices[:n_to_viz],
                               patterns_to_seqlets[pattern],
                               seqlet_affinities_argmax,
                               seqlet_affinities_orientation, psam)
        print("highest affinities seqlets incoming")
        _viz_masked_subseqlets(sorted_seqlet_indices[-n_to_viz:],
                               patterns_to_seqlets[pattern],
                               seqlet_affinities_argmax,
                               seqlet_affinities_orientation, psam)
        print("histogram incoming")
        sns.distplot(peak_affinities, bins='auto', label='full_peak_affinities')
        sns.distplot(seqlet_affinities, bins='auto', label='seqlet_affinities')
        plt.legend(loc='upper left')
        plt.title("histogram of affinities and seqlet for pattern: "+pattern)
        plt.show()
        print("*********************************************************************************************************************")
# Paths to the modisco results and the importance (gradient) scores.
# (Plain strings: the previous f-string prefixes had no placeholders.)
modisco_dir = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/"
imp_scores = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/grad.all.h5"
import h5py
# Open read-only: calling h5py.File without an explicit mode is deprecated,
# and old h5py defaulted to append mode ('a'), which takes a write lock on a
# shared file. Reuse imp_scores instead of duplicating the path literal.
f = h5py.File(imp_scores, "r")
# Notebook-style exploration of the HDF5 layout (bare expressions display
# their value in a Jupyter cell; as a plain script they are no-ops).
list(f.keys())
list(f['metadata'].keys())
len(list(f['metadata']['range']['chr'][:]))
#(1) print out the list of intervals corresponding to the gradient arrays
#(2) Do bedtools intersect with the seqlet intervals to figure out which larger interval each seqlet interval originates from
#(3) Create a dictionary that maps the larger interval to the gradient array, and another dictionary that maps the seqlet interval to the larger interval; link those two
# NOTE(review): the line below is IPython shell magic, not valid Python —
# this file is a notebook export and only runs inside an IPython/Jupyter
# session.
!ls