def get_sox2_data():
    """Load the Sox2 peak table together with sequences and strand-wise signal.

    Reads the summit-centered peak FASTA and the matching narrowPeak file
    located under ``ddir``, and for every peak interval fetches the per-base
    values from the positive- and negative-strand bigWig tracks.

    Returns:
        pandas.DataFrame with one row per interval of the narrowPeak file and
        the columns:

        - ``chr``, ``start``, ``end``: interval coordinates as reported by
          the BedTool interval (``chrom``, ``start``, ``stop``).
        - ``cuts_pos``, ``cuts_neg``: numpy arrays returned by
          ``pyBigWig`` ``values(..., numpy=True)`` for the interval on the
          positive / negative strand track.
        - ``seq``: upper-cased sequence taken from the FASTA file.
        - ``seq_id``: the corresponding key of the FASTA mapping.

    Raises:
        AssertionError: if the FASTA and the narrowPeak file do not contain
            the same number of records.
    """
    from contextlib import closing

    from concise.utils.fasta import read_fasta
    import pyBigWig

    fas = read_fasta(f"{ddir}/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.fasta")
    bed = BedTool(f"{ddir}/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak")

    # Sequences are attached to intervals purely by position below, so the
    # two files must at least agree on the number of records. Raised
    # explicitly (instead of a bare `assert`) so the check survives `python -O`.
    if len(fas) != len(bed):
        raise AssertionError(
            f"FASTA has {len(fas)} records but the narrowPeak file has "
            f"{len(bed)} intervals"
        )

    rows = []
    # `closing` guarantees both bigWig handles are released even if reading
    # one of the intervals fails.
    with closing(pyBigWig.open(f"{ddir}/raw/chipnexus/mnt_jzeitlinger_collab/analysis/sox2_pooled_reps_1b_2b_4b.pos_strand.bw")) as cuts_pos, \
            closing(pyBigWig.open(f"{ddir}/raw/chipnexus/mnt_jzeitlinger_collab/analysis/sox2_pooled_reps_1b_2b_4b.neg_strand.bw")) as cuts_neg:
        for interval in tqdm(bed):
            rows.append({
                "chr": interval.chrom,
                "start": interval.start,
                "end": interval.stop,
                "cuts_pos": cuts_pos.values(interval.chrom, interval.start, interval.stop, numpy=True),
                "cuts_neg": cuts_neg.values(interval.chrom, interval.start, interval.stop, numpy=True),
            })

    dfc = pd.DataFrame(rows)
    # NOTE(review): assumes the FASTA records are stored in the same order as
    # the narrowPeak intervals -- only the record count is verified above.
    dfc['seq'] = list(fas.values())
    dfc['seq_id'] = list(fas)
    dfc['seq'] = dfc.seq.str.upper()
    return dfc
def seq_inp_exo_out(valid_chr=('chr2', 'chr3', 'chr4'),
                    test_chr=('chr1', 'chr8', 'chr9')):
    """Build chromosome-based train/valid/test splits of the Sox2 data.

    Loads the table from ``get_sox2_data()``, encodes the sequences with
    ``encodeDNA`` and stacks the per-interval strand signals into a single
    array whose last axis holds the two strands (index 0: ``cuts_pos``,
    index 1: ``cuts_neg``).

    Args:
        valid_chr: list-like of chromosome names assigned to the validation
            set.
        test_chr: list-like of chromosome names assigned to the test set.
            Rows whose chromosome is in neither collection form the
            training set.

    Returns:
        Three tuples ``(train, valid, test)``; each is
        ``(seq, cuts, dfc)`` restricted to the rows of that split, where
        ``seq`` is the ``encodeDNA`` output, ``cuts`` the stacked strand
        signal and ``dfc`` the corresponding slice of the data frame.
    """
    dfc = get_sox2_data()
    seq = encodeDNA(dfc.seq)

    # np.stack needs every per-interval array to have the same length.
    # NOTE(review): presumably guaranteed by the fixed-width (200bp) peak
    # file -- confirm if the input files change.
    cuts_pos = np.stack(dfc.cuts_pos)
    cuts_neg = np.stack(dfc.cuts_neg)
    cuts = np.stack([cuts_pos, cuts_neg], axis=-1)

    # Plain boolean numpy masks: they index both the numpy arrays and the
    # data frame positionally.
    is_test = dfc.chr.isin(test_chr).values
    is_valid = dfc.chr.isin(valid_chr).values
    is_train = ~(is_test | is_valid)

    return (seq[is_train], cuts[is_train], dfc[is_train]), \
        (seq[is_valid], cuts[is_valid], dfc[is_valid]), \
        (seq[is_test], cuts[is_test], dfc[is_test])