Goal

  • [ ] Prototype wide input data
In [3]:
import pandas as pd
import numpy as np
from pybedtools import BedTool
from basepair.config import get_data_dir
from tqdm import tqdm
from concise.preprocessing import encodeDNA

ddir = get_data_dir()
In [18]:
def get_sox2_data():
    """Loads the dataframe for sox2
    """
    from concise.utils.fasta import read_fasta
    import pyBigWig
    
    fas = read_fasta(f"{ddir}/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.fasta")

    bed = BedTool(f"{ddir}/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak")

    assert len(fas) == len(bed)

    cuts_pos = pyBigWig.open(f"{ddir}/raw/chipnexus/mnt_jzeitlinger_collab/analysis/sox2_pooled_reps_1b_2b_4b.pos_strand.bw")
    cuts_neg = pyBigWig.open(f"{ddir}/raw/chipnexus/mnt_jzeitlinger_collab/analysis/sox2_pooled_reps_1b_2b_4b.neg_strand.bw")

    #cuts_pos = []
    #cuts_neg = []
    l = []
    for interval in tqdm(bed):
        l.append({"chr": interval.chrom,
                  "start": interval.start,
                  "end": interval.stop,
                  "cuts_pos": cuts_pos.values(interval.chrom, interval.start, interval.stop, numpy=True),
                  "cuts_neg": cuts_neg.values(interval.chrom, interval.start, interval.stop, numpy=True),
                 })

    dfc = pd.DataFrame(l)

    dfc['seq'] = list(fas.values())
    dfc['seq_id'] = list(fas)

    dfc['seq'] = dfc.seq.str.upper()
    return dfc


def seq_inp_exo_out(valid_chr=['chr2', 'chr3', 'chr4'], 
                    test_chr=['chr1', 'chr8', 'chr9']):
    dfc = get_sox2_data()
    seq = encodeDNA(dfc.seq)
    cuts_pos = np.stack(dfc.cuts_pos)
    cuts_neg = np.stack(dfc.cuts_neg)
    ids = dfc.seq_id
    is_test = dfc.chr.isin(test_chr)
    is_valid = dfc.chr.isin(valid_chr)
    is_train = (~is_test) & (~is_valid)
    
    cuts = np.stack([cuts_pos, cuts_neg], axis=-1)
    
    return (seq[is_train], cuts[is_train], dfc[is_train]), \
           (seq[is_valid], cuts[is_valid], dfc[is_valid]), \
           (seq[is_test], cuts[is_test], dfc[is_test])
In [19]:
train, valid, test = seq_inp_exo_out()
100%|██████████| 9396/9396 [00:02<00:00, 3518.12it/s]
In [20]:
len(train[1])/len(dfc)
Out[20]:
0.5918475947211579
In [21]:
len(valid[1])/len(dfc)
Out[21]:
0.20051085568326948
In [22]:
len(test[1])/len(dfc)
Out[22]:
0.20764154959557257
In [5]:
dfc = get_sox2_data()
100%|██████████| 9396/9396 [00:47<00:00, 195.80it/s]
In [11]:
dfc.chr.value_counts() / len(dfc)
Out[11]:
chr8     0.087484
chr4     0.078970
chr1     0.072584
chr2     0.072371
chr11    0.061728
chr5     0.061303
chr13    0.060345
chr6     0.058110
chr10    0.054491
chr3     0.049170
chr9     0.047573
chr7     0.044274
chr17    0.039911
chr14    0.038740
chr12    0.035441
chr15    0.034483
chr16    0.033951
chr18    0.028842
chr19    0.021392
chrX     0.018838
Name: chr, dtype: float64
In [12]:
0.078970 + 0.072371+0.049170
Out[12]:
0.200511
In [6]:
0.8*0.2
Out[6]:
0.16000000000000003