Goal

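  • Make simple scatterplots of the per-subseqlet statistics (mean importance and modisco match against mean PSAM affinity), with one panel per TF.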

TODO

  • [ ] Weight each sequence (subseqlet) by its `counts` value; the plots below do not yet use `counts`.
In [4]:
# List the "*Sheet2.csv" files in ~/Downloads; these are the inputs read when `df` is built.
!ls ~/Downloads/*Sheet2.csv
'/home/avsec/Downloads/nanog - Sheet2.csv'
'/home/avsec/Downloads/oct4 - Sheet2.csv'
'/home/avsec/Downloads/sox2-ac - Sheet2.csv'
'/home/avsec/Downloads/sox2-af - Sheet2.csv'
In [5]:
# Name prefixes of the "<name> - Sheet2.csv" files to load; each value is also
# stored in the `tf` column of the combined frame.
tfs = [
    'nanog',
    'oct4',
    'sox2-ac',
    'sox2-af',
]
In [8]:
# Read one "<tf> - Sheet2.csv" file per entry of `tfs` and stack them into a
# single frame; the added `tf` column records which file each row came from.
import pandas as pd  # no other cell imports pandas, so a fresh kernel needs it here
from pathlib import Path

# Resolve the directory from the current user's home instead of hard-coding
# /home/avsec, matching the `~/Downloads` location listed with `!ls`.
data_dir = Path.home() / "Downloads"

df = pd.concat(
    [pd.read_csv(data_dir / f"{t} - Sheet2.csv").assign(tf=t) for t in tfs],
    # Each file is read with its own 0..n-1 row labels; renumber so the
    # combined frame does not carry duplicate index values.
    ignore_index=True,
)
In [9]:
# Preview the first five rows to check the column names and the added `tf` label.
df.head(n=5)
Out[9]:
subseqlet counts mean importance var importance mean affinity (conv with PSAM) modisco match (conv with motif) # peaks w subseqlet # subseqlets in peaks tf
0 TATTGTTC 401 0.6487 0.0522 5.3968 0.5257 1057 1079 nanog
1 TAAGACAG 1 0.6496 0.0000 3.8327 0.0488 1125 1190 nanog
2 CTTTGTTT 151 0.4967 0.0463 4.5310 0.4303 2893 3113 nanog
3 CATTGTTG 57 0.5652 0.0786 5.0486 0.5261 825 845 nanog
4 CATTGTTA 44 0.5689 0.0473 5.9720 0.4943 999 1050 nanog
In [15]:
import plotnine
from plotnine import *
In [26]:
# Mean importance against mean PSAM affinity: one point per row of `df`, one
# panel per `tf` value. facet_grid uses the same axis ranges in every panel.
plotnine.options.figure_size = (8, 2)
(
    # `data` and `mapping` are passed by keyword: newer plotnine takes `data` as
    # the first positional argument, so `ggplot(aes(...), data=df)` would bind
    # `data` twice and raise TypeError.
    ggplot(
        data=df,
        mapping=aes(x='mean affinity (conv with PSAM)', y='mean importance'),
    )
    + geom_point(alpha=0.05, size=.4)  # small, mostly transparent points
    + facet_grid(".~tf")
    + theme_classic()
)
Out[26]:
<ggplot: (-9223363288308394871)>
In [27]:
# Free scales: same mean-importance vs. PSAM-affinity scatter, but facet_wrap
# with scales='free' gives every `tf` panel its own x and y ranges.
plotnine.options.figure_size = (8, 2)
(
    # `data` and `mapping` are passed by keyword: newer plotnine takes `data` as
    # the first positional argument, so `ggplot(aes(...), data=df)` would bind
    # `data` twice and raise TypeError.
    ggplot(
        data=df,
        mapping=aes(x='mean affinity (conv with PSAM)', y='mean importance'),
    )
    + geom_point(alpha=0.05, size=.4)  # small, mostly transparent points
    + facet_wrap("~tf", scales='free', ncol=4)  # ncol=4 keeps all panels in one row
    + theme_classic()
)
Out[27]:
<ggplot: (8748544319222)>
In [28]:
# Modisco match score against mean PSAM affinity: one point per row of `df`,
# one panel per `tf` value. facet_grid uses the same axis ranges in every panel.
plotnine.options.figure_size = (8, 2)
(
    # `data` and `mapping` are passed by keyword: newer plotnine takes `data` as
    # the first positional argument, so `ggplot(aes(...), data=df)` would bind
    # `data` twice and raise TypeError.
    ggplot(
        data=df,
        mapping=aes(
            x='mean affinity (conv with PSAM)',
            y='modisco match (conv with motif)',
        ),
    )
    + geom_point(alpha=0.05, size=.4)  # small, mostly transparent points
    + facet_grid(".~tf")
    + theme_classic()
)
Out[28]:
<ggplot: (8748543632683)>
In [29]:
# Free scales: same modisco-match vs. PSAM-affinity scatter, but facet_wrap
# with scales='free' gives every `tf` panel its own x and y ranges.
plotnine.options.figure_size = (8, 2)
(
    # `data` and `mapping` are passed by keyword: newer plotnine takes `data` as
    # the first positional argument, so `ggplot(aes(...), data=df)` would bind
    # `data` twice and raise TypeError.
    ggplot(
        data=df,
        mapping=aes(
            x='mean affinity (conv with PSAM)',
            y='modisco match (conv with motif)',
        ),
    )
    + geom_point(alpha=0.05, size=.4)  # small, mostly transparent points
    + facet_wrap("~tf", scales='free', ncol=4)  # ncol=4 keeps all panels in one row
    + theme_classic()
)
Out[29]:
<ggplot: (-9223363288310695003)>