Fig 6. Quantifying the decrease in uncertainty from increased sequencing
Figure 6:
Histograms show distributions of overlap estimates $\hat{s}$, computed using Eq (11), for various values of s which are indicated by color-matched dotted lines. While all estimates are distributed around the true values of s, increasing the number of colonies c from 48 (top) to 96 (middle) and to 144 (bottom) substantially decreases the error of the estimates. For example the bottom plot shows that successfully sequencing c = 144 colonies from each parasite is guaranteed to produce estimates $\hat{s}$ that are off by at most 5 (8.3%) in either direction of the true s.
Link to the published figure: https://doi.org/10.1371/journal.pcbi.1006898.g006
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import hypergeom
from scipy.stats import binned_statistic as binsta
from scipy.special import logsumexp
from util import *
import palettable as pal
clrx = pal.cartocolors.qualitative.Prism_10.mpl_colors
clr = tuple(x for n,x in enumerate(clrx) if n in [1,2,4,5,6])
clr2 = pal.cartocolors.sequential.agSunset_7.mpl_colors
import matplotlib.gridspec as gridspec
import matplotlib.patches as patches
import plotly.graph_objects as go
import plotly.tools as tls
from plotly.offline import plot, iplot, init_notebook_mode
from IPython.core.display import display, HTML
init_notebook_mode(connected = True)
config={'showLink': False, 'displayModeBar': False}
# CCP, the Coupon Collector's Problem
def ccp_sample(c,pool=60):
return len(set(np.random.choice(pool,c)))
# Draw overlap
def nab_sample(s,na,nb,pool=60):
sa = np.random.hypergeometric(s,pool-s,na)
nab = np.random.hypergeometric(sa,pool-sa,nb)
return nab
# Overlap between two PCRs of depth c and overlap s
def pcr_sample(c,s):
na = ccp_sample(c)
nb = ccp_sample(c)
return nab_sample(s,na,nb),na,nb
# Draw na and nb samples from two populations of size pool_a and pool_b, with true overlap s
# and return empirical overlap between na and nb
# note that this is basically the same as nab_sample, but with two different size pools!
def nab_sample_unequal(s,na,nb,pool_a,pool_b):
sa = np.random.hypergeometric(s,pool_a-s,na)
nab = np.random.hypergeometric(sa,pool_b-sa,nb)
return nab
def p_ccp(c, pool=60):
p = np.zeros([c+1,pool+1])
p[0,0] = 1;
for row in range(1,c+1):
for k in range(1,np.min([row+2,pool+1])):
p[row,k] = p[row-1,k]*k/pool + p[row-1,k-1]*(1-(k-1)/pool)
return p[-1,:]
def p_overlap(na,nb,nab,pool=60):
p_s = np.zeros(pool+1)
# reference: hypergeom.pmf(outcome, Total, hits, Draws, loc=0)
for s in np.arange(pool+1):
# p_sa is the probability that we'd get sa from the overlap (s), just in na draws of a
p_sa = hypergeom.pmf(np.arange(pool+1),pool,s,na)
# p_nab_given_sa is the probability of getting that nab, given sa
p_nab_given_sa = hypergeom.pmf(nab,pool,np.arange(pool+1),nb)
p_s[s] = np.dot(p_sa,p_nab_given_sa)
return p_s/np.sum(p_s)
def e_overlap(na,nb,nab,pool=60):
p_s = p_overlap(na,nb,nab,pool=pool)
return np.dot(np.arange(pool+1),p_s)
def credible_interval(na,nb,nab,pct=90,pool=60):
p_s = p_overlap(na,nb,nab,pool=pool)
cdf = np.cumsum(p_s)
ccdf = np.flipud(np.cumsum(np.flipud(p_s)))
# adjust for fractions vs percents; put everything as a fraction
if pct > 1:
pct = pct/100
cutoff = (1-pct)/2
# get the lower bound.
# it's the first index at which cdf ≥ cutoff
try:
lower = np.where(cdf >= cutoff)[0][0]
except IndexError:
lower = 0
# get the upper bound
# it's the first index at which ccdf ≥ 0.05
try:
upper = np.where(ccdf >= cutoff)[0][-1]
except IndexError:
upper=pool
expectation = np.dot(np.arange(pool+1),p_s)
# Sanity and indexing check: uncomment this line to see true tail probability ≤ 0.05
# print([cdf[lower-1],(1-ccdf[upper+1])])
return lower,expectation,upper
def p_nab_given_c(s,c,pool=60):
pna = p_ccp(c)
pnb = p_ccp(c)
nas = np.arange(1,len(pna))
nbs = np.arange(1,len(pnb))
p_gen = np.zeros([pool+1,pool+1,pool+1])
for na in nas:
p_sa = hypergeom.pmf(np.arange(pool+1),pool,s,na)
for nb in nbs:
pna_pnb = pna[na] * pnb[nb]
for nab in range(0,np.minimum(na,nb)):
p_nab_given_sa = hypergeom.pmf(nab,pool,np.arange(pool+1),nb)
p_nab_given_s = np.dot(p_sa,p_nab_given_sa)
p_gen[na,nb,nab] = p_nab_given_s * pna_pnb
return p_gen
def p_shat_given_sc(s,c,shat,pool=60):
masses = p_nab_given_c(s,c,pool=pool)
if np.sum(masses)<0.99:
print('Swapping to Monte Carlo')
return p_shat_given_sc_montecarlo(s,c,shat,pool=pool)
hist = binsta(np.ravel(shat),np.ravel(masses),statistic='sum',bins=(np.arange(pool+2)-0.5))
return hist
def p_shat_given_sc_montecarlo(s,c,shat,pool=60,n_mc=int(1e5)):
masses = np.zeros([pool+1,pool+1,pool+1])
for ii in range(n_mc):
nab,na,nb = pcr_sample(c,s)
masses[na,nb,nab] += 1
hist = binsta(np.ravel(shat),np.ravel(masses/n_mc),statistic='sum',bins=(np.arange(pool+2)-0.5))
return hist
def compute_all_estimates(pool=60):
shat = np.zeros([pool+1,pool+1,pool+1])
for na in range(1,pool+1):
for nb in range(1,pool+1):
for nab in range(0,np.minimum(na+1,nb+1)):
shat[na,nb,nab] = e_overlap(na,nb,nab,pool=pool)
return shat
def p_overlap_unequal(na,nb,nab,pool_a,pool_b):
# all loops are in terms of pool_a, which is assumed to be ≤ pool_b.
p_s = np.zeros(pool_a+1)
# reference: hypergeom.pmf(outcome, Total, hits, Draws, loc=0)
for s in np.arange(pool_a+1):
# p_sa is the probability that we'd get sa from the overlap (s), just in na draws of a
p_sa = hypergeom.pmf(np.arange(pool_a+1),pool_a,s,na)
# p_nab_given_sa is the probability of getting that nab, given sa
p_nab_given_sa = hypergeom.pmf(nab,pool_b,np.arange(pool_a+1),nb)
p_s[s] = np.dot(p_sa,p_nab_given_sa)
return p_s/np.sum(p_s)
def e_overlap_unequal(na,nb,nab,pool_a,pool_b):
# TODO. Code expects that pool_b > pool_a...
p_s = p_overlap_unequal(na,nb,nab,pool_a,pool_b)
return np.dot(np.arange(pool_a+1),p_s)
# shat = compute_all_estimates(pool=60)
# np.save('shat_60.npy',shat)
shat = np.load('shat_60.npy')
shat = np.load('shat_60.npy')
hist_error = {}
pool = 60
ss = [10,20,30,40,50]
cc = [48,96,144]
for c in cc:
hist_error[c] = np.zeros([len(ss),pool+1])
for idx,s in enumerate(ss):
hist_error[c][idx,:] = np.array(p_shat_given_sc(s,c,shat)[0])
fig5 = go.Figure()
colors = ['rgb(126,169,195)', 'rgb(143,204,204)','rgb(183,214,161)', 'rgb(243,207,113)','rgb(237,181,114)']
c1 = [96,144]
for idx, s in enumerate(ss):
fig5.add_trace(go.Scatter(x = s*np.ones(15), y =np.linspace(0,0.29,15), mode = 'markers',
marker=dict(color=colors[idx]),showlegend=False, name = s ))
for idx, s in enumerate(ss):
fig5.add_trace(go.Bar(x = np.arange(pool+1), y = hist_error[48][idx],marker_color = colors[idx],opacity=0.75,
showlegend=False, marker_line_color = "Black", marker_line_width=1.5, name = s))
for c in c1:
for idx, s in enumerate(ss):
fig5.add_trace(go.Bar(x = np.arange(pool+1), y = hist_error[c][idx],marker_color = colors[idx],opacity=0.75,
showlegend=False, visible = False,marker_line_color = "Black", marker_line_width=1.5, name = s))
fig5.update_layout(barmode='overlay', plot_bgcolor='rgb(255,255,255)', height = 350,
xaxis_title = 'Estimated overlap, \u015D', yaxis_title = 'Probability of \u015D',
updatemenus=[
dict(
active=0,
buttons=list([
dict(label="c = 48 colonies",
method="update",
args=[{"visible":
[True,True,True,True, True,True,True,True,True, True,
False, False, False, False,False,False, False, False, False,False]},
{"title": ""}]),
dict(label="c = 96 colonies",
method="update",
args=[{"visible":
[True,True,True,True, True, False, False, False, False,False,
True,True,True,True, True,False, False, False, False,False]},
{"title": ""
}]),
dict(label="c = 144 colonies",
method="update",
args=[{"visible":
[True,True,True,True, True, False, False, False, False,False,
False, False, False, False,False,True,True,True,True, True]},
{"title": ""}]),
]),
x=0.2,
y = 1.27
)
])
# PLot figure
plot(fig5, filename = 'plotly_figures/fig5.html', config = config)
display(HTML('plotly_figures/fig5.html'))