Tutorial¶
This tutorial demonstrates the usage of the sweights package.
We will first cook up a toy model with a discriminat variable (invariant mass) and a control variable (decay time) and use it to generate some toy data.
Then we will use a fit to the invariant mass to obtain some component pdf estimates and use these to extract some weights which project out the signal only component in the decay time.
We will demonstrate both the classic sWeights and the Custom Ortogonal Weight functions (COWs) method. See arXiv:2112.04575 for more details.
Finally we will fit the weighted decay time distribution and correct the covariance matrix according to the description in arXiv:1911.01303.
[1]:
# external requirements
import os
import numpy as np
from scipy.stats import norm, expon, uniform
import matplotlib.pyplot as plt
from iminuit import Minuit
from iminuit.cost import ExtendedUnbinnedNLL
from iminuit.pdg_format import pdg_format
import boost_histogram as bh
# from this package
from sweights import SWeight # for classic sweights
from sweights import Cow # for custom orthogonal weight functions
from sweights import cov_correct, approx_cov_correct # for covariance corrections
from sweights import kendall_tau # for an independence test
from sweights import plot_indep_scatter
# set a reproducible seed
np.random.seed(21011987)
Make the toy model and generate some data¶
[2]:
Ns = 5000
Nb = 5000
ypars = [Ns,Nb]
# mass
mrange = (0,1)
mu = 0.5
sg = 0.1
lb = 1
mpars = [mu,sg,lb]
# decay time
trange = (0,1)
tlb = 2
tpars = [tlb]
# generate the toy
def generate(Ns,Nb,mu,sg,lb,tlb,poisson=False,ret_true=False):
Nsig = np.random.poisson(Ns) if poisson else Ns
Nbkg = np.random.poisson(Nb) if poisson else Nb
sigM = norm(mu,sg)
bkgM = expon(mrange[0], lb)
sigT = expon(trange[0], tlb)
bkgT = uniform(trange[0],trange[1]-trange[0])
# generate
sigMflt = sigM.cdf(mrange)
bkgMflt = bkgM.cdf(mrange)
sigTflt = sigT.cdf(trange)
bkgTflt = bkgT.cdf(trange)
sigMvals = sigM.ppf( np.random.uniform(*sigMflt,size=Nsig) )
sigTvals = sigT.ppf( np.random.uniform(*sigTflt,size=Nsig) )
bkgMvals = bkgM.ppf( np.random.uniform(*bkgMflt,size=Nbkg) )
bkgTvals = bkgT.ppf( np.random.uniform(*bkgTflt,size=Nbkg) )
Mvals = np.concatenate( (sigMvals, bkgMvals) )
Tvals = np.concatenate( (sigTvals, bkgTvals) )
truth = np.concatenate( ( np.ones_like(sigMvals), np.zeros_like(bkgMvals) ) )
if ret_true:
return np.stack( (Mvals,Tvals,truth), axis=1 )
else:
return np.stack( (Mvals,Tvals), axis=1 )
toy = generate(Ns,Nb,mu,sg,lb,tlb,ret_true=True)
Plot the toy data and generating pdfs¶
[3]:
# useful function for plotting data points with error bars
def myerrorbar(data, ax, bins, range, wts=None, label=None, col=None):
col = col or 'k'
nh, xe = np.histogram(data,bins=bins,range=range)
cx = 0.5*(xe[1:]+xe[:-1])
err = nh**0.5
if wts is not None:
whist = bh.Histogram( bh.axis.Regular(bins,*range), storage=bh.storage.Weight() )
whist.fill( data, weight = wts )
cx = whist.axes[0].centers
nh = whist.view().value
err = whist.view().variance**0.5
ax.errorbar(cx, nh, err, capsize=2,label=label,fmt=f'{col}o')
# define the mass pdf for plotting etc.
def mpdf(x, Ns, Nb, mu, sg, lb, comps=['sig','bkg']):
sig = norm(mu,sg)
sigN = np.diff( sig.cdf(mrange) )
bkg = expon(mrange[0], lb)
bkgN = np.diff( bkg.cdf(mrange) )
tot = 0
if 'sig' in comps: tot += Ns * sig.pdf(x) / sigN
if 'bkg' in comps: tot += Nb * bkg.pdf(x) / bkgN
return tot
# define time pdf for plotting etc.
def tpdf(x, Ns, Nb, tlb, comps=['sig','bkg']):
sig = expon(trange[0],tlb)
sigN = np.diff( sig.cdf(trange) )
bkg = uniform(trange[0],trange[1]-trange[0])
bkgN = np.diff( bkg.cdf(trange) )
tot = 0
if 'sig' in comps: tot += Ns * sig.pdf(x) / sigN
if 'bkg' in comps: tot += Nb * bkg.pdf(x) / bkgN
return tot
# define plot function
def plot(toy, draw_pdf=True):
nbins = 50
fig, ax = plt.subplots(1,2,figsize=(12,4))
myerrorbar(toy[:,0],ax[0],bins=nbins,range=mrange)
myerrorbar(toy[:,1],ax[1],bins=nbins,range=trange)
if draw_pdf:
m = np.linspace(*mrange,400)
mN = (mrange[1]-mrange[0])/nbins
bkgm = mpdf(m, *(ypars+mpars),comps=['bkg'])
sigm = mpdf(m, *(ypars+mpars),comps=['sig'])
totm = bkgm + sigm
ax[0].plot(m, mN*bkgm, 'r--', label='Background')
ax[0].plot(m, mN*sigm, 'g:' , label='Signal')
ax[0].plot(m, mN*totm, 'b-' , label='Total PDF')
t = np.linspace(*trange,400)
tN = (trange[1]-trange[0])/nbins
bkgt = tpdf(t, *(ypars+tpars),comps=['bkg'])
sigt = tpdf(t, *(ypars+tpars),comps=['sig'])
tott = bkgt + sigt
ax[1].plot(t, tN*bkgt, 'r--', label='Background')
ax[1].plot(t, tN*sigt, 'g:' , label='Signal')
ax[1].plot(t, tN*tott, 'b-' , label='Total PDF')
ax[0].set_xlabel('Invariant mass')
ax[0].set_ylim(bottom=0)
ax[0].legend()
ax[1].set_xlabel('Decay time')
ax[1].set_ylim(bottom=0)
ax[1].legend()
fig.tight_layout()
plot(toy)

Check the independence of our data¶
By computing the kendall rank coefficient and seeing how compatibile it is with 0
[4]:
kts = kendall_tau(toy[:,0],toy[:,1])
print('Kendall Tau:', pdg_format( kts[0], kts[1] ) )
plot_indep_scatter(toy[:,0],toy[:,1],reduction_factor=2);
Kendall Tau: -0.017 ± 0.010

Fit the toy in invariant mass¶
This provides us with estimates for the component shapes and the component yields
[5]:
# define mass pdf for iminuit fitting
def mpdf_min(x, Ns, Nb, mu, sg, lb):
return (Ns+Nb, mpdf(x, Ns, Nb, mu, sg, lb) )
mi = Minuit( ExtendedUnbinnedNLL(toy[:,0], mpdf_min), Ns=Ns, Nb=Nb, mu=mu, sg=sg, lb=lb )
mi.limits['Ns'] = (0,Ns+Nb)
mi.limits['Nb'] = (0,Ns+Nb)
mi.limits['mu'] = mrange
mi.limits['sg'] = (0,mrange[1]-mrange[0])
mi.limits['lb'] = (0,10)
mi.migrad()
mi.hesse()
display(mi) # only valid for ipython notebooks
Migrad | ||||
---|---|---|---|---|
FCN = -1.684e+05 | Nfcn = 135 | |||
EDM = 7.37e-05 (Goal: 0.0002) | ||||
Valid Minimum | No Parameters at limit | |||
Below EDM threshold (goal x 10) | Below call limit | |||
Covariance | Hesse ok | Accurate | Pos. def. | Not forced |
Name | Value | Hesse Error | Minos Error- | Minos Error+ | Limit- | Limit+ | Fixed | |
---|---|---|---|---|---|---|---|---|
0 | Ns | 5.11e3 | 0.11e3 | 0 | 1E+04 | |||
1 | Nb | 4.89e3 | 0.11e3 | 0 | 1E+04 | |||
2 | mu | 0.4981 | 0.0021 | 0 | 1 | |||
3 | sg | 0.1031 | 0.0022 | 0 | 1 | |||
4 | lb | 1.07 | 0.06 | 0 | 10 |
Ns | Nb | mu | sg | lb | |
---|---|---|---|---|---|
Ns | 1.22e+04 | -7.12e+03 (-0.587) | -0.00789 (-0.034) | 0.124 (0.516) | -1.44 (-0.203) |
Nb | -7.12e+03 (-0.587) | 1.2e+04 | 0.00788 (0.034) | -0.124 (-0.520) | 1.44 (0.205) |
mu | -0.00789 (-0.034) | 0.00788 (0.034) | 4.46e-06 | -2.09e-07 (-0.045) | -3.35e-05 (-0.247) |
sg | 0.124 (0.516) | -0.124 (-0.520) | -2.09e-07 (-0.045) | 4.72e-06 | -2.39e-05 (-0.172) |
lb | -1.44 (-0.203) | 1.44 (0.205) | -3.35e-05 (-0.247) | -2.39e-05 (-0.172) | 0.00411 |
Construct the sweighter¶
Note that this will run much quicker if verbose=False
and checks=False
[6]:
# define estimated functions
spdf = lambda m: mpdf(m,*mi.values,comps=['sig'])
bpdf = lambda m: mpdf(m,*mi.values,comps=['bkg'])
# make the sweighter
sweighter = SWeight( toy[:,0], [spdf,bpdf], [mi.values['Ns'],mi.values['Nb']], (mrange,), method='summation', compnames=('sig','bkg'), verbose=True, checks=True )
Initialising sweight with the summation method:
PDF normalisations:
0 5107.224408977729
1 4892.156810093505
W-matrix:
[[1.38699027e-04 5.96207082e-05]
[5.96207082e-05 1.42184207e-04]]
A-matrix:
[[ 8795.16287156 -3687.98933236]
[-3687.98933236 8579.57828096]]
Integral of w*pdf matrix (should be close to the
identity):
[[ 1.00011709e+00 -1.32635071e-04]
[-1.57992838e-04 1.00004886e+00]]
Check of weight sums (should match yields):
Component | sWeightSum | Yield | Diff |
---------------------------------------------------
0 | 5107.2244 | 5107.2244 | -0.00% |
1 | 4892.1568 | 4892.1568 | 0.00% |
Construct the COW¶
Note that COW pdfs are always normalised in the COW numerator so that the W and A matrices tend to come out with elements of order one. This example uses a variance function of unity, I(m)=1, but also codes the case where I(m) = g(m), which is equiavalent to sweights.
[7]:
# unity
Im = 1
# sweight equiavlent
# Im = lambda m: mpdf(m,*mi.values) / (mi.values['Ns'] + mi.values['Nb'] )
# make the cow
cw = Cow(mrange, spdf, bpdf, Im, verbose=True)
Initialising COW:
W-matrix:
[[2.73492037 0.97050271]
[0.97050271 1.07239023]]
A-matrix:
[[ 0.53861176 -0.48743839]
[-0.48743839 1.37362336]]
Comparison of the sweight and COW methods¶
Compare the weight distributoins¶
[8]:
def plot_wts(x, sw, bw, title=None):
fig,ax = plt.subplots()
ax.plot(x, sw, 'b--', label='Signal')
ax.plot(x, bw, 'r:' , label='Background')
ax.plot(x, sw+bw, 'k-', label='Sum')
ax.set_xlabel('Mass')
ax.set_ylabel('Weight')
if title: ax.set_title(title)
fig.tight_layout()
for meth, cls in zip( ['SW','COW'], [sweighter,cw] ):
# plot weights
x = np.linspace(*mrange,400)
swp = cls.get_weight(0,x)
bwp = cls.get_weight(1,x)
plot_wts(x, swp, bwp, meth)


Fit the weighted data in decay time and correct the covariance¶
[9]:
# define weighted nll
def wnll(tlb, tdata, wts):
sig = expon(trange[0],tlb)
sigN = np.diff( sig.cdf(trange) )
return -np.sum( wts * np.log( sig.pdf( tdata ) / sigN ) )
# define signal only time pdf for cov corrector
def tpdf_cor(x, tlb):
return tpdf(x,1,0,tlb,['sig'])
flbs=[]
for meth, cls in zip( ['SW','COW'], [sweighter,cw] ):
print('Method:', meth)
# get the weights
wts = cls.get_weight(0,toy[:,0])
# define the nll
nll = lambda tlb: wnll(tlb, toy[:,1], wts)
# do the minimisation
tmi = Minuit( nll, tlb=tlb )
tmi.limits['tlb'] = (1,3)
tmi.errordef = Minuit.LIKELIHOOD
tmi.migrad()
tmi.hesse()
# and do the correction
fval = np.array(tmi.values)
flbs.append(fval[0])
fcov = np.array( tmi.covariance.tolist() )
# first order correction
ncov = approx_cov_correct(tpdf_cor, toy[:,1], wts, fval, fcov, verbose=False)
# second order correction
hs = tpdf_cor
ws = lambda m: cls.get_weight(0,m)
W = cls.Wkl
# these derivatives can be done numerically but for the sweights / COW case it's straightfoward to compute them
ws = lambda Wss, Wsb, Wbb, gs, gb: (Wbb*gs - Wsb*gb) / ((Wbb-Wsb)*gs + (Wss-Wsb)*gb)
dws_Wss = lambda Wss, Wsb, Wbb, gs, gb: gb * ( Wsb*gb - Wbb*gs ) / (-Wss*gb + Wsb*gs + Wsb*gb - Wbb*gs)**2
dws_Wsb = lambda Wss, Wsb, Wbb, gs, gb: ( Wbb*gs**2 - Wss*gb**2 ) / (Wss*gb - Wsb*gs - Wsb*gb + Wbb*gs)**2
dws_Wbb = lambda Wss, Wsb, Wbb, gs, gb: gs * ( Wss*gb - Wsb*gs ) / (-Wss*gb + Wsb*gs + Wsb*gb - Wbb*gs)**2
tcov = cov_correct(hs, [spdf,bpdf], toy[:,1], toy[:,0], wts, [mi.values['Ns'],mi.values['Nb']], fval, fcov, [dws_Wss,dws_Wsb,dws_Wbb],[W[0,0],W[0,1],W[1,1]], verbose=False)
print('Method:', meth, f'- covariance corrected {fval[0]:.2f} +/- {fcov[0,0]**0.5:.2f} ---> {fval[0]:.2f} +/- {tcov[0,0]**0.5:.2f}')
Method: SW
Method: SW - covariance corrected 1.71 +/- 0.14 ---> 1.71 +/- 0.19
Method: COW
Method: COW - covariance corrected 1.64 +/- 0.13 ---> 1.64 +/- 0.18
[10]:
### Plot the weighted decay distributions and the fit result
def plot_tweighted(x, wts, wtnames=[], funcs=[]):
fig, ax = plt.subplots()
t = np.linspace(*trange,400)
N = (trange[1]-trange[0])/50
for i, wt in enumerate(wts):
label = None
if i<len(wtnames): label = wtnames[i]
myerrorbar(x, ax, bins=50, range=trange, wts=wt, label=label, col=f'C{i}')
if i<len(funcs):
ax.plot(t,N*funcs[i](t),f'C{i}-')
ax.legend()
ax.set_xlabel('Time')
ax.set_ylabel('Weighted Events')
fig.tight_layout()
swf = lambda t: tpdf(t, mi.values['Ns'], 0, flbs[0], comps=['sig'] )
cowf = lambda t: tpdf(t, mi.values['Ns'], 0, flbs[1], comps=['sig'] )
sws = sweighter.get_weight(0, toy[:,0])
scow = cw.get_weight(0, toy[:,0])
plot_tweighted(toy[:,1], [sws,scow], ['SW','COW'], funcs=[swf,cowf] )

Write the weights back into a file¶
For example into a pandas dataframe or root file
[11]:
import pandas as pd
df = pd.DataFrame( )
df['mass'] = toy[:,0]
df['time'] = toy[:,1]
df['sw_sws'] = sweighter.get_weight(0, df['mass'].to_numpy() )
df['sw_bws'] = sweighter.get_weight(1, df['mass'].to_numpy() )
df['cow_sws'] = cw.get_weight(0, df['mass'].to_numpy() )
df['cow_bws'] = cw.get_weight(1, df['mass'].to_numpy() )
import uproot
with uproot.recreate('outf.root') as f:
f['tree'] = df
print(df)
mass time sw_sws sw_bws cow_sws cow_bws
0 0.276401 0.287028 -0.129313 1.129223 -0.372759 1.446466
1 0.302426 0.426394 0.150924 0.848999 -0.220921 1.282285
2 0.578586 0.449921 1.149476 -0.149511 1.099331 -0.159718
3 0.373953 0.741222 0.839109 0.160843 0.481153 0.576620
4 0.515549 0.598659 1.244910 -0.244940 1.590285 -0.553110
... ... ... ... ... ... ...
9995 0.846222 0.426296 -0.706615 1.706501 -0.332575 0.950587
9996 0.166248 0.153495 -0.711678 1.711563 -0.631136 1.801133
9997 0.535665 0.266288 1.231984 -0.232015 1.494705 -0.483188
9998 0.357202 0.365763 0.708016 0.291931 0.282588 0.772361
9999 0.643105 0.593656 0.832079 0.167873 0.364086 0.456587
[10000 rows x 6 columns]