Project Description

Recreate dataset and visualize

First step is to recreate the dataset given in the project description and plot a histogram to make sure everything looks in order:

require(tidyverse)

m.inv <- seq(2.1,4,0.1)
counts <- c(69,41,43,46,43,30,44,36,61,72,70,74,53,37,26,20,14,16,19,11)

dat <- c()

for(mass in m.inv){
  dat <- c(dat,rep(m.inv,counts))
  
}

According to the Particle Data Group (http://pdg.lbl.gov/2014/listings/rpp2014-list-J-psi-1S.pdf), the mass of J/Ψ can be expected to be 3096.916±0.011 MeV/c^2, in other words, around ≈ 3.097 GeV/c^2
According to this paper: https://arxiv.org/pdf/1406.0484.pdf we can estimate the signal region by a dimuon (J/Psi decay products) invariant mass in the interval between 2.8 and 3.35GeV/c2
We add this additional information to our histogram below, in red; and use it for downstream analysis

hist(dat,breaks = c(2,m.inv),right=TRUE,col="blue",border="orange",include.lowest = T,main = "Uncorrected Invariant Mass Spectrum in the J/Psi region",xlab="Invariant Mass (GeV/c^2)\n Signal region indicated with arrows",ylab = "dN/dm")
abline(v=3.097,col="red",lwd=10)
arrows(x0=2.8,y0=500,x1=3.4,y1=500,code=3,col="red",lwd=3)
text("Expected Invariant Mass=3.097 GeV",col="white",cex=1.5,x=3.1,y=100)

Linear model

Next, we want to see what a linear model of the background data, can tell us about background in the signal region.

framed.dat <- as.data.frame(cbind(m.inv,counts))

signal.reg <- framed.dat[8:14,]

noise.reg <- rbind(framed.dat[1:7,],framed.dat[15:20,])

plot(signal.reg,type="p",col="red",cex=2,xlim=c(1,5),ylim=c(0,80),pch=19,xlab="", ylab="",main="Linear model of background region (blue line) \noverplotted with actual data points",sub="Note: Red circles are measurements that fall into \nwhat we've defined as the signal region (2.8-3.4GeV/c^2)")
points(noise.reg,type="p",col="blue",cex=1,pch=16)


bg.mod <- lm(noise.reg$counts~noise.reg$m.inv)
abline(bg.mod,col="blue")

stand.dev <- sigma(bg.mod)
stand.err <- coef(summary(bg.mod))[[2,2]]

Analysis of model accuracy

#R SQUARED IS:
summary(bg.mod)$r.squared

## [1] 0.823606

#ADJUSTED R SQUARED IS:
summary(bg.mod)$adj.r.squared

## [1] 0.8075702

#F-STATISTIC IS:
summary(bg.mod)$fstatistic[[1]]

## [1] 51.3604

require(pander)
pander(bg.mod)

Fitting linear model: noise.reg$counts ~ noise.reg$m.inv
	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	96.01	9.101	10.55	4.321e-07
noise.reg$m.inv	-21.02	2.933	-7.167	1.829e-05

An analysis of the linear model describing the background region (tabulated above), yields the following observations:

The residuals are approximately normally distributed, as can be seen in the histogram below:

hist(bg.mod$residuals,col="lightblue", main = "Histogram of distribution of residuals",xlab="Residuals")

We also find that:

Standard errors on the coefficients are low, relative to magnitude of coefficient values
Both coefficients are significant at the p << 0.01 level
Adjusted R-squared suggests that ≈ 80% of the variation in the background-region data is explained by the linear model
F-statistic is relatively large at 11 degrees of freedom (p<<0.01)

Looking only at statistically significant observations (>5σ)

predicted.points <- c()

for(point in m.inv){
  ans <- coefficients(bg.mod)[[1]] + (coefficients(bg.mod)[[2]]*point)
  predicted.points <- c(predicted.points,ans)
}


signal <- c(rep(F,7),rep(T,7),rep(F,6))

framed.dat <- cbind(framed.dat,predicted.points,signal)

backgr.in.sign <- framed.dat %>%
  filter(signal==T)
backgr.in.sign <- backgr.in.sign %>%
  mutate(minus.background=counts-predicted.points)

plot(framed.dat$m.inv,framed.dat$counts,pch=16,col="blue",ylim = c(min(framed.dat$predicted.points)-(5*stand.dev),max(framed.dat$predicted.points)+(5*stand.dev)),main = "Predicted Data Points (green),\nvs. Observed Data Points (red in signal region,\n blue in background region)",xlab = "",ylab = "",sub="NOTE: Purple points indicate 5 standard deviations \nabove expected background in signal region")

points(backgr.in.sign$m.inv,backgr.in.sign$counts,pch=16,col="red")
points(framed.dat$m.inv,framed.dat$predicted.points,pch=16,col="green")
points(x=backgr.in.sign$m.inv, y=backgr.in.sign$predicted.points+(5*stand.dev),pch=16,col="purple",cex=0.7)

arrows(x0=backgr.in.sign$m.inv, y0=backgr.in.sign$predicted.points+(5*stand.dev)+stand.err, x1=backgr.in.sign$m.inv, y1=backgr.in.sign$predicted.points+(5*stand.dev)-stand.err, length=0.05, angle=90, code=3,col="purple")

In the above plot, green points indicate expected counts due to background, red points indicate number of observations per bin in the signal region, and blue points indicate number of observations per bin outside the signal region, the purple points indicate where data will be 5 standard deviations above background in the signal region (error bars are the standard error on the coefficient measurement)
So, we will take the three points that are 5 standard deviations above background into account for our estimate of the number of measured J/Psi’s (invariant mass range 3-3.2 GeV/c^2); and subtract the background estimated by our linear model
We multiple the standard error by three, effectively summing over the uncertainty for each of the three observations used in our final prediction, to get the confidence interval.

num.j.psi <- sum(framed.dat$counts[10:12])-sum(framed.dat$predicted.points[10:12])
conf <- stand.err*3

num.j.psi-conf

## [1] 114.6668

num.j.psi+conf

## [1] 132.266

Conclusion:

We conclude that the 99% Confidence interval for the number of actual J/Psi’s measured is: [114.6668,132.266]

PHY5007Z Project 2: Signal Significance

Gerhard Viljoen

06 November 2017

Project Description

Recreate dataset and visualize

Linear model

Analysis of model accuracy

We also find that:

Looking only at statistically significant observations (>5σ)

Conclusion: