Chapter 10: Statistical Models and Hypothesis Tests
Statistical Indices of the Global Temperature from 1880 to 2015
#setwd("~/sshen/mathmodel")
dat1 <- read.table("~/Desktop/RMathModel/data/aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
dim(dat1)
## [1] 136 6
tmean15 <- dat1[,2] #Take only the second column of this data matrix
head(tmean15) #The first six values
## [1] -0.367918 -0.317154 -0.317069 -0.393357 -0.457649 -0.468707
mean(tmean15)
## [1] -0.2034367
sd(tmean15)
## [1] 0.3038567
var(tmean15)
## [1] 0.09232888
library(e1071) # R library needed to compute the following parameters
skewness(tmean15)
## [1] 0.7141481
kurtosis(tmean15)
## [1] -0.3712142
median(tmean15)
## [1] -0.29694
quantile(tmean15,
probs= c(0.05,0.25, 0.75, 0.95))
## 5% 25% 75% 95%
## -0.5792472 -0.4228540 -0.0159035 0.3743795
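
As a cross-check, the skewness and excess kurtosis can be computed directly from the sample central moments. The sketch below uses the raw moment formulas, so it may differ slightly from the e1071 defaults, which offer several estimator variants through a type argument.
#A sketch: skewness and excess kurtosis from the raw central moments
#(may differ slightly from the e1071 estimators)
m2 <- mean((tmean15 - mean(tmean15))^2) #Second central moment
m3 <- mean((tmean15 - mean(tmean15))^3) #Third central moment
m4 <- mean((tmean15 - mean(tmean15))^4) #Fourth central moment
m3/m2^1.5 #Moment skewness
m4/m2^2 - 3 #Moment excess kurtosis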
yrtime15 <- seq(1880,2015)
reg8015 <- lm(tmean15 ~ yrtime15)
reg8015 # Display regression results
##
## Call:
## lm(formula = tmean15 ~ yrtime15)
##
## Coefficients:
## (Intercept) yrtime15
## -13.208662 0.006678
# Plot the temperature time series and its trend line
plot(yrtime15, tmean15,
xlab="Year",ylab=expression(paste("Temperature [ ", degree, "C]")),
main="Global Annual Mean Land and Ocean Surface \nTemperature Anomalies 1880-2015",
type="l")
grid(nx = NULL, ny = NULL)
abline(reg8015, col="red")
text(1933, 0.4,
expression(paste("Linear Temperature Trend = 0.6678 ", degree, "C per century")),
col="red",cex=1.2)

Histogram of a Set of Data
h <- hist(tmean15, #Plot histogram
main="Histogram of 1880-2015 Temperature Anomalies",
xlab="Temperature Anomalies")
xfit <- seq(min(tmean15),
max(tmean15), length=30)
areat <- diff(h$mids[1:2])*length(tmean15)
#Normalization area: bin width times sample size
yfit <- areat*dnorm(xfit,
mean=mean(tmean15), sd=sd(tmean15))
lines(xfit,yfit,col="blue",lwd=2)

#Plot the kernel density estimate together with the fitted normal density
plot(density(tmean15), #R estimate density
main="R Estimate of Density",xlab="Temperature Anomalies")
lines(xfit,dnorm(xfit, #Moment estimated normal
mean=mean(tmean15), sd=sd(tmean15)), col="blue")
grid(nx = NULL, ny = NULL)
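
A formal normality check can complement the visual comparison; a minimal sketch using R's built-in Shapiro-Wilk test is below. It treats the anomalies as independent, which is only an approximation for a time series.
#A sketch: Shapiro-Wilk normality test of the anomalies
#(assumes independent observations, an approximation for a time series)
shapiro.test(tmean15)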

Box Plot
b <- boxplot(tmean15, ylab="Temperature Anomalies")
grid(nx = NULL, ny = NULL)
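
The object b returned by boxplot() stores the statistics behind the figure; a brief sketch of how to read them:
b$stats #Lower whisker, Q1, median, Q3, upper whisker
b$out #Points flagged as outliers, if any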

Scatter Plot
#Use setwd("working directory") to work in the right directory
rm(list=ls())
#setwd("~/sshen/mathmodel")
par(mgp=c(1.5,0.5,0))
ust <- read.csv("~/Desktop/RMathModel/data/USJantemp1951-2016-nohead.csv",
header=FALSE)
soi <- read.csv("~/Desktop/RMathModel/data/soi-data-nohead.csv",
header=FALSE)
soid <- soi[,2] #Take the second column SOI data
soim <- matrix(soid, ncol=12, byrow=TRUE)
#Make the SOI into a matrix with each month as a column
soij <- soim[,1] #Take the first column for Jan SOI
ustj <- ust[,3] #Take the third column: Jan US temp data
plot(soij, ustj, # Plot the scatter plot
xlim=c(-4,4), ylim=c(-8,8),
main="January SOI and the U.S. Temperature",
xlab="SOI [dimensionless]",
ylab=expression(paste("U.S. Temperature [ ", degree, "F]")),
pch=19, cex.lab=1.3)
grid(nx = NULL, ny = NULL)
soiust <- lm(ustj ~ soij) #Linear regression
abline(soiust, col="red", lwd=3) #Add the regression line to the plot

#Exclude the El Nino Januaries (indices 8, 33, 42, 48)
soijc <- soij[c(1:7,9:32,34:41,43:47,49:65)]
ustjc <- ustj[c(1:7,9:32,34:41,43:47,49:65)]
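
A short sketch comparing the correlation and regression with and without the excluded Januaries (the index sets above are assumed to drop the El Nino years):
#A sketch: correlation and regression with and without the excluded years
cor(soij, ustj) #All Januaries
cor(soijc, ustjc) #Excluded years removed
lm(ustjc ~ soijc) #Regression on the reduced data
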
QQ-Plot
#setwd("~/sshen/mathmodel")
dat1 <- read.table("~/Desktop/RMathModel/data/aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
dim(dat1)
## [1] 136 6
tmean15 <- dat1[,2]
#qq-plot for standardized anomalies
tstand <- (tmean15 - mean(tmean15))/sd(tmean15)
qqnorm(tstand,
ylab=expression(paste("Global Temperature Anomalies [ ", degree, "C]")),
xlab="Quantile of N(0,1)",
xlim=c(-3,3),ylim=c(-3,3))
qqline(tstand, col = "red", lwd=2)
grid(nx = NULL, ny = NULL)
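
The same comparison can be built by hand: plot the sorted standardized anomalies against the standard normal quantiles of the plotting positions. A minimal sketch:
#A sketch: hand-made Q-Q plot using plotting positions
n <- length(tstand)
plot(qnorm(ppoints(n)), sort(tstand),
xlab="Quantile of N(0,1)", ylab="Sorted Standardized Anomalies")
abline(0, 1, col="red")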

What is a Probability Distribution?
#plot.new()
## Figure 10.6
par(mar = c(2.5,4,4,1))
layout(matrix(c(1,2,3), 1, 3, byrow = TRUE),
widths=c(3,3,3), heights=c(1,1,1))
lasvegas <- c(0.58,0.42)
sandiego <- c(0.4,0.6)
seattle <- c(0.16,0.84)
names(lasvegas) <- c("Clear","Cloudy")
names(sandiego) <- c("Clear","Cloudy")
names(seattle) <- c("Clear","Cloudy")
barplot(lasvegas,col=c("skyblue","gray"),ylab="Probability")
mtext("Las Vegas", side=3,line=1)
barplot(sandiego,col=c("skyblue","gray"))
mtext("San Diego", side=3,line=1)
barplot(seattle,col=c("skyblue","gray"))
mtext("Seattle", side=3,line=1)
mtext("Probability Distribution of Weather",
cex=1.3,side = 3, line = -1.5, outer = TRUE)

Figure 10.7
# Create data for the area to shade
cord.x <- c(-3, seq(-3,3,0.01), 3) #Close the shaded region at the right edge
cord.y <- c(0, dnorm(seq(-3,3,0.01)), 0)
# Make a curve
curve(dnorm(x,0,1), xlim=c(-3,3), lwd=3,
main='PDF of the Standard Normal Distribution',
xlab='Random Variable x',
ylab='Probability Density')
# Add the shaded area using many lines
polygon(cord.x,cord.y,col='skyblue')
polygon(c(-1.5,-1.5, -1.2, -1.2),
c(0, dnorm(-1.5),dnorm(-1.2), 0.0),col='white')
text(0,0.18, "Area = 1", cex=1.5)
text(-1.65,0.045,"f(x)")
text(-1.35,0.075,"dx")
text(-1.6,0.0095,"x")
text(-0.9,0.0095,"x+dx")
arrows(-2,0.2,-1.35,0.13, length=0.1)
text(-2,0.21,"dA = f(x)dx")
text(0,0.09,expression(paste(integral(f(x)*dx,- infinity,infinity),"=1")))
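
The "Area = 1" label can be verified numerically:
#Numerical check that the standard normal PDF integrates to one
integrate(dnorm, -Inf, Inf) #Should return a value of about 1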

Normal Distribution
#Normal distribution plot
x <- seq(-8, 8, length=200)
plot(x,dnorm(x, mean=0, sd=1), type="l", lwd=4, col="red",
ylim = c(0,1),
xlab="Random Variable x",
ylab ="Probability Density",
main=expression(Normal ~Distribution ~ N(mu,sigma^2)))
grid(nx = NULL, ny = NULL)
lines(x,dnorm(x, mean=0, sd=2), type="l", lwd=2, col="blue")
lines(x,dnorm(x, mean=0, sd=0.6), type="l", lwd=2, col="black")
lines(x,dnorm(x, mean=3, sd=1), type="l", lwd=2, col="purple")
lines(x,dnorm(x, mean=-4, sd=1), type="l", lwd=2, col="green")
ex.cs1 <- expression(paste(mu, "=0", ~"," ~ sigma, "=1"),
paste(mu, "=0", ~"," ~ sigma, "=2"),
paste(mu, "=0", ~"," ~ sigma, "=1/2"),
paste(mu, "=3", ~"," ~ sigma, "=1"),
paste(mu, "=-4", ~"," ~ sigma, "=1"))
legend("topleft",legend = ex.cs1, lty=1,
col=c('red','blue','black','purple','green'), cex=1, bty="n")

mu <- 0
sig <- 1
intg <- function(x){(1/(sig*sqrt(2*pi)))*exp(-(x-mu)^2/(2*sig^2))}
integrate(intg,-2,2)
## 0.9544997 with absolute error < 1.8e-11
#Or using the R built-in function dnorm to get the same result
integrate(dnorm,-2,2)
## 0.9544997 with absolute error < 1.8e-11
integrate(dnorm,-1.96,1.96)
## 0.9500042 with absolute error < 1e-11
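
The same probabilities follow from the cumulative distribution function pnorm() without explicit integration:
#Equivalent probabilities from the normal CDF
pnorm(2) - pnorm(-2) #about 0.9545
pnorm(1.96) - pnorm(-1.96) #about 0.9500
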
Student’s t-Distribution
#Plot t-distribution by R
x <- seq(-4, 4, length=200)
#plot.new()
plot(x,dt(x, df=3), type="l", lwd=4, col="red", ylim = c(0,0.6),
xlab="Random Variable t",
ylab ="Probability Density",
main="Student t-Distribution T(t,df)")
grid(nx = NULL, ny = NULL)
lines(x,dt(x, df=1), type="l", lwd=2, col="blue")
lines(x,dt(x, df=2), type="l", lwd=2, col="black")
lines(x,dt(x, df=6), type="l", lwd=2, col="purple")
lines(x,dt(x, df=Inf), type="l", lwd=2, col="green")
ex.cs1 <- c("df = 3", "df = 1","df = 2","df = 6",expression(paste("df = ",infinity)))
legend("topleft",legend = ex.cs1, lty=1,
col=c('red','blue','black','purple','green'), cex=1, bty="n")
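
The df = Inf curve coincides with the standard normal density, reflecting the large-df limit of the t-distribution; a quick numerical sketch:
#The t-density approaches the standard normal as df grows
max(abs(dt(x, df=Inf) - dnorm(x))) #Zero: df = Inf gives the normal density
max(abs(dt(x, df=100) - dnorm(x))) #Already very small for df = 100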

Probability of a Sample Inside a Confidence Interval
#Confidence interval simulation
mu <- 14 #true mean
sig <- 0.3 #true sd
n <- 50 #sample size
d <- 1.96*sig/sqrt(n)
lowerlim <- mu-d
upperlim <- mu+d
ksim <- 10000 #number of simulations
k <- 0 # simulation counter
xbar <- 1:ksim
for (i in 1:ksim)
{
xbar[i] <- mean(rnorm(n, mean = mu, sd = sig))
if (xbar[i] >= lowerlim & xbar[i] <= upperlim)
k <- k+1
}
print(c(k,ksim))
## [1] 9482 10000
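
The empirical coverage fraction k/ksim should be close to the nominal 95% level:
k/ksim #Empirical coverage, 0.9482 in this simulation
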
#plot the histogram
#plot.new()
hist(xbar,breaks=51,xlab=expression(paste("Temperature [ ", degree, "C]")),
main="Histogram of Simulated Sample Mean Temperatures",
xaxt="n",ylim=c(0,600))
axis(1,pos = -20, at =c(13.92, 14.0, 14.08))
text(14,550,"95% Confidence Interval: (13.92,14.08)",cex=1.2)

#dev.off()
Confidence Interval of the Sample Mean
#Figure of confidence intervals and tail probability
rm(list=ls())
par(mar = c(2.5,3.5,2.0,0.5))
par(mgp=c(1.4,0.5,0))
curve(dnorm(x,0,1), xlim=c(-3,3), lwd=3,
main='Confidence Intervals and Confidence Levels',
xlab="True Mean as a Random Variable",
ylab='Probability Density',xaxt="n", cex.lab=1.3)
polygon(c(-1.96, seq(-1.96,1.96,len=100), 1.96),
c(0,dnorm(seq(-1.96,1.96,len=100)),0),col='skyblue')
polygon(c(-1.0,seq(-1.0, 1, length=100), 1),
c(0, dnorm(seq(-1.0, 1, length=100)), 0.0),col='white')
polygon(c(-3.0,seq(-3.0, -1.96, length=100), -1.96),
c(0, dnorm(seq(-3.0, -1.96, length=100)), 0.0),col='red')
polygon(c(1.96,seq(1.96, 3.0, length=100), 3.0),
c(0, dnorm(seq(1.96, 3.0, length=100)), 0.0),col='red')
points(c(-1,1), c(0,0), pch=19, col="blue")
points(0,0, pch=19)
points(c(-1.96,1.96),c(0,0),pch=19, col="red")
text(0,0.02, expression(bar(x)), cex=1.0)
text(-1.50,0.02, "SE", cex=1.0)
text(-0.60,0.02, "SE", cex=1.0)
text(1.50,0.02, "SE", cex=1.0)
text(0.60,0.02, "SE", cex=1.0)
text(0,0.2, "Probability = 0.68")
arrows(-2.8,0.06,-2.35,0.01, length=0.1)
text(-2.5,0.09, "Probability")
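
The "Probability = 0.68" label is the standard normal probability within one standard error of the sample mean, which can be checked directly:
#Probability within one standard error of the sample mean
pnorm(1) - pnorm(-1) #about 0.6827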

Example 10.1
#Estimate the mean and error bar for a large sample
#Confidence interval for NOAAGlobalTemp 1880-2015
#setwd("~/sshen/mathmodel")
dat1 <- read.table("~/Desktop/RMathModel/data/aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
dim(dat1)
## [1] 136 6
tmean15 <- dat1[,2]
MeanEst <- mean(tmean15)
sd1 <- sd(tmean15)
StandErr <- sd1/sqrt(length(tmean15))
ErrorMar <- 1.96*StandErr
MeanEst
## [1] -0.2034367
print(c(MeanEst-ErrorMar, MeanEst+ErrorMar))
## [1] -0.2545055 -0.1523680
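
Because n = 136 is large, the z-based margin of error 1.96*StandErr is nearly identical to the exact t-based interval; a brief sketch of the comparison using R's one-sample t.test():
#A sketch: exact t-interval, nearly the same as the z-based interval for n = 136
t.test(tmean15)$conf.int
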
Statistical Inference for x-bar Using a Z-score
rm(list = ls())
par(mar = c(2.3,3.0,2.0,0.5))
par(mgp = c(1.0,0.5,0))
curve(dnorm(x,0,1), xlim = c(-3,3), lwd = 3,
main = 'Z-score, p-value, and Significance Level',
xlab = "z: Standard Normal Random Variable",
ylab = 'Probability Density',xaxt = "n",yaxt = "n",
cex.lab = 1.2, cex.axis = 1.1, cex.main = 1.35, ylim = c(-0.1,0.4))
lines(c(-3,3),c(0,0))
arrows(-3,-0.1,-2.02,-0.1, lwd = 12,col = 'skyblue', length = 0.2, code = 3)
arrows(3,-0.1,-1.90,-0.1, lwd = 12,col = 'green', length = 0.2, code = 3)
polygon(c(-3.0,seq(-3.0, -2.5, length = 100), -2.5),
c(0, dnorm(seq(-3.0, -2.5, length = 100)), 0.0),col = 'skyblue')
polygon(c(-1.96,seq(-1.96, 3, length = 100), 3),
c(0, dnorm(seq(-1.96, 3, length = 100)), 0.0),col = 'lightgreen')
points(-1.96,0, pch = 19, col = "red")
points(-2.5,0,pch = 19, col = "skyblue")
text(-1.4,-0.02, expression(z[0.025]~' = -1.96'), cex = 1.1, col = 'red')
text(-2.40,-0.02, "z-score", cex = 1.1, col = 'skyblue')
arrows(-2.8,0.06,-2.6,0.003, length = 0.1)
lines(c(-1.96,-1.96),c(-0.1, .4),lwd = 1.5, col = 'red')
text(-2.5,0.09, "p-value", cex = 1.3)
text(1.0,-0.06, expression(H[0] ~'region'), cex = 1.1)
text(-2.5,-0.06, expression(H[1] ~'region'), cex = 1.1)
text(0,0.15, expression(H[0] ~'probability 0.975'), cex = 1.1)
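
For a concrete z-score, the one-sided p-value in the figure is a lower-tail normal probability; a minimal sketch, where -2.5 is only an illustrative z-score matching the figure:
#One-sided p-value for an illustrative z-score of -2.5
pnorm(-2.5) #about 0.0062, below 0.025, so H0 would be rejected
qnorm(0.025) #The critical value -1.96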

Example 10.3
#Hypothesis test for NOAAGlobalTemp 2006-2015
#setwd("~/sshen/mathmodel")
dat1 <- read.table("~/Desktop/RMathModel/data/aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
tm0615 <- dat1[127:136,2] #Rows 127:136 are the years 2006-2015
MeanEst <- mean(tm0615)
MeanEst
## [1] 0.4107391
sd1 <- sd(tm0615)
sd1
## [1] 0.1023293
n <- 10
t_score <- (MeanEst -0)/(sd1/sqrt(n))
t_score
## [1] 12.69306
1-pt(t_score, df=n-1) #p-value
## [1] 2.383058e-07
qt(1-0.025, df=n-1) #critical t-score
## [1] 2.262157
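
The built-in one-sample t.test() with a one-sided alternative should reproduce the t-score and p-value above; a brief sketch:
#A sketch: one-sample, one-sided t-test of the 2006-2015 mean against zero
t.test(tm0615, mu=0, alternative="greater")
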
Example 10.4
#Hypothesis test for global temp for 1981-1990 and 1991-2000
#setwd("~/sshen/mathmodel")
dat1 <- read.table("~/Desktop/RMathModel/data/aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
tm8190 <- dat1[102:111,2] #Rows 102:111 are the years 1981-1990
tm9100 <- dat1[112:121,2] #Rows 112:121 are the years 1991-2000
barT1 <- mean(tm8190)
barT2 <- mean(tm9100)
S1sd <- sd(tm8190)
S2sd <- sd(tm9100)
n1 <- n2 <- 10
Spool <- sqrt(((n1 - 1)*S1sd^2 + (n2 - 1)*S2sd^2)/(n1 + n2 -2))
t <- (barT2 - barT1)/(Spool*sqrt(1/n1 + 1/n2))
tlow <- qt(0.025, df= n1 + n2 -2)
tup <- qt(0.975, df= n1 + n2 -2)
paste("t-score=", round(t,digits=5),
"tlow=", round(tlow,digits=5),
"tup=", round(tup,digits=5))
## [1] "t-score= 2.57836 tlow= -2.10092 tup= 2.10092"
pvalue <- 1-pt(t, df= n1 + n2 -2)
paste( "p-value=", pvalue)
## [1] "p-value= 0.00947040009284539"
paste("1981-90 temp=", barT1, "1991-00 temp=",barT2)
## [1] "1981-90 temp= 0.0368621 1991-00 temp= 0.1612545"
barT2 - barT1
## [1] 0.1243924
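
The pooled two-sample t.test() with equal variances should reproduce the t-score and the one-sided p-value above; a brief sketch:
#A sketch: pooled two-sample, one-sided t-test of 1991-2000 vs 1981-1990
t.test(tm9100, tm8190, var.equal=TRUE, alternative="greater")
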
Statistical Inference of Linear Trend
#setwd("~/sshen/mathmodel")
dat1 <- read.table("~/Desktop/RMathModel/data/aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
tm <- dat1[,2]
x <- 1880:2015
summary(lm(tm ~ x))
##
## Call:
## lm(formula = tm ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31028 -0.09609 -0.01724 0.11583 0.40289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.321e+01 6.489e-01 -20.36 <2e-16 ***
## x 6.678e-03 3.331e-04 20.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1525 on 134 degrees of freedom
## Multiple R-squared: 0.7499, Adjusted R-squared: 0.7481
## F-statistic: 401.9 on 1 and 134 DF, p-value: < 2.2e-16
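
A confidence interval for the trend follows from confint(); multiplying by 100 expresses it in degrees Celsius per century. A minimal sketch:
#95% confidence interval of the trend, converted to deg C per century
confint(lm(tm ~ x))[2, ]*100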