kiama<-c(83,51,87,60,28,95,8,27,15,10, 18,16,29,54,91,8,17,55,10,35)
summary(kiama)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 15.75 28.50 39.85 56.25 95.00
hist(kiama)
hist(kiama, probability=T)
boxplot(kiama)
plot(kiama)
firearm=data.frame(Year=c(1983:1997), Rate=c(4.31, 4.42, 4.52, 4.35,4.39,4.21, 3.40,3.61, 3.67, 3.61,2.98,2.95, 2.72,2.96,2.3))
attach(firearm)
plot(Year, Rate)
Yes, using a linear regression model would be appropriate as we see a relatively linear relationship between Year and Rate.
model = lm(Rate~Year)
summary(model)
##
## Call:
## lm(formula = Rate ~ Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38142 -0.16824 -0.01667 0.22071 0.30701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 306.3199 29.8444 10.26 1.33e-07 ***
## Year -0.1521 0.0150 -10.14 1.53e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.251 on 13 degrees of freedom
## Multiple R-squared: 0.8878, Adjusted R-squared: 0.8792
## F-statistic: 102.9 on 1 and 13 DF, p-value: 1.527e-07
We note that the t-value of the Year coefficient is -10.14. Conducting a t-test at the 5% significance level: since the absolute t-value (10.14) far exceeds the two-sided 5% critical value of the t-distribution with 13 degrees of freedom (about 2.16), we have sufficient evidence to reject the null hypothesis \(H_0: \beta_{Year} = 0\) at the 5% significance level.
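As a quick check (not part of the original output), the critical value and two-sided p-value can be computed directly from the t-distribution with 13 degrees of freedom; the p-value matches the 1.53e-07 reported in the summary.
# critical value and two-sided p-value for the t-test on the Year coefficient
qt(0.975, df=13)
2 * pt(-10.14, df=13)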
par(mfrow=c(2,2))
plot(model)
The residuals vs fitted diagnostic plot shows a U-shape in the residuals, suggesting the linearity assumption is violated (some non-linear trend remains). The Normal Q-Q plot also suggests a departure from the normality assumption.
EyeColour<-factor(rep(c("Brown","Green","Blue"),times=c(8,5,6)))
Flicker<-c(26.8,27.9,23.7,25.0,26.3,24.8,25.7, 24.5,26.4,24.2,28.0,26.9,29.1,25.7,27.2,29.9,28.5,29.4,28.3)
Flicker.df<-data.frame(EyeColour=EyeColour,Flicker=Flicker)
plot.design(Flicker.df)
plot.design(Flicker.df,fun=median)
plot(Flicker.df)
Flicker.aov<-aov(Flicker ~ EyeColour, data=Flicker.df)
summary(Flicker.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## EyeColour 2 23.00 11.499 4.802 0.0232 *
## Residuals 16 38.31 2.394
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
age<-c(39,42,20,37,20,21,41,52)
signs<-c(0,1,0,1,1,0,1,1)
# Fit a logistic regression with sign as response and age as predictor
model = glm(signs ~ age, family=binomial)
summary(model)
##
## Call:
## glm(formula = signs ~ age, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7112 -0.8686 0.5063 0.6939 1.5336
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.92086 2.64655 -1.104 0.270
## age 0.10569 0.08037 1.315 0.188
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10.5850 on 7 degrees of freedom
## Residual deviance: 8.4345 on 6 degrees of freedom
## AIC: 12.435
##
## Number of Fisher Scoring iterations: 4
Perform a Wald Test on Age.
# Wald test statistic: bj / se(bj)
0.10569 / 0.08037
## [1] 1.315043
In the Wald test, the null hypothesis is \(H_0: \beta_{age} = 0\) with alternative hypothesis \(H_1: \beta_{age}\ne 0\). Since the test statistic (1.315) is less than the upper 2.5% point of the standard normal distribution (1.96), we do not have sufficient evidence to reject the null hypothesis at the 5% significance level. So, the Wald test does not indicate a significant effect of Age.
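As a check, the corresponding two-sided p-value can be computed directly; it matches the 0.188 reported in the summary above.
# two-sided p-value for the Wald statistic on age
2 * (1 - pnorm(1.315))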
Consider the reduction in the deviance when covariate Age is added to the model:
# calculate upper 5% point of a chi-squared random variable:
qchisq(0.95, 1)
## [1] 3.841459
anova(model)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: signs
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 7 10.5850
## age 1 2.1505 6 8.4345
The reduction in deviance is 10.585 - 8.4345 = 2.1505. Since this is below the upper 5% point of a chi-squared distribution with 1 degree of freedom (3.841), we do not have sufficient evidence to reject the null hypothesis, and the age effect is not significant at the 5% level. This conclusion agrees with the Wald test.
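For reference, the p-value associated with this reduction in deviance can be computed from the chi-squared distribution with 1 degree of freedom; it is well above 0.05, consistent with the conclusion above.
# p-value for a deviance reduction of 2.1505 on 1 df
1 - pchisq(10.5850 - 8.4345, df=1)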
titanic=read.table("data/titanic.txt", header=T)
attach(titanic)
tapply(Survived, Sex, mean)
## female male
## 0.6666667 0.1668625
tapply(Survived, PClass, mean)
## 1st 2nd 3rd
## 0.5993789 0.4250000 0.1940928
model = glm(Survived ~ PClass + Sex, family=binomial, na.action=na.exclude)
summary(model)
##
## Call:
## glm(formula = Survived ~ PClass + Sex, family = binomial, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0414 -0.6992 -0.3908 0.5156 2.2848
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9507 0.1718 11.356 < 2e-16 ***
## PClass2nd -0.7894 0.1954 -4.039 5.36e-05 ***
## PClass3rd -2.0391 0.1774 -11.496 < 2e-16 ***
## Sexmale -2.4454 0.1512 -16.177 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1688.1 on 1312 degrees of freedom
## Residual deviance: 1199.0 on 1309 degrees of freedom
## AIC: 1207
##
## Number of Fisher Scoring iterations: 5
model = glm(Survived ~ PClass + Sex + PClass * Sex, family=binomial, na.action=na.exclude)
summary(model)
##
## Call:
## glm(formula = Survived ~ PClass + Sex + PClass * Sex, family = binomial,
## na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3519 -0.5587 -0.4971 0.3606 2.0747
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7006 0.3443 7.843 4.41e-15 ***
## PClass2nd -0.7223 0.4540 -1.591 0.112
## PClass3rd -3.2014 0.3724 -8.598 < 2e-16 ***
## Sexmale -3.4106 0.3793 -8.992 < 2e-16 ***
## PClass2nd:Sexmale -0.3461 0.5274 -0.656 0.512
## PClass3rd:Sexmale 1.8827 0.4283 4.396 1.10e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1688.1 on 1312 degrees of freedom
## Residual deviance: 1155.9 on 1307 degrees of freedom
## AIC: 1167.9
##
## Number of Fisher Scoring iterations: 5
For logistic regression, a one-unit change in the jth predictor variable changes the log-odds of \(y_i = 1\) by \(\beta_j\). Consider the predictor Sexmale in the model without interaction, where \(\beta_{Sexmale} = -2.4454\): the log-odds of \(y_i = 1\) decrease by 2.4454 for a male passenger, holding all else equal. In the model with interaction, the interaction terms allow the effect of sex to differ across passenger classes. For example, for a third-class passenger, being male changes the log-odds of \(y_i = 1\) by \(\beta_{\text{Sexmale}} + \beta_{\text{PClass3rd:Sexmale}}\), while for a second-class passenger the change is \(\beta_{\text{Sexmale}} + \beta_{\text{PClass2nd:Sexmale}}\).
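Equivalently, exponentiating a coefficient gives the multiplicative change in the odds; a quick illustration using the Sexmale coefficient from the model without interaction:
# odds ratio for males vs females in the same class (model without interaction)
exp(-2.4454) # roughly 0.087, i.e. a male's odds of survival are about 9% of a female's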
dose<-c(1.6907,1.7242,1.7552,1.7842, 1.8113,1.8369,1.8610,1.8839)
exposed<-c(59,60,62,56,63,59,62,60)
mortality<-c(6,13,18,28,52,53,61,60)
response<-mortality/exposed
Plot \(\log{\left(\frac{\text{response}}{1 - \text{response}}\right)}\) against Dose. Do you think a logistic regression model with just Dose as a predictor seems reasonable?
plot(dose, log(response/(1-response)))
Since Dose seems to have a linear relationship to the log odds of the response, a logistic regression with just Dose may be reasonable.
Fit a logistic regression model including just Dose as a predictor, and fit a logistic regression model including Dose and Dose\(^2\) as predictors. Is the reduction in deviance when the quadratic term is added to the model significant?
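A minimal sketch of the first of these fits, using only the linear dose term; its residual deviance corresponds to the 11.232 on 6 degrees of freedom in the analysis-of-deviance table further below. The fit including the quadratic term follows.
# logistic regression with just the linear dose term
model.lin = glm(response ~ dose, family=binomial, weights=exposed)
deviance(model.lin)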
model = glm(response ~ dose + I(dose^2), family=binomial, weights=exposed)
summary(model)
##
## Call:
## glm(formula = response ~ dose + I(dose^2), family = binomial,
## weights = exposed)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7 8
## -0.4215 0.8189 -0.2769 -0.5232 0.8746 -0.8249 0.1698 0.7224
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 431.11 180.65 2.386 0.01702 *
## dose -520.62 204.52 -2.546 0.01091 *
## I(dose^2) 156.41 57.86 2.703 0.00687 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 284.2024 on 7 degrees of freedom
## Residual deviance: 3.1949 on 5 degrees of freedom
## AIC: 35.393
##
## Number of Fisher Scoring iterations: 4
# Wald test statistic for quadratic dose:
156.41/57.86
## [1] 2.703249
Since the test statistic (2.703) exceeds the upper 2.5% point of the standard normal distribution (1.96), we have sufficient evidence to reject the null hypothesis at the 5% significance level. Thus, we conclude that the quadratic term in dose is needed.
Performing a reduction in deviance test:
anova(model)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: response
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 7 284.202
## dose 1 272.970 6 11.232
## I(dose^2) 1 8.037 5 3.195
qchisq(0.95, 1)
## [1] 3.841459
11.232 - 3.195
## [1] 8.037
The reduction in deviance when the quadratic term is added to the model with just the linear term is 8.037, which is greater than the upper 5% point of a chi-squared distribution with 1 degree of freedom (3.841). We therefore have sufficient evidence to reject the null hypothesis that the quadratic coefficient is zero, and conclude that the quadratic term for dose is needed.
Consider the titanic data set. We considered modelling the variable Survived as a function of Age.
titanic = read.table('data/titanic.txt', header=T)
attach(titanic)
## The following objects are masked from titanic (pos = 3):
##
## Age, Name, PClass, Sex, Survived
After fitting a logistic regression model with Survived as the response and Age as predictor (you will need to remove missing values by including the na.action=na.exclude argument), extract the Pearson and deviance residuals.
model = glm(Survived ~ Age, family=binomial, na.action=na.exclude)
summary(model)
##
## Call:
## glm(formula = Survived ~ Age, family = binomial, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1418 -1.0489 -0.9792 1.3039 1.4801
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.081428 0.173862 -0.468 0.6395
## Age -0.008795 0.005232 -1.681 0.0928 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1025.6 on 755 degrees of freedom
## Residual deviance: 1022.7 on 754 degrees of freedom
## (557 observations deleted due to missingness)
## AIC: 1026.7
##
## Number of Fisher Scoring iterations: 4
(Note: after extracting the Pearson residuals using the residuals function, you will need to remove missing values in the vector of Pearson residuals. If pr is the vector of Pearson residuals, pr[!is.na(pr)] is the vector of non-missing values in pr).
pr = residuals(model, type='pearson')
pr = pr[!is.na(pr)]
Plot the Pearson residuals against the fitted values and superimpose a loess smooth on this plot. To do this, type lines(scatter.smooth(titanic.glm$fit,pr)). Repeat this procedure for the deviance residuals. Do you think a logistic regression model is appropriate?
plot(model$fitted.values, pr)
lines(loess.smooth(model$fitted.values, pr))
dr = residuals(model, type='deviance')
dr = dr[!is.na(dr)]
plot(model$fitted.values, dr)
lines(loess.smooth(model$fitted.values, dr))
rain.df=read.table("data/rain.txt", header=T)
attach(rain.df)
head(rain.df)
## rainfall positive ntested
## 1 1.735 0.5 4
## 2 1.936 0.3 10
## 3 2.000 0.2 5
## 4 1.973 0.3 10
## 5 1.750 1.0 2
## 6 1.800 0.6 5
plot(rainfall, log(positive/(1-positive)), ylim=c(-2,2), xlim=c(1.5,2.5))
model = glm(positive ~ rainfall + I(rainfall^2) + I(rainfall^3) + I(rainfall^4), family=binomial, weights=ntested)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
anova(model)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: positive
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 33 74.185
## rainfall 1 0.1227 32 74.062
## I(rainfall^2) 1 0.0000 31 74.062
## I(rainfall^3) 1 11.4572 30 62.605
## I(rainfall^4) 1 0.1810 29 62.424
qchisq(0.95, 1)
## [1] 3.841459
Adding the cubic term to the model containing the linear and quadratic terms reduces the deviance by 11.457 (from 74.062 to 62.605). Since this is greater than the upper 5% point of a chi-squared distribution with 1 degree of freedom (3.841), we have sufficient evidence to reject the null hypothesis that the cubic coefficient is zero. We conclude that the cubic term is significant and that an appropriate model contains the linear, quadratic, and cubic terms. Note that the reductions from the quadratic term (essentially 0) and the quartic term (0.181) are not significant.
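As a sketch, the model suggested by this analysis (terms up to the cubic, dropping the quartic) could be refitted as follows; the same non-integer-successes warning would appear, since positive is a proportion rather than a count.
# refit with the linear, quadratic and cubic rainfall terms only
model3 = glm(positive ~ rainfall + I(rainfall^2) + I(rainfall^3), family=binomial, weights=ntested)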
downer=read.table("data/downer.txt", header=T)
attach(downer)
# plot CK's density in terms of the responses
plot(density(CK[Outcome==1], na.rm=T), xlab='CK')
lines(density(CK[Outcome==0], na.rm=T), lty=2, xlab='CK')
# plot the density of the log of CK in terms of the responses
plot(density(log(CK[Outcome==1]), na.rm=T), xlab='CK', xlim=c(0,13))
lines(density(log(CK[Outcome==0]), na.rm=T), lty=2, xlab='CK', xlim=c(0,13))
Now that we've seen that the log-transformation makes the densities roughly unimodal and normal, we can fit the model and perform a reduction-in-deviance test on the quadratic term too.
model = glm(Outcome ~ log(CK) + I(log(CK)^2), family=binomial)
anova(model)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Outcome
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 412 550.49
## log(CK) 1 75.315 411 475.18
## I(log(CK)^2) 1 9.263 410 465.91
It looks like the quadratic term is also significant.
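As an optional visual check (a sketch, not part of the original solution, assuming all CK values are positive), the fitted probability of survival can be plotted against log(CK):
# plot the fitted probability curve over the observed range of CK
ck.grid = seq(min(CK, na.rm=T), max(CK, na.rm=T), length=200)
p.hat = predict(model, newdata=data.frame(CK=ck.grid), type="response")
plot(log(ck.grid), p.hat, type="l", xlab="log(CK)", ylab="Fitted P(Outcome = 1)")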
mode1<-c(33.3,52.2,64.7,137,125.9,116.3,131.7,85,91.9)
mode2<-c(25.3,14.4,32.5,20.5,97.6,53.6,56.6,87.3,47.8)
failures<-c(15,9,14,24,27,27,23,18,22)
equipment<-data.frame(mode1=mode1,mode2=mode2,failures=failures)
Fit the Poisson regression model discussed in lectures to the number of failures: that is, fit a model where the mean number of failures for the \(i\)th device is \[\lambda_i=\beta_1 T_{i1}+\beta_2 T_{i2}\] where \(T_{i1}\) is the time that the \(i\)th device spends in operating mode 1 and \(T_{i2}\) is the time that the \(i\)th device spends in operating mode 2.
model = glm(failures ~ mode1 + mode2 - 1, family=poisson(link=identity), data=equipment)
summary(model)
##
## Call:
## glm(formula = failures ~ mode1 + mode2 - 1, family = poisson(link = identity),
## data = equipment)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8920 -0.5212 -0.1373 0.5240 2.2700
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## mode1 0.16661 0.03486 4.779 1.76e-06 ***
## mode2 0.09040 0.06301 1.435 0.151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: Inf on 9 degrees of freedom
## Residual deviance: 7.5716 on 7 degrees of freedom
## AIC: 54.628
##
## Number of Fisher Scoring iterations: 5
Test if \(\beta_2\) is significantly different from zero using both a Wald test and a test based on the deviance.
Looking at the Wald test, where the null hypothesis is \(H_0: \beta_2 = 0\) and the alternative is \(H_1: \beta_2 \ne 0\), the test statistic is 1.435. Since this is less than 1.96, the upper 2.5% point of the standard normal distribution, we do not have sufficient evidence to reject the null hypothesis; mode2 is not a significant predictor at the 5% level.
Now for the reduction in deviance test.
qchisq(0.95, 1)
## [1] 3.841459
anova(model)
## Analysis of Deviance Table
##
## Model: poisson, link: identity
##
## Response: failures
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 9 Inf
## mode1 1 Inf 8 10
## mode2 1 2 7 8
We see that the reduction in deviance from adding mode2 to the model with just mode1 is 2. Since this is less than the upper 5% point of a chi-squared distribution with 1 degree of freedom (3.841), we do not have sufficient evidence to reject the null hypothesis. This result agrees with the Wald test.
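As an additional informal check (not required by the exercise), the observed failure counts can be compared with the fitted means \(\hat\lambda_i=\hat\beta_1 T_{i1}+\hat\beta_2 T_{i2}\):
# observed vs fitted numbers of failures under the identity-link Poisson model
cbind(observed=failures, fitted=round(fitted(model), 1))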
photo<-c(56,38,25,48,38,22,22,42,34,14)
observer<-c(50,25,30,35,25,20,12,34,20,10)
geese<-data.frame(photo=photo,observer=observer)
Fit a Poisson regression model with identity link to these data with the photo count as the response and the observer count as the predictor. Do you think the observer count is reliable? Support your answer by considering appropriate hypothesis tests.
model = glm(data=geese, formula=photo ~ observer, family=poisson(link="identity"))
summary(model)
##
## Call:
## glm(formula = photo ~ observer, family = poisson(link = "identity"),
## data = geese)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2274 -0.7366 0.2709 0.8565 1.1508
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.4732 4.0870 1.829 0.0675 .
## observer 1.0125 0.1652 6.128 8.92e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 47.118 on 9 degrees of freedom
## Residual deviance: 10.862 on 8 degrees of freedom
## AIC: 67.774
##
## Number of Fisher Scoring iterations: 4
If the observer count is reliable, \(\beta_1\) should be 1 and \(\beta_0\) should be 0. Therefore the null hypotheses are \(H_0: \beta_0 = 0\) and \(H_0: \beta_1 = 1\), with alternatives \(H_1: \beta_0 \ne 0\) and \(H_1: \beta_1 \ne 1\).
The Wald test statistic for the intercept (1.829) indicates that it is not significantly different from 0 at the 5% level. For the observer slope, the Wald test statistic for \(H_0: \beta_1 = 1\) is \((\hat{\beta}_1 - 1)/\hat{se}(\hat{\beta}_1)\).
(1.0125 - 1)/0.1652
## [1] 0.07566586
Since this value is far less than 1.96, we do not have evidence to reject the hypothesis that \(\beta_1 = 1\). We conclude that the observer count is a reliable estimate of the photo count.
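Equivalently, an approximate 95% Wald confidence interval for \(\beta_1\) contains 1, supporting the same conclusion:
# approximate 95% confidence interval for the observer slope
1.0125 + c(-1.96, 1.96) * 0.1652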
Do a scatterplot of ozone against radiation and superimpose on the scatterplot a loess smooth, a kernel smooth and a cubic smoothing spline.
(You can get different line types for the different smooths by using the lty argument of lines: lty=1 gives a solid line, lty=2 gives dashes, lty=3 gives dots, etc.) You may need to change the default level of smoothing for the kernel and loess smooths, although for the cubic smoothing spline the default choice of smoothing parameter is usually sensible.
air=read.table("data/air.txt", header=T)
attach(air)
plot(radiation, ozone)
lines(loess.smooth(radiation, ozone, span=0.25), lty=1)
lines(ksmooth(radiation, ozone, kernel='normal', bandwidth=20), lty=2)
lines(smooth.spline(radiation, ozone), lty=3)
legend('topleft', lty=c(1,2,3), legend=c('loess', 'kernel', 'cubic'))
The ethanol data frame contains 88 measurements generated in an experiment in which ethanol was burned in a single cylinder automobile test engine. The variables are NOx (a measure of the nitrogen oxides in the exhaust), C (compression ratio of an engine) and E (a measure of the richness of the air/ethanol mix used).
ethanol=read.table("data/ethanol.dat", header=T)
attach(ethanol)
A. Do a scatterplot of NOx against E. After looking at the scatterplot you might suggest a polynomial regression model to capture the relationship between the response NOx and the predictor E. Fit a quadratic regression model and superimpose the fitted curve on the scatterplot.
B. Superimpose on the scatterplot in a) the cubic smoothing spline fit to the data. Do you think the cubic smoothing spline captures structure in the data that is missed by the quadratic regression model?
plot(E, NOx)
model = glm(NOx ~ E + I(E^2), family=gaussian)
temp = data.frame(x=E, y=model$fitted.values)
temp =temp[order(temp$x),]
lines(x=temp$x, y=temp$y)
lines(smooth.spline(x=E, y=NOx), lty=2)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-35. For overview type 'help("mgcv-package")'.
air<-read.table("data/air.txt", header=T)
attach(air)
## The following objects are masked from air (pos = 6):
##
## ozone, radiation, temperature, wind
Fit an additive model to the data with ozone as the response and temperature and radiation as predictors and plot the smooth terms in the model as well as the fitted surface.
model = gam(ozone ~ s(temperature)+s(radiation))
plot(model)
summary(air)
## ozone radiation temperature wind
## Min. :1.000 Min. : 7.0 Min. :57.00 Min. : 2.300
## 1st Qu.:2.621 1st Qu.:113.5 1st Qu.:71.00 1st Qu.: 7.400
## Median :3.141 Median :207.0 Median :79.00 Median : 9.700
## Mean :3.248 Mean :184.8 Mean :77.79 Mean : 9.939
## 3rd Qu.:3.958 3rd Qu.:255.5 3rd Qu.:84.50 3rd Qu.:11.500
## Max. :5.518 Max. :334.0 Max. :97.00 Max. :20.700
nrow(air)
## [1] 111
n = 50
par(mfrow=c(2,2))
plot(model)
grid = list(temperature=seq(56, 97, length=n),
radiation=seq(7,334, length=n))
pr = predict.gam(model, newdata=expand.grid(grid))
pr.mat = matrix(pr, nrow=n, ncol=n)
persp(x=grid$temperature, y=grid$radiation, pr.mat,
xlab="temperature", ylab="radiation",zlab="ozone",
theta=-45, phi=25, d=2.0, tick="detailed")
persp(grid$temperature ,grid$radiation, pr.mat,
xlab="temperature",ylab="radiation", zlab="ozone",
theta=40, phi=15,d=2.0, tick="detailed")
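mgcv also provides vis.gam, which can draw the fitted surface directly from the fitted gam object; a minimal sketch (the viewing angles here are illustrative):
# perspective plot of the fitted surface using mgcv's built-in function
vis.gam(model, view=c("temperature", "radiation"), theta=-45, phi=25)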
model = lm(NOx ~ E + C, data=ethanol)
summary(model)
##
## Call:
## lm(formula = NOx ~ E + C, data = ethanol)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7693 -0.9496 -0.2541 1.0381 2.0959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.559101 0.662396 3.863 0.000218 ***
## E -0.557137 0.601464 -0.926 0.356912
## C -0.007109 0.031135 -0.228 0.819941
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.14 on 85 degrees of freedom
## Multiple R-squared: 0.01095, Adjusted R-squared: -0.01232
## F-statistic: 0.4707 on 2 and 85 DF, p-value: 0.6262
The partial t-statistics are well below the two-sided 5% critical value (about 1.99 with 85 degrees of freedom), so we do not have evidence to reject the null hypotheses that the coefficients of E and C are zero. Similarly, the F-statistic is 0.4707 with a p-value of 0.63, so we cannot reject the null hypothesis that both slope coefficients are zero. We conclude that a linear model in E and C is not helpful for explaining the variation in NOx.
par(mfrow=c(2,2))
plot(model)
The diagnostic plots show violations of the normality assumption (curvature in the Normal Q-Q plot), the constant variance assumption, and the linearity assumption (a U-shape in the residuals vs fitted plot). I would suggest transforming the data, for example with a log-transformation, to better satisfy the assumptions.
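A minimal sketch of that suggestion, refitting with log(NOx) as the response and re-examining the diagnostics (assuming all NOx values are positive):
# linear model for the log-transformed response
model.log = lm(log(NOx) ~ E + C, data=ethanol)
par(mfrow=c(2,2))
plot(model.log)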
model = gam(NOx ~ s(E) + s(C, k=5), data=ethanol)
plot(model)
gascons=read.table("data/gascons.dat", header=T)
attach(gascons)
## The following object is masked from firearm:
##
## Year
Standard economic theory states that there should be an inverse relationship between demand for a product and its price.
plot(Pricegas,Consump)
lines(loess.smooth(Pricegas,Consump))
As noted above, economic theory predicts that demand should fall as price rises. However, the scatterplot shows consumption apparently increasing with the price of gas, contrary to that prediction.
gas.gam<-gam(Consump ~ s(Pricegas, k = 3)+s(PCIncome, k = 3)+s(Prcusecr, k = 3), bf.maxit=50)
summary(gas.gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Consump ~ s(Pricegas, k = 3) + s(PCIncome, k = 3) + s(Prcusecr,
## k = 3)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.0333 0.9597 215.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Pricegas) 1.000 1.00 34.46 6.9e-06 ***
## s(PCIncome) 1.000 1.00 479.15 < 2e-16 ***
## s(Prcusecr) 1.902 1.99 9.77 0.0011 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.987 Deviance explained = 98.9%
## GCV = 30.385 Scale est. = 24.868 n = 27
plot(gas.gam, se=T)
Looking at the plot of the smooth term for the price of gas, after adjusting for income and the price index for used cars, we see that there is in fact an inverse relationship between the price of gas and consumption.
The Swiss bank notes data swissmon discussed in lectures are part of a larger set, which includes both real notes and forged notes and also includes measurement of the image diagonal length of the bill.
The data frame swissmon contains the columns Bottforg and Diagforg (the bottom margin and diagonal length of the forged notes) and Bottreal and Diagreal (the same measurements for the real notes).
Use the density command to produce density estimates for each of the variables.
swissmon=read.table("data/swissmon.txt", header=T)
attach(swissmon)
par(mfrow=c(2,2))
plot(density(Bottforg, width="sj"), xlab="Bottforg", ylab="Density")
plot(density(Diagforg, width="sj"), xlab="Diagforg", ylab="Density")
plot(density(Bottreal, width="sj"), xlab="Bottreal", ylab="Density")
plot(density(Diagreal, width="sj"), xlab="Diagreal", ylab="Density")
No
Yes: the bottom-margin variables are more right-skewed, while the diagonal variables are more left-skewed. There also appears to be some multimodality in the forged bottom margins, whereas both the real and forged diagonals look unimodal.
par(mfrow=c(2,2))
plot(density(Bottforg, kernel="gaussian", bw=1), xlab="Bottforg", ylab="Density")
plot(density(Bottforg, kernel="gaussian", bw=0.5), xlab="Bottforg", ylab="Density")
plot(density(Bottforg, kernel="gaussian", bw=0.2), xlab="Bottforg", ylab="Density")
plot(density(Bottforg, kernel="gaussian", bw=0.1), xlab="Bottforg", ylab="Density")
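For comparison with these fixed bandwidths, the Sheather-Jones bandwidth that width="sj" selects can be computed directly:
# Sheather-Jones bandwidth for Bottforg
bw.SJ(Bottforg)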
Construct a variability plot for the variable Bottforg in the counterfeit Swiss bank notes data set.
m = 200 # number of resamples
np = 50 # number of points on the density plot
dvals = matrix(nrow=m, ncol=np) # matrix to hold samples
for (i in 1:m) {
bottforg.s = sample(Bottforg, replace=T)
bottforg.density = density(bottforg.s, from=min(Bottforg), to=max(Bottforg), n=np)
dvals[i, ] = bottforg.density$y
}
# get lower and upper bounds of confidence intervals
lower = apply(dvals, 2, quantile, probs=0.025)
upper = apply(dvals, 2, quantile, probs=0.975)
min(lower)
## [1] 0.005682548
max(upper)
## [1] 0.4432041
# plot
xaxis<-seq(from=min(Bottforg),to=max(Bottforg),length=50)
plot(xaxis, upper, type="l", lty=2, xlab="Bottforg", ylab="Density", main="Variability Plot of Bottforg")
lines(xaxis,lower,lty=2)
lines(density(Bottforg,from=min(Bottforg),to=max(Bottforg)),lty=1)
Obtain the data frames chondrit, galaxy and quake.
The galaxy data frame contains the velocity of galaxies data we discussed in lectures.
The data frame quake contains details of 2178 earthquakes: we are interested in the variable Depth in this data frame.
The data frame chondrit contains a single variable Silica which is the percentage of silica in chondrite meteors.
The data frame chondrit is a small data set (22 observations), the data frame galaxy is a moderately sized data set (82 observations), and the data frame quake is a large data set (2178 observations).
chondrit = read.table("data/chondrit.dat", header=T)
attach(chondrit)
galaxy = read.table("data/galaxy.txt", header=T)
attach(galaxy)
quake = read.table("data/quake.txt", header=T)
attach(quake)
par(mfrow=c(2,3))
plot(density(Silica, width="sj"), main="Silica Density | SJ")
plot(density(Velocity, width="sj"), main="Velocity Density | SJ")
plot(density(Depth, width="sj"), main="Depth Density | SJ")
plot(density(Silica, width="nrd"), main="Silica Density | NRD")
plot(density(Velocity, width="nrd"), main="Velocity Density | NRD")
plot(density(Depth, width="nrd"), main="Depth Density | NRD")
The Sheather-Jones method seems to be more sensitive to multimodality than the normal reference density method for all sample sizes. For example, it captures the multimodality in the smaller samples (e.g. Silica and Velocity). However, this sensitivity may be problematic in larger samples, such as Depth.
The MASS library contains a function kde2d which implements two-dimensional kernel density estimation. We will illustrate the use of this function for the data set geyser (type help(geyser) to learn more about the data).
The geyser data frame contains two variables: waiting, the waiting time before each eruption (in minutes), and duration, the duration of the eruption (in minutes).
# install.packages("MASS")
library(MASS)
attach(geyser)
f1<-kde2d(waiting,duration)
persp(f1,xlab="waiting",ylab="duration",zlab="Density")
contour(f1,xlab="waiting",ylab="duration")
The graphs show a perspective plot and a contour plot of the bivariate density for waiting and duration. Try to understand what the above commands do by reading the online help.
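An image plot with superimposed contours gives another view of the same two-dimensional estimate; a brief sketch:
# heat-map style view of the bivariate density estimate
image(f1, xlab="waiting", ylab="duration")
contour(f1, add=TRUE)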
\[\mathbb{E}(\hat{f}(x)) \approx f(x) + \frac{h^2 f''(x) \sigma^2_K }{2}\]
With \(K(u)\) the kernel \[K(u) = \frac{1}{2}I(|u|\leq 1),\] plot \(f(x)\) against the above approximation for \(h=0.5, 1.0\) when \(f(x)\) is the density \[f(x)=0.6\phi_1(x)+0.4\phi_2(x)\] where \(\phi_1(x)\) is a normal density with mean -1 and variance 1, and \(\phi_2(x)\) is a normal density with mean 2 and variance 1.
After differentiating we have:
\[\sigma_K^2=\frac{1}{3}\]
\[\phi_1''(x)=((x+1)^2-1)\phi_1(x)\] \[\phi_2''(x)=((x-2)^2-1)\phi_2(x)\] \[f''(x)=0.6\phi_1''(x)+0.4\phi_2''(x).\]
z<-seq(from=-4.0,to=5.0,by=0.1)
phi1<-1/sqrt(2*pi)*exp(-(z+1)*(z+1)/2)
phi2<-1/sqrt(2*pi)*exp(-(z-2)*(z-2)/2)
phi1dd<-((z+1)*(z+1)-1)*phi1
phi2dd<-((z-2)*(z-2)-1)*phi2
h<-0.5
fx<-0.6*phi1+0.4*phi2
fddx<-0.6*phi1dd+0.4*phi2dd
meanfx<-fx+h*h*fddx/6
plot(z,fx,type="l",xlab="x",ylab="f(x)", main="h = 0.5")
lines(z,meanfx,lty=2)
h<-1.0
fx<-0.6*phi1+0.4*phi2
fddx<-0.6*phi1dd+0.4*phi2dd
meanfx<-fx+h*h*fddx/6
plot(z,fx,type="l",xlab="x",ylab="f(x)", main="h = 1")
lines(z,meanfx,lty=2)
What does the kernel smoother tend to do to maxima and minima in the density?
The kernel smoother tends to underestimate the density at maxima and overestimate it at minima, since the bias is proportional to \(f''(x)\) (negative at a peak, positive at a trough). Increasing the bandwidth exaggerates this effect, flattening peaks and filling in troughs, which gives a smoother-looking plot.
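A small simulation illustrates this (a sketch with an arbitrary seed): with a large bandwidth the estimated peak of a standard normal density is noticeably lower than with a small bandwidth.
# compare a small and a large bandwidth on the same simulated sample
set.seed(1)
x <- rnorm(1000)
plot(density(x, bw=1), main="Effect of bandwidth on the peak", xlab="x")
lines(density(x, bw=0.2), lty=2)
legend("topright", lty=c(1, 2), legend=c("bw = 1", "bw = 0.2"))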
For simulated random samples of size n = 100, 800 and 6400, plot histograms based on the optimal bin width as well as histograms obtained by using the bin width \(3.491\hat{\sigma}n^{-1/3}\), where \(\hat{\sigma}\) is the sample standard deviation.
n<-100
n1<-rnorm(n)-1
n2<-rnorm(n)+2
u<-runif(n)
z<-ifelse(u<=0.6,n1,n2)
h1<-3.491*sqrt(var(z))*100^(-1/3)
h2<-4.394*sqrt(var(z))*100^(-1/3)
breaks1<-seq(from=min(z),to=max(z)+h1,by=h1)
breaks2<-seq(from=min(z),to=max(z)+h2,by=h2)
par(mfrow=c(3, 2))
hist(z,breaks=breaks1)
hist(z,breaks=breaks2)
n<-800
n1<-rnorm(n)-1
n2<-rnorm(n)+2
u<-runif(n)
z<-ifelse(u<=0.6,n1,n2)
h1<-3.491*sqrt(var(z))*n^(-1/3)
h2<-4.394*sqrt(var(z))*n^(-1/3)
breaks1<-seq(from=min(z),to=max(z)+h1,by=h1)
breaks2<-seq(from=min(z),to=max(z)+h2,by=h2)
hist(z,breaks=breaks1)
hist(z,breaks=breaks2)
n<-6400
n1<-rnorm(n)-1
n2<-rnorm(n)+2
u<-runif(n)
z<-ifelse(u<=0.6,n1,n2)
h1<-3.491*sqrt(var(z))*n^(-1/3)
h2<-4.394*sqrt(var(z))*n^(-1/3)
breaks1<-seq(from=min(z),to=max(z)+h1,by=h1)
breaks2<-seq(from=min(z),to=max(z)+h2,by=h2)
hist(z,breaks=breaks1)
hist(z,breaks=breaks2)
# piecewise density on [0,3]: 1/4 on [0,1], 1/2 on (1,2], decreasing linearly to 0 on (2,3]
pdf<-function(x)
{
y <- 1/4 * (x >= 0 & x <= 1) + 1/2 * (x > 1 & x <= 2) -
0.5 * (x - 3) * (x > 2 & x <= 3)
y
}
# corresponding cumulative distribution function
cdf<-function(x)
{
y <- x/4 * (x >= 0 & x <= 1) + (2 * x - 1)/4 * (x > 1 & x <= 2) +
((5 - x) * (x - 1))/4 * (x > 2 & x <= 3) + (x > 3)
y
}
# exact expectation of the kernel density estimate with the box kernel of bandwidth h:
# E[f-hat(x)] = (F(x+h) - F(x-h)) / (2h)
efhat = function(x, h) {
y = 1/(2*h) * (cdf(x+h) - cdf(x-h))
y
}
xpoints = seq(from=-2, to=4, by=0.001)
plot(x=xpoints, y=cdf(xpoints), type="l")
lines(x=xpoints, y=pdf(xpoints), lty=2)
lines(x=xpoints, y=efhat(xpoints, 0.1), lty=3, col='red')
lines(x=xpoints, y=efhat(xpoints, 0.5), lty=4, col='blue')
legend('topleft', legend=c('CDF', 'PDF', 'E, h=0.1', 'E, h=0.5'), lty=c(1,2,3,4), col=c('black', 'black', 'red', 'blue'))
We see the bias increase as h increases.
Generate standard normal random variables until you have generated \(n\) of them, where \(n\geq 30\) is such that \(S/\sqrt{n}<0.1\) where \(S\) is the sample standard deviation of the data values.
1. How many normals do you think will be generated?
2. How many normals did you generate?
3. What is the sample mean of all the normals generated?
4. What is the sample variance?
# Initialise with 30 samples, and compute standard deviation
n = 30
x = rnorm(n)
s = sqrt(var(x))
m = mean(x)
# Continue sampling; stop when we have s/sqrt(n) < 0.1
while (s/sqrt(n) > 0.1) {
newx = rnorm(1)
newm = m + (newx - m)/(n+1)  # update the running mean
s = sqrt( (1 - 1/n)*s^2 + (n+1)*(newm - m)^2 )  # running variance: S^2_(n+1) = (1 - 1/n)*S^2_n + (n+1)*(mean_(n+1) - mean_n)^2
m = newm
n = n+1
}
# Print final mean, variance, and n
m
## [1] 0.09255031
s^2
## [1] 0.991691
n
## [1] 100