Hypothesis Test:
\(H_0:\beta_1=0\)
\(H_A:\beta_1\neq0\)
library(UsingR)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
data("father.son")
y=father.son$sheight
x=father.son$fheight
n=nrow(father.son)
We know that \(\hat{\beta}_1=Cor(Y,X)\frac{Sd(Y)}{Sd(X)}\) and \(\hat{\beta}_0 = \bar{Y}-\hat{\beta}_1\bar{X}\)
b1<-cor(y,x)*sd(y)/sd(x)
b0<-mean(y)-b1*mean(x)
b1
## [1] 0.514093
b0
## [1] 33.8866
The predicted outcome is \(\hat{Y}_i\) at predictor value \(X_i\), and the residual is \(e_i = Y_i -\hat{\beta}_0-\hat{\beta}_1X_i\).
e<-y-b0-b1*x
We have the standard error of the residuals \(\hat{\sigma}=\sqrt{\frac{1}{n-2}\sum_{i=1}^n e_{i}^2}\)
sig<-sqrt(sum(e^2)/(n-2))
sx<-sum((x-mean(x))^2)
sig
## [1] 2.436556
sx
## [1] 8114.444
The standard error of the regression slope is
\(\sigma_{\hat{\beta}_1}=\sqrt{Var(\hat{\beta}_1)}=\frac{\sigma}{\sqrt{\sum_{i=1}^n(X_i-\bar{X})^2}}\)
ser <- sig/sqrt(sx)
ser
## [1] 0.02704874
Under the null hypothesis that the true value is 0, the test statistic is \(T=\frac{\hat{\beta}_j - 0}{\hat{\sigma}_{\hat{\beta}_j}}\), which follows a t distribution with \(n-2\) degrees of freedom.
tb1 <-b1/ser
pb1 <- 2 * pt(abs(tb1), df = n - 2, lower.tail = FALSE)
coefTab <- data.frame(b1, ser, tb1, pb1)
colnames(coefTab) <- c("Estimate", "Std. Error", "t value", "P(>|t|)")
rownames(coefTab) <- "x"
coefTab
## Estimate Std. Error t value P(>|t|)
## x 0.514093 0.02704874 19.00618 1.121268e-69
Compare this with the coefficient table of the linear model of son's height on father's height:
summary(lm(sheight~fheight, father.son))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.886604 1.83235382 18.49348 1.604044e-66
## fheight 0.514093 0.02704874 19.00618 1.121268e-69
coefTab[1,4]
## [1] 1.121268e-69
summary(lm(sheight~fheight, father.son))$coef[2,4]
## [1] 1.121268e-69
The p-value of the slope is very small, hence we reject the null hypothesis: the slope coefficient is different from 0, so the father's height is a statistically significant predictor of the son's height.
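As an equivalent check (a minimal sketch using the objects already computed above), we can compare the observed t statistic with the two-sided 5% critical value of the t distribution with \(n-2\) degrees of freedom:
qt(0.975, df = n - 2)             # two-sided 5% critical value, roughly 1.96 here
abs(tb1) > qt(0.975, df = n - 2)  # TRUE: |t| of about 19 is far beyond the cutoff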
scf<-(summary(lm(sheight~fheight, father.son))$coef)
scf
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.886604 1.83235382 18.49348 1.604044e-66
## fheight 0.514093 0.02704874 19.00618 1.121268e-69
Let us compute the confidence interval for the coefficients using two methods.
lmf <- lm(sheight ~ fheight, father.son)
scf[2,1] + c(-1,1) * qt(0.975, df= lmf$df.residual) * scf[2,2]
## [1] 0.4610188 0.5671673
confint(lmf)
## 2.5 % 97.5 %
## (Intercept) 30.2912126 37.4819961
## fheight 0.4610188 0.5671673
Next, let us center the fathers' heights. First, the intercept of the original (uncentered) model:
summary(lmf)$coef[1,1]
## [1] 33.8866
So the estimate for the intercept is 33.8866. Next, we refit the regression with the fathers' heights centered at their mean:
lmfsc <- lm(sheight ~ I(fheight-mean(fheight)), father.son)
lmfsc
##
## Call:
## lm(formula = sheight ~ I(fheight - mean(fheight)), data = father.son)
##
## Coefficients:
## (Intercept) I(fheight - mean(fheight))
## 68.6841 0.5141
We have a new intercept, 68.6841 inches: the expected son's height at the average father's height is 68.6841 inches. Next is the CI for this intercept:
confint(lmfsc)
## 2.5 % 97.5 %
## (Intercept) 68.5384554 68.8296839
## I(fheight - mean(fheight)) 0.4610188 0.5671673
There is no change in the slope coefficient; centering only shifts the intercept.
predict(lmf, newdata = data.frame(fheight = mean(father.son$fheight)), interval = 'confidence')
## fit lwr upr
## 1 68.68407 68.53846 68.82968
predict(lmf, newdata = data.frame(fheight = mean(father.son$fheight)), interval = "prediction")
## fit lwr upr
## 1 68.68407 63.90091 73.46723
fit1<-lm(mpg~hp, mtcars)
summary(fit1)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.09886054 1.6339210 18.421246 6.642736e-18
## hp -0.06822828 0.0101193 -6.742389 1.787835e-07
As we see, the p-value is very small (1.787835e-07), so we reject the null hypothesis. Hence, horsepower is a statistically significant predictor of miles per gallon.
Method 1
confint(fit1)
## 2.5 % 97.5 %
## (Intercept) 26.76194879 33.4357723
## hp -0.08889465 -0.0475619
Method 2
with(mtcars, cor(mpg, hp)*sd(mpg)/sd(hp))+ c(-1,1) * qt(0.975, df=nrow(mtcars)-2)*summary(fit1)$coef[2,2]
## [1] -0.08889465 -0.04756190
Center coefficients
fit2 <- lm(mpg ~ I(hp-mean(hp)), mtcars)
Method 1
confint(fit2)
## 2.5 % 97.5 %
## (Intercept) 18.69599452 21.4852555
## I(hp - mean(hp)) -0.08889465 -0.0475619
Method 2
predict(fit2, newdata = data.frame(hp = mean(mtcars$hp)), interval = 'confidence')
## fit lwr upr
## 1 20.09062 18.69599 21.48526
fit3 <- lm(mpg~hp,mtcars)
predict(fit3, newdata = data.frame(hp=mean(mtcars$hp)), interval = 'confidence')
## fit lwr upr
## 1 20.09062 18.69599 21.48526
predict(fit3, newdata = data.frame(hp=mean(mtcars$hp)), interval = 'prediction')
## fit lwr upr
## 1 20.09062 12.07908 28.10217
y <- mtcars$mpg; x <- mtcars$hp; n <- length(y)
fit <- lm(y~x)
p1 <- data.frame(predict(fit, newdata = data.frame(x=x), interval = 'confidence'))
p2 <- data.frame(predict(fit, newdata = data.frame(x=x), interval = 'prediction'))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
p1 <- p1%>%
mutate(interval = 'confidence') %>%
mutate(x = mtcars$hp)
p2 <- p2 %>%
mutate(interval = 'prediction') %>%
mutate(x = mtcars$hp)
dat <- rbind(p1,p2)
names(dat)[1] <- "y"
library(ggplot2)
dat %>%
ggplot(aes(x = x, y = y)) +
geom_ribbon(aes(ymin= lwr, ymax= upr, fill= interval), alpha= 0.4) +
geom_line(colour="black", lwd=.5) +
geom_point(data= data.frame(x=x,y=y), aes(x=x,y=y))
Seatbelts <- as.data.frame(Seatbelts)
round(summary(lm(DriversKilled ~ kms + PetrolPrice, Seatbelts))$coef,4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 215.7461 14.6656 14.7110 0.0000
## kms -0.0017 0.0006 -2.8469 0.0049
## PetrolPrice -643.7895 148.2896 -4.3414 0.0000
The intercept is the expected number of deaths at 0 kms and 0 PetrolPrice, which is not useful, so it is good to center the variables. The kms estimate is the effect on deaths of one additional km, which is tiny and hard to read, so it is better to rescale it. PetrolPrice is an index, so it is better to normalize it.
Seatbelts <- Seatbelts %>%
mutate(ppn = (PetrolPrice - mean(PetrolPrice))/sd(PetrolPrice),
mkm = kms / 1000) %>%
mutate(mkm = mkm - mean(mkm))
summary(lm(DriversKilled ~ mkm + ppn, Seatbelts))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.802083 1.6628507 73.850336 2.395106e-141
## mkm -1.749546 0.6145401 -2.846919 4.902428e-03
## ppn -7.838674 1.8055491 -4.341435 2.304713e-05
fit5 <- lm(DriversKilled ~ kms + PetrolPrice, Seatbelts)
predict(fit5, newdata = data.frame(kms= mean(Seatbelts$kms),
PetrolPrice= mean(Seatbelts$PetrolPrice)))
## 1
## 122.8021
Hence, this is the expected number of deaths at the average kms and the average petrol price.
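As a quick cross-check (a sketch using the centered variables created above), this prediction should match the intercept of the centered model:
coef(lm(DriversKilled ~ mkm + ppn, Seatbelts))[1]  # expected to be about 122.80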
dk <- Seatbelts$DriversKilled
kms <- Seatbelts$kms
pp <- Seatbelts$PetrolPrice
fitfl <- lm(dk~kms+pp)
Now we get the residuals of the drivers' deaths and of the petrol price, each regressed on kms.
edk <- resid(lm(dk~kms))
epp <- resid(lm(pp~kms))
With this, we can get the estimate for pp,
sum(edk*epp)/sum(epp^2)
## [1] -643.7895
If we look at the coefficients of fitfl, we have:
round(summary(fitfl)$coef, 4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 215.7461 14.6656 14.7110 0.0000
## kms -0.0017 0.0006 -2.8469 0.0049
## pp -643.7895 148.2896 -4.3414 0.0000
As was calculated.
round(summary(lm(edk~epp- 1))$coef,4)
## Estimate Std. Error t value Pr(>|t|)
## epp -643.7895 147.5111 -4.3643 0
edk2 <- resid(lm(dk ~ pp))
ekms <- resid(lm(kms ~ pp))
round(summary(lm(edk2 ~ ekms -1))$coef, 4)
## Estimate Std. Error t value Pr(>|t|)
## ekms -0.0017 6e-04 -2.8619 0.0047
Do exercise 1 of the previous chapter if you have not already. Load the dataset Seatbelts as part of the datasets package via data(Seatbelts). Use as.data.frame to convert the object to a dataframe. Fit a linear model of driver deaths with kms and PetrolPrice as predictors. Interpret your results.
Seatbelts <- as.data.frame(Seatbelts)
round(summary(lm(DriversKilled ~ mkm + ppn, Seatbelts))$coef,4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.8021 1.6629 73.8503 0.0000
## mkm -1.7495 0.6145 -2.8469 0.0049
## ppn -7.8387 1.8055 -4.3414 0.0000
The raw intercept is the expected number of deaths at 0 kms and 0 PetrolPrice, so it is good to center the variables; likewise the raw kms coefficient is the effect of one additional km, which is tiny, so it is better to rescale kms (here to thousands of km) and to normalize PetrolPrice.
Repeat question 1 for the outcome being the log of the count of driver deaths. Interpret your coefficients.
Seatbelts2 <- Seatbelts %>%
mutate(logkill = log(DriversKilled))
summary(lm(logkill ~ ppn + mkm, Seatbelts2))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.78966306 0.013426810 356.723817 2.737888e-269
## ppn -0.06412578 0.014579039 -4.398492 1.818005e-05
## mkm -0.01400794 0.004962149 -2.822959 5.267843e-03
summary(lm(log(DriversKilled) ~ ppn + mkm, Seatbelts))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.78966306 0.013426810 356.723817 2.737888e-269
## ppn -0.06412578 0.014579039 -4.398492 1.818005e-05
## mkm -0.01400794 0.004962149 -2.822959 5.267843e-03
Our normalized petrol price variable (ppn) can have this interpretation:
We are estimating an expected 6% decrease in the geometric mean of drivers killed for every 1 standard deviation increase in normalized petrol price, holding the kilometers constant.
1-exp(-0.06412578)
## [1] 0.06211298
We are estimating an expected 1% decrease in the geometric mean of drivers killed for every additional 1,000 km driven, holding petrol price constant.
1-exp(-0.01400794)
## [1] 0.01391029
We will have to consider model 2, \(E[Y_i|X_{1i},X_{2i}]=\beta_0 +X_{1i}\beta_1+X_{2i}\beta_2\). Since the model includes \(X_1\) and \(X_2\) without an interaction term, it fits two lines (one for each level of the binary variable) that share the same slope.
round(summary(lm(DriversKilled ~ mkm + ppn + I(factor(law)), Seatbelts))$coef,4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 124.2263 1.8012 68.9674 0.0000
## mkm -1.2233 0.6657 -1.8378 0.0677
## ppn -6.9199 1.8514 -3.7377 0.0002
## I(factor(law))1 -11.8892 6.0258 -1.9731 0.0500
There were small changes in the intercept.
If we want the expected value of DriversKilled after the law took effect, at the average mkm and ppn, we add -11.89 to the intercept.
The value -11.8892 is associated with the factor level 1; changing the reference level from 0 to 1, we will have:
Seatbelts<-Seatbelts %>%
mutate(law = factor(law, levels = c(1,0)))
round(summary(lm(DriversKilled ~ mkm + ppn + I(factor(law)), Seatbelts))$coef,4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.3371 5.5547 20.2236 0.0000
## mkm -1.2233 0.6657 -1.8378 0.0677
## ppn -6.9199 1.8514 -3.7377 0.0002
## I(factor(law))0 11.8892 6.0258 1.9731 0.0500
Seatbelts<-Seatbelts %>%
mutate(law = factor(law, levels = c(0,1)))
Seatbelts <- Seatbelts %>%
mutate(ppf = as.factor((ppn <= -1.5)+ (ppn <= 0)+ (ppn <= 1.5)+ (ppn < Inf)))
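The sum of logicals above counts how many of the cut points (-1.5, 0, 1.5, Inf) each ppn value falls at or below, so ppf = 4 is the lowest petrol-price band and ppf = 1 the highest. A roughly equivalent construction (a sketch, not the code used here) uses cut(); note its levels run from lowest to highest price, so the reference level and coefficient labels would differ:
# bands of normalized petrol price, ordered from lowest to highest
table(cut(Seatbelts$ppn, breaks = c(-Inf, -1.5, 0, 1.5, Inf)))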
round(summary(lm(DriversKilled ~ ppf + mkm + I(factor(law)), Seatbelts))$coef,4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109.8405 9.5066 11.5541 0.0000
## ppf2 10.8271 9.9462 1.0886 0.2778
## ppf3 18.6904 9.9374 1.8808 0.0616
## ppf4 25.0074 10.9163 2.2908 0.0231
## mkm -1.2991 0.7668 -1.6942 0.0919
## I(factor(law))1 -15.3445 6.0345 -2.5428 0.0118
library(grid)
swiss <- swiss %>%
mutate(CatholicBin = 1 * (Catholic > 50))
g <- ggplot(swiss, aes(x = Agriculture, y = Fertility, colour = factor(CatholicBin)))+
geom_point(size = 5) +
xlab("% in Agriculture") +
ylab("Fertility")
fit <- lm(Fertility ~ Agriculture + factor(CatholicBin), data = swiss)
g +
geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2], size=1, colour='black') +
geom_abline(intercept = coef(fit)[1] + coef(fit)[3], slope = coef(fit)[2], size=1, colour='black') +
annotation_custom(grob = grobTree(textGrob("Catholic",
x=0.05,
y=.62,
hjust=0,
gp=gpar(col="black",
fontsize=10,
fontface="italic")))) +
annotation_custom(grob = grobTree(textGrob("Non catholic",
x=0.05,
y=.5,
hjust=0,
gp=gpar(col="black",
fontsize=10,
fontface="italic"))))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
With interaction
fitfull <- lm(Fertility ~ Agriculture * factor(CatholicBin), data = swiss)
g +
geom_abline(intercept = coef(fitfull)[1], slope = coef(fitfull)[2], size=1, colour='black') +
geom_abline(intercept = coef(fitfull)[1] + coef(fitfull)[3],
slope = coef(fitfull)[2] + coef(fitfull)[4], size=1, colour='black') +
annotation_custom(grob = grobTree(textGrob("Catholic",
x=0.05,
y=.58,
hjust=0,
gp=gpar(col="black",
fontsize=10,
fontface="italic")))) +
annotation_custom(grob = grobTree(textGrob("Non catholic",
x=0.05,
y=.46,
hjust=0,
gp=gpar(col="black",
fontsize=10,
fontface="italic"))))
summary(lm(DriversKilled ~ mkm + ppn, Seatbelts))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.802083 1.6628507 73.850336 2.395106e-141
## mkm -1.749546 0.6145401 -2.846919 4.902428e-03
## ppn -7.838674 1.8055491 -4.341435 2.304713e-05
cor(Seatbelts$mkm, Seatbelts$ppn)
## [1] 0.3839004
With a correlation of about 0.38, the two variables are partly measuring similar things; plausibly the petrol price has an impact on the total kms driven.
fit8 <- lm(DriversKilled ~ mkm, Seatbelts)
fit9 <- lm(DriversKilled ~ mkm + ppn, Seatbelts)
summary(fit8)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.802083 1.7391997 70.60839 2.665611e-138
## mkm -2.773787 0.5935049 -4.67357 5.596266e-06
summary(fit9)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.802083 1.6628507 73.850336 2.395106e-141
## mkm -1.749546 0.6145401 -2.846919 4.902428e-03
## ppn -7.838674 1.8055491 -4.341435 2.304713e-05
The estimate is still negative, which makes sense because both effects are in the same direction and the regressors are correlated. The estimate changes from -2.8 to -1.7, which reflects the confounding effect of PetrolPrice on the regression of DriversKilled on kms. Looking at the anova, we can see that:
anova(fit8,fit9)
## Analysis of Variance Table
##
## Model 1: DriversKilled ~ mkm
## Model 2: DriversKilled ~ mkm + ppn
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 190 110345
## 2 189 100339 1 10006 18.848 2.305e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
the p-value of the F test matches the p-value of the ppn estimate in the larger model.
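This is no coincidence: when a single regressor is added, the anova F statistic is the square of that regressor's t statistic. A quick check with the values printed above:
(-4.341435)^2  # about 18.85, the F statistic reported by anova()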
fit10 <- lm(DriversKilled ~ ppn, Seatbelts)
fit11 <- lm(DriversKilled ~ ppn + mkm, Seatbelts)
summary(fit10)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.802083 1.693656 72.507096 2.061333e-140
## ppn -9.812019 1.698084 -5.778288 3.044208e-08
summary(fit11)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.802083 1.6628507 73.850336 2.395106e-141
## ppn -7.838674 1.8055491 -4.341435 2.304713e-05
## mkm -1.749546 0.6145401 -2.846919 4.902428e-03
Again the estimate is negative, which makes sense because both effects are in the same direction and the regressors are correlated. The estimate changes from -9.8 to -7.8, which reflects the confounding effect of kms on the regression of DriversKilled on PetrolPrice.
Seatbelts <- as.data.frame(Seatbelts)
round(summary(lm(DriversKilled ~ kms + PetrolPrice +law, Seatbelts))$coef,4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 201.4614 16.2559 12.3931 0.0000
## kms -0.0012 0.0007 -1.8378 0.0677
## PetrolPrice -568.3347 152.0552 -3.7377 0.0002
## law1 -11.8892 6.0258 -1.9731 0.0500
Seatbelts <- Seatbelts %>%
mutate(ppn = (PetrolPrice - mean(PetrolPrice))/sd(PetrolPrice),
mkm = kms / 1000) %>%
mutate(mkm = mkm - mean(mkm))
summary(lm(DriversKilled ~ mkm + ppn + law, Seatbelts))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 124.226311 1.8012324 68.967399 1.976641e-135
## mkm -1.223318 0.6656567 -1.837761 6.767594e-02
## ppn -6.919949 1.8513987 -3.737687 2.463128e-04
## law1 -11.889202 6.0257850 -1.973055 4.995497e-02
fit <- lm(DriversKilled ~ mkm + ppn + law, Seatbelts)
#Method 1
sum(resid(fit)^2) / (nrow(Seatbelts) - 4)
## [1] 522.8903
#Method 2
summary(fit)$sigma^2
## [1] 522.8903
plot(fit, which = 1)
The first plot, which compares the residuals to the fitted values, is where you look for any kind of pattern. In this case it doesn't look too bad; it labels a few points that appear extreme. It is often difficult to identify a pattern from the points alone, but when you fit a smoother through the residuals you frequently see patterns you would not otherwise have noticed.
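As a sketch of that idea (assuming fit is the model fitted above), we can plot the residuals against the fitted values and add a loess smoother:
library(ggplot2)
data.frame(fitted = fitted(fit), resid = resid(fit)) %>%
  ggplot(aes(x = fitted, y = resid)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +      # smoother to reveal any pattern
  geom_hline(yintercept = 0, linetype = "dashed")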
plot(fit, which = 2)
The purpose of the Q-Q plot is to check the normality of the errors; the plot provides a reference line, and what you look for is whether your points fall mostly on that line. These plots typically show some deviations, such as the ones in the tails here, which indicate that the residuals in the left tail are larger in magnitude than the theoretical quantiles of the normal distribution. Keep in mind that the sample quantiles of the standardized residuals are themselves variable: even data simulated from a normal distribution will deviate somewhat from the line, so focus on large gross departures rather than small ones. Right now this does not look all that bad.
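The same check can be reproduced by hand with base R (a minimal sketch, assuming fit is the model above), using the standardized residuals:
qqnorm(rstandard(fit))  # sample quantiles vs theoretical normal quantiles
qqline(rstandard(fit))  # reference line through the quartiles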
plot(dffits(fit))
datdffits <- data.frame(dffits = dffits(fit)) %>%
mutate(id=row_number()) %>%
mutate(specialPoint= as.factor(abs(dffits) >= .35))
ggplot(datdffits, aes(y=dffits, x= id, color= specialPoint))+
geom_point(size=3) +
geom_text(data=subset(datdffits, specialPoint==TRUE),
aes(y=dffits,x=id,label=id), colour='black') +
geom_hline(yintercept = c(-0.35,0.35), color= 'blue') +
geom_hline(yintercept = 0)
head(dfbeta(fit))
## (Intercept) mkm ppn law1
## 1 -0.1022614 0.11881682 -0.09349927 -0.26195987
## 2 -0.1331656 0.22275746 -0.15807657 -0.56786007
## 3 -0.1279422 0.11452337 -0.07920934 -0.23464337
## 4 -0.2032275 0.13006726 -0.06413772 -0.23584650
## 5 -0.0524778 0.02353784 -0.01104844 -0.02756976
## 6 -0.1188004 0.03961875 -0.01108847 -0.02386593
plot(dfbeta(fit)[ , 2])
datdfbeta <- data.frame(dfbetamkm = dfbeta(fit)[ , 2]) %>%
mutate(id=row_number()) %>%
mutate(specialPoint= as.factor(abs(dfbetamkm) >= .2))
ggplot(datdfbeta, aes(y=dfbetamkm, x= id, color= specialPoint))+
geom_point(size=3) +
geom_text(data=subset(datdfbeta, specialPoint==TRUE),
aes(y=dfbetamkm,x=id,label=id), colour='black') +
geom_hline(yintercept = 0)+
geom_hline(yintercept = c(-0.2,0.2), color= 'blue') +
ylab('dfbeta: mkm')
datdfbeta2 <- data.frame(dfbetappn = dfbeta(fit)[ , 3]) %>%
mutate(id=row_number()) %>%
mutate(specialPoint= as.factor(abs(dfbetappn) >= .4))
ggplot(datdfbeta2, aes(y=dfbetappn, x= id, color= specialPoint))+
geom_point(size=3) +
geom_text(data=subset(datdfbeta2, specialPoint==TRUE),
aes(y=dfbetappn,x=id,label=id), colour='black') +
geom_hline(yintercept = 0)+
geom_hline(yintercept = c(-0.4,0.4), color= 'blue') +
ylab('dfbeta: PetrolPrice')
cooks.distance() returns a numeric vector giving, for each observation, an overall summary of its influence on the fitted coefficients; a plot makes it easier to scan. Cook's distance is bounded below by zero, and it is a Mahalanobis-type distance: unlike a plain Euclidean distance, it takes the correlation between the parameter estimates into account (a normalization of the parameters).
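Base R can draw this summary directly from the fitted model (a minimal sketch, assuming fit is the model fitted above):
plot(fit, which = 4)  # Cook's distance for each observation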
datcooks <- data.frame(cooksdist = cooks.distance(fit)) %>%
mutate(id=row_number()) %>%
mutate(specialPoint= as.factor(cooksdist >= .03))
ggplot(datcooks, aes(y=cooksdist, x= id, color= specialPoint))+
geom_point(size=3) +
geom_text(data=subset(datcooks, specialPoint==TRUE),
aes(y=cooksdist,x=id,label=id), colour='black') +
geom_hline(yintercept = 0)+
geom_hline(yintercept = 0.03, color= 'blue') +
ylab('Cooks distance')
Evaluating the leverage
summary(round(hatvalues(fit), 3))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00600 0.01175 0.01700 0.02082 0.02600 0.06700
dathatv <- data.frame(hatv = hatvalues(fit)) %>%
mutate(id=row_number()) %>%
mutate(specialPoint= as.factor(hatv >= .06))
ggplot(dathatv, aes(y=hatv, x= id, color= specialPoint))+
geom_point(size=3) +
geom_text(data=subset(dathatv, specialPoint==TRUE),
aes(y=hatv,x=id,label=id), colour='black') +
geom_hline(yintercept = 0.06, color= 'blue') +
ylab('hatvalues')
Seatbelts <- as.data.frame(Seatbelts)
round(summary(lm(DriversKilled ~ kms + PetrolPrice +law, Seatbelts))$coef,4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 201.4614 16.2559 12.3931 0.0000
## kms -0.0012 0.0007 -1.8378 0.0677
## PetrolPrice -568.3347 152.0552 -3.7377 0.0002
## law1 -11.8892 6.0258 -1.9731 0.0500
fit12 <- lm(DriversKilled~law, Seatbelts)
fit13 <- lm(DriversKilled~law + mkm, Seatbelts)
fit14 <- lm(DriversKilled~law + ppn, Seatbelts)
fit15 <- lm(DriversKilled~law + mkm + ppn, Seatbelts)
anova(fit12,fit13,fit14,fit15)
## Analysis of Variance Table
##
## Model 1: DriversKilled ~ law
## Model 2: DriversKilled ~ law + mkm
## Model 3: DriversKilled ~ law + ppn
## Model 4: DriversKilled ~ law + mkm + ppn
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 190 109754
## 2 189 105608 1 4145.3 7.9276 0.005388 **
## 3 189 100069 0 5538.9
## 4 188 98303 1 1766.0 3.3774 0.067676 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit12,fit15)
## Analysis of Variance Table
##
## Model 1: DriversKilled ~ law
## Model 2: DriversKilled ~ law + mkm + ppn
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 190 109754
## 2 188 98303 2 11450 10.949 3.177e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The first is our base model, which has only the law variable; the second adds kilometers and petrol price. Because it adds two parameters, the comparison is a two-degree-of-freedom test. The test's high significance indicates that we should definitely take both of those regressors into account.
rbind(summary(fit12)$coef[2, ],
summary(fit13)$coef[2, ],
summary(fit14)$coef[2, ],
summary(fit15)$coef[2, ])
## Estimate Std. Error t value Pr(>|t|)
## [1,] -25.60895 5.341655 -4.794198 3.288375e-06
## [2,] -17.55372 6.028888 -2.911602 4.028394e-03
## [3,] -16.32618 5.555579 -2.938700 3.706585e-03
## [4,] -11.88920 6.025785 -1.973055 4.995497e-02
We are also curious about how the law coefficient varies with the variables we include. To review, summary(fit)$coef gives the coefficient table of a fitted model.
Taking the law row from each of these fitted models (fit12 through fit15, in order, one per row), we grab the second row of each coefficient table and bind the rows together so we can look at them all at once. We can thus observe what happens to the law estimate: the unadjusted model (law only) estimates about 25.6 fewer deaths per month after the law went into effect, and this attenuates to about 11.9 fewer once we adjust for kilometers and petrol price.
Case 1. log(Y) = β0 + β1X + ε
Recall that if we have an observed outcome and model the log of it, log(Y) = β0 + β1X + ε, we are not fitting a generalized linear model; we are simply fitting a linear model in which we have transformed the outcome.
In contrast, we might have a model in which the log of the expected value of Y equals β0 + β1X: Case 2. log(E[Y]) = β0 + β1X
That is a very different model. In the first case we have transformed our data: we actually transform the outcome before we fit the model (in R, for example).
In the second case (take Poisson regression, for example) the mean that the Poisson likelihood assumes is parameterized in terms of a collection of parameters and the X values. This is the major difference between transforming the outcome and transforming a parameter, which is what occurs in generalized linear models (GLMs).
One of the two most common examples of transforming a parameter is transforming the mean with the log, as in Poisson regression: the linear predictor is the log of the mean, and the observed outcome is then related to that mean through the Poisson distribution.
Case 3. log(E[Y] / (1 − E[Y])) = β0 + β1X
The other very common one applies when Y is a binary (Bernoulli) random variable: Y takes the value 0 or 1, so its expected value lies between 0 and 1, and we take the log of the odds, E[Y] over 1 − E[Y], as the linear predictor β0 + β1X. All three cases are useful techniques, but Case 1 is just a transformed outcome; only Cases 2 and 3 are examples of generalized linear models.
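As a sketch of the distinction in R (an illustration using the Seatbelts data already loaded above, not one of the models fitted in these notes):
# Case 1: transform the outcome, then fit an ordinary linear model
lm(log(DriversKilled) ~ kms, data = Seatbelts)
# Case 2: leave the outcome alone; model log(E[Y]) through a Poisson GLM
glm(DriversKilled ~ kms, family = poisson(link = "log"), data = Seatbelts)
# Case 3: binary outcome; model the log odds through a logistic GLM
glm(as.numeric(DriversKilled > 119) ~ kms, family = binomial(link = "logit"), data = Seatbelts)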
log(E[Y|X=x]) = β0 + β1x
Take Poisson regression, for example: the model states that the log of the expected value of the outcome, given that X takes the value x, is β0 + β1x (here with just a single regressor). So when we get an estimate of β1 in our coefficient table, it is interpreted on the scale of the log of the expected outcome, that is, on the link-function scale.
If we were doing logistic regression it would be interpreted on the scale of the logit of the expected value of the response.
We note that the log of the expected value of Y given that X takes the value x + 1 is log(E[Y|X=x+1]) = β0 + β1(x+1).
If we subtract these two quantities, log(E[Y|X=x+1]) − log(E[Y|X=x]) = β1, so the interpretation of β1 is the change in the link function of the expected value per unit change in the regressor. If there were other terms, say + β2X2, then as long as we held X2 constant it would drop out, and β1 would be interpreted as the change, on the link scale, in the expected response per unit change in the regressor, holding all other regressors constant.
Because the link here is the natural log, we can invert it by exponentiating both sides: e^(log E[Y|X=x+1] − log E[Y|X=x]) = e^β1, that is,
e^β1 = E[Y|X=x+1] / E[Y|X=x],
the expected value of Y at X = x + 1 relative to the expected value of Y at X = x.
In this particular case, where the link function is the log, we can take e to the power of our coefficients and interpret them back on the original scale as relative quantities. So e^β1 (which is not what is printed in the coefficient table; we must exponentiate the printed coefficient) is the relative increase or decrease in the expected value of the response per unit increase in the regressor, holding all other regressors constant.
A generalized linear model does not start from an equation like Y = β0 + β1X1 + β2X2 + ε.
Instead, we start by assuming a distribution for Y: we assume that Y comes from an exponential family distribution. The exponential family includes the normal (so linear models are a special case of generalized linear models), the Bernoulli and binomial distributions, the Poisson, and the gamma.
It is a very rich family of distributions, so there is a large collection of models we can build. Having so far only assumed a distribution for Y, we still need a way to relate Y to X1 and X2. The key is that an exponential family distribution is always parametrized by a few parameters: the normal by a mean and a variance, the binomial by a number of trials and a success probability, the Poisson by a mean. All these distributions depend on a parameter.
One of the most important parameters is the mean, so let us call the expected value of Y our μ: E[Y] = μ. The way we connect Y to the Xs is through the exponential family, via its parameter μ, which is connected to X1 and X2 by the link function.
The link function is g, and we assume that g(μ) = β0 + β1X1 + β2X2. That might seem like a circuitous way to connect Ys and Xs compared with the direct equality of the linear model, but it is a very powerful way of doing things.
For example, in Poisson regression we assume g is the natural log function: log(μ) = β0 + β1X1 + β2X2
With a single regressor, the log of the expected value of Y_i, written log(μ_i) for short, is log(E[Y_i]) = log(μ_i) = β0 + β1X_i. By implication, μ_i = e^(β0 + β1X_i).
So we assume that every observation Y_i is independently Poisson with mean μ_i: Y_i ~ ind. Poisson(μ_i).
This means the likelihood contribution of each observation is μ_i^(Y_i) e^(−μ_i) / Y_i!. As many of you know from the inference class, terms that do not depend on the parameter do not matter for the likelihood; here the parameter of interest is μ_i, so we can drop the Y_i! in the denominator. The likelihood is then the product over all subjects,
L(β0, β1 | Y) ∝ ∏_i μ_i^(Y_i) e^(−μ_i),
and we maximize this function with respect to β0 and β1 to obtain our estimates.
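As a sketch of that maximization in R (an assumed illustration, not code from the original notes), using DriversKilled and mkm from the Seatbelts data above:
yy <- Seatbelts$DriversKilled
xx <- Seatbelts$mkm
negLogLik <- function(beta) {
  eta <- beta[1] + beta[2] * xx   # linear predictor, log(mu_i) = b0 + b1 * x_i
  -sum(yy * eta - exp(eta))       # negative Poisson log-likelihood, dropping log(y!)
}
optim(c(log(mean(yy)), 0), negLogLik)$par  # numerical maximum likelihood estimates
coef(glm(yy ~ xx, family = poisson))       # should agree closely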
5. True or false: some GLM distributions impose restrictions on the relationship between the mean and the variance. (Discuss.) TRUE — the Poisson, for example, forces the variance to equal the mean, and the Bernoulli forces the variance to be p(1 − p); the normal, by contrast, has a free variance parameter.
Seatbelts <- Seatbelts %>%
mutate(binkill = 1 * (DriversKilled > 119))
logRegSeatbelts <- glm(Seatbelts$binkill~Seatbelts$mkm + Seatbelts$ppn + Seatbelts$law, family = 'binomial')
summary(logRegSeatbelts)
##
## Call:
## glm(formula = Seatbelts$binkill ~ Seatbelts$mkm + Seatbelts$ppn +
## Seatbelts$law, family = "binomial")
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.024314 0.160775 0.151 0.8798
## Seatbelts$mkm -0.002938 0.059848 -0.049 0.9608
## Seatbelts$ppn -0.416408 0.169734 -2.453 0.0142 *
## Seatbelts$law1 -0.615522 0.577808 -1.065 0.2868
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 266.09 on 191 degrees of freedom
## Residual deviance: 253.62 on 188 degrees of freedom
## AIC: 261.62
##
## Number of Fisher Scoring iterations: 4
round(exp(logRegSeatbelts$coeff),3)
## (Intercept) Seatbelts$mkm Seatbelts$ppn Seatbelts$law1
## 1.025 0.997 0.659 0.540
round(1-exp(logRegSeatbelts$coeff), 3)
## (Intercept) Seatbelts$mkm Seatbelts$ppn Seatbelts$law1
## -0.025 0.003 0.341 0.460
When we exponentiate the coefficients, the intercept is 1.025 and the coefficients are 0.997 for kilometers, 0.659 for the price of petrol, and 0.540 for law. More importantly, holding all other factors constant, the odds of more than 119 drivers being killed in a month were an estimated 46% lower after the law's enactment than before it.
fit18<-glm(cbind(DriversKilled, drivers-DriversKilled)~ppn+mkm+law, family = binomial, Seatbelts)
summary(fit18)
##
## Call:
## glm(formula = cbind(DriversKilled, drivers - DriversKilled) ~
## ppn + mkm + law, family = binomial, data = Seatbelts)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.536637 0.007399 -342.829 <2e-16 ***
## ppn -0.007829 0.007479 -1.047 0.295
## mkm 0.003645 0.002733 1.334 0.182
## law1 0.030785 0.026527 1.161 0.246
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 234.93 on 191 degrees of freedom
## Residual deviance: 229.93 on 188 degrees of freedom
## AIC: 1496
##
## Number of Fisher Scoring iterations: 3
To answer this question, we examine a case of model selection in which we model whether or not the number of drivers killed exceeded 119. We fit the model with just law, then with law and petrol price, and finally with law, petrol price and kilometers, and then compare the successive models to see whether each addition contributes anything.
fit1 <- glm(binkill~law, family = 'binomial', data = Seatbelts)
fit2 <- update(fit1, binkill~law + PetrolPrice)
fit3 <- update(fit2, binkill~law + PetrolPrice + kms)
anova(fit1,fit2,fit3)
## Analysis of Deviance Table
##
## Model 1: binkill ~ law
## Model 2: binkill ~ law + PetrolPrice
## Model 3: binkill ~ law + PetrolPrice + kms
## Resid. Df Resid. Dev Df Deviance
## 1 190 260.40
## 2 189 253.62 1 6.7760
## 3 188 253.62 1 0.0024
The residual deviance might be thought of as an analog of the residual sums of squares we examine with normal data.
The usual chi-square cutoff for a 5% test with 1 degree of freedom is approximately 3.84. So it appears that adding petrol price has a large impact, but adding kilometers does not. Let's look at the coefficient tables.
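A quick check of those deviance drops against the chi-square reference (a sketch using the values printed above):
qchisq(0.95, df = 1)                        # about 3.84, the 5% cutoff
pchisq(6.7760, df = 1, lower.tail = FALSE)  # p-value for adding PetrolPrice
pchisq(0.0024, df = 1, lower.tail = FALSE)  # p-value for adding kms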
summary(fit1)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.08288766 0.1539783 0.5383074 0.59036483
## law1 -1.12434153 0.4991985 -2.2522935 0.02430373
summary(fit2)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.5875552 1.3851773 2.589961 0.009598679
## law1 -0.6260282 0.5367204 -1.166396 0.243454555
## PetrolPrice -34.3737200 13.4887754 -2.548320 0.010824304
summary(fit3)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.612261e+00 1.473634e+00 2.45126028 0.01423570
## law1 -6.155225e-01 5.778076e-01 -1.06527242 0.28675267
## PetrolPrice -3.419952e+01 1.394026e+01 -2.45329074 0.01415559
## kms -2.938343e-06 5.984816e-05 -0.04909663 0.96084229
Now let's examine the law variable, which is the one we care about. On its own it indicates a significant decrease in the log odds of more than 119 drivers dying in a month after the law's passage. However, including petrol price attenuates that decrease quite a bit, and note how significant the price of petrol itself is.
It is interesting to observe that the p-value of the law variable increases by roughly a factor of 10 and changes from significant to not significant.
When we additionally include kilometers, that variable is not at all significant and has little effect on the law coefficient.
Thus, if I were choosing a model and were mostly interested in the law coefficient, I would probably choose the second model (law and PetrolPrice).
Load the dataset Seatbelts as part of the datasets package via data(Seatbelts). Use as.data.frame to convert the object to a dataframe. Fit a Poisson regression GLM with UKDriversKilled as the outcome and kms, PetrolPrice and law as predictors. Interpret your results.
fit24 <- glm(DriversKilled ~ mkm + ppn +law, family = poisson, data = Seatbelts)
round(summary(fit24)$coef[ ,1],4)
## (Intercept) mkm ppn law1
## 4.8198 -0.0100 -0.0554 -0.1149
Generally speaking, it is preferable to exponentiate the coefficients: the law coefficient, for instance, is the difference in the log of the expected number of drivers killed after versus before the legislation was passed. To exponentiate the coefficients, and to report 1 − exp(coefficient) for the negative ones, we employ a loop.
bi<- data.frame(Intercept=0,mkm=0,ppn=0,law1=0)
for (i in 1:length(bi)) {
bi[i] <- if(summary(fit24)$coef[ i,1] < 0)
{1-exp(summary(fit24)$coef[ i,1])} else {exp(summary(fit24)$coef[ i,1])}
}
print(bi)
## Intercept mkm ppn law1
## 1 123.9459 0.00993133 0.05385679 0.1085243
Let's look at law1: exponentiating, we estimate that the expected number of drivers killed was reduced by around 11% after the law was enacted compared with before. Looking at mkm, which is kilometers in thousands, the coefficient is also negative, so we look at 1 − exp(coefficient), which gives a decrease of roughly 1%: for every thousand extra kilometers driven, the estimated number of driver fatalities drops by approximately 1%. Finally, since we centered the two continuous variables and the reference level of law is the period before the legislation, the exponentiated intercept is the expected number of drivers killed at the average petrol price and the average number of kilometers driven, before the law.
Refer to question 1. Fit a linear model with the log of drivers killed as the outcome. Interpret your results.
round(exp(coef(lm(I(log(DriversKilled + 1)) ~ mkm + ppn + law, data = Seatbelts))), 5)
## (Intercept) mkm ppn law1
## 123.18577 0.99190 0.94787 0.87807
fit25 <-lm(I(log(DriversKilled + 1)) ~ mkm + ppn + law, data = Seatbelts)
summary(fit24)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.819845137 0.007127388 676.242884 0.000000e+00
## mkm -0.009980975 0.002614002 -3.818274 1.343887e-04
## ppn -0.055361338 0.007243262 -7.643150 2.119715e-14
## law1 -0.114877106 0.025557951 -4.494770 6.964526e-06
summary(fit25)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.813693546 0.014291860 336.813651 2.082663e-263
## mkm -0.008133355 0.005281647 -1.539928 1.252593e-01
## ppn -0.053538784 0.014689904 -3.644597 3.464468e-04
## law1 -0.130030803 0.047811530 -2.719654 7.147529e-03
Refer to question 1. Fit your Poisson log-linear model with drivers as a log offset (to consider the proportion of drivers killed of those killed or seriously injured.)
fit26 <- glm(DriversKilled ~ mkm + ppn +law, family = poisson, data = Seatbelts, offset = log(drivers))
fit27 <- glm(DriversKilled ~ mkm + ppn +law, family = poisson, data = Seatbelts)
summary(fit26)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.612798146 0.007122545 -366.834931 0.0000000
## mkm 0.003377675 0.002630717 1.283937 0.1991640
## ppn -0.007255064 0.007199577 -1.007707 0.3135952
## law1 0.028484328 0.025512651 1.116479 0.2642173
summary(fit27)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.819845137 0.007127388 676.242884 0.000000e+00
## mkm -0.009980975 0.002614002 -3.818274 1.343887e-04
## ppn -0.055361338 0.007243262 -7.643150 2.119715e-14
## law1 -0.114877106 0.025557951 -4.494770 6.964526e-06
The first GLM models the log of the expected number of drivers killed divided by the total number of drivers killed or seriously injured, i.e. the rate of deaths among serious casualties.
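Written out (under the usual offset convention, where the offset enters the linear predictor with its coefficient fixed at 1), the first model is:
log(E[DriversKilled_i]) = log(drivers_i) + β0 + β1 mkm_i + β2 ppn_i + β3 law_i, or equivalently log(E[DriversKilled_i] / drivers_i) = β0 + β1 mkm_i + β2 ppn_i + β3 law_i.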
exp(0.028484328)
## [1] 1.028894
1-exp(-0.114877106)
## [1] 0.1085243
When comparing the two cases, the first shows roughly a 3% increase in the rate of drivers killed relative to the total killed or seriously injured associated with the law, while the second shows roughly an 11% decrease in the expected number of drivers killed.
anova(fit26, fit27)
## Analysis of Deviance Table
##
## Model 1: DriversKilled ~ mkm + ppn + law
## Model 2: DriversKilled ~ mkm + ppn + law
## Resid. Df Resid. Dev Df Deviance
## 1 188 213.07
## 2 188 778.32 0 -565.25