require(faraway)
## Loading required package: faraway
data(swiss, package="faraway")
## Warning in data(swiss, package = "faraway"): data set 'swiss' not found
head(swiss)
## Fertility Agriculture Examination Education Catholic
## Courtelary 80.2 17.0 15 12 9.96
## Delemont 83.1 45.1 6 9 84.84
## Franches-Mnt 92.5 39.7 5 5 93.40
## Moutier 85.8 36.5 12 7 33.77
## Neuveville 76.9 43.5 17 15 5.16
## Porrentruy 76.1 35.3 9 7 90.57
## Infant.Mortality
## Courtelary 22.2
## Delemont 22.2
## Franches-Mnt 20.2
## Moutier 20.3
## Neuveville 20.6
## Porrentruy 26.6
lmod<-lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data=swiss)
summary(lmod)
##
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education +
## Infant.Mortality, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.513 -5.011 -1.188 5.522 16.982
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.77314 11.62836 5.914 5.28e-07 ***
## Agriculture -0.12929 0.07485 -1.727 0.09144 .
## Examination -0.68799 0.22628 -3.040 0.00406 **
## Education -0.61965 0.17631 -3.515 0.00107 **
## Infant.Mortality 1.30710 0.40658 3.215 0.00251 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.796 on 42 degrees of freedom
## Multiple R-squared: 0.6444, Adjusted R-squared: 0.6105
## F-statistic: 19.02 on 4 and 42 DF, p-value: 5.413e-09
plot(fitted(lmod), residuals(lmod), xlab = "Fitted", ylab = "Residuals")
abline(h=0)
Residuals vs. fitted plot - the plot indicates some non-linearity & mild nonconstant variance which prompts some change in the structual form.
plot(fitted(lmod), sqrt(abs(residuals(lmod))), xlab = "Fitted", ylab = expression(sqrt(hat(epsilon))))
The plot shows somewhat approximate constant variance.
sumary(lm(sqrt(abs(residuals(lmod))) ~ fitted(lmod)))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.8004365 0.9336222 1.9284 0.06012
## fitted(lmod) 0.0069266 0.0131792 0.5256 0.60176
##
## n = 47, p = 2, Residual SE = 0.89630, R-Squared = 0.01
par(mfrow=c(3,3))
n <- 50
# constant variance
for(i in 1:9) {x <- runif(n) ; plot(x, rnorm(n))}
for(i in 1:9) {x <- runif(n) ; plot(x, x*rnorm(n))}
for(i in 1:9) {x <- runif(n) ; plot(x, sqrt ((x)) * rnorm(n))}
for(i in 1:9) {x <- runif(n) ; plot(x, cos(x*pi/25)+rnorm(n, sd = 1))}
par(mfrow=c(1,1))
with a log transformation f response Fertility
lmod1<-lm(sqrt(Fertility)~Agriculture+Examination+Education+Infant.Mortality, data=swiss)
summary(lmod1)
##
## Call:
## lm(formula = sqrt(Fertility) ~ Agriculture + Examination + Education +
## Infant.Mortality, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92649 -0.29485 -0.04204 0.32966 0.95955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.243810 0.695814 11.848 5.62e-15 ***
## Agriculture -0.007792 0.004479 -1.740 0.089237 .
## Examination -0.037648 0.013540 -2.781 0.008088 **
## Education -0.043909 0.010550 -4.162 0.000153 ***
## Infant.Mortality 0.079912 0.024329 3.285 0.002065 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4665 on 42 degrees of freedom
## Multiple R-squared: 0.6698, Adjusted R-squared: 0.6383
## F-statistic: 21.3 on 4 and 42 DF, p-value: 1.182e-09
plot(fitted(lmod1), residuals(lmod1), xlab = "Fitted", ylab = "Residuals")
abline(h=0)
plot(fitted(lmod1), sqrt(abs(residuals(lmod1))), xlab = "Fitted", ylab = expression(sqrt(hat(epsilon))))
Seems to have somewhat improved the non-linearity issue. The constant variance assumption is slightly improved.
plot(swiss$Agriculture, residuals(lmod), xlab = "Agriculture", ylab = "Residuals")
abline(h=0)
var.test(residuals(lmod)[swiss$Agriculture>50], residuals(lmod)[swiss$Agriculture<50])
##
## F test to compare two variances
##
## data: residuals(lmod)[swiss$Agriculture > 50] and residuals(lmod)[swiss$Agriculture < 50]
## F = 0.89547, num df = 25, denom df = 20, p-value = 0.7844
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3737444 2.0599872
## sample estimates:
## ratio of variances
## 0.8954696
plot(swiss$Examination, residuals(lmod), xlab = "Examination", ylab = "Residuals")
abline(h=0)
var.test(residuals(lmod)[swiss$Examination>23], residuals(lmod)[swiss$Examination<23])
##
## F test to compare two variances
##
## data: residuals(lmod)[swiss$Examination > 23] and residuals(lmod)[swiss$Examination < 23]
## F = 0.56835, num df = 7, denom df = 38, p-value = 0.4464
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.2150604 2.4537699
## sample estimates:
## ratio of variances
## 0.5683459
plot(swiss$Education, residuals(lmod), xlab = "Education", ylab = "Residuals")
abline(h=0)
var.test(residuals(lmod)[swiss$Education>20], residuals(lmod)[swiss$Education<20])
##
## F test to compare two variances
##
## data: residuals(lmod)[swiss$Education > 20] and residuals(lmod)[swiss$Education < 20]
## F = 2.3502, num df = 4, denom df = 40, p-value = 0.141
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7517852 19.7675604
## sample estimates:
## ratio of variances
## 2.350166
plot(swiss$Infant.Mortality, residuals(lmod), xlab = "Infant.Mortality", ylab = "Residuals")
abline(h=0)
var.test(residuals(lmod)[swiss$Infant.Mortality>20], residuals(lmod)[swiss$Infant.Mortality<20])
##
## F test to compare two variances
##
## data: residuals(lmod)[swiss$Infant.Mortality > 20] and residuals(lmod)[swiss$Infant.Mortality < 20]
## F = 1.1186, num df = 22, denom df = 20, p-value = 0.8056
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4596246 2.6722591
## sample estimates:
## ratio of variances
## 1.118576
A significant difference is seen only in the variable Education.
qqnorm(residuals(lmod), ylab = "Residuals", main = "")
qqline(residuals(lmod))
The plot shows that the observations approximately fit on the line, and therefore the normality assumption is satisfied.
par(mfrow=c(3,3))
n <- 50
# normal
for(i in 1:9) {x <- rnorm(n) ; qqnorm(x) ; qqline(x)}
for(i in 1:9) {x <- exp(rnorm(n)); qqnorm(x); qqline(x)}
for(i in 1:9) {x <- rcauchy(n); qqnorm(x); qqline(x)}
for(i in 1:9) {x <- runif(n); qqnorm(x); qqline(x)}
par(mfrow=c(1,1))
shapiro.test(residuals(lmod))
##
## Shapiro-Wilk normality test
##
## data: residuals(lmod)
## W = 0.98067, p-value = 0.6201
The null hypothesis is that the residuals are normal. But because p-value is high here, we can not reject null hypothesis.
hatv <- hatvalues(lmod)
head(hatv)
## Courtelary Delemont Franches-Mnt Moutier Neuveville Porrentruy
## 0.13363393 0.12243954 0.16067890 0.07653710 0.02929924 0.18893002
sum(hatv)
## [1] 5
We verify that the sum of the leverages is indeed five—the number of parameters in the model.
provinces <- row.names(swiss)
inf <- influence(lmod)
halfnorm(inf$hat, labs = provinces, ylab = "Leverages")
qqnorm(rstandard(lmod))
abline(0,1)
Because these residuals have been standardized, the points to follow the y = x line if normality holds. An absolute value of 2 would be large but not exceptional for a standardized residual whereas a value of 4 would be very unusual under the standard normal.
stud <- rstudent(lmod)
stud[which.max(abs(stud))]
## Sierre
## 2.483946
qt(0.05/(50*2),44)
## [1] -3.525801
since the value 2.4839 is smaller than 3.53, Sierre is not an outlier.
cook <- cooks.distance(lmod)
halfnorm(cook, 3, labs = provinces, ylab = "Cook's distance")
lmodn<-lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data=swiss, subset = (cook < max(cook)))
summary(lmod)
##
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education +
## Infant.Mortality, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.513 -5.011 -1.188 5.522 16.982
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.77314 11.62836 5.914 5.28e-07 ***
## Agriculture -0.12929 0.07485 -1.727 0.09144 .
## Examination -0.68799 0.22628 -3.040 0.00406 **
## Education -0.61965 0.17631 -3.515 0.00107 **
## Infant.Mortality 1.30710 0.40658 3.215 0.00251 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.796 on 42 degrees of freedom
## Multiple R-squared: 0.6444, Adjusted R-squared: 0.6105
## F-statistic: 19.02 on 4 and 42 DF, p-value: 5.413e-09
summary(lmodn)
##
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education +
## Infant.Mortality, data = swiss, subset = (cook < max(cook)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.4794 -4.2677 -0.9448 4.5559 14.1863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.39046 11.18455 5.668 1.28e-06 ***
## Agriculture -0.13887 0.07073 -1.963 0.056419 .
## Examination -0.57810 0.21805 -2.651 0.011350 *
## Education -0.65484 0.16697 -3.922 0.000327 ***
## Infant.Mortality 1.50886 0.39216 3.848 0.000409 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.357 on 41 degrees of freedom
## Multiple R-squared: 0.6679, Adjusted R-squared: 0.6355
## F-statistic: 20.61 on 4 and 41 DF, p-value: 2.257e-09
plot(dfbeta(lmod)[,3],ylab="Change in Examination coef")
abline(h=0)
By looking at the differences between these 2 models, we can see that the significance level of the parameters Examination, Education and Infant.Mortality had been changed. also the coefficent of Examination changed by about 16%.
d <-residuals(lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data=swiss))
m<- residuals(lm(Education~ Agriculture+Examination+Infant.Mortality, data=swiss))
plot(m, d, xlab ="Education residuals", ylab = "Fertility residuals")
abline(d,m)
coef(lm(d ~m))
## (Intercept) m
## -3.238853e-16 -1.506442e-17
coef(lmod)
## (Intercept) Agriculture Examination Education
## 68.7731361 -0.1292921 -0.6879944 -0.6196489
## Infant.Mortality
## 1.3070966
lmodx <- lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data = swiss, subset = (Education>10))
lmody <- lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data = swiss, subset = (Education<10))
summary(lmodx)
##
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education +
## Infant.Mortality, data = swiss, subset = (Education > 10))
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.3619 -7.3173 0.4795 5.3427 12.4949
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.44244 21.27685 2.700 0.0193 *
## Agriculture -0.07256 0.14709 -0.493 0.6307
## Examination -0.20762 0.46009 -0.451 0.6598
## Education -0.88826 0.22907 -3.878 0.0022 **
## Infant.Mortality 1.55734 0.67727 2.299 0.0402 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.189 on 12 degrees of freedom
## Multiple R-squared: 0.7374, Adjusted R-squared: 0.6498
## F-statistic: 8.422 on 4 and 12 DF, p-value: 0.00178
summary(lmody)
##
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education +
## Infant.Mortality, data = swiss, subset = (Education < 10))
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.3587 -3.6655 -0.6091 4.4455 14.7797
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.41385 15.48716 4.353 0.000234 ***
## Agriculture -0.08819 0.09702 -0.909 0.372762
## Examination -1.04758 0.29103 -3.600 0.001512 **
## Education -0.02254 0.72875 -0.031 0.975593
## Infant.Mortality 1.26857 0.59792 2.122 0.044857 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.87 on 23 degrees of freedom
## Multiple R-squared: 0.4347, Adjusted R-squared: 0.3364
## F-statistic: 4.422 on 4 and 23 DF, p-value: 0.008497
swiss$status <- ifelse(swiss$Education > 10, "high", "low")
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
ggplot(swiss, aes(x = Education, y = Fertility, shape = status)) +
geom_point()
ggplot(swiss, aes(x = Education, y = Fertility, shape = status)) +
geom_point() +
facet_grid(~ status) +
stat_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'