1. For the swiss data (?swiss for documentation), fit a model with Fertility as the response and the other variables as predictors. Perform regression diagnostics on this model to answer the following questions. Display any plots that are relevant. Do not provide any plots about which you have nothing to say. Suggest possible improvements or corrections to the model where appropriate.
require(faraway)
## Loading required package: faraway
data(swiss, package="faraway")
## Warning in data(swiss, package = "faraway"): data set 'swiss' not found
head(swiss)
##              Fertility Agriculture Examination Education Catholic
## Courtelary        80.2        17.0          15        12     9.96
## Delemont          83.1        45.1           6         9    84.84
## Franches-Mnt      92.5        39.7           5         5    93.40
## Moutier           85.8        36.5          12         7    33.77
## Neuveville        76.9        43.5          17        15     5.16
## Porrentruy        76.1        35.3           9         7    90.57
##              Infant.Mortality
## Courtelary               22.2
## Delemont                 22.2
## Franches-Mnt             20.2
## Moutier                  20.3
## Neuveville               20.6
## Porrentruy               26.6
lmod<-lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data=swiss)
summary(lmod)
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Infant.Mortality, data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.513  -5.011  -1.188   5.522  16.982 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      68.77314   11.62836   5.914 5.28e-07 ***
## Agriculture      -0.12929    0.07485  -1.727  0.09144 .  
## Examination      -0.68799    0.22628  -3.040  0.00406 ** 
## Education        -0.61965    0.17631  -3.515  0.00107 ** 
## Infant.Mortality  1.30710    0.40658   3.215  0.00251 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.796 on 42 degrees of freedom
## Multiple R-squared:  0.6444, Adjusted R-squared:  0.6105 
## F-statistic: 19.02 on 4 and 42 DF,  p-value: 5.413e-09
  1. Check the constant variance assumption for the errors.
plot(fitted(lmod), residuals(lmod), xlab = "Fitted", ylab = "Residuals")
abline(h=0)

Residuals vs. fitted plot - the plot indicates some non-linearity & mild nonconstant variance which prompts some change in the structual form.

plot(fitted(lmod), sqrt(abs(residuals(lmod))), xlab = "Fitted", ylab = expression(sqrt(hat(epsilon))))

The plot shows somewhat approximate constant variance.

sumary(lm(sqrt(abs(residuals(lmod))) ~ fitted(lmod)))
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.8004365  0.9336222  1.9284  0.06012
## fitted(lmod) 0.0069266  0.0131792  0.5256  0.60176
## 
## n = 47, p = 2, Residual SE = 0.89630, R-Squared = 0.01
par(mfrow=c(3,3))
n <- 50
# constant variance
for(i in 1:9) {x <- runif(n) ; plot(x, rnorm(n))}

for(i in 1:9) {x <- runif(n) ; plot(x, x*rnorm(n))}

for(i in 1:9) {x <- runif(n) ; plot(x, sqrt ((x)) * rnorm(n))}

for(i in 1:9) {x <- runif(n) ; plot(x, cos(x*pi/25)+rnorm(n, sd = 1))}

par(mfrow=c(1,1))

with a log transformation f response Fertility

lmod1<-lm(sqrt(Fertility)~Agriculture+Examination+Education+Infant.Mortality, data=swiss)
summary(lmod1)
## 
## Call:
## lm(formula = sqrt(Fertility) ~ Agriculture + Examination + Education + 
##     Infant.Mortality, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92649 -0.29485 -0.04204  0.32966  0.95955 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       8.243810   0.695814  11.848 5.62e-15 ***
## Agriculture      -0.007792   0.004479  -1.740 0.089237 .  
## Examination      -0.037648   0.013540  -2.781 0.008088 ** 
## Education        -0.043909   0.010550  -4.162 0.000153 ***
## Infant.Mortality  0.079912   0.024329   3.285 0.002065 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4665 on 42 degrees of freedom
## Multiple R-squared:  0.6698, Adjusted R-squared:  0.6383 
## F-statistic:  21.3 on 4 and 42 DF,  p-value: 1.182e-09
plot(fitted(lmod1), residuals(lmod1), xlab = "Fitted", ylab = "Residuals")
abline(h=0)

plot(fitted(lmod1), sqrt(abs(residuals(lmod1))), xlab = "Fitted", ylab = expression(sqrt(hat(epsilon))))

Seems to have somewhat improved the non-linearity issue. The constant variance assumption is slightly improved.

plot(swiss$Agriculture, residuals(lmod), xlab = "Agriculture", ylab = "Residuals")
abline(h=0)

var.test(residuals(lmod)[swiss$Agriculture>50], residuals(lmod)[swiss$Agriculture<50])
## 
##  F test to compare two variances
## 
## data:  residuals(lmod)[swiss$Agriculture > 50] and residuals(lmod)[swiss$Agriculture < 50]
## F = 0.89547, num df = 25, denom df = 20, p-value = 0.7844
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3737444 2.0599872
## sample estimates:
## ratio of variances 
##          0.8954696
plot(swiss$Examination, residuals(lmod), xlab = "Examination", ylab = "Residuals")
abline(h=0)

var.test(residuals(lmod)[swiss$Examination>23], residuals(lmod)[swiss$Examination<23])
## 
##  F test to compare two variances
## 
## data:  residuals(lmod)[swiss$Examination > 23] and residuals(lmod)[swiss$Examination < 23]
## F = 0.56835, num df = 7, denom df = 38, p-value = 0.4464
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.2150604 2.4537699
## sample estimates:
## ratio of variances 
##          0.5683459
plot(swiss$Education, residuals(lmod), xlab = "Education", ylab = "Residuals")
abline(h=0)

var.test(residuals(lmod)[swiss$Education>20], residuals(lmod)[swiss$Education<20])
## 
##  F test to compare two variances
## 
## data:  residuals(lmod)[swiss$Education > 20] and residuals(lmod)[swiss$Education < 20]
## F = 2.3502, num df = 4, denom df = 40, p-value = 0.141
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##   0.7517852 19.7675604
## sample estimates:
## ratio of variances 
##           2.350166
plot(swiss$Infant.Mortality, residuals(lmod), xlab = "Infant.Mortality", ylab = "Residuals")
abline(h=0)

var.test(residuals(lmod)[swiss$Infant.Mortality>20], residuals(lmod)[swiss$Infant.Mortality<20])
## 
##  F test to compare two variances
## 
## data:  residuals(lmod)[swiss$Infant.Mortality > 20] and residuals(lmod)[swiss$Infant.Mortality < 20]
## F = 1.1186, num df = 22, denom df = 20, p-value = 0.8056
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4596246 2.6722591
## sample estimates:
## ratio of variances 
##           1.118576

A significant difference is seen only in the variable Education.

  1. Check the normality assumption.
qqnorm(residuals(lmod), ylab = "Residuals", main = "")
qqline(residuals(lmod))

The plot shows that the observations approximately fit on the line, and therefore the normality assumption is satisfied.

par(mfrow=c(3,3))
n <- 50 
# normal 
for(i in 1:9) {x <- rnorm(n) ; qqnorm(x) ; qqline(x)}

for(i in 1:9) {x <- exp(rnorm(n)); qqnorm(x); qqline(x)}

for(i in 1:9) {x <- rcauchy(n); qqnorm(x); qqline(x)}

for(i in 1:9) {x <- runif(n); qqnorm(x); qqline(x)}

par(mfrow=c(1,1))
shapiro.test(residuals(lmod))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(lmod)
## W = 0.98067, p-value = 0.6201

The null hypothesis is that the residuals are normal. But because p-value is high here, we can not reject null hypothesis.

  1. Check for large leverage points.
hatv <- hatvalues(lmod)
head(hatv)
##   Courtelary     Delemont Franches-Mnt      Moutier   Neuveville   Porrentruy 
##   0.13363393   0.12243954   0.16067890   0.07653710   0.02929924   0.18893002
sum(hatv)
## [1] 5

We verify that the sum of the leverages is indeed five—the number of parameters in the model.

provinces <- row.names(swiss)
inf <- influence(lmod)
halfnorm(inf$hat, labs = provinces, ylab = "Leverages")

qqnorm(rstandard(lmod))
abline(0,1)

Because these residuals have been standardized, the points to follow the y = x line if normality holds. An absolute value of 2 would be large but not exceptional for a standardized residual whereas a value of 4 would be very unusual under the standard normal.

  1. Check for outliers.
stud <- rstudent(lmod)
stud[which.max(abs(stud))]
##   Sierre 
## 2.483946
qt(0.05/(50*2),44)
## [1] -3.525801

since the value 2.4839 is smaller than 3.53, Sierre is not an outlier.

  1. Check for influential points.
cook <- cooks.distance(lmod)
halfnorm(cook, 3, labs = provinces, ylab = "Cook's distance")

lmodn<-lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data=swiss, subset = (cook < max(cook)))
summary(lmod)
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Infant.Mortality, data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.513  -5.011  -1.188   5.522  16.982 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      68.77314   11.62836   5.914 5.28e-07 ***
## Agriculture      -0.12929    0.07485  -1.727  0.09144 .  
## Examination      -0.68799    0.22628  -3.040  0.00406 ** 
## Education        -0.61965    0.17631  -3.515  0.00107 ** 
## Infant.Mortality  1.30710    0.40658   3.215  0.00251 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.796 on 42 degrees of freedom
## Multiple R-squared:  0.6444, Adjusted R-squared:  0.6105 
## F-statistic: 19.02 on 4 and 42 DF,  p-value: 5.413e-09
summary(lmodn)
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Infant.Mortality, data = swiss, subset = (cook < max(cook)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.4794  -4.2677  -0.9448   4.5559  14.1863 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      63.39046   11.18455   5.668 1.28e-06 ***
## Agriculture      -0.13887    0.07073  -1.963 0.056419 .  
## Examination      -0.57810    0.21805  -2.651 0.011350 *  
## Education        -0.65484    0.16697  -3.922 0.000327 ***
## Infant.Mortality  1.50886    0.39216   3.848 0.000409 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.357 on 41 degrees of freedom
## Multiple R-squared:  0.6679, Adjusted R-squared:  0.6355 
## F-statistic: 20.61 on 4 and 41 DF,  p-value: 2.257e-09
plot(dfbeta(lmod)[,3],ylab="Change in Examination coef")
abline(h=0)

By looking at the differences between these 2 models, we can see that the significance level of the parameters Examination, Education and Infant.Mortality had been changed. also the coefficent of Examination changed by about 16%.

  1. Check the structure of the relationship between the predictors and the response.
d <-residuals(lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data=swiss))
m<- residuals(lm(Education~ Agriculture+Examination+Infant.Mortality, data=swiss))
plot(m, d, xlab ="Education residuals", ylab = "Fertility residuals")
abline(d,m)

coef(lm(d ~m))
##   (Intercept)             m 
## -3.238853e-16 -1.506442e-17
coef(lmod)
##      (Intercept)      Agriculture      Examination        Education 
##       68.7731361       -0.1292921       -0.6879944       -0.6196489 
## Infant.Mortality 
##        1.3070966
lmodx <- lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data = swiss, subset = (Education>10))
lmody <- lm(Fertility~Agriculture+Examination+Education+Infant.Mortality, data = swiss, subset = (Education<10))

summary(lmodx)
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Infant.Mortality, data = swiss, subset = (Education > 10))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.3619  -7.3173   0.4795   5.3427  12.4949 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      57.44244   21.27685   2.700   0.0193 * 
## Agriculture      -0.07256    0.14709  -0.493   0.6307   
## Examination      -0.20762    0.46009  -0.451   0.6598   
## Education        -0.88826    0.22907  -3.878   0.0022 **
## Infant.Mortality  1.55734    0.67727   2.299   0.0402 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.189 on 12 degrees of freedom
## Multiple R-squared:  0.7374, Adjusted R-squared:  0.6498 
## F-statistic: 8.422 on 4 and 12 DF,  p-value: 0.00178
summary(lmody)
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Infant.Mortality, data = swiss, subset = (Education < 10))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.3587  -3.6655  -0.6091   4.4455  14.7797 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      67.41385   15.48716   4.353 0.000234 ***
## Agriculture      -0.08819    0.09702  -0.909 0.372762    
## Examination      -1.04758    0.29103  -3.600 0.001512 ** 
## Education        -0.02254    0.72875  -0.031 0.975593    
## Infant.Mortality  1.26857    0.59792   2.122 0.044857 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.87 on 23 degrees of freedom
## Multiple R-squared:  0.4347, Adjusted R-squared:  0.3364 
## F-statistic: 4.422 on 4 and 23 DF,  p-value: 0.008497
swiss$status <- ifelse(swiss$Education > 10, "high", "low")
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
ggplot(swiss, aes(x = Education, y = Fertility, shape = status)) +
  geom_point()

ggplot(swiss, aes(x = Education, y = Fertility, shape = status)) +
  geom_point() +
  facet_grid(~ status) +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'