Libraries and load data.
library(tidyverse)
air_pol <- read_csv('airpollution.csv')
head(air_pol)
## # A tibble: 6 x 17
## CITY Mortality Precip Humidity JanTemp JulyTemp Over65 House Educ Sound
## <chr> <dbl> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 San … 791. 13 71 49 68 7 3.36 12.2 90.7
## 2 Wich… 824. 28 54 32 81 7 3.27 12.1 81
## 3 San … 840. 10 61 55 70 7.3 3.11 12.1 88.9
## 4 Lanc… 844. 43 54 32 74 10.1 3.38 9.5 79.2
## 5 Minn… 858. 25 58 12 73 9.2 3.28 12.1 83.1
## 6 Dall… 860. 35 54 46 85 7.1 3.22 11.8 79.9
## # ... with 7 more variables: Density <int>, NonWhite <dbl>,
## # WhiteCol <dbl>, Poor <dbl>, HC <int>, NOX <int>, SO2 <int>
dim(air_pol)[2]
## [1] 17
# outcome vs main predictors
pairs(air_pol[,c(2, 15:17)])
# outcome vs other predictors (in two chunks)
pairs(air_pol[,c(2, 3:8)])
pairs(air_pol[,c(2, 9:14)])
Q.1 - Fit a model and comment on residuals.
# fit model
big_model <- lm(Mortality ~. , data = air_pol[-c(1,15:17)])
summary(big_model)
##
## Call:
## lm(formula = Mortality ~ ., data = air_pol[-c(1, 15:17)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.677 -19.583 -3.084 20.636 82.627
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.770e+03 4.443e+02 3.984 0.000234 ***
## Precip 1.572e+00 8.250e-01 1.906 0.062842 .
## Humidity -1.145e-01 1.104e+00 -0.104 0.917840
## JanTemp -2.166e+00 9.995e-01 -2.167 0.035349 *
## JulyTemp -3.103e+00 1.859e+00 -1.669 0.101750
## Over65 -4.593e+00 8.267e+00 -0.556 0.581169
## House -1.033e+02 7.238e+01 -1.428 0.160027
## Educ -2.089e+01 1.122e+01 -1.861 0.068970 .
## Sound -3.761e-01 1.814e+00 -0.207 0.836618
## Density 5.325e-03 4.174e-03 1.276 0.208298
## NonWhite 5.741e+00 1.157e+00 4.962 9.58e-06 ***
## WhiteCol -3.992e-01 1.644e+00 -0.243 0.809197
## Poor -7.119e-01 3.291e+00 -0.216 0.829669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.68 on 47 degrees of freedom
## Multiple R-squared: 0.723, Adjusted R-squared: 0.6523
## F-statistic: 10.22 on 12 and 47 DF, p-value: 1.829e-09
# plot residuals vs fitted
par(mfrow = c(1,3))
plot(big_model, 1:3)
In the first plot the variance is not constantly symmetrical. The errors are concentrated more towards the center of the plot, and there is some evidence of HK. And in the second plot the observations are mostly around the line so the residuals look normal.
Q.2 - Cook’s distance and leverage plots.
# plots to help identify 'influential points'
par(mfrow=c(1,2), mar = c(4,4,2,2))
plot(big_model, 4:5)
# plot influential observations
air_pol$unusual = 'no'; air_pol$unusual[c(4,7,20)] = 'yes';
ggplot(air_pol, aes(x = Mortality, y = CITY, colour = unusual)) +
geom_point() +
theme_bw() +
theme(legend.position = "top",plot.title = element_text(face = "bold", hjust = 0.5), axis.text.y = element_text(size = rel(0.5))) +
labs(title = "City vs Mortality", y = "City")
filter(air_pol, unusual == 'yes')
## # A tibble: 3 x 18
## CITY Mortality Precip Humidity JanTemp JulyTemp Over65 House Educ Sound
## <chr> <dbl> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Lanc… 844. 43 54 32 74 10.1 3.38 9.5 79.2
## 2 Miam… 861. 60 60 67 82 10 2.98 11.5 88.6
## 3 York… 912. 42 54 33 76 9.7 3.22 9 76.2
## # ... with 8 more variables: Density <int>, NonWhite <dbl>,
## # WhiteCol <dbl>, Poor <dbl>, HC <int>, NOX <int>, SO2 <int>,
## # unusual <chr>
The extreme observations are “Lancaster, PA”, “Miami, FL” and “York, PA”.
Q.3 - Running a stepwise selection.
base_model <- step(big_model, trace = 0)
summary(base_model)
##
## Call:
## lm(formula = Mortality ~ Precip + JanTemp + JulyTemp + House +
## Educ + Density + NonWhite, data = air_pol[-c(1, 15:17)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.109 -20.783 -1.205 19.554 81.604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.525e+03 2.308e+02 6.608 2.09e-08 ***
## Precip 1.276e+00 6.075e-01 2.100 0.04063 *
## JanTemp -2.123e+00 6.089e-01 -3.487 0.00100 **
## JulyTemp -2.728e+00 1.278e+00 -2.134 0.03758 *
## House -7.003e+01 4.852e+01 -1.443 0.15492
## Educ -2.003e+01 7.112e+00 -2.817 0.00684 **
## Density 5.555e-03 3.567e-03 1.557 0.12543
## NonWhite 5.892e+00 8.061e-01 7.309 1.59e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.07 on 52 degrees of freedom
## Multiple R-squared: 0.7198, Adjusted R-squared: 0.6821
## F-statistic: 19.09 on 7 and 52 DF, p-value: 2.387e-12
Keep the following variables. Step: AIC=434.3 Mortality ~ Precip + JanTemp + JulyTemp + House + Educ + Density + NonWhite
Q.4 - Re-evaluate after using stepwise.
par(mfrow = c(1,3))
plot(base_model, 1:3)
In the first plot the variance is not constantly symmetrical. The errors are concentrated more towards the center of the plot, and there is some evidence of HK. And in the second plot the observations are mostly around the line so the residuals look normal.
# cook's distance plot
par(mfrow=c(1,2), mar = c(4,4,2,2))
plot(base_model, 4:5)
# plot the unusual observations
air_pol$unusual = 'no'; air_pol$unusual[c(7,20,60)] = 'yes';
ggplot(air_pol, aes(x = Mortality, y = CITY, colour = unusual)) +
geom_point() +
theme_bw() +
theme(legend.position = "top",plot.title = element_text(face = "bold", hjust = 0.5), axis.text.y = element_text(size = rel(0.5))) +
labs(title = "City vs Mortality", y = "City")
filter(air_pol, unusual == 'yes')
## # A tibble: 3 x 18
## CITY Mortality Precip Humidity JanTemp JulyTemp Over65 House Educ Sound
## <chr> <dbl> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Miam… 861. 60 60 67 82 10 2.98 11.5 88.6
## 2 York… 912. 42 54 33 76 9.7 3.22 9 76.2
## 3 New … 1113. 54 62 54 81 7.4 3.36 9.7 72.8
## # ... with 8 more variables: Density <int>, NonWhite <dbl>,
## # WhiteCol <dbl>, Poor <dbl>, HC <int>, NOX <int>, SO2 <int>,
## # unusual <chr>
The extreme observations are “Miami, FL”, “York, PA” and “New Orleans LA”.
Q.5 - Fit a linear model with the pollution variables.
# fit model
airpol_model <- lm(Mortality ~ Precip + JanTemp + JulyTemp + House + Educ + Density + NonWhite + HC + NOX + SO2, data = air_pol)
summary(airpol_model)
##
## Call:
## lm(formula = Mortality ~ Precip + JanTemp + JulyTemp + House +
## Educ + Density + NonWhite + HC + NOX + SO2, data = air_pol)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.228 -17.685 0.436 16.681 87.620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.401e+03 2.446e+02 5.726 6.18e-07 ***
## Precip 1.304e+00 6.989e-01 1.867 0.0680 .
## JanTemp -1.596e+00 6.588e-01 -2.422 0.0192 *
## JulyTemp -2.573e+00 1.331e+00 -1.933 0.0590 .
## House -5.407e+01 4.855e+01 -1.114 0.2708
## Educ -1.549e+01 7.102e+00 -2.181 0.0340 *
## Density 3.566e-03 3.612e-03 0.987 0.3284
## NonWhite 5.168e+00 8.524e-01 6.063 1.87e-07 ***
## HC -5.871e-01 4.358e-01 -1.347 0.1841
## NOX 1.117e+00 8.746e-01 1.277 0.2074
## SO2 9.922e-02 1.267e-01 0.783 0.4375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.63 on 49 degrees of freedom
## Multiple R-squared: 0.7572, Adjusted R-squared: 0.7076
## F-statistic: 15.28 on 10 and 49 DF, p-value: 6.643e-12
# plot residuals vs fitted
par(mfrow = c(1,3))
plot(airpol_model, 1:3)
In the first plot the variance is not constantly symmetrical. The errors are concentrated more towards the center of the plot, and there is some evidence of HK. And in the second plot the observations are further from the line than in the other plots, so the normal assumption is less than before.
# cook's distance plot
par(mfrow=c(1,2), mar = c(4,4,2,2))
plot(airpol_model, 4:5)
# plot the unusual observations
air_pol$unusual = 'no'; air_pol$unusual[c(7,8,20)] = 'yes';
ggplot(air_pol, aes(x = Mortality, y = CITY, colour = unusual)) +
geom_point() +
theme_bw() +
theme(legend.position = "top",plot.title = element_text(face = "bold", hjust = 0.5), axis.text.y = element_text(size = rel(0.5))) +
labs(title = "City vs Mortality", y = "City")
filter(air_pol, unusual == 'yes')
## # A tibble: 3 x 18
## CITY Mortality Precip Humidity JanTemp JulyTemp Over65 House Educ Sound
## <chr> <dbl> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Miam… 861. 60 60 67 82 10 2.98 11.5 88.6
## 2 Los … 862. 11 47 53 68 9.2 2.99 12.1 90.6
## 3 York… 912. 42 54 33 76 9.7 3.22 9 76.2
## # ... with 8 more variables: Density <int>, NonWhite <dbl>,
## # WhiteCol <dbl>, Poor <dbl>, HC <int>, NOX <int>, SO2 <int>,
## # unusual <chr>
The extreme observations are “Miami, FL”, “York, PA” and “New Orleans LA”.
Q.6 - Pollution variables hypothesis test. H0: HC = NOX = SO2 = 0 vs. HC ≠ NOX ≠ SO2 ≠ 0
Q.7 - Hypothesis test in R
# hypothesis test
anova(base_model, airpol_model)
## Analysis of Variance Table
##
## Model 1: Mortality ~ Precip + JanTemp + JulyTemp + House + Educ + Density +
## NonWhite
## Model 2: Mortality ~ Precip + JanTemp + JulyTemp + House + Educ + Density +
## NonWhite + HC + NOX + SO2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 52 63955
## 2 49 55431 3 8524 2.5117 0.06942 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We reject H0, as the p-value is small.
# confidence interval
confint(airpol_model)[9:11,]
## 2.5 % 97.5 %
## HC -1.4627662 0.2885874
## NOX -0.6402996 2.8749487
## SO2 -0.1554831 0.3539143
Q.8 - Rerun stepwise and hypothesis tests.
big_model2 <- lm(Mortality ~., data = air_pol[-c(4,7,20), -c(1,15:17)])
base_model2 <- step(big_model2, trace = 0)
names(coef(base_model))
## [1] "(Intercept)" "Precip" "JanTemp" "JulyTemp" "House"
## [6] "Educ" "Density" "NonWhite"
names(coef(base_model2))
## [1] "(Intercept)" "Precip" "JulyTemp" "Educ" "Density"
## [6] "NonWhite"
airpol_model2 <- lm(Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite + HC + NOX + SO2, data = air_pol)
par(mfrow = c(2,2))
plot(airpol_model2,c(1:2,4:5))
air_pol$CITY[c(7,8,20)]
## [1] "Miami, FL" "Los Angeles, CA" "York, PA"
Same as before, but now LA not Lancaster is high leverage.
base_model3 <- lm(Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite, data = air_pol[-c(7,8,20),])
airpol_model3 <- lm(Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite + HC + NOX + SO2, data = air_pol[-c(7,8,20),])
anova(base_model3, airpol_model3)
## Analysis of Variance Table
##
## Model 1: Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite
## Model 2: Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite + HC +
## NOX + SO2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 51 55052
## 2 48 46184 3 8867.7 3.0721 0.03646 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(airpol_model3)[7:9,]
## 2.5 % 97.5 %
## HC -1.7753141 0.02233103
## NOX -0.2108511 3.10653057
## SO2 -0.1892345 0.29670925
Now the overall effect of the three air pollution predictors is significant(we do explain significant more variability in y with these three included). But the individual CI’s all are still 0.