Libraries and load data.

library(tidyverse)
air_pol <- read_csv('airpollution.csv')
head(air_pol)
## # A tibble: 6 x 17
##   CITY  Mortality Precip Humidity JanTemp JulyTemp Over65 House  Educ Sound
##   <chr>     <dbl>  <int>    <int>   <int>    <int>  <dbl> <dbl> <dbl> <dbl>
## 1 San …      791.     13       71      49       68    7    3.36  12.2  90.7
## 2 Wich…      824.     28       54      32       81    7    3.27  12.1  81  
## 3 San …      840.     10       61      55       70    7.3  3.11  12.1  88.9
## 4 Lanc…      844.     43       54      32       74   10.1  3.38   9.5  79.2
## 5 Minn…      858.     25       58      12       73    9.2  3.28  12.1  83.1
## 6 Dall…      860.     35       54      46       85    7.1  3.22  11.8  79.9
## # ... with 7 more variables: Density <int>, NonWhite <dbl>,
## #   WhiteCol <dbl>, Poor <dbl>, HC <int>, NOX <int>, SO2 <int>
dim(air_pol)[2]
## [1] 17
# outcome vs main predictors
pairs(air_pol[,c(2, 15:17)])

# outcome vs other predictors (in two chunks)
pairs(air_pol[,c(2, 3:8)])

pairs(air_pol[,c(2, 9:14)])

Q.1 - Fit a model and comment on residuals.

# fit model
big_model <- lm(Mortality ~. , data = air_pol[-c(1,15:17)])
summary(big_model)
## 
## Call:
## lm(formula = Mortality ~ ., data = air_pol[-c(1, 15:17)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.677 -19.583  -3.084  20.636  82.627 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.770e+03  4.443e+02   3.984 0.000234 ***
## Precip       1.572e+00  8.250e-01   1.906 0.062842 .  
## Humidity    -1.145e-01  1.104e+00  -0.104 0.917840    
## JanTemp     -2.166e+00  9.995e-01  -2.167 0.035349 *  
## JulyTemp    -3.103e+00  1.859e+00  -1.669 0.101750    
## Over65      -4.593e+00  8.267e+00  -0.556 0.581169    
## House       -1.033e+02  7.238e+01  -1.428 0.160027    
## Educ        -2.089e+01  1.122e+01  -1.861 0.068970 .  
## Sound       -3.761e-01  1.814e+00  -0.207 0.836618    
## Density      5.325e-03  4.174e-03   1.276 0.208298    
## NonWhite     5.741e+00  1.157e+00   4.962 9.58e-06 ***
## WhiteCol    -3.992e-01  1.644e+00  -0.243 0.809197    
## Poor        -7.119e-01  3.291e+00  -0.216 0.829669    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.68 on 47 degrees of freedom
## Multiple R-squared:  0.723,  Adjusted R-squared:  0.6523 
## F-statistic: 10.22 on 12 and 47 DF,  p-value: 1.829e-09
# plot residuals vs fitted
par(mfrow = c(1,3))
plot(big_model, 1:3)

In the first plot the variance is not constantly symmetrical. The errors are concentrated more towards the center of the plot, and there is some evidence of HK. And in the second plot the observations are mostly around the line so the residuals look normal.

Q.2 - Cook’s distance and leverage plots.

# plots to help identify 'influential points'
par(mfrow=c(1,2), mar = c(4,4,2,2))
plot(big_model, 4:5)

# plot influential observations
air_pol$unusual = 'no'; air_pol$unusual[c(4,7,20)] = 'yes';
ggplot(air_pol, aes(x = Mortality, y = CITY, colour = unusual)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "top",plot.title = element_text(face = "bold", hjust = 0.5), axis.text.y = element_text(size = rel(0.5))) + 
  labs(title = "City vs Mortality", y = "City")

filter(air_pol, unusual == 'yes')
## # A tibble: 3 x 18
##   CITY  Mortality Precip Humidity JanTemp JulyTemp Over65 House  Educ Sound
##   <chr>     <dbl>  <int>    <int>   <int>    <int>  <dbl> <dbl> <dbl> <dbl>
## 1 Lanc…      844.     43       54      32       74   10.1  3.38   9.5  79.2
## 2 Miam…      861.     60       60      67       82   10    2.98  11.5  88.6
## 3 York…      912.     42       54      33       76    9.7  3.22   9    76.2
## # ... with 8 more variables: Density <int>, NonWhite <dbl>,
## #   WhiteCol <dbl>, Poor <dbl>, HC <int>, NOX <int>, SO2 <int>,
## #   unusual <chr>

The extreme observations are “Lancaster, PA”, “Miami, FL” and “York, PA”.

Q.3 - Running a stepwise selection.

base_model <- step(big_model, trace = 0)
summary(base_model)
## 
## Call:
## lm(formula = Mortality ~ Precip + JanTemp + JulyTemp + House + 
##     Educ + Density + NonWhite, data = air_pol[-c(1, 15:17)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.109 -20.783  -1.205  19.554  81.604 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.525e+03  2.308e+02   6.608 2.09e-08 ***
## Precip       1.276e+00  6.075e-01   2.100  0.04063 *  
## JanTemp     -2.123e+00  6.089e-01  -3.487  0.00100 ** 
## JulyTemp    -2.728e+00  1.278e+00  -2.134  0.03758 *  
## House       -7.003e+01  4.852e+01  -1.443  0.15492    
## Educ        -2.003e+01  7.112e+00  -2.817  0.00684 ** 
## Density      5.555e-03  3.567e-03   1.557  0.12543    
## NonWhite     5.892e+00  8.061e-01   7.309 1.59e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.07 on 52 degrees of freedom
## Multiple R-squared:  0.7198, Adjusted R-squared:  0.6821 
## F-statistic: 19.09 on 7 and 52 DF,  p-value: 2.387e-12

Keep the following variables. Step: AIC=434.3 Mortality ~ Precip + JanTemp + JulyTemp + House + Educ + Density + NonWhite

Q.4 - Re-evaluate after using stepwise.

par(mfrow = c(1,3))
plot(base_model, 1:3)

In the first plot the variance is not constantly symmetrical. The errors are concentrated more towards the center of the plot, and there is some evidence of HK. And in the second plot the observations are mostly around the line so the residuals look normal.

# cook's distance plot
par(mfrow=c(1,2), mar = c(4,4,2,2))
plot(base_model, 4:5)

# plot the unusual observations
air_pol$unusual = 'no'; air_pol$unusual[c(7,20,60)] = 'yes';
ggplot(air_pol, aes(x = Mortality, y = CITY, colour = unusual)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "top",plot.title = element_text(face = "bold", hjust = 0.5), axis.text.y = element_text(size = rel(0.5))) + 
  labs(title = "City vs Mortality", y = "City")

filter(air_pol, unusual == 'yes')
## # A tibble: 3 x 18
##   CITY  Mortality Precip Humidity JanTemp JulyTemp Over65 House  Educ Sound
##   <chr>     <dbl>  <int>    <int>   <int>    <int>  <dbl> <dbl> <dbl> <dbl>
## 1 Miam…      861.     60       60      67       82   10    2.98  11.5  88.6
## 2 York…      912.     42       54      33       76    9.7  3.22   9    76.2
## 3 New …     1113.     54       62      54       81    7.4  3.36   9.7  72.8
## # ... with 8 more variables: Density <int>, NonWhite <dbl>,
## #   WhiteCol <dbl>, Poor <dbl>, HC <int>, NOX <int>, SO2 <int>,
## #   unusual <chr>

The extreme observations are “Miami, FL”, “York, PA” and “New Orleans LA”.

Q.5 - Fit a linear model with the pollution variables.

# fit model
airpol_model <- lm(Mortality ~ Precip + JanTemp + JulyTemp + House + Educ + Density + NonWhite + HC + NOX + SO2, data = air_pol) 
summary(airpol_model)
## 
## Call:
## lm(formula = Mortality ~ Precip + JanTemp + JulyTemp + House + 
##     Educ + Density + NonWhite + HC + NOX + SO2, data = air_pol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.228 -17.685   0.436  16.681  87.620 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.401e+03  2.446e+02   5.726 6.18e-07 ***
## Precip       1.304e+00  6.989e-01   1.867   0.0680 .  
## JanTemp     -1.596e+00  6.588e-01  -2.422   0.0192 *  
## JulyTemp    -2.573e+00  1.331e+00  -1.933   0.0590 .  
## House       -5.407e+01  4.855e+01  -1.114   0.2708    
## Educ        -1.549e+01  7.102e+00  -2.181   0.0340 *  
## Density      3.566e-03  3.612e-03   0.987   0.3284    
## NonWhite     5.168e+00  8.524e-01   6.063 1.87e-07 ***
## HC          -5.871e-01  4.358e-01  -1.347   0.1841    
## NOX          1.117e+00  8.746e-01   1.277   0.2074    
## SO2          9.922e-02  1.267e-01   0.783   0.4375    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.63 on 49 degrees of freedom
## Multiple R-squared:  0.7572, Adjusted R-squared:  0.7076 
## F-statistic: 15.28 on 10 and 49 DF,  p-value: 6.643e-12
# plot residuals vs fitted
par(mfrow = c(1,3))
plot(airpol_model, 1:3)

In the first plot the variance is not constantly symmetrical. The errors are concentrated more towards the center of the plot, and there is some evidence of HK. And in the second plot the observations are further from the line than in the other plots, so the normal assumption is less than before.

# cook's distance plot
par(mfrow=c(1,2), mar = c(4,4,2,2))
plot(airpol_model, 4:5)

# plot the unusual observations
air_pol$unusual = 'no'; air_pol$unusual[c(7,8,20)] = 'yes';
ggplot(air_pol, aes(x = Mortality, y = CITY, colour = unusual)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "top",plot.title = element_text(face = "bold", hjust = 0.5), axis.text.y = element_text(size = rel(0.5))) + 
  labs(title = "City vs Mortality", y = "City")

filter(air_pol, unusual == 'yes')
## # A tibble: 3 x 18
##   CITY  Mortality Precip Humidity JanTemp JulyTemp Over65 House  Educ Sound
##   <chr>     <dbl>  <int>    <int>   <int>    <int>  <dbl> <dbl> <dbl> <dbl>
## 1 Miam…      861.     60       60      67       82   10    2.98  11.5  88.6
## 2 Los …      862.     11       47      53       68    9.2  2.99  12.1  90.6
## 3 York…      912.     42       54      33       76    9.7  3.22   9    76.2
## # ... with 8 more variables: Density <int>, NonWhite <dbl>,
## #   WhiteCol <dbl>, Poor <dbl>, HC <int>, NOX <int>, SO2 <int>,
## #   unusual <chr>

The extreme observations are “Miami, FL”, “York, PA” and “New Orleans LA”.

Q.6 - Pollution variables hypothesis test. H0: HC = NOX = SO2 = 0 vs. HC ≠ NOX ≠ SO2 ≠ 0

Q.7 - Hypothesis test in R

# hypothesis test
anova(base_model, airpol_model)
## Analysis of Variance Table
## 
## Model 1: Mortality ~ Precip + JanTemp + JulyTemp + House + Educ + Density + 
##     NonWhite
## Model 2: Mortality ~ Precip + JanTemp + JulyTemp + House + Educ + Density + 
##     NonWhite + HC + NOX + SO2
##   Res.Df   RSS Df Sum of Sq      F  Pr(>F)  
## 1     52 63955                              
## 2     49 55431  3      8524 2.5117 0.06942 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We reject H0, as the p-value is small.

# confidence interval
confint(airpol_model)[9:11,]
##          2.5 %    97.5 %
## HC  -1.4627662 0.2885874
## NOX -0.6402996 2.8749487
## SO2 -0.1554831 0.3539143

Q.8 - Rerun stepwise and hypothesis tests.

big_model2 <- lm(Mortality ~., data = air_pol[-c(4,7,20), -c(1,15:17)])
base_model2 <- step(big_model2, trace = 0)

names(coef(base_model))
## [1] "(Intercept)" "Precip"      "JanTemp"     "JulyTemp"    "House"      
## [6] "Educ"        "Density"     "NonWhite"
names(coef(base_model2))
## [1] "(Intercept)" "Precip"      "JulyTemp"    "Educ"        "Density"    
## [6] "NonWhite"
airpol_model2 <- lm(Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite + HC + NOX + SO2, data = air_pol)
par(mfrow = c(2,2))
plot(airpol_model2,c(1:2,4:5))

air_pol$CITY[c(7,8,20)]
## [1] "Miami, FL"       "Los Angeles, CA" "York, PA"

Same as before, but now LA not Lancaster is high leverage.

base_model3 <- lm(Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite, data = air_pol[-c(7,8,20),])
airpol_model3 <- lm(Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite + HC + NOX + SO2, data = air_pol[-c(7,8,20),])

anova(base_model3, airpol_model3)
## Analysis of Variance Table
## 
## Model 1: Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite
## Model 2: Mortality ~ Precip + JulyTemp + Educ + Density + NonWhite + HC + 
##     NOX + SO2
##   Res.Df   RSS Df Sum of Sq      F  Pr(>F)  
## 1     51 55052                              
## 2     48 46184  3    8867.7 3.0721 0.03646 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(airpol_model3)[7:9,]
##          2.5 %     97.5 %
## HC  -1.7753141 0.02233103
## NOX -0.2108511 3.10653057
## SO2 -0.1892345 0.29670925

Now the overall effect of the three air pollution predictors is significant(we do explain significant more variability in y with these three included). But the individual CI’s all are still 0.