Groupe F
8/01/2019
library(readxl)
mortality <- read_excel("mortality.xlsx")
head(mortality)
# A tibble: 6 x 17
id precipitation temperature01 temperature07 pover65 household
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 36 27 71 8.1 3.34
2 2 35 23 72 11.1 3.14
3 3 44 29 74 10.4 3.21
4 4 47 45 79 6.5 3.41
5 5 43 35 77 7.6 3.44
6 6 53 45 80 7.7 3.45
# ... with 11 more variables: schooling <dbl>, householdKitchen <dbl>,
# populationPerMile <dbl>, notWhitePop <dbl>, officeWorkers <dbl>,
# salaireLess3000 <dbl>, CO <dbl>, NO <dbl>, SO2 <dbl>, humidity <dbl>,
# mortality <dbl>
summary(mortality)
id precipitation temperature01 temperature07
Min. : 1.00 Min. :10.00 Min. :12.00 Min. :63.00
1st Qu.:15.75 1st Qu.:32.75 1st Qu.:27.00 1st Qu.:72.00
Median :30.50 Median :38.00 Median :31.50 Median :74.00
Mean :30.50 Mean :37.37 Mean :34.82 Mean :74.60
3rd Qu.:45.25 3rd Qu.:43.25 3rd Qu.:40.00 3rd Qu.:77.25
Max. :60.00 Max. :60.00 Max. :83.00 Max. :85.00
pover65 household schooling householdKitchen
Min. : 5.600 Min. :2.920 Min. : 9.00 Min. :66.80
1st Qu.: 7.675 1st Qu.:3.210 1st Qu.:10.40 1st Qu.:78.38
Median : 9.000 Median :3.265 Median :11.05 Median :81.15
Mean : 8.798 Mean :3.263 Mean :10.97 Mean :80.91
3rd Qu.: 9.700 3rd Qu.:3.360 3rd Qu.:11.50 3rd Qu.:83.60
Max. :11.800 Max. :3.530 Max. :12.30 Max. :90.70
populationPerMile notWhitePop officeWorkers salaireLess3000
Min. :1441 Min. : 0.80 Min. :33.80 Min. : 9.40
1st Qu.:3104 1st Qu.: 4.95 1st Qu.:43.25 1st Qu.:12.00
Median :3567 Median :10.40 Median :45.50 Median :13.20
Mean :3876 Mean :11.87 Mean :46.07 Mean :14.37
3rd Qu.:4520 3rd Qu.:15.65 3rd Qu.:49.52 3rd Qu.:15.15
Max. :9699 Max. :38.50 Max. :59.70 Max. :26.40
CO NO SO2 humidity
Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. :38.00
1st Qu.: 7.00 1st Qu.: 4.00 1st Qu.: 11.00 1st Qu.:54.75
Median : 14.50 Median : 9.00 Median : 30.00 Median :57.00
Mean : 37.85 Mean : 22.52 Mean : 53.77 Mean :57.53
3rd Qu.: 30.25 3rd Qu.: 23.75 3rd Qu.: 69.00 3rd Qu.:60.00
Max. :648.00 Max. :319.00 Max. :278.00 Max. :73.00
mortality
Min. : 790.7
1st Qu.: 898.4
Median : 943.7
Mean : 940.4
3rd Qu.: 983.2
Max. :1113.2
model <- lm ( mortality ~ . , data= mortality[,-1])
summary(model)
Call:
lm(formula = mortality ~ ., data = mortality[, -1])
Residuals:
Min 1Q Median 3Q Max
-75.285 -14.640 0.694 14.790 75.586
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.863e+03 4.108e+02 4.535 4.4e-05 ***
precipitation 2.072e+00 8.418e-01 2.462 0.01781 *
temperature01 -2.178e+00 6.752e-01 -3.225 0.00238 **
temperature07 -2.834e+00 1.771e+00 -1.600 0.11670
pover65 -1.404e+01 7.746e+00 -1.813 0.07670 .
household -1.154e+02 6.200e+01 -1.862 0.06933 .
schooling -2.425e+01 1.121e+01 -2.163 0.03605 *
householdKitchen -1.146e+00 1.467e+00 -0.781 0.43871
populationPerMile 1.004e-02 4.123e-03 2.435 0.01899 *
notWhitePop 3.533e+00 1.282e+00 2.755 0.00850 **
officeWorkers 5.229e-01 1.551e+00 0.337 0.73760
salaireLess3000 2.671e-01 2.565e+00 0.104 0.91755
CO -8.890e-01 4.524e-01 -1.965 0.05574 .
NO 1.866e+00 9.345e-01 1.997 0.05201 .
SO2 -3.447e-02 1.423e-01 -0.242 0.80968
humidity 5.331e-01 1.052e+00 0.507 0.61474
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 32.33 on 44 degrees of freedom
Multiple R-squared: 0.7985, Adjusted R-squared: 0.7298
F-statistic: 11.63 on 15 and 44 DF, p-value: 9.56e-11
#Diagnostique Plots
library(ggplot2)
p1<-ggplot(model, aes(.fitted, .resid))+geom_point()
p1<-p1+stat_smooth(method="loess")+geom_hline(yintercept=0, col="red", linetype="dashed")
p1<-p1+xlab("Fitted values")+ylab("Residuals")
p1<-p1+ggtitle("Residual vs Fitted Plot")+theme_bw()
x11()
p2<-ggplot(model, aes(qqnorm(.stdresid)[[1]], .stdresid))+geom_point(na.rm = TRUE)
#p2<-p2+geom_abline(aes(qqline(.stdresid)))+xlab("Theoretical Quantiles")+ylab("Standardized Residuals")
p2<-p2+ggtitle("Normal Q-Q")+theme_bw()
p2
p3<-ggplot(model, aes(.fitted, sqrt(abs(.stdresid))))+geom_point(na.rm=TRUE)
p3<-p3+stat_smooth(method="loess", na.rm = TRUE)+xlab("Fitted Value")
p3<-p3+ylab(expression(sqrt("|Standardized residuals|")))
p3<-p3+ggtitle("Scale-Location")+theme_bw()
p4<-ggplot(model, aes(seq_along(.cooksd), .cooksd))+geom_bar(stat="identity", position="identity")
p4<-p4+xlab("Obs. Number")+ylab("Cook's distance")
p4<-p4+ggtitle("Cook's distance")+theme_bw()
p5<-ggplot(model, aes(.hat, .stdresid))+geom_point(aes(size=.cooksd), na.rm=TRUE)
p5<-p5+stat_smooth(method="loess", na.rm=TRUE)
p5<-p5+xlab("Leverage")+ylab("Standardized Residuals")
p5<-p5+ggtitle("Residual vs Leverage Plot")
p5<-p5+scale_size_continuous("Cook's Distance", range=c(1,5))
p5<-p5+theme_bw()+theme(legend.position="bottom")
p6<-ggplot(model, aes(.hat, .cooksd))+geom_point(na.rm=TRUE)+stat_smooth(method="loess", na.rm=TRUE)
p6<-p6+xlab("Leverage hii")+ylab("Cook's Distance")
p6<-p6+ggtitle("Cook's dist vs Leverage hii/(1-hii)")
p6<-p6+geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
p6<-p6+theme_bw()
x11()
ggpubr::ggarrange(p1, p2 , p3 ,p6 , p4, p5 , labels = c("A", "B", "C" , "D" , "E"), ncol = 3, nrow = 3)