Etude des facteurs aggisant sur taux de mortalite aux etats unis en 1960

Groupe F
8/01/2019

Dictionnaire de Variables (1/2):

  • mortality : taux de mortalité.
  • Precipitation : Moyenne anuelle des precipitations.
  • Temperature01 : la moyenne des temperatures au mois de Janvier en Fahrenheit.
  • Temperature01 : la moyenne des temperatures au mois de Juillet Fahrenheit .
  • Pover65 : le pourcentage de la population de plus de 65 ans.
  • Household : taille de menage.
  • schooling : scolarrisation de plus de 22 ans.
  • HouseHoldKitchen : Menages avec des cuisines completes.

Dictionnaire de Variables (2/2):

  • PopulationPerMile : population par mile dans les zones urbanisees.
  • notWhitePop : pourcentage de la population non blanche.
  • officeWorker : pourcentage d'employes de bureau.
  • SalaireLess300 : revenu annuel inferieur à 3k$
  • Co: potentiel de polution des hydrocarbures.
  • No : potentiel de polution des oxydes d'azotes.
  • SO2 : pollution relative de dioxyde de soufre.
  • humidity: pencentage relative a l'humidite moyenne a 1 h du matin.

Dictionnaire de Variables (2/2):

library(readxl)
mortality <- read_excel("mortality.xlsx")
head(mortality)
# A tibble: 6 x 17
     id precipitation temperature01 temperature07 pover65 household
  <dbl>         <dbl>         <dbl>         <dbl>   <dbl>     <dbl>
1     1            36            27            71     8.1      3.34
2     2            35            23            72    11.1      3.14
3     3            44            29            74    10.4      3.21
4     4            47            45            79     6.5      3.41
5     5            43            35            77     7.6      3.44
6     6            53            45            80     7.7      3.45
# ... with 11 more variables: schooling <dbl>, householdKitchen <dbl>,
#   populationPerMile <dbl>, notWhitePop <dbl>, officeWorkers <dbl>,
#   salaireLess3000 <dbl>, CO <dbl>, NO <dbl>, SO2 <dbl>, humidity <dbl>,
#   mortality <dbl>
summary(mortality)
       id        precipitation   temperature01   temperature07  
 Min.   : 1.00   Min.   :10.00   Min.   :12.00   Min.   :63.00  
 1st Qu.:15.75   1st Qu.:32.75   1st Qu.:27.00   1st Qu.:72.00  
 Median :30.50   Median :38.00   Median :31.50   Median :74.00  
 Mean   :30.50   Mean   :37.37   Mean   :34.82   Mean   :74.60  
 3rd Qu.:45.25   3rd Qu.:43.25   3rd Qu.:40.00   3rd Qu.:77.25  
 Max.   :60.00   Max.   :60.00   Max.   :83.00   Max.   :85.00  
    pover65         household       schooling     householdKitchen
 Min.   : 5.600   Min.   :2.920   Min.   : 9.00   Min.   :66.80   
 1st Qu.: 7.675   1st Qu.:3.210   1st Qu.:10.40   1st Qu.:78.38   
 Median : 9.000   Median :3.265   Median :11.05   Median :81.15   
 Mean   : 8.798   Mean   :3.263   Mean   :10.97   Mean   :80.91   
 3rd Qu.: 9.700   3rd Qu.:3.360   3rd Qu.:11.50   3rd Qu.:83.60   
 Max.   :11.800   Max.   :3.530   Max.   :12.30   Max.   :90.70   
 populationPerMile  notWhitePop    officeWorkers   salaireLess3000
 Min.   :1441      Min.   : 0.80   Min.   :33.80   Min.   : 9.40  
 1st Qu.:3104      1st Qu.: 4.95   1st Qu.:43.25   1st Qu.:12.00  
 Median :3567      Median :10.40   Median :45.50   Median :13.20  
 Mean   :3876      Mean   :11.87   Mean   :46.07   Mean   :14.37  
 3rd Qu.:4520      3rd Qu.:15.65   3rd Qu.:49.52   3rd Qu.:15.15  
 Max.   :9699      Max.   :38.50   Max.   :59.70   Max.   :26.40  
       CO               NO              SO2            humidity    
 Min.   :  1.00   Min.   :  1.00   Min.   :  1.00   Min.   :38.00  
 1st Qu.:  7.00   1st Qu.:  4.00   1st Qu.: 11.00   1st Qu.:54.75  
 Median : 14.50   Median :  9.00   Median : 30.00   Median :57.00  
 Mean   : 37.85   Mean   : 22.52   Mean   : 53.77   Mean   :57.53  
 3rd Qu.: 30.25   3rd Qu.: 23.75   3rd Qu.: 69.00   3rd Qu.:60.00  
 Max.   :648.00   Max.   :319.00   Max.   :278.00   Max.   :73.00  
   mortality     
 Min.   : 790.7  
 1st Qu.: 898.4  
 Median : 943.7  
 Mean   : 940.4  
 3rd Qu.: 983.2  
 Max.   :1113.2  
model <- lm ( mortality ~ .  , data=  mortality[,-1])
summary(model)

Call:
lm(formula = mortality ~ ., data = mortality[, -1])

Residuals:
    Min      1Q  Median      3Q     Max 
-75.285 -14.640   0.694  14.790  75.586 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        1.863e+03  4.108e+02   4.535  4.4e-05 ***
precipitation      2.072e+00  8.418e-01   2.462  0.01781 *  
temperature01     -2.178e+00  6.752e-01  -3.225  0.00238 ** 
temperature07     -2.834e+00  1.771e+00  -1.600  0.11670    
pover65           -1.404e+01  7.746e+00  -1.813  0.07670 .  
household         -1.154e+02  6.200e+01  -1.862  0.06933 .  
schooling         -2.425e+01  1.121e+01  -2.163  0.03605 *  
householdKitchen  -1.146e+00  1.467e+00  -0.781  0.43871    
populationPerMile  1.004e-02  4.123e-03   2.435  0.01899 *  
notWhitePop        3.533e+00  1.282e+00   2.755  0.00850 ** 
officeWorkers      5.229e-01  1.551e+00   0.337  0.73760    
salaireLess3000    2.671e-01  2.565e+00   0.104  0.91755    
CO                -8.890e-01  4.524e-01  -1.965  0.05574 .  
NO                 1.866e+00  9.345e-01   1.997  0.05201 .  
SO2               -3.447e-02  1.423e-01  -0.242  0.80968    
humidity           5.331e-01  1.052e+00   0.507  0.61474    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 32.33 on 44 degrees of freedom
Multiple R-squared:  0.7985,    Adjusted R-squared:  0.7298 
F-statistic: 11.63 on 15 and 44 DF,  p-value: 9.56e-11
#Diagnostique Plots 
library(ggplot2)

    p1<-ggplot(model, aes(.fitted, .resid))+geom_point()
    p1<-p1+stat_smooth(method="loess")+geom_hline(yintercept=0, col="red", linetype="dashed")
    p1<-p1+xlab("Fitted values")+ylab("Residuals")
    p1<-p1+ggtitle("Residual vs Fitted Plot")+theme_bw()

    x11()
    p2<-ggplot(model, aes(qqnorm(.stdresid)[[1]], .stdresid))+geom_point(na.rm = TRUE)
    #p2<-p2+geom_abline(aes(qqline(.stdresid)))+xlab("Theoretical Quantiles")+ylab("Standardized Residuals")
    p2<-p2+ggtitle("Normal Q-Q")+theme_bw()
    p2

plot of chunk unnamed-chunk-1plot of chunk unnamed-chunk-1

    p3<-ggplot(model, aes(.fitted, sqrt(abs(.stdresid))))+geom_point(na.rm=TRUE)
    p3<-p3+stat_smooth(method="loess", na.rm = TRUE)+xlab("Fitted Value")
    p3<-p3+ylab(expression(sqrt("|Standardized residuals|")))
    p3<-p3+ggtitle("Scale-Location")+theme_bw()

    p4<-ggplot(model, aes(seq_along(.cooksd), .cooksd))+geom_bar(stat="identity", position="identity")
    p4<-p4+xlab("Obs. Number")+ylab("Cook's distance")
    p4<-p4+ggtitle("Cook's distance")+theme_bw()

    p5<-ggplot(model, aes(.hat, .stdresid))+geom_point(aes(size=.cooksd), na.rm=TRUE)
    p5<-p5+stat_smooth(method="loess", na.rm=TRUE)
    p5<-p5+xlab("Leverage")+ylab("Standardized Residuals")
    p5<-p5+ggtitle("Residual vs Leverage Plot")
    p5<-p5+scale_size_continuous("Cook's Distance", range=c(1,5))
    p5<-p5+theme_bw()+theme(legend.position="bottom")

    p6<-ggplot(model, aes(.hat, .cooksd))+geom_point(na.rm=TRUE)+stat_smooth(method="loess", na.rm=TRUE)
    p6<-p6+xlab("Leverage hii")+ylab("Cook's Distance")
    p6<-p6+ggtitle("Cook's dist vs Leverage hii/(1-hii)")
    p6<-p6+geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
    p6<-p6+theme_bw()


    x11()
   ggpubr::ggarrange(p1, p2  , p3 ,p6  , p4, p5  , labels = c("A", "B", "C" , "D" , "E"), ncol = 3, nrow = 3)