Description

On charge le jeu de données `birthwt`, fourni par le package MASS.

library(MASS)

# Preview the first six rows of the birthwt data set.
head(x = birthwt, n = 6L)
##    low age lwt race smoke ptl ht ui ftv  bwt
## 85   0  19 182    2     0   0  0  1   0 2523
## 86   0  33 155    3     0   0  0  0   3 2551
## 87   0  20 105    1     1   0  0  0   1 2557
## 88   0  21 108    1     1   0  0  1   2 2594
## 89   0  18 107    1     1   0  0  1   0 2600
## 91   0  21 124    3     0   0  0  0   0 2622

Question 1. Décrivez la variable “age”

library(psych)

# Descriptive statistics (n, mean, sd, median, range, skew, ...) for the
# mothers' age; the call is namespace-qualified to make clear it is psych's
# describe().
psych::describe(birthwt$age)
##    vars   n  mean  sd median trimmed  mad min max range skew kurtosis   se
## X1    1 189 23.24 5.3     23    22.9 5.93  14  45    31 0.71     0.53 0.39
# Histogram of the mothers' ages. Explicit labels replace the default
# "Histogram of birthwt$age" title and "birthwt$age" axis label, matching the
# French labels used by the other figures in this document.
hist(birthwt$age,
     main = "Distribution de l'âge des mères",
     xlab = "Age de la mère",
     ylab = "Effectif")

Question 2. Décrivez la variable “smoke”

# Count how many mothers fall in each smoking group (0 = did not smoke,
# 1 = smoked during pregnancy).
table(birthwt[["smoke"]])
## 
##   0   1 
## 115  74

Question 3. Sélectionnez les lignes correspondant aux mères ayant plus de 30 ans et ayant fumé pendant leur grossesse

# Keep only the rows for mothers who are older than 30 and who smoked during
# pregnancy; the two conditions are built as named logical masks first.
is_over_30 <- birthwt$age > 30
is_smoker <- birthwt$smoke == 1
birthwt[is_over_30 & is_smoker, ]
##     low age lwt race smoke ptl ht ui ftv  bwt
## 119   0  35 121    2     1   1  0  0   1 2948
## 126   0  31 215    1     1   0  0  0   2 3005
## 127   0  33 109    1     1   0  0  0   1 3033
## 163   0  31 150    3     1   0  0  0   2 3321
## 170   0  32 134    1     1   1  0  0   4 3430
## 11    1  34 187    2     1   0  1  0   0 1135
## 22    1  32 105    1     1   0  0  0   0 1818
## 56    1  31 102    1     1   1  0  0   1 2353

Question 4. Y a-t-il une différence de poids du bébé à la naissance (“bwt”) entre les mères ayant fumé pendant la grossesse et celles n’ayant pas fumé ?

# Compare the mean birth weight of the two smoking groups with a two-sample
# t-test on the difference in means. Because var.equal is left at its default,
# R reports the Welch variant (see the header of the printed result).
smoked <- birthwt$smoke == 1
not_smoked <- birthwt$smoke == 0
f <- birthwt[smoked, ]
nf <- birthwt[not_smoked, ]

# Two-sided alternative: there is no prior hypothesis about the direction of
# the difference. The samples are independent, hence paired = FALSE.
t.test(
  x = f$bwt,
  y = nf$bwt,
  alternative = "two.sided",
  paired = FALSE
)
## 
##  Welch Two Sample t-test
## 
## data:  f$bwt and nf$bwt
## t = -2.7299, df = 170.1, p-value = 0.007003
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -488.97860  -78.57486
## sample estimates:
## mean of x mean of y 
##  2771.919  3055.696

Question 5a. Peut-on prédire le poids du bébé à la naissance à partir de l’âge de la mère ?

# Simple linear regression: birth weight explained by the mother's age.
model <- lm(formula = bwt ~ age, data = birthwt)
# Coefficient table, residual summary and fit statistics.
summary(object = model)
## 
## Call:
## lm(formula = bwt ~ age, data = birthwt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2294.78  -517.63    10.51   530.80  1774.92 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2655.74     238.86   11.12   <2e-16 ***
## age            12.43      10.02    1.24    0.216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 728.2 on 187 degrees of freedom
## Multiple R-squared:  0.008157,   Adjusted R-squared:  0.002853 
## F-statistic: 1.538 on 1 and 187 DF,  p-value: 0.2165
# Scatter plot of birth weight against the mother's age, with fixed axis
# limits, followed by the fitted regression line drawn in red.
plot(
  x = birthwt$age,
  y = birthwt$bwt,
  xlab = "Age de la mère",
  ylab = "Poids du bébé",
  xlim = c(10, 50),
  ylim = c(700, 5000)
)

abline(reg = lm(bwt ~ age, data = birthwt), col = "red")

Question 5b. Effectuez la régression séparément pour les mères ayant fumé pendant la grossesse et celles n’ayant pas fumé

# Fit bwt ~ age separately within each smoking group and print the summary of
# each fit. The anonymous function's argument stays named `x` because that
# name is echoed in the printed Call of every model.
by(
  data = birthwt,
  INDICES = birthwt$smoke,
  FUN = function(x) summary(lm(bwt ~ age, data = x))
)
## birthwt$smoke: 0
## 
## Call:
## lm(formula = bwt ~ age, data = x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2189.27  -449.75    73.58   542.25  1521.39 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2406.06     305.05   7.887 2.14e-12 ***
## age            27.73      12.68   2.186   0.0309 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 740.5 on 113 degrees of freedom
## Multiple R-squared:  0.04058,    Adjusted R-squared:  0.03209 
## F-statistic:  4.78 on 1 and 113 DF,  p-value: 0.03085
## 
## ------------------------------------------------------------ 
## birthwt$smoke: 1
## 
## Call:
## lm(formula = bwt ~ age, data = x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1967.7  -464.1    13.6   498.8  1391.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3204.23     357.96   8.951 2.57e-13 ***
## age           -18.84      15.24  -1.236     0.22    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 657.3 on 72 degrees of freedom
## Multiple R-squared:  0.02078,    Adjusted R-squared:  0.007183 
## F-statistic: 1.528 on 1 and 72 DF,  p-value: 0.2204
# Plot (optional material -- not required knowledge): birth weight against the
# mother's age, with one symbol, colour and regression line per smoking group.
symb <- c(21, 22)  # plotting symbols, one per smoking group
colors <- c("blue", "red")  # colours, one per smoking group

class(birthwt$smoke)
## [1] "integer"
# Convert smoke to a factor so that symb[birthwt$smoke] and
# colors[birthwt$smoke] are indexed by the level codes 1 and 2. With the raw
# 0/1 integers, index 0 would select nothing for the non-smokers.
birthwt$smoke <- as.factor(birthwt$smoke)

plot(birthwt$age, birthwt$bwt,
     pch = symb[birthwt$smoke],
     col = colors[birthwt$smoke],
     bg = colors[birthwt$smoke],
     cex = 1.0,
     xlab = "Age de la mère", ylab = "Poids du bébé",
     xlim = c(10, 50), ylim = c(700, 5000))

# One fitted line per group, drawn in the colour of that group's points.
abline(lm(bwt ~ age, data = birthwt[birthwt$smoke == 0, ]), col = colors[1])
abline(lm(bwt ~ age, data = birthwt[birthwt$smoke == 1, ]), col = colors[2])

# The key reuses symb/colors so it cannot drift from the plot, and sets pt.bg
# so its symbols are filled like the plotted points (pch 21/22 are fillable).
legend("topleft",
       legend = c("Non-fumeuses", "Fumeuses"),
       col = colors,
       pt.bg = colors,
       pch = symb,
       bty = "n",
       pt.cex = 1,
       cex = 0.8,
       text.col = "black",
       horiz = FALSE,
       inset = c(0.01, 0.01))