birthweight = read.csv("birthweight_final.csv", header=TRUE)
birthweight$Black = as.factor(birthweight$Black)
birthweight$Married = as.factor(birthweight$Married)
birthweight$Boy = as.factor(birthweight$Boy)
birthweight$MomSmoke = as.factor(birthweight$MomSmoke)
birthweight$Ed = as.factor(birthweight$Ed)
full.model <- lm(Weight ~. - Weight_Gr, data = birthweight)
step.model <- ols_step_both_p(full.model, pent = 0.01, prem = 0.01, details = FALSE)
step.model # MomWtGain MomSmoke Black
##
## Stepwise Selection Summary
## -----------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## -----------------------------------------------------------------------------------------
## 1 MomWtGain addition 0.089 0.087 20.7600 6201.2327 559.8163
## 2 MomSmoke addition 0.107 0.102 14.6830 6195.4042 555.0626
## 3 Black addition 0.127 0.120 7.4660 6188.2794 549.4600
## -----------------------------------------------------------------------------------------
forward.model <- ols_step_forward_p(full.model, prem = 0.01, details = FALSE)
forward.model # MomWtGain MomSmoke Black Boy Ed Married
##
## Selection Summary
## -----------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## -----------------------------------------------------------------------------
## 1 MomWtGain 0.0893 0.0870 20.7604 6201.2327 559.8163
## 2 MomSmoke 0.1069 0.1024 14.6832 6195.4042 555.0626
## 3 Black 0.1271 0.1205 7.4659 6188.2794 549.4600
## 4 Boy 0.1364 0.1277 5.1841 6185.9688 547.1986
## 5 Ed 0.1409 0.1300 5.1599 6185.9146 546.4876
## 6 Married 0.1445 0.1314 5.5158 6186.2385 546.0372
## -----------------------------------------------------------------------------
backward.model <- ols_step_backward_p(full.model, pent = 0.01, details = FALSE)
backward.model # MomWtGain MomSmoke Black Boy Ed Married
##
##
## Elimination Summary
## ---------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## ---------------------------------------------------------------------------
## 1 Visit 0.1453 0.130 7.1307 6187.8448 546.4642
## 2 MomAge 0.1445 0.1314 5.5158 6186.2385 546.0372
## ---------------------------------------------------------------------------
adjRsquared.model <- ols_step_best_subset(full.model, pent = 0.01, details = FALSE)
adjRsquared.model # Black Married Boy MomSmoke Ed MomWtGain
model1 <- lm(Weight ~ MomWtGain + MomSmoke + Black, data = birthweight)
# influential points
InflPoints = which(cooks.distance(model1) > 0.115)
InflPoints
## 129
## 129
model1 <- lm(Weight ~ MomWtGain + MomSmoke + Black, data = birthweight[-InflPoints,])
InflPoints = which(cooks.distance(model1) > 0.115)
InflPoints
## named integer(0)
summary(model1)
##
## Call:
## lm(formula = Weight ~ MomWtGain + MomSmoke + Black, data = birthweight[-InflPoints,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2427.02 -309.20 2.98 315.40 1472.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3434.252 32.078 107.059 < 2e-16 ***
## MomWtGain 13.112 2.113 6.204 1.39e-09 ***
## MomSmoke1 -238.923 76.251 -3.133 0.00186 **
## Black1 -198.519 78.022 -2.544 0.01133 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 542.2 on 395 degrees of freedom
## Multiple R-squared: 0.1366, Adjusted R-squared: 0.1301
## F-statistic: 20.84 on 3 and 395 DF, p-value: 1.493e-12
# diagnostics
par(mfrow=c(2,2))
plot(model1, which=c(1:4))
The final model includes 399 observations. Residuals vs Fitted, Cook’s distance and Scale-Location’s plots look good, so we can assume that the final model doesn’t have any influential points and variance is constant. However, based on the QQ plot, data points with lower values of Weight are not normally distributed.
The final model can explain 13.66% of the variation in Weight.
null.model1 <- glm(Weight_Gr ~ 1, data = birthweight)
full.model1 <- glm(Weight_Gr ~ MomWtGain + MomSmoke + Black + Ed +
Visit + Boy + Married + MomAge, data = birthweight)
# step wise AIC
step.AIC <- step(full.model1, scope = list(upper=null.model1),
direction="both",test="Chisq", trace = F)
summary(step.AIC) # MomWtGain MomSmoke Ed Boy MomAge
##
## Call:
## glm(formula = Weight_Gr ~ MomWtGain + MomSmoke + Ed + Boy + MomAge,
## data = birthweight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9024 -0.4275 -0.1402 0.4574 0.9183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.553685 0.042684 12.972 < 2e-16 ***
## MomWtGain -0.008521 0.001813 -4.701 3.58e-06 ***
## MomSmoke1 0.180962 0.067414 2.684 0.00757 **
## Ed1 -0.082952 0.049023 -1.692 0.09142 .
## Boy1 -0.091813 0.047857 -1.919 0.05577 .
## MomAge -0.010081 0.004267 -2.363 0.01863 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2271311)
##
## Null deviance: 99.978 on 399 degrees of freedom
## Residual deviance: 89.490 on 394 degrees of freedom
## AIC: 550.21
##
## Number of Fisher Scoring iterations: 2
# step wise BIC
step.BIC <- step(full.model1, scope = list(upper=null.model1),
direction="both",test="Chisq", trace = F, k=log(nrow(birthweight)))
summary(step.BIC) # MomWtGain MomSmoke MomAge
##
## Call:
## glm(formula = Weight_Gr ~ MomWtGain + MomSmoke + MomAge, data = birthweight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9252 -0.4393 -0.1846 0.4619 0.9235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.468587 0.026060 17.981 < 2e-16 ***
## MomWtGain -0.008355 0.001820 -4.590 5.96e-06 ***
## MomSmoke1 0.192622 0.067614 2.849 0.00462 **
## MomAge -0.011069 0.004252 -2.603 0.00959 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2296022)
##
## Null deviance: 99.978 on 399 degrees of freedom
## Residual deviance: 90.922 on 396 degrees of freedom
## AIC: 552.57
##
## Number of Fisher Scoring iterations: 2
step.model.BIC <- glm(Weight_Gr ~ MomWtGain + MomSmoke + MomAge, data = birthweight)
InflPoints = which(cooks.distance(step.model.BIC) > 0.1)
InflPoints # No influential points found
## named integer(0)
summary(step.model.BIC)
##
## Call:
## glm(formula = Weight_Gr ~ MomWtGain + MomSmoke + MomAge, data = birthweight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9252 -0.4393 -0.1846 0.4619 0.9235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.468587 0.026060 17.981 < 2e-16 ***
## MomWtGain -0.008355 0.001820 -4.590 5.96e-06 ***
## MomSmoke1 0.192622 0.067614 2.849 0.00462 **
## MomAge -0.011069 0.004252 -2.603 0.00959 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2296022)
##
## Null deviance: 99.978 on 399 degrees of freedom
## Residual deviance: 90.922 on 396 degrees of freedom
## AIC: 552.57
##
## Number of Fisher Scoring iterations: 2
This model uses 400 observations.
length(birthweight$Weight_Gr[birthweight$Weight_Gr == 1])/nrow(birthweight)
## [1] 0.4925
Low birthweight infants represent the 49,25% of the observations.
fit.prob <- predict(step.model.BIC, type = "response")
pred.class <- ifelse(fit.prob > 0.4925, 1, 0)
mean(birthweight$Weight_Gr != pred.class)
## [1] 0.355
The misclassification rate is 35.5%.
hoslem.test(step.model.BIC$y, fitted(step.model.BIC), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: step.model.BIC$y, fitted(step.model.BIC)
## X-squared = 11.554, df = 8, p-value = 0.1722
The p-value of the Hosmer and Lemeshow goodness of fit (GOF) test is less greater than 0.05, so we cannot reject the null hypothesis. We can assume that the model is not poor fit.
The conclusions of the two analyses support and complement each other. For instance, they both shows that smoking moms’ infants have lower weight to an extent that enhance the likehood of having low birthweight babies.
Based on the results, I suggest to pregnant women not to smoke in order to lower of the chance of having a low birthweight infant.