# Low Birth Weight
# URL for data http://www.umass.edu/statdata/statdata/stat-rmult.html
# Description of data file
# ID Identification Code
# LOW Low Birth Weight (0 = Birth Weight >= 2500g,1 = Birth Weight < 2500g)
# AGE Age of the Mother in Years
# LWT Weight in Pounds at the Last Menstrual Period
# RACE Race (1 = White, 2 = Black, 3 = Other)
# SMOKE Smoking Status During Pregnancy (1 = Yes, 0 = No)
# PTL History of Premature Labor (0 = None 1 = One, etc.)
# HT History of Hypertension (1 = Yes, 0 = No)
# UI Presence of Uterine Irritability (1 = Yes, 0 = No)
# FTV Number of Physician Visits During the First Trimester (0 = None, 1 = One, 2 = Two, etc.)
# BWT Birth Weight in Grams
rm(list=ls())
lbwdata <- read.csv("lowbwt.csv")
str(lbwdata)
## 'data.frame': 189 obs. of 11 variables:
## $ ID : int 85 86 87 88 89 91 92 93 94 95 ...
## $ LOW : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AGE : int 19 33 20 21 18 21 22 17 29 26 ...
## $ LWT : int 182 155 105 108 107 124 118 103 123 113 ...
## $ RACE : int 2 3 1 1 1 3 1 3 1 1 ...
## $ SMOKE: int 0 0 1 1 1 0 0 0 1 1 ...
## $ PTL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ UI : int 1 0 0 1 1 0 0 0 0 0 ...
## $ FTV : int 0 3 1 2 0 0 1 1 1 0 ...
## $ BWT : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
summary(lbwdata)
## ID LOW AGE LWT RACE
## Min. : 4 Min. :0.000 Min. :14.0 Min. : 80 Min. :1.00
## 1st Qu.: 68 1st Qu.:0.000 1st Qu.:19.0 1st Qu.:110 1st Qu.:1.00
## Median :123 Median :0.000 Median :23.0 Median :121 Median :1.00
## Mean :121 Mean :0.312 Mean :23.2 Mean :130 Mean :1.85
## 3rd Qu.:176 3rd Qu.:1.000 3rd Qu.:26.0 3rd Qu.:140 3rd Qu.:3.00
## Max. :226 Max. :1.000 Max. :45.0 Max. :250 Max. :3.00
## SMOKE PTL HT UI
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.0000 Median :0.000
## Mean :0.392 Mean :0.196 Mean :0.0635 Mean :0.148
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.000
## Max. :1.000 Max. :3.000 Max. :1.0000 Max. :1.000
## FTV BWT
## Min. :0.000 Min. : 709
## 1st Qu.:0.000 1st Qu.:2414
## Median :0.000 Median :2977
## Mean :0.794 Mean :2945
## 3rd Qu.:1.000 3rd Qu.:3475
## Max. :6.000 Max. :4990
# Проверка на нормальность распределения зависимой переменной
qqnorm(lbwdata$BWT); qqline(lbwdata$BWT)

shapiro.test(lbwdata$BWT)
##
## Shapiro-Wilk normality test
##
## data: lbwdata$BWT
## W = 0.9925, p-value = 0.4383
lbwdata <- transform(lbwdata, LWTK = LWT*.454)
fit <- lm(BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) + PTL +
factor(HT) + factor(UI) + FTV, data = lbwdata)
summary(fit)
##
## Call:
## lm(formula = BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) +
## PTL + factor(HT) + factor(UI) + FTV, data = lbwdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1826.2 -434.9 57.6 472.7 1702.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2930.10 312.84 9.37 < 2e-16 ***
## LWTK 9.59 3.82 2.51 0.01299 *
## AGE -3.65 9.62 -0.38 0.70479
## factor(RACE)2 -489.44 149.95 -3.26 0.00132 **
## factor(RACE)3 -357.05 114.73 -3.11 0.00216 **
## factor(SMOKE)1 -350.62 106.45 -3.29 0.00119 **
## PTL -48.84 101.95 -0.48 0.63249
## factor(HT)1 -592.81 202.28 -2.93 0.00382 **
## factor(UI)1 -514.93 138.86 -3.71 0.00028 ***
## FTV -14.07 46.46 -0.30 0.76232
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 650 on 179 degrees of freedom
## Multiple R-squared: 0.243, Adjusted R-squared: 0.205
## F-statistic: 6.37 on 9 and 179 DF, p-value: 7.96e-08
par(mfrow=c(1,2))
plot(fit)


par(mfrow=c(1,1))
# outliers
library(car)
outlierTest(fit)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 132 -2.968 0.003407 0.6439
# influencePlot(model - a linear or generalized-linear model)
# Varaible above +2 or below –2 on the vertical axis are considered outliers
# Variable above 0.2 or 0.3 on the horizontal axis have high leverage
# (unusual combinations of predictor values)
# Circle size is proportional to influence.
# Observations depicted by large circles may have disproportionate
# influence on the parameters estimates of the model.
influencePlot(fit, id.method = "identify")

# У R.I. Kabaсoff "R in Action" на стр. 202 можно найти следующее:
# Roughly speaking, Cook’s D values greater than
# 4/(n-k-1), where n is the sample size and k is the number of predictor variables,
# indicate influential observations.**
cook.cutoff <- 4/(nrow(lbwdata) - length(fit$coefficients) - 1)
cook.cutoff
## [1] 0.02247
cook<-cooks.distance(fit)
cook[cook > cook.cutoff]
## 93 94 102 106 114 130 131 132 133
## 0.02368 0.13570 0.03514 0.04674 0.02653 0.09607 0.03790 0.05922 0.06018
## 134 138
## 0.03614 0.02413
# plot(cook, type="h", ylab="Cook's distance")
plot(fit, which = 4, cook.levels = cook.cutoff)
abline(h=cook.cutoff, col = 2, lty = 2)
identify(cook)

## integer(0)
lbwdata.od<-lbwdata[-c(94, 102, 106, 130:134, 138),]
fit1 <- lm(BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) +
PTL + factor(HT) + factor(UI) + FTV, data = lbwdata.od)
summary(fit1)
##
## Call:
## lm(formula = BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) +
## PTL + factor(HT) + factor(UI) + FTV, data = lbwdata.od)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1546.3 -430.0 39.8 479.0 1283.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2926.82 300.69 9.73 < 2e-16 ***
## LWTK 8.38 3.61 2.32 0.02140 *
## AGE -2.48 9.56 -0.26 0.79584
## factor(RACE)2 -402.72 144.13 -2.79 0.00580 **
## factor(RACE)3 -296.08 106.98 -2.77 0.00627 **
## factor(SMOKE)1 -338.51 99.97 -3.39 0.00088 ***
## PTL -90.27 103.54 -0.87 0.38455
## factor(HT)1 -578.49 218.15 -2.65 0.00876 **
## factor(UI)1 -436.69 131.79 -3.31 0.00113 **
## FTV 0.91 43.03 0.02 0.98315
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 593 on 170 degrees of freedom
## Multiple R-squared: 0.23, Adjusted R-squared: 0.19
## F-statistic: 5.65 on 9 and 170 DF, p-value: 7.81e-07
fit2 <- lm(BWT ~ LWTK + factor(RACE) + factor(SMOKE) +
factor(HT) + factor(UI), data = lbwdata.od)
summary(fit2)
##
## Call:
## lm(formula = BWT ~ LWTK + factor(RACE) + factor(SMOKE) + factor(HT) +
## factor(UI), data = lbwdata.od)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1546 -422 49 418 1290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2862.15 230.92 12.39 < 2e-16 ***
## LWTK 8.36 3.48 2.40 0.01745 *
## factor(RACE)2 -392.41 138.87 -2.83 0.00527 **
## factor(RACE)3 -297.24 104.89 -2.83 0.00515 **
## factor(SMOKE)1 -352.14 97.29 -3.62 0.00039 ***
## factor(HT)1 -572.40 215.70 -2.65 0.00870 **
## factor(UI)1 -451.54 128.85 -3.50 0.00058 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 589 on 173 degrees of freedom
## Multiple R-squared: 0.226, Adjusted R-squared: 0.199
## F-statistic: 8.43 on 6 and 173 DF, p-value: 4.96e-08
anova(fit1, fit2)
## Analysis of Variance Table
##
## Model 1: BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) + PTL + factor(HT) +
## factor(UI) + FTV
## Model 2: BWT ~ LWTK + factor(RACE) + factor(SMOKE) + factor(HT) + factor(UI)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 170 59762548
## 2 173 60078616 -3 -316067 0.3 0.83
AIC(fit1,fit2)
## df AIC
## fit1 11 2821
## fit2 8 2816
# More enhanced methods
library(car)
# Normality
qqPlot(fit2, simulate=T, id.method="identify", main="Q-Q Plot")

# Independence of Errors
durbinWatsonTest(fit2)
## lag Autocorrelation D-W Statistic p-value
## 1 0.7351 0.5292 0
## Alternative hypothesis: rho != 0
# Linearity
crPlots(fit2)

# Homoscedasticity
ncvTest(fit2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.8803 Df = 1 p = 0.3481
spreadLevelPlot(fit2)

##
## Suggested power transformation: 0.3122
# Multicollinearity
# if sqrt(vif()) > 2 there are problems with multicollinearity
vif(fit2)
## GVIF Df GVIF^(1/(2*Df))
## LWTK 1.117 1 1.057
## factor(RACE) 1.242 2 1.056
## factor(SMOKE) 1.166 1 1.080
## factor(HT) 1.024 1 1.012
## factor(UI) 1.029 1 1.014
sqrt(vif(fit2)) > 2
## GVIF Df GVIF^(1/(2*Df))
## LWTK FALSE FALSE FALSE
## factor(RACE) FALSE FALSE FALSE
## factor(SMOKE) FALSE FALSE FALSE
## factor(HT) FALSE FALSE FALSE
## factor(UI) FALSE FALSE FALSE