# Low Birth Weight
# URL for data http://www.umass.edu/statdata/statdata/stat-rmult.html
# Description of data file
# ID     Identification Code                                     
# LOW    Low Birth Weight (0 = Birth Weight >= 2500g,1 = Birth Weight < 2500g)
# AGE    Age of the Mother in Years                              
# LWT    Weight in Pounds at the Last Menstrual Period           
# RACE   Race (1 = White, 2 = Black, 3 = Other)                  
# SMOKE  Smoking Status During Pregnancy (1 = Yes, 0 = No)       
# PTL    History of Premature Labor (0 = None  1 = One, etc.)    
# HT     History of Hypertension (1 = Yes, 0 = No)               
# UI     Presence of Uterine Irritability (1 = Yes, 0 = No)      
# FTV    Number of Physician Visits During the First Trimester (0 = None, 1 = One, 2 = Two, etc.)
# BWT    Birth Weight in Grams                                   



rm(list=ls())

lbwdata <- read.csv("lowbwt.csv")
str(lbwdata)
## 'data.frame':    189 obs. of  11 variables:
##  $ ID   : int  85 86 87 88 89 91 92 93 94 95 ...
##  $ LOW  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AGE  : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ LWT  : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ RACE : int  2 3 1 1 1 3 1 3 1 1 ...
##  $ SMOKE: int  0 0 1 1 1 0 0 0 1 1 ...
##  $ PTL  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HT   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ UI   : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ FTV  : int  0 3 1 2 0 0 1 1 1 0 ...
##  $ BWT  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
summary(lbwdata)
##        ID           LOW             AGE            LWT           RACE     
##  Min.   :  4   Min.   :0.000   Min.   :14.0   Min.   : 80   Min.   :1.00  
##  1st Qu.: 68   1st Qu.:0.000   1st Qu.:19.0   1st Qu.:110   1st Qu.:1.00  
##  Median :123   Median :0.000   Median :23.0   Median :121   Median :1.00  
##  Mean   :121   Mean   :0.312   Mean   :23.2   Mean   :130   Mean   :1.85  
##  3rd Qu.:176   3rd Qu.:1.000   3rd Qu.:26.0   3rd Qu.:140   3rd Qu.:3.00  
##  Max.   :226   Max.   :1.000   Max.   :45.0   Max.   :250   Max.   :3.00  
##      SMOKE            PTL              HT               UI       
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.000   Median :0.000   Median :0.0000   Median :0.000  
##  Mean   :0.392   Mean   :0.196   Mean   :0.0635   Mean   :0.148  
##  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :3.000   Max.   :1.0000   Max.   :1.000  
##       FTV             BWT      
##  Min.   :0.000   Min.   : 709  
##  1st Qu.:0.000   1st Qu.:2414  
##  Median :0.000   Median :2977  
##  Mean   :0.794   Mean   :2945  
##  3rd Qu.:1.000   3rd Qu.:3475  
##  Max.   :6.000   Max.   :4990
# Проверка на нормальность распределения зависимой переменной

qqnorm(lbwdata$BWT); qqline(lbwdata$BWT)

plot of chunk unnamed-chunk-1

shapiro.test(lbwdata$BWT)
## 
##  Shapiro-Wilk normality test
## 
## data:  lbwdata$BWT
## W = 0.9925, p-value = 0.4383
lbwdata <- transform(lbwdata, LWTK = LWT*.454)

fit <- lm(BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) + PTL + 
            factor(HT) + factor(UI) + FTV, data = lbwdata)
summary(fit)
## 
## Call:
## lm(formula = BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) + 
##     PTL + factor(HT) + factor(UI) + FTV, data = lbwdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1826.2  -434.9    57.6   472.7  1702.7 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2930.10     312.84    9.37  < 2e-16 ***
## LWTK               9.59       3.82    2.51  0.01299 *  
## AGE               -3.65       9.62   -0.38  0.70479    
## factor(RACE)2   -489.44     149.95   -3.26  0.00132 ** 
## factor(RACE)3   -357.05     114.73   -3.11  0.00216 ** 
## factor(SMOKE)1  -350.62     106.45   -3.29  0.00119 ** 
## PTL              -48.84     101.95   -0.48  0.63249    
## factor(HT)1     -592.81     202.28   -2.93  0.00382 ** 
## factor(UI)1     -514.93     138.86   -3.71  0.00028 ***
## FTV              -14.07      46.46   -0.30  0.76232    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 650 on 179 degrees of freedom
## Multiple R-squared:  0.243,  Adjusted R-squared:  0.205 
## F-statistic: 6.37 on 9 and 179 DF,  p-value: 7.96e-08
par(mfrow=c(1,2))
plot(fit)

plot of chunk unnamed-chunk-1plot of chunk unnamed-chunk-1

par(mfrow=c(1,1))

# outliers

library(car)
outlierTest(fit)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferonni p
## 132   -2.968           0.003407       0.6439
# influencePlot(model - a linear or generalized-linear model)
# Varaible above +2 or below –2 on the vertical axis are considered outliers 
# Variable above 0.2 or 0.3 on the horizontal axis have high leverage 
#       (unusual combinations of predictor values)
# Circle size is proportional to influence. 
# Observations depicted by large circles may have disproportionate
# influence on the parameters estimates of the model.
influencePlot(fit, id.method = "identify")

plot of chunk unnamed-chunk-1

# У R.I. Kabaсoff "R in Action" на стр. 202 можно найти следующее:
# Roughly speaking, Cook’s D values greater than 
# 4/(n-k-1), where n is the sample size and k is the number of predictor variables,
# indicate influential observations.**
  
cook.cutoff <- 4/(nrow(lbwdata) - length(fit$coefficients) - 1)
cook.cutoff
## [1] 0.02247
cook<-cooks.distance(fit)
cook[cook > cook.cutoff]
##      93      94     102     106     114     130     131     132     133 
## 0.02368 0.13570 0.03514 0.04674 0.02653 0.09607 0.03790 0.05922 0.06018 
##     134     138 
## 0.03614 0.02413
# plot(cook, type="h", ylab="Cook's distance")
plot(fit, which = 4, cook.levels = cook.cutoff)
abline(h=cook.cutoff, col = 2, lty = 2)
identify(cook)

plot of chunk unnamed-chunk-1

## integer(0)
lbwdata.od<-lbwdata[-c(94, 102, 106, 130:134, 138),]

fit1 <- lm(BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) +
            PTL + factor(HT) + factor(UI) + FTV, data = lbwdata.od)
summary(fit1)
## 
## Call:
## lm(formula = BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) + 
##     PTL + factor(HT) + factor(UI) + FTV, data = lbwdata.od)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1546.3  -430.0    39.8   479.0  1283.6 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2926.82     300.69    9.73  < 2e-16 ***
## LWTK               8.38       3.61    2.32  0.02140 *  
## AGE               -2.48       9.56   -0.26  0.79584    
## factor(RACE)2   -402.72     144.13   -2.79  0.00580 ** 
## factor(RACE)3   -296.08     106.98   -2.77  0.00627 ** 
## factor(SMOKE)1  -338.51      99.97   -3.39  0.00088 ***
## PTL              -90.27     103.54   -0.87  0.38455    
## factor(HT)1     -578.49     218.15   -2.65  0.00876 ** 
## factor(UI)1     -436.69     131.79   -3.31  0.00113 ** 
## FTV                0.91      43.03    0.02  0.98315    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 593 on 170 degrees of freedom
## Multiple R-squared:  0.23,   Adjusted R-squared:  0.19 
## F-statistic: 5.65 on 9 and 170 DF,  p-value: 7.81e-07
fit2 <- lm(BWT ~ LWTK + factor(RACE) + factor(SMOKE) + 
             factor(HT) + factor(UI), data = lbwdata.od)
summary(fit2)
## 
## Call:
## lm(formula = BWT ~ LWTK + factor(RACE) + factor(SMOKE) + factor(HT) + 
##     factor(UI), data = lbwdata.od)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -1546   -422     49    418   1290 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2862.15     230.92   12.39  < 2e-16 ***
## LWTK               8.36       3.48    2.40  0.01745 *  
## factor(RACE)2   -392.41     138.87   -2.83  0.00527 ** 
## factor(RACE)3   -297.24     104.89   -2.83  0.00515 ** 
## factor(SMOKE)1  -352.14      97.29   -3.62  0.00039 ***
## factor(HT)1     -572.40     215.70   -2.65  0.00870 ** 
## factor(UI)1     -451.54     128.85   -3.50  0.00058 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 589 on 173 degrees of freedom
## Multiple R-squared:  0.226,  Adjusted R-squared:  0.199 
## F-statistic: 8.43 on 6 and 173 DF,  p-value: 4.96e-08
anova(fit1, fit2)
## Analysis of Variance Table
## 
## Model 1: BWT ~ LWTK + AGE + factor(RACE) + factor(SMOKE) + PTL + factor(HT) + 
##     factor(UI) + FTV
## Model 2: BWT ~ LWTK + factor(RACE) + factor(SMOKE) + factor(HT) + factor(UI)
##   Res.Df      RSS Df Sum of Sq   F Pr(>F)
## 1    170 59762548                        
## 2    173 60078616 -3   -316067 0.3   0.83
AIC(fit1,fit2)
##      df  AIC
## fit1 11 2821
## fit2  8 2816
# More enhanced methods

library(car)

# Normality
qqPlot(fit2, simulate=T, id.method="identify", main="Q-Q Plot")

plot of chunk unnamed-chunk-1

# Independence of Errors
durbinWatsonTest(fit2)
##  lag Autocorrelation D-W Statistic p-value
##    1          0.7351        0.5292       0
##  Alternative hypothesis: rho != 0
# Linearity
crPlots(fit2)

plot of chunk unnamed-chunk-1

# Homoscedasticity
ncvTest(fit2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.8803    Df = 1     p = 0.3481
spreadLevelPlot(fit2)

plot of chunk unnamed-chunk-1

## 
## Suggested power transformation:  0.3122
# Multicollinearity 
# if sqrt(vif()) > 2 there are problems with multicollinearity
vif(fit2)
##                GVIF Df GVIF^(1/(2*Df))
## LWTK          1.117  1           1.057
## factor(RACE)  1.242  2           1.056
## factor(SMOKE) 1.166  1           1.080
## factor(HT)    1.024  1           1.012
## factor(UI)    1.029  1           1.014
sqrt(vif(fit2)) > 2
##                GVIF    Df GVIF^(1/(2*Df))
## LWTK          FALSE FALSE           FALSE
## factor(RACE)  FALSE FALSE           FALSE
## factor(SMOKE) FALSE FALSE           FALSE
## factor(HT)    FALSE FALSE           FALSE
## factor(UI)    FALSE FALSE           FALSE