Analysis of birthwt dataset

Abstract
Overview of the Dataset
Develop models
AIC and BIC results
Calculate test and train RMSE for each model.

Abstract

My gold was to:

Simulate table 2.1 & 3.2 mentioned in the book “Model Selection and MOdel Averaging”
Using RMSE to determine how well models predict.

Overview of the Dataset

This dataset is an R dataframe, which is available in R under the name “birthwt” in the library MASS. The data were collected at the Baystate Medical Center, Springfield, Mass during the year of 1986.

This birthwt dataset contains 10 variables.

low - indicator of birth weight of the baby less than 2.5kg (nominal: 0=no, 1=yes)

age - age of mother (continuous: in years)

lwt - mother’s weight at last menstrual period (continuous: pounds)

race - mother’s race (nominal: 1=white, 2=black, 3=other)

smoke - smoking status during pregnancy (nominal: 1=no, 2=yes)

ptl - number of previous premature labors (discrete)

ht - history of hypertension (nominal: 0=no, 1=yes)

ui - presence of uterine irritability (nominal: 0=no, 1=yes)

ftv - number of physician visits during the first trimester (discrete)

bwt - birth weight in grams (continous)

data("birthwt")
head(birthwt)

summary(birthwt)

##       low              age             lwt             race      
##  Min.   :0.0000   Min.   :14.00   Min.   : 80.0   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:19.00   1st Qu.:110.0   1st Qu.:1.000  
##  Median :0.0000   Median :23.00   Median :121.0   Median :1.000  
##  Mean   :0.3122   Mean   :23.24   Mean   :129.8   Mean   :1.847  
##  3rd Qu.:1.0000   3rd Qu.:26.00   3rd Qu.:140.0   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :45.00   Max.   :250.0   Max.   :3.000  
##      smoke             ptl               ht                ui        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :0.3915   Mean   :0.1958   Mean   :0.06349   Mean   :0.1481  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :3.0000   Max.   :1.00000   Max.   :1.0000  
##       ftv              bwt      
##  Min.   :0.0000   Min.   : 709  
##  1st Qu.:0.0000   1st Qu.:2414  
##  Median :0.0000   Median :2977  
##  Mean   :0.7937   Mean   :2945  
##  3rd Qu.:1.0000   3rd Qu.:3487  
##  Max.   :6.0000   Max.   :4990

dim(birthwt)

## [1] 189  10

Develop models

The main variable of interest is bwt.

Other recorded variables are:

x1: intercept
x2: lwt
x3: age
x4: indicator for ‘race black’
x5: indicator for ‘race other’

Since the mother’s weight is thought to be influential, we decide to include this variable x2 in all of the possible models under investigation, as well as the intercept term (x1 = 1); in other words, x1 and x2 are protected covariates.

For each model, the validation RMSE and also the test RMSE are calculated. The data is then futher split in two parts. One part of datasets will be used to fit (train) our models, which is denoted by bwt_train. The remainder of the original data will be used to assess how well the model is predicting, usualy called test data and is denoted by bwt_test.

set.seed(42)
bwt_trn_idx  = sample(nrow(birthwt), size = trunc(0.8 * nrow(birthwt)))
bwt_train = birthwt[bwt_trn_idx, ]
bwt_test = birthwt[-bwt_trn_idx, ]

x2 <- bwt_train$lwt
x3 <- bwt_train$age
x4 <- factor(ifelse(bwt_train$race == 2, 1, 0))
x5 <- factor(ifelse(bwt_train$race == 3, 1, 0))

Model1 <- lm(bwt~1, data = bwt_train)
Model2 <- lm(bwt~1 + x2 + x3, data = bwt_train)
Model3 <- lm(bwt~1 + x2 + x4, data = bwt_train)
Model4 <- lm(bwt~1 + x2 + x5, data = bwt_train)
Model5 <- lm(bwt~1 + x2 + x3 + x4, data = bwt_train)
Model6 <- lm(bwt~1 + x2 + x3 + x5, data = bwt_train)
Model7 <- lm(bwt~1 + x2 + x4 + x5, data = bwt_train)
Model8 <- lm(bwt~1 + x2 + x3 + x4 + x5, data = bwt_train)

AIC and BIC results

AIC(Model1, Model2, Model3, Model4, Model5, Model6, Model7, Model8)

BIC(Model1, Model2, Model3, Model4, Model5, Model6, Model7, Model8)

Calculate test and train RMSE for each model.

calc_rmse = function(actual, predicted) {
  sqrt(mean((actual - predicted) ^ 2))
}

# create model list

mod_list = list(Model1, Model2, Model3, Model4, Model5, Model6, Model7, Model8)


# get train and test predictions


trn_pred = lapply(mod_list, predict, newdata = bwt_train)
tst_pred = lapply(mod_list, predict, newdata = bwt_test)

## Warning: 'newdata' had 38 rows but variables found have 151 rows

## Warning: 'newdata' had 38 rows but variables found have 151 rows

## Warning: 'newdata' had 38 rows but variables found have 151 rows

## Warning: 'newdata' had 38 rows but variables found have 151 rows

## Warning: 'newdata' had 38 rows but variables found have 151 rows

## Warning: 'newdata' had 38 rows but variables found have 151 rows

## Warning: 'newdata' had 38 rows but variables found have 151 rows

# get train and test RMSE

trn_rmse = sapply(trn_pred, calc_rmse, actual = bwt_train$bwt)
tst_rmse = sapply(tst_pred, calc_rmse, actual = bwt_test$bwt)

## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length

## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length

## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length

## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length

## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length

## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length

## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length

tst_rmse

## [1] 741.6587 757.7663 774.1677 760.2700 770.3551 756.1716 774.4138 771.0373

summary(Model3)

## 
## Call:
## lm(formula = bwt ~ 1 + x2 + x4, data = bwt_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2248.7  -508.5    42.1   496.6  2015.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2286.099    259.799   8.799  3.3e-15 ***
## x2             5.596      1.966   2.846  0.00505 ** 
## x41         -371.854    174.906  -2.126  0.03516 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 705.4 on 148 degrees of freedom
## Multiple R-squared:  0.0698, Adjusted R-squared:  0.05723 
## F-statistic: 5.553 on 2 and 148 DF,  p-value: 0.004728

plot(Model3)