Preparing data for modeling in R

References

Dataset: low birthweight dataset

http://www.umass.edu/statdata/statdata/data/lowbwt.txt

SOURCE: Hosmer and Lemeshow (2000) Applied Logistic Regression: Second Edition. These data are copyrighted by John Wiley & Sons Inc. and must be acknowledged and used accordingly. Data were collected at Baystate Medical Center, Springfield, Massachusetts during 1986.

Load dataset

## Load dataset directly from the internet
lbw <- read.table("http://www.umass.edu/statdata/statdata/data/lowbwt.dat", head = T, skip = 4)

## "Fix" cases 10 and 39 to make them identical to BIO 213 dataset
lbw[c(10,39),"BWT"] <- c(2655,3035)

## Change variable names to lower cases
names(lbw) <- tolower(names(lbw))

## Check first 6 observations
head(lbw)
  id low age lwt race smoke ptl ht ui ftv  bwt
1 85   0  19 182    2     0   0  0  1   0 2523
2 86   0  33 155    3     0   0  0  0   3 2551
3 87   0  20 105    1     1   0  0  0   1 2557
4 88   0  21 108    1     1   0  0  1   2 2594
5 89   0  18 107    1     1   0  0  1   0 2600
6 91   0  21 124    3     0   0  0  0   0 2622

See overview of data using gpairs package

library(gpairs)
gpairs(lbw)

plot of chunk unnamed-chunk-3

Recode data

## Relabel race: 1, 2, 3 -> White, Black, Other
lbw$race.cat <- factor(lbw$race, levels = 1:3, labels = c("White","Black","Other"))

## Dichotomize ptl
table(lbw$ptl)

  0   1   2   3 
159  24   5   1 
lbw$preterm  <- factor(ifelse(lbw$ptl >= 1, "1+", "0"))

## Change 0,1 binary to No,Yes binary
lbw$smoke    <- factor(ifelse(lbw$smoke == 1, "Yes", "No"))
lbw$ht       <- factor(ifelse(lbw$ht    == 1, "Yes", "No"))
lbw$ui       <- factor(ifelse(lbw$ui    == 1, "Yes", "No"))
lbw$low      <- factor(ifelse(lbw$low   == 1, "Yes", "No"))

## Alternative way
## lbw$low      <- factor(lbw$low, levels = 0:1, labels = c("No", "Yes")))

## Categorize ftv (frequency of visit): 0 | 1 2 | 3 4 6 -> None(0) | Normal(1-2) | Many (>= 3)
table(lbw$ftv)

  0   1   2   3   4   6 
100  47  30   7   4   1 
lbw$ftv.cat  <- cut(lbw$ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
levels(lbw$ftv.cat)
[1] "None"   "Normal" "Many"  

## Make Normal the reference level
lbw$ftv.cat  <- relevel(lbw$ftv.cat, ref = "Normal")
levels(lbw$ftv.cat)
[1] "Normal" "None"   "Many"  

## Check first 6 observations
head(lbw)
  id low age lwt race smoke ptl ht  ui ftv  bwt race.cat preterm ftv.cat
1 85  No  19 182    2    No   0 No Yes   0 2523    Black       0    None
2 86  No  33 155    3    No   0 No  No   3 2551    Other       0    Many
3 87  No  20 105    1   Yes   0 No  No   1 2557    White       0  Normal
4 88  No  21 108    1   Yes   0 No Yes   2 2594    White       0  Normal
5 89  No  18 107    1   Yes   0 No Yes   0 2600    White       0    None
6 91  No  21 124    3    No   0 No  No   0 2622    Other       0    None

## Summarize
summary(lbw)
       id       low           age            lwt           race      smoke          ptl          ht        ui     
 Min.   :  4   No :130   Min.   :14.0   Min.   : 80   Min.   :1.00   No :115   Min.   :0.000   No :177   No :161  
 1st Qu.: 68   Yes: 59   1st Qu.:19.0   1st Qu.:110   1st Qu.:1.00   Yes: 74   1st Qu.:0.000   Yes: 12   Yes: 28  
 Median :123             Median :23.0   Median :121   Median :1.00             Median :0.000                      
 Mean   :121             Mean   :23.2   Mean   :130   Mean   :1.85             Mean   :0.196                      
 3rd Qu.:176             3rd Qu.:26.0   3rd Qu.:140   3rd Qu.:3.00             3rd Qu.:0.000                      
 Max.   :226             Max.   :45.0   Max.   :250   Max.   :3.00             Max.   :3.000                      
      ftv             bwt        race.cat  preterm    ftv.cat   
 Min.   :0.000   Min.   : 709   White:96   0 :159   Normal: 77  
 1st Qu.:0.000   1st Qu.:2414   Black:26   1+: 30   None  :100  
 Median :0.000   Median :2977   Other:67            Many  : 12  
 Mean   :0.794   Mean   :2945                                   
 3rd Qu.:1.000   3rd Qu.:3475                                   
 Max.   :6.000   Max.   :4990                                   

Recode data (alternative simplified by within function)

## Recoding using within function
lbw <- within(lbw, {

    ## Relabel race: 1, 2, 3 -> White, Black, Other
    race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))

    ## Dichotomize ptl
    preterm  <- factor(ifelse(ptl >= 1, "1+", "0"))

    ## Change 0,1 binary to No,Yes binary
    smoke    <- factor(ifelse(smoke == 1, "Yes", "No"))
    ht       <- factor(ifelse(ht    == 1, "Yes", "No"))
    ui       <- factor(ifelse(ui    == 1, "Yes", "No"))
    low      <- factor(ifelse(low   == 1, "Yes", "No"))

    ## Categorize ftv (frequency of visit): 0   1   2   3   4   6 -> None(0), Normal(1-2), Many (>= 3)
    ftv.cat  <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))

    ## Make Normal the reference level
    ftv.cat  <- relevel(ftv.cat, ref = "Normal")
})

Alternative to the “## Categorize smoke ht ui low” part.

## Categorize smoke ht ui
lbw[,c("smoke","ht","ui")] <- lapply(lbw[,c("smoke","ht","ui")],
                                     function(var) {
                                         var <- factor(ifelse(var == 1, "Yes", "No"))
                                     })

Yet another way that does not use ifelse()

## Recoding using within function
lbw <- within(lbw, {

    ## Relabel race: 1, 2, 3 -> White, Black, Other
    race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))

    ## Dichotomize ptl
    preterm  <- factor(ptl >= 1, levels = c(FALSE,TRUE), labels = c("0","1+"))
    ## You can alse use cut
    ## preterm  <- cut(ptl, breaks = c(-Inf, 0, Inf), labels = c("0","1+"))

    ## Change 0,1 binary to No,Yes binary
    smoke    <- factor(smoke, levels  = c(0,1), labels = c("No","Yes"))
    ht       <- factor(ht   , levels  = c(0,1), labels = c("No","Yes"))
    ui       <- factor(ui   , levels  = c(0,1), labels = c("No","Yes"))
    low      <- factor(low  , levels  = c(0,1), labels = c("No","Yes"))

    ## Categorize ftv (frequency of visit): 0   1   2   3   4   6 -> None(0), Normal(1-2), Many (>= 3)
    ftv.cat  <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))

    ## Make Normal the reference level
    ftv.cat  <- relevel(ftv.cat, ref = "Normal")
})

Fit a model

lm.full <- lm(bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, data = lbw)
lm.null <- lm(bwt ~ 1, data = lbw)

See result object

lm.full

Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + 
    preterm, data = lbw)

Coefficients:
  (Intercept)            age            lwt       smokeYes          htYes          uiYes    ftv.catNone  
      2947.32          -2.91           4.22        -307.34        -568.63        -494.11         -56.50  
  ftv.catMany  race.catBlack  race.catOther      preterm1+  
      -185.86        -467.30        -322.81        -207.91  

See summary

summary(lm.full)

Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + 
    preterm, data = lbw)

Residuals:
    Min      1Q  Median      3Q     Max 
-1896.7  -443.3    53.2   466.1  1654.4 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)    2947.32     320.48    9.20  < 2e-16 ***
age              -2.91       9.67   -0.30  0.76354    
lwt               4.22       1.72    2.46  0.01488 *  
smokeYes       -307.34     109.13   -2.82  0.00541 ** 
htYes          -568.63     200.88   -2.83  0.00518 ** 
uiYes          -494.11     137.23   -3.60  0.00041 ***
ftv.catNone     -56.50     105.36   -0.54  0.59245    
ftv.catMany    -185.86     203.19   -0.91  0.36158    
race.catBlack  -467.30     149.78   -3.12  0.00211 ** 
race.catOther  -322.81     117.40   -2.75  0.00658 ** 
preterm1+      -207.91     136.35   -1.52  0.12907    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 647 on 178 degrees of freedom
Multiple R-squared: 0.255,  Adjusted R-squared: 0.213 
F-statistic: 6.08 on 10 and 178 DF,  p-value: 0.0000000609 

Confidence intervals

confint(lm.full)
                  2.5 %   97.5 %
(Intercept)   2314.9019 3579.741
age            -22.0027   16.174
lwt              0.8344    7.612
smokeYes      -522.7049  -91.979
htYes         -965.0356 -172.218
uiYes         -764.9083 -223.303
ftv.catNone   -264.4122  151.414
ftv.catMany   -586.8257  215.111
race.catBlack -762.8701 -171.736
race.catOther -554.4737  -91.143
preterm1+     -476.9680   61.155