Regression Workflow 1

Harold Nelson

4/8/2022

A Model Building Exercise

Can we predict how much weight people want to lose or gain?

The Data

I’ll use a sample of records from the Behavioral Risk Factors Surveillance System (BRFSS) conducted by the Centers for Disease Control (CDC). The data I used is available from Openintro.org. See https://www.openintro.org/book/statdata/?data=cdc. It is in Moodle.

The first step in model building is to examine, clean, and transform the data.

Load the data.

load("cdc.Rdata")

Packages

Make a few packages available.

library(broom)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(formula.tools)

Structure

Determine the structure of the object CDC.

str(cdc)
## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...

BMI

Before cleaning, create a new variable BMI, the body mass index. This corrects for the influence of height on weight in looking for obesity.

The body mass index (BMI) is a measure which incorprates both height and weight.

The standard interpetation of this measure is as follows:

New Variables.

cdc$BMI = (cdc$weight*703)/(cdc$height)^2
cdc$BMIDes = (cdc$wtdesire*703)/(cdc$height)^2
cdc$DesActRatio = cdc$BMIDes/cdc$BMI
cdc$BMICat = cut(cdc$BMI,c(0,18.5,24.9,29.9,39.9,200),labels = 
       c("Underweight","Normal","Overweight",
       "Obese","Morbidly Obese"),include.lowest=T)
cdc$BMIDesCat = cut(cdc$BMIDes,c(0,18.5,24.9,29.9,39.9,200),labels = 
       c("Underweight","Normal","Overweight",
       "Obese","Morbidly Obese"),include.lowest=T)
cdc$ageCat = cut_number(cdc$age,n=4,labels=c("18-31","32-43","44-57","58-99"))


table(cdc$BMICat,cdc$BMIDesCat)
##                 
##                  Underweight Normal Overweight Obese Morbidly Obese
##   Underweight            232    170          9     0              0
##   Normal                 139   7866        303    13              0
##   Overweight              13   3387       3850    45              1
##   Obese                   13    822       2098   602              6
##   Morbidly Obese           7     80        191   138             15

Examine the data and look for anomalies

summary(cdc)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6972   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5675   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire          age        gender   
##  Min.   :48.00   Min.   : 68.0   Min.   : 68.0   Min.   :18.00   m: 9569  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130.0   1st Qu.:31.00   f:10431  
##  Median :67.00   Median :165.0   Median :150.0   Median :43.00            
##  Mean   :67.18   Mean   :169.7   Mean   :155.1   Mean   :45.07            
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175.0   3rd Qu.:57.00            
##  Max.   :93.00   Max.   :500.0   Max.   :680.0   Max.   :99.00            
##       BMI            BMIDes         DesActRatio                BMICat    
##  Min.   :12.40   Min.   :  8.128   Min.   :0.2667   Underweight   : 411  
##  1st Qu.:22.71   1st Qu.: 21.727   1st Qu.:0.8710   Normal        :8321  
##  Median :25.60   Median : 23.746   Median :0.9444   Overweight    :7296  
##  Mean   :26.31   Mean   : 23.971   Mean   :0.9268   Obese         :3541  
##  3rd Qu.:28.89   3rd Qu.: 25.799   3rd Qu.:1.0000   Morbidly Obese: 431  
##  Max.   :73.09   Max.   :100.407   Max.   :3.7778                        
##           BMIDesCat       ageCat    
##  Underweight   :  404   18-31:5087  
##  Normal        :12325   32-43:5263  
##  Overweight    : 6451   44-57:4787  
##  Obese         :  798   58-99:4863  
##  Morbidly Obese:   22               
## 

Look at the records which have unusual values.

cdc[cdc$height==93,]
##         genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 17534 very good       1        0        0     93    179      100  31      m
##            BMI   BMIDes DesActRatio      BMICat   BMIDesCat ageCat
## 17534 14.54931 8.128107   0.5586592 Underweight Underweight  18-31
cdc[cdc$weight==68,]
##       genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 18743    good       1        1        1     52     68       68  44      f
##            BMI   BMIDes DesActRatio      BMICat   BMIDesCat ageCat
## 18743 17.67899 17.67899           1 Underweight Underweight  44-57
cdc[cdc$wtdesire==680,]
##       genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 16874    good       0        1        0     69    180      680  24      m
##            BMI   BMIDes DesActRatio     BMICat      BMIDesCat ageCat
## 16874 26.57845 100.4075    3.777778 Overweight Morbidly Obese  18-31
cdc[cdc$BMIDes > 50,]
##         genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 10034 very good       1        1        1     73    290      601  56      m
## 13086      good       0        1        1     62    300      300  48      f
## 13607 very good       0        1        0     69    350      350  33      f
## 16874      good       0        1        0     69    180      680  24      m
##            BMI    BMIDes DesActRatio         BMICat      BMIDesCat ageCat
## 10034 38.25671  79.28373    2.072414          Obese Morbidly Obese  44-57
## 13086 54.86472  54.86472    1.000000 Morbidly Obese Morbidly Obese  44-57
## 13607 51.68032  51.68032    1.000000 Morbidly Obese Morbidly Obese  32-43
## 16874 26.57845 100.40748    3.777778     Overweight Morbidly Obese  18-31
cdc[cdc$BMIDes < 10,]
##         genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 17534 very good       1        0        0     93    179      100  31      m
##            BMI   BMIDes DesActRatio      BMICat   BMIDesCat ageCat
## 17534 14.54931 8.128107   0.5586592 Underweight Underweight  18-31

Remove Anomalies

Rejects = cdc$BMIDes < 10 | 
   (cdc$BMIDes > 50 & cdc$DesActRatio > 1.0) 
table(Rejects)
## Rejects
## FALSE  TRUE 
## 19997     3
Keepers = !Rejects
cdc2 = cdc[Keepers,]
summary(cdc2)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6970   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5674   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire        age        gender   
##  Min.   :48.00   Min.   : 68.0   Min.   : 68   Min.   :18.00   m: 9566  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130   1st Qu.:31.00   f:10431  
##  Median :67.00   Median :165.0   Median :150   Median :43.00            
##  Mean   :67.18   Mean   :169.7   Mean   :155   Mean   :45.07            
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175   3rd Qu.:57.00            
##  Max.   :84.00   Max.   :500.0   Max.   :350   Max.   :99.00            
##       BMI            BMIDes       DesActRatio                BMICat    
##  Min.   :12.40   Min.   :10.44   Min.   :0.2667   Underweight   : 410  
##  1st Qu.:22.71   1st Qu.:21.73   1st Qu.:0.8710   Normal        :8321  
##  Median :25.60   Median :23.75   Median :0.9444   Overweight    :7295  
##  Mean   :26.31   Mean   :23.96   Mean   :0.9266   Obese         :3540  
##  3rd Qu.:28.89   3rd Qu.:25.80   3rd Qu.:1.0000   Morbidly Obese: 431  
##  Max.   :73.09   Max.   :54.86   Max.   :1.9681                        
##           BMIDesCat       ageCat    
##  Underweight   :  403   18-31:5085  
##  Normal        :12325   32-43:5263  
##  Overweight    : 6451   44-57:4786  
##  Obese         :  798   58-99:4863  
##  Morbidly Obese:   20               
## 

We will use cdc2 in our regression models.

save(cdc2,file = "cdc2.Rdata")