Problem 1

Load the cdc dataset. Review my document “Data Analysis Example” from Canvas. Use the cleaning and recoding I did to produce a clean and enhanced dataset for analysis. Do a summary to verify the correctness of your work.

# Place your code here. Include load and library commands.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.3.0
## ✔ tibble  2.0.1     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(gmodels)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
load("/cloud/project/cdc.Rdata")

cdc$BMI = (cdc$weight*703)/(cdc$height)^2
cdc$BMIDes = (cdc$wtdesire*703)/(cdc$height)^2
cdc$DesActRatio = cdc$BMIDes/cdc$BMI
cdc$BMICat = cut(cdc$BMI,c(18,5,24.9,29.9,39.9,200),labels = 
       c("Underweight","Normal","Overweight",
       "Obese","Morbidly Obese"),include.lowest=T)
cdc$BMIDesCat = cut(cdc$BMIDes,c(18,5,24.9,29.9,39.9,200),labels = 
       c("Underweight","Normal","Overweight",
       "Obese","Morbidly Obese"),include.lowest=T)
cdc$ageCat = cut_number(cdc$age,n=4,labels=c("18-31","32-43","44-57","58-99"))

cdcmale <- cdc %>%
  filter(gender=="m") %>%
  mutate(gender=1)

  

cdcfemale <- cdc %>%
  filter(gender=="f") %>%
  mutate(gender=0)
cdc <- rbind(cdcmale, cdcfemale)
summary(cdc)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6972   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5675   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire          age       
##  Min.   :48.00   Min.   : 68.0   Min.   : 68.0   Min.   :18.00  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130.0   1st Qu.:31.00  
##  Median :67.00   Median :165.0   Median :150.0   Median :43.00  
##  Mean   :67.18   Mean   :169.7   Mean   :155.1   Mean   :45.07  
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175.0   3rd Qu.:57.00  
##  Max.   :93.00   Max.   :500.0   Max.   :680.0   Max.   :99.00  
##      gender            BMI            BMIDes         DesActRatio    
##  Min.   :0.0000   Min.   :12.40   Min.   :  8.128   Min.   :0.2667  
##  1st Qu.:0.0000   1st Qu.:22.71   1st Qu.: 21.727   1st Qu.:0.8710  
##  Median :0.0000   Median :25.60   Median : 23.746   Median :0.9444  
##  Mean   :0.4784   Mean   :26.31   Mean   : 23.971   Mean   :0.9268  
##  3rd Qu.:1.0000   3rd Qu.:28.89   3rd Qu.: 25.799   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :73.09   Max.   :100.407   Max.   :3.7778  
##             BMICat              BMIDesCat       ageCat    
##  Underweight   : 271   Underweight   :  226   18-31:5087  
##  Normal        :8461   Normal        :12503   32-43:5263  
##  Overweight    :7296   Overweight    : 6451   44-57:4787  
##  Obese         :3541   Obese         :  798   58-99:4863  
##  Morbidly Obese: 431   Morbidly Obese:   22               
## 
head(cdc)
tail(cdc)

Problem 2

Split the dataframe into train and test with a 70/30 ratio. Do an str() on both train and test to verify that you have what you want.

# Place your code here.

nrows = nrow(cdc)

set.seed(999)

rows <- sample(nrows)

cdc = cdc[rows,]

split <- round(nrow(cdc) * .70)

traincdc = cdc[1:split,]
testcdc = cdc[(split+1):nrows,]

nrow(traincdc)
## [1] 14000
nrow(testcdc)
## [1] 6000
nrow(cdc)
## [1] 20000
str(traincdc)
## 'data.frame':    14000 obs. of  15 variables:
##  $ genhlth    : Factor w/ 5 levels "excellent","very good",..: 2 2 2 2 2 2 2 2 2 3 ...
##  $ exerany    : num  1 1 1 1 0 1 1 1 0 1 ...
##  $ hlthplan   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100   : num  0 0 0 1 0 1 0 0 1 0 ...
##  $ height     : num  69 59 70 63 62 71 63 72 73 63 ...
##  $ weight     : int  175 132 190 145 110 200 170 160 160 165 ...
##  $ wtdesire   : int  170 120 185 135 110 180 120 170 160 145 ...
##  $ age        : int  51 55 32 67 95 21 55 42 21 55 ...
##  $ gender     : num  1 0 1 0 0 1 0 1 1 0 ...
##  $ BMI        : num  25.8 26.7 27.3 25.7 20.1 ...
##  $ BMIDes     : num  25.1 24.2 26.5 23.9 20.1 ...
##  $ DesActRatio: num  0.971 0.909 0.974 0.931 1 ...
##  $ BMICat     : Factor w/ 5 levels "Underweight",..: 3 3 3 3 2 3 4 2 2 3 ...
##  $ BMIDesCat  : Factor w/ 5 levels "Underweight",..: 3 2 3 2 2 3 2 2 2 3 ...
##  $ ageCat     : Factor w/ 4 levels "18-31","32-43",..: 3 3 2 4 4 1 3 2 1 3 ...
str(testcdc)
## 'data.frame':    6000 obs. of  15 variables:
##  $ genhlth    : Factor w/ 5 levels "excellent","very good",..: 3 2 3 1 1 1 2 2 2 1 ...
##  $ exerany    : num  0 1 1 1 1 0 0 0 1 1 ...
##  $ hlthplan   : num  1 1 1 1 1 1 1 0 1 1 ...
##  $ smoke100   : num  1 0 0 0 0 1 0 1 1 0 ...
##  $ height     : num  65 61 62 60 68 67 72 68 77 67 ...
##  $ weight     : int  169 180 150 152 165 150 270 180 210 140 ...
##  $ wtdesire   : int  150 115 120 145 160 135 230 165 230 120 ...
##  $ age        : int  50 43 39 44 35 53 30 21 23 37 ...
##  $ gender     : num  0 0 0 0 1 0 1 1 1 0 ...
##  $ BMI        : num  28.1 34 27.4 29.7 25.1 ...
##  $ BMIDes     : num  25 21.7 21.9 28.3 24.3 ...
##  $ DesActRatio: num  0.888 0.639 0.8 0.954 0.97 ...
##  $ BMICat     : Factor w/ 5 levels "Underweight",..: 3 4 3 3 3 2 4 3 2 2 ...
##  $ BMIDesCat  : Factor w/ 5 levels "Underweight",..: 3 2 2 3 2 2 4 3 3 2 ...
##  $ ageCat     : Factor w/ 4 levels "18-31","32-43",..: 3 2 2 3 2 3 1 1 1 2 ...

Problem 3

Create a logistic regression model which predicts gender based on height and weight using train. Measure its accuracy with a confusion matrix based on test.

# Place your code here.
cdcglm <- glm(gender~height+weight, data = traincdc, family = "binomial")
summary(cdcglm)
## 
## Call:
## glm(formula = gender ~ height + weight, family = "binomial", 
##     data = traincdc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8402  -0.5036  -0.1024   0.4452   4.8863  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.520e+01  7.612e-01  -59.38   <2e-16 ***
## height       6.454e-01  1.158e-02   55.72   <2e-16 ***
## weight       1.023e-02  8.018e-04   12.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19378  on 13999  degrees of freedom
## Residual deviance:  9660  on 13997  degrees of freedom
## AIC: 9666
## 
## Number of Fisher Scoring iterations: 6
predcdc <- predict(cdcglm, newdata = testcdc, type = "response")
predcdc <- predcdc > .5


table(testcdc$gender,predcdc)
##    predcdc
##     FALSE TRUE
##   0  2675  430
##   1   454 2441

Problem 4

Follow the same process as in Problem 3, but add the exercise and smoking variables. Did this improve on the accuracy you achieved in Problem 3.

This did improve the score.

# Place your code here.
cdcglm <- glm(gender~height+weight + exerany + smoke100, data = traincdc, family = "binomial")
summary(cdcglm)
## 
## Call:
## glm(formula = gender ~ height + weight + exerany + smoke100, 
##     family = "binomial", data = traincdc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7950  -0.4963  -0.1025   0.4451   4.8520  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.526e+01  7.629e-01 -59.329  < 2e-16 ***
## height       6.448e-01  1.166e-02  55.285  < 2e-16 ***
## weight       1.013e-02  8.081e-04  12.532  < 2e-16 ***
## exerany     -3.738e-02  5.971e-02  -0.626    0.531    
## smoke100     3.146e-01  5.162e-02   6.096 1.09e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19378  on 13999  degrees of freedom
## Residual deviance:  9622  on 13995  degrees of freedom
## AIC: 9632
## 
## Number of Fisher Scoring iterations: 6
predcdc <- predict(cdcglm, newdata = testcdc, type = "response")
predcdc <- predcdc > .5


table(testcdc$gender,predcdc)
##    predcdc
##     FALSE TRUE
##   0  2696  409
##   1   442 2453

Problem 5

One of the conclusions we reached in the original analysis is that women were more anxious to keep their weight low than men. Formulate a variable that captures this and add it to the model. Explain the formulation you chose. Add this variable to the model in Problem 4. Did it improve your accuracy?

I created the variable, wtdesireddif, which is the difference between weight desired and actual weight. I chose this as a variable because women tend to want to keep their weight lower, wtdesireddif would be a good way of capturing that as women will tend to have negative values more often than men. After adding this variable to the model the accuracy did improve.

# Place your code here.

traincdc$wtdesiredif <- traincdc$wtdesire - traincdc$weight
testcdc$wtdesiredif <- testcdc$wtdesire - testcdc$weight

cdcglm <- glm(gender~height+weight + exerany + smoke100 + wtdesiredif, data = traincdc, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(cdcglm)
## 
## Call:
## glm(formula = gender ~ height + weight + exerany + smoke100 + 
##     wtdesiredif, family = "binomial", data = traincdc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.4553  -0.3529  -0.0662   0.2875   4.6102  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -38.411328   0.842400 -45.597  < 2e-16 ***
## height        0.414967   0.013187  31.468  < 2e-16 ***
## weight        0.070454   0.001820  38.708  < 2e-16 ***
## exerany      -0.067725   0.068460  -0.989    0.323    
## smoke100      0.393103   0.059901   6.563 5.29e-11 ***
## wtdesiredif   0.103677   0.002708  38.283  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19377.7  on 13999  degrees of freedom
## Residual deviance:  7410.3  on 13994  degrees of freedom
## AIC: 7422.3
## 
## Number of Fisher Scoring iterations: 6
predcdc <- predict(cdcglm, newdata = testcdc, type = "response")
predcdc <- predcdc > .5


table(testcdc$gender,predcdc)
##    predcdc
##     FALSE TRUE
##   0  2817  288
##   1   334 2561