library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
load("C:/Users/MSSA/Downloads/cdc.Rdata")
glimpse(cdc)
## Observations: 20,000
## Variables: 9
## $ genhlth  <fctr> good, good, good, good, very good, very good, very g...
## $ exerany  <dbl> 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ hlthplan <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,...
## $ smoke100 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,...
## $ height   <dbl> 70, 64, 60, 66, 61, 64, 71, 67, 65, 70, 69, 69, 66, 7...
## $ weight   <int> 175, 125, 105, 132, 150, 114, 194, 170, 150, 180, 186...
## $ wtdesire <int> 175, 115, 105, 124, 130, 114, 185, 160, 130, 170, 175...
## $ age      <int> 77, 33, 49, 42, 55, 55, 31, 45, 27, 44, 46, 62, 21, 6...
## $ gender   <fctr> m, f, f, f, f, f, m, m, f, m, m, m, m, m, m, m, m, m...

Separate cdc into Train and Test.

cdc$type = sample(c("Train","Test"),size=20000,prob=c(.5,.5),replace = TRUE)
cdc$wtdiff = cdc$weight - cdc$wtdesire
cdc$isMale = cdc$gender =="m"
Train = filter(cdc,type == "Train")
Test = filter(cdc,type=="Test")
str(Train)
## 'data.frame':    9938 obs. of  12 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 2 2 1 4 1 1 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 1 1 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 0 1 1 ...
##  $ smoke100: num  0 1 0 0 0 1 1 1 1 1 ...
##  $ height  : num  70 64 66 64 67 69 69 66 70 73 ...
##  $ weight  : int  175 125 132 114 170 186 168 185 170 185 ...
##  $ wtdesire: int  175 115 124 114 160 175 148 220 170 175 ...
##  $ age     : int  77 33 42 55 45 46 62 21 69 79 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 1 1 1 1 1 1 ...
##  $ type    : chr  "Train" "Train" "Train" "Train" ...
##  $ wtdiff  : int  0 10 8 0 10 11 20 -35 0 10 ...
##  $ isMale  : logi  TRUE FALSE FALSE FALSE TRUE TRUE ...
str(Test)
## 'data.frame':    10062 obs. of  12 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 2 2 3 3 4 4 2 3 2 ...
##  $ exerany : num  1 0 1 0 1 1 0 1 1 0 ...
##  $ hlthplan: num  1 1 1 1 1 0 1 1 1 0 ...
##  $ smoke100: num  1 0 0 1 0 0 1 0 1 1 ...
##  $ height  : num  60 61 71 65 70 69 71 67 67 64 ...
##  $ weight  : int  105 150 194 150 180 170 185 125 165 105 ...
##  $ wtdesire: int  105 130 185 130 170 170 185 120 158 120 ...
##  $ age     : int  49 55 31 27 44 23 76 33 30 27 ...
##  $ gender  : Factor w/ 2 levels "m","f": 2 2 1 2 1 1 1 2 1 2 ...
##  $ type    : chr  "Test" "Test" "Test" "Test" ...
##  $ wtdiff  : int  0 20 9 20 10 0 0 5 7 -15 ...
##  $ isMale  : logi  FALSE FALSE TRUE FALSE TRUE TRUE ...
genMod = glm(isMale~age + smoke100 + wtdiff,data = Train,family=binomial)
summary(genMod)
## 
## Call:
## glm(formula = isMale ~ age + smoke100 + wtdiff, family = binomial, 
##     data = Train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6771  -1.1218  -0.7778   1.1659   2.4178  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.1494733  0.0599436   2.494   0.0126 *  
## age         -0.0053146  0.0012088  -4.397  1.1e-05 ***
## smoke100     0.4814231  0.0414841  11.605  < 2e-16 ***
## wtdiff      -0.0161106  0.0009986 -16.132  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13758  on 9937  degrees of freedom
## Residual deviance: 13316  on 9934  degrees of freedom
## AIC: 13324
## 
## Number of Fisher Scoring iterations: 4
probMale = predict(genMod,newdata=Test,type="response")
predMale = probMale > .5
mean(predMale == Test$isMale)
## [1] 0.5854701
mean(Test$gender == "m")
## [1] 0.4786325