library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
load("C:/Users/MSSA/Downloads/cdc.Rdata")
glimpse(cdc)
## Observations: 20,000
## Variables: 9
## $ genhlth <fctr> good, good, good, good, very good, very good, very g...
## $ exerany <dbl> 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ hlthplan <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,...
## $ smoke100 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,...
## $ height <dbl> 70, 64, 60, 66, 61, 64, 71, 67, 65, 70, 69, 69, 66, 7...
## $ weight <int> 175, 125, 105, 132, 150, 114, 194, 170, 150, 180, 186...
## $ wtdesire <int> 175, 115, 105, 124, 130, 114, 185, 160, 130, 170, 175...
## $ age <int> 77, 33, 49, 42, 55, 55, 31, 45, 27, 44, 46, 62, 21, 6...
## $ gender <fctr> m, f, f, f, f, f, m, m, f, m, m, m, m, m, m, m, m, m...
Separate cdc into Train and Test.
cdc$type = sample(c("Train","Test"),size=20000,prob=c(.5,.5),replace = TRUE)
cdc$wtdiff = cdc$weight - cdc$wtdesire
cdc$isMale = cdc$gender =="m"
Train = filter(cdc,type == "Train")
Test = filter(cdc,type=="Test")
str(Train)
## 'data.frame': 9938 obs. of 12 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 2 2 1 4 1 1 3 ...
## $ exerany : num 0 0 1 1 0 1 1 1 1 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 0 1 1 ...
## $ smoke100: num 0 1 0 0 0 1 1 1 1 1 ...
## $ height : num 70 64 66 64 67 69 69 66 70 73 ...
## $ weight : int 175 125 132 114 170 186 168 185 170 185 ...
## $ wtdesire: int 175 115 124 114 160 175 148 220 170 175 ...
## $ age : int 77 33 42 55 45 46 62 21 69 79 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 1 1 1 1 1 1 ...
## $ type : chr "Train" "Train" "Train" "Train" ...
## $ wtdiff : int 0 10 8 0 10 11 20 -35 0 10 ...
## $ isMale : logi TRUE FALSE FALSE FALSE TRUE TRUE ...
str(Test)
## 'data.frame': 10062 obs. of 12 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 2 2 3 3 4 4 2 3 2 ...
## $ exerany : num 1 0 1 0 1 1 0 1 1 0 ...
## $ hlthplan: num 1 1 1 1 1 0 1 1 1 0 ...
## $ smoke100: num 1 0 0 1 0 0 1 0 1 1 ...
## $ height : num 60 61 71 65 70 69 71 67 67 64 ...
## $ weight : int 105 150 194 150 180 170 185 125 165 105 ...
## $ wtdesire: int 105 130 185 130 170 170 185 120 158 120 ...
## $ age : int 49 55 31 27 44 23 76 33 30 27 ...
## $ gender : Factor w/ 2 levels "m","f": 2 2 1 2 1 1 1 2 1 2 ...
## $ type : chr "Test" "Test" "Test" "Test" ...
## $ wtdiff : int 0 20 9 20 10 0 0 5 7 -15 ...
## $ isMale : logi FALSE FALSE TRUE FALSE TRUE TRUE ...
genMod = glm(isMale~age + smoke100 + wtdiff,data = Train,family=binomial)
summary(genMod)
##
## Call:
## glm(formula = isMale ~ age + smoke100 + wtdiff, family = binomial,
## data = Train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6771 -1.1218 -0.7778 1.1659 2.4178
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1494733 0.0599436 2.494 0.0126 *
## age -0.0053146 0.0012088 -4.397 1.1e-05 ***
## smoke100 0.4814231 0.0414841 11.605 < 2e-16 ***
## wtdiff -0.0161106 0.0009986 -16.132 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13758 on 9937 degrees of freedom
## Residual deviance: 13316 on 9934 degrees of freedom
## AIC: 13324
##
## Number of Fisher Scoring iterations: 4
probMale = predict(genMod,newdata=Test,type="response")
predMale = probMale > .5
mean(predMale == Test$isMale)
## [1] 0.5854701
mean(Test$gender == "m")
## [1] 0.4786325