Harold Nelson
April 28, 2016
Review the lab presentation by Trevor Hastie at
https://www.youtube.com/watch?v=TxvEVc8YNlU
This technique is used when we have a binary dependent variable.
In this presentation we’ll see how it can be used to predict a person’s gender using other characteristics such as height and weight. There is a data set produced by the Centers for Disease Control. Download this dataset from Moodle to follow this exercise.
load("~/Desktop/cdc.Rdata")
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
summary(cdc)
## genhlth exerany hlthplan smoke100
## excellent:4657 Min. :0.0000 Min. :0.0000 Min. :0.0000
## very good:6972 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## good :5675 Median :1.0000 Median :1.0000 Median :0.0000
## fair :2019 Mean :0.7457 Mean :0.8738 Mean :0.4721
## poor : 677 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## height weight wtdesire age gender
## Min. :48.00 Min. : 68.0 Min. : 68.0 Min. :18.00 m: 9569
## 1st Qu.:64.00 1st Qu.:140.0 1st Qu.:130.0 1st Qu.:31.00 f:10431
## Median :67.00 Median :165.0 Median :150.0 Median :43.00
## Mean :67.18 Mean :169.7 Mean :155.1 Mean :45.07
## 3rd Qu.:70.00 3rd Qu.:190.0 3rd Qu.:175.0 3rd Qu.:57.00
## Max. :93.00 Max. :500.0 Max. :680.0 Max. :99.00
We need a training subset and a separate subset for testing our models.
trainbool = sample(c(TRUE,FALSE),
replace=TRUE,
prob=c(.75,.25),
size=nrow(cdc))
train = cdc[trainbool,]
test = cdc[!trainbool,]
table(train$gender)/nrow(train)
##
## m f
## 0.4770519 0.5229481
table(test$gender)/nrow(test)
##
## m f
## 0.4825616 0.5174384
In this model, we will include all of the variables in the dataset.
gfit1 = glm(gender~.,data=train,family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(gfit1)
##
## Call:
## glm(formula = gender ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6073 -0.2759 0.0646 0.3377 5.4911
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 37.9530161 0.8508910 44.604 < 2e-16 ***
## genhlthvery good 0.0488545 0.0791602 0.617 0.537
## genhlthgood -0.0936020 0.0849071 -1.102 0.270
## genhlthfair -0.1305492 0.1132346 -1.153 0.249
## genhlthpoor -0.1033822 0.1696789 -0.609 0.542
## exerany 0.0739928 0.0690627 1.071 0.284
## hlthplan 0.4603485 0.0893242 5.154 2.55e-07 ***
## smoke100 -0.3520266 0.0595896 -5.908 3.47e-09 ***
## height -0.4104360 0.0132090 -31.073 < 2e-16 ***
## weight 0.0344111 0.0015119 22.761 < 2e-16 ***
## wtdesire -0.1064397 0.0027164 -39.184 < 2e-16 ***
## age -0.0002439 0.0018092 -0.135 0.893
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20659.0 on 14924 degrees of freedom
## Residual deviance: 7689.5 on 14913 degrees of freedom
## AIC: 7713.5
##
## Number of Fisher Scoring iterations: 6
predfit1 = predict(gfit1,new=test,type = "response")
predg1 = ifelse(predfit1>.5,"f","m")
table(as.character(test$gender),predg1)
## predg1
## f m
## f 2363 263
## m 288 2161
mean(test$gender==predg1)
## [1] 0.8914286
This model uses weight alone.
gfit2 = glm(gender~weight,data=train,family=binomial)
summary(gfit2)
##
## Call:
## glm(formula = gender ~ weight, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3178 -0.9941 0.4956 0.8465 4.5703
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.5177483 0.1064781 51.82 <2e-16 ***
## weight -0.0322459 0.0006285 -51.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20659 on 14924 degrees of freedom
## Residual deviance: 16807 on 14923 degrees of freedom
## AIC: 16811
##
## Number of Fisher Scoring iterations: 4
predfit2 = predict(gfit2,new=test,type = "response")
predg2 = ifelse(predfit2>.5,"f","m")
table(as.character(test$gender),predg2)
## predg2
## f m
## f 2058 568
## m 849 1600
mean(test$gender==predg2)
## [1] 0.7207882
This model includes both weight and height
gfit3 = glm(gender~weight+height,data=train,family=binomial)
summary(gfit3)
##
## Call:
## glm(formula = gender ~ weight + height, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.8818 -0.4376 0.0979 0.4885 3.8455
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 45.2339937 0.7391105 61.20 <2e-16 ***
## weight -0.0111525 0.0007798 -14.30 <2e-16 ***
## height -0.6435413 0.0112201 -57.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20659 on 14924 degrees of freedom
## Residual deviance: 10145 on 14922 degrees of freedom
## AIC: 10151
##
## Number of Fisher Scoring iterations: 6
predfit3 = predict(gfit3,new=test,type = "response")
predg3 = ifelse(predfit3>.5,"f","m")
table(as.character(test$gender),predg3)
## predg3
## f m
## f 2244 382
## m 406 2043
mean(test$gender==predg3)
## [1] 0.8447291
This model includes, weight, height and an interaction term.
gfit4 = glm(gender~weight+height+height*weight,data=train,family=binomial)
summary(gfit4)
##
## Call:
## glm(formula = gender ~ weight + height + height * weight, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.8340 -0.4280 0.1296 0.4932 4.0624
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 27.1769341 3.8743315 7.015 2.31e-12 ***
## weight 0.0958444 0.0229189 4.182 2.89e-05 ***
## height -0.3728043 0.0581753 -6.408 1.47e-10 ***
## weight:height -0.0016022 0.0003433 -4.667 3.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20659 on 14924 degrees of freedom
## Residual deviance: 10123 on 14921 degrees of freedom
## AIC: 10131
##
## Number of Fisher Scoring iterations: 6
predfit4 = predict(gfit4,new=test,type = "response")
predg4 = ifelse(predfit4>.5,"f","m")
table(as.character(test$gender),predg4)
## predg4
## f m
## f 2269 357
## m 416 2033
mean(test$gender==predg4)
## [1] 0.8476847