library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(questionr)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(tableone)
load("~/ipumstor/environment.RData")
summary(data)
## YEAR SERIAL STRATA PSU
## Min. :2010 Min. : 1 Min. :6001 Min. :1.000
## 1st Qu.:2010 1st Qu.:10742 1st Qu.:6081 1st Qu.:1.000
## Median :2010 Median :21377 Median :6158 Median :2.000
## Mean :2010 Mean :21438 Mean :6156 Mean :1.507
## 3rd Qu.:2010 3rd Qu.:32091 3rd Qu.:6238 3rd Qu.:2.000
## Max. :2010 Max. :43208 Max. :6300 Max. :2.000
## NHISHID HHWEIGHT PERNUM NHISPID
## Length:89976 Min. : 991 Min. : 1.000 Length:89976
## Class :character 1st Qu.: 1788 1st Qu.: 1.000 Class :character
## Mode :character Median : 2808 Median : 2.000 Mode :character
## Mean : 2822 Mean : 2.271
## 3rd Qu.: 3663 3rd Qu.: 3.000
## Max. :21312 Max. :17.000
## HHX FMX PX PERWEIGHT
## Length:89976 Length:89976 Length:89976 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 2093
## Mode :character Mode :character Mode :character Median : 3356
## Mean : 3380
## 3rd Qu.: 4394
## Max. :28756
## SAMPWEIGHT FWEIGHT ASTATFLG CSTATFLG
## Min. : 0 Min. : 897 Min. :0.000 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 1917 1st Qu.:0.000 1st Qu.:0.0000
## Median : 0 Median : 3123 Median :1.000 Median :0.0000
## Mean : 3380 Mean : 3184 Mean :1.512 Mean :0.5383
## 3rd Qu.: 5466 3rd Qu.: 4184 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :104188 Max. :26845 Max. :5.000 Max. :5.0000
## EDUC HEALTH ALCSTAT1 SMOKFREQNOW
## Min. : 0.0 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:109.0 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :301.0 Median :2.000 Median :0.0000 Median :0.0000
## Mean :301.7 Mean :2.149 Mean :0.7574 Mean :0.2228
## 3rd Qu.:402.0 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :999.0 Max. :9.000 Max. :9.0000 Max. :8.0000
## SNUFFREQNOW BADHEALTH
## Min. :0.00000 Length:89976
## 1st Qu.:0.00000 Class :character
## Median :0.00000 Mode :character
## Mean :0.06024
## 3rd Qu.:0.00000
## Max. :9.00000
#first I recode to make binary variables
data$BADHEALTH<-Recode(data$HEALTH, recodes="4:5=1; 1:3=0; else=NA")
summary(data$BADHEALTH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.1047 0.0000 1.0000 116
data$DRINKER<-Recode(data$ALCSTAT1, recodes="3=1; 1:2=0; else=NA")
summary(data$DRINKER)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 1.00 0.62 1.00 1.00 63282
data$SMOKER<-Recode(data$SMOKFREQNOW, recodes="1=0; 2:3=1; else=NA")
summary(data$SMOKER)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 0.47 1.00 1.00 79092
data$SNUFF<-Recode(data$SNUFFREQNOW, recodes="1:2=1; 3=0; else=NA")
summary(data$SNUFF)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 0.0 0.2 0.0 1.0 87955
data1 <- na.omit(data)
#can health status be determined by education level, whether they consume alcohol, smokes tobacco, and use smokeless tobacco?
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
set.seed(1126)
train<- createDataPartition(y = data1$BADHEALTH,
p = .80,
list=F)
datatrain<-data1[train,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
datatest<-data1[-train,]
table(data1$BADHEALTH)
##
## 0 1
## 1126 232
prop.table(table(data1$BADHEALTH))
##
## 0 1
## 0.8291605 0.1708395
summary(datatrain)
## YEAR SERIAL STRATA PSU
## Min. :2010 Min. : 12 Min. :6001 Min. :1.000
## 1st Qu.:2010 1st Qu.: 9194 1st Qu.:6092 1st Qu.:1.000
## Median :2010 Median :20634 Median :6169 Median :2.000
## Mean :2010 Mean :20828 Mean :6163 Mean :1.512
## 3rd Qu.:2010 3rd Qu.:32311 3rd Qu.:6226 3rd Qu.:2.000
## Max. :2010 Max. :43201 Max. :6300 Max. :2.000
## NHISHID HHWEIGHT PERNUM NHISPID
## Length:1087 Min. : 991 Min. :1.000 Length:1087
## Class :character 1st Qu.: 2933 1st Qu.:1.000 Class :character
## Mode :character Median : 3342 Median :1.000 Mode :character
## Mean : 3384 Mean :1.385
## 3rd Qu.: 3949 3rd Qu.:2.000
## Max. :10822 Max. :8.000
## HHX FMX PX PERWEIGHT
## Length:1087 Length:1087 Length:1087 Min. : 1000
## Class :character Class :character Class :character 1st Qu.: 3374
## Mode :character Mode :character Mode :character Median : 4189
## Mean : 4126
## 3rd Qu.: 4841
## Max. :13102
## SAMPWEIGHT FWEIGHT ASTATFLG CSTATFLG EDUC
## Min. : 853 Min. : 1000 Min. :1 Min. :0 Min. :101.0
## 1st Qu.: 5428 1st Qu.: 3231 1st Qu.:1 1st Qu.:0 1st Qu.:301.0
## Median : 8397 Median : 3996 Median :1 Median :0 Median :401.0
## Mean : 9696 Mean : 3952 Mean :1 Mean :0 Mean :354.9
## 3rd Qu.:12161 3rd Qu.: 4654 3rd Qu.:1 3rd Qu.:0 3rd Qu.:402.0
## Max. :47011 Max. :13102 Max. :1 Max. :0 Max. :999.0
## HEALTH ALCSTAT1 SMOKFREQNOW SNUFFREQNOW
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:3.000
## Median :2.000 Median :3.000 Median :2.000 Median :3.000
## Mean :2.483 Mean :2.748 Mean :1.927 Mean :2.723
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :3.000 Max. :3.000 Max. :3.000
## BADHEALTH DRINKER SMOKER SNUFF
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.1776 Mean :0.7774 Mean :0.5198 Mean :0.1739
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
glm1<-glm(BADHEALTH~factor(EDUC)+scale(DRINKER)+scale(SMOKER)+scale(SNUFF),
data=data1[,-1],
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = BADHEALTH ~ factor(EDUC) + scale(DRINKER) + scale(SMOKER) +
## scale(SNUFF), family = binomial, data = data1[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7341 -0.5641 -0.5028 -0.3740 2.3902
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.53891 1641.81531 0.010 0.992
## factor(EDUC)102 -16.77265 1641.81595 -0.010 0.992
## factor(EDUC)103 -17.23366 1641.81592 -0.010 0.992
## factor(EDUC)104 -0.66759 2907.46838 0.000 1.000
## factor(EDUC)105 -16.97833 1641.81576 -0.010 0.992
## factor(EDUC)106 -15.75160 1641.81569 -0.010 0.992
## factor(EDUC)107 -17.37864 1641.81558 -0.011 0.992
## factor(EDUC)108 -16.58578 1641.81541 -0.010 0.992
## factor(EDUC)109 -17.67499 1641.81535 -0.011 0.991
## factor(EDUC)201 -17.77450 1641.81536 -0.011 0.991
## factor(EDUC)202 -17.05851 1641.81534 -0.010 0.992
## factor(EDUC)203 -17.49563 1641.81534 -0.011 0.991
## factor(EDUC)204 -18.00978 1641.81537 -0.011 0.991
## factor(EDUC)301 -18.35416 1641.81532 -0.011 0.991
## factor(EDUC)302 -18.04490 1641.81534 -0.011 0.991
## factor(EDUC)401 -18.20793 1641.81532 -0.011 0.991
## factor(EDUC)402 -18.45489 1641.81533 -0.011 0.991
## factor(EDUC)403 -18.62186 1641.81540 -0.011 0.991
## factor(EDUC)500 -18.93650 1641.81533 -0.012 0.991
## factor(EDUC)601 -19.24781 1641.81542 -0.012 0.991
## factor(EDUC)602 -33.14172 1766.93466 -0.019 0.985
## factor(EDUC)603 -32.84157 1911.52919 -0.017 0.986
## factor(EDUC)999 -17.42541 1641.81578 -0.011 0.992
## scale(DRINKER) -0.37704 0.07033 -5.361 8.28e-08 ***
## scale(SMOKER) 0.06941 0.08139 0.853 0.394
## scale(SNUFF) -0.08686 0.08089 -1.074 0.283
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1241.8 on 1357 degrees of freedom
## Residual deviance: 1121.7 on 1332 degrees of freedom
## AIC: 1173.7
##
## Number of Fisher Scoring iterations: 15
tr_pred<- predict(glm1,
newdata = data1,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.26015331 0.12968960 0.11481002 0.12053383 0.16875934 0.07684211
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=data1$BADHEALTH)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of getting bad health by education attainment, consuming alcohol, smoking, and using smokeless tobaco", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
ggtitle(label = "Probability of getting bad health by education attainment consuming alcohol, smoking, and using smokeless tobaco", subtitle = "Truth")+
geom_vline(xintercept=.5)
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/integer. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/integer. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tr_predcl,data1$BADHEALTH)
##
## tr_predcl 0 1
## 0 1113 212
## 1 13 20
cm1<-confusionMatrix(data = factor(tr_predcl),
reference = factor(data1$BADHEALTH) )
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1113 212
## 1 13 20
##
## Accuracy : 0.8343
## 95% CI : (0.8135, 0.8537)
## No Information Rate : 0.8292
## P-Value [Acc > NIR] : 0.3219
##
## Kappa : 0.1132
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98845
## Specificity : 0.08621
## Pos Pred Value : 0.84000
## Neg Pred Value : 0.60606
## Prevalence : 0.82916
## Detection Rate : 0.81959
## Detection Prevalence : 0.97570
## Balanced Accuracy : 0.53733
##
## 'Positive' Class : 0
##
#accuracy is 83.4 percent, sensitivity is 98.8 percent and specificity is 8.6 percent
tr_predcl<-factor(ifelse(tr_pred>mean(I(data1$BADHEALTH==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=data1$BADHEALTH)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of getting bad health by education attainment consuming alcohol, smoking, and using smokeless tobaco", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(data1$BADHEALTH==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
ggplot(aes(x=pr, fill=modcon))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of getting bad health by education attainment consuming alcohol, smoking, and using smokeless tobaco", subtitle = "Truth")+
geom_vline(xintercept=mean(I(data1$BADHEALTH==1)))
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/integer. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data = factor(tr_predcl),
factor(data1$BADHEALTH),
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 866 110
## 1 260 122
##
## Accuracy : 0.7275
## 95% CI : (0.703, 0.7511)
## No Information Rate : 0.8292
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2347
##
## Mcnemar's Test P-Value : 9.473e-15
##
## Sensitivity : 0.52586
## Specificity : 0.76909
## Pos Pred Value : 0.31937
## Neg Pred Value : 0.88730
## Prevalence : 0.17084
## Detection Rate : 0.08984
## Detection Prevalence : 0.28130
## Balanced Accuracy : 0.64748
##
## 'Positive' Class : 1
##
#use the mean of the response as the cutoff value
#accuracy decreased to 72.8 percent, sensitivity decreased to 52.6 percent, and increased the specificity to 76.9 percent
pred_test<-predict(glm1,
newdata=datatest,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(datatest$BADHEALTH==1)), 1, 0))
table(datatest$BADHEALTH,pred_cl)
## pred_cl
## 0 1
## 0 151 81
## 1 17 22
confusionMatrix(data = factor(pred_cl),factor
(datatest$BADHEALTH) )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 151 17
## 1 81 22
##
## Accuracy : 0.6384
## 95% CI : (0.5781, 0.6956)
## No Information Rate : 0.8561
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1278
##
## Mcnemar's Test P-Value : 1.966e-10
##
## Sensitivity : 0.6509
## Specificity : 0.5641
## Pos Pred Value : 0.8988
## Neg Pred Value : 0.2136
## Prevalence : 0.8561
## Detection Rate : 0.5572
## Detection Prevalence : 0.6199
## Balanced Accuracy : 0.6075
##
## 'Positive' Class : 0
##
#the testing does just as good as the training