library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(questionr)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(tableone)

load("~/ipumstor/environment.RData")

summary(data)
##       YEAR          SERIAL          STRATA          PSU       
##  Min.   :2010   Min.   :    1   Min.   :6001   Min.   :1.000  
##  1st Qu.:2010   1st Qu.:10742   1st Qu.:6081   1st Qu.:1.000  
##  Median :2010   Median :21377   Median :6158   Median :2.000  
##  Mean   :2010   Mean   :21438   Mean   :6156   Mean   :1.507  
##  3rd Qu.:2010   3rd Qu.:32091   3rd Qu.:6238   3rd Qu.:2.000  
##  Max.   :2010   Max.   :43208   Max.   :6300   Max.   :2.000  
##    NHISHID             HHWEIGHT         PERNUM         NHISPID         
##  Length:89976       Min.   :  991   Min.   : 1.000   Length:89976      
##  Class :character   1st Qu.: 1788   1st Qu.: 1.000   Class :character  
##  Mode  :character   Median : 2808   Median : 2.000   Mode  :character  
##                     Mean   : 2822   Mean   : 2.271                     
##                     3rd Qu.: 3663   3rd Qu.: 3.000                     
##                     Max.   :21312   Max.   :17.000                     
##      HHX                FMX                 PX              PERWEIGHT    
##  Length:89976       Length:89976       Length:89976       Min.   :    0  
##  Class :character   Class :character   Class :character   1st Qu.: 2093  
##  Mode  :character   Mode  :character   Mode  :character   Median : 3356  
##                                                           Mean   : 3380  
##                                                           3rd Qu.: 4394  
##                                                           Max.   :28756  
##    SAMPWEIGHT        FWEIGHT         ASTATFLG        CSTATFLG     
##  Min.   :     0   Min.   :  897   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:     0   1st Qu.: 1917   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :     0   Median : 3123   Median :1.000   Median :0.0000  
##  Mean   :  3380   Mean   : 3184   Mean   :1.512   Mean   :0.5383  
##  3rd Qu.:  5466   3rd Qu.: 4184   3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :104188   Max.   :26845   Max.   :5.000   Max.   :5.0000  
##       EDUC           HEALTH         ALCSTAT1       SMOKFREQNOW    
##  Min.   :  0.0   Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:109.0   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :301.0   Median :2.000   Median :0.0000   Median :0.0000  
##  Mean   :301.7   Mean   :2.149   Mean   :0.7574   Mean   :0.2228  
##  3rd Qu.:402.0   3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :999.0   Max.   :9.000   Max.   :9.0000   Max.   :8.0000  
##   SNUFFREQNOW       BADHEALTH        
##  Min.   :0.00000   Length:89976      
##  1st Qu.:0.00000   Class :character  
##  Median :0.00000   Mode  :character  
##  Mean   :0.06024                     
##  3rd Qu.:0.00000                     
##  Max.   :9.00000
#first I recode to make binary variables
data$BADHEALTH<-Recode(data$HEALTH, recodes="4:5=1; 1:3=0; else=NA")
summary(data$BADHEALTH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.1047  0.0000  1.0000     116
data$DRINKER<-Recode(data$ALCSTAT1, recodes="3=1; 1:2=0; else=NA")
summary(data$DRINKER)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    1.00    0.62    1.00    1.00   63282
data$SMOKER<-Recode(data$SMOKFREQNOW, recodes="1=0; 2:3=1; else=NA")
summary(data$SMOKER)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00    0.47    1.00    1.00   79092
data$SNUFF<-Recode(data$SNUFFREQNOW, recodes="1:2=1; 3=0; else=NA")
summary(data$SNUFF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.0     0.0     0.2     0.0     1.0   87955
data1 <- na.omit(data)

#can health status be determined by education level, whether they consume alcohol, smokes tobacco, and use smokeless tobacco?

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(1126)
train<- createDataPartition(y = data1$BADHEALTH,
                            p = .80,
                            list=F)

datatrain<-data1[train,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
datatest<-data1[-train,]

table(data1$BADHEALTH)
## 
##    0    1 
## 1126  232
prop.table(table(data1$BADHEALTH))
## 
##         0         1 
## 0.8291605 0.1708395
summary(datatrain)
##       YEAR          SERIAL          STRATA          PSU       
##  Min.   :2010   Min.   :   12   Min.   :6001   Min.   :1.000  
##  1st Qu.:2010   1st Qu.: 9194   1st Qu.:6092   1st Qu.:1.000  
##  Median :2010   Median :20634   Median :6169   Median :2.000  
##  Mean   :2010   Mean   :20828   Mean   :6163   Mean   :1.512  
##  3rd Qu.:2010   3rd Qu.:32311   3rd Qu.:6226   3rd Qu.:2.000  
##  Max.   :2010   Max.   :43201   Max.   :6300   Max.   :2.000  
##    NHISHID             HHWEIGHT         PERNUM        NHISPID         
##  Length:1087        Min.   :  991   Min.   :1.000   Length:1087       
##  Class :character   1st Qu.: 2933   1st Qu.:1.000   Class :character  
##  Mode  :character   Median : 3342   Median :1.000   Mode  :character  
##                     Mean   : 3384   Mean   :1.385                     
##                     3rd Qu.: 3949   3rd Qu.:2.000                     
##                     Max.   :10822   Max.   :8.000                     
##      HHX                FMX                 PX              PERWEIGHT    
##  Length:1087        Length:1087        Length:1087        Min.   : 1000  
##  Class :character   Class :character   Class :character   1st Qu.: 3374  
##  Mode  :character   Mode  :character   Mode  :character   Median : 4189  
##                                                           Mean   : 4126  
##                                                           3rd Qu.: 4841  
##                                                           Max.   :13102  
##    SAMPWEIGHT       FWEIGHT         ASTATFLG    CSTATFLG      EDUC      
##  Min.   :  853   Min.   : 1000   Min.   :1   Min.   :0   Min.   :101.0  
##  1st Qu.: 5428   1st Qu.: 3231   1st Qu.:1   1st Qu.:0   1st Qu.:301.0  
##  Median : 8397   Median : 3996   Median :1   Median :0   Median :401.0  
##  Mean   : 9696   Mean   : 3952   Mean   :1   Mean   :0   Mean   :354.9  
##  3rd Qu.:12161   3rd Qu.: 4654   3rd Qu.:1   3rd Qu.:0   3rd Qu.:402.0  
##  Max.   :47011   Max.   :13102   Max.   :1   Max.   :0   Max.   :999.0  
##      HEALTH         ALCSTAT1      SMOKFREQNOW     SNUFFREQNOW   
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:3.000   1st Qu.:1.000   1st Qu.:3.000  
##  Median :2.000   Median :3.000   Median :2.000   Median :3.000  
##  Mean   :2.483   Mean   :2.748   Mean   :1.927   Mean   :2.723  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :3.000   Max.   :3.000   Max.   :3.000  
##    BADHEALTH         DRINKER           SMOKER           SNUFF       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.1776   Mean   :0.7774   Mean   :0.5198   Mean   :0.1739  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
glm1<-glm(BADHEALTH~factor(EDUC)+scale(DRINKER)+scale(SMOKER)+scale(SNUFF),
          data=data1[,-1],
          family = binomial)
summary(glm1)
## 
## Call:
## glm(formula = BADHEALTH ~ factor(EDUC) + scale(DRINKER) + scale(SMOKER) + 
##     scale(SNUFF), family = binomial, data = data1[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7341  -0.5641  -0.5028  -0.3740   2.3902  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       16.53891 1641.81531   0.010    0.992    
## factor(EDUC)102  -16.77265 1641.81595  -0.010    0.992    
## factor(EDUC)103  -17.23366 1641.81592  -0.010    0.992    
## factor(EDUC)104   -0.66759 2907.46838   0.000    1.000    
## factor(EDUC)105  -16.97833 1641.81576  -0.010    0.992    
## factor(EDUC)106  -15.75160 1641.81569  -0.010    0.992    
## factor(EDUC)107  -17.37864 1641.81558  -0.011    0.992    
## factor(EDUC)108  -16.58578 1641.81541  -0.010    0.992    
## factor(EDUC)109  -17.67499 1641.81535  -0.011    0.991    
## factor(EDUC)201  -17.77450 1641.81536  -0.011    0.991    
## factor(EDUC)202  -17.05851 1641.81534  -0.010    0.992    
## factor(EDUC)203  -17.49563 1641.81534  -0.011    0.991    
## factor(EDUC)204  -18.00978 1641.81537  -0.011    0.991    
## factor(EDUC)301  -18.35416 1641.81532  -0.011    0.991    
## factor(EDUC)302  -18.04490 1641.81534  -0.011    0.991    
## factor(EDUC)401  -18.20793 1641.81532  -0.011    0.991    
## factor(EDUC)402  -18.45489 1641.81533  -0.011    0.991    
## factor(EDUC)403  -18.62186 1641.81540  -0.011    0.991    
## factor(EDUC)500  -18.93650 1641.81533  -0.012    0.991    
## factor(EDUC)601  -19.24781 1641.81542  -0.012    0.991    
## factor(EDUC)602  -33.14172 1766.93466  -0.019    0.985    
## factor(EDUC)603  -32.84157 1911.52919  -0.017    0.986    
## factor(EDUC)999  -17.42541 1641.81578  -0.011    0.992    
## scale(DRINKER)    -0.37704    0.07033  -5.361 8.28e-08 ***
## scale(SMOKER)      0.06941    0.08139   0.853    0.394    
## scale(SNUFF)      -0.08686    0.08089  -1.074    0.283    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1241.8  on 1357  degrees of freedom
## Residual deviance: 1121.7  on 1332  degrees of freedom
## AIC: 1173.7
## 
## Number of Fisher Scoring iterations: 15
tr_pred<- predict(glm1,
                  newdata = data1,
                  type = "response")

head(tr_pred)
##          1          2          3          4          5          6 
## 0.26015331 0.12968960 0.11481002 0.12053383 0.16875934 0.07684211
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=data1$BADHEALTH)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of getting bad health by education attainment, consuming alcohol, smoking, and using smokeless tobaco", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of getting bad health by education attainment consuming alcohol, smoking, and using smokeless tobaco", subtitle = "Truth")+
  geom_vline(xintercept=.5)
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/integer. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/integer. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tr_predcl,data1$BADHEALTH)
##          
## tr_predcl    0    1
##         0 1113  212
##         1   13   20
cm1<-confusionMatrix(data = factor(tr_predcl),
                     reference = factor(data1$BADHEALTH) )
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1113  212
##          1   13   20
##                                           
##                Accuracy : 0.8343          
##                  95% CI : (0.8135, 0.8537)
##     No Information Rate : 0.8292          
##     P-Value [Acc > NIR] : 0.3219          
##                                           
##                   Kappa : 0.1132          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.98845         
##             Specificity : 0.08621         
##          Pos Pred Value : 0.84000         
##          Neg Pred Value : 0.60606         
##              Prevalence : 0.82916         
##          Detection Rate : 0.81959         
##    Detection Prevalence : 0.97570         
##       Balanced Accuracy : 0.53733         
##                                           
##        'Positive' Class : 0               
## 
#accuracy is 83.4 percent, sensitivity is 98.8 percent and specificity is 8.6 percent

tr_predcl<-factor(ifelse(tr_pred>mean(I(data1$BADHEALTH==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=data1$BADHEALTH)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of getting bad health by education attainment consuming alcohol, smoking, and using smokeless tobaco", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(data1$BADHEALTH==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of getting bad health by education attainment consuming alcohol, smoking, and using smokeless tobaco", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(data1$BADHEALTH==1)))
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/integer. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data = factor(tr_predcl),
                factor(data1$BADHEALTH),
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 866 110
##          1 260 122
##                                          
##                Accuracy : 0.7275         
##                  95% CI : (0.703, 0.7511)
##     No Information Rate : 0.8292         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2347         
##                                          
##  Mcnemar's Test P-Value : 9.473e-15      
##                                          
##             Sensitivity : 0.52586        
##             Specificity : 0.76909        
##          Pos Pred Value : 0.31937        
##          Neg Pred Value : 0.88730        
##              Prevalence : 0.17084        
##          Detection Rate : 0.08984        
##    Detection Prevalence : 0.28130        
##       Balanced Accuracy : 0.64748        
##                                          
##        'Positive' Class : 1              
## 
#use the mean of the response as the cutoff value
#accuracy decreased to 72.8 percent, sensitivity decreased to 52.6 percent, and increased the specificity to 76.9 percent

pred_test<-predict(glm1,
                   newdata=datatest,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(datatest$BADHEALTH==1)), 1, 0))

table(datatest$BADHEALTH,pred_cl)
##    pred_cl
##       0   1
##   0 151  81
##   1  17  22
confusionMatrix(data = factor(pred_cl),factor
                (datatest$BADHEALTH) )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 151  17
##          1  81  22
##                                           
##                Accuracy : 0.6384          
##                  95% CI : (0.5781, 0.6956)
##     No Information Rate : 0.8561          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1278          
##                                           
##  Mcnemar's Test P-Value : 1.966e-10       
##                                           
##             Sensitivity : 0.6509          
##             Specificity : 0.5641          
##          Pos Pred Value : 0.8988          
##          Neg Pred Value : 0.2136          
##              Prevalence : 0.8561          
##          Detection Rate : 0.5572          
##    Detection Prevalence : 0.6199          
##       Balanced Accuracy : 0.6075          
##                                           
##        'Positive' Class : 0               
## 
#the testing does just as good as the training