##load libraries
library(car,quietly = T)
library(stargazer,quietly = T)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey,quietly = T)
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr,quietly = T)
library(dplyr,quietly = T)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tableone,quietly = T)
library(haven,quietly = T)
library(ggplot2, quietly = T)
##load data

nhis2 <- read_stata("C:/Users/okabe/OneDrive/Pictures/Stats 2/nhis_00002.dta/nhis_00002.dta")
nhis2<-zap_labels(nhis2)
View(nhis2)
##preview the data with NA rows removed (note: the result is not assigned here;
##complete cases are selected explicitly later with complete.cases())
na.omit(nhis2)
## # A tibble: 112,053 x 32
##     year serial strata   psu nhishid hhweight region pernum nhispid hhx   fmx  
##    <dbl>  <dbl>  <dbl> <dbl> <chr>      <dbl>  <dbl>  <dbl> <chr>   <chr> <chr>
##  1  2014      2   6133     1 000020~     3858      3      1 002014~ 0000~ 01   
##  2  2014      3   6158     2 000020~     4215      3      1 002014~ 0000~ 01   
##  3  2014      3   6158     2 000020~     4215      3      2 002014~ 0000~ 01   
##  4  2014      3   6158     2 000020~     4215      3      3 002014~ 0000~ 01   
##  5  2014      5   6287     1 000020~     3771      4      1 002014~ 0000~ 01   
##  6  2014      5   6287     1 000020~     3771      4      2 002014~ 0000~ 01   
##  7  2014      5   6287     1 000020~     3771      4      3 002014~ 0000~ 01   
##  8  2014      6   6078     2 000020~     3300      2      1 002014~ 0000~ 01   
##  9  2014      8   6282     1 000020~     3482      4      1 002014~ 0000~ 01   
## 10  2014     10   6256     2 000020~     3330      4      1 002014~ 0000~ 01   
## # ... with 112,043 more rows, and 21 more variables: px <chr>, perweight <dbl>,
## #   sampweight <dbl>, fweight <dbl>, astatflg <dbl>, cstatflg <dbl>, age <dbl>,
## #   sex <dbl>, marstat <dbl>, hispeth <dbl>, racesr <dbl>, usborn <dbl>,
## #   citizen <dbl>, educ <dbl>, empstat <dbl>, pooryn <dbl>, alclife <dbl>,
## #   smokev <dbl>, mortucodld <dbl>, mortwt <dbl>, mortwtsa <dbl>
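
## As a quick diagnostic (a minimal check), count missing values per column to see
## which variables would drive row loss under a complete-case filter.
colSums(is.na(nhis2))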
  1. Define a binary outcome of your choosing
## My binary outcome will remain poverty. I would like to see whether any of these variables affect one's quality of life.
  1. Fit a predictive logistic regression model using as many predictor variables as you think you need
##Recode variables
#poor
nhis2$poor<-Recode(nhis2$pooryn, recodes ="7:9=NA; 1=1;2=0",as.factor=T)

##Alcohol
nhis2$alcohol<-Recode(nhis2$alclife, recodes ="7:9=NA; 1=1;2=0;0=NA")

##smoking
nhis2$smoke<-Recode(nhis2$smokev, recodes ="7:9=NA;1=1;2=0; 0=NA")

##usborn
nhis2$born<-Recode(nhis2$usborn, recodes ="96:99=NA; 20=1;10:12=0")

##educ
nhis2$education<-Recode(nhis2$educ, recodes="101='0Prim'; 101:204='1somehs'; 300:302='2hsgrad'; 400:401='3somecol'; 500='4colgrad';600:603='5masteranddoc';402:403='6Techandacademic';604:999=NA;000=NA;100=NA", as.factor=T)

##Recode
##citizen
nhis2$mycitizen<-Recode(nhis2$citizen, recodes ="7:9=NA; 1=1;2=0")

##sex
nhis2$male<-as.factor(ifelse(nhis2$sex==1, "Male", "Female"))
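
## To verify a recode behaved as intended, cross-tabulate the original codes against
## the recoded values (shown here for poverty; the same check works for the other
## recoded variables).
table(nhis2$pooryn, nhis2$poor, useNA = "ifany")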



#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~strata, weights=~mortwt, data =nhis2 )
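
## Sanity check on the declared design: the weighted distribution of the recoded
## outcome (a minimal sketch; na.rm = TRUE drops the recoded missing codes).
svymean(~poor, des, na.rm = TRUE)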


#Logit model
fit.logit<-svyglm(poor~male+mycitizen+education+alcohol+smoke+born,
                  design= des,
                  family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
## 
## Call:
## svyglm(formula = poor ~ male + mycitizen + education + alcohol + 
##     smoke + born, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~strata, weights = ~mortwt, data = nhis2)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.32010    0.26870   1.191  0.23356    
## maleMale                   0.16505    0.06117   2.698  0.00698 ** 
## mycitizen                 -0.45187    0.09675  -4.670 3.04e-06 ***
## education1somehs          -0.04976    0.25663  -0.194  0.84627    
## education2hsgrad           0.58796    0.25785   2.280  0.02261 *  
## education3somecol          0.73529    0.26187   2.808  0.00500 ** 
## education4colgrad          1.58361    0.27202   5.822 5.98e-09 ***
## education5masteranddoc     2.08684    0.29534   7.066 1.69e-12 ***
## education6Techandacademic  1.24040    0.27377   4.531 5.93e-06 ***
## alcohol                   -0.34763    0.05872  -5.921 3.30e-09 ***
## smoke                      0.30029    0.06496   4.623 3.83e-06 ***
## born                       0.35292    0.07954   4.437 9.19e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.282438)
## 
## Number of Fisher Scoring iterations: 4
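## The "non-integer #successes" warning above is expected when a binomial model is fit
## with non-integer survey weights; refitting with family = quasibinomial is a common
## way to avoid it and yields the same point estimates (a sketch, not rerun here).
fit.logit.q <- svyglm(poor ~ male + mycitizen + education + alcohol + smoke + born,
                      design = des,
                      family = quasibinomial)
summary(fit.logit.q)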
##Build the complete-case data set for the predictive logistic regression model
library(dplyr)

model.dat2<-nhis2%>% 
  filter(complete.cases(.))%>%
  dplyr::select(poor, region, male, age, education, alcohol, mycitizen)

knitr::kable(head(model.dat2))
| poor | region | male   | age | education     | alcohol | mycitizen |
|------|--------|--------|-----|---------------|---------|-----------|
| 1    | 4      | Female | 85  | 2hsgrad       | 0       | 0         |
| 1    | 4      | Female | 36  | 4colgrad      | 1       | 0         |
| 1    | 2      | Female | 42  | 3somecol      | 1       | 0         |
| 1    | 4      | Male   | 50  | 3somecol      | 0       | 0         |
| 1    | 4      | Female | 54  | 5masteranddoc | 0       | 0         |
| 1    | 3      | Male   | 46  | 2hsgrad       | 0       | 0         |
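
## Before splitting, record how many complete cases survive the filter and how the
## outcome is distributed in the analytic sample (a quick check).
nrow(model.dat2)
prop.table(table(model.dat2$poor))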
  1. Use an 80% training/20% test split for your data.
## Note: as.numeric() on a factor returns the underlying level codes (1/2), and this
## line only changes nhis2; model.dat2$poor, used below, remains a 0/1 factor.
nhis2$poor = as.numeric(nhis2$poor)

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(1115)

train<- createDataPartition(y = model.dat2$poor,
                            p = .80,
                            list=F)



model.dat2train<-model.dat2[train,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
model.dat2test<-model.dat2[-train,]
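
## The tibble warning above comes from indexing with the one-column matrix that
## createDataPartition() returns; converting it to a plain vector gives the same
## split without the warning (an equivalent alternative, not rerun here).
train_idx <- as.vector(train)
model.dat2train <- model.dat2[train_idx, ]
model.dat2test  <- model.dat2[-train_idx, ]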

table(model.dat2train$poor)
## 
##    0    1 
## 2317 7594
prop.table(table(model.dat2train$poor))
## 
##         0         1 
## 0.2337806 0.7662194
summary(model.dat2train)
##  poor         region          male           age                   education   
##  0:2317   Min.   :1.000   Female:6709   Min.   :18.00   0Prim           :  71  
##  1:7594   1st Qu.:2.000   Male  :3202   1st Qu.:35.00   1somehs         :2095  
##           Median :3.000                 Median :52.00   2hsgrad         :2917  
##           Mean   :2.785                 Mean   :51.55   3somecol        :1830  
##           3rd Qu.:4.000                 3rd Qu.:67.00   4colgrad        :1236  
##           Max.   :4.000                 Max.   :85.00   5masteranddoc   : 768  
##                                                         6Techandacademic: 994  
##     alcohol        mycitizen     
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000  
##  Median :1.000   Median :0.0000  
##  Mean   :0.565   Mean   :0.1256  
##  3rd Qu.:1.000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.0000  
## 
  1. Report the % correct classification from the training data using the .5 decision rule and again using the mean of the outcome decision rule
glm1<-glm(poor~scale(mycitizen)+scale(age)+factor(male)+factor(education)+scale(alcohol),
          data=model.dat2train,
          family = binomial)
summary(glm1)
## 
## Call:
## glm(formula = poor ~ scale(mycitizen) + scale(age) + factor(male) + 
##     factor(education) + scale(alcohol), family = binomial, data = model.dat2train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6628   0.3030   0.5415   0.7407   1.5436  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -0.01232    0.24537  -0.050 0.959967    
## scale(mycitizen)                  -0.10101    0.02388  -4.230 2.34e-05 ***
## scale(age)                         0.38058    0.02627  14.485  < 2e-16 ***
## factor(male)Male                   0.29737    0.05567   5.342 9.19e-08 ***
## factor(education)1somehs           0.27377    0.24755   1.106 0.268765    
## factor(education)2hsgrad           1.06572    0.24877   4.284 1.84e-05 ***
## factor(education)3somecol          1.36424    0.25291   5.394 6.89e-08 ***
## factor(education)4colgrad          2.19935    0.26286   8.367  < 2e-16 ***
## factor(education)5masteranddoc     2.54001    0.28304   8.974  < 2e-16 ***
## factor(education)6Techandacademic  1.81985    0.26299   6.920 4.52e-12 ***
## scale(alcohol)                    -0.09635    0.02587  -3.724 0.000196 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10779.3  on 9910  degrees of freedom
## Residual deviance:  9701.9  on 9900  degrees of freedom
## AIC: 9723.9
## 
## Number of Fisher Scoring iterations: 5
##The model above shows associations with citizenship, age, gender, and education.
##If poor == 1 corresponds to being at or above the poverty threshold (consistent with the roughly 77% prevalence of poor == 1 in the training data below), then being a citizen is associated with a higher likelihood of poverty, higher educational attainment with a lower likelihood, and lifetime alcohol use with a higher likelihood; the log-odds of the outcome also increase with age.
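
## To make the direction and size of these effects easier to read, the coefficients can
## be exponentiated into odds ratios with Wald confidence intervals (a minimal sketch
## using the fitted glm1).
or_tab <- exp(cbind(OR = coef(glm1), confint.default(glm1)))
round(or_tab, 3)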

3a) Does changing the decision rule threshold affect your classification accuracy?

##Create class probabilities
tr_pred<- predict(glm1,
                  newdata = model.dat2train,
                  type = "response")

head(tr_pred)
##         1         2         3         4         5         6 
## 0.8656419 0.8621390 0.8540418 0.8004379 0.8893209 0.8538094
##Using decision rule
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=model.dat2train$poor)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of poverty", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of poverty", subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
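
## To answer 3a directly: sweep several thresholds over the stored training
## probabilities and compare the % correctly classified at each (a minimal sketch;
## the mean-of-outcome threshold is recomputed inline).
thresholds <- c(0.3, 0.5, mean(I(model.dat2train$poor == 1)), 0.9)
truth_tr <- as.numeric(as.character(model.dat2train$poor))
acc_tr <- sapply(thresholds, function(t) mean(ifelse(tr_pred > t, 1, 0) == truth_tr))
round(data.frame(threshold = thresholds, accuracy = acc_tr), 3)
## With this imbalanced outcome, raw accuracy is highest near the 0.5 rule (where the
## majority class dominates), while the mean-of-outcome rule trades some accuracy for a
## better balance of sensitivity and specificity, as the confusion matrices below show.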

  1. Report the % correct classification from the test data using the .5 decision rule and again using the mean of the outcome decision rule
##For the training data, using the 0.5 decision rule
##create the confusion table
table( tr_predcl,model.dat2train$poor)
##          
## tr_predcl    0    1
##         0  336  320
##         1 1981 7274
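
## The % correctly classified under the .5 rule is the share of the table's diagonal;
## this should match the ~77% accuracy reported in the confusion matrix further below.
tab_05 <- table(tr_predcl, model.dat2train$poor)
sum(diag(tab_05)) / sum(tab_05)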
##mean-of-outcome decision rule
tr_predcl2<-factor(ifelse(tr_pred>mean(I(model.dat2train$poor==1)), 1, 0)) 

pred2<-data.frame(pr=tr_pred, gr=tr_predcl2, modcon=model.dat2train$poor)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.5)+
  ggtitle(label = "Probability of poverty", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat2train$poor==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.5)+
  ggtitle(label = "Probability of Poverty", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat2train$poor==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##Confusion matrix
cm1<-confusionMatrix(data = tr_predcl2,
                     reference = model.dat2train$poor )
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1591 2583
##          1  726 5011
##                                           
##                Accuracy : 0.6661          
##                  95% CI : (0.6567, 0.6754)
##     No Information Rate : 0.7662          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.271           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6867          
##             Specificity : 0.6599          
##          Pos Pred Value : 0.3812          
##          Neg Pred Value : 0.8735          
##              Prevalence : 0.2338          
##          Detection Rate : 0.1605          
##    Detection Prevalence : 0.4211          
##       Balanced Accuracy : 0.6733          
##                                           
##        'Positive' Class : 0               
## 
##Under the mean-of-outcome threshold, the training data are classified with about 67% accuracy, roughly 69% sensitivity, and 66% specificity (class 0 is treated as the positive class here).
##Test
tt_pred<- predict(glm1,
                  newdata = model.dat2test,
                  type = "response")

head(tt_pred)
##         1         2         3         4         5         6 
## 0.7533802 0.9384372 0.9049033 0.9273640 0.9386347 0.6463174
##Using decision rule
tt_predcl<-factor(ifelse(tt_pred>.5, 1, 0))

library(ggplot2)

pret1<-data.frame(pr=tt_pred, gr=tt_predcl, modcon=model.dat2test$poor)

pret1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of poverty", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pret1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of poverty", subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##Test
##mean
tt_predcl<-factor(ifelse(tt_pred>mean(I(model.dat2test$poor==1)), 1, 0)) 

pret2<-data.frame(pr=tt_pred, gr=tt_predcl, modcon=model.dat2test$poor)

pret2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.5)+
  ggtitle(label = "Probability of poverty", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat2test$poor==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pret2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.5)+
  ggtitle(label = "Probability of Poverty", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat2test$poor==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##The plots above show that the predicted probabilities are concentrated at high values, so the majority of test observations are classified as poor == 1, especially under the 0.5 threshold.
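
## Because tt_predcl was reassigned with the mean-of-outcome rule above, the .5-rule
## classification for the test data is no longer stored; both test % correct figures
## can be recomputed from tt_pred (a compact sketch).
truth_tt <- as.numeric(as.character(model.dat2test$poor))
cl_half  <- ifelse(tt_pred > 0.5, 1, 0)
cl_mean  <- ifelse(tt_pred > mean(I(model.dat2test$poor == 1)), 1, 0)
c(rule_0.5 = mean(cl_half == truth_tt), rule_mean = mean(cl_mean == truth_tt))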

##Confusion matrix to test accuracy

cm2<-confusionMatrix(data = tt_predcl,
                     reference = model.dat2test$poor )
cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  397  639
##          1  182 1259
##                                           
##                Accuracy : 0.6686          
##                  95% CI : (0.6496, 0.6871)
##     No Information Rate : 0.7662          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2739          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6857          
##             Specificity : 0.6633          
##          Pos Pred Value : 0.3832          
##          Neg Pred Value : 0.8737          
##              Prevalence : 0.2338          
##          Detection Rate : 0.1603          
##    Detection Prevalence : 0.4182          
##       Balanced Accuracy : 0.6745          
##                                           
##        'Positive' Class : 0               
## 
##In the test data, accuracy is about 67%, marginally higher than the 66.6% in the training data under the same rule. Sensitivity is essentially unchanged (about 69%) and specificity rises slightly to about 66%, which suggests the model generalizes well rather than overfitting the training set.
##More confusion matrix calculation for train
confusionMatrix(data = tr_predcl,
                model.dat2train$poor,
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  336  320
##          1 1981 7274
##                                           
##                Accuracy : 0.7678          
##                  95% CI : (0.7594, 0.7761)
##     No Information Rate : 0.7662          
##     P-Value [Acc > NIR] : 0.3572          
##                                           
##                   Kappa : 0.137           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9579          
##             Specificity : 0.1450          
##          Pos Pred Value : 0.7860          
##          Neg Pred Value : 0.5122          
##              Prevalence : 0.7662          
##          Detection Rate : 0.7339          
##    Detection Prevalence : 0.9338          
##       Balanced Accuracy : 0.5514          
##                                           
##        'Positive' Class : 1               
## 
##Using the 0.5 rule with class 1 as the positive class, training accuracy rises to about 77% and sensitivity to 96%, but specificity falls to only about 15%: the model predicts the majority class for almost every observation (detection prevalence 93%), so the higher accuracy largely reflects the class imbalance.
##More confusion matrix output for the test data (same mean-of-outcome rule, recomputed)
pred_test<-predict(glm1,
                   newdata=model.dat2test,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$poor==1)), 1, 0))

table(model.dat2test$poor,tt_predcl)
##    tt_predcl
##        0    1
##   0  397  182
##   1  639 1259
confusionMatrix(data = tt_predcl,model.dat2test$poor)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  397  639
##          1  182 1259
##                                           
##                Accuracy : 0.6686          
##                  95% CI : (0.6496, 0.6871)
##     No Information Rate : 0.7662          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2739          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6857          
##             Specificity : 0.6633          
##          Pos Pred Value : 0.3832          
##          Neg Pred Value : 0.8737          
##              Prevalence : 0.2338          
##          Detection Rate : 0.1603          
##    Detection Prevalence : 0.4182          
##       Balanced Accuracy : 0.6745          
##                                           
##        'Positive' Class : 0               
## 
##On the test data with the mean-of-outcome rule, accuracy is about 67%, lower than the 77% obtained in training with the 0.5 rule but essentially equal to the training accuracy under the same mean rule. Sensitivity (about 69%) and specificity (about 66%) are far better balanced than the 96%/15% split produced by the 0.5 rule.

##Table: 1,259 of the 2,477 test observations were correctly classified as poor == 1, and 397 were correctly classified as poor == 0.
##More confusion matrix calculation for test
##Comparing test and train: under the mean-of-outcome rule the two are essentially identical (about 67% accuracy each), so there is little evidence of overfitting; the 77% training figure above comes from the 0.5 rule, which achieves its higher accuracy by predicting the majority class almost everywhere.
confusionMatrix(data = tt_predcl,
                model.dat2test$poor,
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  397  639
##          1  182 1259
##                                           
##                Accuracy : 0.6686          
##                  95% CI : (0.6496, 0.6871)
##     No Information Rate : 0.7662          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2739          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6633          
##             Specificity : 0.6857          
##          Pos Pred Value : 0.8737          
##          Neg Pred Value : 0.3832          
##              Prevalence : 0.7662          
##          Detection Rate : 0.5083          
##    Detection Prevalence : 0.5818          
##       Balanced Accuracy : 0.6745          
##                                           
##        'Positive' Class : 1               
##
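
## Closing summary: train and test % correctly classified under both decision rules,
## recomputed from the stored probabilities (a compact sketch).
pct_correct <- function(p, y, t) mean(ifelse(p > t, 1, 0) == as.numeric(as.character(y)))
data.frame(
  rule  = c("0.5", "mean of outcome"),
  train = c(pct_correct(tr_pred, model.dat2train$poor, 0.5),
            pct_correct(tr_pred, model.dat2train$poor, mean(I(model.dat2train$poor == 1)))),
  test  = c(pct_correct(tt_pred, model.dat2test$poor, 0.5),
            pct_correct(tt_pred, model.dat2test$poor, mean(I(model.dat2test$poor == 1))))
)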