##load libraries
library(car,quietly = T)
library(stargazer,quietly = T)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey,quietly = T)
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr,quietly = T)
library(dplyr,quietly = T)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tableone,quietly = T)
library(haven,quietly = T)
library(ggplot2, quietly = T)
##load data
nhis2 <- read_stata("C:/Users/okabe/OneDrive/Pictures/Stats 2/nhis_00002.dta/nhis_00002.dta")
nhis2<-zap_labels(nhis2)
View(nhis2)
##Preview complete cases. Note: na.omit() does not modify nhis2 in place;
##missing values are handled later with filter(complete.cases(.))
na.omit(nhis2)
## # A tibble: 112,053 x 32
## year serial strata psu nhishid hhweight region pernum nhispid hhx fmx
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 2014 2 6133 1 000020~ 3858 3 1 002014~ 0000~ 01
## 2 2014 3 6158 2 000020~ 4215 3 1 002014~ 0000~ 01
## 3 2014 3 6158 2 000020~ 4215 3 2 002014~ 0000~ 01
## 4 2014 3 6158 2 000020~ 4215 3 3 002014~ 0000~ 01
## 5 2014 5 6287 1 000020~ 3771 4 1 002014~ 0000~ 01
## 6 2014 5 6287 1 000020~ 3771 4 2 002014~ 0000~ 01
## 7 2014 5 6287 1 000020~ 3771 4 3 002014~ 0000~ 01
## 8 2014 6 6078 2 000020~ 3300 2 1 002014~ 0000~ 01
## 9 2014 8 6282 1 000020~ 3482 4 1 002014~ 0000~ 01
## 10 2014 10 6256 2 000020~ 3330 4 1 002014~ 0000~ 01
## # ... with 112,043 more rows, and 21 more variables: px <chr>, perweight <dbl>,
## # sampweight <dbl>, fweight <dbl>, astatflg <dbl>, cstatflg <dbl>, age <dbl>,
## # sex <dbl>, marstat <dbl>, hispeth <dbl>, racesr <dbl>, usborn <dbl>,
## # citizen <dbl>, educ <dbl>, empstat <dbl>, pooryn <dbl>, alclife <dbl>,
## # smokev <dbl>, mortucodld <dbl>, mortwt <dbl>, mortwtsa <dbl>
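##A quick check of how much missingness remains in each variable before recoding
colSums(is.na(nhis2))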
##My binary outcome remains poverty. I would like to see whether any of these variables affect the likelihood of living in poverty.
##Recode variables
#poor
nhis2$poor<-Recode(nhis2$pooryn, recodes ="7:9=NA; 1=1;2=0",as.factor=T)
##Alcohol
nhis2$alcohol<-Recode(nhis2$alclife, recodes ="7:9=NA; 1=1;2=0;0=NA")
##smoking
nhis2$smoke<-Recode(nhis2$smokev, recodes ="7:9=NA;1=1;2=0; 0=NA")
##usborn
nhis2$born<-Recode(nhis2$usborn, recodes ="96:99=NA; 20=1;10:12=0")
##educ
nhis2$education<-Recode(nhis2$educ, recodes="101='0Prim'; 101:204='1somehs'; 300:302='2hsgrad'; 400:401='3somecol'; 500='4colgrad';600:603='5masteranddoc';402:403='6Techandacademic';604:999=NA;000=NA;100=NA", as.factor=T)
##citizen
nhis2$mycitizen<-Recode(nhis2$citizen, recodes ="7:9=NA; 1=1;2=0")
##sex
nhis2$male<-as.factor(ifelse(nhis2$sex==1, "Male", "Female"))
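##Sanity-check a recode: tabulate the new education factor, keeping NAs visible
table(nhis2$education, useNA = "ifany")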
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~strata, weights=~mortwt, data =nhis2 )
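##Quick check that the design object picked up the strata and weights as intended
summary(des)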
#Logit model
fit.logit<-svyglm(poor~male+mycitizen+education+alcohol+smoke+born,
design= des,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = poor ~ male + mycitizen + education + alcohol +
## smoke + born, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~strata, weights = ~mortwt, data = nhis2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.32010 0.26870 1.191 0.23356
## maleMale 0.16505 0.06117 2.698 0.00698 **
## mycitizen -0.45187 0.09675 -4.670 3.04e-06 ***
## education1somehs -0.04976 0.25663 -0.194 0.84627
## education2hsgrad 0.58796 0.25785 2.280 0.02261 *
## education3somecol 0.73529 0.26187 2.808 0.00500 **
## education4colgrad 1.58361 0.27202 5.822 5.98e-09 ***
## education5masteranddoc 2.08684 0.29534 7.066 1.69e-12 ***
## education6Techandacademic 1.24040 0.27377 4.531 5.93e-06 ***
## alcohol -0.34763 0.05872 -5.921 3.30e-09 ***
## smoke 0.30029 0.06496 4.623 3.83e-06 ***
## born 0.35292 0.07954 4.437 9.19e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.282438)
##
## Number of Fisher Scoring iterations: 4
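##stargazer was loaded above for table formatting; a minimal sketch of a text
##table for the survey logit (assuming stargazer's svyglm support, since
##svyglm objects inherit from glm)
stargazer(fit.logit, type = "text", single.row = TRUE)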
##logistic regression model
library(dplyr)
model.dat2<-nhis2%>%
filter( complete.cases(.))%>%
dplyr::select(poor, region, male,age,education,alcohol,mycitizen)
knitr::kable(head(model.dat2))
| poor | region | male | age | education | alcohol | mycitizen |
|---|---|---|---|---|---|---|
| 1 | 4 | Female | 85 | 2hsgrad | 0 | 0 |
| 1 | 4 | Female | 36 | 4colgrad | 1 | 0 |
| 1 | 2 | Female | 42 | 3somecol | 1 | 0 |
| 1 | 4 | Male | 50 | 3somecol | 0 | 0 |
| 1 | 4 | Female | 54 | 5masteranddoc | 0 | 0 |
| 1 | 3 | Male | 46 | 2hsgrad | 0 | 0 |
##Note: as.numeric() on a factor returns the underlying level codes (1/2); this affects nhis2 only, while model.dat2$poor (used below) remains a factor
nhis2$poor = as.numeric(nhis2$poor)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
set.seed(1115)
train<- createDataPartition(y = model.dat2$poor,
                            p = .80,
                            list=F)[,1] #take the first column so tibble row-indexing gets a plain integer vector (avoids the tibble 3.0 matrix-index warning)
model.dat2train<-model.dat2[train,]
model.dat2test<-model.dat2[-train,]
table(model.dat2train$poor)
##
## 0 1
## 2317 7594
prop.table(table(model.dat2train$poor))
##
## 0 1
## 0.2337806 0.7662194
summary(model.dat2train)
## poor region male age education
## 0:2317 Min. :1.000 Female:6709 Min. :18.00 0Prim : 71
## 1:7594 1st Qu.:2.000 Male :3202 1st Qu.:35.00 1somehs :2095
## Median :3.000 Median :52.00 2hsgrad :2917
## Mean :2.785 Mean :51.55 3somecol :1830
## 3rd Qu.:4.000 3rd Qu.:67.00 4colgrad :1236
## Max. :4.000 Max. :85.00 5masteranddoc : 768
## 6Techandacademic: 994
## alcohol mycitizen
## Min. :0.000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.000 Median :0.0000
## Mean :0.565 Mean :0.1256
## 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.0000
##
glm1<-glm(poor~scale(mycitizen)+scale(age)+factor(male)+factor(education)+scale(alcohol),
data=model.dat2train,
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = poor ~ scale(mycitizen) + scale(age) + factor(male) +
## factor(education) + scale(alcohol), family = binomial, data = model.dat2train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6628 0.3030 0.5415 0.7407 1.5436
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.01232 0.24537 -0.050 0.959967
## scale(mycitizen) -0.10101 0.02388 -4.230 2.34e-05 ***
## scale(age) 0.38058 0.02627 14.485 < 2e-16 ***
## factor(male)Male 0.29737 0.05567 5.342 9.19e-08 ***
## factor(education)1somehs 0.27377 0.24755 1.106 0.268765
## factor(education)2hsgrad 1.06572 0.24877 4.284 1.84e-05 ***
## factor(education)3somecol 1.36424 0.25291 5.394 6.89e-08 ***
## factor(education)4colgrad 2.19935 0.26286 8.367 < 2e-16 ***
## factor(education)5masteranddoc 2.54001 0.28304 8.974 < 2e-16 ***
## factor(education)6Techandacademic 1.81985 0.26299 6.920 4.52e-12 ***
## scale(alcohol) -0.09635 0.02587 -3.724 0.000196 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10779.3 on 9910 degrees of freedom
## Residual deviance: 9701.9 on 9900 degrees of freedom
## AIC: 9723.9
##
## Number of Fisher Scoring iterations: 5
##The model shows significant associations between the outcome and citizenship, age, gender, and education.
##The log-odds change roughly linearly with age. Being a citizen raises the likelihood of poverty, higher educational attainment lowers it, and lifetime alcohol use raises it.
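##To make these effects easier to read, exponentiate the coefficients into odds
##ratios with Wald confidence intervals (a quick sketch)
round(exp(cbind(OR = coef(glm1), confint.default(glm1))), 2)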
##3a) Does changing the decision-rule threshold affect classification accuracy?
##Create class probabilities
tr_pred<- predict(glm1,
newdata = model.dat2train,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.8656419 0.8621390 0.8540418 0.8004379 0.8893209 0.8538094
##Using decision rule
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=model.dat2train$poor)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of poverty", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
ggtitle(label = "Probability of poverty", subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##For train
##create table
table( tr_predcl,model.dat2train$poor)
##
## tr_predcl 0 1
## 0 336 320
## 1 1981 7274
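##To answer 3a directly, a minimal sketch that sweeps a grid of cutoffs and
##records training accuracy at each one
ths <- seq(0.1, 0.9, by = 0.1)
acc <- sapply(ths, function(t) {
  cl <- ifelse(tr_pred > t, 1, 0)
  mean(cl == as.numeric(as.character(model.dat2train$poor)))
})
data.frame(threshold = ths, accuracy = round(acc, 3))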
##mean of response
tr_predcl2<-factor(ifelse(tr_pred>mean(I(model.dat2train$poor==1)), 1, 0))
pred2<-data.frame(pr=tr_pred, gr=tr_predcl2, modcon=model.dat2train$poor)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.5)+
ggtitle(label = "Probability of poverty", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(model.dat2train$poor==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot(aes(x=pr, fill=modcon))+
geom_histogram(position="identity", alpha=.5)+
ggtitle(label = "Probability of Poverty", subtitle = "Truth")+
geom_vline(xintercept=mean(I(model.dat2train$poor==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Confusion matrix
cm1<-confusionMatrix(data = tr_predcl2,
reference = model.dat2train$poor )
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1591 2583
## 1 726 5011
##
## Accuracy : 0.6661
## 95% CI : (0.6567, 0.6754)
## No Information Rate : 0.7662
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.271
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6867
## Specificity : 0.6599
## Pos Pred Value : 0.3812
## Neg Pred Value : 0.8735
## Prevalence : 0.2338
## Detection Rate : 0.1605
## Detection Prevalence : 0.4211
## Balanced Accuracy : 0.6733
##
## 'Positive' Class : 0
##
##With the mean threshold, training accuracy is 66.6%, with sensitivity of 68.7% and specificity of 66.0% (positive class = 0).
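##The quoted figures can be pulled straight from the caret object
round(c(cm1$overall["Accuracy"], cm1$byClass[c("Sensitivity", "Specificity")]), 3)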
##Test
tt_pred<- predict(glm1,
newdata = model.dat2test,
type = "response")
head(tt_pred)
## 1 2 3 4 5 6
## 0.7533802 0.9384372 0.9049033 0.9273640 0.9386347 0.6463174
##Using decision rule
tt_predcl<-factor(ifelse(tt_pred>.5, 1, 0))
library(ggplot2)
pret1<-data.frame(pr=tt_pred, gr=tt_predcl, modcon=model.dat2test$poor)
pret1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of poverty", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pret1%>%
ggplot()+
geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
ggtitle(label = "Probability of poverty", subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Test
##mean
tt_predcl<-factor(ifelse(tt_pred>mean(I(model.dat2test$poor==1)), 1, 0))
pret2<-data.frame(pr=tt_pred, gr=tt_predcl, modcon=model.dat2test$poor)
pret2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.5)+
ggtitle(label = "Probability of poverty", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(model.dat2test$poor==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pret2%>%
ggplot(aes(x=pr, fill=modcon))+
geom_histogram(position="identity", alpha=.5)+
ggtitle(label = "Probability of Poverty", subtitle = "Truth")+
geom_vline(xintercept=mean(I(model.dat2test$poor==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Most predicted probabilities in the test data fall above the mean threshold, so the bulk of cases are classified as poor = 1.
##Confusion matrix to test accuracy
cm2<-confusionMatrix(data = tt_predcl,
reference = model.dat2test$poor )
cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 397 639
## 1 182 1259
##
## Accuracy : 0.6686
## 95% CI : (0.6496, 0.6871)
## No Information Rate : 0.7662
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2739
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6857
## Specificity : 0.6633
## Pos Pred Value : 0.3832
## Neg Pred Value : 0.8737
## Prevalence : 0.2338
## Detection Rate : 0.1603
## Detection Prevalence : 0.4182
## Balanced Accuracy : 0.6745
##
## 'Positive' Class : 0
##
##Test accuracy is 66.9%, slightly higher than the 66.6% in the training data. Sensitivity is essentially unchanged (68.6% vs 68.7%) and specificity rises slightly to 66.3%, so the model generalizes well to unseen data.
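##Side-by-side check of the two confusion matrices (cm1 = train, cm2 = test)
rbind(train = cm1$overall[c("Accuracy", "Kappa")],
      test = cm2$overall[c("Accuracy", "Kappa")])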
##More confusion matrix calculation for train
confusionMatrix(data = tr_predcl,
model.dat2train$poor,
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 336 320
## 1 1981 7274
##
## Accuracy : 0.7678
## 95% CI : (0.7594, 0.7761)
## No Information Rate : 0.7662
## P-Value [Acc > NIR] : 0.3572
##
## Kappa : 0.137
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9579
## Specificity : 0.1450
## Pos Pred Value : 0.7860
## Neg Pred Value : 0.5122
## Prevalence : 0.7662
## Detection Rate : 0.7339
## Detection Prevalence : 0.9338
## Balanced Accuracy : 0.5514
##
## 'Positive' Class : 1
##
##With the 0.5 threshold and positive class = 1, accuracy rises to 76.8% and sensitivity to 95.8%, but specificity collapses to 14.5%: the rule labels nearly every case as 1, so the majority class dominates.
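##The class imbalance behind those numbers: share of cases the 0.5 rule assigns to each class
prop.table(table(tr_predcl))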
##More confusion-matrix calculations for the test data using the mean-based decision rule
pred_test<-predict(glm1,
newdata=model.dat2test,
type="response")
#pred_cl reproduces tt_predcl from above; the table and confusionMatrix below use tt_predcl
pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$poor==1)), 1, 0))
table(model.dat2test$poor,tt_predcl)
## tt_predcl
## 0 1
## 0 397 182
## 1 639 1259
confusionMatrix(data = tt_predcl,model.dat2test$poor)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 397 639
## 1 182 1259
##
## Accuracy : 0.6686
## 95% CI : (0.6496, 0.6871)
## No Information Rate : 0.7662
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2739
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6857
## Specificity : 0.6633
## Pos Pred Value : 0.3832
## Neg Pred Value : 0.8737
## Prevalence : 0.2338
## Detection Rate : 0.1603
## Detection Prevalence : 0.4182
## Balanced Accuracy : 0.6745
##
## 'Positive' Class : 0
##
##On the test data with the mean threshold, accuracy is 66.9%: lower than the 76.8% from the 0.5 threshold, but sensitivity (68.6%) and specificity (66.3%) are far more balanced across the two classes.
##From the table: 1,259 of the 2,477 test cases were correctly classified as poor = 1.
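##Verify that accuracy from the confusion-matrix counts themselves
sum(diag(cm2$table)) / sum(cm2$table)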
##More confusion matrix calculation for test
##Under the same mean threshold, train (66.6%) and test (66.9%) accuracy are nearly identical, so there is little evidence of overfitting.
confusionMatrix(data = tt_predcl,
model.dat2test$poor,
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 397 639
## 1 182 1259
##
## Accuracy : 0.6686
## 95% CI : (0.6496, 0.6871)
## No Information Rate : 0.7662
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2739
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6633
## Specificity : 0.6857
## Pos Pred Value : 0.8737
## Neg Pred Value : 0.3832
## Prevalence : 0.7662
## Detection Rate : 0.5083
## Detection Prevalence : 0.5818
## Balanced Accuracy : 0.6745
##
## 'Positive' Class : 1
##