#1. Define a binary outcome of your choosing
self rated health status
2) Fit a predictive logistic regression model using as many predictor variables as you think you need
data$age2<- data$age^2
library(dplyr)
model.data<- data %>%
select(serial, badhealth, opportunity_youth_cat, urban_rural, male, educ, age2)
knitr::kable(head(model.data))
22 |
0 |
Not opportunity youth |
urban |
Male |
1Less than HS |
529 |
73 |
0 |
Not opportunity youth |
urban |
Female |
3More than HS |
400 |
81 |
0 |
Not opportunity youth |
urban |
Female |
2hsgrad |
484 |
82 |
0 |
Not opportunity youth |
urban |
Female |
3More than HS |
441 |
88 |
0 |
Not opportunity youth |
urban |
Male |
3More than HS |
484 |
136 |
0 |
Not opportunity youth |
urban |
Female |
3More than HS |
361 |
3) Use a 80% training/20% test split for your data
set.seed(1115)
train<- createDataPartition(y = model.data$badhealth,
p = .80,
list=F)
model.dat2train<-model.data[train,]
model.dat2test<-model.data[-train,]
table(model.dat2train$badhealth)
##
## 0 1
## 2848 162
prop.table(table(model.dat2train$badhealth))
##
## 0 1
## 0.9461794 0.0538206
summary(model.dat2train)
## serial badhealth opportunity_youth_cat urban_rural
## Min. : 22 Min. :0.00000 Opportunity youth : 312 rural: 374
## 1st Qu.: 8486 1st Qu.:0.00000 Not opportunity youth:2698 urban:2636
## Median :16876 Median :0.00000
## Mean :16646 Mean :0.05382
## 3rd Qu.:24905 3rd Qu.:0.00000
## Max. :33099 Max. :1.00000
## male educ age2
## Female:1514 1Less than HS: 295 Min. :324.0
## Male :1496 2hsgrad :1000 1st Qu.:400.0
## 3More than HS:1715 Median :441.0
## Mean :458.7
## 3rd Qu.:529.0
## Max. :576.0
Logistic regression for classification
glm1<-glm(badhealth~factor(opportunity_youth_cat)+factor(urban_rural)+scale(age2)+factor(male)+factor(educ),
data=model.dat2train[,-1],
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = badhealth ~ factor(opportunity_youth_cat) + factor(urban_rural) +
## scale(age2) + factor(male) + factor(educ), family = binomial,
## data = model.dat2train[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8531 -0.3388 -0.2971 -0.2622 2.7703
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.98218 0.29381 -3.343
## factor(opportunity_youth_cat)Not opportunity youth -0.93572 0.20295 -4.611
## factor(urban_rural)urban -0.35696 0.21904 -1.630
## scale(age2) 0.18906 0.08664 2.182
## factor(male)Male -0.41848 0.16714 -2.504
## factor(educ)2hsgrad -0.54191 0.24907 -2.176
## factor(educ)3More than HS -0.81846 0.25567 -3.201
## Pr(>|z|)
## (Intercept) 0.000829 ***
## factor(opportunity_youth_cat)Not opportunity youth 4.01e-06 ***
## factor(urban_rural)urban 0.103168
## scale(age2) 0.029107 *
## factor(male)Male 0.012289 *
## factor(educ)2hsgrad 0.029576 *
## factor(educ)3More than HS 0.001369 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1261.9 on 3009 degrees of freedom
## Residual deviance: 1212.1 on 3003 degrees of freedom
## AIC: 1226.1
##
## Number of Fisher Scoring iterations: 6
tr_pred<- predict(glm1,
newdata = model.dat2train,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.07345859 0.05954407 0.03062989 0.02547889 0.07228006 0.02547889
Using 50% as the predictor
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred,
gr=tr_predcl,
badht=model.dat2train$badhealth)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of Bad health",
subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=badht, fill=badht))+
ggtitle(label = "Probability of Bad health",
subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tr_predcl,
model.dat2train$badhealth)
##
## tr_predcl 0 1
## 0 2848 162
model.dat2train$badhealth<- as.factor(model.dat2train$badhealth)
cm1<-confusionMatrix(data = tr_predcl,
reference = model.dat2train$badhealth)
## Warning in confusionMatrix.default(data = tr_predcl, reference =
## model.dat2train$badhealth): Levels are not in the same order for reference and
## data. Refactoring data to match.
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2848 162
## 1 0 0
##
## Accuracy : 0.9462
## 95% CI : (0.9375, 0.954)
## No Information Rate : 0.9462
## P-Value [Acc > NIR] : 0.5209
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9462
## Neg Pred Value : NaN
## Prevalence : 0.9462
## Detection Rate : 0.9462
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
3) Report the % correct classification from the training data using the .5 decision rule
Overall the model has a 94.62% accuracy.
Using mean as a predictor
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$badhealth==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred,
gr=tr_predcl,
badht=model.dat2train$badhealth)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Bad health",
subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(model.dat2train$badhealth==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
ggplot(aes(x=pr, fill=badht))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Bad health",
subtitle = "Truth")+
geom_vline(xintercept=mean(I(model.dat2train$badhealth==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tr_predcl,
model.dat2train$badhealth)
##
## tr_predcl 0 1
## 0 1995 81
## 1 853 81
model.dat2train$badhealth<- as.factor(model.dat2train$badhealth)
confusionMatrix(data = tr_predcl,
model.dat2train$badhealth,
positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1995 81
## 1 853 81
##
## Accuracy : 0.6897
## 95% CI : (0.6728, 0.7062)
## No Information Rate : 0.9462
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0617
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.50000
## Specificity : 0.70049
## Pos Pred Value : 0.08672
## Neg Pred Value : 0.96098
## Prevalence : 0.05382
## Detection Rate : 0.02691
## Detection Prevalence : 0.31030
## Balanced Accuracy : 0.60025
##
## 'Positive' Class : 1
##
3) Report the % correct classification from the training data using the mean as the decision rule
answer: Overral, from the training data, the model has a 68.97% accuracy
3a) Does changing the decision rule threshold affect your classification accuracy?
Answer: Yes, using the .5 the classification accuracy is 94.62%, while using the mean the accuracy reduced to 68.97%. Even though the mean accuracy is lesser, the .5 classification did not account for false positive and true negative.
Testing data using mean
pred_test<-predict(glm1,
newdata=model.dat2test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$badhealth==1)), 1, 0))
table(model.dat2test$badhealth,pred_cl)
## pred_cl
## 0 1
## 0 532 178
## 1 25 17
model.dat2test$badhealth<- as.factor(model.dat2test$badhealth)
confusionMatrix(data = pred_cl,model.dat2test$badhealth)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 532 25
## 1 178 17
##
## Accuracy : 0.7301
## 95% CI : (0.6968, 0.7615)
## No Information Rate : 0.9441
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0568
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.74930
## Specificity : 0.40476
## Pos Pred Value : 0.95512
## Neg Pred Value : 0.08718
## Prevalence : 0.94415
## Detection Rate : 0.70745
## Detection Prevalence : 0.74069
## Balanced Accuracy : 0.57703
##
## 'Positive' Class : 0
##
4. Report the % correct classification from the test data using the mean as the decision rule
The Overral, from the testing data, the model has a 73.01% accuracy
Testing data using 0.5
pred_test2<-predict(glm1,
newdata=model.dat2test,
type="response")
pred_cl<-factor(ifelse(pred_test >.5, 1, 0))
table(model.dat2test$badhealth,pred_cl)
## pred_cl
## 0
## 0 710
## 1 42
model.dat2test$badhealth<- as.factor(model.dat2test$badhealth)
confusionMatrix(data = pred_cl,model.dat2test$badhealth)
## Warning in confusionMatrix.default(data = pred_cl, model.dat2test$badhealth):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 710 42
## 1 0 0
##
## Accuracy : 0.9441
## 95% CI : (0.9253, 0.9595)
## No Information Rate : 0.9441
## P-Value [Acc > NIR] : 0.5409
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 2.509e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9441
## Neg Pred Value : NaN
## Prevalence : 0.9441
## Detection Rate : 0.9441
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
4 Report the % correct classification from the test data using the .5 decision rule
answer: The percentage correct classification is 94.41 % accurate. Still did not account for the false positive and true negative
---
title: "Assignment 4"
author: "Joseph Jaiyeola"
date:  "`r format(Sys.time(), '%d %B, %Y')`"
output:
   html_document:
    df_print: paged
    fig_height: 7
    fig_width: 7
    toc: yes
    toc_float: yes
    code_download: true
---

#1. Define a binary outcome of your choosing

self rated health status







```{r include=FALSE}
library(stargazer, quietly = T)
library(survey, quietly = T)
library(car, quietly = T)
library(questionr, quietly = T)
library(dplyr, quietly = T)
library(forcats, quietly = T)
library(tidyverse, quietly = T)
library(srvyr, quietly = T)
library( gtsummary, quietly = T)
library(caret, quietly = T)

```


```{r include=FALSE}
library(ipumsr)
```

```{r include=FALSE}
ddi <- read_ipums_ddi("nhis_00002.xml")
data <- read_ipums_micro(ddi)
data<- haven::zap_labels(data)
```

```{r include=FALSE}
names(data) <- tolower(gsub(pattern = "_",replacement =  "",x =  names(data)))
```


```{r include=FALSE}
#sex
data$male<-as.factor(ifelse(data$sex==1, "Male", "Female"))


#race/ethnicity
data$whitemajority<- car::Recode(data$hisprace,
                       recodes="02=1; 99=NA; else=0")

data$otherminority<- car::Recode(data$hisprace,
                      recodes="c(1,3,4,5,6,7)=1; 99=NA; else=0")

data$race_eth<-car::Recode(data$hisprace,
recodes="02='whitemajority'; c(1,3,4,5,6,7)='otherminority';else=NA",
as.factor = T)
data$race_eth<-relevel(data$race_eth,
                          ref = "whitemajority")


#education level



data$educ<- car::Recode(data$educ,
                     recodes="102:116='1Less than HS'; 201:202='2hsgrad'; 301:503='3More than HS';997:999=NA;000=NA",
                     as.factor=T)
data$educ<-fct_relevel(data$educ,'1Less than HS','2hsgrad','3More than HS') 


#Urban-rural classification


data$urban_rural<- car::Recode(data$urbrrl,
                     recodes="1:3='urban'; 4='rural';000=NA",
                     as.factor=T)
data$urban_rural<-relevel(data$urban_rural, ref='rural') 


data$rural<- car::Recode(data$hisprace,
                       recodes="4=1; 99=NA; else=0")

data$urban<- car::Recode(data$hisprace,
                       recodes="1:3=1; 99=NA; else=0")




#employment status


data$unemploy<- car::Recode(data$empstat,
                       recodes="200=1; 00=NA; 999=NA; else=0")


data$employ_status<- car::Recode(data$empstat,
                       recodes="100='Employed'; 200='unemployed';else=NA",
                       as.factor=T)
data$employ_status<-relevel(data$employ_status, ref='Employed')


# currently in school

data$non_schooling<- car::Recode(data$schoolnow,
                       recodes="1=1; 0=NA; 7:9=NA; else=0")

data$schoolstatus<- car::Recode(data$schoolnow,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$schoolstatus<-relevel(data$schoolstatus, ref='no')

data<-data%>%
  filter(complete.cases(unemploy,non_schooling))


# merging schooling and working

data$opportunity_youth <- paste( data$unemploy, data$non_schooling, sep ="")


data$opportunity_youth_cat_num<- car::Recode(data$opportunity_youth,
                       recodes="11=1; 00:10=0;else=NA",
                       as.factor=F)


data$opportunity_youth_cat<- car::Recode(data$opportunity_youth,
                       recodes="11='Opportunity youth';00:10='Not opportunity youth';else=NA",
                       as.factor=T)
data$opportunity_youth_cat<-relevel(data$opportunity_youth_cat, ref='Opportunity youth')

data$non_opportunity_youth_cat_num<- car::Recode(data$opportunity_youth_cat_num,
                       recodes="0=1; 99=NA; else=0")

#income grouping

data$familyincome <- data$incfam07on

# born in the US

data$usborn<- car::Recode(data$usborn,
                       recodes="20=1; 97:98=NA; else=0")

# US Citizen
data$citizen<- car::Recode(data$citizen,
                       recodes="2=1; 8:9=NA; else=0")

#last employed



data$employedlast<- car::Recode(data$emplast,
                       recodes="1='Within past 12months'; 2='1-5years ago'; 3='over 5years ago'; 4='never worked';else=NA",
                       as.factor=T)
data$employedlast<-relevel(data$employedlast, ref='1-5years ago')


#HEALTH VARIABLES

#Poor or fair self rated health
data$badhealth<-car::Recode(data$health, recodes="4:5=1; 1:3=0; else=NA")


#place for medical care


data$medicalplace<- car::Recode(data$usualpl,
                       recodes="1='no'; 2:3:='yes';else=NA",
                       as.factor=T)
data$medicalplace<-relevel(data$medicalplace, ref='no')


# delayed medical care due to cost


data$medical_care_cost<- car::Recode(data$delaycost,
                       recodes="2='yes'; 1='no';else=NA",
                       as.factor=T)
data$medical_care_cost<-relevel(data$medical_care_cost, ref='yes')

# worried about paying medical bills


data$medical_bill_worried<- car::Recode(data$wormedbill,
                       recodes="1='very worried'; 2='somewhat worried'; 3='not at all';else=NA",
                       as.factor=T)
data$medical_bill_worried<-relevel(data$medical_bill_worried, ref='very worried')

# unable to pay medical bills


data$medical_bill_unabletopay<- car::Recode(data$hiunablepay,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medical_bill_unabletopay<-relevel(data$medical_bill_unabletopay, ref='yes')


# health insurance coverage


data$healthinsurace_coverage<- car::Recode(data$hinotcove,
                       recodes="1='no, has coverage'; 2='yes, no coverage';else=NA",
                       as.factor=T)
data$healthinsurace_coverage<-relevel(data$healthinsurace_coverage, ref='yes, no coverage')

# dont have health insurance cuz of cost


data$nohealthinsurace_cost<- car::Recode(data$hinocostr,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$nohealthinsurace_cost<-relevel(data$nohealthinsurace_cost, ref='yes')

# used medication in the past year


data$usedmedications<- car::Recode(data$premedyr,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$usedmedications<-relevel(data$usedmedications, ref='yes')


# if they smoked


data$smoke_frequently<- car::Recode(data$smokfreqnow,
                       recodes="1='no'; 2:3='somedays/everyday';else=NA",
                       as.factor=T)
data$smoke_frequently<-relevel(data$smoke_frequently, ref='somedays/everyday')

# smoked up to 100 cigarreth in life time


data$smoke_100cig<- car::Recode(data$smokev,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$smoke_100cig<-relevel(data$smoke_100cig, ref='yes')


#Mental health

# ever had anxiety disoreder


data$anxiety_disoreder<- car::Recode(data$anxietyev,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$anxiety_disoreder<-relevel(data$anxiety_disoreder, ref='yes')

# medication for worrying


data$medication_for_worry<- car::Recode(data$worrx,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medication_for_worry<-relevel(data$medication_for_worry, ref='yes')


# medication for depression



data$medication_for_depression<- car::Recode(data$deprx,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medication_for_depression<-relevel(data$medication_for_depression, ref='yes')

# level of worry



data$level_of_worry<- car::Recode(data$worfeelevl,
                       recodes="1='alot'; 2='a little'; 3='btw little and alot';else=NA",
                       as.factor=T)
data$level_of_worry<-relevel(data$level_of_worry, ref='alot')


# level of depression



data$level_of_depression<- car::Recode(data$depfeelevl,
                       recodes="1='alot'; 2='a little'; 3='btw little and alot';else=NA",
                       as.factor=T)
data$level_of_depression<-relevel(data$level_of_depression, ref='alot')




```


```{r include=FALSE}
data <- data%>%
filter(age >=16 & age<=24)

data<-data%>%
  filter(is.na(badhealth)==F)
data<-data%>%
  filter(is.na(educ)==F)
```



```{r include=FALSE}
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~strata, weights=~sampweight, data = data )
des
```






```{r include=FALSE}
sv.table<-svyby(formula = ~badhealth,
                by = ~opportunity_youth_cat,
                design = des,
                FUN = svymean,
                na.rm=T)


knitr::kable(sv.table,
      caption = "Survey Estimates of Poor SRH by Opportunity Youths",
      align = 'c',  
      format = "html")
```


```{r include=FALSE}
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = data)

sv.table<-svyby(formula = ~factor(badhealth),
                by = ~opportunity_youth_cat,
                design = nodes,
                FUN = svymean,
                na.rm=T)
```

```{r include=FALSE}
knitr::kable(sv.table,
      caption = "Estimates of Poor SRH by Opportunity youth - No survey design",
      align = 'c',  
      format = "html")
```


```{r include=FALSE}
library(srvyr)

data%>%
  as_survey_design(strata = strata,
                   weights = sampweight)%>%
  group_by(opportunity_youth_cat)%>%
  summarise(mean_bh = survey_mean(badhealth, na.rm=T))%>%
  ungroup()
```


```{r include=FALSE}
library(gtsummary)

data%>%
  as_survey_design( strata =strata,
                    weights = sampweight)%>%
  select(badhealth, opportunity_youth_cat)%>%
  tbl_svysummary(by = opportunity_youth_cat, 
              label = list(badhealth = "Fair/Poor Health"))%>%
  add_p()%>%
  add_n()
```


```{r include=FALSE}
library(dplyr)
sub<-data%>%
  select(badhealth, opportunity_youth_cat_num, opportunity_youth_cat, race_eth,educ,whitemajority, otherminority,urban_rural, male,sampweight,male,smoke_frequently, strata)
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
               strata= ~strata,
               weights= ~sampweight,
               data = sub )
```


```{r include=FALSE}
fit.logit1<-svyglm(badhealth~opportunity_youth_cat,design= des, family=binomial) # Opportunity youth cat
```


```{r include=FALSE}
fit.logit2<-svyglm(badhealth~opportunity_youth_cat+educ,design= des, family=binomial) #Opportunity youth cat+education
```


```{r include=FALSE}
fit.logit3<-svyglm(badhealth~opportunity_youth_cat+educ+male+urban_rural,design= des, family=binomial)#Opportunity youth cat+education , gender, urban-rural
```


```{r include=FALSE}
summary(fit.logit2)
```


```{r include=FALSE}
regTermTest(fit.logit2,
            test.terms = ~educ,
            method="Wald",
            df = NULL)
```


```{r include=FALSE}
summary(fit.logit3)
```



```{r include=FALSE}
regTermTest(fit.logit3,
            test.terms=~male+urban_rural,
            method="Wald",
            df = NULL)
```


```{r include=FALSE, results='asis'}
f1<- fit.logit1%>%
  tbl_regression(exponentiate =T)

f2<- fit.logit2%>%
  tbl_regression(exponentiate =T)

f3<- fit.logit3%>%
  tbl_regression(exponentiate =T)



f_all <- tbl_merge(tbls =list(f1, f2, f3),
                    tab_spanner = c("**Model 1**", "**Model 2**", "**Model 3**"))

f_all
```


```{r include=FALSE}
AIC(fit.logit1, fit.logit2, fit.logit3)
```

```{r include=FALSE}
anova(fit.logit1, fit.logit2)
```


```{r include=FALSE}
anova(fit.logit2, fit.logit3)
```



# 2) Fit a predictive logistic regression model using as many predictor variables as you think you need

```{r}
data$age2<- data$age^2
```


```{r}
library(dplyr)
model.data<- data %>%
  select(serial, badhealth, opportunity_youth_cat, urban_rural, male, educ, age2)
```


```{r, results='asis'}
knitr::kable(head(model.data))
```


# 3) Use a 80% training/20% test split for your data

```{r}
set.seed(1115)
train<- createDataPartition(y = model.data$badhealth,
                            p = .80,
                            list=F)

model.dat2train<-model.data[train,]
model.dat2test<-model.data[-train,]

table(model.dat2train$badhealth)
```


```{r}
prop.table(table(model.dat2train$badhealth))
```


```{r}
summary(model.dat2train)
```


# Logistic regression for classification

```{r}
glm1<-glm(badhealth~factor(opportunity_youth_cat)+factor(urban_rural)+scale(age2)+factor(male)+factor(educ),
          data=model.dat2train[,-1],
          family = binomial)
summary(glm1)
```

```{r}
tr_pred<- predict(glm1,
                  newdata = model.dat2train,
                  type = "response")

head(tr_pred)
```


Using 50% as the predictor

```{r}
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  badht=model.dat2train$badhealth)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
```

```{r}
pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=badht, fill=badht))+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Truth")+
  geom_vline(xintercept=.5)
```


```{r}
table( tr_predcl,
       model.dat2train$badhealth)
```

```{r}
model.dat2train$badhealth<- as.factor(model.dat2train$badhealth)
```


```{r}
cm1<-confusionMatrix(data = tr_predcl,
                     reference = model.dat2train$badhealth)
cm1
```




# 3) Report the % correct classification from the training data using the .5 decision rule 

Overall the model has a 94.62% accuracy. 


## Using mean as a predictor

```{r}
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$badhealth==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  badht=model.dat2train$badhealth)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat2train$badhealth==1)))
```


```{r}
pred2%>%
  ggplot(aes(x=pr, fill=badht))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat2train$badhealth==1)))
```

```{r}
table( tr_predcl,
       model.dat2train$badhealth)
```

```{r}
model.dat2train$badhealth<- as.factor(model.dat2train$badhealth)
```


```{r}
confusionMatrix(data = tr_predcl,
                model.dat2train$badhealth,
                positive = "1" )
```

# 3) Report the % correct classification from the training data using the  mean as the decision rule 

answer: Overral, from the training data, the model has a 68.97% accuracy

# 3a) Does changing the decision rule threshold affect your classification accuracy?

Answer: Yes, using the .5 the classification accuracy is 94.62%, while using the mean the accuracy reduced to 68.97%. Even though the mean accuracy is lesser, the .5 classification did not account for false positive and true negative.




## Testing data using mean

```{r}
pred_test<-predict(glm1,
                   newdata=model.dat2test,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$badhealth==1)), 1, 0))

table(model.dat2test$badhealth,pred_cl)
```

```{r}
model.dat2test$badhealth<- as.factor(model.dat2test$badhealth)
```


```{r}
confusionMatrix(data = pred_cl,model.dat2test$badhealth)
```

# 4. Report the % correct classification from the test data using the mean as the decision rule 

The Overral, from the testing data, the model has a 73.01% accuracy


## Testing data using 0.5

```{r}
pred_test2<-predict(glm1,
                   newdata=model.dat2test,
                   type="response")

pred_cl<-factor(ifelse(pred_test >.5, 1, 0))

table(model.dat2test$badhealth,pred_cl)
```


```{r}
model.dat2test$badhealth<- as.factor(model.dat2test$badhealth)
```

```{r}
confusionMatrix(data = pred_cl,model.dat2test$badhealth)
```


# 4 Report the % correct classification from the test data using the .5 decision rule
answer: The percentage correct classification is 94.41 % accurate. Still did not account for the false positive and true negative

