#1. Define a binary outcome of your choosing

self rated health status

2) Fit a predictive logistic regression model using as many predictor variables as you think you need

data$age2<- data$age^2
library(dplyr)
model.data<- data %>%
  select(serial, badhealth, opportunity_youth_cat, urban_rural, male, educ, age2)
knitr::kable(head(model.data))
serial badhealth opportunity_youth_cat urban_rural male educ age2
22 0 Not opportunity youth urban Male 1Less than HS 529
73 0 Not opportunity youth urban Female 3More than HS 400
81 0 Not opportunity youth urban Female 2hsgrad 484
82 0 Not opportunity youth urban Female 3More than HS 441
88 0 Not opportunity youth urban Male 3More than HS 484
136 0 Not opportunity youth urban Female 3More than HS 361

3) Use a 80% training/20% test split for your data

set.seed(1115)
train<- createDataPartition(y = model.data$badhealth,
                            p = .80,
                            list=F)

model.dat2train<-model.data[train,]
model.dat2test<-model.data[-train,]

table(model.dat2train$badhealth)
## 
##    0    1 
## 2848  162
prop.table(table(model.dat2train$badhealth))
## 
##         0         1 
## 0.9461794 0.0538206
summary(model.dat2train)
##      serial        badhealth                 opportunity_youth_cat urban_rural 
##  Min.   :   22   Min.   :0.00000   Opportunity youth    : 312      rural: 374  
##  1st Qu.: 8486   1st Qu.:0.00000   Not opportunity youth:2698      urban:2636  
##  Median :16876   Median :0.00000                                               
##  Mean   :16646   Mean   :0.05382                                               
##  3rd Qu.:24905   3rd Qu.:0.00000                                               
##  Max.   :33099   Max.   :1.00000                                               
##      male                 educ           age2      
##  Female:1514   1Less than HS: 295   Min.   :324.0  
##  Male  :1496   2hsgrad      :1000   1st Qu.:400.0  
##                3More than HS:1715   Median :441.0  
##                                     Mean   :458.7  
##                                     3rd Qu.:529.0  
##                                     Max.   :576.0

Logistic regression for classification

glm1<-glm(badhealth~factor(opportunity_youth_cat)+factor(urban_rural)+scale(age2)+factor(male)+factor(educ),
          data=model.dat2train[,-1],
          family = binomial)
summary(glm1)
## 
## Call:
## glm(formula = badhealth ~ factor(opportunity_youth_cat) + factor(urban_rural) + 
##     scale(age2) + factor(male) + factor(educ), family = binomial, 
##     data = model.dat2train[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8531  -0.3388  -0.2971  -0.2622   2.7703  
## 
## Coefficients:
##                                                    Estimate Std. Error z value
## (Intercept)                                        -0.98218    0.29381  -3.343
## factor(opportunity_youth_cat)Not opportunity youth -0.93572    0.20295  -4.611
## factor(urban_rural)urban                           -0.35696    0.21904  -1.630
## scale(age2)                                         0.18906    0.08664   2.182
## factor(male)Male                                   -0.41848    0.16714  -2.504
## factor(educ)2hsgrad                                -0.54191    0.24907  -2.176
## factor(educ)3More than HS                          -0.81846    0.25567  -3.201
##                                                    Pr(>|z|)    
## (Intercept)                                        0.000829 ***
## factor(opportunity_youth_cat)Not opportunity youth 4.01e-06 ***
## factor(urban_rural)urban                           0.103168    
## scale(age2)                                        0.029107 *  
## factor(male)Male                                   0.012289 *  
## factor(educ)2hsgrad                                0.029576 *  
## factor(educ)3More than HS                          0.001369 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1261.9  on 3009  degrees of freedom
## Residual deviance: 1212.1  on 3003  degrees of freedom
## AIC: 1226.1
## 
## Number of Fisher Scoring iterations: 6
tr_pred<- predict(glm1,
                  newdata = model.dat2train,
                  type = "response")

head(tr_pred)
##          1          2          3          4          5          6 
## 0.07345859 0.05954407 0.03062989 0.02547889 0.07228006 0.02547889

Using 50% as the predictor

tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  badht=model.dat2train$badhealth)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=badht, fill=badht))+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tr_predcl,
       model.dat2train$badhealth)
##          
## tr_predcl    0    1
##         0 2848  162
model.dat2train$badhealth<- as.factor(model.dat2train$badhealth)
cm1<-confusionMatrix(data = tr_predcl,
                     reference = model.dat2train$badhealth)
## Warning in confusionMatrix.default(data = tr_predcl, reference =
## model.dat2train$badhealth): Levels are not in the same order for reference and
## data. Refactoring data to match.
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2848  162
##          1    0    0
##                                          
##                Accuracy : 0.9462         
##                  95% CI : (0.9375, 0.954)
##     No Information Rate : 0.9462         
##     P-Value [Acc > NIR] : 0.5209         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.9462         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.9462         
##          Detection Rate : 0.9462         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : 0              
## 

3) Report the % correct classification from the training data using the .5 decision rule

Overall the model has a 94.62% accuracy.

Using mean as a predictor

tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$badhealth==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  badht=model.dat2train$badhealth)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat2train$badhealth==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=badht))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat2train$badhealth==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tr_predcl,
       model.dat2train$badhealth)
##          
## tr_predcl    0    1
##         0 1995   81
##         1  853   81
model.dat2train$badhealth<- as.factor(model.dat2train$badhealth)
confusionMatrix(data = tr_predcl,
                model.dat2train$badhealth,
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1995   81
##          1  853   81
##                                           
##                Accuracy : 0.6897          
##                  95% CI : (0.6728, 0.7062)
##     No Information Rate : 0.9462          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0617          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.50000         
##             Specificity : 0.70049         
##          Pos Pred Value : 0.08672         
##          Neg Pred Value : 0.96098         
##              Prevalence : 0.05382         
##          Detection Rate : 0.02691         
##    Detection Prevalence : 0.31030         
##       Balanced Accuracy : 0.60025         
##                                           
##        'Positive' Class : 1               
## 

3) Report the % correct classification from the training data using the mean as the decision rule

answer: Overral, from the training data, the model has a 68.97% accuracy

3a) Does changing the decision rule threshold affect your classification accuracy?

Answer: Yes, using the .5 the classification accuracy is 94.62%, while using the mean the accuracy reduced to 68.97%. Even though the mean accuracy is lesser, the .5 classification did not account for false positive and true negative.

Testing data using mean

pred_test<-predict(glm1,
                   newdata=model.dat2test,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$badhealth==1)), 1, 0))

table(model.dat2test$badhealth,pred_cl)
##    pred_cl
##       0   1
##   0 532 178
##   1  25  17
model.dat2test$badhealth<- as.factor(model.dat2test$badhealth)
confusionMatrix(data = pred_cl,model.dat2test$badhealth)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 532  25
##          1 178  17
##                                           
##                Accuracy : 0.7301          
##                  95% CI : (0.6968, 0.7615)
##     No Information Rate : 0.9441          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0568          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.74930         
##             Specificity : 0.40476         
##          Pos Pred Value : 0.95512         
##          Neg Pred Value : 0.08718         
##              Prevalence : 0.94415         
##          Detection Rate : 0.70745         
##    Detection Prevalence : 0.74069         
##       Balanced Accuracy : 0.57703         
##                                           
##        'Positive' Class : 0               
## 

4. Report the % correct classification from the test data using the mean as the decision rule

The Overral, from the testing data, the model has a 73.01% accuracy

Testing data using 0.5

pred_test2<-predict(glm1,
                   newdata=model.dat2test,
                   type="response")

pred_cl<-factor(ifelse(pred_test >.5, 1, 0))

table(model.dat2test$badhealth,pred_cl)
##    pred_cl
##       0
##   0 710
##   1  42
model.dat2test$badhealth<- as.factor(model.dat2test$badhealth)
confusionMatrix(data = pred_cl,model.dat2test$badhealth)
## Warning in confusionMatrix.default(data = pred_cl, model.dat2test$badhealth):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 710  42
##          1   0   0
##                                           
##                Accuracy : 0.9441          
##                  95% CI : (0.9253, 0.9595)
##     No Information Rate : 0.9441          
##     P-Value [Acc > NIR] : 0.5409          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 2.509e-10       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9441          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9441          
##          Detection Rate : 0.9441          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 

4 Report the % correct classification from the test data using the .5 decision rule

answer: The percentage correct classification is 94.41 % accurate. Still did not account for the false positive and true negative

---
title: "Assignment 4"
author: "Joseph Jaiyeola"
date:  "`r format(Sys.time(), '%d %B, %Y')`"
output:
   html_document:
    df_print: paged
    fig_height: 7
    fig_width: 7
    toc: yes
    toc_float: yes
    code_download: true
---

#1. Define a binary outcome of your choosing

self rated health status







```{r include=FALSE}
library(stargazer, quietly = T)
library(survey, quietly = T)
library(car, quietly = T)
library(questionr, quietly = T)
library(dplyr, quietly = T)
library(forcats, quietly = T)
library(tidyverse, quietly = T)
library(srvyr, quietly = T)
library( gtsummary, quietly = T)
library(caret, quietly = T)

```


```{r include=FALSE}
library(ipumsr)
```

```{r include=FALSE}
ddi <- read_ipums_ddi("nhis_00002.xml")
data <- read_ipums_micro(ddi)
data<- haven::zap_labels(data)
```

```{r include=FALSE}
names(data) <- tolower(gsub(pattern = "_",replacement =  "",x =  names(data)))
```


```{r include=FALSE}
#sex
data$male<-as.factor(ifelse(data$sex==1, "Male", "Female"))


#race/ethnicity
data$whitemajority<- car::Recode(data$hisprace,
                       recodes="02=1; 99=NA; else=0")

data$otherminority<- car::Recode(data$hisprace,
                      recodes="c(1,3,4,5,6,7)=1; 99=NA; else=0")

data$race_eth<-car::Recode(data$hisprace,
recodes="02='whitemajority'; c(1,3,4,5,6,7)='otherminority';else=NA",
as.factor = T)
data$race_eth<-relevel(data$race_eth,
                          ref = "whitemajority")


#education level



data$educ<- car::Recode(data$educ,
                     recodes="102:116='1Less than HS'; 201:202='2hsgrad'; 301:503='3More than HS';997:999=NA;000=NA",
                     as.factor=T)
data$educ<-fct_relevel(data$educ,'1Less than HS','2hsgrad','3More than HS') 


#Urban-rural classification


data$urban_rural<- car::Recode(data$urbrrl,
                     recodes="1:3='urban'; 4='rural';000=NA",
                     as.factor=T)
data$urban_rural<-relevel(data$urban_rural, ref='rural') 


data$rural<- car::Recode(data$hisprace,
                       recodes="4=1; 99=NA; else=0")

data$urban<- car::Recode(data$hisprace,
                       recodes="1:3=1; 99=NA; else=0")




#employment status


data$unemploy<- car::Recode(data$empstat,
                       recodes="200=1; 00=NA; 999=NA; else=0")


data$employ_status<- car::Recode(data$empstat,
                       recodes="100='Employed'; 200='unemployed';else=NA",
                       as.factor=T)
data$employ_status<-relevel(data$employ_status, ref='Employed')


# currently in school

data$non_schooling<- car::Recode(data$schoolnow,
                       recodes="1=1; 0=NA; 7:9=NA; else=0")

data$schoolstatus<- car::Recode(data$schoolnow,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$schoolstatus<-relevel(data$schoolstatus, ref='no')

data<-data%>%
  filter(complete.cases(unemploy,non_schooling))


# merging schooling and working

data$opportunity_youth <- paste( data$unemploy, data$non_schooling, sep ="")


data$opportunity_youth_cat_num<- car::Recode(data$opportunity_youth,
                       recodes="11=1; 00:10=0;else=NA",
                       as.factor=F)


data$opportunity_youth_cat<- car::Recode(data$opportunity_youth,
                       recodes="11='Opportunity youth';00:10='Not opportunity youth';else=NA",
                       as.factor=T)
data$opportunity_youth_cat<-relevel(data$opportunity_youth_cat, ref='Opportunity youth')

data$non_opportunity_youth_cat_num<- car::Recode(data$opportunity_youth_cat_num,
                       recodes="0=1; 99=NA; else=0")

#income grouping

data$familyincome <- data$incfam07on

# born in the US

data$usborn<- car::Recode(data$usborn,
                       recodes="20=1; 97:98=NA; else=0")

# US Citizen
data$citizen<- car::Recode(data$citizen,
                       recodes="2=1; 8:9=NA; else=0")

#last employed



data$employedlast<- car::Recode(data$emplast,
                       recodes="1='Within past 12months'; 2='1-5years ago'; 3='over 5years ago'; 4='never worked';else=NA",
                       as.factor=T)
data$employedlast<-relevel(data$employedlast, ref='1-5years ago')


#HEALTH VARIABLES

#Poor or fair self rated health
data$badhealth<-car::Recode(data$health, recodes="4:5=1; 1:3=0; else=NA")


#place for medical care


data$medicalplace<- car::Recode(data$usualpl,
                       recodes="1='no'; 2:3:='yes';else=NA",
                       as.factor=T)
data$medicalplace<-relevel(data$medicalplace, ref='no')


# delayed medical care due to cost


data$medical_care_cost<- car::Recode(data$delaycost,
                       recodes="2='yes'; 1='no';else=NA",
                       as.factor=T)
data$medical_care_cost<-relevel(data$medical_care_cost, ref='yes')

# worried about paying medical bills


data$medical_bill_worried<- car::Recode(data$wormedbill,
                       recodes="1='very worried'; 2='somewhat worried'; 3='not at all';else=NA",
                       as.factor=T)
data$medical_bill_worried<-relevel(data$medical_bill_worried, ref='very worried')

# unable to pay medical bills


data$medical_bill_unabletopay<- car::Recode(data$hiunablepay,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medical_bill_unabletopay<-relevel(data$medical_bill_unabletopay, ref='yes')


# health insurance coverage


data$healthinsurace_coverage<- car::Recode(data$hinotcove,
                       recodes="1='no, has coverage'; 2='yes, no coverage';else=NA",
                       as.factor=T)
data$healthinsurace_coverage<-relevel(data$healthinsurace_coverage, ref='yes, no coverage')

# dont have health insurance cuz of cost


data$nohealthinsurace_cost<- car::Recode(data$hinocostr,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$nohealthinsurace_cost<-relevel(data$nohealthinsurace_cost, ref='yes')

# used medication in the past year


data$usedmedications<- car::Recode(data$premedyr,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$usedmedications<-relevel(data$usedmedications, ref='yes')


# if they smoked


data$smoke_frequently<- car::Recode(data$smokfreqnow,
                       recodes="1='no'; 2:3='somedays/everyday';else=NA",
                       as.factor=T)
data$smoke_frequently<-relevel(data$smoke_frequently, ref='somedays/everyday')

# smoked up to 100 cigarreth in life time


data$smoke_100cig<- car::Recode(data$smokev,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$smoke_100cig<-relevel(data$smoke_100cig, ref='yes')


#Mental health

# ever had anxiety disoreder


data$anxiety_disoreder<- car::Recode(data$anxietyev,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$anxiety_disoreder<-relevel(data$anxiety_disoreder, ref='yes')

# medication for worrying


data$medication_for_worry<- car::Recode(data$worrx,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medication_for_worry<-relevel(data$medication_for_worry, ref='yes')


# medication for depression



data$medication_for_depression<- car::Recode(data$deprx,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medication_for_depression<-relevel(data$medication_for_depression, ref='yes')

# level of worry



data$level_of_worry<- car::Recode(data$worfeelevl,
                       recodes="1='alot'; 2='a little'; 3='btw little and alot';else=NA",
                       as.factor=T)
data$level_of_worry<-relevel(data$level_of_worry, ref='alot')


# level of depression



data$level_of_depression<- car::Recode(data$depfeelevl,
                       recodes="1='alot'; 2='a little'; 3='btw little and alot';else=NA",
                       as.factor=T)
data$level_of_depression<-relevel(data$level_of_depression, ref='alot')




```


```{r include=FALSE}
data <- data%>%
filter(age >=16 & age<=24)

data<-data%>%
  filter(is.na(badhealth)==F)
data<-data%>%
  filter(is.na(educ)==F)
```



```{r include=FALSE}
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~strata, weights=~sampweight, data = data )
des
```






```{r include=FALSE}
sv.table<-svyby(formula = ~badhealth,
                by = ~opportunity_youth_cat,
                design = des,
                FUN = svymean,
                na.rm=T)


knitr::kable(sv.table,
      caption = "Survey Estimates of Poor SRH by Opportunity Youths",
      align = 'c',  
      format = "html")
```


```{r include=FALSE}
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = data)

sv.table<-svyby(formula = ~factor(badhealth),
                by = ~opportunity_youth_cat,
                design = nodes,
                FUN = svymean,
                na.rm=T)
```

```{r include=FALSE}
knitr::kable(sv.table,
      caption = "Estimates of Poor SRH by Opportunity youth - No survey design",
      align = 'c',  
      format = "html")
```


```{r include=FALSE}
library(srvyr)

data%>%
  as_survey_design(strata = strata,
                   weights = sampweight)%>%
  group_by(opportunity_youth_cat)%>%
  summarise(mean_bh = survey_mean(badhealth, na.rm=T))%>%
  ungroup()
```


```{r include=FALSE}
library(gtsummary)

data%>%
  as_survey_design( strata =strata,
                    weights = sampweight)%>%
  select(badhealth, opportunity_youth_cat)%>%
  tbl_svysummary(by = opportunity_youth_cat, 
              label = list(badhealth = "Fair/Poor Health"))%>%
  add_p()%>%
  add_n()
```


```{r include=FALSE}
library(dplyr)
sub<-data%>%
  select(badhealth, opportunity_youth_cat_num, opportunity_youth_cat, race_eth,educ,whitemajority, otherminority,urban_rural, male,sampweight,male,smoke_frequently, strata)
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1,
               strata= ~strata,
               weights= ~sampweight,
               data = sub )
```


```{r include=FALSE}
fit.logit1<-svyglm(badhealth~opportunity_youth_cat,design= des, family=binomial) # Opportunity youth cat
```


```{r include=FALSE}
fit.logit2<-svyglm(badhealth~opportunity_youth_cat+educ,design= des, family=binomial) #Opportunity youth cat+education
```


```{r include=FALSE}
fit.logit3<-svyglm(badhealth~opportunity_youth_cat+educ+male+urban_rural,design= des, family=binomial)#Opportunity youth cat+education , gender, urban-rural
```


```{r include=FALSE}
summary(fit.logit2)
```


```{r include=FALSE}
regTermTest(fit.logit2,
            test.terms = ~educ,
            method="Wald",
            df = NULL)
```


```{r include=FALSE}
summary(fit.logit3)
```



```{r include=FALSE}
regTermTest(fit.logit3,
            test.terms=~male+urban_rural,
            method="Wald",
            df = NULL)
```


```{r include=FALSE, results='asis'}
f1<- fit.logit1%>%
  tbl_regression(exponentiate =T)

f2<- fit.logit2%>%
  tbl_regression(exponentiate =T)

f3<- fit.logit3%>%
  tbl_regression(exponentiate =T)



f_all <- tbl_merge(tbls =list(f1, f2, f3),
                    tab_spanner = c("**Model 1**", "**Model 2**", "**Model 3**"))

f_all
```


```{r include=FALSE}
AIC(fit.logit1, fit.logit2, fit.logit3)
```

```{r include=FALSE}
anova(fit.logit1, fit.logit2)
```


```{r include=FALSE}
anova(fit.logit2, fit.logit3)
```



# 2) Fit a predictive logistic regression model using as many predictor variables as you think you need

```{r}
data$age2<- data$age^2
```


```{r}
library(dplyr)
model.data<- data %>%
  select(serial, badhealth, opportunity_youth_cat, urban_rural, male, educ, age2)
```


```{r, results='asis'}
knitr::kable(head(model.data))
```


# 3) Use a 80% training/20% test split for your data

```{r}
set.seed(1115)
train<- createDataPartition(y = model.data$badhealth,
                            p = .80,
                            list=F)

model.dat2train<-model.data[train,]
model.dat2test<-model.data[-train,]

table(model.dat2train$badhealth)
```


```{r}
prop.table(table(model.dat2train$badhealth))
```


```{r}
summary(model.dat2train)
```


# Logistic regression for classification

```{r}
glm1<-glm(badhealth~factor(opportunity_youth_cat)+factor(urban_rural)+scale(age2)+factor(male)+factor(educ),
          data=model.dat2train[,-1],
          family = binomial)
summary(glm1)
```

```{r}
tr_pred<- predict(glm1,
                  newdata = model.dat2train,
                  type = "response")

head(tr_pred)
```


Using 50% as the predictor

```{r}
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  badht=model.dat2train$badhealth)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
```

```{r}
pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=badht, fill=badht))+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Truth")+
  geom_vline(xintercept=.5)
```


```{r}
table( tr_predcl,
       model.dat2train$badhealth)
```

```{r}
model.dat2train$badhealth<- as.factor(model.dat2train$badhealth)
```


```{r}
cm1<-confusionMatrix(data = tr_predcl,
                     reference = model.dat2train$badhealth)
cm1
```




# 3) Report the % correct classification from the training data using the .5 decision rule 

Overall the model has a 94.62% accuracy. 


## Using mean as a predictor

```{r}
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$badhealth==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred,
                  gr=tr_predcl,
                  badht=model.dat2train$badhealth)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat2train$badhealth==1)))
```


```{r}
pred2%>%
  ggplot(aes(x=pr, fill=badht))+
  geom_histogram(position="identity",
                 alpha=.2)+
  ggtitle(label = "Probability of Bad health",
          subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat2train$badhealth==1)))
```

```{r}
table( tr_predcl,
       model.dat2train$badhealth)
```

```{r}
model.dat2train$badhealth<- as.factor(model.dat2train$badhealth)
```


```{r}
confusionMatrix(data = tr_predcl,
                model.dat2train$badhealth,
                positive = "1" )
```

# 3) Report the % correct classification from the training data using the  mean as the decision rule 

answer: Overral, from the training data, the model has a 68.97% accuracy

# 3a) Does changing the decision rule threshold affect your classification accuracy?

Answer: Yes, using the .5 the classification accuracy is 94.62%, while using the mean the accuracy reduced to 68.97%. Even though the mean accuracy is lesser, the .5 classification did not account for false positive and true negative.




## Testing data using mean

```{r}
pred_test<-predict(glm1,
                   newdata=model.dat2test,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$badhealth==1)), 1, 0))

table(model.dat2test$badhealth,pred_cl)
```

```{r}
model.dat2test$badhealth<- as.factor(model.dat2test$badhealth)
```


```{r}
confusionMatrix(data = pred_cl,model.dat2test$badhealth)
```

# 4. Report the % correct classification from the test data using the mean as the decision rule 

The Overral, from the testing data, the model has a 73.01% accuracy


## Testing data using 0.5

```{r}
pred_test2<-predict(glm1,
                   newdata=model.dat2test,
                   type="response")

pred_cl<-factor(ifelse(pred_test >.5, 1, 0))

table(model.dat2test$badhealth,pred_cl)
```


```{r}
model.dat2test$badhealth<- as.factor(model.dat2test$badhealth)
```

```{r}
confusionMatrix(data = pred_cl,model.dat2test$badhealth)
```


# 4 Report the % correct classification from the test data using the .5 decision rule
answer: The percentage correct classification is 94.41 % accurate. Still did not account for the false positive and true negative

