library(NHANES)
## Warning: package 'NHANES' was built under R version 4.2.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data1<- NHANES %>%
  select(HardDrugs,Gender,HealthGen,Depressed,SleepHrsNight,AlcoholDay,SmokeNow,Marijuana) %>% 
  na.omit()
glimpse(data1)
## Rows: 1,704
## Columns: 8
## $ HardDrugs     <fct> Yes, No, No, Yes, Yes, No, Yes, Yes, Yes, No, No, No, No…
## $ Gender        <fct> female, male, female, male, male, male, male, female, fe…
## $ HealthGen     <fct> Good, Fair, Excellent, Fair, Excellent, Excellent, Excel…
## $ Depressed     <fct> Several, None, None, None, Several, None, None, None, No…
## $ SleepHrsNight <int> 8, 6, 8, 6, 6, 6, 8, 4, 4, 6, 6, 6, 6, 6, 5, 8, 8, 8, 8,…
## $ AlcoholDay    <int> 2, 3, 1, 6, 3, 12, 5, 3, 3, 7, 7, 7, 7, 1, 4, 1, 1, 1, 1…
## $ SmokeNow      <fct> Yes, No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes…
## $ Marijuana     <fct> Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes…
conTable <- table(data1$Marijuana)
conTable_2 <- table(data1$HardDrugs)

conTable_1 <-prop.table(conTable)
conTable_1
## 
##        No       Yes 
## 0.1866197 0.8133803
conTable_3 <- prop.table(conTable_2)
conTable_3
## 
##        No       Yes 
## 0.6238263 0.3761737
set.seed(99)
train1 <- data1 %>% sample_frac(size = 0.8)
test1 <- data1 %>% setdiff(train1)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.2
## Loading required package: Matrix
## Loaded glmnet 4.1-4
form<- as.formula(
  "HardDrugs ~Gender+HealthGen+Depressed+SleepHrsNight+AlcoholDay+SmokeNow+Marijuana")
predictors <- model.matrix(form, data = train1) 
cv.fit <- cv.glmnet(predictors, train1$HardDrugs, family = "binomial", type = "class")
cv.fit$lambda.1se
## [1] 0.02558189
## [1] 0.02558189
plot(cv.fit)

lambda_opt=cv.fit$lambda.1se
mod_lr <- glmnet(predictors, train1$HardDrugs, family = "binomial", lambda = lambda_opt)
mod_lr$beta
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                          s0
## (Intercept)      .         
## Gendermale       0.19515911
## HealthGenVgood   .         
## HealthGenGood    .         
## HealthGenFair    .         
## HealthGenPoor    .         
## DepressedSeveral 0.35997057
## DepressedMost    0.25541222
## SleepHrsNight    .         
## AlcoholDay       0.01346756
## SmokeNowYes      .         
## MarijuanaYes     2.06711656
form2 <- as.formula(HardDrugs~Gender+Depressed+AlcoholDay+Marijuana)
mod_lr2 <- glm(form2, data=train1,family=binomial)
summary(mod_lr2)
## 
## Call:
## glm(formula = form2, family = binomial, data = train1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7033  -1.0783  -0.2785   1.1782   2.7732  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.94152    0.34909 -11.291  < 2e-16 ***
## Gendermale        0.51475    0.12882   3.996 6.45e-05 ***
## DepressedSeveral  0.72179    0.15015   4.807 1.53e-06 ***
## DepressedMost     0.91592    0.24315   3.767 0.000165 ***
## AlcoholDay        0.03929    0.01872   2.099 0.035833 *  
## MarijuanaYes      3.11054    0.33172   9.377  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1810.3  on 1362  degrees of freedom
## Residual deviance: 1555.5  on 1357  degrees of freedom
## AIC: 1567.5
## 
## Number of Fisher Scoring iterations: 5

2) The most significant variable is Marijuana because it has the smallest p-value.

3) Marijuana has a negative relationship with hard drugs so it is weak.

4) Males are more likely with 51%.

library(rpart)
## Warning: package 'rpart' was built under R version 4.2.2
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.2
mod_tree <- rpart(form2,data=train1)
rpart.plot(mod_tree)

5) Depression seems to be the most important factor.

6) Males who are depressed.

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
data2<- NHANES %>%
  select(Gender,Age,Race1,Education,MaritalStatus,HHIncomeMid,Poverty,HomeOwn,Weight,Height,
         BMI,Pulse,BPSysAve,BPDiaAve,Diabetes,HealthGen,DaysPhysHlthBad,DaysMentHlthBad,
         Depressed,SleepHrsNight,SleepTrouble,AlcoholDay,Smoke100,Marijuana,HardDrugs) %>% 
  drop_na()
train2 <- data2 %>% sample_frac(size = 0.8, fac=HardDrugs)
test2 <- data2 %>% setdiff(train2)
library(glmnet)
form_full<- 
as.formula(HardDrugs~Gender+Age+Race1+Education+MaritalStatus+HHIncomeMid+Poverty
           +HomeOwn+Weight+Height+BMI+Pulse+BPSysAve+BPDiaAve+Diabetes+HealthGen+DaysPhysHlthBad+
             DaysMentHlthBad+Depressed+SleepHrsNight+SleepTrouble+AlcoholDay+Smoke100+Marijuana)
predictors <- model.matrix(form_full, data = train2) 
cv.fit <- cv.glmnet(predictors, train2$HardDrugs, family = "binomial", type = "class")
cv.fit$lambda.1se
## [1] 0.02034798
## [1] 0.02034798
plot(cv.fit)

lambda_opt=cv.fit$lambda.1se
mod_lr2 <- glmnet(predictors, train2$HardDrugs, family = "binomial", lambda = lambda_opt)
mod_lr2$beta
## 40 x 1 sparse Matrix of class "dgCMatrix"
##                                     s0
## (Intercept)                .          
## Gendermale                 .          
## Age                        0.018239709
## Race1Hispanic              .          
## Race1Mexican               .          
## Race1White                 .          
## Race1Other                 .          
## Education9 - 11th Grade    .          
## EducationHigh School       .          
## EducationSome College      .          
## EducationCollege Grad     -0.242664359
## MaritalStatusLivePartner   .          
## MaritalStatusMarried       .          
## MaritalStatusNeverMarried  .          
## MaritalStatusSeparated     .          
## MaritalStatusWidowed       .          
## HHIncomeMid                .          
## Poverty                    .          
## HomeOwnRent                .          
## HomeOwnOther               .          
## Weight                     .          
## Height                     0.006936587
## BMI                        .          
## Pulse                      .          
## BPSysAve                   .          
## BPDiaAve                   .          
## DiabetesYes                .          
## HealthGenVgood             .          
## HealthGenGood              .          
## HealthGenFair              .          
## HealthGenPoor              .          
## DaysPhysHlthBad            .          
## DaysMentHlthBad            .          
## DepressedSeveral           0.274178338
## DepressedMost              0.392290332
## SleepHrsNight              .          
## SleepTroubleYes            0.089592858
## AlcoholDay                 0.056135046
## Smoke100Yes                0.935530210
## MarijuanaYes               1.815769760
mod_tree2<- rpart(form_full,data=train2)
rpart.plot(mod_tree2)

confusion_matrix <- function(data,y,mod){
  confusion_matrix <- data %>% 
  mutate(pred = predict(mod, newdata = data, type = "class"),
         y=y) %>%
  select(y,pred) %>% table()
}
misclass <- function(confusion){
  misclass <- 1- sum(diag(confusion))/sum(confusion)
  return(misclass)
}
logistic.misclassrate <- function(dataset, y, fit, form){
  misclass_lr <- dataset %>% 
  mutate(pred.logistic = predict(fit, newx = model.matrix(form, data = dataset), 
         type = "class")) %>% 
  mutate(misclassify = ifelse(y != pred.logistic, 1,0)) %>%
  summarize(misclass.rate = mean(misclassify))
  return(misclass_lr$misclass.rate)
}
logistic.misclassrate(test2, test2$HardDrugs, mod_lr2, form_full)
## [1] 0.1712062