library(NHANES)
## Warning: package 'NHANES' was built under R version 4.2.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data1 <- NHANES %>%
  select(HardDrugs, Gender, HealthGen, Depressed, SleepHrsNight, AlcoholDay, SmokeNow, Marijuana) %>%
  na.omit()
glimpse(data1)
## Rows: 1,704
## Columns: 8
## $ HardDrugs <fct> Yes, No, No, Yes, Yes, No, Yes, Yes, Yes, No, No, No, No…
## $ Gender <fct> female, male, female, male, male, male, male, female, fe…
## $ HealthGen <fct> Good, Fair, Excellent, Fair, Excellent, Excellent, Excel…
## $ Depressed <fct> Several, None, None, None, Several, None, None, None, No…
## $ SleepHrsNight <int> 8, 6, 8, 6, 6, 6, 8, 4, 4, 6, 6, 6, 6, 6, 5, 8, 8, 8, 8,…
## $ AlcoholDay <int> 2, 3, 1, 6, 3, 12, 5, 3, 3, 7, 7, 7, 7, 1, 4, 1, 1, 1, 1…
## $ SmokeNow <fct> Yes, No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes…
## $ Marijuana <fct> Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes…
# marginal proportions of marijuana use and hard drug use in the cleaned data
conTable <- table(data1$Marijuana)
conTable_2 <- table(data1$HardDrugs)
conTable_1 <- prop.table(conTable)
conTable_1
##
## No Yes
## 0.1866197 0.8133803
conTable_3 <- prop.table(conTable_2)
conTable_3
##
## No Yes
## 0.6238263 0.3761737
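Beyond these margins, the same tools give a two-way table relating the two variables; a minimal sketch using data1:
# cross-tabulation: hard drug use within each marijuana-use group
joint_tab <- table(data1$Marijuana, data1$HardDrugs)
prop.table(joint_tab, margin = 1)  # row proportions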
set.seed(99)
# 80/20 train/test split; the test set is the rows not sampled into train1
train1 <- data1 %>% sample_frac(size = 0.8)
test1 <- data1 %>% setdiff(train1)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.2
## Loading required package: Matrix
## Loaded glmnet 4.1-4
form <- as.formula(
  "HardDrugs ~ Gender + HealthGen + Depressed + SleepHrsNight + AlcoholDay + SmokeNow + Marijuana")
predictors <- model.matrix(form, data = train1)
cv.fit <- cv.glmnet(predictors, train1$HardDrugs, family = "binomial", type.measure = "class")
cv.fit$lambda.1se
## [1] 0.02558189
plot(cv.fit)
[Figure: cross-validation error curve from plot(cv.fit)]
lambda_opt <- cv.fit$lambda.1se  # largest lambda within one SE of the minimum CV error
mod_lr <- glmnet(predictors, train1$HardDrugs, family = "binomial", lambda = lambda_opt)
mod_lr$beta
## 12 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) .
## Gendermale 0.19515911
## HealthGenVgood .
## HealthGenGood .
## HealthGenFair .
## HealthGenPoor .
## DepressedSeveral 0.35997057
## DepressedMost 0.25541222
## SleepHrsNight .
## AlcoholDay 0.01346756
## SmokeNowYes .
## MarijuanaYes 2.06711656
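The predictors the lasso keeps at this lambda (Gendermale, DepressedSeveral, DepressedMost, AlcoholDay, MarijuanaYes) motivate the reduced formula below; a small sketch of pulling them out programmatically:
# names of the coefficients the lasso did not shrink to zero
beta_hat <- as.matrix(mod_lr$beta)
rownames(beta_hat)[beta_hat[, 1] != 0]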
form2 <- as.formula(HardDrugs ~ Gender + Depressed + AlcoholDay + Marijuana)
mod_lr2 <- glm(form2, data = train1, family = binomial)
summary(mod_lr2)
##
## Call:
## glm(formula = form2, family = binomial, data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7033 -1.0783 -0.2785 1.1782 2.7732
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.94152 0.34909 -11.291 < 2e-16 ***
## Gendermale 0.51475 0.12882 3.996 6.45e-05 ***
## DepressedSeveral 0.72179 0.15015 4.807 1.53e-06 ***
## DepressedMost 0.91592 0.24315 3.767 0.000165 ***
## AlcoholDay 0.03929 0.01872 2.099 0.035833 *
## MarijuanaYes 3.11054 0.33172 9.377 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1810.3 on 1362 degrees of freedom
## Residual deviance: 1555.5 on 1357 degrees of freedom
## AIC: 1567.5
##
## Number of Fisher Scoring iterations: 5
2) The most significant predictor is Marijuana: it has the smallest
p-value (< 2e-16) and the largest z value among the predictors.
3) MarijuanaYes has a large positive coefficient (3.11), so prior
marijuana use is associated with substantially higher odds of hard drug
use; the relationship is strong, not weak or negative.
4) Being male is associated with higher risk: the Gendermale coefficient
of 0.51 on the log-odds scale corresponds to about 1.67 times the odds
of hard drug use, holding the other predictors constant.
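These coefficients are easier to read as odds ratios; a minimal sketch using the fitted mod_lr2 from above:
# convert the log-odds coefficients of the reduced glm to odds ratios
exp(coef(mod_lr2))
# for example, exp(3.11) is roughly 22, so marijuana users have about 22
# times the odds of reporting hard drug use under this model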
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.2
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.2
mod_tree <- rpart(form2, data = train1)
rpart.plot(mod_tree)
[Figure: decision tree from rpart.plot(mod_tree)]
5) Depression seems to be the most important factor in the tree.
6) Males who are depressed appear to be at the highest risk.
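One way to check which variables the tree actually relies on is rpart's variable importance scores (a sketch; exact values depend on the random training split):
# importance accumulated over the tree's primary and surrogate splits
mod_tree$variable.importance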
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
data2 <- NHANES %>%
  select(Gender, Age, Race1, Education, MaritalStatus, HHIncomeMid, Poverty, HomeOwn, Weight, Height,
         BMI, Pulse, BPSysAve, BPDiaAve, Diabetes, HealthGen, DaysPhysHlthBad, DaysMentHlthBad,
         Depressed, SleepHrsNight, SleepTrouble, AlcoholDay, Smoke100, Marijuana, HardDrugs) %>%
  drop_na()
# 80/20 train/test split (not stratified; sample_frac() draws rows at random)
train2 <- data2 %>% sample_frac(size = 0.8)
test2 <- data2 %>% setdiff(train2)
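A quick sanity check on this split is that the response is similarly balanced in the two sets (a sketch; the numbers vary with the random draw):
# HardDrugs class balance in the training and test sets
prop.table(table(train2$HardDrugs))
prop.table(table(test2$HardDrugs))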
library(glmnet)
form_full <-
  as.formula(HardDrugs ~ Gender + Age + Race1 + Education + MaritalStatus + HHIncomeMid + Poverty
             + HomeOwn + Weight + Height + BMI + Pulse + BPSysAve + BPDiaAve + Diabetes + HealthGen +
             DaysPhysHlthBad + DaysMentHlthBad + Depressed + SleepHrsNight + SleepTrouble +
             AlcoholDay + Smoke100 + Marijuana)
predictors <- model.matrix(form_full, data = train2)
cv.fit <- cv.glmnet(predictors, train2$HardDrugs, family = "binomial", type.measure = "class")
cv.fit$lambda.1se
## [1] 0.02034798
plot(cv.fit)
[Figure: cross-validation error curve from plot(cv.fit) for the full predictor set]
lambda_opt <- cv.fit$lambda.1se
# note: the name mod_lr2 is reused here for the lasso fit on the full predictor set
mod_lr2 <- glmnet(predictors, train2$HardDrugs, family = "binomial", lambda = lambda_opt)
mod_lr2$beta
## 40 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) .
## Gendermale .
## Age 0.018239709
## Race1Hispanic .
## Race1Mexican .
## Race1White .
## Race1Other .
## Education9 - 11th Grade .
## EducationHigh School .
## EducationSome College .
## EducationCollege Grad -0.242664359
## MaritalStatusLivePartner .
## MaritalStatusMarried .
## MaritalStatusNeverMarried .
## MaritalStatusSeparated .
## MaritalStatusWidowed .
## HHIncomeMid .
## Poverty .
## HomeOwnRent .
## HomeOwnOther .
## Weight .
## Height 0.006936587
## BMI .
## Pulse .
## BPSysAve .
## BPDiaAve .
## DiabetesYes .
## HealthGenVgood .
## HealthGenGood .
## HealthGenFair .
## HealthGenPoor .
## DaysPhysHlthBad .
## DaysMentHlthBad .
## DepressedSeveral 0.274178338
## DepressedMost 0.392290332
## SleepHrsNight .
## SleepTroubleYes 0.089592858
## AlcoholDay 0.056135046
## Smoke100Yes 0.935530210
## MarijuanaYes 1.815769760
mod_tree2 <- rpart(form_full, data = train2)
rpart.plot(mod_tree2)
[Figure: decision tree from rpart.plot(mod_tree2)]
# build a confusion matrix of observed vs. predicted classes for a tree model
confusion_matrix <- function(data, y, mod){
  conf <- data %>%
    mutate(pred = predict(mod, newdata = data, type = "class"),
           y = y) %>%
    select(y, pred) %>% table()
  return(conf)
}
# misclassification rate = 1 - proportion of correct (diagonal) predictions
misclass <- function(confusion){
  misclass <- 1 - sum(diag(confusion)) / sum(confusion)
  return(misclass)
}
# misclassification rate for a glmnet fit: predict classes on new data and
# compare them to the observed response
logistic.misclassrate <- function(dataset, y, fit, form){
  misclass_lr <- dataset %>%
    mutate(pred.logistic = predict(fit, newx = model.matrix(form, data = dataset),
                                   type = "class")) %>%
    mutate(misclassify = ifelse(y != pred.logistic, 1, 0)) %>%
    summarize(misclass.rate = mean(misclassify))
  return(misclass_lr$misclass.rate)
}
logistic.misclassrate(test2, test2$HardDrugs, mod_lr2, form_full)
## [1] 0.1712062
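For comparison, the helper functions defined above can score the tree model on the same held-out data (a sketch; the resulting rate depends on the random train/test split):
# confusion matrix and misclassification rate for the full tree on test2
conf_tree <- confusion_matrix(test2, test2$HardDrugs, mod_tree2)
conf_tree
misclass(conf_tree)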