library(NHANES)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-6
dim(NHANES)
## [1] 10000 76
names(NHANES)
## [1] "ID" "SurveyYr" "Gender" "Age"
## [5] "AgeDecade" "AgeMonths" "Race1" "Race3"
## [9] "Education" "MaritalStatus" "HHIncome" "HHIncomeMid"
## [13] "Poverty" "HomeRooms" "HomeOwn" "Work"
## [17] "Weight" "Length" "HeadCirc" "Height"
## [21] "BMI" "BMICatUnder20yrs" "BMI_WHO" "Pulse"
## [25] "BPSysAve" "BPDiaAve" "BPSys1" "BPDia1"
## [29] "BPSys2" "BPDia2" "BPSys3" "BPDia3"
## [33] "Testosterone" "DirectChol" "TotChol" "UrineVol1"
## [37] "UrineFlow1" "UrineVol2" "UrineFlow2" "Diabetes"
## [41] "DiabetesAge" "HealthGen" "DaysPhysHlthBad" "DaysMentHlthBad"
## [45] "LittleInterest" "Depressed" "nPregnancies" "nBabies"
## [49] "Age1stBaby" "SleepHrsNight" "SleepTrouble" "PhysActive"
## [53] "PhysActiveDays" "TVHrsDay" "CompHrsDay" "TVHrsDayChild"
## [57] "CompHrsDayChild" "Alcohol12PlusYr" "AlcoholDay" "AlcoholYear"
## [61] "SmokeNow" "Smoke100" "Smoke100n" "SmokeAge"
## [65] "Marijuana" "AgeFirstMarij" "RegularMarij" "AgeRegMarij"
## [69] "HardDrugs" "SexEver" "SexAge" "SexNumPartnLife"
## [73] "SexNumPartYear" "SameSex" "SexOrientation" "PregnantNow"
data2 <- NHANES %>%
  select(Gender, Age, Race1, Education, MaritalStatus, HHIncomeMid, Poverty, HomeOwn,
         Weight, Height, BMI, Pulse, BPSysAve, BPDiaAve, Diabetes, HealthGen,
         DaysPhysHlthBad, DaysMentHlthBad, Depressed, SleepHrsNight, SleepTrouble,
         AlcoholDay, Smoke100, Marijuana, HardDrugs) %>%
  drop_na()
dim(data2)
## [1] 3407 25
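drop_na() removes every row with a missing value in any of the 25 selected columns, which shrinks the data from 10,000 rows to 3,407. A quick way to see which variables drive the row loss (a sketch, not part of the original analysis):

# Count missing values per selected column before drop_na().
NHANES %>%
  select(all_of(names(data2))) %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  arrange(desc(n_missing))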
glimpse(data2)
## Rows: 3,407
## Columns: 25
## $ Gender <fct> female, female, female, female, male, male, male, fema…
## $ Age <int> 49, 45, 45, 45, 58, 54, 33, 56, 56, 54, 54, 36, 26, 51…
## $ Race1 <fct> White, White, White, White, White, White, White, White…
## $ Education <fct> Some College, College Grad, College Grad, College Grad…
## $ MaritalStatus <fct> LivePartner, Married, Married, Married, Divorced, Marr…
## $ HHIncomeMid <int> 40000, 87500, 87500, 87500, 100000, 70000, 30000, 8750…
## $ Poverty <dbl> 1.91, 5.00, 5.00, 5.00, 5.00, 2.20, 1.27, 5.00, 5.00, …
## $ HomeOwn <fct> Rent, Own, Own, Own, Rent, Rent, Own, Own, Own, Rent, …
## $ Weight <dbl> 86.7, 75.7, 75.7, 75.7, 78.4, 74.7, 93.8, 57.5, 57.5, …
## $ Height <dbl> 168.4, 166.7, 166.7, 166.7, 181.9, 169.4, 181.3, 170.7…
## $ BMI <dbl> 30.57, 27.24, 27.24, 27.24, 23.69, 26.03, 28.54, 19.73…
## $ Pulse <int> 86, 62, 62, 62, 62, 76, 96, 64, 64, 64, 64, 68, 94, 10…
## $ BPSysAve <int> 112, 118, 118, 118, 104, 134, 128, 95, 95, 90, 90, 117…
## $ BPDiaAve <int> 75, 64, 64, 64, 74, 85, 74, 69, 69, 41, 41, 80, 77, 67…
## $ Diabetes <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ HealthGen <fct> Good, Vgood, Vgood, Vgood, Vgood, Fair, Fair, Good, Go…
## $ DaysPhysHlthBad <int> 0, 0, 0, 0, 0, 4, 3, 3, 3, 0, 0, 0, 0, 0, 0, 30, 0, 0,…
## $ DaysMentHlthBad <int> 10, 3, 3, 3, 0, 0, 7, 0, 0, 0, 0, 0, 2, 25, 25, 10, 0,…
## $ Depressed <fct> Several, None, None, None, None, None, None, None, Non…
## $ SleepHrsNight <int> 8, 8, 8, 8, 5, 4, 6, 7, 7, 6, 6, 6, 7, 7, 7, 6, 6, 6, …
## $ SleepTrouble <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, Yes, No, No…
## $ AlcoholDay <int> 2, 3, 3, 3, 2, 6, 3, 1, 1, 2, 2, 12, 4, 1, 1, 3, 2, 1,…
## $ Smoke100 <fct> Yes, No, No, No, No, No, Yes, No, No, No, No, No, No, …
## $ Marijuana <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, No, No, Yes,…
## $ HardDrugs <fct> Yes, No, No, No, Yes, Yes, No, No, No, No, No, Yes, No…
set.seed(100)
# sample_frac() has no 'fac' argument, so it is removed; this is a plain (unstratified) random 80% sample
train2 <- data2 %>% sample_frac(size = 0.8)
test2 <- data2 %>% setdiff(train2)
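Two caveats about this split: sample_frac() does not stratify, and setdiff() keeps only distinct rows, so with the duplicated rows present in NHANES the test set ends up with 285 observations here rather than roughly 20% of 3,407. If a split balanced on HardDrugs that also preserves duplicates is wanted, one option is a sketch like the following (row_id is a helper column added for illustration, not part of the original code):

# Stratified 80/20 split on HardDrugs; a row id keeps duplicate rows distinct.
data2_id <- data2 %>% mutate(row_id = row_number())
train2_strat <- data2_id %>% group_by(HardDrugs) %>% sample_frac(size = 0.8) %>% ungroup()
test2_strat <- data2_id %>% anti_join(train2_strat, by = "row_id")
# Drop the helper column before modelling: select(-row_id).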
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
form_full <- as.formula(
  HardDrugs ~ Gender + Age + Race1 + Education + MaritalStatus + HHIncomeMid +
    Poverty + HomeOwn + Weight + Height + BMI + Pulse + BPSysAve + BPDiaAve +
    Diabetes + HealthGen + DaysPhysHlthBad + DaysMentHlthBad + Depressed +
    SleepHrsNight + SleepTrouble + AlcoholDay + Smoke100 + Marijuana
)
mod_rf<- randomForest(form_full,train2,ntree=1000)
mod_rf
##
## Call:
## randomForest(formula = form_full, data = train2, ntree = 1000)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 8.69%
## Confusion matrix:
## No Yes class.error
## No 2095 39 0.01827554
## Yes 198 394 0.33445946
importance(mod_rf) %>%
as.data.frame() %>%
rownames_to_column() %>%
arrange(desc(MeanDecreaseGini))
## rowname MeanDecreaseGini
## 1 Marijuana 75.683306
## 2 Height 67.783562
## 3 Smoke100 67.274402
## 4 Age 60.989990
## 5 Weight 59.315992
## 6 BPDiaAve 57.610172
## 7 BPSysAve 57.081268
## 8 BMI 56.827216
## 9 AlcoholDay 48.058360
## 10 Pulse 47.771050
## 11 Poverty 47.203125
## 12 MaritalStatus 34.324542
## 13 Education 33.913065
## 14 HHIncomeMid 32.774399
## 15 DaysMentHlthBad 29.275513
## 16 SleepHrsNight 25.705263
## 17 HealthGen 25.234292
## 18 DaysPhysHlthBad 23.672735
## 19 Race1 22.259317
## 20 Depressed 16.640597
## 21 HomeOwn 11.912374
## 22 SleepTrouble 10.206039
## 23 Gender 8.196673
## 24 Diabetes 4.565172
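The same ranking can be drawn directly with randomForest's built-in importance plot (a sketch; n.var and the title are illustrative choices):

# Plot the Gini-based importance of the ten strongest predictors.
varImpPlot(mod_rf, n.var = 10, main = "Variable importance for HardDrugs")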
confusion_matrix <- function(data, y, mod){
  # Cross-tabulate true labels against class predictions from a fitted model.
  data %>%
    mutate(pred = predict(mod, newdata = data, type = "class"),
           y = y) %>%
    select(y, pred) %>%
    table()
}
misclass <- function(confusion){
  # Misclassification rate: 1 minus the share of counts on the diagonal.
  1 - sum(diag(confusion)) / sum(confusion)
}
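As a quick check, the two helpers can be chained on the forest fit above (a sketch; predict.randomForest accepts type = "class" as an alias for "response"):

# Confusion matrix and misclassification rate for mod_rf on the test set.
conf_rf_check <- confusion_matrix(test2, test2$HardDrugs, mod_rf)
misclass(conf_rf_check)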
logistic.misclassrate <- function(dataset, y, fit, form){
  # Misclassification rate of a glmnet fit, predicting from the model matrix.
  misclass_lr <- dataset %>%
    mutate(pred.logistic = predict(fit, newx = model.matrix(form, data = dataset),
                                   type = "class")) %>%
    mutate(misclassify = ifelse(y != pred.logistic, 1, 0)) %>%
    summarize(misclass.rate = mean(misclassify))
  return(misclass_lr$misclass.rate)
}
# mod_rf predicts HardDrugs, so compare its predictions with test2$HardDrugs.
pred_rf <- predict(mod_rf, newdata = test2)
confusion_rf <- table(pred_rf, test2$HardDrugs); confusion_rf
misclass_test_rf <- 1 - sum(diag(confusion_rf)) / sum(confusion_rf)
misclass_test_rf
library(e1071)
mod_nb <- naiveBayes(Gender~.,data=train2)
gender_nb <- predict(mod_nb, newdata = test2)
confusion_nb <- table(gender_nb, test2$Gender);confusion_nb
##
## gender_nb female male
## female 109 27
## male 12 137
missclass_test_nb <- 1-sum(diag(confusion_nb))/sum(confusion_nb)
missclass_test_nb
## [1] 0.1368421
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
confusionMatrix(table(gender_nb, test2$Gender))
## Confusion Matrix and Statistics
##
##
## gender_nb female male
## female 109 27
## male 12 137
##
## Accuracy : 0.8632
## 95% CI : (0.8177, 0.9008)
## No Information Rate : 0.5754
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7244
##
## Mcnemar's Test P-Value : 0.02497
##
## Sensitivity : 0.9008
## Specificity : 0.8354
## Pos Pred Value : 0.8015
## Neg Pred Value : 0.9195
## Prevalence : 0.4246
## Detection Rate : 0.3825
## Detection Prevalence : 0.4772
## Balanced Accuracy : 0.8681
##
## 'Positive' Class : female
##
harddrugs_rf <- predict(mod_rf, newdata = test2)
confusionMatrix(table(harddrugs_rf, test2$HardDrugs))
## Confusion Matrix and Statistics
##
##
## harddrugs_rf No Yes
## No 222 34
## Yes 19 10
##
## Accuracy : 0.814
## 95% CI : (0.7639, 0.8575)
## No Information Rate : 0.8456
## P-Value [Acc > NIR] : 0.93744
##
## Kappa : 0.1725
##
## Mcnemar's Test P-Value : 0.05447
##
## Sensitivity : 0.9212
## Specificity : 0.2273
## Pos Pred Value : 0.8672
## Neg Pred Value : 0.3448
## Prevalence : 0.8456
## Detection Rate : 0.7789
## Detection Prevalence : 0.8982
## Balanced Accuracy : 0.5742
##
## 'Positive' Class : No
##
predictors_1 <- model.matrix(form_full, data = train2)
cv.fit_rf <- cv.glmnet(predictors_1, train2$HardDrugs, family = "binomial", type.measure = "class")
lambda_opt_rf <- cv.fit_rf$lambda.1se
mod_lr_rf <- glmnet(predictors_1, train2$HardDrugs, family = "binomial", lambda = lambda_opt_rf)
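plot() on the cv.glmnet object is a useful diagnostic at this point, showing the cross-validated misclassification error across the lambda path with lambda.min and lambda.1se marked (a sketch):

# CV error versus log(lambda); the right dashed line marks lambda.1se used above.
plot(cv.fit_rf)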
y_lr_rf <- predict(mod_lr_rf, newx = model.matrix(form_full, data = test2), type = "class")
confusion_lr_rf = table(test2$HardDrugs, y_lr_rf)
confusion_lr_rf
## y_lr_rf
## No Yes
## No 238 3
## Yes 37 7
tpr_lr_rf = confusion_lr_rf[2,2]/sum(confusion_lr_rf[2,]); tpr_lr_rf
## [1] 0.1590909
tnr_lr_rf = confusion_lr_rf[1,1]/sum(confusion_lr_rf[1,]); tnr_lr_rf
## [1] 0.9875519
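The logistic.misclassrate() helper defined earlier condenses the same comparison into a single overall rate (a sketch reusing that helper):

# Overall test-set misclassification rate of the penalized logistic model.
logistic.misclassrate(test2, test2$HardDrugs, mod_lr_rf, form_full)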
library(ROCR)
roc_data <- function(test, y_test, model, type){
  # Predicted class probabilities, converted to TPR/FPR across all thresholds.
  prob <- model %>%
    predict(newdata = test, type = type) %>%
    as.data.frame()
  pred_prob <- prediction(prob[, 2], y_test)  # column 2 holds P(Yes)
  perf <- performance(pred_prob, 'tpr', 'fpr')
  perf_df <- data.frame(perf@x.values, perf@y.values)
  names(perf_df) <- c('fpr', 'tpr')
  return(perf_df)
}
point_data <- function(test, y_test, model, type){
  # (FPR, TPR) of the hard class predictions, for marking a point on the curve.
  y_pred <- predict(model, newdata = test, type = type)
  confusion_matrix <- table(y_test, y_pred)
  tpr <- confusion_matrix['Yes', 'Yes'] / sum(confusion_matrix['Yes', ])
  fpr <- confusion_matrix['No', 'Yes'] / sum(confusion_matrix['No', ])
  return(c(fpr, tpr))
}
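ROCR can also summarize each curve by its area under the curve (AUC); a small helper in the same style as roc_data() (a sketch, not part of the original analysis):

# AUC via ROCR, reusing the probability pipeline from roc_data().
auc_value <- function(test, y_test, model, type){
  prob <- as.data.frame(predict(model, newdata = test, type = type))
  pred_prob <- prediction(prob[, 2], y_test)
  performance(pred_prob, "auc")@y.values[[1]]
}
# e.g. auc_value(test2, test2$HardDrugs, mod_rf, "prob")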
perf_df_rf = roc_data(test2, test2$HardDrugs, mod_rf, "prob")
point_rf = point_data(test2, test2$HardDrugs, mod_rf, "class")
ggplot(data = perf_df_rf, aes(x = fpr, y = tpr)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(x = point_rf[1], y = point_rf[2], size = 3, col = "red") +
  labs(x = 'False Positive Rate', y = 'True Positive Rate')

library(rpart)
library(rpart.plot)
set.seed(100)
mod_tree <- rpart(form_full,data=train2)
perf_df_dt = roc_data(test2, test2$HardDrugs, mod_tree, "prob")
point_dt = point_data(test2, test2$HardDrugs, mod_tree, "class")
ggplot(data = perf_df_dt, aes(x = fpr, y = tpr)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(x = point_dt[1], y = point_dt[2], size = 3, col = "red") +
  labs(x = 'False Positive Rate', y = 'True Positive Rate')

mod_nb_roc <- naiveBayes(form_full, data = train2)
# roc_data()/point_data() expect the fitted model itself, not predictions;
# naiveBayes returns class probabilities with type = "raw" and labels with type = "class".
perf_df_nb <- roc_data(test2, test2$HardDrugs, mod_nb_roc, "raw")
point_nb <- point_data(test2, test2$HardDrugs, mod_nb_roc, "class")
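With all three ROC data frames available, the curves can be overlaid for a direct comparison (a sketch; colors and labels are illustrative):

# Overlay the ROC curves of the three classifiers on one plot.
ggplot() +
  geom_line(data = perf_df_rf, aes(x = fpr, y = tpr, color = "random forest"), linewidth = 1) +
  geom_line(data = perf_df_dt, aes(x = fpr, y = tpr, color = "decision tree"), linewidth = 1) +
  geom_line(data = perf_df_nb, aes(x = fpr, y = tpr, color = "naive Bayes"), linewidth = 1) +
  labs(x = 'False Positive Rate', y = 'True Positive Rate', color = 'model')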
set.seed(100)
rfControl2 <- trainControl(method="repeatedcv", number=5, repeats=2)
print(rfControl2)
## $method
## [1] "repeatedcv"
##
## $number
## [1] 5
##
## $repeats
## [1] 2
##
## $search
## [1] "grid"
##
## $p
## [1] 0.75
##
## $initialWindow
## NULL
##
## $horizon
## [1] 1
##
## $fixedWindow
## [1] TRUE
##
## $skip
## [1] 0
##
## $verboseIter
## [1] FALSE
##
## $returnData
## [1] TRUE
##
## $returnResamp
## [1] "final"
##
## $savePredictions
## [1] FALSE
##
## $classProbs
## [1] FALSE
##
## $summaryFunction
## function (data, lev = NULL, model = NULL)
## {
## if (is.character(data$obs))
## data$obs <- factor(data$obs, levels = lev)
## postResample(data[, "pred"], data[, "obs"])
## }
## <bytecode: 0x00000110c0a63d08>
## <environment: namespace:caret>
##
## $selectionFunction
## [1] "best"
##
## $preProcOptions
## $preProcOptions$thresh
## [1] 0.95
##
## $preProcOptions$ICAcomp
## [1] 3
##
## $preProcOptions$k
## [1] 5
##
## $preProcOptions$freqCut
## [1] 19
##
## $preProcOptions$uniqueCut
## [1] 10
##
## $preProcOptions$cutoff
## [1] 0.9
##
##
## $sampling
## NULL
##
## $index
## NULL
##
## $indexOut
## NULL
##
## $indexFinal
## NULL
##
## $timingSamps
## [1] 0
##
## $predictionBounds
## [1] FALSE FALSE
##
## $seeds
## [1] NA
##
## $adaptive
## $adaptive$min
## [1] 5
##
## $adaptive$alpha
## [1] 0.05
##
## $adaptive$method
## [1] "gls"
##
## $adaptive$complete
## [1] TRUE
##
##
## $trim
## [1] FALSE
##
## $allowParallel
## [1] TRUE
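Note that rfControl2 is defined and printed but never handed to train(); a call along these lines (a sketch, with an illustrative tuning grid mirroring the manual mtry values below) would let caret choose mtry by repeated cross-validation:

# Hypothetical caret tuning run; not executed in this analysis.
rf_tuned <- train(form_full, data = train2, method = "rf",
                  trControl = rfControl2,
                  tuneGrid = expand.grid(mtry = seq(2, 20, by = 2)),
                  ntree = 1000)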
mod_rf2<- randomForest(form_full,train2,ntree=1000,mtry=2)
mod_rf4<- randomForest(form_full,train2,ntree=1000,mtry=4)
mod_rf6<- randomForest(form_full,train2,ntree=1000,mtry=6)
mod_rf8<- randomForest(form_full,train2,ntree=1000,mtry=8)
mod_rf10<- randomForest(form_full,train2,ntree=1000,mtry=10)
mod_rf12<- randomForest(form_full,train2,ntree=1000,mtry=12)
mod_rf14<- randomForest(form_full,train2,ntree=1000,mtry=14)
mod_rf16<- randomForest(form_full,train2,ntree=1000,mtry=16)
mod_rf18<- randomForest(form_full,train2,ntree=1000,mtry=18)
mod_rf20<- randomForest(form_full,train2,ntree=1000,mtry=20)
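The ten separate fits above can be written more compactly, collecting the final out-of-bag error of each forest for comparison (a sketch producing equivalent models):

# Fit one forest per mtry value and extract each final OOB error rate.
mtry_grid <- seq(2, 20, by = 2)
rf_fits <- lapply(mtry_grid, function(m)
  randomForest(form_full, data = train2, ntree = 1000, mtry = m))
oob_err <- sapply(rf_fits, function(fit) tail(fit$err.rate[, "OOB"], 1))
data.frame(mtry = mtry_grid, oob_error = oob_err)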
mod_rf10
##
## Call:
## randomForest(formula = form_full, data = train2, ntree = 1000, mtry = 10)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 10
##
## OOB estimate of error rate: 8.84%
## Confusion matrix:
## No Yes class.error
## No 2072 62 0.02905342
## Yes 179 413 0.30236486
# Again, compare the HardDrugs predictions with test2$HardDrugs.
pred_rf10 <- predict(mod_rf10, newdata = test2)
confusion_rf10 <- table(pred_rf10, test2$HardDrugs); confusion_rf10
misclass_test_rf10 <- 1 - sum(diag(confusion_rf10)) / sum(confusion_rf10)
misclass_test_rf10
print(mod_rf6)
##
## Call:
## randomForest(formula = form_full, data = train2, ntree = 1000, mtry = 6)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 9.13%
## Confusion matrix:
## No Yes class.error
## No 2081 53 0.02483599
## Yes 196 396 0.33108108
pred_rf6 <- predict(mod_rf6, newdata = test2)
confusion_rf6 <- table(pred_rf6, test2$HardDrugs); confusion_rf6
misclass_test_rf6 <- 1 - sum(diag(confusion_rf6)) / sum(confusion_rf6)
misclass_test_rf6
As models become more complex, they are more likely to overfit the training data. That is visible here: the out-of-bag error is lowest at the default mtry = 4 (8.69%) and higher at mtry = 6 (9.13%) and mtry = 10 (8.84%), so letting each tree consider more predictors per split does not improve the forest.