library(NHANES)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-6
dim(NHANES)
## [1] 10000    76
names(NHANES)
##  [1] "ID"               "SurveyYr"         "Gender"           "Age"             
##  [5] "AgeDecade"        "AgeMonths"        "Race1"            "Race3"           
##  [9] "Education"        "MaritalStatus"    "HHIncome"         "HHIncomeMid"     
## [13] "Poverty"          "HomeRooms"        "HomeOwn"          "Work"            
## [17] "Weight"           "Length"           "HeadCirc"         "Height"          
## [21] "BMI"              "BMICatUnder20yrs" "BMI_WHO"          "Pulse"           
## [25] "BPSysAve"         "BPDiaAve"         "BPSys1"           "BPDia1"          
## [29] "BPSys2"           "BPDia2"           "BPSys3"           "BPDia3"          
## [33] "Testosterone"     "DirectChol"       "TotChol"          "UrineVol1"       
## [37] "UrineFlow1"       "UrineVol2"        "UrineFlow2"       "Diabetes"        
## [41] "DiabetesAge"      "HealthGen"        "DaysPhysHlthBad"  "DaysMentHlthBad" 
## [45] "LittleInterest"   "Depressed"        "nPregnancies"     "nBabies"         
## [49] "Age1stBaby"       "SleepHrsNight"    "SleepTrouble"     "PhysActive"      
## [53] "PhysActiveDays"   "TVHrsDay"         "CompHrsDay"       "TVHrsDayChild"   
## [57] "CompHrsDayChild"  "Alcohol12PlusYr"  "AlcoholDay"       "AlcoholYear"     
## [61] "SmokeNow"         "Smoke100"         "Smoke100n"        "SmokeAge"        
## [65] "Marijuana"        "AgeFirstMarij"    "RegularMarij"     "AgeRegMarij"     
## [69] "HardDrugs"        "SexEver"          "SexAge"           "SexNumPartnLife" 
## [73] "SexNumPartYear"   "SameSex"          "SexOrientation"   "PregnantNow"
data2<- NHANES %>%
  select(Gender,Age,Race1,Education,MaritalStatus,HHIncomeMid,Poverty,HomeOwn,Weight,Height,
         BMI,Pulse,BPSysAve,BPDiaAve,Diabetes,HealthGen,DaysPhysHlthBad,DaysMentHlthBad,
         Depressed,SleepHrsNight,SleepTrouble,AlcoholDay,Smoke100,Marijuana,HardDrugs) %>% 
  drop_na()
dim(data2)
## [1] 3407   25
glimpse(data2)
## Rows: 3,407
## Columns: 25
## $ Gender          <fct> female, female, female, female, male, male, male, fema…
## $ Age             <int> 49, 45, 45, 45, 58, 54, 33, 56, 56, 54, 54, 36, 26, 51…
## $ Race1           <fct> White, White, White, White, White, White, White, White…
## $ Education       <fct> Some College, College Grad, College Grad, College Grad…
## $ MaritalStatus   <fct> LivePartner, Married, Married, Married, Divorced, Marr…
## $ HHIncomeMid     <int> 40000, 87500, 87500, 87500, 100000, 70000, 30000, 8750…
## $ Poverty         <dbl> 1.91, 5.00, 5.00, 5.00, 5.00, 2.20, 1.27, 5.00, 5.00, …
## $ HomeOwn         <fct> Rent, Own, Own, Own, Rent, Rent, Own, Own, Own, Rent, …
## $ Weight          <dbl> 86.7, 75.7, 75.7, 75.7, 78.4, 74.7, 93.8, 57.5, 57.5, …
## $ Height          <dbl> 168.4, 166.7, 166.7, 166.7, 181.9, 169.4, 181.3, 170.7…
## $ BMI             <dbl> 30.57, 27.24, 27.24, 27.24, 23.69, 26.03, 28.54, 19.73…
## $ Pulse           <int> 86, 62, 62, 62, 62, 76, 96, 64, 64, 64, 64, 68, 94, 10…
## $ BPSysAve        <int> 112, 118, 118, 118, 104, 134, 128, 95, 95, 90, 90, 117…
## $ BPDiaAve        <int> 75, 64, 64, 64, 74, 85, 74, 69, 69, 41, 41, 80, 77, 67…
## $ Diabetes        <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ HealthGen       <fct> Good, Vgood, Vgood, Vgood, Vgood, Fair, Fair, Good, Go…
## $ DaysPhysHlthBad <int> 0, 0, 0, 0, 0, 4, 3, 3, 3, 0, 0, 0, 0, 0, 0, 30, 0, 0,…
## $ DaysMentHlthBad <int> 10, 3, 3, 3, 0, 0, 7, 0, 0, 0, 0, 0, 2, 25, 25, 10, 0,…
## $ Depressed       <fct> Several, None, None, None, None, None, None, None, Non…
## $ SleepHrsNight   <int> 8, 8, 8, 8, 5, 4, 6, 7, 7, 6, 6, 6, 7, 7, 7, 6, 6, 6, …
## $ SleepTrouble    <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, Yes, No, No…
## $ AlcoholDay      <int> 2, 3, 3, 3, 2, 6, 3, 1, 1, 2, 2, 12, 4, 1, 1, 3, 2, 1,…
## $ Smoke100        <fct> Yes, No, No, No, No, No, Yes, No, No, No, No, No, No, …
## $ Marijuana       <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, No, No, Yes,…
## $ HardDrugs       <fct> Yes, No, No, No, Yes, Yes, No, No, No, No, No, Yes, No…
set.seed(100)
train2 <- data2 %>% sample_frac(size = 0.8)
test2 <- data2 %>% setdiff(train2)
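sample_frac() draws a simple random sample, so the split above is not stratified; if a split balanced on HardDrugs is wanted, one option (a sketch, not the split the results below are based on) is to sample within groups:

# Sketch: stratified 80/20 split on HardDrugs -- not used for the results below
train_strat <- data2 %>% 
  group_by(HardDrugs) %>% 
  sample_frac(size = 0.8) %>% 
  ungroup()
test_strat <- data2 %>% setdiff(train_strat)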
library(rpart)
library(rpart.plot)
form_full <- as.formula(
  HardDrugs ~ Gender + Age + Race1 + Education + MaritalStatus + HHIncomeMid +
    Poverty + HomeOwn + Weight + Height + BMI + Pulse + BPSysAve + BPDiaAve +
    Diabetes + HealthGen + DaysPhysHlthBad + DaysMentHlthBad + Depressed +
    SleepHrsNight + SleepTrouble + AlcoholDay + Smoke100 + Marijuana
)

mod_tree <- rpart(form_full,data=train2)
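rpart.plot is loaded but the fitted tree is never drawn; as a quick visual check (output not shown), it can be plotted with:

rpart.plot(mod_tree)  # draw the fitted classification tree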
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin

mod_rf<- randomForest(form_full,train2,ntree=1000)
mod_rf
## 
## Call:
##  randomForest(formula = form_full, data = train2, ntree = 1000) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 8.69%
## Confusion matrix:
##       No Yes class.error
## No  2095  39  0.01827554
## Yes  198 394  0.33445946
library(tibble)
importance(mod_rf) %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  arrange(desc(MeanDecreaseGini))
##            rowname MeanDecreaseGini
## 1        Marijuana        75.683306
## 2           Height        67.783562
## 3         Smoke100        67.274402
## 4              Age        60.989990
## 5           Weight        59.315992
## 6         BPDiaAve        57.610172
## 7         BPSysAve        57.081268
## 8              BMI        56.827216
## 9       AlcoholDay        48.058360
## 10           Pulse        47.771050
## 11         Poverty        47.203125
## 12   MaritalStatus        34.324542
## 13       Education        33.913065
## 14     HHIncomeMid        32.774399
## 15 DaysMentHlthBad        29.275513
## 16   SleepHrsNight        25.705263
## 17       HealthGen        25.234292
## 18 DaysPhysHlthBad        23.672735
## 19           Race1        22.259317
## 20       Depressed        16.640597
## 21         HomeOwn        11.912374
## 22    SleepTrouble        10.206039
## 23          Gender         8.196673
## 24        Diabetes         4.565172
confusion_matrix <- function(data, y, mod){
  data %>% 
    mutate(pred = predict(mod, newdata = data, type = "class"),
           y = y) %>%
    select(y, pred) %>% 
    table()
}
misclass <- function(confusion){
  misclass <- 1- sum(diag(confusion))/sum(confusion)
  return(misclass)
}
logistic.misclassrate <- function(dataset, y, fit, form){
  misclass_lr <- dataset %>% 
  mutate(pred.logistic = predict(fit, newx = model.matrix(form, data = dataset), 
         type = "class")) %>% 
  mutate(misclassify = ifelse(y != pred.logistic, 1,0)) %>%
  summarize(misclass.rate = mean(misclassify))
  return(misclass_lr$misclass.rate)
}
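These helpers are defined but never called below; as a usage sketch, the tree's test-set error rate could be computed with them:

# Sketch: apply the helper functions above to the tree model on the test set
confusion_tree <- confusion_matrix(test2, test2$HardDrugs, mod_tree)
misclass(confusion_tree)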
pred_rf <- predict(mod_rf, newdata = test2)
# compare against the true HardDrugs labels (the response the forest models)
confusion_rf <- table(pred_rf, test2$HardDrugs); confusion_rf
misclass_test_rf <- 1-sum(diag(confusion_rf))/sum(confusion_rf)
misclass_test_rf
library(e1071)
mod_nb <- naiveBayes(Gender~.,data=train2)
gender_nb <- predict(mod_nb, newdata = test2)
confusion_nb <- table(gender_nb, test2$Gender);confusion_nb
##          
## gender_nb female male
##    female    109   27
##    male       12  137
missclass_test_nb <- 1-sum(diag(confusion_nb))/sum(confusion_nb)
missclass_test_nb
## [1] 0.1368421
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
confusionMatrix(table(gender_nb, test2$Gender))
## Confusion Matrix and Statistics
## 
##          
## gender_nb female male
##    female    109   27
##    male       12  137
##                                           
##                Accuracy : 0.8632          
##                  95% CI : (0.8177, 0.9008)
##     No Information Rate : 0.5754          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7244          
##                                           
##  Mcnemar's Test P-Value : 0.02497         
##                                           
##             Sensitivity : 0.9008          
##             Specificity : 0.8354          
##          Pos Pred Value : 0.8015          
##          Neg Pred Value : 0.9195          
##              Prevalence : 0.4246          
##          Detection Rate : 0.3825          
##    Detection Prevalence : 0.4772          
##       Balanced Accuracy : 0.8681          
##                                           
##        'Positive' Class : female          
## 
mod_rf<- randomForest(form_full,train2,ntree=1000)
pred_rf <- predict(mod_rf, newdata = test2)
confusionMatrix(table(pred_rf, test2$HardDrugs))
## Confusion Matrix and Statistics
## 
##        
## pred_rf  No Yes
##      No  222  34
##     Yes  19  10
##                                           
##                Accuracy : 0.814           
##                  95% CI : (0.7639, 0.8575)
##     No Information Rate : 0.8456          
##     P-Value [Acc > NIR] : 0.93744         
##                                           
##                   Kappa : 0.1725          
##                                           
##  Mcnemar's Test P-Value : 0.05447         
##                                           
##             Sensitivity : 0.9212          
##             Specificity : 0.2273          
##          Pos Pred Value : 0.8672          
##          Neg Pred Value : 0.3448          
##              Prevalence : 0.8456          
##          Detection Rate : 0.7789          
##    Detection Prevalence : 0.8982          
##       Balanced Accuracy : 0.5742          
##                                           
##        'Positive' Class : No              
## 
predictors_1 <- model.matrix(form_full, data = train2) 
cv.fit_rf <- cv.glmnet(predictors_1, train2$HardDrugs,
                       family = "binomial", type.measure = "class")
lambda_opt_rf <- cv.fit_rf$lambda.1se
mod_lr_rf <- glmnet(predictors_1, train2$HardDrugs,
                    family = "binomial", lambda = lambda_opt_rf)

y_lr_rf = predict(mod_lr_rf, newx = model.matrix(form_full, data = test2), type = "class")
confusion_lr_rf = table(test2$HardDrugs, y_lr_rf)
confusion_lr_rf
##      y_lr_rf
##        No Yes
##   No  238   3
##   Yes  37   7
tpr_lr_rf = confusion_lr_rf[2,2]/sum(confusion_lr_rf[2,]); tpr_lr_rf
## [1] 0.1590909
tnr_lr_rf = confusion_lr_rf[1,1]/sum(confusion_lr_rf[1,]); tnr_lr_rf
## [1] 0.9875519
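To see which predictors survive the lambda.1se penalty (a quick check; output not shown):

coef(mod_lr_rf)  # nonzero rows are the predictors the lasso keeps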
library(ROCR)
roc_data <- function(test, y_test, model, type){
  # class probabilities on the test set; column 2 is P(Yes)
  prob = model %>% 
    predict(newdata = test, type = type) %>% 
    as.data.frame()
  pred_prob = prediction(prob[,2], y_test)
  perf = performance(pred_prob, 'tpr', 'fpr')
  perf_df = data.frame(perf@x.values, perf@y.values)
  names(perf_df) = c('fpr','tpr')
  return(perf_df)
}
point_data <- function(test, y_test, model, type){
  # (fpr, tpr) of the hard class predictions, to mark on the ROC curve
  y_pred = predict(model, newdata = test, type = type)
  confusion = table(y_test, y_pred)
  tpr = confusion['Yes','Yes']/sum(confusion['Yes',])
  fpr = confusion['No','Yes']/sum(confusion['No',])
  return(c(fpr,tpr))
}
perf_df_rf = roc_data(test2, test2$HardDrugs, mod_rf, "prob")
point_rf = point_data(test2, test2$HardDrugs, mod_rf, "class")
ggplot(data = perf_df_rf, aes(x = fpr, y = tpr)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(x = point_rf[1], y = point_rf[2], size = 3, col = "red") +
  labs(x = 'False Positive Rate', y = 'True Positive Rate')
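The same ROCR objects can summarize the curve as an AUC; a sketch reusing the calls inside roc_data():

# Sketch: AUC for the random forest on the test set
prob_rf <- predict(mod_rf, newdata = test2, type = "prob")
pred_rocr <- prediction(prob_rf[, 2], test2$HardDrugs)
performance(pred_rocr, "auc")@y.values[[1]]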

library(rpart)
library(rpart.plot)
set.seed(100)
mod_tree <- rpart(form_full,data=train2)
perf_df_dt = roc_data(test2, test2$HardDrugs, mod_tree, "prob")
point_dt = point_data(test2, test2$HardDrugs, mod_tree, "class")
ggplot(data = perf_df_dt, aes(x = fpr, y = tpr)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(x = point_dt[1], y = point_dt[2], size = 3, col = "red") +
  labs(x = 'False Positive Rate', y = 'True Positive Rate')

mod_nb_roc <- naiveBayes(form_full, data = train2)
# type = "raw" gives naiveBayes class probabilities, so the same helpers apply
perf_df_nb = roc_data(test2, test2$HardDrugs, mod_nb_roc, "raw")
point_nb = point_data(test2, test2$HardDrugs, mod_nb_roc, "class")
ggplot(data = perf_df_nb, aes(x = fpr, y = tpr)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(x = point_nb[1], y = point_nb[2], size = 3, col = "red") +
  labs(x = 'False Positive Rate', y = 'True Positive Rate')
library(caret)
set.seed(100)
rfControl2 <- trainControl(method="repeatedcv", number=5, repeats=2)
print(rfControl2)
## $method
## [1] "repeatedcv"
## 
## $number
## [1] 5
## 
## $repeats
## [1] 2
## 
## $search
## [1] "grid"
## 
## $p
## [1] 0.75
## 
## $initialWindow
## NULL
## 
## $horizon
## [1] 1
## 
## $fixedWindow
## [1] TRUE
## 
## $skip
## [1] 0
## 
## $verboseIter
## [1] FALSE
## 
## $returnData
## [1] TRUE
## 
## $returnResamp
## [1] "final"
## 
## $savePredictions
## [1] FALSE
## 
## $classProbs
## [1] FALSE
## 
## $summaryFunction
## function (data, lev = NULL, model = NULL) 
## {
##     if (is.character(data$obs)) 
##         data$obs <- factor(data$obs, levels = lev)
##     postResample(data[, "pred"], data[, "obs"])
## }
## <bytecode: 0x00000110c0a63d08>
## <environment: namespace:caret>
## 
## $selectionFunction
## [1] "best"
## 
## $preProcOptions
## $preProcOptions$thresh
## [1] 0.95
## 
## $preProcOptions$ICAcomp
## [1] 3
## 
## $preProcOptions$k
## [1] 5
## 
## $preProcOptions$freqCut
## [1] 19
## 
## $preProcOptions$uniqueCut
## [1] 10
## 
## $preProcOptions$cutoff
## [1] 0.9
## 
## 
## $sampling
## NULL
## 
## $index
## NULL
## 
## $indexOut
## NULL
## 
## $indexFinal
## NULL
## 
## $timingSamps
## [1] 0
## 
## $predictionBounds
## [1] FALSE FALSE
## 
## $seeds
## [1] NA
## 
## $adaptive
## $adaptive$min
## [1] 5
## 
## $adaptive$alpha
## [1] 0.05
## 
## $adaptive$method
## [1] "gls"
## 
## $adaptive$complete
## [1] TRUE
## 
## 
## $trim
## [1] FALSE
## 
## $allowParallel
## [1] TRUE
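rfControl2 is defined but never passed to a model; a sketch of how it could drive caret::train to tune mtry under repeated cross-validation (the grid mirrors the mtry values tried manually below):

# Sketch: caret-driven mtry tuning with the control object defined above
rf_grid <- expand.grid(mtry = seq(2, 20, by = 2))
mod_rf_cv <- train(form_full, data = train2, method = "rf",
                   trControl = rfControl2, tuneGrid = rf_grid, ntree = 1000)
mod_rf_cv$bestTune  # mtry with the best resampled accuracy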
mod_rf2<- randomForest(form_full,train2,ntree=1000,mtry=2)
mod_rf4<- randomForest(form_full,train2,ntree=1000,mtry=4)
mod_rf6<- randomForest(form_full,train2,ntree=1000,mtry=6)
mod_rf8<- randomForest(form_full,train2,ntree=1000,mtry=8)
mod_rf10<- randomForest(form_full,train2,ntree=1000,mtry=10)
mod_rf12<- randomForest(form_full,train2,ntree=1000,mtry=12)
mod_rf14<- randomForest(form_full,train2,ntree=1000,mtry=14)
mod_rf16<- randomForest(form_full,train2,ntree=1000,mtry=16)
mod_rf18<- randomForest(form_full,train2,ntree=1000,mtry=18)
mod_rf20<- randomForest(form_full,train2,ntree=1000,mtry=20)
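The ten fits above could also be produced in a loop; a sketch collecting the out-of-bag error for each mtry (randomForest stores a running error matrix in err.rate, whose "OOB" column is used here):

# Sketch: OOB error across mtry values without ten separate model objects
mtry_vals <- seq(2, 20, by = 2)
oob_err <- sapply(mtry_vals, function(m) {
  fit <- randomForest(form_full, data = train2, ntree = 1000, mtry = m)
  fit$err.rate[1000, "OOB"]  # OOB error after all 1000 trees
})
data.frame(mtry = mtry_vals, oob_err)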

mod_rf10
## 
## Call:
##  randomForest(formula = form_full, data = train2, ntree = 1000,      mtry = 10) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 10
## 
##         OOB estimate of  error rate: 8.84%
## Confusion matrix:
##       No Yes class.error
## No  2072  62  0.02905342
## Yes  179 413  0.30236486
pred_rf10 <- predict(mod_rf10, newdata = test2)
confusion_rf10 <- table(pred_rf10, test2$HardDrugs); confusion_rf10
misclass_test_rf10 <- 1-sum(diag(confusion_rf10))/sum(confusion_rf10)
misclass_test_rf10
mod_rf6<- randomForest(form_full,train2,ntree=1000,mtry=6)
print(mod_rf6)
## 
## Call:
##  randomForest(formula = form_full, data = train2, ntree = 1000,      mtry = 6) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 9.13%
## Confusion matrix:
##       No Yes class.error
## No  2081  53  0.02483599
## Yes  196 396  0.33108108
pred_rf6 <- predict(mod_rf6, newdata = test2)
confusion_rf6 <- table(pred_rf6, test2$HardDrugs); confusion_rf6
misclass_test_rf6 <- 1-sum(diag(confusion_rf6))/sum(confusion_rf6)
misclass_test_rf6

As models become more complex, they are more likely to overfit the training data; consistent with this, increasing mtry above the default of 4 did not lower the out-of-bag error here.