\(~\)
\(~\)
NHANES data#install.packages('svmLinear3')
#install.packages('LiblineaR')
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(NHANES)
NHANES_DATA_12 <- NHANES %>%
filter(!is.na(Depressed))
Depressed is has 3 potiential classesNHANES_DATA_12 %>%
select(Depressed) %>%
distinct()
## # A tibble: 3 x 1
## Depressed
## <fct>
## 1 Several
## 2 None
## 3 Most
\(~\)
\(~\)
\(~\)
\(~\)
SumNa <- function(col){sum(is.na(col))}
data.sum <- NHANES_DATA_12 %>%
summarise_all(SumNa) %>%
tidyr::gather(key='feature', value='SumNa') %>%
arrange(-SumNa) %>%
mutate(PctNa = SumNa/nrow(NHANES_DATA_12))
data.sum2 <- data.sum %>%
filter(! (feature %in% c('ID','Depressed'))) %>%
filter(PctNa < .85)
data.sum2$feature
## [1] "UrineFlow2" "UrineVol2" "AgeRegMarij" "PregnantNow"
## [5] "Age1stBaby" "nBabies" "nPregnancies" "SmokeAge"
## [9] "AgeFirstMarij" "SmokeNow" "Testosterone" "AgeMonths"
## [13] "TVHrsDay" "Race3" "CompHrsDay" "PhysActiveDays"
## [17] "SexOrientation" "AlcoholDay" "SexNumPartYear" "Marijuana"
## [21] "RegularMarij" "SexAge" "SexNumPartnLife" "HardDrugs"
## [25] "SexEver" "SameSex" "AlcoholYear" "HHIncome"
## [29] "HHIncomeMid" "Poverty" "UrineFlow1" "BPSys1"
## [33] "BPDia1" "AgeDecade" "DirectChol" "TotChol"
## [37] "BPSys2" "BPDia2" "Education" "BPSys3"
## [41] "BPDia3" "MaritalStatus" "Smoke100" "Smoke100n"
## [45] "Alcohol12PlusYr" "BPSysAve" "BPDiaAve" "Pulse"
## [49] "BMI_WHO" "BMI" "Weight" "HomeRooms"
## [53] "Height" "HomeOwn" "UrineVol1" "SleepHrsNight"
## [57] "LittleInterest" "DaysPhysHlthBad" "DaysMentHlthBad" "Diabetes"
## [61] "Work" "SurveyYr" "Gender" "Age"
## [65] "Race1" "HealthGen" "SleepTrouble" "PhysActive"
data_F <- NHANES_DATA_12 %>%
select(ID, Depressed, data.sum2$feature) %>%
filter(!is.na(Depressed))
data_F still has missing valuesAmelia::missmap(as.data.frame(data_F))
\(~\)
\(~\)
\(~\)
\(~\)
rfImputelibrary('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
data_F.imputed <- rfImpute(Depressed ~ . ,
data_F,
iter=2,
ntree=300)
## ntree OOB 1 2 3
## 300: 8.11% 0.74% 36.47% 32.06%
## ntree OOB 1 2 3
## 300: 8.54% 1.20% 36.27% 33.73%
Amelia::missmap(as.data.frame(data_F.imputed))
\(~\)
\(~\)
\(~\)
\(~\)
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(8576309)
trainIndex <- createDataPartition(data_F.imputed$Depressed,
p = .6,
list = FALSE,
times = 1)
TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]
\(~\)
\(~\)
\(~\)
\(~\)
To see list of models compatible and tuneable hyperparameters within the caret package you can visit: https://topepo.github.io/caret/available-models.html
SVMGrid <- expand.grid(cost = c(1,2,5), Loss = c(1,5,6))
train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
number=3)
toc <- Sys.time()
model_svm <- train(Depressed ~ .,
data = TRAIN,
method = "svmLinear3", # this will use the svmLinear3 library
metric = "Accuracy", # which metric should be optimized for
trControl = train_ctrl,
tuneGrid = SVMGrid)
tic <- Sys.time()
tic - toc
## Time difference of 5.958181 mins
model_svm
## L2 Regularized Support Vector Machine (dual) with Linear Kernel
##
## 4005 samples
## 69 predictor
## 3 classes: 'None', 'Several', 'Most'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 2670, 2669, 2671
## Resampling results across tuning parameters:
##
## cost Loss Accuracy Kappa
## 1 1 0.5346848 0.029585510
## 1 5 0.5709959 0.007226782
## 1 6 0.7437887 0.028501308
## 2 1 0.7860176 0.000000000
## 2 5 0.7410419 0.033046429
## 2 6 0.7860176 0.000000000
## 5 1 0.7860176 0.000000000
## 5 5 0.4763128 0.059009727
## 5 6 0.7860176 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were cost = 2 and Loss = 1.
varImp(model_svm)
## ROC curve variable importance
##
## variables are sorted by maximum importance across the classes
## only 20 most important variables shown (out of 69)
##
## None Several Most
## DaysMentHlthBad 100.00 70.34 100.00
## LittleInterest 90.65 57.67 90.65
## Age1stBaby 76.93 49.47 76.93
## SmokeNow 57.47 36.67 57.47
## HealthGen 51.08 30.53 51.08
## HHIncome 44.77 26.09 44.77
## Poverty 43.57 25.38 43.57
## nPregnancies 41.20 27.79 41.20
## HHIncomeMid 39.49 23.65 39.49
## Education 38.66 25.36 38.66
## DaysPhysHlthBad 34.68 19.16 34.68
## SleepTrouble 33.63 19.78 33.63
## Work 33.07 23.61 33.07
## TVHrsDay 32.73 20.04 32.73
## SleepHrsNight 32.09 17.46 32.09
## Testosterone 31.44 22.91 31.44
## HomeRooms 31.43 17.13 31.43
## PhysActive 29.67 15.33 29.67
## PregnantNow 28.31 22.63 28.31
## AgeMonths 28.25 28.25 21.14
plot(varImp(model_svm), top = 20)
\(~\)
knitr::opts_chunk$set(echo = TRUE)
#install.packages('svmLinear3')
#install.packages('LiblineaR')
library('tidyverse')
library(NHANES)
NHANES_DATA_12 <- NHANES %>%
filter(!is.na(Depressed))
NHANES_DATA_12 %>%
select(Depressed) %>%
distinct()
SumNa <- function(col){sum(is.na(col))}
data.sum <- NHANES_DATA_12 %>%
summarise_all(SumNa) %>%
tidyr::gather(key='feature', value='SumNa') %>%
arrange(-SumNa) %>%
mutate(PctNa = SumNa/nrow(NHANES_DATA_12))
data.sum2 <- data.sum %>%
filter(! (feature %in% c('ID','Depressed'))) %>%
filter(PctNa < .85)
data.sum2$feature
data_F <- NHANES_DATA_12 %>%
select(ID, Depressed, data.sum2$feature) %>%
filter(!is.na(Depressed))
Amelia::missmap(as.data.frame(data_F))
library('randomForest')
data_F.imputed <- rfImpute(Depressed ~ . ,
data_F,
iter=2,
ntree=300)
Amelia::missmap(as.data.frame(data_F.imputed))
library('caret')
set.seed(8576309)
trainIndex <- createDataPartition(data_F.imputed$Depressed,
p = .6,
list = FALSE,
times = 1)
TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]
SVMGrid <- expand.grid(cost = c(1,2,5), Loss = c(1,5,6))
train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
number=3)
toc <- Sys.time()
model_svm <- train(Depressed ~ .,
data = TRAIN,
method = "svmLinear3", # this will use the svmLinear3 library
metric = "Accuracy", # which metric should be optimized for
trControl = train_ctrl,
tuneGrid = SVMGrid)
tic <- Sys.time()
tic - toc
model_svm
varImp(model_svm)
plot(varImp(model_svm), top = 20)