caret
\(~\)
\(~\)
\(~\)
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
\(~\)
\(~\)
#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin - Categorical
##### dmdeduc2 - Education level - Adults 20+ - Categorical
##### dmdmartl - Marital status - Categorical
##### indhhin2 - Annual household income - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical
\(~\)
\(~\)
tidyverse
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
levels(diab_pop$diq010) <- c("Diabetes", "No_Diabetes")
diab_pop.no_na_vals <- diab_pop %>% na.omit()
\(~\)
\(~\)
caret
packagelibrary('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# this will ensure our results are the same every run, to randomize you may use: set.seed(Sys.time())
set.seed(8675309)
# The createDataPartition function is used to create training and test sets
trainIndex <- createDataPartition(diab_pop.no_na_vals$diq010,
p = .6,
list = FALSE,
times = 1)
dm2.train <- diab_pop.no_na_vals[trainIndex, ]
dm2.test <- diab_pop.no_na_vals[-trainIndex, ]
# we will make a grid of values to test in cross-validation.
knnGrid <- expand.grid(k = c(1:15))
trainControl
function# the method here is cv for cross-validation you could try "repeatedcv" for repeated cross-fold validation
fitControl <- trainControl(method = "cv",
number = 10,
# repeats = 10, # uncomment for repeatedcv
## Estimate class probabilities
classProbs = TRUE,
## Evaluate performance using
## the following function
summaryFunction = twoClassSummary)
train
functionknnFit <- train(diq010 ~ ., # formula
data = dm2.train, # train data
method = "knn", # method for caret see https://topepo.github.io/caret/available-models.html for list of models
trControl = fitControl,
tuneGrid = knnGrid,
## Specify which metric to optimize
metric = "ROC")
knnFit
## k-Nearest Neighbors
##
## 1126 samples
## 9 predictor
## 2 classes: 'Diabetes', 'No_Diabetes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1013, 1013, 1013, 1014, 1013, 1013, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 1 0.6756443 0.419117647 0.9321711
## 2 0.7264957 0.361029412 0.9331798
## 3 0.7402515 0.272426471 0.9749123
## 4 0.7324585 0.207352941 0.9811513
## 5 0.7319863 0.177573529 0.9926864
## 6 0.7178345 0.124632353 0.9916447
## 7 0.7164019 0.082720588 0.9989474
## 8 0.7139003 0.100367647 0.9989583
## 9 0.7110242 0.070955882 1.0000000
## 10 0.6955999 0.047058824 1.0000000
## 11 0.6962759 0.041176471 1.0000000
## 12 0.6969150 0.029411765 1.0000000
## 13 0.6909944 0.017647059 1.0000000
## 14 0.6847469 0.005882353 1.0000000
## 15 0.6795452 0.012132353 1.0000000
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 3.
plot(knnFit)
str(knnFit,1)
## List of 24
## $ method : chr "knn"
## $ modelInfo :List of 13
## $ modelType : chr "Classification"
## $ results :'data.frame': 15 obs. of 7 variables:
## $ pred : NULL
## $ bestTune :'data.frame': 1 obs. of 1 variable:
## $ call : language train.formula(form = diq010 ~ ., data = dm2.train, method = "knn", trControl = fitControl, tuneGrid = knnGri| __truncated__
## $ dots : list()
## $ metric : chr "ROC"
## $ control :List of 27
## $ finalModel :List of 8
## ..- attr(*, "class")= chr "knn3"
## $ preProcess : NULL
## $ trainingData:'data.frame': 1126 obs. of 10 variables:
## $ resample :'data.frame': 10 obs. of 4 variables:
## $ resampledCM :'data.frame': 150 obs. of 6 variables:
## $ perfNames : chr [1:3] "ROC" "Sens" "Spec"
## $ maximize : logi TRUE
## $ yLimits : NULL
## $ times :List of 3
## $ levels : chr [1:2] "Diabetes" "No_Diabetes"
## ..- attr(*, "ordered")= logi FALSE
## $ terms :Classes 'terms', 'formula' language diq010 ~ seqn + riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu
## .. ..- attr(*, "variables")= language list(diq010, seqn, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, lbxglu)
## .. ..- attr(*, "factors")= int [1:10, 1:9] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. ..- attr(*, "term.labels")= chr [1:9] "seqn" "riagendr" "ridageyr" "ridreth1" ...
## .. ..- attr(*, "order")= int [1:9] 1 1 1 1 1 1 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(diq010, seqn, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, lbxglu)
## .. ..- attr(*, "dataClasses")= Named chr [1:10] "factor" "numeric" "factor" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:10] "diq010" "seqn" "riagendr" "ridageyr" ...
## $ coefnames : chr [1:31] "seqn" "riagendrFemale" "ridageyr" "ridreth1Other Hispanic" ...
## $ contrasts :List of 5
## $ xlevels :List of 5
## - attr(*, "class")= chr [1:2] "train" "train.formula"
knnFit$finalModel
## 3-nearest neighbor model
## Training set outcome distribution:
##
## Diabetes No_Diabetes
## 169 957
# let's score the test set using this model
pred_class <- predict(knnFit, dm2.test,'raw')
probs <- predict(knnFit, dm2.test,'prob')
dm2.test.scored <- cbind(dm2.test, pred_class, probs)
glimpse(dm2.test.scored)
## Rows: 750
## Columns: 13
## $ seqn <dbl> 83734, 83737, 83757, 83761, 83789, 83820, 83822, 83823,...
## $ riagendr <fct> Male, Female, Female, Female, Male, Male, Female, Femal...
## $ ridageyr <dbl> 78, 72, 57, 24, 66, 70, 20, 29, 69, 71, 37, 49, 41, 54,...
## $ ridreth1 <fct> Non-Hispanic White, MexicanAmerican, Other Hispanic, Ot...
## $ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, Less than 9th ...
## $ dmdmartl <fct> Married, Separated, Separated, Never married, Living wi...
## $ indhhin2 <fct> "$20,000-$24,999", "$75,000-$99,999", "$20,000-$24,999"...
## $ bmxbmi <dbl> 28.8, 28.6, 35.4, 25.3, 34.0, 27.0, 22.2, 29.7, 28.2, 2...
## $ diq010 <fct> Diabetes, No_Diabetes, Diabetes, No_Diabetes, No_Diabet...
## $ lbxglu <dbl> 84, 107, 398, 95, 113, 94, 80, 102, 105, 76, 79, 126, 1...
## $ pred_class <fct> No_Diabetes, No_Diabetes, Diabetes, No_Diabetes, No_Dia...
## $ Diabetes <dbl> 0.0000000, 0.0000000, 0.6666667, 0.0000000, 0.0000000, ...
## $ No_Diabetes <dbl> 1.0000000, 1.0000000, 0.3333333, 1.0000000, 1.0000000, ...
\(~\)
\(~\)
yardstick
for model metrics# yardstick
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall
## The following object is masked from 'package:readr':
##
## spec
knnFit.conf_mat <- dm2.test.scored %>%
conf_mat(truth=diq010 , pred_class)
knnFit.roc_auc <- dm2.test.scored %>%
roc_auc(truth=diq010 , Diabetes)
knnFit.roc_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.733
knnFit.roc_curve <- dm2.test.scored %>%
roc_curve(truth=diq010 , Diabetes) %>%
autoplot() +
labs(caption = paste0("AUC : ", round(knnFit.roc_auc$.estimate,3)))
knnFit.roc_curve
knnFit.pr_auc <- dm2.test.scored %>%
pr_auc(truth=diq010 , Diabetes)
knnFit.pr_curve <- dm2.test.scored %>%
pr_curve(truth=diq010 , Diabetes) %>%
autoplot() +
labs(caption = paste0("AUC : ", round(knnFit.pr_auc$.estimate,3)))
knnFit.pr_curve
\(~\)
\(~\)
\(~\)
knitr::opts_chunk$set(echo = TRUE)
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin - Categorical
##### dmdeduc2 - Education level - Adults 20+ - Categorical
##### dmdmartl - Marital status - Categorical
##### indhhin2 - Annual household income - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical
library('tidyverse')
levels(diab_pop$diq010) <- c("Diabetes", "No_Diabetes")
diab_pop.no_na_vals <- diab_pop %>% na.omit()
library('caret')
# this will ensure our results are the same every run, to randomize you may use: set.seed(Sys.time())
set.seed(8675309)
# The createDataPartition function is used to create training and test sets
trainIndex <- createDataPartition(diab_pop.no_na_vals$diq010,
p = .6,
list = FALSE,
times = 1)
dm2.train <- diab_pop.no_na_vals[trainIndex, ]
dm2.test <- diab_pop.no_na_vals[-trainIndex, ]
# we will make a grid of values to test in cross-validation.
knnGrid <- expand.grid(k = c(1:15))
# the method here is cv for cross-validation you could try "repeatedcv" for repeated cross-fold validation
fitControl <- trainControl(method = "cv",
number = 10,
# repeats = 10, # uncomment for repeatedcv
## Estimate class probabilities
classProbs = TRUE,
## Evaluate performance using
## the following function
summaryFunction = twoClassSummary)
knnFit <- train(diq010 ~ ., # formula
data = dm2.train, # train data
method = "knn", # method for caret see https://topepo.github.io/caret/available-models.html for list of models
trControl = fitControl,
tuneGrid = knnGrid,
## Specify which metric to optimize
metric = "ROC")
knnFit
plot(knnFit)
str(knnFit,1)
knnFit$finalModel
# let's score the test set using this model
pred_class <- predict(knnFit, dm2.test,'raw')
probs <- predict(knnFit, dm2.test,'prob')
dm2.test.scored <- cbind(dm2.test, pred_class, probs)
glimpse(dm2.test.scored)
# yardstick
library('yardstick')
knnFit.conf_mat <- dm2.test.scored %>%
conf_mat(truth=diq010 , pred_class)
knnFit.roc_auc <- dm2.test.scored %>%
roc_auc(truth=diq010 , Diabetes)
knnFit.roc_auc
knnFit.roc_curve <- dm2.test.scored %>%
roc_curve(truth=diq010 , Diabetes) %>%
autoplot() +
labs(caption = paste0("AUC : ", round(knnFit.roc_auc$.estimate,3)))
knnFit.roc_curve
knnFit.pr_auc <- dm2.test.scored %>%
pr_auc(truth=diq010 , Diabetes)
knnFit.pr_curve <- dm2.test.scored %>%
pr_curve(truth=diq010 , Diabetes) %>%
autoplot() +
labs(caption = paste0("AUC : ", round(knnFit.pr_auc$.estimate,3)))
knnFit.pr_curve