\(~\)

\(~\)

\(~\)

1 Read in the Data

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

\(~\)


\(~\)

1.1 Reminders

1.1.1 The Data

#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin  - Categorical
##### dmdeduc2 - Education level - Adults 20+  - Categorical
##### dmdmartl - Marital status  - Categorical
##### indhhin2 - Annual household income  - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical

\(~\)


\(~\)

2 Load tidyverse

library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
levels(diab_pop$diq010) <- c("Diabetes", "No_Diabetes")

diab_pop.no_na_vals <- diab_pop %>% na.omit()

\(~\)


\(~\)

3 The caret package

library('caret')
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# this will ensure our results are the same every run, to randomize you may use: set.seed(Sys.time())

set.seed(8675309)

3.1 Split Data

# The createDataPartition function is used to create training and test sets

trainIndex <- createDataPartition(diab_pop.no_na_vals$diq010, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

dm2.train <- diab_pop.no_na_vals[trainIndex, ]
dm2.test <- diab_pop.no_na_vals[-trainIndex, ]

3.2 Make Grid

# we will make a grid of values to test in cross-validation.
knnGrid <-  expand.grid(k = c(1:15))

3.3 The trainControl function

# the method here is cv for cross-validation you could try "repeatedcv"  for repeated cross-fold validation
fitControl <- trainControl(method = "cv",
                           number = 10, 
                           # repeats = 10, # uncomment for repeatedcv 
                           ## Estimate class probabilities
                           classProbs = TRUE,
                           ## Evaluate performance using 
                           ## the following function
                           summaryFunction = twoClassSummary)

3.4 The train function

knnFit <- train(diq010 ~ ., # formula
                 data = dm2.train, # train data   
                 method = "knn", # method for caret see https://topepo.github.io/caret/available-models.html for list of models 
                 trControl = fitControl, 
                 tuneGrid = knnGrid,
                 ## Specify which metric to optimize
                 metric = "ROC")

3.5 Results

knnFit
## k-Nearest Neighbors 
## 
## 1126 samples
##    9 predictor
##    2 classes: 'Diabetes', 'No_Diabetes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1013, 1013, 1013, 1014, 1013, 1013, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens         Spec     
##    1  0.6756443  0.419117647  0.9321711
##    2  0.7264957  0.361029412  0.9331798
##    3  0.7402515  0.272426471  0.9749123
##    4  0.7324585  0.207352941  0.9811513
##    5  0.7319863  0.177573529  0.9926864
##    6  0.7178345  0.124632353  0.9916447
##    7  0.7164019  0.082720588  0.9989474
##    8  0.7139003  0.100367647  0.9989583
##    9  0.7110242  0.070955882  1.0000000
##   10  0.6955999  0.047058824  1.0000000
##   11  0.6962759  0.041176471  1.0000000
##   12  0.6969150  0.029411765  1.0000000
##   13  0.6909944  0.017647059  1.0000000
##   14  0.6847469  0.005882353  1.0000000
##   15  0.6795452  0.012132353  1.0000000
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 3.
plot(knnFit)

str(knnFit,1)
## List of 24
##  $ method      : chr "knn"
##  $ modelInfo   :List of 13
##  $ modelType   : chr "Classification"
##  $ results     :'data.frame':    15 obs. of  7 variables:
##  $ pred        : NULL
##  $ bestTune    :'data.frame':    1 obs. of  1 variable:
##  $ call        : language train.formula(form = diq010 ~ ., data = dm2.train, method = "knn", trControl = fitControl,      tuneGrid = knnGri| __truncated__
##  $ dots        : list()
##  $ metric      : chr "ROC"
##  $ control     :List of 27
##  $ finalModel  :List of 8
##   ..- attr(*, "class")= chr "knn3"
##  $ preProcess  : NULL
##  $ trainingData:'data.frame':    1126 obs. of  10 variables:
##  $ resample    :'data.frame':    10 obs. of  4 variables:
##  $ resampledCM :'data.frame':    150 obs. of  6 variables:
##  $ perfNames   : chr [1:3] "ROC" "Sens" "Spec"
##  $ maximize    : logi TRUE
##  $ yLimits     : NULL
##  $ times       :List of 3
##  $ levels      : chr [1:2] "Diabetes" "No_Diabetes"
##   ..- attr(*, "ordered")= logi FALSE
##  $ terms       :Classes 'terms', 'formula'  language diq010 ~ seqn + riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl +      indhhin2 + bmxbmi + lbxglu
##   .. ..- attr(*, "variables")= language list(diq010, seqn, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2,      bmxbmi, lbxglu)
##   .. ..- attr(*, "factors")= int [1:10, 1:9] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. ..- attr(*, "term.labels")= chr [1:9] "seqn" "riagendr" "ridageyr" "ridreth1" ...
##   .. ..- attr(*, "order")= int [1:9] 1 1 1 1 1 1 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(diq010, seqn, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2,      bmxbmi, lbxglu)
##   .. ..- attr(*, "dataClasses")= Named chr [1:10] "factor" "numeric" "factor" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:10] "diq010" "seqn" "riagendr" "ridageyr" ...
##  $ coefnames   : chr [1:31] "seqn" "riagendrFemale" "ridageyr" "ridreth1Other Hispanic" ...
##  $ contrasts   :List of 5
##  $ xlevels     :List of 5
##  - attr(*, "class")= chr [1:2] "train" "train.formula"
knnFit$finalModel  
## 3-nearest neighbor model
## Training set outcome distribution:
## 
##    Diabetes No_Diabetes 
##         169         957

3.6 Score Test Data

# let's score the test set using this model
pred_class <- predict(knnFit, dm2.test,'raw')
probs <- predict(knnFit, dm2.test,'prob')

dm2.test.scored <- cbind(dm2.test, pred_class, probs)

glimpse(dm2.test.scored)
## Rows: 750
## Columns: 13
## $ seqn        <dbl> 83734, 83737, 83757, 83761, 83789, 83820, 83822, 83823,...
## $ riagendr    <fct> Male, Female, Female, Female, Male, Male, Female, Femal...
## $ ridageyr    <dbl> 78, 72, 57, 24, 66, 70, 20, 29, 69, 71, 37, 49, 41, 54,...
## $ ridreth1    <fct> Non-Hispanic White, MexicanAmerican, Other Hispanic, Ot...
## $ dmdeduc2    <fct> High school graduate/GED, Grades 9-11th, Less than 9th ...
## $ dmdmartl    <fct> Married, Separated, Separated, Never married, Living wi...
## $ indhhin2    <fct> "$20,000-$24,999", "$75,000-$99,999", "$20,000-$24,999"...
## $ bmxbmi      <dbl> 28.8, 28.6, 35.4, 25.3, 34.0, 27.0, 22.2, 29.7, 28.2, 2...
## $ diq010      <fct> Diabetes, No_Diabetes, Diabetes, No_Diabetes, No_Diabet...
## $ lbxglu      <dbl> 84, 107, 398, 95, 113, 94, 80, 102, 105, 76, 79, 126, 1...
## $ pred_class  <fct> No_Diabetes, No_Diabetes, Diabetes, No_Diabetes, No_Dia...
## $ Diabetes    <dbl> 0.0000000, 0.0000000, 0.6666667, 0.0000000, 0.0000000, ...
## $ No_Diabetes <dbl> 1.0000000, 1.0000000, 0.3333333, 1.0000000, 1.0000000, ...

\(~\)


\(~\)

4 Use yardstick for model metrics

# yardstick 
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
## 
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
## The following object is masked from 'package:readr':
## 
##     spec

4.1 Confusion Matrix

knnFit.conf_mat <- dm2.test.scored %>% 
  conf_mat(truth=diq010 , pred_class)

4.2 ROC Curve

knnFit.roc_auc <- dm2.test.scored %>% 
  roc_auc(truth=diq010 , Diabetes)

knnFit.roc_auc
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.733
knnFit.roc_curve <- dm2.test.scored %>% 
  roc_curve(truth=diq010 , Diabetes) %>%
  autoplot() + 
  labs(caption = paste0("AUC : ", round(knnFit.roc_auc$.estimate,3)))

knnFit.roc_curve

4.3 Precision-Recall Curve

knnFit.pr_auc <- dm2.test.scored %>% 
  pr_auc(truth=diq010 , Diabetes) 

knnFit.pr_curve <- dm2.test.scored %>% 
  pr_curve(truth=diq010 , Diabetes) %>%
  autoplot() +
  labs(caption = paste0("AUC : ",  round(knnFit.pr_auc$.estimate,3)))

knnFit.pr_curve

\(~\)


\(~\)

5 Code Appendix

\(~\)

knitr::opts_chunk$set(echo = TRUE)

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin  - Categorical
##### dmdeduc2 - Education level - Adults 20+  - Categorical
##### dmdmartl - Marital status  - Categorical
##### indhhin2 - Annual household income  - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical

library('tidyverse')

levels(diab_pop$diq010) <- c("Diabetes", "No_Diabetes")

diab_pop.no_na_vals <- diab_pop %>% na.omit()
library('caret')

# this will ensure our results are the same every run, to randomize you may use: set.seed(Sys.time())

set.seed(8675309)
# The createDataPartition function is used to create training and test sets

trainIndex <- createDataPartition(diab_pop.no_na_vals$diq010, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

dm2.train <- diab_pop.no_na_vals[trainIndex, ]
dm2.test <- diab_pop.no_na_vals[-trainIndex, ]
# we will make a grid of values to test in cross-validation.
knnGrid <-  expand.grid(k = c(1:15))
# the method here is cv for cross-validation you could try "repeatedcv"  for repeated cross-fold validation
fitControl <- trainControl(method = "cv",
                           number = 10, 
                           # repeats = 10, # uncomment for repeatedcv 
                           ## Estimate class probabilities
                           classProbs = TRUE,
                           ## Evaluate performance using 
                           ## the following function
                           summaryFunction = twoClassSummary)
knnFit <- train(diq010 ~ ., # formula
                 data = dm2.train, # train data   
                 method = "knn", # method for caret see https://topepo.github.io/caret/available-models.html for list of models 
                 trControl = fitControl, 
                 tuneGrid = knnGrid,
                 ## Specify which metric to optimize
                 metric = "ROC")
knnFit

plot(knnFit)

str(knnFit,1)

knnFit$finalModel  
# let's score the test set using this model
pred_class <- predict(knnFit, dm2.test,'raw')
probs <- predict(knnFit, dm2.test,'prob')

dm2.test.scored <- cbind(dm2.test, pred_class, probs)

glimpse(dm2.test.scored)

# yardstick 
library('yardstick')
knnFit.conf_mat <- dm2.test.scored %>% 
  conf_mat(truth=diq010 , pred_class)
knnFit.roc_auc <- dm2.test.scored %>% 
  roc_auc(truth=diq010 , Diabetes)

knnFit.roc_auc

knnFit.roc_curve <- dm2.test.scored %>% 
  roc_curve(truth=diq010 , Diabetes) %>%
  autoplot() + 
  labs(caption = paste0("AUC : ", round(knnFit.roc_auc$.estimate,3)))

knnFit.roc_curve
knnFit.pr_auc <- dm2.test.scored %>% 
  pr_auc(truth=diq010 , Diabetes) 

knnFit.pr_curve <- dm2.test.scored %>% 
  pr_curve(truth=diq010 , Diabetes) %>%
  autoplot() +
  labs(caption = paste0("AUC : ",  round(knnFit.pr_auc$.estimate,3)))

knnFit.pr_curve