\(~\)

\(~\)

\(~\)

1 Read in the Data

# Load the serialized diabetes data set into `diab_pop`.
# NOTE(review): the path is absolute and machine-specific; it must be adjusted
# when this is run on another computer.
diab_pop <- readRDS(
  file = "C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS"
)

\(~\)


\(~\)

1.1 Reminders

1.1.1 The Data

#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin  - Categorical
##### dmdeduc2 - Education level - Adults 20+  - Categorical
##### dmdmartl - Marital status  - Categorical
##### indhhin2 - Annual household income  - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical

\(~\)


\(~\)

\(~\)

1.1.2 Install if not Function

# Install every package in `list.of.packages` that is not already installed.
#
# Arguments:
#   list.of.packages  character vector of package names (may be empty).
#
# Behaviour:
#   * Prints one "already installed" line per package that is present.
#   * Calls install.packages() once for all packages that are missing.
#   * Does nothing (and prints nothing) for an empty input.
#
# Returns (invisibly) the character vector of package names that were missing
# and therefore passed to install.packages().
install_if_not <- function(list.of.packages) {
  list.of.packages <- as.character(list.of.packages)

  # system.file() returns "" when a package cannot be found; this is the
  # lightweight presence check recommended over scanning installed.packages().
  is_installed <- vapply(
    list.of.packages,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1),
    USE.NAMES = FALSE
  )

  present.packages <- list.of.packages[is_installed]
  new.packages <- list.of.packages[!is_installed]

  # Report installed packages even when others still need installing, and skip
  # the message entirely when there is nothing to report (paste0() would
  # otherwise turn a zero-length vector into an empty package name).
  if (length(present.packages) > 0) {
    print(paste0("the package '", present.packages, "' is already installed"))
  }

  if (length(new.packages) > 0) {
    install.packages(new.packages)
  }

  invisible(new.packages)
}

\(~\)

\(~\)


\(~\)

\(~\)

2 Split Data into Training and Test Sets.

library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Keep only the rows of diab_pop that have no missing value in any column.
diab_pop.no_na_vals <- na.omit(diab_pop)

library('caret')
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Fix the random seed so the split below is identical on every run.
# To get a different split each run you may use: set.seed(Sys.time())

set.seed(8675309)

# createDataPartition() draws the row indices of the training set from the
# outcome column diq010; p = 0.6 puts 60% of the rows in the training set and
# list = FALSE returns the indices as a matrix rather than a list.

trainIndex <- createDataPartition(
  y = diab_pop.no_na_vals$diq010,
  times = 1,
  p = 0.6,
  list = FALSE
)

# Rows not selected for training form the test set.
dm2.test <- diab_pop.no_na_vals[-trainIndex, ]
dm2.train <- diab_pop.no_na_vals[trainIndex, ]

2.1 Make dummyVars

Many models require categorical features to be encoded as dummy variables (one-hot encodings); caret's dummyVars function builds them:

# Build the encoder from the training data. The formula `~ .` uses every
# column of dm2.train, so seqn and the diq010 outcome are encoded as well.
dummyVars.dm2Train <- dummyVars(formula = ~ ., data = dm2.train)

# Apply the encoder to the training rows and store the result as a tibble.
dm2Train_dummies <- predict(dummyVars.dm2Train, newdata = dm2.train) %>%
  as_tibble()

\(~\)

\(~\)


\(~\)

\(~\)

2.2 findLinearCombos

For many models, we can find and remove features that are linear combinations of one another:

# Identify columns of the dummy-encoded training data that are linear
# combinations of other columns. The result is a list with the groups of
# dependent columns ($linearCombos) and the column positions to drop ($remove).
comboInformation <- findLinearCombos(x = dm2Train_dummies)

print(comboInformation)
## $linearCombos
## $linearCombos[[1]]
## [1] 9 2 3 5 6 7 8
## 
## $linearCombos[[2]]
## [1] 14  2  3 10 11 12 13
## 
## $linearCombos[[3]]
## [1] 20  2  3 15 16 17 18 19
## 
## $linearCombos[[4]]
## [1] 27
## 
## $linearCombos[[5]]
## [1] 29
## 
## $linearCombos[[6]]
##  [1] 34  2  3 21 22 23 24 25 26 28 30 31 32 33
## 
## $linearCombos[[7]]
## [1] 37  2  3 36
## 
## 
## $remove
## [1]  9 14 20 27 29 34 37
# Drop the columns flagged by findLinearCombos(). Positions are selected with
# setdiff() rather than negative indexing because $remove is NULL when no
# linear combinations are found, and `x[, -NULL]` would select zero columns
# instead of keeping them all.
dm2Train_dummies_independent <- dm2Train_dummies[
  , setdiff(seq_len(ncol(dm2Train_dummies)), comboInformation$remove)
]

\(~\)

\(~\)


\(~\)

\(~\)

2.3 findCorrelation

Similarly, for many models, we will want to find and remove features that are highly correlated with one another:

# Candidate feature names: every remaining column except the respondent
# identifier (seqn) and the outcome indicator (diq010.Diabetes).
features1 <- colnames(dm2Train_dummies_independent)
features1 <- features1[!(features1 %in% c("seqn", "diq010.Diabetes"))]
print(features1)
##  [1] "riagendr.Male"                       "riagendr.Female"                    
##  [3] "ridageyr"                            "ridreth1.MexicanAmerican"           
##  [5] "ridreth1.Other Hispanic"             "ridreth1.Non-Hispanic White"        
##  [7] "ridreth1.Non-Hispanic Black"         "dmdeduc2.Less than 9th grade"       
##  [9] "dmdeduc2.Grades 9-11th"              "dmdeduc2.High school graduate/GED"  
## [11] "dmdeduc2.Some college or AA degrees" "dmdmartl.Married"                   
## [13] "dmdmartl.Widowed"                    "dmdmartl.Divorced"                  
## [15] "dmdmartl.Separated"                  "dmdmartl.Never married"             
## [17] "indhhin2.$0-$4,999"                  "indhhin2.$5,000-$9,999"             
## [19] "indhhin2.$10,000-$14,999"            "indhhin2.$15,000-$19,999"           
## [21] "indhhin2.$20,000-$24,999"            "indhhin2.$25,000-$34,999"           
## [23] "indhhin2.$45,000-$54,999"            "indhhin2.$65,000-$74,999"           
## [25] "indhhin2.20,000+"                    "indhhin2.less than $20,000"         
## [27] "indhhin2.$75,000-$99,999"            "bmxbmi"                             
## [29] "lbxglu"
# Pairwise correlations between all feature columns selected in features1.
cor_matrix <- cor(x = dm2Train_dummies_independent[features1])
print(cor_matrix)
##                                     riagendr.Male riagendr.Female     ridageyr
## riagendr.Male                        1.0000000000   -1.0000000000  0.046088493
## riagendr.Female                     -1.0000000000    1.0000000000 -0.046088493
## ridageyr                             0.0460884932   -0.0460884932  1.000000000
## ridreth1.MexicanAmerican            -0.0085703283    0.0085703283 -0.022071061
## ridreth1.Other Hispanic             -0.0412627007    0.0412627007  0.026956617
## ridreth1.Non-Hispanic White          0.0310058641   -0.0310058641  0.156358779
## ridreth1.Non-Hispanic Black         -0.0372062109    0.0372062109 -0.075988046
## dmdeduc2.Less than 9th grade        -0.0009220555    0.0009220555  0.147823014
## dmdeduc2.Grades 9-11th               0.0676150181   -0.0676150181 -0.017679412
## dmdeduc2.High school graduate/GED   -0.0077741175    0.0077741175  0.002357556
## dmdeduc2.Some college or AA degrees -0.0463613674    0.0463613674 -0.043787659
## dmdmartl.Married                     0.0882752805   -0.0882752805  0.105297351
## dmdmartl.Widowed                    -0.1110870515    0.1110870515  0.359280920
## dmdmartl.Divorced                   -0.0095450138    0.0095450138  0.160349391
## dmdmartl.Separated                  -0.0400687435    0.0400687435  0.048347775
## dmdmartl.Never married              -0.0084660196    0.0084660196 -0.391576503
## indhhin2.$0-$4,999                   0.0333800869   -0.0333800869 -0.005337734
## indhhin2.$5,000-$9,999               0.0202719396   -0.0202719396  0.103081009
## indhhin2.$10,000-$14,999            -0.0280078607    0.0280078607  0.099516500
## indhhin2.$15,000-$19,999            -0.0082798716    0.0082798716  0.047583836
## indhhin2.$20,000-$24,999            -0.0136157655    0.0136157655  0.004340429
## indhhin2.$25,000-$34,999            -0.0301474637    0.0301474637 -0.023087596
## indhhin2.$45,000-$54,999            -0.0014296884    0.0014296884 -0.038052431
## indhhin2.$65,000-$74,999             0.0066689392   -0.0066689392  0.005053337
## indhhin2.20,000+                    -0.0334160423    0.0334160423 -0.081919744
## indhhin2.less than $20,000          -0.0141467473    0.0141467473  0.056240125
## indhhin2.$75,000-$99,999             0.0370271813   -0.0370271813 -0.085864313
## bmxbmi                              -0.0740507417    0.0740507417  0.054940490
## lbxglu                               0.0525380850   -0.0525380850  0.221625646
##                                     ridreth1.MexicanAmerican
## riagendr.Male                                   -0.008570328
## riagendr.Female                                  0.008570328
## ridageyr                                        -0.022071061
## ridreth1.MexicanAmerican                         1.000000000
## ridreth1.Other Hispanic                         -0.177716978
## ridreth1.Non-Hispanic White                     -0.321970895
## ridreth1.Non-Hispanic Black                     -0.210901590
## dmdeduc2.Less than 9th grade                     0.328273872
## dmdeduc2.Grades 9-11th                           0.093925466
## dmdeduc2.High school graduate/GED               -0.025177896
## dmdeduc2.Some college or AA degrees             -0.115248278
## dmdmartl.Married                                 0.067258067
## dmdmartl.Widowed                                 0.052896641
## dmdmartl.Divorced                               -0.047754034
## dmdmartl.Separated                               0.052214662
## dmdmartl.Never married                          -0.105639997
## indhhin2.$0-$4,999                              -0.033075430
## indhhin2.$5,000-$9,999                           0.061304872
## indhhin2.$10,000-$14,999                         0.067602978
## indhhin2.$15,000-$19,999                         0.051828844
## indhhin2.$20,000-$24,999                         0.080518667
## indhhin2.$25,000-$34,999                        -0.022372295
## indhhin2.$45,000-$54,999                        -0.021509342
## indhhin2.$65,000-$74,999                        -0.019822460
## indhhin2.20,000+                                 0.106103297
## indhhin2.less than $20,000                       0.004210595
## indhhin2.$75,000-$99,999                        -0.058022583
## bmxbmi                                           0.116792190
## lbxglu                                           0.055799978
##                                     ridreth1.Other Hispanic
## riagendr.Male                                  -0.041262701
## riagendr.Female                                 0.041262701
## ridageyr                                        0.026956617
## ridreth1.MexicanAmerican                       -0.177716978
## ridreth1.Other Hispanic                         1.000000000
## ridreth1.Non-Hispanic White                    -0.308856305
## ridreth1.Non-Hispanic Black                    -0.202311100
## dmdeduc2.Less than 9th grade                    0.094191293
## dmdeduc2.Grades 9-11th                          0.059711756
## dmdeduc2.High school graduate/GED              -0.057372094
## dmdeduc2.Some college or AA degrees            -0.026650804
## dmdmartl.Married                               -0.015939096
## dmdmartl.Widowed                               -0.032212505
## dmdmartl.Divorced                               0.001870325
## dmdmartl.Separated                              0.031938656
## dmdmartl.Never married                         -0.035570780
## indhhin2.$0-$4,999                              0.030129995
## indhhin2.$5,000-$9,999                          0.011555790
## indhhin2.$10,000-$14,999                        0.068372573
## indhhin2.$15,000-$19,999                        0.043839553
## indhhin2.$20,000-$24,999                        0.010538303
## indhhin2.$25,000-$34,999                       -0.031578666
## indhhin2.$45,000-$54,999                       -0.012403737
## indhhin2.$65,000-$74,999                        0.007335408
## indhhin2.20,000+                               -0.015924383
## indhhin2.less than $20,000                      0.008793109
## indhhin2.$75,000-$99,999                        0.021713079
## bmxbmi                                          0.031603671
## lbxglu                                          0.053913405
##                                     ridreth1.Non-Hispanic White
## riagendr.Male                                       0.031005864
## riagendr.Female                                    -0.031005864
## ridageyr                                            0.156358779
## ridreth1.MexicanAmerican                           -0.321970895
## ridreth1.Other Hispanic                            -0.308856305
## ridreth1.Non-Hispanic White                         1.000000000
## ridreth1.Non-Hispanic Black                        -0.366528210
## dmdeduc2.Less than 9th grade                       -0.192023323
## dmdeduc2.Grades 9-11th                             -0.085261491
## dmdeduc2.High school graduate/GED                   0.081203139
## dmdeduc2.Some college or AA degrees                 0.070062959
## dmdmartl.Married                                    0.015294283
## dmdmartl.Widowed                                    0.059593558
## dmdmartl.Divorced                                   0.084753269
## dmdmartl.Separated                                 -0.050557371
## dmdmartl.Never married                             -0.074683860
## indhhin2.$0-$4,999                                 -0.034612554
## indhhin2.$5,000-$9,999                             -0.066432877
## indhhin2.$10,000-$14,999                           -0.060004455
## indhhin2.$15,000-$19,999                           -0.011209740
## indhhin2.$20,000-$24,999                           -0.027214942
## indhhin2.$25,000-$34,999                            0.040134962
## indhhin2.$45,000-$54,999                            0.006137821
## indhhin2.$65,000-$74,999                            0.013117724
## indhhin2.20,000+                                   -0.016619728
## indhhin2.less than $20,000                         -0.020652926
## indhhin2.$75,000-$99,999                           -0.030853880
## bmxbmi                                             -0.047290232
## lbxglu                                             -0.048149775
##                                     ridreth1.Non-Hispanic Black
## riagendr.Male                                      -0.037206211
## riagendr.Female                                     0.037206211
## ridageyr                                           -0.075988046
## ridreth1.MexicanAmerican                           -0.210901590
## ridreth1.Other Hispanic                            -0.202311100
## ridreth1.Non-Hispanic White                        -0.366528210
## ridreth1.Non-Hispanic Black                         1.000000000
## dmdeduc2.Less than 9th grade                       -0.078149171
## dmdeduc2.Grades 9-11th                             -0.004177363
## dmdeduc2.High school graduate/GED                   0.080487107
## dmdeduc2.Some college or AA degrees                 0.063409064
## dmdmartl.Married                                   -0.159818914
## dmdmartl.Widowed                                   -0.037916187
## dmdmartl.Divorced                                   0.009980058
## dmdmartl.Separated                                  0.054691455
## dmdmartl.Never married                              0.192682039
## indhhin2.$0-$4,999                                  0.058020816
## indhhin2.$5,000-$9,999                              0.045378363
## indhhin2.$10,000-$14,999                            0.027025111
## indhhin2.$15,000-$19,999                           -0.046330517
## indhhin2.$20,000-$24,999                            0.012999189
## indhhin2.$25,000-$34,999                            0.015608909
## indhhin2.$45,000-$54,999                            0.026768158
## indhhin2.$65,000-$74,999                           -0.002248780
## indhhin2.20,000+                                   -0.048907525
## indhhin2.less than $20,000                          0.021062862
## indhhin2.$75,000-$99,999                            0.010089490
## bmxbmi                                              0.106419542
## lbxglu                                              0.011610207
##                                     dmdeduc2.Less than 9th grade
## riagendr.Male                                      -0.0009220555
## riagendr.Female                                     0.0009220555
## ridageyr                                            0.1478230144
## ridreth1.MexicanAmerican                            0.3282738722
## ridreth1.Other Hispanic                             0.0941912931
## ridreth1.Non-Hispanic White                        -0.1920233230
## ridreth1.Non-Hispanic Black                        -0.0781491710
## dmdeduc2.Less than 9th grade                        1.0000000000
## dmdeduc2.Grades 9-11th                             -0.1321511581
## dmdeduc2.High school graduate/GED                  -0.2020700027
## dmdeduc2.Some college or AA degrees                -0.2269233016
## dmdmartl.Married                                   -0.0107183027
## dmdmartl.Widowed                                    0.1932578139
## dmdmartl.Divorced                                  -0.0327518205
## dmdmartl.Separated                                  0.0788352705
## dmdmartl.Never married                             -0.0951645902
## indhhin2.$0-$4,999                                  0.0301543083
## indhhin2.$5,000-$9,999                              0.1435950588
## indhhin2.$10,000-$14,999                            0.1268623876
## indhhin2.$15,000-$19,999                            0.0700824303
## indhhin2.$20,000-$24,999                            0.0934027377
## indhhin2.$25,000-$34,999                           -0.0404739930
## indhhin2.$45,000-$54,999                           -0.0354900485
## indhhin2.$65,000-$74,999                           -0.0222554498
## indhhin2.20,000+                                   -0.0192741216
## indhhin2.less than $20,000                          0.0585181739
## indhhin2.$75,000-$99,999                           -0.0842356013
## bmxbmi                                              0.0124665369
## lbxglu                                              0.0788325530
##                                     dmdeduc2.Grades 9-11th
## riagendr.Male                                  0.067615018
## riagendr.Female                               -0.067615018
## ridageyr                                      -0.017679412
## ridreth1.MexicanAmerican                       0.093925466
## ridreth1.Other Hispanic                        0.059711756
## ridreth1.Non-Hispanic White                   -0.085261491
## ridreth1.Non-Hispanic Black                   -0.004177363
## dmdeduc2.Less than 9th grade                  -0.132151158
## dmdeduc2.Grades 9-11th                         1.000000000
## dmdeduc2.High school graduate/GED             -0.194387846
## dmdeduc2.Some college or AA degrees           -0.218296289
## dmdmartl.Married                              -0.024721213
## dmdmartl.Widowed                              -0.038120134
## dmdmartl.Divorced                              0.011198350
## dmdmartl.Separated                             0.070645079
## dmdmartl.Never married                        -0.005731553
## indhhin2.$0-$4,999                             0.101146120
## indhhin2.$5,000-$9,999                         0.075501039
## indhhin2.$10,000-$14,999                       0.040530516
## indhhin2.$15,000-$19,999                       0.048786101
## indhhin2.$20,000-$24,999                       0.033182461
## indhhin2.$25,000-$34,999                       0.047684818
## indhhin2.$45,000-$54,999                      -0.038028797
## indhhin2.$65,000-$74,999                      -0.015752518
## indhhin2.20,000+                               0.033202359
## indhhin2.less than $20,000                    -0.013741434
## indhhin2.$75,000-$99,999                      -0.025204946
## bmxbmi                                         0.006400369
## lbxglu                                        -0.021872981
##                                     dmdeduc2.High school graduate/GED
## riagendr.Male                                            -0.007774118
## riagendr.Female                                           0.007774118
## ridageyr                                                  0.002357556
## ridreth1.MexicanAmerican                                 -0.025177896
## ridreth1.Other Hispanic                                  -0.057372094
## ridreth1.Non-Hispanic White                               0.081203139
## ridreth1.Non-Hispanic Black                               0.080487107
## dmdeduc2.Less than 9th grade                             -0.202070003
## dmdeduc2.Grades 9-11th                                   -0.194387846
## dmdeduc2.High school graduate/GED                         1.000000000
## dmdeduc2.Some college or AA degrees                      -0.333793002
## dmdmartl.Married                                         -0.041561558
## dmdmartl.Widowed                                          0.052185083
## dmdmartl.Divorced                                         0.013911245
## dmdmartl.Separated                                       -0.033930172
## dmdmartl.Never married                                    0.014957400
## indhhin2.$0-$4,999                                        0.014936767
## indhhin2.$5,000-$9,999                                   -0.025510668
## indhhin2.$10,000-$14,999                                  0.066777014
## indhhin2.$15,000-$19,999                                  0.078688206
## indhhin2.$20,000-$24,999                                 -0.028642427
## indhhin2.$25,000-$34,999                                  0.080105990
## indhhin2.$45,000-$54,999                                  0.005154086
## indhhin2.$65,000-$74,999                                  0.038227235
## indhhin2.20,000+                                         -0.012414401
## indhhin2.less than $20,000                                0.007328464
## indhhin2.$75,000-$99,999                                 -0.067298498
## bmxbmi                                                    0.059502755
## lbxglu                                                    0.038260129
##                                     dmdeduc2.Some college or AA degrees
## riagendr.Male                                              -0.046361367
## riagendr.Female                                             0.046361367
## ridageyr                                                   -0.043787659
## ridreth1.MexicanAmerican                                   -0.115248278
## ridreth1.Other Hispanic                                    -0.026650804
## ridreth1.Non-Hispanic White                                 0.070062959
## ridreth1.Non-Hispanic Black                                 0.063409064
## dmdeduc2.Less than 9th grade                               -0.226923302
## dmdeduc2.Grades 9-11th                                     -0.218296289
## dmdeduc2.High school graduate/GED                          -0.333793002
## dmdeduc2.Some college or AA degrees                         1.000000000
## dmdmartl.Married                                           -0.050585041
## dmdmartl.Widowed                                           -0.054164114
## dmdmartl.Divorced                                           0.049643221
## dmdmartl.Separated                                         -0.017812444
## dmdmartl.Never married                                      0.051589769
## indhhin2.$0-$4,999                                         -0.061416580
## indhhin2.$5,000-$9,999                                     -0.009211246
## indhhin2.$10,000-$14,999                                   -0.056464018
## indhhin2.$15,000-$19,999                                   -0.049835260
## indhhin2.$20,000-$24,999                                    0.029408267
## indhhin2.$25,000-$34,999                                    0.017885975
## indhhin2.$45,000-$54,999                                    0.082630518
## indhhin2.$65,000-$74,999                                    0.015921913
## indhhin2.20,000+                                           -0.029218897
## indhhin2.less than $20,000                                  0.006303120
## indhhin2.$75,000-$99,999                                    0.024863521
## bmxbmi                                                      0.071803429
## lbxglu                                                     -0.040285393
##                                     dmdmartl.Married dmdmartl.Widowed
## riagendr.Male                             0.08827528     -0.111087051
## riagendr.Female                          -0.08827528      0.111087051
## ridageyr                                  0.10529735      0.359280920
## ridreth1.MexicanAmerican                  0.06725807      0.052896641
## ridreth1.Other Hispanic                  -0.01593910     -0.032212505
## ridreth1.Non-Hispanic White               0.01529428      0.059593558
## ridreth1.Non-Hispanic Black              -0.15981891     -0.037916187
## dmdeduc2.Less than 9th grade             -0.01071830      0.193257814
## dmdeduc2.Grades 9-11th                   -0.02472121     -0.038120134
## dmdeduc2.High school graduate/GED        -0.04156156      0.052185083
## dmdeduc2.Some college or AA degrees      -0.05058504     -0.054164114
## dmdmartl.Married                          1.00000000     -0.289838225
## dmdmartl.Widowed                         -0.28983822      1.000000000
## dmdmartl.Divorced                        -0.35357750     -0.099608674
## dmdmartl.Separated                       -0.19212731     -0.054125464
## dmdmartl.Never married                   -0.47425428     -0.133605334
## indhhin2.$0-$4,999                       -0.11669433      0.028159045
## indhhin2.$5,000-$9,999                   -0.10623715      0.138000043
## indhhin2.$10,000-$14,999                 -0.10652619      0.113977642
## indhhin2.$15,000-$19,999                 -0.13782316      0.026278741
## indhhin2.$20,000-$24,999                 -0.03946540     -0.002073153
## indhhin2.$25,000-$34,999                 -0.02589437      0.040742036
## indhhin2.$45,000-$54,999                  0.07449836     -0.034556429
## indhhin2.$65,000-$74,999                  0.05005180     -0.042747474
## indhhin2.20,000+                         -0.05884298     -0.051180644
## indhhin2.less than $20,000               -0.05129407      0.027658261
## indhhin2.$75,000-$99,999                  0.06660687     -0.072779967
## bmxbmi                                    0.02601791      0.013266915
## lbxglu                                    0.02798044      0.047303877
##                                     dmdmartl.Divorced dmdmartl.Separated
## riagendr.Male                            -0.009545014       -0.040068744
## riagendr.Female                           0.009545014        0.040068744
## ridageyr                                  0.160349391        0.048347775
## ridreth1.MexicanAmerican                 -0.047754034        0.052214662
## ridreth1.Other Hispanic                   0.001870325        0.031938656
## ridreth1.Non-Hispanic White               0.084753269       -0.050557371
## ridreth1.Non-Hispanic Black               0.009980058        0.054691455
## dmdeduc2.Less than 9th grade             -0.032751820        0.078835270
## dmdeduc2.Grades 9-11th                    0.011198350        0.070645079
## dmdeduc2.High school graduate/GED         0.013911245       -0.033930172
## dmdeduc2.Some college or AA degrees       0.049643221       -0.017812444
## dmdmartl.Married                         -0.353577501       -0.192127306
## dmdmartl.Widowed                         -0.099608674       -0.054125464
## dmdmartl.Divorced                         1.000000000       -0.066028372
## dmdmartl.Separated                       -0.066028372        1.000000000
## dmdmartl.Never married                   -0.162986922       -0.088564001
## indhhin2.$0-$4,999                        0.105461562       -0.005041179
## indhhin2.$5,000-$9,999                    0.053565917        0.047206824
## indhhin2.$10,000-$14,999                  0.034257498        0.077752191
## indhhin2.$15,000-$19,999                  0.074854585        0.050752918
## indhhin2.$20,000-$24,999                 -0.013293014        0.082296889
## indhhin2.$25,000-$34,999                 -0.001623868        0.002469709
## indhhin2.$45,000-$54,999                  0.024608971       -0.061061909
## indhhin2.$65,000-$74,999                 -0.011975714        0.001674479
## indhhin2.20,000+                         -0.029507199       -0.005940246
## indhhin2.less than $20,000               -0.011876911        0.039301796
## indhhin2.$75,000-$99,999                 -0.064856057       -0.069025777
## bmxbmi                                    0.030141728        0.056879393
## lbxglu                                    0.091058683        0.029194977
##                                     dmdmartl.Never married indhhin2.$0-$4,999
## riagendr.Male                                 -0.008466020        0.033380087
## riagendr.Female                                0.008466020       -0.033380087
## ridageyr                                      -0.391576503       -0.005337734
## ridreth1.MexicanAmerican                      -0.105639997       -0.033075430
## ridreth1.Other Hispanic                       -0.035570780        0.030129995
## ridreth1.Non-Hispanic White                   -0.074683860       -0.034612554
## ridreth1.Non-Hispanic Black                    0.192682039        0.058020816
## dmdeduc2.Less than 9th grade                  -0.095164590        0.030154308
## dmdeduc2.Grades 9-11th                        -0.005731553        0.101146120
## dmdeduc2.High school graduate/GED              0.014957400        0.014936767
## dmdeduc2.Some college or AA degrees            0.051589769       -0.061416580
## dmdmartl.Married                              -0.474254283       -0.116694334
## dmdmartl.Widowed                              -0.133605334        0.028159045
## dmdmartl.Divorced                             -0.162986922        0.105461562
## dmdmartl.Separated                            -0.088564001       -0.005041179
## dmdmartl.Never married                         1.000000000        0.052759404
## indhhin2.$0-$4,999                             0.052759404        1.000000000
## indhhin2.$5,000-$9,999                         0.022907899       -0.039986625
## indhhin2.$10,000-$14,999                       0.020394547       -0.049452109
## indhhin2.$15,000-$19,999                       0.014224570       -0.052321316
## indhhin2.$20,000-$24,999                      -0.025338713       -0.053562472
## indhhin2.$25,000-$34,999                       0.031583171       -0.073892443
## indhhin2.$45,000-$54,999                      -0.023905784       -0.056882803
## indhhin2.$65,000-$74,999                      -0.035848411       -0.050099561
## indhhin2.20,000+                               0.116319739       -0.031604563
## indhhin2.less than $20,000                     0.027156280       -0.026040119
## indhhin2.$75,000-$99,999                       0.009495700       -0.064301620
## bmxbmi                                        -0.106358679        0.040396520
## lbxglu                                        -0.127696270       -0.019100099
##                                     indhhin2.$5,000-$9,999
## riagendr.Male                                  0.020271940
## riagendr.Female                               -0.020271940
## ridageyr                                       0.103081009
## ridreth1.MexicanAmerican                       0.061304872
## ridreth1.Other Hispanic                        0.011555790
## ridreth1.Non-Hispanic White                   -0.066432877
## ridreth1.Non-Hispanic Black                    0.045378363
## dmdeduc2.Less than 9th grade                   0.143595059
## dmdeduc2.Grades 9-11th                         0.075501039
## dmdeduc2.High school graduate/GED             -0.025510668
## dmdeduc2.Some college or AA degrees           -0.009211246
## dmdmartl.Married                              -0.106237151
## dmdmartl.Widowed                               0.138000043
## dmdmartl.Divorced                              0.053565917
## dmdmartl.Separated                             0.047206824
## dmdmartl.Never married                         0.022907899
## indhhin2.$0-$4,999                            -0.039986625
## indhhin2.$5,000-$9,999                         1.000000000
## indhhin2.$10,000-$14,999                      -0.063510172
## indhhin2.$15,000-$19,999                      -0.067195027
## indhhin2.$20,000-$24,999                      -0.068789013
## indhhin2.$25,000-$34,999                      -0.094898314
## indhhin2.$45,000-$54,999                      -0.073053236
## indhhin2.$65,000-$74,999                      -0.064341679
## indhhin2.20,000+                              -0.040588991
## indhhin2.less than $20,000                    -0.033442708
## indhhin2.$75,000-$99,999                      -0.082581047
## bmxbmi                                         0.009554692
## lbxglu                                        -0.005718597
##                                     indhhin2.$10,000-$14,999
## riagendr.Male                                    -0.02800786
## riagendr.Female                                   0.02800786
## ridageyr                                          0.09951650
## ridreth1.MexicanAmerican                          0.06760298
## ridreth1.Other Hispanic                           0.06837257
## ridreth1.Non-Hispanic White                      -0.06000445
## ridreth1.Non-Hispanic Black                       0.02702511
## dmdeduc2.Less than 9th grade                      0.12686239
## dmdeduc2.Grades 9-11th                            0.04053052
## dmdeduc2.High school graduate/GED                 0.06677701
## dmdeduc2.Some college or AA degrees              -0.05646402
## dmdmartl.Married                                 -0.10652619
## dmdmartl.Widowed                                  0.11397764
## dmdmartl.Divorced                                 0.03425750
## dmdmartl.Separated                                0.07775219
## dmdmartl.Never married                            0.02039455
## indhhin2.$0-$4,999                               -0.04945211
## indhhin2.$5,000-$9,999                           -0.06351017
## indhhin2.$10,000-$14,999                          1.00000000
## indhhin2.$15,000-$19,999                         -0.08310118
## indhhin2.$20,000-$24,999                         -0.08507249
## indhhin2.$25,000-$34,999                         -0.11736229
## indhhin2.$45,000-$54,999                         -0.09034612
## indhhin2.$65,000-$74,999                         -0.07957240
## indhhin2.20,000+                                 -0.05019706
## indhhin2.less than $20,000                       -0.04135914
## indhhin2.$75,000-$99,999                         -0.10212932
## bmxbmi                                           -0.01186220
## lbxglu                                            0.00247967
##                                     indhhin2.$15,000-$19,999
## riagendr.Male                                   -0.008279872
## riagendr.Female                                  0.008279872
## ridageyr                                         0.047583836
## ridreth1.MexicanAmerican                         0.051828844
## ridreth1.Other Hispanic                          0.043839553
## ridreth1.Non-Hispanic White                     -0.011209740
## ridreth1.Non-Hispanic Black                     -0.046330517
## dmdeduc2.Less than 9th grade                     0.070082430
## dmdeduc2.Grades 9-11th                           0.048786101
## dmdeduc2.High school graduate/GED                0.078688206
## dmdeduc2.Some college or AA degrees             -0.049835260
## dmdmartl.Married                                -0.137823165
## dmdmartl.Widowed                                 0.026278741
## dmdmartl.Divorced                                0.074854585
## dmdmartl.Separated                               0.050752918
## dmdmartl.Never married                           0.014224570
## indhhin2.$0-$4,999                              -0.052321316
## indhhin2.$5,000-$9,999                          -0.067195027
## indhhin2.$10,000-$14,999                        -0.083101181
## indhhin2.$15,000-$19,999                         1.000000000
## indhhin2.$20,000-$24,999                        -0.090008390
## indhhin2.$25,000-$34,999                        -0.124171637
## indhhin2.$45,000-$54,999                        -0.095587998
## indhhin2.$65,000-$74,999                        -0.084189185
## indhhin2.20,000+                                -0.053109495
## indhhin2.less than $20,000                      -0.043758794
## indhhin2.$75,000-$99,999                        -0.108054857
## bmxbmi                                          -0.021396061
## lbxglu                                           0.064868950
##                                     indhhin2.$20,000-$24,999
## riagendr.Male                                   -0.013615765
## riagendr.Female                                  0.013615765
## ridageyr                                         0.004340429
## ridreth1.MexicanAmerican                         0.080518667
## ridreth1.Other Hispanic                          0.010538303
## ridreth1.Non-Hispanic White                     -0.027214942
## ridreth1.Non-Hispanic Black                      0.012999189
## dmdeduc2.Less than 9th grade                     0.093402738
## dmdeduc2.Grades 9-11th                           0.033182461
## dmdeduc2.High school graduate/GED               -0.028642427
## dmdeduc2.Some college or AA degrees              0.029408267
## dmdmartl.Married                                -0.039465404
## dmdmartl.Widowed                                -0.002073153
## dmdmartl.Divorced                               -0.013293014
## dmdmartl.Separated                               0.082296889
## dmdmartl.Never married                          -0.025338713
## indhhin2.$0-$4,999                              -0.053562472
## indhhin2.$5,000-$9,999                          -0.068789013
## indhhin2.$10,000-$14,999                        -0.085072490
## indhhin2.$15,000-$19,999                        -0.090008390
## indhhin2.$20,000-$24,999                         1.000000000
## indhhin2.$25,000-$34,999                        -0.127117211
## indhhin2.$45,000-$54,999                        -0.097855517
## indhhin2.$65,000-$74,999                        -0.086186303
## indhhin2.20,000+                                -0.054369347
## indhhin2.less than $20,000                      -0.044796831
## indhhin2.$75,000-$99,999                        -0.110618112
## bmxbmi                                           0.035630116
## lbxglu                                           0.051934542
##                                     indhhin2.$25,000-$34,999
## riagendr.Male                                   -0.030147464
## riagendr.Female                                  0.030147464
## ridageyr                                        -0.023087596
## ridreth1.MexicanAmerican                        -0.022372295
## ridreth1.Other Hispanic                         -0.031578666
## ridreth1.Non-Hispanic White                      0.040134962
## ridreth1.Non-Hispanic Black                      0.015608909
## dmdeduc2.Less than 9th grade                    -0.040473993
## dmdeduc2.Grades 9-11th                           0.047684818
## dmdeduc2.High school graduate/GED                0.080105990
## dmdeduc2.Some college or AA degrees              0.017885975
## dmdmartl.Married                                -0.025894374
## dmdmartl.Widowed                                 0.040742036
## dmdmartl.Divorced                               -0.001623868
## dmdmartl.Separated                               0.002469709
## dmdmartl.Never married                           0.031583171
## indhhin2.$0-$4,999                              -0.073892443
## indhhin2.$5,000-$9,999                          -0.094898314
## indhhin2.$10,000-$14,999                        -0.117362287
## indhhin2.$15,000-$19,999                        -0.124171637
## indhhin2.$20,000-$24,999                        -0.127117211
## indhhin2.$25,000-$34,999                         1.000000000
## indhhin2.$45,000-$54,999                        -0.134997191
## indhhin2.$65,000-$74,999                        -0.118898854
## indhhin2.20,000+                                -0.075005573
## indhhin2.less than $20,000                      -0.061799749
## indhhin2.$75,000-$99,999                        -0.152603909
## bmxbmi                                          -0.018310770
## lbxglu                                           0.012113054
##                                     indhhin2.$45,000-$54,999
## riagendr.Male                                   -0.001429688
## riagendr.Female                                  0.001429688
## ridageyr                                        -0.038052431
## ridreth1.MexicanAmerican                        -0.021509342
## ridreth1.Other Hispanic                         -0.012403737
## ridreth1.Non-Hispanic White                      0.006137821
## ridreth1.Non-Hispanic Black                      0.026768158
## dmdeduc2.Less than 9th grade                    -0.035490049
## dmdeduc2.Grades 9-11th                          -0.038028797
## dmdeduc2.High school graduate/GED                0.005154086
## dmdeduc2.Some college or AA degrees              0.082630518
## dmdmartl.Married                                 0.074498358
## dmdmartl.Widowed                                -0.034556429
## dmdmartl.Divorced                                0.024608971
## dmdmartl.Separated                              -0.061061909
## dmdmartl.Never married                          -0.023905784
## indhhin2.$0-$4,999                              -0.056882803
## indhhin2.$5,000-$9,999                          -0.073053236
## indhhin2.$10,000-$14,999                        -0.090346124
## indhhin2.$15,000-$19,999                        -0.095587998
## indhhin2.$20,000-$24,999                        -0.097855517
## indhhin2.$25,000-$34,999                        -0.134997191
## indhhin2.$45,000-$54,999                         1.000000000
## indhhin2.$65,000-$74,999                        -0.091528982
## indhhin2.20,000+                                -0.057739696
## indhhin2.less than $20,000                      -0.047573781
## indhhin2.$75,000-$99,999                        -0.117475316
## bmxbmi                                           0.034408507
## lbxglu                                           0.017172744
##                                     indhhin2.$65,000-$74,999 indhhin2.20,000+
## riagendr.Male                                    0.006668939     -0.033416042
## riagendr.Female                                 -0.006668939      0.033416042
## ridageyr                                         0.005053337     -0.081919744
## ridreth1.MexicanAmerican                        -0.019822460      0.106103297
## ridreth1.Other Hispanic                          0.007335408     -0.015924383
## ridreth1.Non-Hispanic White                      0.013117724     -0.016619728
## ridreth1.Non-Hispanic Black                     -0.002248780     -0.048907525
## dmdeduc2.Less than 9th grade                    -0.022255450     -0.019274122
## dmdeduc2.Grades 9-11th                          -0.015752518      0.033202359
## dmdeduc2.High school graduate/GED                0.038227235     -0.012414401
## dmdeduc2.Some college or AA degrees              0.015921913     -0.029218897
## dmdmartl.Married                                 0.050051795     -0.058842981
## dmdmartl.Widowed                                -0.042747474     -0.051180644
## dmdmartl.Divorced                               -0.011975714     -0.029507199
## dmdmartl.Separated                               0.001674479     -0.005940246
## dmdmartl.Never married                          -0.035848411      0.116319739
## indhhin2.$0-$4,999                              -0.050099561     -0.031604563
## indhhin2.$5,000-$9,999                          -0.064341679     -0.040588991
## indhhin2.$10,000-$14,999                        -0.079572401     -0.050197065
## indhhin2.$15,000-$19,999                        -0.084189185     -0.053109495
## indhhin2.$20,000-$24,999                        -0.086186303     -0.054369347
## indhhin2.$25,000-$34,999                        -0.118898854     -0.075005573
## indhhin2.$45,000-$54,999                        -0.091528982     -0.057739696
## indhhin2.$65,000-$74,999                         1.000000000     -0.050854271
## indhhin2.20,000+                                -0.050854271      1.000000000
## indhhin2.less than $20,000                      -0.041900635     -0.026432392
## indhhin2.$75,000-$99,999                        -0.103466452     -0.065270272
## bmxbmi                                           0.025660367      0.025320599
## lbxglu                                          -0.064348404      0.012292850
##                                     indhhin2.less than $20,000
## riagendr.Male                                     -0.014146747
## riagendr.Female                                    0.014146747
## ridageyr                                           0.056240125
## ridreth1.MexicanAmerican                           0.004210595
## ridreth1.Other Hispanic                            0.008793109
## ridreth1.Non-Hispanic White                       -0.020652926
## ridreth1.Non-Hispanic Black                        0.021062862
## dmdeduc2.Less than 9th grade                       0.058518174
## dmdeduc2.Grades 9-11th                            -0.013741434
## dmdeduc2.High school graduate/GED                  0.007328464
## dmdeduc2.Some college or AA degrees                0.006303120
## dmdmartl.Married                                  -0.051294066
## dmdmartl.Widowed                                   0.027658261
## dmdmartl.Divorced                                 -0.011876911
## dmdmartl.Separated                                 0.039301796
## dmdmartl.Never married                             0.027156280
## indhhin2.$0-$4,999                                -0.026040119
## indhhin2.$5,000-$9,999                            -0.033442708
## indhhin2.$10,000-$14,999                          -0.041359140
## indhhin2.$15,000-$19,999                          -0.043758794
## indhhin2.$20,000-$24,999                          -0.044796831
## indhhin2.$25,000-$34,999                          -0.061799749
## indhhin2.$45,000-$54,999                          -0.047573781
## indhhin2.$65,000-$74,999                          -0.041900635
## indhhin2.20,000+                                  -0.026432392
## indhhin2.less than $20,000                         1.000000000
## indhhin2.$75,000-$99,999                          -0.053778489
## bmxbmi                                             0.014670312
## lbxglu                                             0.047248443
##                                     indhhin2.$75,000-$99,999       bmxbmi
## riagendr.Male                                    0.037027181 -0.074050742
## riagendr.Female                                 -0.037027181  0.074050742
## ridageyr                                        -0.085864313  0.054940490
## ridreth1.MexicanAmerican                        -0.058022583  0.116792190
## ridreth1.Other Hispanic                          0.021713079  0.031603671
## ridreth1.Non-Hispanic White                     -0.030853880 -0.047290232
## ridreth1.Non-Hispanic Black                      0.010089490  0.106419542
## dmdeduc2.Less than 9th grade                    -0.084235601  0.012466537
## dmdeduc2.Grades 9-11th                          -0.025204946  0.006400369
## dmdeduc2.High school graduate/GED               -0.067298498  0.059502755
## dmdeduc2.Some college or AA degrees              0.024863521  0.071803429
## dmdmartl.Married                                 0.066606870  0.026017907
## dmdmartl.Widowed                                -0.072779967  0.013266915
## dmdmartl.Divorced                               -0.064856057  0.030141728
## dmdmartl.Separated                              -0.069025777  0.056879393
## dmdmartl.Never married                           0.009495700 -0.106358679
## indhhin2.$0-$4,999                              -0.064301620  0.040396520
## indhhin2.$5,000-$9,999                          -0.082581047  0.009554692
## indhhin2.$10,000-$14,999                        -0.102129322 -0.011862198
## indhhin2.$15,000-$19,999                        -0.108054857 -0.021396061
## indhhin2.$20,000-$24,999                        -0.110618112  0.035630116
## indhhin2.$25,000-$34,999                        -0.152603909 -0.018310770
## indhhin2.$45,000-$54,999                        -0.117475316  0.034408507
## indhhin2.$65,000-$74,999                        -0.103466452  0.025660367
## indhhin2.20,000+                                -0.065270272  0.025320599
## indhhin2.less than $20,000                      -0.053778489  0.014670312
## indhhin2.$75,000-$99,999                         1.000000000 -0.002492344
## bmxbmi                                          -0.002492344  1.000000000
## lbxglu                                          -0.007916544  0.128391900
##                                           lbxglu
## riagendr.Male                        0.052538085
## riagendr.Female                     -0.052538085
## ridageyr                             0.221625646
## ridreth1.MexicanAmerican             0.055799978
## ridreth1.Other Hispanic              0.053913405
## ridreth1.Non-Hispanic White         -0.048149775
## ridreth1.Non-Hispanic Black          0.011610207
## dmdeduc2.Less than 9th grade         0.078832553
## dmdeduc2.Grades 9-11th              -0.021872981
## dmdeduc2.High school graduate/GED    0.038260129
## dmdeduc2.Some college or AA degrees -0.040285393
## dmdmartl.Married                     0.027980439
## dmdmartl.Widowed                     0.047303877
## dmdmartl.Divorced                    0.091058683
## dmdmartl.Separated                   0.029194977
## dmdmartl.Never married              -0.127696270
## indhhin2.$0-$4,999                  -0.019100099
## indhhin2.$5,000-$9,999              -0.005718597
## indhhin2.$10,000-$14,999             0.002479670
## indhhin2.$15,000-$19,999             0.064868950
## indhhin2.$20,000-$24,999             0.051934542
## indhhin2.$25,000-$34,999             0.012113054
## indhhin2.$45,000-$54,999             0.017172744
## indhhin2.$65,000-$74,999            -0.064348404
## indhhin2.20,000+                     0.012292850
## indhhin2.less than $20,000           0.047248443
## indhhin2.$75,000-$99,999            -0.007916544
## bmxbmi                               0.128391900
## lbxglu                               1.000000000
# Remove highly correlated predictors.
# findCorrelation() returns the positions of the columns of cor_matrix that it
# flags for removal; .9 is passed as its second argument (the cutoff).
cor_high <- findCorrelation(cor_matrix, .9)
cor_high
## [1] 1
# Translate the flagged positions into column names.
high_cor_remove <- row.names(cor_matrix)[cor_high] 
high_cor_remove
## [1] "riagendr.Male"
# features2 = features1 without the flagged columns.
features2 <- setdiff(features1, high_cor_remove)

# Keep the identifier, the target indicator and the surviving features.
dm2_Train_independent_non_corr <- dm2Train_dummies_independent[, c("seqn", "diq010.Diabetes",features2) ]

\(~\)

\(~\)


\(~\)

\(~\)

2.4 Normalization

# Learn the "range" pre-processing parameters from the training features only.
preProcValues.range <- preProcess(
  dm2_Train_independent_non_corr[, features2],
  method = "range"
)

# Apply the learned transformation to the training data.
dm2Train_transformed.range <- predict(
  preProcValues.range,
  newdata = dm2_Train_independent_non_corr
)

# Encode the test set with the dummy-variable model built on the training set.
dm2Test_dummies <- predict(dummyVars.dm2Train, newdata = dm2.test) %>%
  as_tibble()

# Apply the training-set transformation to the encoded test data.
dm2Test_transformed.range <- predict(preProcValues.range, newdata = dm2Test_dummies) %>%
  as_tibble()

\(~\)

\(~\)

\(~\)

3 Train knn Model

# Make sure the 'class' package (which provides knn()) is available, then load it.
install_if_not('class')
## [1] "the package 'class' is already installed"
library('class')

# Show the number of rows and columns of the training set.
dim(dm2.train)
## [1] 1126   10
# Fit/predict with k-nearest neighbours: the first argument is the training
# feature matrix, the second the test feature matrix, `cl` supplies the training
# labels (the diq010.Diabetes indicator) and k = 5 neighbours are used.
# The predicted class for every test row is stored in the new knn_pred column.
dm2Test_transformed.range$knn_pred <- knn(dm2Train_transformed.range[,features2] , 
                                           dm2Test_transformed.range[,features2] ,
                                           cl = dm2Train_transformed.range$diq010.Diabetes,
                                           k = 5)
# Inspect the untransformed test set.
glimpse(dm2.test)
## Rows: 750
## Columns: 10
## $ seqn     <dbl> 83734, 83737, 83757, 83761, 83789, 83820, 83822, 83823, 83...
## $ riagendr <fct> Male, Female, Female, Female, Male, Male, Female, Female, ...
## $ ridageyr <dbl> 78, 72, 57, 24, 66, 70, 20, 29, 69, 71, 37, 49, 41, 54, 80...
## $ ridreth1 <fct> Non-Hispanic White, MexicanAmerican, Other Hispanic, Other...
## $ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, Less than 9th gra...
## $ dmdmartl <fct> Married, Separated, Separated, Never married, Living with ...
## $ indhhin2 <fct> "$20,000-$24,999", "$75,000-$99,999", "$20,000-$24,999", "...
## $ bmxbmi   <dbl> 28.8, 28.6, 35.4, 25.3, 34.0, 27.0, 22.2, 29.7, 28.2, 27.6...
## $ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,...
## $ lbxglu   <dbl> 84, 107, 398, 95, 113, 94, 80, 102, 105, 76, 79, 126, 110,...
# Convert the target indicator diq010.Diabetes to a factor so that it has the
# same type as knn_pred before the two are compared.
dm2Test_transformed.range <- dm2Test_transformed.range %>%
  mutate(diq010.Diabetes = as.factor(diq010.Diabetes)) 

# Inspect the transformed test set, now including knn_pred.
glimpse(dm2Test_transformed.range)
## Rows: 750
## Columns: 39
## $ seqn                                  <dbl> 83734, 83737, 83757, 83761, 8...
## $ riagendr.Male                         <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,...
## $ riagendr.Female                       <dbl> 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,...
## $ ridageyr                              <dbl> 0.96666667, 0.86666667, 0.616...
## $ ridreth1.MexicanAmerican              <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ `ridreth1.Other Hispanic`             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,...
## $ `ridreth1.Non-Hispanic White`         <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,...
## $ `ridreth1.Non-Hispanic Black`         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,...
## $ ridreth1.Other                        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ `dmdeduc2.Less than 9th grade`        <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
## $ `dmdeduc2.Grades 9-11th`              <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `dmdeduc2.High school graduate/GED`   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ `dmdeduc2.Some college or AA degrees` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,...
## $ `dmdeduc2.College grad or above`      <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,...
## $ dmdmartl.Married                      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ dmdmartl.Widowed                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ dmdmartl.Divorced                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ dmdmartl.Separated                    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ `dmdmartl.Never married`              <dbl> 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,...
## $ `dmdmartl.Living with partner`        <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,...
## $ `indhhin2.$0-$4,999`                  <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$5,000-$9,999`              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$10,000-$14,999`            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,...
## $ `indhhin2.$15,000-$19,999`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$20,000-$24,999`            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$25,000-$34,999`            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,...
## $ `indhhin2.$35,000-$44,999`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$45,000-$54,999`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$55,000-$64,999`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$65,000-$74,999`            <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,...
## $ `indhhin2.20,000+`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.less than $20,000`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$75,000-$99,999`            <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `indhhin2.$100,000+`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bmxbmi                                <dbl> 0.25779626, 0.25363825, 0.395...
## $ diq010.Diabetes                       <fct> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,...
## $ `diq010.No Diabetes`                  <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,...
## $ lbxglu                                <dbl> 0.07925408, 0.13286713, 0.811...
## $ knn_pred                              <fct> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,...
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
## 
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
## The following object is masked from 'package:readr':
## 
##     spec
# Cross-tabulate the kNN predictions against the true diq010.Diabetes values.
conf_mat(dm2Test_transformed.range, truth=diq010.Diabetes, knn_pred)
##           Truth
## Prediction   0   1
##          0 608  98
##          1  30  14
# Derive the summary metrics from the same confusion matrix.
# NOTE(review): yardstick announces on load that the first factor level is
# assumed to be the event. The levels here are ordered 0, 1, so the
# class-specific metrics (sens, spec, ppv, npv, precision, recall, f_meas)
# describe class 0, not class 1 (diq010.Diabetes == 1).
summary(conf_mat(dm2Test_transformed.range, truth=diq010.Diabetes, knn_pred))
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.829 
##  2 kap                  binary        0.104 
##  3 sens                 binary        0.953 
##  4 spec                 binary        0.125 
##  5 ppv                  binary        0.861 
##  6 npv                  binary        0.318 
##  7 mcc                  binary        0.118 
##  8 j_index              binary        0.0780
##  9 bal_accuracy         binary        0.539 
## 10 detection_prevalence binary        0.941 
## 11 precision            binary        0.861 
## 12 recall               binary        0.953 
## 13 f_meas               binary        0.905

4 Code Appendix

\(~\)

# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Load the data set from its serialized R object.
diab_pop <- readRDS(
  file = 'C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS'
)

#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin  - Categorical
##### dmdeduc2 - Education level - Adults 20+  - Categorical
##### dmdmartl - Marital status  - Categorical
##### indhhin2 - Annual household income  - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical
# Install whichever of the requested packages are not installed yet.
#
# list.of.packages: character vector of package names.
#
# If at least one requested package is missing, only the missing ones are handed
# to install.packages(). Otherwise one "already installed" message per requested
# package is printed.
install_if_not <- function(list.of.packages) {
  already_present <- list.of.packages %in% installed.packages()[, "Package"]
  missing_pkgs <- list.of.packages[!already_present]

  if (length(missing_pkgs) == 0) {
    print(paste0("the package '", list.of.packages, "' is already installed"))
  } else {
    install.packages(missing_pkgs)
  }
}

library('tidyverse')

# Drop every row that contains at least one missing value.
diab_pop.no_na_vals <- na.omit(diab_pop)

library('caret')

# Fix the random seed so the partition below is identical on every run.
# To get a different split each run, seed from the clock instead:
# set.seed(Sys.time())
set.seed(8675309)

# createDataPartition() picks the rows of the training set: a single partition
# (times = 1) with p = .6, returned as a matrix of row indices (list = FALSE).
trainIndex <- createDataPartition(
  y = diab_pop.no_na_vals$diq010,
  times = 1,
  p = .6,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
dm2.test <- diab_pop.no_na_vals[-trainIndex, ]
dm2.train <- diab_pop.no_na_vals[trainIndex, ]
# Build the dummy-variable encoder on the training data; the formula ~. uses
# every column of dm2.train.
dummyVars.dm2Train <- dummyVars( ~. , data = dm2.train)

# Encode the training set.
dm2Train_dummies <- as_tibble( predict(dummyVars.dm2Train, dm2.train) )

# Detect columns that are exact linear combinations of other columns.
comboInformation <- findLinearCombos(dm2Train_dummies)

comboInformation

# Keep every column that is not listed in comboInformation$remove.
# Selecting the columns to KEEP (rather than negating the ones to drop) stays
# correct when nothing has to be removed: x[, -NULL] would select zero columns
# and silently discard every feature.
dm2Train_dummies_independent <- dm2Train_dummies[
  ,
  setdiff(seq_len(ncol(dm2Train_dummies)), comboInformation$remove)
]

# Candidate features: everything except the identifier and BOTH target dummies.
# "diq010.No Diabetes" is the complement of the target, so it must never be used
# as a predictor even if findLinearCombos() happens to keep it.
features1 <- setdiff(
  colnames(dm2Train_dummies_independent),
  c("seqn", "diq010.Diabetes", "diq010.No Diabetes")
)
features1

# Pairwise correlations between all candidate features.
cor_matrix <- dm2Train_dummies_independent[, features1] %>% cor()
cor_matrix

# Column positions flagged by caret at a correlation cutoff of 0.9.
cor_high <- findCorrelation(cor_matrix, cutoff = 0.9)
cor_high

# Names of the flagged columns.
high_cor_remove <- rownames(cor_matrix)[cor_high]
high_cor_remove

# Final feature list: candidate features minus the flagged ones.
features2 <- features1[!(features1 %in% high_cor_remove)]

# Training table restricted to the identifier, the target indicator and the
# final features.
dm2_Train_independent_non_corr <- dm2Train_dummies_independent[
  ,
  c("seqn", "diq010.Diabetes", features2)
]

# Learn the "range" pre-processing parameters from the training features only.
preProcValues.range <- preProcess(
  dm2_Train_independent_non_corr[, features2],
  method = "range"
)

# Apply the learned transformation to the training data.
dm2Train_transformed.range <- predict(
  preProcValues.range,
  newdata = dm2_Train_independent_non_corr
)

# Encode the test set with the dummy-variable model built on the training set.
dm2Test_dummies <- predict(dummyVars.dm2Train, newdata = dm2.test) %>%
  as_tibble()

# Apply the training-set transformation to the encoded test data.
dm2Test_transformed.range <- predict(preProcValues.range, newdata = dm2Test_dummies) %>%
  as_tibble()


# The 'class' package provides knn(); install it if needed, then load it.
install_if_not('class')

library('class')

# Number of rows and columns in the training set.
dim(dm2.train)

# k-nearest-neighbour classification with k = 5: training features, test
# features and the training labels (diq010.Diabetes indicator). The predicted
# class of every test row is stored in the knn_pred column.
dm2Test_transformed.range$knn_pred <- knn(
  train = dm2Train_transformed.range[, features2],
  test = dm2Test_transformed.range[, features2],
  cl = dm2Train_transformed.range$diq010.Diabetes,
  k = 5
)

# Inspect the untransformed test set.
glimpse(dm2.test)

# Evaluate the kNN predictions on the test set.
#
# yardstick treats the FIRST factor level as the event of interest. The target
# indicator diq010.Diabetes is 1 for a diabetes diagnosis, and as.factor() would
# order the levels "0", "1", which makes sens/spec/ppv/precision/recall/f_meas
# describe the no-diabetes class. Put "1" first in both the truth and the
# prediction so the class-specific metrics refer to diabetes.
dm2Test_transformed.range <- dm2Test_transformed.range %>%
  mutate(
    diq010.Diabetes = factor(diq010.Diabetes, levels = c(1, 0)),
    knn_pred = factor(knn_pred, levels = c("1", "0"))
  )

glimpse(dm2Test_transformed.range)

library('yardstick')

# Confusion matrix of predicted vs. true class.
conf_mat(dm2Test_transformed.range, truth=diq010.Diabetes, knn_pred)

# Summary metrics derived from the same confusion matrix.
summary(conf_mat(dm2Test_transformed.range, truth=diq010.Diabetes, knn_pred))