library(tidyverse)
## ── Attaching packages ──────────────────
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Restore the saved objects from the course .Rdata file into the workspace.
# NOTE(review): the code below expects this file to provide a data frame
# named `cdc` with gender, height and weight columns -- confirm. The path is
# specific to the author's machine and must be adjusted to run elsewhere.
load("~/Dropbox/RProjects/Module 8/cdc.Rdata")
Get Reduced CDC
# Work on a random 25% subsample of the rows in `cdc`; the seed is fixed
# so the same rows are drawn on every run.
set.seed(123)
cdc <- sample_frac(cdc, 0.25)
Divide: Train and Test
# Split `cdc` into training and testing sets. The partition is made on the
# gender outcome with p = 0.7; rows not selected for training form the
# testing set. The seed fixes which rows are drawn.
set.seed(3033)
intrain <- createDataPartition(
  y = cdc$gender,
  p = 0.7,
  list = FALSE
)
training <- cdc[intrain, ]
testing <- cdc[-intrain, ]
Fit with Train
# Resampling scheme: 10-fold cross-validation, repeated 3 times.
trctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

# Fit a k-nearest-neighbours classifier for gender from height and weight.
# Predictors are centered and scaled, and 20 candidate values of k are
# evaluated under the resampling scheme above. The seed is set right before
# the fit so the resampling folds are reproducible.
set.seed(3333)
knn_fit <- train(
  gender ~ height + weight,
  data = training,
  method = "knn",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneLength = 20
)

# Print the resampling summary and the selected k.
knn_fit
## k-Nearest Neighbors
##
## 3501 samples
## 2 predictor
## 2 classes: 'm', 'f'
##
## Pre-processing: centered (2), scaled (2)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 3151, 3151, 3151, 3150, 3150, 3151, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8436691 0.6850537
## 7 0.8480465 0.6938811
## 9 0.8510985 0.7001794
## 11 0.8490022 0.6959080
## 13 0.8499530 0.6978206
## 15 0.8545271 0.7073267
## 17 0.8548123 0.7080499
## 19 0.8553851 0.7092094
## 21 0.8558610 0.7101730
## 23 0.8547209 0.7077682
## 25 0.8555767 0.7094142
## 27 0.8557652 0.7097738
## 29 0.8549089 0.7081536
## 31 0.8550028 0.7083444
## 33 0.8550039 0.7083254
## 35 0.8541459 0.7066260
## 37 0.8535764 0.7055303
## 39 0.8541467 0.7066599
## 41 0.8539565 0.7063008
## 43 0.8539546 0.7062680
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 21.
Look at Results
# Scatter plot of resampled accuracy against each candidate value of k.
ggplot(knn_fit$results, aes(x = k, y = Accuracy)) +
  geom_point()

# Predict gender for the held-out testing rows, then tabulate the
# predictions against the observed labels.
test_pred <- predict(knn_fit, newdata = testing)
confusionMatrix(data = test_pred, reference = testing$gender)
## Confusion Matrix and Statistics
##
## Reference
## Prediction m f
## m 584 101
## f 120 694
##
## Accuracy : 0.8526
## 95% CI : (0.8336, 0.8701)
## No Information Rate : 0.5304
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7036
## Mcnemar's Test P-Value : 0.226
##
## Sensitivity : 0.8295
## Specificity : 0.8730
## Pos Pred Value : 0.8526
## Neg Pred Value : 0.8526
## Prevalence : 0.4696
## Detection Rate : 0.3896
## Detection Prevalence : 0.4570
## Balanced Accuracy : 0.8513
##
## 'Positive' Class : m
##