library(tidyverse)
## ── Attaching packages ──────────────────
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.4     
## ✔ tidyr   0.8.0          ✔ stringr 1.3.0     
## ✔ readr   1.1.1          ✔ forcats 0.3.0
## ── Conflicts ── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
load("~/Dropbox/RProjects/Module 8/cdc.Rdata")

Get Reduced CDC

set.seed(123)
cdc %>% sample_frac(.25) -> cdc
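
To check that the 25% sample kept a reasonable gender mix, a quick tabulation (illustrative; exact counts depend on the seed):

cdc %>%
  count(gender) %>%
  mutate(prop = n / sum(n))   # class proportions after sampling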

Divide: Train and Test

set.seed(3033)
# 70/30 split, stratified on gender
intrain <- createDataPartition(y = cdc$gender, p = 0.7, list = FALSE)
training <- cdc[intrain,]
testing <- cdc[-intrain,]
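
createDataPartition() samples within each level of gender, so the split should preserve the class proportions. A quick sanity check (a sketch; exact numbers depend on the seeds above):

nrow(training)                        # about 70% of the rows
nrow(testing)                         # about 30% of the rows
prop.table(table(training$gender))    # class mix in the training set
prop.table(table(testing$gender))     # should closely match the training mix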

Fit with Train

# 10-fold cross-validation, repeated 3 times
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(3333)
# kNN on centered and scaled height and weight, tuning k over 20 values
knn_fit <- train(gender ~ height + weight, data = training, method = "knn",
                 trControl = trctrl,
                 preProcess = c("center", "scale"),
                 tuneLength = 20)
knn_fit
## k-Nearest Neighbors 
## 
## 3501 samples
##    2 predictor
##    2 classes: 'm', 'f' 
## 
## Pre-processing: centered (2), scaled (2) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 3151, 3151, 3151, 3150, 3150, 3151, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.8436691  0.6850537
##    7  0.8480465  0.6938811
##    9  0.8510985  0.7001794
##   11  0.8490022  0.6959080
##   13  0.8499530  0.6978206
##   15  0.8545271  0.7073267
##   17  0.8548123  0.7080499
##   19  0.8553851  0.7092094
##   21  0.8558610  0.7101730
##   23  0.8547209  0.7077682
##   25  0.8555767  0.7094142
##   27  0.8557652  0.7097738
##   29  0.8549089  0.7081536
##   31  0.8550028  0.7083444
##   33  0.8550039  0.7083254
##   35  0.8541459  0.7066260
##   37  0.8535764  0.7055303
##   39  0.8541467  0.7066599
##   41  0.8539565  0.7063008
##   43  0.8539546  0.7062680
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 21.
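
The printout selects k = 21. The same information is stored on the train object, which is easier to use in later code than reading the console output:

knn_fit$bestTune     # the k chosen by resampling (a one-row data frame)
knn_fit$finalModel   # the final model refit on all the training data with that k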

Look at Results

knn_fit$results %>%
  ggplot(aes(k, Accuracy)) + geom_point()
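
caret also provides a plot method for train objects that draws the same tuning profile, and the results table carries Kappa if you prefer to inspect that metric:

plot(knn_fit)    # caret's built-in accuracy-vs-k profile

knn_fit$results %>%
  ggplot(aes(k, Kappa)) + geom_point()   # same idea, plotted for Kappa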

test_pred <- predict(knn_fit, newdata = testing)
confusionMatrix(test_pred, testing$gender)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   m   f
##          m 584 101
##          f 120 694
##                                           
##                Accuracy : 0.8526          
##                  95% CI : (0.8336, 0.8701)
##     No Information Rate : 0.5304          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7036          
##  Mcnemar's Test P-Value : 0.226           
##                                           
##             Sensitivity : 0.8295          
##             Specificity : 0.8730          
##          Pos Pred Value : 0.8526          
##          Neg Pred Value : 0.8526          
##              Prevalence : 0.4696          
##          Detection Rate : 0.3896          
##    Detection Prevalence : 0.4570          
##       Balanced Accuracy : 0.8513          
##                                           
##        'Positive' Class : m               
##
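
To pull individual statistics out of the confusion matrix, or to get class probabilities instead of hard labels, the following sketch should work (type = "prob" relies on the underlying knn3 model returning probabilities):

cm <- confusionMatrix(test_pred, testing$gender)
cm$overall["Accuracy"]      # overall test-set accuracy
cm$byClass["Sensitivity"]   # sensitivity for the positive class ('m')

# class probabilities from the fitted kNN model
probs <- predict(knn_fit, newdata = testing, type = "prob")
head(probs)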