Notes June 10

KNN Classification

Get the required packages

library(mlbench)
library(tidyverse)

## ── Attaching packages ─────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.0.0       ✔ purrr   0.2.5  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.1       ✔ stringr 1.4.0  
## ✔ readr   1.1.1       ✔ forcats 0.3.0

## ── Conflicts ────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# Load the Pima Indians diabetes data from mlbench and keep complete rows only
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- PimaIndiansDiabetes2 %>% na.omit()
# Peek at three randomly chosen rows
PimaIndiansDiabetes2 %>% sample_n(size = 3)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        0     129      110      46     130 67.1    0.319  26      pos
## 2        5     158       84      41     210 39.4    0.395  29      pos
## 3        2      94       76      18      66 31.6    0.649  23      neg

# Put 80% of the rows in the training set and hold out the rest for testing
set.seed(123)
training.samples <- createDataPartition(
  y = PimaIndiansDiabetes2$diabetes,
  p = 0.8,
  list = FALSE
)
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]

# Tune a k-nearest-neighbours classifier on the training set.
# Predictors are centered and scaled; 20 candidate values of k are compared
# using 10-fold cross-validation.
set.seed(123)
model_knn <- train(
  diabetes ~ .,
  data = train.data,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 20
)
# Plot cross-validated accuracy against k
plot(model_knn)

Try Logistic Regression

# Fit a logistic regression through caret on the same training set, with the
# same seed, 10-fold cross-validation and centering/scaling as the KNN model
set.seed(123)
model_glm <- train(
  diabetes ~ .,
  data = train.data,
  method = "glm",
  family = "binomial",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10)
)

# Predicted classes from the KNN model for the held-out test rows
pred_knn <- predict(model_knn, newdata = test.data)
head(pred_knn)

## [1] neg pos neg pos pos neg
## Levels: neg pos

# Predicted classes from the caret logistic regression for the same test rows
pred_glm <- predict(model_glm, newdata = test.data)
head(pred_glm)

## [1] neg pos neg pos pos neg
## Levels: neg pos

# Cross-tabulate the two models' test-set predictions to see where they agree
table(pred_knn, pred_glm)

##         pred_glm
## pred_knn neg pos
##      neg  59   5
##      pos   0  14

# Fit the same logistic regression directly with glm(), without caret
# NOTE(review): "ngc" presumably stands for "no caret" -- confirm
model_ngc <- glm(diabetes ~ ., data = train.data, family = "binomial")
# type = "response" returns fitted probabilities rather than log-odds
prob_ngc <- predict(model_ngc, newdata = test.data, type = "response")
# Threshold at 0.5. The result is a logical vector (TRUE = predicted "pos",
# the second factor level), not a factor like pred_knn / pred_glm.
pred_ngc <- prob_ngc > 0.5
head(pred_ngc)

##    21    25    28    29    32    36 
## FALSE  TRUE FALSE  TRUE  TRUE FALSE

# KNN regression example: load the Boston data set from MASS
data("Boston", package = "MASS")
# Peek at three randomly chosen rows
Boston %>% sample_n(size = 3)

##       crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 1  0.03659 25  4.86    0 0.426 6.302 32.2 5.4007   4 281    19.0 396.90
## 2 13.35980  0 18.10    0 0.693 5.887 94.7 1.7821  24 666    20.2 396.90
## 3  1.46336  0 19.58    0 0.605 7.489 90.8 1.9709   5 403    14.7 374.43
##   lstat medv
## 1  6.72 24.8
## 2 16.35 12.7
## 3  1.73 50.0

# Put 80% of the rows in the training set and hold out the rest for testing.
# This overwrites training.samples, train.data and test.data from the
# diabetes example above.
set.seed(123)
training.samples <- createDataPartition(
  y = Boston$medv,
  p = 0.8,
  list = FALSE
)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]

# Tune a k-nearest-neighbours regression of medv on all other columns.
# Predictors are centered and scaled; 20 candidate values of k are compared
# using 10-fold cross-validation.
set.seed(123)
model <- train(
  medv ~ .,
  data = train.data,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 20
)
# Plot cross-validated RMSE against k
plot(model)

# Value of the tuning parameter k that minimizes the RMSE
model[["bestTune"]]

##   k
## 1 5

# Predict medv for the held-out test rows
predictions <- predict(model, newdata = test.data)
head(predictions)

## [1] 31.28 29.98 20.40 21.90 19.84 17.64

# Root mean squared error of the predictions against the observed test values
RMSE(pred = predictions, obs = test.data$medv)

## [1] 4.621668

Notes June 10

Harold Nelson

6/4/2019

KNN Classification

Try Logistic Regression