Notes Nov 4 Part 1

KNN Classification

Get the required packages

library(mlbench)
library(tidyverse)

## ── Attaching packages ───────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ──────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# Bring the PimaIndiansDiabetes2 data set in from mlbench, then drop every
# row that contains a missing value
data(list = "PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
# Peek at three randomly chosen rows
PimaIndiansDiabetes2 %>%
  sample_n(size = 3)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        3     106       54      21     158 30.9    0.292  24      neg
## 2       10     101       76      48     180 32.9    0.171  63      neg
## 3       11     120       80      37     150 42.3    0.785  48      pos

# Hold out 20% of the rows as a test set; the seed makes the split repeatable
set.seed(123)
training.samples <- createDataPartition(
  PimaIndiansDiabetes2$diabetes,
  p = 0.8,
  list = FALSE
)
# Rows picked by the partition train the models, the rest are kept for testing
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]

# Train a KNN classifier for `diabetes` on the training rows.
# Predictors are centered and scaled first, and 20 candidate values of k
# are compared with 10-fold cross-validation.
set.seed(123)
model_knn <- train(
  diabetes ~ .,
  data = train.data,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale"),
  tuneLength = 20
)
# Show cross-validated accuracy for each k that was tried
plot(model_knn)

Try Logistic Regression

# Train a logistic regression (binomial glm) for `diabetes` through caret,
# using the same centering/scaling and 10-fold cross-validation as the KNN fit
set.seed(123)
model_glm <- train(
  diabetes ~ .,
  data = train.data,
  method = "glm",
  family = "binomial",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

# Class predictions from the KNN model for the held-out rows
pred_knn <- predict(model_knn, newdata = test.data)
head(pred_knn)

## [1] neg neg neg pos neg neg
## Levels: neg pos

# Class predictions from the caret logistic regression for the held-out rows
pred_glm <- predict(model_glm, newdata = test.data)
head(pred_glm)

## [1] neg neg pos pos neg neg
## Levels: neg pos

# Cross-tabulate the two models' predictions to see where they disagree
table(pred_knn, pred_glm)

##         pred_glm
## pred_knn neg pos
##      neg  55   9
##      pos   0  14

# Fit the same logistic regression directly with glm(), without caret's
# train() wrapper (so no cross-validation and no centering/scaling step)
model_ngc <- glm(diabetes ~ ., data = train.data, family = "binomial")
# type = "response" asks for predicted probabilities instead of the
# linear predictor (log-odds)
prob_ngc <- predict(model_ngc, newdata = test.data, type = "response")
# Threshold the probabilities at 0.5; the result is a named logical vector
# (TRUE/FALSE), not a neg/pos factor like the caret predictions
pred_ngc <- prob_ngc > 0.5
head(pred_ngc)

##    19    21    32    55    64    71 
## FALSE FALSE  TRUE  TRUE FALSE FALSE

# KNN regression example: bring in the Boston data set that ships with MASS
data(list = "Boston", package = "MASS")
# Peek at three randomly chosen rows
Boston %>%
  sample_n(size = 3)

##      crim zn indus chas   nox    rm  age     dis rad tax ptratio  black
## 1 0.04741  0 11.93    0 0.573 6.030 80.8  2.5050   1 273    21.0 396.90
## 2 0.15086  0 27.74    0 0.609 5.454 92.7  1.8209   4 711    20.1 395.09
## 3 0.07950 60  1.69    0 0.411 6.579 35.9 10.7103   4 411    18.3 370.78
##   lstat medv
## 1  7.88 11.9
## 2 18.06 15.2
## 3  5.49 24.1

# Hold out 20% of the Boston rows as a test set; the seed makes the split
# repeatable. Note this overwrites the train/test objects from the Pima example.
set.seed(123)
training.samples <- createDataPartition(
  Boston$medv,
  p = 0.8,
  list = FALSE
)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]

# Train a KNN regression for `medv` on the training rows.
# Predictors are centered and scaled first, and 20 candidate values of k
# are compared with 10-fold cross-validation.
set.seed(123)
model <- train(
  medv ~ .,
  data = train.data,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale"),
  tuneLength = 20
)
# Show cross-validated RMSE for each k that was tried
plot(model)

# The value of k that minimizes the cross-validated RMSE
model[["bestTune"]]

##   k
## 1 5

# Predict `medv` for the held-out rows with the tuned model
predictions <- predict(model, newdata = test.data)
head(predictions)

## [1] 32.60 27.28 19.18 20.88 18.86 18.06

# Root mean squared error of the predictions against the observed test values
RMSE(pred = predictions, obs = test.data$medv)

## [1] 4.762122

Notes Nov 4 Part 1

Harold Nelson

11/4/2019

KNN Classification

Try Logistic Regression