# Load the required packages
library(mlbench)
library(tidyverse)
## ── Attaching packages ─────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.1 ✔ stringr 1.4.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Load the data and remove NAs
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(PimaIndiansDiabetes2, 3)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 0 129 110 46 130 67.1 0.319 26 pos
## 2 5 158 84 41 210 39.4 0.395 29 pos
## 3 2 94 76 18 66 31.6 0.649 23 neg
# Split the data into training and test set
set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
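createDataPartition() performs a stratified split on the outcome, so the class balance of diabetes should be similar in the two sets; a quick check (a minimal sketch, output not shown):
# Compare the class proportions in the training and test sets
prop.table(table(train.data$diabetes))
prop.table(table(test.data$diabetes))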
# Fit the model on the training set
set.seed(123)
model_knn <- train(
  diabetes ~ ., data = train.data, method = "knn",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale"),
  tuneLength = 20
)
# Plot model accuracy vs different values of k
plot(model_knn)
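The k selected by cross-validation can be printed directly, just as for the regression model further below (output not shown):
# Best tuning parameter k that maximizes cross-validated accuracy
model_knn$bestTune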
# Fit a logistic regression model on the training set
set.seed(123)
model_glm <- train(
  diabetes ~ ., data = train.data, method = "glm", family = "binomial",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale")
)
# Make predictions on the test set with each model
pred_knn <- model_knn %>% predict(test.data)
head(pred_knn)
## [1] neg pos neg pos pos neg
## Levels: neg pos
pred_glm <- model_glm %>% predict(test.data)
head(pred_glm)
## [1] neg pos neg pos pos neg
## Levels: neg pos
# Cross-tabulate the predictions of the two models
table(pred_knn, pred_glm)
## pred_glm
## pred_knn neg pos
## neg 59 5
## pos 0 14
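The two models agree on 73 of the 78 test observations (59 + 14 on the diagonal). To evaluate each model against the true labels rather than against each other, a minimal sketch (values depend on the random split):
# Test-set accuracy of each model (output not shown)
mean(pred_knn == test.data$diabetes)
mean(pred_glm == test.data$diabetes)
# confusionMatrix() from caret gives a fuller summary
confusionMatrix(pred_knn, test.data$diabetes)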
# Fit the logistic regression directly with glm(), without caret
model_ngc <- glm(diabetes ~ ., data = train.data, family = "binomial")
# Predicted probabilities on the test set
prob_ngc <- model_ngc %>% predict(test.data, type = "response")
# Classify as positive when the predicted probability exceeds 0.5
pred_ngc <- prob_ngc > 0.5
head(pred_ngc)
## 21 25 28 29 32 36
## FALSE TRUE FALSE TRUE TRUE FALSE
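These logical predictions can be mapped back to the factor labels to compare with the caret fit; since centering and scaling do not change the fitted probabilities of a logistic regression, the two should agree (a sketch; pred_ngc_class is a name introduced here):
# Convert TRUE/FALSE to the "pos"/"neg" labels used above
pred_ngc_class <- factor(ifelse(pred_ngc, "pos", "neg"),
                         levels = levels(test.data$diabetes))
table(pred_ngc_class, pred_glm)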
# kNN regression example: load the Boston housing data
data("Boston", package = "MASS")
# Inspect the data
sample_n(Boston, 3)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.03659 25 4.86 0 0.426 6.302 32.2 5.4007 4 281 19.0 396.90
## 2 13.35980 0 18.10 0 0.693 5.887 94.7 1.7821 24 666 20.2 396.90
## 3 1.46336 0 19.58 0 0.605 7.489 90.8 1.9709 5 403 14.7 374.43
## lstat medv
## 1 6.72 24.8
## 2 16.35 12.7
## 3 1.73 50.0
# Split the data into training and test set
set.seed(123)
training.samples <- Boston$medv %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]
# Fit the model on the training set
set.seed(123)
model <- train(
  medv ~ ., data = train.data, method = "knn",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale"),
  tuneLength = 20
)
# Plot the model prediction error (RMSE) vs different values of k
plot(model)
# Best tuning parameter k that minimizes the RMSE
model$bestTune
## k
## 1 5
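The full grid of cross-validation results is kept in the fitted object; as a sketch, inspecting it shows the resampled error for every k tried (output not shown):
# Cross-validated RMSE, R-squared and MAE for each candidate k
head(model$results)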
# Make predictions on the test data
predictions <- model %>% predict(test.data)
head(predictions)
## [1] 31.28 29.98 20.40 21.90 19.84 17.64
# Compute the prediction error RMSE
RMSE(predictions, test.data$medv)
## [1] 4.621668
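caret also ships R2() and MAE() as companions to RMSE(); as a sketch, the remaining regression metrics on the same predictions (values depend on the split):
# Additional test-set metrics (output not shown)
R2(predictions, test.data$medv)
MAE(predictions, test.data$medv)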