Load the required packages:
library(mlbench)
library(tidyverse)
## ── Attaching packages ───────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Load the data and remove NAs
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(PimaIndiansDiabetes2, 3)
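As a quick sanity check (not shown in the original output), it is worth looking at the class balance of the outcome before splitting the data:
# Count negative and positive diabetes cases
table(PimaIndiansDiabetes2$diabetes)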
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 3 106 54 21 158 30.9 0.292 24 neg
## 2 10 101 76 48 180 32.9 0.171 63 neg
## 3 11 120 80 37 150 42.3 0.785 48 pos
# Split the data into training and test set
set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
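createDataPartition() samples within each outcome class, so both sets keep roughly the original neg/pos proportions. A small sketch to verify the split sizes:
# Roughly 80% of the rows should land in the training set
nrow(train.data)
nrow(test.data)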
# Fit the KNN classification model on the training set
set.seed(123)
model_knn <- train(
diabetes ~., data = train.data, method = "knn",
trControl = trainControl("cv", number = 10),
preProcess = c("center","scale"),
tuneLength = 20
)
# Plot model accuracy vs different values of k
plot(model_knn)
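The plot only shows the accuracy curve; the value of k selected by cross-validation can be read directly from the fitted object (output not shown here):
# Best tuning parameter k that maximizes accuracy
model_knn$bestTune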
# Fit a logistic regression model on the training set
set.seed(123)
model_glm <- train(
diabetes ~ ., data = train.data, method = "glm", family = "binomial",
trControl = trainControl("cv", number = 10),
preProcess = c("center","scale")
)
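Since "glm" has no tuning parameter, cross-validation here only estimates out-of-sample performance. To inspect the fitted coefficients, a sketch using the finalModel component of the train object:
# Underlying glm fit with coefficient estimates
summary(model_glm$finalModel)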
# Make predictions on the test set with each model
pred_knn <- model_knn %>% predict(test.data)
head(pred_knn)
## [1] neg neg neg pos neg neg
## Levels: neg pos
pred_glm <- model_glm %>% predict(test.data)
head(pred_glm)
## [1] neg neg pos pos neg neg
## Levels: neg pos
# Cross-tabulate the KNN and logistic regression predictions
table(pred_knn, pred_glm)
## pred_glm
## pred_knn neg pos
## neg 55 9
## pos 0 14
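The table above only measures how often the two models agree with each other, not how accurate they are. A sketch comparing each model against the observed outcomes, using caret's confusionMatrix():
# Overall test-set accuracy of each model
mean(pred_knn == test.data$diabetes)
mean(pred_glm == test.data$diabetes)
# Detailed confusion matrix for the KNN predictions
confusionMatrix(pred_knn, test.data$diabetes)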
# Fit the same logistic regression directly with glm(), without caret
model_ngc <- glm(diabetes ~ ., data = train.data, family = "binomial")
# Predicted probabilities of the positive class on the test set
prob_ngc <- model_ngc %>% predict(test.data, type = "response")
# Classify as positive when the predicted probability exceeds 0.5
pred_ngc <- prob_ngc > 0.5
head(pred_ngc)
## 19 21 32 55 64 71
## FALSE FALSE TRUE TRUE FALSE FALSE
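Note that pred_ngc is a logical vector, while caret's predict() returned factor labels. A small sketch (the helper name pred_ngc_lab is ours) to put the plain glm() predictions on the same scale and check that the two routes agree:
# Map TRUE/FALSE to the "pos"/"neg" labels used by caret
pred_ngc_lab <- factor(ifelse(prob_ngc > 0.5, "pos", "neg"), levels = levels(test.data$diabetes))
# Proportion of test cases where the two glm fits agree
mean(pred_ngc_lab == pred_glm)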
Next, we apply the same KNN workflow to a regression problem using the Boston housing data.
# Load the data
data("Boston", package = "MASS")
# Inspect the data
sample_n(Boston, 3)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.04741 0 11.93 0 0.573 6.030 80.8 2.5050 1 273 21.0 396.90
## 2 0.15086 0 27.74 0 0.609 5.454 92.7 1.8209 4 711 20.1 395.09
## 3 0.07950 60 1.69 0 0.411 6.579 35.9 10.7103 4 411 18.3 370.78
## lstat medv
## 1 7.88 11.9
## 2 18.06 15.2
## 3 5.49 24.1
# Split the data into training and test set
set.seed(123)
training.samples <- Boston$medv %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]
# Fit the KNN regression model on the training set
set.seed(123)
model <- train(
medv~., data = train.data, method = "knn",
trControl = trainControl("cv", number = 10),
preProcess = c("center","scale"),
tuneLength = 20
)
# Plot the model error (RMSE) vs. different values of k
plot(model)
# Best tuning parameter k that minimizes the RMSE
model$bestTune
## k
## 1 5
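The full resampling profile behind this choice is stored in the train object; inspecting it shows the cross-validated RMSE for every candidate k (output not shown):
# Cross-validated performance for each k that was tried
head(model$results)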
# Make predictions on the test data
predictions <- model %>% predict(test.data)
head(predictions)
## [1] 32.60 27.28 19.18 20.88 18.86 18.06
# Compute the prediction error RMSE
RMSE(predictions, test.data$medv)
## [1] 4.762122
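RMSE is only one way to summarize prediction error. Recent versions of caret also export MAE() and R2() helpers, which can be computed from the same predictions (a sketch):
# Mean absolute error and squared correlation on the test set
MAE(predictions, test.data$medv)
R2(predictions, test.data$medv)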