pacman::p_load(kknn, ISLR, glmtoolbox, DALEX, MASS, lmtest, jtools, broom, car, vip, cowplot, glmnet, caret, tidymodels, tidyverse)
tidymodels_prefer()
theme_set(theme_bw())
# Load and prepare data
data(Credit)
# Exclude the ID column
Credit = Credit[-1] %>% drop_na()
str(Credit)
## 'data.frame': 400 obs. of 11 variables:
## $ Income : num 14.9 106 104.6 148.9 55.9 ...
## $ Limit : int 3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
## $ Rating : int 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : Factor w/ 2 levels " Male","Female": 1 2 1 2 1 1 2 1 2 2 ...
## $ Student : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
## $ Ethnicity: Factor w/ 3 levels "African American",..: 3 2 2 2 3 3 1 2 3 1 ...
## $ Balance : int 333 903 580 964 331 1151 203 872 279 1350 ...
# Split data into training and testing sets
set.seed(1)
split = initial_split(Credit, prop = 0.7)
train_data = training(split)
test_data = testing(split)
# Common pre-processing recipe
model_recipe = recipe(Balance ~ ., data = train_data) %>%
step_normalize(all_numeric_predictors()) %>% # Important for KNN
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())
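As a quick, optional sanity check, the recipe can be prepped on the training data and baked to inspect the transformed predictors (a sketch):
# prep() estimates the steps on the training data; bake(new_data = NULL)
# returns the processed training set for inspection
model_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()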
# KNN Regression specification and workflow
# tune(): placeholder marking a hyperparameter to be tuned over a grid
# weight_func: kernel used to weight the neighbors by their distance
# dist_power: exponent of the Minkowski distance (1 = Manhattan, 2 = Euclidean)
# weight_func and dist_power can be tuned alongside neighbors
knn_spec = nearest_neighbor(
neighbors = tune(),
weight_func = tune(),
dist_power = tune()) %>%
set_engine("kknn") %>%
set_mode("regression")
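To make dist_power concrete, here is a minimal sketch (with made-up vectors x1 and x2) showing how the Minkowski exponent interpolates between Manhattan (p = 1) and Euclidean (p = 2) distance:
# Minkowski distance between two toy standardized predictor vectors
x1 = c(0.5, -1.2, 0.3)
x2 = c(-0.1, 0.4, 1.0)
minkowski = function(a, b, p) sum(abs(a - b)^p)^(1 / p)
minkowski(x1, x2, p = 1) # Manhattan distance
minkowski(x1, x2, p = 2) # Euclidean distance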
# Workflows
knn_workflow = workflow() %>%
add_recipe(model_recipe) %>%
add_model(knn_spec)
# Set up cross-validation folds for tuning knn hyperparameters
set.seed(123)
folds = vfold_cv(train_data, v = 10)
# Tune KNN model
knn_grid = grid_latin_hypercube(
neighbors(range = c(1, 10)),
weight_func(values = c("rectangular", "triangular", "epanechnikov", "biweight",
"triweight", "cos", "inv", "gaussian", "optimal")),
dist_power(range = c(1, 2)),
size = 5
)
## Warning: `grid_latin_hypercube()` was deprecated in dials 1.3.0.
## ℹ Please use `grid_space_filling()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
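As the warning notes, grid_latin_hypercube() is deprecated in recent dials releases in favor of grid_space_filling(). An equivalent grid could be built as follows (a sketch, assuming dials >= 1.3.0; knn_grid_sf is a new name used here for illustration and is not used below):
# Alternative, non-deprecated way to build the same kind of grid
knn_grid_sf = grid_space_filling(
  neighbors(range = c(1, 10)),
  weight_func(values = c("rectangular", "triangular", "epanechnikov", "biweight",
                         "triweight", "cos", "inv", "gaussian", "optimal")),
  dist_power(range = c(1, 2)),
  size = 5,
  type = "latin_hypercube"
)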
knn_tune = tune_grid(knn_workflow,
resamples = folds,
grid = knn_grid,
metrics = metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
)
collect_metrics(knn_tune)
## # A tibble: 15 × 9
## neighbors weight_func dist_power .metric .estimator mean n std_err
## <int> <chr> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 5 cos 1.30 mae standard 174. 10 4.95
## 2 5 cos 1.30 rmse standard 223. 10 7.20
## 3 5 cos 1.30 rsq standard 0.768 10 0.0282
## 4 3 epanechnikov 1.06 mae standard 166. 10 6.32
## 5 3 epanechnikov 1.06 rmse standard 218. 10 7.58
## 6 3 epanechnikov 1.06 rsq standard 0.773 10 0.0283
## 7 8 optimal 1.73 mae standard 193. 10 3.89
## 8 8 optimal 1.73 rmse standard 240. 10 6.01
## 9 8 optimal 1.73 rsq standard 0.741 10 0.0337
## 10 9 rectangular 1.93 mae standard 218. 10 4.49
## 11 9 rectangular 1.93 rmse standard 269. 10 7.19
## 12 9 rectangular 1.93 rsq standard 0.702 10 0.0325
## 13 1 triweight 1.44 mae standard 205. 10 10.6
## 14 1 triweight 1.44 rmse standard 270. 10 10.4
## 15 1 triweight 1.44 rsq standard 0.672 10 0.0333
## # ℹ 1 more variable: .config <chr>
knn_tune %>% autoplot()
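Besides the autoplot, show_best() prints the top-ranked candidates for a chosen metric:
# Top 3 hyperparameter combinations by cross-validated RMSE
knn_tune %>% show_best(metric = "rmse", n = 3)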
# Select best KNN model
best_knn = knn_tune %>% select_best(metric = 'rmse')
best_knn
## # A tibble: 1 × 4
## neighbors weight_func dist_power .config
## <int> <chr> <dbl> <chr>
## 1 3 epanechnikov 1.06 Preprocessor1_Model2
final_knn_workflow = finalize_workflow(knn_workflow, best_knn)
# Fit the final model on the training data from the split and evaluate it on the test data from the split
knn_model = last_fit(final_knn_workflow, split)
# See results of trained model on the test data
final_knn = extract_fit_parsnip(knn_model)
final_knn
## parsnip model object
##
##
## Call:
## kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(3L, data, 5), distance = ~1.05871301149018, kernel = ~"epanechnikov")
##
## Type of response variable: continuous
## minimal mean absolute error: 165.3931
## Minimal mean squared error: 47862.04
## Best kernel: epanechnikov
## Best k: 3
# Test metrics of the trained model on the test data
collect_metrics(knn_model)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 277. Preprocessor1_Model1
## 2 rsq standard 0.626 Preprocessor1_Model1
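Note that last_fit() computes only RMSE and R-squared by default; to also get MAE on the test set, pass a metric set explicitly (a sketch; knn_model_mae is a new name used here for illustration):
# Refit so that collect_metrics() also reports MAE on the test data
knn_model_mae = last_fit(final_knn_workflow, split,
                         metrics = metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae))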
# Get predictions on the test data (collect_predictions() on a last_fit object returns test-set predictions)
test_predictions = collect_predictions(knn_model)
head(test_predictions)
## # A tibble: 6 × 5
## .pred id .row Balance .config
## <dbl> <chr> <int> <int> <chr>
## 1 881. train/test split 3 580 Preprocessor1_Model1
## 2 1627. train/test split 4 964 Preprocessor1_Model1
## 3 210. train/test split 5 331 Preprocessor1_Model1
## 4 777. train/test split 6 1151 Preprocessor1_Model1
## 5 136. train/test split 7 203 Preprocessor1_Model1
## 6 1077. train/test split 8 872 Preprocessor1_Model1
# Get fitted values for the training data directly from the underlying kknn fit
train_predictions = final_knn$fit$fitted.values %>% bind_cols(train_data) %>% rename(.pred = ...1)
## New names:
## • `` -> `...1`
head(train_predictions)
## .pred Income Limit Rating Cards Age Education Gender Student Married
## 1 1248.3952 182.728 13913 982 4 98 17 Male No Yes
## 2 326.4580 35.691 2880 214 2 35 15 Male No No
## 3 1195.4525 123.299 8376 610 2 89 17 Male Yes No
## 4 122.4892 20.791 2672 204 1 70 18 Female No No
## 5 715.6282 39.055 5565 410 4 48 18 Female No Yes
## 6 326.3437 36.472 3806 309 2 52 13 Male No No
## Ethnicity Balance
## 1 Caucasian 1999
## 2 African American 0
## 3 African American 1259
## 4 African American 0
## 5 Caucasian 772
## 6 African American 188
# Get predictions for the test data
# Prepare test_data with the recipe (prep() estimates the steps on the training data; bake() applies them to the test data)
test_data_processed = prep(model_recipe) %>% bake(new_data = test_data)
knn_predictions = predict(final_knn, new_data = test_data_processed) %>% bind_cols(test_data_processed)
head(knn_predictions)
## # A tibble: 6 × 13
## .pred Income Limit Rating Cards Age Education Balance Gender_Female
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 881. 1.68 0.999 1.02 0.772 0.913 -0.869 580 0
## 2 1627. 2.91 2.03 2.07 0.0725 -1.14 -0.869 964 1
## 3 210. 0.324 0.0757 0.0228 -0.627 0.738 0.736 331 0
## 4 777. 0.999 1.41 1.36 0.772 1.27 -1.19 1151 0
## 5 136. -0.645 -0.564 -0.597 -0.627 -1.08 -0.548 203 1
## 6 1077. 0.756 1.02 1.00 -0.627 1.85 -1.51 872 0
## # ℹ 4 more variables: Student_Yes <dbl>, Married_Yes <dbl>,
## # Ethnicity_Asian <dbl>, Ethnicity_Caucasian <dbl>
# Evaluate the model performance
metrics(knn_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 277.
## 2 rsq standard 0.626
## 3 mae standard 198.
# Linear Regression specification and workflow
lm_spec = linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
# Workflows
lm_workflow = workflow() %>%
add_recipe(model_recipe) %>%
add_model(lm_spec)
# Train the model on training data
final_lm = fit(lm_workflow, train_data)
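After fitting, the underlying lm object can be pulled out of the workflow and its coefficients inspected with broom::tidy() (broom is already loaded above):
# Extract the parsnip fit from the fitted workflow and tidy the coefficients
final_lm %>% extract_fit_parsnip() %>% tidy()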
# Evaluate the model performance
lm_predictions = final_lm %>% predict(test_data) %>% bind_cols(test_data)
metrics(lm_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 107.
## 2 rsq standard 0.945
## 3 mae standard 83.6
# Collect metrics on test set for both models
# pull() extracts a vector (a single variable); select() returns a tibble (like a data frame); see the short illustration after this chunk
test_results = test_data %>%
mutate(
.pred_lm = predict(final_lm, new_data = test_data) %>% pull(.pred),
.pred_knn = predict(final_knn, new_data = test_data_processed) %>% pull(.pred)
)
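To make the pull() vs select() distinction concrete (a quick illustrative check):
# pull() returns a plain vector; select() keeps a one-column tibble
test_data %>% pull(Balance) %>% head()
test_data %>% select(Balance) %>% head()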
head(test_results)
## Income Limit Rating Cards Age Education Gender Student Married
## 1 104.593 7075 514 4 71 11 Male No No
## 2 148.924 9504 681 3 36 11 Female No No
## 3 55.882 4897 357 2 68 16 Male No Yes
## 4 80.180 8047 569 4 77 10 Male No No
## 5 20.996 3388 259 2 37 12 Female No No
## 6 71.408 7114 512 2 87 9 Male No No
## Ethnicity Balance .pred_lm .pred_knn
## 1 Asian 580 677.8747 880.6868
## 2 Asian 964 980.7356 1627.0813
## 3 Caucasian 331 407.5040 210.2217
## 4 Caucasian 1151 1104.0413 776.8329
## 5 African American 203 278.5849 136.0664
## 6 Asian 872 888.1353 1077.1995
# Calculate performance metrics (the metric set is named reg_metrics so it does not mask yardstick::metrics)
reg_metrics = metric_set(rmse, rsq, mae)
lm_metrics = test_results %>%
reg_metrics(truth = Balance, estimate = .pred_lm) %>%
mutate(model = "Linear Regression")
knn_metrics = test_results %>%
reg_metrics(truth = Balance, estimate = .pred_knn) %>%
mutate(model = "KNN Regression")
# Combine and compare
bind_rows(lm_metrics, knn_metrics) %>% arrange(.metric)
## # A tibble: 6 × 4
## .metric .estimator .estimate model
## <chr> <chr> <dbl> <chr>
## 1 mae standard 83.6 Linear Regression
## 2 mae standard 198. KNN Regression
## 3 rmse standard 107. Linear Regression
## 4 rmse standard 277. KNN Regression
## 5 rsq standard 0.945 Linear Regression
## 6 rsq standard 0.626 KNN Regression
Comparing the test-set metrics, linear regression clearly outperforms KNN here: lower RMSE and MAE and a higher R-squared.
# Visual comparison
test_results %>%
ggplot(aes(x = Balance)) +
geom_point(aes(y = .pred_lm, color = "Linear Reg")) +
geom_point(aes(y = .pred_knn, color = "KNN Reg")) +
geom_abline(color = 'blue') +
labs(title = "Actual vs Predicted Values", y = "Predicted Balance") +
scale_color_manual(values = c("#E69F00", "#56B4E9"))
A few takeaways (the metrics are also computed by hand in the sketch below):

- RMSE (Root Mean Squared Error): lower is better
- R-squared: higher is better (closer to 1)
- MAE (Mean Absolute Error): lower is better
- KNN tends to overfit with too few neighbors (e.g., k = 1) and underfit with too many
- Assumptions: linear regression assumes linear relationships
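As a cross-check of what these metrics mean, they can be computed by hand from the predictions (a sketch using the test_results tibble built above; yardstick's rsq is the squared correlation):
# Hand-computed test-set metrics for the linear model
test_results %>%
  summarise(
    rmse_lm = sqrt(mean((Balance - .pred_lm)^2)),
    mae_lm  = mean(abs(Balance - .pred_lm)),
    rsq_lm  = cor(Balance, .pred_lm)^2
  )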
# Comparing KNN and Linear Regression on Credit Data
# load libraries
library(kknn)
library(ISLR)
library(glmtoolbox)
library(DALEX)
library(MASS)
library(lmtest)
library(jtools)
library(broom)
library(car)
library(vip)
library(cowplot)
library(caret)
library(glmnet)
library(tidymodels)
library(tidyverse)
# Prefer tidymodels ecosystem
tidymodels_prefer()
# Set ggplot theme to black and white
theme_set(theme_bw())
## Step 1: Load and prepare data
# Load data
data(Credit)
summary(Credit)
## ID Income Limit Rating
## Min. : 1.0 Min. : 10.35 Min. : 855 Min. : 93.0
## 1st Qu.:100.8 1st Qu.: 21.01 1st Qu.: 3088 1st Qu.:247.2
## Median :200.5 Median : 33.12 Median : 4622 Median :344.0
## Mean :200.5 Mean : 45.22 Mean : 4736 Mean :354.9
## 3rd Qu.:300.2 3rd Qu.: 57.47 3rd Qu.: 5873 3rd Qu.:437.2
## Max. :400.0 Max. :186.63 Max. :13913 Max. :982.0
## Cards Age Education Gender Student
## Min. :1.000 Min. :23.00 Min. : 5.00 Male :193 No :360
## 1st Qu.:2.000 1st Qu.:41.75 1st Qu.:11.00 Female:207 Yes: 40
## Median :3.000 Median :56.00 Median :14.00
## Mean :2.958 Mean :55.67 Mean :13.45
## 3rd Qu.:4.000 3rd Qu.:70.00 3rd Qu.:16.00
## Max. :9.000 Max. :98.00 Max. :20.00
## Married Ethnicity Balance
## No :155 African American: 99 Min. : 0.00
## Yes:245 Asian :102 1st Qu.: 68.75
## Caucasian :199 Median : 459.50
## Mean : 520.01
## 3rd Qu.: 863.00
## Max. :1999.00
# Remove ID column and NA (missing) values
Credit = Credit %>%
select(-ID) %>%
drop_na()
### Step 2: Split data into training and testing sets
set.seed(123) # For reproducibility
split = initial_split(Credit, prop = 0.7, strata = Balance)
train_data = training(split)
test_data = testing(split)
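Because the split is stratified on the outcome, the Balance distribution should be similar in the two sets; a quick optional check (a sketch):
# Compare the outcome distribution across the stratified split
summary(train_data$Balance)
summary(test_data$Balance)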
### Step 3: Define a common preprocessing recipe (named model_recipe so it does not mask recipes::recipe)
model_recipe = recipe(Balance ~ ., data = train_data) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
### Step 4: Define KNN model specification
knn_spec = nearest_neighbor(
mode = 'regression',
neighbors = tune(),
weight_func = tune(),
dist_power = tune() )
### Step 5: Set initial workflow
knn_workflow = workflow() %>%
add_recipe(model_recipe) %>%
add_model(knn_spec)
### Step 6: Create folds and set tuning grid
set.seed(123)
folds = vfold_cv(train_data, v = 10, strata = Balance)
# knn_grid = grid_regular(
# neighbors(range = c(1, 20)),
# weight_func(values = c("rectangular", "triangular", "optimal")),
# dist_power(range = c(1, 2)),
# levels = 10
# )
# OR using grid_latin_hypercube
knn_grid = grid_latin_hypercube(
neighbors(range = c(1, 20)),
weight_func(values = c("rectangular", "triangular", "optimal")),
dist_power(range = c(1, 2)),
size = 10
)
### Step 7: Tune KNN model
knn_tune = tune_grid(
knn_workflow,
resamples = folds,
grid = knn_grid,
metrics = metric_set(rmse, rsq)
)
# See metrics averaged across the folds
knn_tune %>%
collect_metrics() %>%
arrange(mean)
## # A tibble: 20 × 9
## neighbors weight_func dist_power .metric .estimator mean n std_err
## <int> <chr> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 3 rectangular 1.92 rsq standard 0.637 10 0.0368
## 2 3 triangular 1.71 rsq standard 0.643 10 0.0349
## 3 11 rectangular 1.87 rsq standard 0.720 10 0.0290
## 4 8 rectangular 1.35 rsq standard 0.728 10 0.0322
## 5 17 optimal 1.67 rsq standard 0.740 10 0.0281
## 6 6 triangular 1.18 rsq standard 0.747 10 0.0273
## 7 16 optimal 1.48 rsq standard 0.750 10 0.0294
## 8 13 optimal 1.26 rsq standard 0.758 10 0.0280
## 9 19 triangular 1.50 rsq standard 0.759 10 0.0260
## 10 10 optimal 1.05 rsq standard 0.773 10 0.0274
## 11 10 optimal 1.05 rmse standard 225. 10 9.64
## 12 6 triangular 1.18 rmse standard 231. 10 10.1
## 13 13 optimal 1.26 rmse standard 235. 10 9.15
## 14 16 optimal 1.48 rmse standard 241. 10 8.90
## 15 19 triangular 1.50 rmse standard 242. 10 8.10
## 16 17 optimal 1.67 rmse standard 247. 10 8.97
## 17 8 rectangular 1.35 rmse standard 247. 10 10.0
## 18 11 rectangular 1.87 rmse standard 257. 10 8.96
## 19 3 triangular 1.71 rmse standard 272. 10 13.8
## 20 3 rectangular 1.92 rmse standard 273. 10 11.2
## # ℹ 1 more variable: .config <chr>
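Note that arrange(mean) sorts the rsq and rmse rows together, which is why the rsq rows all appear first; to rank candidates within a single metric, filter first (a sketch):
# Rank candidates by cross-validated RMSE only
knn_tune %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  arrange(mean)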
# Plot tuning results
knn_tune %>% autoplot()
# Find best hyperparameters
best_knn = knn_tune %>%
select_best(metric = "rmse")
best_knn
## # A tibble: 1 × 4
## neighbors weight_func dist_power .config
## <int> <chr> <dbl> <chr>
## 1 10 optimal 1.05 Preprocessor1_Model01
### Step 8: Finalize the KNN workflow
final_knn_workflow = knn_workflow %>%
finalize_workflow(best_knn)
### Step 9: Fit the final KNN model on the training data
final_knn_fit = final_knn_workflow %>%
fit(data = train_data)
### Step 10: Evaluate the KNN model on the test data
knn_predictions = final_knn_fit %>%
predict(new_data = test_data) %>%
bind_cols(test_data)
# The .pred column contains the predicted values of Balance
knn_predictions
## # A tibble: 121 × 12
## .pred Income Limit Rating Cards Age Education Gender Student Married
## <dbl> <dbl> <int> <int> <int> <int> <int> <fct> <fct> <fct>
## 1 1193. 106. 6645 483 3 82 15 "Female" Yes Yes
## 2 362. 55.9 4897 357 2 68 16 " Male" No Yes
## 3 96.2 21.0 3388 259 2 37 12 "Female" No No
## 4 0 15.0 1311 138 3 64 16 " Male" No No
## 5 60.5 20.1 2525 200 3 57 15 "Female" No Yes
## 6 285. 53.6 3714 286 3 73 17 "Female" No Yes
## 7 631. 42.1 6626 479 2 44 9 " Male" No No
## 8 896. 37.3 6378 458 1 72 17 "Female" No No
## 9 355. 14.1 4323 326 5 25 16 "Female" No Yes
## 10 525. 42.5 3625 289 6 44 12 "Female" Yes No
## # ℹ 111 more rows
## # ℹ 2 more variables: Ethnicity <fct>, Balance <int>
# Calculate RMSE and R-squared
knn_metrics = knn_predictions %>%
metrics(truth = Balance, estimate = .pred)
knn_metrics
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 234.
## 2 rsq standard 0.793
## 3 mae standard 172.
# Plot predictions vs actual values
ggplot(knn_predictions, aes(x = Balance, y = .pred)) +
geom_point() +
geom_smooth(method = 'lm') +
labs(title = "KNN Predictions vs Actual Values",
x = "Actual Balance",
y = "Predicted Balance")
## `geom_smooth()` using formula = 'y ~ x'
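A dashed 45-degree reference line (perfect predictions) can be easier to judge against than the fitted smooth; a sketch of the same plot with geom_abline():
# Same scatter with a y = x reference line instead of a regression smooth
ggplot(knn_predictions, aes(x = Balance, y = .pred)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "KNN Predictions vs Actual Values",
       x = "Actual Balance",
       y = "Predicted Balance")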
### Run a linear regression model using the tidymodels approach
# Linear regression specification
lin_reg_spec = linear_reg(mode = 'regression')
# Set linear regression workflow
lin_reg_workflow = workflow() %>%
add_recipe(model_recipe) %>%
add_model(lin_reg_spec)
# Train the linear regression model on the train data
final_lm = lin_reg_workflow %>% fit(data = train_data)
# Make predictions on the test data
lm_predictions = final_lm %>%
predict(new_data = test_data) %>%
bind_cols(test_data)
# Calculate RMSE and R-squared for linear regression
metrics(lm_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 92.3
## 2 rsq standard 0.964
## 3 mae standard 75.8
# Calculate RMSE and R-squared for KNN regression
metrics(knn_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 234.
## 2 rsq standard 0.793
## 3 mae standard 172.
# Combine both models' predictions on the test data
test_results = test_data %>%
mutate(
.pred_knn = knn_predictions$.pred,
.pred_lm = lm_predictions$.pred
)
head(test_results)
## Income Limit Rating Cards Age Education Gender Student Married
## 1 106.025 6645 483 3 82 15 Female Yes Yes
## 2 55.882 4897 357 2 68 16 Male No Yes
## 3 20.996 3388 259 2 37 12 Female No No
## 4 15.045 1311 138 3 64 16 Male No No
## 5 20.089 2525 200 3 57 15 Female No Yes
## 6 53.598 3714 286 3 73 17 Female No Yes
## Ethnicity Balance .pred_knn .pred_lm
## 1 Asian 903 1193.08906 914.26371
## 2 Caucasian 331 361.90311 409.04140
## 3 African American 203 96.21087 291.12924
## 4 Caucasian 0 0.00000 -174.83619
## 5 African American 0 60.53106 63.96688
## 6 African American 0 285.21505 113.45598
# Combining the metrics
lm_metrics = metrics(lm_predictions, truth = Balance, estimate = .pred) %>% mutate(model = "Linear Regression")
knn_metrics = metrics(knn_predictions, truth = Balance, estimate = .pred) %>% mutate(model = "KNN")
combined_metrics = bind_rows(lm_metrics, knn_metrics)
combined_metrics %>% arrange(.metric)
## # A tibble: 6 × 4
## .metric .estimator .estimate model
## <chr> <chr> <dbl> <chr>
## 1 mae standard 75.8 Linear Regression
## 2 mae standard 172. KNN
## 3 rmse standard 92.3 Linear Regression
## 4 rmse standard 234. KNN
## 5 rsq standard 0.964 Linear Regression
## 6 rsq standard 0.793 KNN
# Visual comparison of the test results
ggplot(test_results, aes(x = Balance)) +
geom_point(aes(y = .pred_knn, color = "KNN")) +
geom_point(aes(y = .pred_lm, color = "Linear Regression")) +
geom_smooth(aes(y = .pred_knn), method = 'lm', color = "blue") +
geom_smooth(aes(y = .pred_lm), method = 'lm', color = "red") +
labs(title = "KNN vs Linear Regression Predictions",
x = "Actual Balance",
y = "Predicted Balance") +
scale_color_manual(values = c("KNN" = "blue", "Linear Regression" = "red")) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
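An equivalent, tidier way to build the same comparison is to pivot the two prediction columns into long format so ggplot maps the legend automatically (a sketch):
# Long-format version of the comparison plot
test_results %>%
  pivot_longer(c(.pred_knn, .pred_lm), names_to = "model", values_to = ".pred") %>%
  ggplot(aes(x = Balance, y = .pred, color = model)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  labs(title = "KNN vs Linear Regression Predictions",
       x = "Actual Balance", y = "Predicted Balance") +
  theme_minimal()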