pacman::p_load(kknn, ISLR, glmtoolbox, DALEX, MASS, lmtest, jtools, broom, car, vip, cowplot, glmnet, caret, tidymodels, tidyverse)
tidymodels_prefer()
theme_set(theme_bw())
# Load and prepare data
data(Credit)
# Exclude the ID column
Credit = Credit[-1] %>% drop_na()
str(Credit)
## 'data.frame': 400 obs. of 11 variables:
## $ Income : num 14.9 106 104.6 148.9 55.9 ...
## $ Limit : int 3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
## $ Rating : int 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : Factor w/ 2 levels " Male","Female": 1 2 1 2 1 1 2 1 2 2 ...
## $ Student : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
## $ Ethnicity: Factor w/ 3 levels "African American",..: 3 2 2 2 3 3 1 2 3 1 ...
## $ Balance : int 333 903 580 964 331 1151 203 872 279 1350 ...
# Split data into training and testing sets
set.seed(1)
split = initial_split(Credit, prop = 0.7)
train_data = training(split)
test_data = testing(split)
# Common pre-processing recipe
model_recipe = recipe(Balance ~ ., data = train_data) %>%
step_normalize(all_numeric_predictors()) %>% # Important for KNN
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())
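As a quick, optional sanity check, the recipe can be prepped on the training data and baked to inspect the transformed predictors (a sketch):
# prep() estimates the steps on the training data; bake(new_data = NULL)
# returns the processed training set for inspection
model_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()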
# KNN Regression specification and workflow
# tune(): placeholder marking a hyperparameter to be tuned over a grid
# weight_func: kernel used to weight the neighbors by their distance
# dist_power: exponent of the Minkowski distance (1 = Manhattan, 2 = Euclidean)
# weight_func and dist_power can be tuned alongside neighbors
knn_spec = nearest_neighbor(
neighbors = tune(),
weight_func = tune(),
dist_power = tune()) %>%
set_engine("kknn") %>%
set_mode("regression")
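To make dist_power concrete, here is a minimal sketch (with made-up vectors x1 and x2) showing how the Minkowski exponent interpolates between Manhattan (p = 1) and Euclidean (p = 2) distance:
# Minkowski distance between two toy standardized predictor vectors
x1 = c(0.5, -1.2, 0.3)
x2 = c(-0.1, 0.4, 1.0)
minkowski = function(a, b, p) sum(abs(a - b)^p)^(1 / p)
minkowski(x1, x2, p = 1) # Manhattan distance
minkowski(x1, x2, p = 2) # Euclidean distance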
# Workflows
knn_workflow = workflow() %>%
add_recipe(model_recipe) %>%
add_model(knn_spec)
# Set up cross-validation folds for tuning knn hyperparameters
set.seed(123)
folds = vfold_cv(train_data, v = 10)
# Tune KNN model
knn_grid = grid_latin_hypercube(
neighbors(range = c(1, 10)),
weight_func(values = c("rectangular", "triangular", "epanechnikov", "biweight",
"triweight", "cos", "inv", "gaussian", "optimal")),
dist_power(range = c(1, 2)),
size = 5
)
## Warning: `grid_latin_hypercube()` was deprecated in dials 1.3.0.
## ℹ Please use `grid_space_filling()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
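As the warning notes, grid_latin_hypercube() is deprecated in recent dials releases in favor of grid_space_filling(). An equivalent grid could be built as follows (a sketch, assuming dials >= 1.3.0; knn_grid_sf is a new name used here for illustration and is not used below):
# Alternative, non-deprecated way to build the same kind of grid
knn_grid_sf = grid_space_filling(
  neighbors(range = c(1, 10)),
  weight_func(values = c("rectangular", "triangular", "epanechnikov", "biweight",
                         "triweight", "cos", "inv", "gaussian", "optimal")),
  dist_power(range = c(1, 2)),
  size = 5,
  type = "latin_hypercube"
)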
knn_tune = tune_grid(knn_workflow,
resamples = folds,
grid = knn_grid,
metrics = metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
)
collect_metrics(knn_tune)
## # A tibble: 15 × 9
## neighbors weight_func dist_power .metric .estimator mean n std_err
## <int> <chr> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 5 cos 1.30 mae standard 174. 10 4.95
## 2 5 cos 1.30 rmse standard 223. 10 7.20
## 3 5 cos 1.30 rsq standard 0.768 10 0.0282
## 4 3 epanechnikov 1.06 mae standard 166. 10 6.32
## 5 3 epanechnikov 1.06 rmse standard 218. 10 7.58
## 6 3 epanechnikov 1.06 rsq standard 0.773 10 0.0283
## 7 8 optimal 1.73 mae standard 193. 10 3.89
## 8 8 optimal 1.73 rmse standard 240. 10 6.01
## 9 8 optimal 1.73 rsq standard 0.741 10 0.0337
## 10 9 rectangular 1.93 mae standard 218. 10 4.49
## 11 9 rectangular 1.93 rmse standard 269. 10 7.19
## 12 9 rectangular 1.93 rsq standard 0.702 10 0.0325
## 13 1 triweight 1.44 mae standard 205. 10 10.6
## 14 1 triweight 1.44 rmse standard 270. 10 10.4
## 15 1 triweight 1.44 rsq standard 0.672 10 0.0333
## # ℹ 1 more variable: .config <chr>
knn_tune %>% autoplot()
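Besides the autoplot, show_best() prints the top-ranked candidates for a chosen metric:
# Top 3 hyperparameter combinations by cross-validated RMSE
knn_tune %>% show_best(metric = "rmse", n = 3)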
# Select best KNN model
best_knn = knn_tune %>% select_best(metric = 'rmse')
best_knn
## # A tibble: 1 × 4
## neighbors weight_func dist_power .config
## <int> <chr> <dbl> <chr>
## 1 3 epanechnikov 1.06 Preprocessor1_Model2
final_knn_workflow = finalize_workflow(knn_workflow, best_knn)
# Fit the final model on the training data from the split and evaluate it on the test data from the split
knn_model = last_fit(final_knn_workflow, split)
# See results of trained model on the test data
final_knn = extract_fit_parsnip(knn_model)
final_knn
## parsnip model object
##
##
## Call:
## kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(3L, data, 5), distance = ~1.05871301149018, kernel = ~"epanechnikov")
##
## Type of response variable: continuous
## minimal mean absolute error: 165.3931
## Minimal mean squared error: 47862.04
## Best kernel: epanechnikov
## Best k: 3
# Test metrics of the trained model on the test data
collect_metrics(knn_model)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 277. Preprocessor1_Model1
## 2 rsq standard 0.626 Preprocessor1_Model1
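Note that last_fit() computes only RMSE and R-squared by default; to also get MAE on the test set, pass a metric set explicitly (a sketch; knn_model_mae is a new name used here for illustration):
# Refit so that collect_metrics() also reports MAE on the test data
knn_model_mae = last_fit(final_knn_workflow, split,
                         metrics = metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae))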
# Get predictions on the test data (collect_predictions() on a last_fit object returns test-set predictions)
test_predictions = collect_predictions(knn_model)
head(test_predictions)
## # A tibble: 6 × 5
## .pred id .row Balance .config
## <dbl> <chr> <int> <int> <chr>
## 1 881. train/test split 3 580 Preprocessor1_Model1
## 2 1627. train/test split 4 964 Preprocessor1_Model1
## 3 210. train/test split 5 331 Preprocessor1_Model1
## 4 777. train/test split 6 1151 Preprocessor1_Model1
## 5 136. train/test split 7 203 Preprocessor1_Model1
## 6 1077. train/test split 8 872 Preprocessor1_Model1
# Get fitted values for the training data directly from the underlying kknn fit
train_predictions = final_knn$fit$fitted.values %>% bind_cols(train_data) %>% rename(.pred = ...1)
## New names:
## • `` -> `...1`
head(train_predictions)
## .pred Income Limit Rating Cards Age Education Gender Student Married
## 1 1248.3952 182.728 13913 982 4 98 17 Male No Yes
## 2 326.4580 35.691 2880 214 2 35 15 Male No No
## 3 1195.4525 123.299 8376 610 2 89 17 Male Yes No
## 4 122.4892 20.791 2672 204 1 70 18 Female No No
## 5 715.6282 39.055 5565 410 4 48 18 Female No Yes
## 6 326.3437 36.472 3806 309 2 52 13 Male No No
## Ethnicity Balance
## 1 Caucasian 1999
## 2 African American 0
## 3 African American 1259
## 4 African American 0
## 5 Caucasian 772
## 6 African American 188
# Get predictions for the test data
# Prepare test_data with the recipe (prep() estimates the steps on the training data; bake() applies them to the test data)
test_data_processed = prep(model_recipe) %>% bake(new_data = test_data)
knn_predictions = predict(final_knn, new_data = test_data_processed) %>% bind_cols(test_data_processed)
head(knn_predictions)
## # A tibble: 6 × 13
## .pred Income Limit Rating Cards Age Education Balance Gender_Female
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 881. 1.68 0.999 1.02 0.772 0.913 -0.869 580 0
## 2 1627. 2.91 2.03 2.07 0.0725 -1.14 -0.869 964 1
## 3 210. 0.324 0.0757 0.0228 -0.627 0.738 0.736 331 0
## 4 777. 0.999 1.41 1.36 0.772 1.27 -1.19 1151 0
## 5 136. -0.645 -0.564 -0.597 -0.627 -1.08 -0.548 203 1
## 6 1077. 0.756 1.02 1.00 -0.627 1.85 -1.51 872 0
## # ℹ 4 more variables: Student_Yes <dbl>, Married_Yes <dbl>,
## # Ethnicity_Asian <dbl>, Ethnicity_Caucasian <dbl>
# Evaluate the model performance
metrics(knn_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 277.
## 2 rsq standard 0.626
## 3 mae standard 198.
# Linear Regression specification and workflow
lm_spec = linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
# Workflows
lm_workflow = workflow() %>%
add_recipe(model_recipe) %>%
add_model(lm_spec)
# Train the model on training data
final_lm = fit(lm_workflow, train_data)
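After fitting, the underlying lm object can be pulled out of the workflow and its coefficients inspected with broom::tidy() (broom is already loaded above):
# Extract the parsnip fit from the fitted workflow and tidy the coefficients
final_lm %>% extract_fit_parsnip() %>% tidy()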
# Evaluate the model performance
lm_predictions = final_lm %>% predict(test_data) %>% bind_cols(test_data)
metrics(lm_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 107.
## 2 rsq standard 0.945
## 3 mae standard 83.6
# Collect metrics on test set for both models
# pull() extracts a vector (a single variable); select() returns a tibble (like a data frame); see the short illustration after this chunk
test_results = test_data %>%
mutate(
.pred_lm = predict(final_lm, new_data = test_data) %>% pull(.pred),
.pred_knn = predict(final_knn, new_data = test_data_processed) %>% pull(.pred)
)
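To make the pull() vs select() distinction concrete (a quick illustrative check):
# pull() returns a plain vector; select() keeps a one-column tibble
test_data %>% pull(Balance) %>% head()
test_data %>% select(Balance) %>% head()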
head(test_results)
## Income Limit Rating Cards Age Education Gender Student Married
## 1 104.593 7075 514 4 71 11 Male No No
## 2 148.924 9504 681 3 36 11 Female No No
## 3 55.882 4897 357 2 68 16 Male No Yes
## 4 80.180 8047 569 4 77 10 Male No No
## 5 20.996 3388 259 2 37 12 Female No No
## 6 71.408 7114 512 2 87 9 Male No No
## Ethnicity Balance .pred_lm .pred_knn
## 1 Asian 580 677.8747 880.6868
## 2 Asian 964 980.7356 1627.0813
## 3 Caucasian 331 407.5040 210.2217
## 4 Caucasian 1151 1104.0413 776.8329
## 5 African American 203 278.5849 136.0664
## 6 Asian 872 888.1353 1077.1995
# Calculate performance metrics (the metric set is named reg_metrics so it does not mask yardstick::metrics)
reg_metrics = metric_set(rmse, rsq, mae)
lm_metrics = test_results %>%
reg_metrics(truth = Balance, estimate = .pred_lm) %>%
mutate(model = "Linear Regression")
knn_metrics = test_results %>%
reg_metrics(truth = Balance, estimate = .pred_knn) %>%
mutate(model = "KNN Regression")
# Combine and compare
bind_rows(lm_metrics, knn_metrics) %>% arrange(.metric)
## # A tibble: 6 × 4
## .metric .estimator .estimate model
## <chr> <chr> <dbl> <chr>
## 1 mae standard 83.6 Linear Regression
## 2 mae standard 198. KNN Regression
## 3 rmse standard 107. Linear Regression
## 4 rmse standard 277. KNN Regression
## 5 rsq standard 0.945 Linear Regression
## 6 rsq standard 0.626 KNN Regression
Comparing the test-set metrics, linear regression clearly outperforms KNN here: lower RMSE and MAE and a higher R-squared.
# Visual comparison
test_results %>%
ggplot(aes(x = Balance)) +
geom_point(aes(y = .pred_lm, color = "Linear Reg")) +
geom_point(aes(y = .pred_knn, color = "KNN Reg")) +
geom_abline(color = 'blue') +
labs(title = "Actual vs Predicted Values", y = "Predicted Balance") +
scale_color_manual(values = c("#E69F00", "#56B4E9"))
A few takeaways (the metrics are also computed by hand in the sketch below):

- RMSE (Root Mean Squared Error): lower is better
- R-squared: higher is better (closer to 1)
- MAE (Mean Absolute Error): lower is better
- KNN tends to overfit with too few neighbors (e.g., k = 1) and underfit with too many
- Assumptions: linear regression assumes linear relationships
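As a cross-check of what these metrics mean, they can be computed by hand from the predictions (a sketch using the test_results tibble built above; yardstick's rsq is the squared correlation):
# Hand-computed test-set metrics for the linear model
test_results %>%
  summarise(
    rmse_lm = sqrt(mean((Balance - .pred_lm)^2)),
    mae_lm  = mean(abs(Balance - .pred_lm)),
    rsq_lm  = cor(Balance, .pred_lm)^2
  )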
# Comparing KNN and Linear Regression on Credit Data
# load libraries
library(kknn)
library(ISLR)
library(glmtoolbox)
library(DALEX)
library(MASS)
library(lmtest)
library(jtools)
library(broom)
library(car)
library(vip)
library(cowplot)
library(caret)
library(glmnet)
library(tidymodels)
library(tidyverse)
# Prefer tidymodels ecosystem
tidymodels_prefer()
# Set ggplot theme to black and white
theme_set(theme_bw())
## Step 1: Load and prepare data
# Load data
data(Credit)
summary(Credit)
## ID Income Limit Rating
## Min. : 1.0 Min. : 10.35 Min. : 855 Min. : 93.0
## 1st Qu.:100.8 1st Qu.: 21.01 1st Qu.: 3088 1st Qu.:247.2
## Median :200.5 Median : 33.12 Median : 4622 Median :344.0
## Mean :200.5 Mean : 45.22 Mean : 4736 Mean :354.9
## 3rd Qu.:300.2 3rd Qu.: 57.47 3rd Qu.: 5873 3rd Qu.:437.2
## Max. :400.0 Max. :186.63 Max. :13913 Max. :982.0
## Cards Age Education Gender Student
## Min. :1.000 Min. :23.00 Min. : 5.00 Male :193 No :360
## 1st Qu.:2.000 1st Qu.:41.75 1st Qu.:11.00 Female:207 Yes: 40
## Median :3.000 Median :56.00 Median :14.00
## Mean :2.958 Mean :55.67 Mean :13.45
## 3rd Qu.:4.000 3rd Qu.:70.00 3rd Qu.:16.00
## Max. :9.000 Max. :98.00 Max. :20.00
## Married Ethnicity Balance
## No :155 African American: 99 Min. : 0.00
## Yes:245 Asian :102 1st Qu.: 68.75
## Caucasian :199 Median : 459.50
## Mean : 520.01
## 3rd Qu.: 863.00
## Max. :1999.00
# Remove ID column and NA (missing) values
Credit = Credit %>%
select(-ID) %>%
drop_na()
### Step 2: Split data into training and testing sets
set.seed(123) # For reproducibility
split = initial_split(Credit, prop = 0.7, strata = Balance)
train_data = training(split)
test_data = testing(split)
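Because the split is stratified on the outcome, the Balance distribution should be similar in the two sets; a quick optional check (a sketch):
# Compare the outcome distribution across the stratified split
summary(train_data$Balance)
summary(test_data$Balance)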
### Step 3: Define a common preprocessing recipe (named model_recipe so it does not mask recipes::recipe)
model_recipe = recipe(Balance ~ ., data = train_data) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
### Step 4: Define KNN model specification
knn_spec = nearest_neighbor(
mode = 'regression',
neighbors = tune(),
weight_func = tune(),
dist_power = tune() )
### Step 5: Set initial workflow
knn_workflow = workflow() %>%
add_recipe(model_recipe) %>%
add_model(knn_spec)
### Step 6: Create folds and set tuning grid
set.seed(123)
folds = vfold_cv(train_data, v = 10, strata = Balance)
# knn_grid = grid_regular(
# neighbors(range = c(1, 20)),
# weight_func(values = c("rectangular", "triangular", "optimal")),
# dist_power(range = c(1, 2)),
# levels = 10
# )
# OR using grid_latin_hypercube
knn_grid = grid_latin_hypercube(
neighbors(range = c(1, 20)),
weight_func(values = c("rectangular", "triangular", "optimal")),
dist_power(range = c(1, 2)),
size = 10
)
### Step 7: Tune KNN model
knn_tune = tune_grid(
knn_workflow,
resamples = folds,
grid = knn_grid,
metrics = metric_set(rmse, rsq)
)
# See metrics averaged across the folds
knn_tune %>%
collect_metrics() %>%
arrange(mean)
## # A tibble: 20 × 9
## neighbors weight_func dist_power .metric .estimator mean n std_err
## <int> <chr> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 3 rectangular 1.92 rsq standard 0.637 10 0.0368
## 2 3 triangular 1.71 rsq standard 0.643 10 0.0349
## 3 11 rectangular 1.87 rsq standard 0.720 10 0.0290
## 4 8 rectangular 1.35 rsq standard 0.728 10 0.0322
## 5 17 optimal 1.67 rsq standard 0.740 10 0.0281
## 6 6 triangular 1.18 rsq standard 0.747 10 0.0273
## 7 16 optimal 1.48 rsq standard 0.750 10 0.0294
## 8 13 optimal 1.26 rsq standard 0.758 10 0.0280
## 9 19 triangular 1.50 rsq standard 0.759 10 0.0260
## 10 10 optimal 1.05 rsq standard 0.773 10 0.0274
## 11 10 optimal 1.05 rmse standard 225. 10 9.64
## 12 6 triangular 1.18 rmse standard 231. 10 10.1
## 13 13 optimal 1.26 rmse standard 235. 10 9.15
## 14 16 optimal 1.48 rmse standard 241. 10 8.90
## 15 19 triangular 1.50 rmse standard 242. 10 8.10
## 16 17 optimal 1.67 rmse standard 247. 10 8.97
## 17 8 rectangular 1.35 rmse standard 247. 10 10.0
## 18 11 rectangular 1.87 rmse standard 257. 10 8.96
## 19 3 triangular 1.71 rmse standard 272. 10 13.8
## 20 3 rectangular 1.92 rmse standard 273. 10 11.2
## # ℹ 1 more variable: .config <chr>
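Note that arrange(mean) sorts the rsq and rmse rows together, which is why the rsq rows all appear first; to rank candidates within a single metric, filter first (a sketch):
# Rank candidates by cross-validated RMSE only
knn_tune %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  arrange(mean)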
# Plot tuning results
knn_tune %>% autoplot()
# Find best hyperparameters
best_knn = knn_tune %>%
select_best(metric = "rmse")
best_knn
## # A tibble: 1 × 4
## neighbors weight_func dist_power .config
## <int> <chr> <dbl> <chr>
## 1 10 optimal 1.05 Preprocessor1_Model01
### Step 8: Finalize the KNN workflow
final_knn_workflow = knn_workflow %>%
finalize_workflow(best_knn)
### Step 9: Fit the final KNN model on the training data
final_knn_fit = final_knn_workflow %>%
fit(data = train_data)
### Step 10: Evaluate the KNN model on the test data
knn_predictions = final_knn_fit %>%
predict(new_data = test_data) %>%
bind_cols(test_data)
# The .pred column contains the predicted values of Balance
knn_predictions
## # A tibble: 121 × 12
## .pred Income Limit Rating Cards Age Education Gender Student Married
## <dbl> <dbl> <int> <int> <int> <int> <int> <fct> <fct> <fct>
## 1 1193. 106. 6645 483 3 82 15 "Female" Yes Yes
## 2 362. 55.9 4897 357 2 68 16 " Male" No Yes
## 3 96.2 21.0 3388 259 2 37 12 "Female" No No
## 4 0 15.0 1311 138 3 64 16 " Male" No No
## 5 60.5 20.1 2525 200 3 57 15 "Female" No Yes
## 6 285. 53.6 3714 286 3 73 17 "Female" No Yes
## 7 631. 42.1 6626 479 2 44 9 " Male" No No
## 8 896. 37.3 6378 458 1 72 17 "Female" No No
## 9 355. 14.1 4323 326 5 25 16 "Female" No Yes
## 10 525. 42.5 3625 289 6 44 12 "Female" Yes No
## # ℹ 111 more rows
## # ℹ 2 more variables: Ethnicity <fct>, Balance <int>
# Calculate RMSE and R-squared
knn_metrics = knn_predictions %>%
metrics(truth = Balance, estimate = .pred)
knn_metrics
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 234.
## 2 rsq standard 0.793
## 3 mae standard 172.
# Plot predictions vs actual values
ggplot(knn_predictions, aes(x = Balance, y = .pred)) +
geom_point() +
geom_smooth(method = 'lm') +
labs(title = "KNN Predictions vs Actual Values",
x = "Actual Balance",
y = "Predicted Balance")
## `geom_smooth()` using formula = 'y ~ x'
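A dashed 45-degree reference line (perfect predictions) can be easier to judge against than the fitted smooth; a sketch of the same plot with geom_abline():
# Same scatter with a y = x reference line instead of a regression smooth
ggplot(knn_predictions, aes(x = Balance, y = .pred)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "KNN Predictions vs Actual Values",
       x = "Actual Balance",
       y = "Predicted Balance")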
### Run a linear regression model using the tidymodels approach
# Linear regression specification
lin_reg_spec = linear_reg(mode = 'regression')
# Set linear regression workflow
lin_reg_workflow = workflow() %>%
add_recipe(model_recipe) %>%
add_model(lin_reg_spec)
# Train the linear regression model on the train data
final_lm = lin_reg_workflow %>% fit(data = train_data)
# Make predictions on the test data
lm_predictions = final_lm %>%
predict(new_data = test_data) %>%
bind_cols(test_data)
# Calculate RMSE and R-squared for linear regression
metrics(lm_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 92.3
## 2 rsq standard 0.964
## 3 mae standard 75.8
# Calculate RMSE and R-squared for KNN regression
metrics(knn_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 234.
## 2 rsq standard 0.793
## 3 mae standard 172.
# Combine both models' predictions on the test data
test_results = test_data %>%
mutate(
.pred_knn = knn_predictions$.pred,
.pred_lm = lm_predictions$.pred
)
head(test_results)
## Income Limit Rating Cards Age Education Gender Student Married
## 1 106.025 6645 483 3 82 15 Female Yes Yes
## 2 55.882 4897 357 2 68 16 Male No Yes
## 3 20.996 3388 259 2 37 12 Female No No
## 4 15.045 1311 138 3 64 16 Male No No
## 5 20.089 2525 200 3 57 15 Female No Yes
## 6 53.598 3714 286 3 73 17 Female No Yes
## Ethnicity Balance .pred_knn .pred_lm
## 1 Asian 903 1193.08906 914.26371
## 2 Caucasian 331 361.90311 409.04140
## 3 African American 203 96.21087 291.12924
## 4 Caucasian 0 0.00000 -174.83619
## 5 African American 0 60.53106 63.96688
## 6 African American 0 285.21505 113.45598
# Combining the metrics
lm_metrics = metrics(lm_predictions, truth = Balance, estimate = .pred) %>% mutate(model = "Linear Regression")
knn_metrics = metrics(knn_predictions, truth = Balance, estimate = .pred) %>% mutate(model = "KNN")
combined_metrics = bind_rows(lm_metrics, knn_metrics)
combined_metrics %>% arrange(.metric)
## # A tibble: 6 × 4
## .metric .estimator .estimate model
## <chr> <chr> <dbl> <chr>
## 1 mae standard 75.8 Linear Regression
## 2 mae standard 172. KNN
## 3 rmse standard 92.3 Linear Regression
## 4 rmse standard 234. KNN
## 5 rsq standard 0.964 Linear Regression
## 6 rsq standard 0.793 KNN
# Visual comparison of the test results
ggplot(test_results, aes(x = Balance)) +
geom_point(aes(y = .pred_knn, color = "KNN")) +
geom_point(aes(y = .pred_lm, color = "Linear Regression")) +
geom_smooth(aes(y = .pred_knn), method = 'lm', color = "blue") +
geom_smooth(aes(y = .pred_lm), method = 'lm', color = "red") +
labs(title = "KNN vs Linear Regression Predictions",
x = "Actual Balance",
y = "Predicted Balance") +
scale_color_manual(values = c("KNN" = "blue", "Linear Regression" = "red")) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
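An equivalent, tidier way to build the same comparison is to pivot the two prediction columns into long format so ggplot maps the legend automatically (a sketch):
# Long-format version of the comparison plot
test_results %>%
  pivot_longer(c(.pred_knn, .pred_lm), names_to = "model", values_to = ".pred") %>%
  ggplot(aes(x = Balance, y = .pred, color = model)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  labs(title = "KNN vs Linear Regression Predictions",
       x = "Actual Balance", y = "Predicted Balance") +
  theme_minimal()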