Recall: When should we use 10-fold CV, and when the bootstrap?
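In short: k-fold CV (k = 10 is a common default) is the workhorse for estimating test error and tuning hyperparameters, while the bootstrap resamples with replacement and is mostly used to estimate the variability (e.g. standard errors) of a statistic. A minimal sketch with rsample (loaded below via tidymodels); run it after the Credit data is loaded:

# 10-fold CV: 10 non-overlapping folds, each observation held out exactly once
cv_resamples = vfold_cv(Credit, v = 10)

# Bootstrap: 25 samples drawn with replacement, each the size of the original data
boot_resamples = bootstraps(Credit, times = 25)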

pacman::p_load(kknn, ISLR, glmtoolbox, DALEX, MASS, lmtest, jtools, broom, car, vip, cowplot, glmnet, caret, tidymodels, tidyverse)
tidymodels_prefer()
theme_set(theme_bw())
# Load and prepare data
data(Credit)

# Exclude the ID column
Credit = Credit[-1] %>% drop_na()
str(Credit)
## 'data.frame':    400 obs. of  11 variables:
##  $ Income   : num  14.9 106 104.6 148.9 55.9 ...
##  $ Limit    : int  3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
##  $ Rating   : int  283 483 514 681 357 569 259 512 266 491 ...
##  $ Cards    : int  2 3 4 3 2 4 2 2 5 3 ...
##  $ Age      : int  34 82 71 36 68 77 37 87 66 41 ...
##  $ Education: int  11 15 11 11 16 10 12 9 13 19 ...
##  $ Gender   : Factor w/ 2 levels " Male","Female": 1 2 1 2 1 1 2 1 2 2 ...
##  $ Student  : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
##  $ Married  : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
##  $ Ethnicity: Factor w/ 3 levels "African American",..: 3 2 2 2 3 3 1 2 3 1 ...
##  $ Balance  : int  333 903 580 964 331 1151 203 872 279 1350 ...
# Split data into training and testing sets
set.seed(1)
split = initial_split(Credit, prop = 0.7)
train_data = training(split)
test_data = testing(split)
# Common pre-processing recipe
model_recipe = recipe(Balance ~ ., data = train_data) %>% 
  step_normalize(all_numeric_predictors()) %>% # Important for KNN
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors())
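To check what the recipe produces, prep it on the training data and bake it; this is just an inspection sketch, not part of the modelling pipeline:

# Estimate the preprocessing on train_data, then return the processed training set
model_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()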

KNN regression

# KNN Regression specification and workflow
# tune(): placeholder marking a hyperparameter to be tuned over a grid
# weight_func: kernel used to weight the neighbours' contributions by their distance
# dist_power: the power p of the Minkowski distance (p = 1 Manhattan, p = 2 Euclidean)
# weight_func and dist_power can also be tuned

knn_spec = nearest_neighbor(
    neighbors = tune(),
    weight_func = tune(),
    dist_power = tune()) %>%
  set_engine("kknn") %>% 
  set_mode("regression")
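For intuition on dist_power: the Minkowski distance is (sum_i |x_i - y_i|^p)^(1/p). A tiny illustration with a hypothetical helper (minkowski() is not from any package, just defined here):

# Hypothetical helper showing what dist_power controls
minkowski = function(x, y, p) sum(abs(x - y)^p)^(1 / p)
minkowski(c(0, 0), c(3, 4), p = 1)  # 7: Manhattan distance
minkowski(c(0, 0), c(3, 4), p = 2)  # 5: Euclidean distance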

# Workflows
knn_workflow = workflow() %>% 
  add_recipe(model_recipe) %>% 
  add_model(knn_spec)
# Set up cross-validation folds for tuning the KNN hyperparameters
set.seed(123)
folds = vfold_cv(train_data, v = 10)

# Tune KNN model
knn_grid = grid_latin_hypercube(
  neighbors(range = c(1, 10)),
  weight_func(values = c("rectangular", "triangular", "epanechnikov", "biweight", 
                        "triweight", "cos", "inv", "gaussian", "optimal")),
  dist_power(range = c(1, 2)),
  size = 5
)
## Warning: `grid_latin_hypercube()` was deprecated in dials 1.3.0.
## ℹ Please use `grid_space_filling()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
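As the warning says, the non-deprecated replacement is grid_space_filling(); a sketch of the equivalent call, assuming dials >= 1.3.0 where type = "latin_hypercube" reproduces this design:

# Same space-filling grid via the replacement function
knn_grid = grid_space_filling(
  neighbors(range = c(1, 10)),
  weight_func(values = c("rectangular", "triangular", "epanechnikov", "biweight",
                         "triweight", "cos", "inv", "gaussian", "optimal")),
  dist_power(range = c(1, 2)),
  size = 5,
  type = "latin_hypercube"
)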
knn_tune = tune_grid(knn_workflow,
    resamples = folds,
    grid = knn_grid,
    metrics = metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
  )

collect_metrics(knn_tune)
## # A tibble: 15 × 9
##    neighbors weight_func  dist_power .metric .estimator    mean     n std_err
##        <int> <chr>             <dbl> <chr>   <chr>        <dbl> <int>   <dbl>
##  1         5 cos                1.30 mae     standard   174.       10  4.95  
##  2         5 cos                1.30 rmse    standard   223.       10  7.20  
##  3         5 cos                1.30 rsq     standard     0.768    10  0.0282
##  4         3 epanechnikov       1.06 mae     standard   166.       10  6.32  
##  5         3 epanechnikov       1.06 rmse    standard   218.       10  7.58  
##  6         3 epanechnikov       1.06 rsq     standard     0.773    10  0.0283
##  7         8 optimal            1.73 mae     standard   193.       10  3.89  
##  8         8 optimal            1.73 rmse    standard   240.       10  6.01  
##  9         8 optimal            1.73 rsq     standard     0.741    10  0.0337
## 10         9 rectangular        1.93 mae     standard   218.       10  4.49  
## 11         9 rectangular        1.93 rmse    standard   269.       10  7.19  
## 12         9 rectangular        1.93 rsq     standard     0.702    10  0.0325
## 13         1 triweight          1.44 mae     standard   205.       10 10.6   
## 14         1 triweight          1.44 rmse    standard   270.       10 10.4   
## 15         1 triweight          1.44 rsq     standard     0.672    10  0.0333
## # ℹ 1 more variable: .config <chr>
knn_tune %>% autoplot()

# Select best KNN model
best_knn = knn_tune %>% select_best(metric = 'rmse')
best_knn
## # A tibble: 1 × 4
##   neighbors weight_func  dist_power .config             
##       <int> <chr>             <dbl> <chr>               
## 1         3 epanechnikov       1.06 Preprocessor1_Model2
final_knn_workflow = finalize_workflow(knn_workflow, best_knn)

# Fit the final model on the training data from the split and evaluate it on the test data from the split
knn_model = last_fit(final_knn_workflow, split)
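last_fit() is essentially shorthand for the manual steps below (a sketch; the workflow applies the recipe internally, so raw data can be passed):

# Roughly what last_fit() does under the hood
manual_fit = fit(final_knn_workflow, data = train_data)
manual_preds = predict(manual_fit, new_data = test_data) %>% bind_cols(test_data)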

# Extract the underlying fitted model object (trained on the training data)
final_knn = extract_fit_parsnip(knn_model)
final_knn
## parsnip model object
## 
## 
## Call:
## kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(3L,     data, 5), distance = ~1.05871301149018, kernel = ~"epanechnikov")
## 
## Type of response variable: continuous
## minimal mean absolute error: 165.3931
## Minimal mean squared error: 47862.04
## Best kernel: epanechnikov
## Best k: 3
# Test metrics of the trained model on the test data
collect_metrics(knn_model)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard     277.    Preprocessor1_Model1
## 2 rsq     standard       0.626 Preprocessor1_Model1
# Get the test-set predictions collected by last_fit()
test_predictions = collect_predictions(knn_model)
head(test_predictions)
## # A tibble: 6 × 5
##   .pred id                .row Balance .config             
##   <dbl> <chr>            <int>   <int> <chr>               
## 1  881. train/test split     3     580 Preprocessor1_Model1
## 2 1627. train/test split     4     964 Preprocessor1_Model1
## 3  210. train/test split     5     331 Preprocessor1_Model1
## 4  777. train/test split     6    1151 Preprocessor1_Model1
## 5  136. train/test split     7     203 Preprocessor1_Model1
## 6 1077. train/test split     8     872 Preprocessor1_Model1
# Get fitted values on the training data directly from the underlying kknn fit
train_predictions = final_knn$fit$fitted.values %>% bind_cols(train_data) %>% rename(.pred = ...1)
## New names:
## • `` -> `...1`
head(train_predictions)
##       .pred  Income Limit Rating Cards Age Education Gender Student Married
## 1 1248.3952 182.728 13913    982     4  98        17   Male      No     Yes
## 2  326.4580  35.691  2880    214     2  35        15   Male      No      No
## 3 1195.4525 123.299  8376    610     2  89        17   Male     Yes      No
## 4  122.4892  20.791  2672    204     1  70        18 Female      No      No
## 5  715.6282  39.055  5565    410     4  48        18 Female      No     Yes
## 6  326.3437  36.472  3806    309     2  52        13   Male      No      No
##          Ethnicity Balance
## 1        Caucasian    1999
## 2 African American       0
## 3 African American    1259
## 4 African American       0
## 5        Caucasian     772
## 6 African American     188
# Get predictions for the test data
# Prepare test_data according to the recipe preprocessing
test_data_processed = prep(model_recipe) %>% bake(new_data = test_data)
knn_predictions = predict(final_knn, new_data = test_data_processed) %>% bind_cols(test_data_processed)
head(knn_predictions)
## # A tibble: 6 × 13
##   .pred Income   Limit  Rating   Cards    Age Education Balance Gender_Female
##   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>     <dbl>   <int>         <dbl>
## 1  881.  1.68   0.999   1.02    0.772   0.913    -0.869     580             0
## 2 1627.  2.91   2.03    2.07    0.0725 -1.14     -0.869     964             1
## 3  210.  0.324  0.0757  0.0228 -0.627   0.738     0.736     331             0
## 4  777.  0.999  1.41    1.36    0.772   1.27     -1.19     1151             0
## 5  136. -0.645 -0.564  -0.597  -0.627  -1.08     -0.548     203             1
## 6 1077.  0.756  1.02    1.00   -0.627   1.85     -1.51      872             0
## # ℹ 4 more variables: Student_Yes <dbl>, Married_Yes <dbl>,
## #   Ethnicity_Asian <dbl>, Ethnicity_Caucasian <dbl>
# Evaluate the model performance
metrics(knn_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard     277.   
## 2 rsq     standard       0.626
## 3 mae     standard     198.

Linear regression

# Linear Regression specification and workflow
lm_spec = linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

# Workflows
lm_workflow = workflow() %>% 
  add_recipe(model_recipe) %>% 
  add_model(lm_spec)

# Train the model on training data
final_lm = fit(lm_workflow, train_data)
# Evaluate the model performance
lm_predictions = final_lm %>% predict(test_data) %>% bind_cols(test_data)
metrics(lm_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard     107.   
## 2 rsq     standard       0.945
## 3 mae     standard      83.6
# Collect metrics on the test set for both models
# pull() extracts a vector (a single variable); select() extracts a tibble (like a data frame); see the short demo after the next block

test_results = test_data %>% 
  mutate(
    .pred_lm = predict(final_lm, new_data = test_data) %>% pull(.pred),
    .pred_knn = predict(final_knn, new_data = test_data_processed) %>% pull(.pred)
  )
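A quick demonstration of the pull() vs select() difference mentioned above:

test_data %>% pull(Balance) %>% head()    # bare numeric vector
test_data %>% select(Balance) %>% head()  # one-column tibble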

head(test_results)
##    Income Limit Rating Cards Age Education Gender Student Married
## 1 104.593  7075    514     4  71        11   Male      No      No
## 2 148.924  9504    681     3  36        11 Female      No      No
## 3  55.882  4897    357     2  68        16   Male      No     Yes
## 4  80.180  8047    569     4  77        10   Male      No      No
## 5  20.996  3388    259     2  37        12 Female      No      No
## 6  71.408  7114    512     2  87         9   Male      No      No
##          Ethnicity Balance  .pred_lm .pred_knn
## 1            Asian     580  677.8747  880.6868
## 2            Asian     964  980.7356 1627.0813
## 3        Caucasian     331  407.5040  210.2217
## 4        Caucasian    1151 1104.0413  776.8329
## 5 African American     203  278.5849  136.0664
## 6            Asian     872  888.1353 1077.1995
# Calculate performance metrics
metrics = metric_set(rmse, rsq, mae)

lm_metrics = test_results %>% 
  metrics(truth = Balance, estimate = .pred_lm) %>% 
  mutate(model = "Linear Regression")

knn_metrics = test_results %>% 
  metrics(truth = Balance, estimate = .pred_knn) %>% 
  mutate(model = "KNN Regression")

# Combine and compare
bind_rows(lm_metrics, knn_metrics) %>% arrange(.metric)
## # A tibble: 6 × 4
##   .metric .estimator .estimate model            
##   <chr>   <chr>          <dbl> <chr>            
## 1 mae     standard      83.6   Linear Regression
## 2 mae     standard     198.    KNN Regression   
## 3 rmse    standard     107.    Linear Regression
## 4 rmse    standard     277.    KNN Regression   
## 5 rsq     standard       0.945 Linear Regression
## 6 rsq     standard       0.626 KNN Regression

We see that linear regression clearly outperforms KNN on this data: lower RMSE and MAE, and a higher R-squared.

# Visual comparison
test_results %>% 
  ggplot(aes(x = Balance)) +
  geom_point(aes(y = .pred_lm, color = "Linear Reg")) +
  geom_point(aes(y = .pred_knn, color = "KNN Reg")) +
  geom_abline(color = 'blue') +
  labs(title = "Actual vs Predicted Values", y = "Predicted Balance") +
  scale_color_manual(values = c("#E69F00", "#56B4E9"))

RMSE (Root Mean Squared Error): Lower is better

R-squared: Higher is better (closer to 1)

MAE (Mean Absolute Error): Lower is better

KNN tends to overfit with too few neighbors (small k) and underfit with too many

Assumptions: linear regression assumes a linear relationship between the predictors and the response
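To make these definitions concrete, the metrics can be computed by hand from test_results and checked against yardstick (a minimal sketch for the linear model):

# Hand-computed test metrics for the linear model
test_results %>% summarise(
  rmse = sqrt(mean((Balance - .pred_lm)^2)),  # root mean squared error
  mae  = mean(abs(Balance - .pred_lm)),       # mean absolute error
  rsq  = cor(Balance, .pred_lm)^2             # squared correlation, as yardstick::rsq computes it
)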

Classroom practice code

# Comparing KNN and Linear Regression on Credit Data

# load libraries
library(kknn)
library(ISLR)
library(glmtoolbox)
library(DALEX)
library(MASS)
library(lmtest)
library(jtools)
library(broom)
library(car)
library(vip)
library(cowplot)
library(caret)
library(glmnet)
library(tidymodels)
library(tidyverse)

# Prefer tidymodels ecosystem
tidymodels_prefer()

# Set the ggplot theme to black-and-white
theme_set(theme_bw())

### Step 1: Load and prepare data
# Load data
data(Credit)

summary(Credit)
##        ID            Income           Limit           Rating     
##  Min.   :  1.0   Min.   : 10.35   Min.   :  855   Min.   : 93.0  
##  1st Qu.:100.8   1st Qu.: 21.01   1st Qu.: 3088   1st Qu.:247.2  
##  Median :200.5   Median : 33.12   Median : 4622   Median :344.0  
##  Mean   :200.5   Mean   : 45.22   Mean   : 4736   Mean   :354.9  
##  3rd Qu.:300.2   3rd Qu.: 57.47   3rd Qu.: 5873   3rd Qu.:437.2  
##  Max.   :400.0   Max.   :186.63   Max.   :13913   Max.   :982.0  
##      Cards            Age          Education        Gender    Student  
##  Min.   :1.000   Min.   :23.00   Min.   : 5.00    Male :193   No :360  
##  1st Qu.:2.000   1st Qu.:41.75   1st Qu.:11.00   Female:207   Yes: 40  
##  Median :3.000   Median :56.00   Median :14.00                         
##  Mean   :2.958   Mean   :55.67   Mean   :13.45                         
##  3rd Qu.:4.000   3rd Qu.:70.00   3rd Qu.:16.00                         
##  Max.   :9.000   Max.   :98.00   Max.   :20.00                         
##  Married              Ethnicity      Balance       
##  No :155   African American: 99   Min.   :   0.00  
##  Yes:245   Asian           :102   1st Qu.:  68.75  
##            Caucasian       :199   Median : 459.50  
##                                   Mean   : 520.01  
##                                   3rd Qu.: 863.00  
##                                   Max.   :1999.00
# Remove ID column and NA (missing) values
Credit = Credit %>%
  select(-ID) %>%
  drop_na()


### Step 2: Split data into training and testing sets
set.seed(123)  # For reproducibility

split = initial_split(Credit, prop = 0.7, strata = Balance)
train_data = training(split)
test_data = testing(split)

### Step 3: Define a common preprocessing recipe
# Named credit_recipe rather than recipe so the recipes::recipe() function is not masked
credit_recipe = recipe(Balance ~ ., data = train_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

### Step 4: Define KNN model specification
knn_spec = nearest_neighbor(
  mode = 'regression',
  neighbors = tune(),
  weight_func = tune(),
  dist_power = tune()
)

### Step 5: Set initial workflow
knn_workflow = workflow() %>% 
  add_recipe(credit_recipe) %>% 
  add_model(knn_spec)

### Step 6: Create folds and set tuning grid
set.seed(123)

folds = vfold_cv(train_data, v = 10, strata = Balance)

# knn_grid = grid_regular(
#   neighbors(range = c(1, 20)),
#   weight_func(values = c("rectangular", "triangular", "optimal")),
#   dist_power(range = c(1, 2)),
#   levels = 10
# )

# OR using grid_latin_hypercube

knn_grid = grid_latin_hypercube(
  neighbors(range = c(1, 20)),
  weight_func(values = c("rectangular", "triangular", "optimal")),
  dist_power(range = c(1, 2)),
  size = 10
)

### Step 7: Tune KNN model
knn_tune = tune_grid(
  knn_workflow,
  resamples = folds,
  grid = knn_grid,
  metrics = metric_set(rmse, rsq)
)

# See the metrics averaged over the folds for each candidate
knn_tune %>%
  collect_metrics() %>%
  arrange(mean)
## # A tibble: 20 × 9
##    neighbors weight_func dist_power .metric .estimator    mean     n std_err
##        <int> <chr>            <dbl> <chr>   <chr>        <dbl> <int>   <dbl>
##  1         3 rectangular       1.92 rsq     standard     0.637    10  0.0368
##  2         3 triangular        1.71 rsq     standard     0.643    10  0.0349
##  3        11 rectangular       1.87 rsq     standard     0.720    10  0.0290
##  4         8 rectangular       1.35 rsq     standard     0.728    10  0.0322
##  5        17 optimal           1.67 rsq     standard     0.740    10  0.0281
##  6         6 triangular        1.18 rsq     standard     0.747    10  0.0273
##  7        16 optimal           1.48 rsq     standard     0.750    10  0.0294
##  8        13 optimal           1.26 rsq     standard     0.758    10  0.0280
##  9        19 triangular        1.50 rsq     standard     0.759    10  0.0260
## 10        10 optimal           1.05 rsq     standard     0.773    10  0.0274
## 11        10 optimal           1.05 rmse    standard   225.       10  9.64  
## 12         6 triangular        1.18 rmse    standard   231.       10 10.1   
## 13        13 optimal           1.26 rmse    standard   235.       10  9.15  
## 14        16 optimal           1.48 rmse    standard   241.       10  8.90  
## 15        19 triangular        1.50 rmse    standard   242.       10  8.10  
## 16        17 optimal           1.67 rmse    standard   247.       10  8.97  
## 17         8 rectangular       1.35 rmse    standard   247.       10 10.0   
## 18        11 rectangular       1.87 rmse    standard   257.       10  8.96  
## 19         3 triangular        1.71 rmse    standard   272.       10 13.8   
## 20         3 rectangular       1.92 rmse    standard   273.       10 11.2   
## # ℹ 1 more variable: .config <chr>
# Plot tuning results
knn_tune %>% autoplot()

# Find best hyperparameters
best_knn = knn_tune %>%
  select_best(metric = "rmse")

best_knn
## # A tibble: 1 × 4
##   neighbors weight_func dist_power .config              
##       <int> <chr>            <dbl> <chr>                
## 1        10 optimal           1.05 Preprocessor1_Model01
### Step 8: Finalize the KNN workflow
final_knn_workflow = knn_workflow %>%
  finalize_workflow(best_knn)
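To confirm the tune() placeholders were replaced by the selected values, inspect the finalized spec (extract_spec_parsnip() is the workflows accessor, analogous to extract_fit_parsnip() used earlier):

# The tuned hyperparameters should now appear as fixed values
final_knn_workflow %>% extract_spec_parsnip()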

### Step 9: Fit the final KNN model on the training data
final_knn_fit = final_knn_workflow %>%
  fit(data = train_data)

### Step 10: Evaluate the KNN model on the test data
knn_predictions = final_knn_fit %>%
  predict(new_data = test_data) %>%
  bind_cols(test_data)

# The .pred column contains the predicted values of Balance
knn_predictions
## # A tibble: 121 × 12
##     .pred Income Limit Rating Cards   Age Education Gender   Student Married
##     <dbl>  <dbl> <int>  <int> <int> <int>     <int> <fct>    <fct>   <fct>  
##  1 1193.   106.   6645    483     3    82        15 "Female" Yes     Yes    
##  2  362.    55.9  4897    357     2    68        16 " Male"  No      Yes    
##  3   96.2   21.0  3388    259     2    37        12 "Female" No      No     
##  4    0     15.0  1311    138     3    64        16 " Male"  No      No     
##  5   60.5   20.1  2525    200     3    57        15 "Female" No      Yes    
##  6  285.    53.6  3714    286     3    73        17 "Female" No      Yes    
##  7  631.    42.1  6626    479     2    44         9 " Male"  No      No     
##  8  896.    37.3  6378    458     1    72        17 "Female" No      No     
##  9  355.    14.1  4323    326     5    25        16 "Female" No      Yes    
## 10  525.    42.5  3625    289     6    44        12 "Female" Yes     No     
## # ℹ 111 more rows
## # ℹ 2 more variables: Ethnicity <fct>, Balance <int>
# Calculate RMSE, R-squared, and MAE
knn_metrics = knn_predictions %>%
  metrics(truth = Balance, estimate = .pred)

knn_metrics
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard     234.   
## 2 rsq     standard       0.793
## 3 mae     standard     172.
# Plot predictions vs actual values
ggplot(knn_predictions, aes(x = Balance, y = .pred)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  labs(title = "KNN Predictions vs Actual Values",
       x = "Actual Balance",
       y = "Predicted Balance")
## `geom_smooth()` using formula = 'y ~ x'

### Run the linear regression model using the tidymodels approach

# Linear regression specification

lin_reg_spec = linear_reg(mode = 'regression')

# Set linear regression workflow
lin_reg_workflow = workflow() %>%
  add_recipe(credit_recipe) %>%
  add_model(lin_reg_spec)

# Train the linear regression model on the train data
final_lm = lin_reg_workflow %>% fit(data = train_data)

# Make predictions on the test data
lm_predictions = final_lm %>%
  predict(new_data = test_data) %>%
  bind_cols(test_data)

# Calculate RMSE, R-squared, and MAE for linear regression
metrics(lm_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      92.3  
## 2 rsq     standard       0.964
## 3 mae     standard      75.8
# Calculate RMSE, R-squared, and MAE for KNN regression
metrics(knn_predictions, truth = Balance, estimate = .pred)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard     234.   
## 2 rsq     standard       0.793
## 3 mae     standard     172.
# Assemble both models' predictions alongside the test data
test_results = test_data %>% 
  mutate(
    .pred_knn = knn_predictions$.pred,
    .pred_lm = lm_predictions$.pred
  )

head(test_results)
##    Income Limit Rating Cards Age Education Gender Student Married
## 1 106.025  6645    483     3  82        15 Female     Yes     Yes
## 2  55.882  4897    357     2  68        16   Male      No     Yes
## 3  20.996  3388    259     2  37        12 Female      No      No
## 4  15.045  1311    138     3  64        16   Male      No      No
## 5  20.089  2525    200     3  57        15 Female      No     Yes
## 6  53.598  3714    286     3  73        17 Female      No     Yes
##          Ethnicity Balance  .pred_knn   .pred_lm
## 1            Asian     903 1193.08906  914.26371
## 2        Caucasian     331  361.90311  409.04140
## 3 African American     203   96.21087  291.12924
## 4        Caucasian       0    0.00000 -174.83619
## 5 African American       0   60.53106   63.96688
## 6 African American       0  285.21505  113.45598
# Combining the metrics

lm_metrics = metrics(lm_predictions, truth = Balance, estimate = .pred) %>% mutate(model = "Linear Regression")

knn_metrics = metrics(knn_predictions, truth = Balance, estimate = .pred) %>% mutate(model = "KNN")

combined_metrics = bind_rows(lm_metrics, knn_metrics)
combined_metrics %>% arrange(.metric)
## # A tibble: 6 × 4
##   .metric .estimator .estimate model            
##   <chr>   <chr>          <dbl> <chr>            
## 1 mae     standard      75.8   Linear Regression
## 2 mae     standard     172.    KNN              
## 3 rmse    standard      92.3   Linear Regression
## 4 rmse    standard     234.    KNN              
## 5 rsq     standard       0.964 Linear Regression
## 6 rsq     standard       0.793 KNN
# Visual comparison of the test results
head(test_results)
##    Income Limit Rating Cards Age Education Gender Student Married
## 1 106.025  6645    483     3  82        15 Female     Yes     Yes
## 2  55.882  4897    357     2  68        16   Male      No     Yes
## 3  20.996  3388    259     2  37        12 Female      No      No
## 4  15.045  1311    138     3  64        16   Male      No      No
## 5  20.089  2525    200     3  57        15 Female      No     Yes
## 6  53.598  3714    286     3  73        17 Female      No     Yes
##          Ethnicity Balance  .pred_knn   .pred_lm
## 1            Asian     903 1193.08906  914.26371
## 2        Caucasian     331  361.90311  409.04140
## 3 African American     203   96.21087  291.12924
## 4        Caucasian       0    0.00000 -174.83619
## 5 African American       0   60.53106   63.96688
## 6 African American       0  285.21505  113.45598
ggplot(test_results, aes(x = Balance)) +
  geom_point(aes(y = .pred_knn, color = "KNN")) +
  geom_point(aes(y = .pred_lm, color = "Linear Regression")) +
  geom_smooth(aes(y = .pred_knn), method = 'lm', color = "blue") +
  geom_smooth(aes(y = .pred_lm), method = 'lm', color = "red") +
  labs(title = "KNN vs Linear Regression Predictions",
       x = "Actual Balance",
       y = "Predicted Balance") +
  scale_color_manual(values = c("KNN" = "blue", "Linear Regression" = "red")) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'