For each scenario, we determine whether the problem is classification or regression, whether inference or prediction is the focus, and provide values for \(n\) (number of observations) and \(p\) (number of predictors). We then sketch the bias-variance decomposition, plotting stylized curves for squared bias, variance, irreducible error, training error, and test error as functions of model flexibility.
library(ggplot2)

# Stylized curves: squared bias decays and variance grows with
# flexibility, while the irreducible error stays constant.
flexibility <- seq(1, 10, length.out = 100)
bias_squared <- exp(-0.5 * flexibility) * 4
variance <- log(flexibility + 1)
irreducible_error <- rep(0.5, length(flexibility))
training_error <- 1 / (flexibility + 1)
test_error <- bias_squared + variance + irreducible_error

# Stack the five curves into long format for ggplot.
data <- data.frame(
  Flexibility = rep(flexibility, 5),
  Error = c(bias_squared, variance, irreducible_error, training_error, test_error),
  Type = rep(c("Bias^2", "Variance", "Irreducible Error", "Training Error", "Test Error"),
             each = length(flexibility))
)

ggplot(data, aes(x = Flexibility, y = Error, color = Type)) +
  geom_line(linewidth = 1) +  # `size` for lines was deprecated in ggplot2 3.4.0
  labs(title = "Bias-Variance Tradeoff", x = "Model Flexibility", y = "Error") +
  theme_minimal()
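The resulting test error curve is U-shaped: it falls while the rapid decay in squared bias dominates, then rises as variance takes over, whereas training error declines monotonically. As a quick sanity check we can locate the bottom of the U on the grid; this is a minimal sketch whose answer depends entirely on the illustrative functional forms chosen above.

# Flexibility value minimizing the stylized test-error curve.
flexibility[which.min(test_error)]  # roughly 5 for these illustrative curves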
library(dplyr)
Next, for the KNN exercise, we record six observations on three predictors and compute the Euclidean distance from each observation to the test point \(X_1 = X_2 = X_3 = 0\).

knn_data <- data.frame(
  Obs = 1:6,
  X1 = c(0, 2, 0, 0, -1, 1),
  X2 = c(3, 0, 1, 1, 0, 1),
  X3 = c(0, 0, 3, 2, 1, 1),
  Y = c("Red", "Red", "Red", "Green", "Green", "Red")
)
test_point <- c(0, 0, 0)

# Euclidean distance from each observation to the test point,
# sorted so the nearest neighbours come first.
knn_data <- knn_data %>%
  mutate(Distance = sqrt((X1 - test_point[1])^2 +
                         (X2 - test_point[2])^2 +
                         (X3 - test_point[3])^2)) %>%
  arrange(Distance)
print(knn_data)
## Obs X1 X2 X3 Y Distance
## 1 5 -1 0 1 Green 1.414214
## 2 6 1 1 1 Red 1.732051
## 3 2 2 0 0 Red 2.000000
## 4 4 0 1 2 Green 2.236068
## 5 1 0 3 0 Red 3.000000
## 6 3 0 1 3 Red 3.162278
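From the sorted distances, the prediction with \(K = 1\) is Green, since observation 5 is the single nearest neighbour; with \(K = 3\) the three nearest neighbours are observations 5, 6, and 2 (Green, Red, Red), so the majority vote is Red. The following is a minimal sketch of that majority vote, using a hypothetical helper `knn_predict` defined on the distance-sorted data frame.

# Hypothetical helper: majority vote among the k nearest neighbours.
knn_predict <- function(sorted_data, k) {
  neighbours <- head(sorted_data$Y, k)  # labels of the k closest observations
  names(which.max(table(neighbours)))   # most frequent class label
}
knn_predict(knn_data, k = 1)  # "Green" -- obs 5 alone
knn_predict(knn_data, k = 3)  # "Red"   -- obs 5, 6, 2: one Green, two Red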