# Load required libraries
library(ISLR2)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
# Load Boston data
data(Boston)
# (a) View dataset
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
# (b) Pairwise scatterplots (first 6 variables for simplicity)
ggpairs(Boston[, 1:6])
# (c) Correlation of crim with other predictors
cor(Boston$crim, Boston)
## crim zn indus chas nox rm age
## [1,] 1 -0.2004692 0.4065834 -0.05589158 0.4209717 -0.2192467 0.3527343
## dis rad tax ptratio lstat medv
## [1,] -0.3796701 0.6255051 0.5827643 0.2899456 0.4556215 -0.3883046
# (d) Summary of key predictors
summary(Boston$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
summary(Boston$tax)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 187.0 279.0 330.0 408.2 666.0 711.0
summary(Boston$ptratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.60 17.40 19.05 18.46 20.20 22.00
# (e) Tracts near the Charles River
sum(Boston$chas == 1)
## [1] 35
# (f) Median pupil-teacher ratio
median(Boston$ptratio)
## [1] 19.05
# (g) Tract with lowest median value
Boston[which.min(Boston$medv), ]
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 30.59 5
# (h) Tracts with high average rooms
sum(Boston$rm > 7)
## [1] 64
sum(Boston$rm > 8)
## [1] 13
Boston[Boston$rm > 8, ]
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 98 0.12083 0 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 4.21 38.7
## 164 1.51902 0 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 3.32 50.0
## 205 0.02009 95 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 2.88 50.0
## 225 0.31533 0 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 4.14 44.8
## 226 0.52693 0 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 4.63 50.0
## 227 0.38214 0 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 3.13 37.6
## 233 0.57529 0 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 2.47 41.7
## 234 0.33147 0 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 3.95 48.3
## 254 0.36894 22 5.86 0 0.4310 8.259 8.4 8.9067 7 330 19.1 3.54 42.8
## 258 0.61154 20 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 5.12 50.0
## 263 0.52014 20 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 5.91 48.8
## 268 0.57834 20 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 7.44 50.0
## 365 3.47428 0 18.10 1 0.7180 8.780 82.9 1.9047 24 666 20.2 5.29 21.9
Exercise 2: KNN Classifier vs. KNN Regression The K-Nearest Neighbors (KNN) method is a simple and versatile machine learning algorithm that can be used for both classification and regression tasks. The two variants of KNN operate differently based on the type of response variable (outcome):
🔹 KNN Classifier -Used for classification problems, where the response variable is categorical. -To classify a new observation:
Example: Predicting whether a person has a disease (Yes/No) based on features like age, weight, and blood pressure.
🔸 KNN Regression -Used for regression problems, where the response variable is quantitative (continuous). -To make a prediction:
Example: Predicting house price based on its size, location, and number of rooms.
# Install and load necessary package
install.packages("ggpubr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggpubr)
# Create a data frame with the comparison info
# Install and load the required package
install.packages("ggpubr") # Run only once
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggpubr)
# Create the comparison data frame
comparison_table <- data.frame(
Feature = c("Type of Output", "Prediction Method", "Evaluation Metrics", "Use Cases"),
KNN_Classifier = c(
"Categorical (e.g., 'Yes'/'No')",
"Majority vote among K neighbors",
"Accuracy, Precision, Confusion Matrix",
"Spam detection, disease prediction"
),
KNN_Regression = c(
"Continuous (e.g., 145.2, 23.9)",
"Average of values among K neighbors",
"MSE, RMSE",
"House price prediction, temperature"
)
)
# Generate a table plot
ggtexttable(comparison_table,
rows = NULL,
theme = ttheme("mBlue"))
cat("🔑 Summary:
• KNN Classification assigns the most common class among neighbors.
• KNN Regression averages the output values of neighbors.
• Both methods depend on the choice of K and the scale of input variables.")
## 🔑 Summary:
## • KNN Classification assigns the most common class among neighbors.
## • KNN Regression averages the output values of neighbors.
## • Both methods depend on the choice of K and the scale of input variables.
# Load Carseats dataset
data(Carseats)
## (a) Fit a multiple regression model to predict Sales using Price, Urban, and US.
# Step 1: Install ISLR if you haven't already
install.packages("ISLR")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
# Step 2: Load the ISLR package
library(ISLR)
##
## Attaching package: 'ISLR'
## The following objects are masked from 'package:ISLR2':
##
## Auto, Credit
# Step 3: Load the Carseats dataset
data(Carseats)
# Step 4: Fit the linear model
model_full <- lm(Sales ~ Price + Urban + US, data = Carseats)
# Step 5: View the model summary
summary(model_full)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
summary(model_full)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
## (b) Provide an interpretation of each coefficient in the model.
# Show coefficients
summary(model_full)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.04346894 0.651012245 20.03567373 3.626602e-62
## Price -0.05445885 0.005241855 -10.38923205 1.609917e-22
## UrbanYes -0.02191615 0.271650277 -0.08067781 9.357389e-01
## USYes 1.20057270 0.259041508 4.63467306 4.860245e-06
# Extract coefficients
coefs <- summary(model_full)$coefficients
intercept <- round(coefs["(Intercept)", "Estimate"], 2)
price_coef <- round(coefs["Price", "Estimate"], 2)
urban_coef <- round(coefs["UrbanYes", "Estimate"], 2)
us_coef <- round(coefs["USYes", "Estimate"], 2)
cat("Intercept:", intercept,
"- Expected Sales when Price = 0, Urban = 'No', US = 'No'\n")
## Intercept: 13.04 - Expected Sales when Price = 0, Urban = 'No', US = 'No'
cat("Price:", price_coef,
"- For each $1 increase in Price, Sales decreases by", abs(price_coef), "units\n")
## Price: -0.05 - For each $1 increase in Price, Sales decreases by 0.05 units
cat("UrbanYes:", urban_coef,
"- Difference in Sales between Urban = 'Yes' and Urban = 'No', holding other variables constant\n")
## UrbanYes: -0.02 - Difference in Sales between Urban = 'Yes' and Urban = 'No', holding other variables constant
cat("USYes:", us_coef,
"- Difference in Sales between US = 'Yes' and US = 'No', holding other variables constant\n")
## USYes: 1.2 - Difference in Sales between US = 'Yes' and US = 'No', holding other variables constant
## (c) Write out the model in equation form.
coef(model_full)
## (Intercept) Price UrbanYes USYes
## 13.04346894 -0.05445885 -0.02191615 1.20057270
## (d) Hypothesis Testing for Predictors
summary(model_full)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.04346894 0.651012245 20.03567373 3.626602e-62
## Price -0.05445885 0.005241855 -10.38923205 1.609917e-22
## UrbanYes -0.02191615 0.271650277 -0.08067781 9.357389e-01
## USYes 1.20057270 0.259041508 4.63467306 4.860245e-06
## (e) Fit Smaller Model Using Only Significant Predictors
model_small <- lm(Sales ~ Price + US, data = Carseats)
summary(model_small)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
## (f) Model Fit Comparison
# Compare R-squared values
summary(model_full)$r.squared
## [1] 0.2392754
summary(model_small)$r.squared
## [1] 0.2392629
## (g) 95% Confidence Intervals for Coefficients
confint(model_small, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
## (h) Checking for Outliers and High Leverage Points
par(mfrow = c(2, 2))
plot(model_small)
# --- Part (a): Log odds in your model ---
# The log odds in the standard logistic regression model is β₀ + β₁x
# This is derived directly from the model equation and doesn't require computation
# --- Part (b): Log odds in your friend's softmax model ---
# The log odds in the softmax model is (α_orange0 - α_apple0) + (α_orange1 - α_apple1)x
# This comes from taking the log of the ratio of probabilities
# --- Part (c): Finding coefficients in friend's model given your model ---
# Given β₀ = 2 and β₁ = -1
beta0 <- 2
beta1 <- -1
# One possible solution:
alpha_orange0_c <- 2 # arbitrary choice
alpha_apple0_c <- 0 # arbitrary choice
alpha_orange1_c <- -1 # satisfies constraint
alpha_apple1_c <- 0 # arbitrary choice
cat("Part (c) - One possible solution for friend's coefficients:\n")
## Part (c) - One possible solution for friend's coefficients:
cat("α_orange0 =", alpha_orange0_c, "\n")
## α_orange0 = 2
cat("α_orange1 =", alpha_orange1_c, "\n")
## α_orange1 = -1
cat("α_apple0 =", alpha_apple0_c, "\n")
## α_apple0 = 0
cat("α_apple1 =", alpha_apple1_c, "\n\n")
## α_apple1 = 0
# Alternative solution:
alpha_orange0_alt <- 0
alpha_apple0_alt <- -2
alpha_orange1_alt <- 0
alpha_apple1_alt <- 1
cat("Part (c) - Alternative solution for friend's coefficients:\n")
## Part (c) - Alternative solution for friend's coefficients:
cat("α_orange0 =", alpha_orange0_alt, "\n")
## α_orange0 = 0
cat("α_orange1 =", alpha_orange1_alt, "\n")
## α_orange1 = 0
cat("α_apple0 =", alpha_apple0_alt, "\n")
## α_apple0 = -2
cat("α_apple1 =", alpha_apple1_alt, "\n\n")
## α_apple1 = 1
# --- Part (d): Finding your coefficients given your friend's model ---
# Given coefficients from friend's model
alpha_orange0 <- 1.2
alpha_orange1 <- -2
alpha_apple0 <- 3
alpha_apple1 <- 0.6
# Calculate your model's coefficients
your_beta0 <- alpha_orange0 - alpha_apple0
your_beta1 <- alpha_orange1 - alpha_apple1
cat("Part (d) - Your model coefficients:\n")
## Part (d) - Your model coefficients:
cat("β₀ =", your_beta0, "\n")
## β₀ = -1.8
cat("β₁ =", your_beta1, "\n\n")
## β₁ = -2.6
# --- Part (e): Agreement between models ---
# Function to calculate probabilities using your logistic regression model
your_model_prob <- function(x, beta0, beta1) {
linear_pred <- beta0 + beta1 * x
exp(linear_pred) / (1 + exp(linear_pred))
}
# Function to calculate probabilities using your friend's softmax model
friend_model_prob <- function(x, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1) {
orange_score <- exp(alpha_orange0 + alpha_orange1 * x)
apple_score <- exp(alpha_apple0 + alpha_apple1 * x)
orange_score / (orange_score + apple_score)
}
# Generate 2000 test observations
set.seed(123)
test_data <- runif(2000, -5, 5)
# Calculate predictions from both models
your_preds <- ifelse(your_model_prob(test_data, your_beta0, your_beta1) > 0.5, "orange", "apple")
friend_preds <- ifelse(friend_model_prob(test_data, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1) > 0.5, "orange", "apple")
# Calculate agreement
agreement <- mean(your_preds == friend_preds)
cat("Part (e) - Fraction of agreement:", agreement, "\n\n")
## Part (e) - Fraction of agreement: 1
# Calculate decision boundaries for visualization
your_boundary <- -your_beta0/your_beta1
friend_boundary <- -(alpha_orange0 - alpha_apple0)/(alpha_orange1 - alpha_apple1)
cat("Decision boundary in your model: x =", your_boundary, "\n")
## Decision boundary in your model: x = -0.6923077
cat("Decision boundary in friend's model: x =", friend_boundary, "\n\n")
## Decision boundary in friend's model: x = -0.6923077
# --- Visualization of probability curves ---
x_range <- seq(-5, 5, length.out = 100)
your_probs <- sapply(x_range, function(x) your_model_prob(x, your_beta0, your_beta1))
friend_probs <- sapply(x_range, function(x)
friend_model_prob(x, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1))
# Create data frame for plotting
plot_data <- data.frame(
x = rep(x_range, 2),
prob = c(your_probs, friend_probs),
model = rep(c("Your Model", "Friend's Model"), each = length(x_range))
)
# Plot the probability curves
par(mfrow=c(1,1))
plot(x_range, your_probs, type = "l", col = "blue", lwd = 2,
xlab = "X", ylab = "P(orange|X)",
main = "Probability of Orange by Model", ylim = c(0, 1))
lines(x_range, friend_probs, col = "red", lwd = 2)
abline(h = 0.5, lty = 2)
abline(v = your_boundary, col = "blue", lty = 3)
abline(v = friend_boundary, col = "red", lty = 3)
legend("topright", legend = c("Your Model", "Friend's Model", "Decision Boundary"),
col = c("blue", "red", "black"), lty = c(1, 1, 2))
# --- Demonstration that models are equivalent for binary classification ---
# Calculate absolute difference between probabilities at each point
prob_diff <- abs(your_probs - friend_probs)
max_diff <- max(prob_diff)
cat("Maximum difference in probabilities:", max_diff, "\n")
## Maximum difference in probabilities: 1.110223e-16
# Check if predictions match at all points
x_check <- seq(-5, 5, by = 0.01)
your_pred_check <- ifelse(your_model_prob(x_check, your_beta0, your_beta1) > 0.5, 1, 0)
friend_pred_check <- ifelse(friend_model_prob(x_check, alpha_orange0, alpha_orange1,
alpha_apple0, alpha_apple1) > 0.5, 1, 0)
all_equal <- all(your_pred_check == friend_pred_check)
cat("All predictions match:", all_equal)
## All predictions match: TRUE