# Chapter 2, Exercise 10

library(ISLR2)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
data(Boston)

(a) How many rows and columns are in the Boston data set, and what do they represent?

head(Boston)
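Part (a) asks for the dimensions of the data; in the ISLR2 release, Boston has 506 rows (one per census tract) and 13 columns (one per variable):

dim(Boston)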

(b) Pairwise scatterplots of the predictors.

# Pairwise plots for the first six variables (all 13 at once is hard to read)
ggpairs(Boston[, 1:6])

(c) Are any of the predictors associated with the per-capita crime rate?

# Correlation of the per-capita crime rate with every column (including itself)
cor(Boston$crim, Boston)
##      crim         zn     indus        chas       nox         rm       age
## [1,]    1 -0.2004692 0.4065834 -0.05589158 0.4209717 -0.2192467 0.3527343
##             dis       rad       tax   ptratio     lstat       medv
## [1,] -0.3796701 0.6255051 0.5827643 0.2899456 0.4556215 -0.3883046
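From the printed correlations, crim is most strongly associated with rad (0.63) and tax (0.58), with weaker positive associations for lstat (0.46) and nox (0.42): tracts with better access to radial highways and higher property-tax rates tend to have higher crime rates.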

(d) Do any census tracts have particularly high crime rates, tax rates, or pupil-teacher ratios?

summary(Boston$crim)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08204  0.25651  3.61352  3.67708 88.97620
summary(Boston$tax)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   187.0   279.0   330.0   408.2   666.0   711.0
summary(Boston$ptratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.60   17.40   19.05   18.46   20.20   22.00
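The summaries show a heavily right-skewed crime rate (median 0.26 versus a maximum near 89), a tax-rate distribution whose third quartile (666) sits close to its maximum (711), and pupil-teacher ratios reaching 22; so yes, some tracts have notably high values on all three measures.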

(e) How many census tracts bound the Charles River?

sum(Boston$chas == 1)
## [1] 35

(f) What is the median pupil-teacher ratio?

median(Boston$ptratio)
## [1] 19.05

(g) Which census tract has the lowest median value of owner-occupied homes?

# The tract with the lowest median home value; compare its other predictors
# against the quartiles from summary(Boston)
Boston[which.min(Boston$medv), ]

(h) How many census tracts average more than seven rooms per dwelling? More than eight?

sum(Boston$rm > 7)
## [1] 64
sum(Boston$rm > 8)
## [1] 13
# Inspect the 13 tracts averaging more than eight rooms per dwelling
Boston[Boston$rm > 8, ]

# Chapter 3, Exercise 2

KNN is a non-parametric, instance-based machine learning method that can be used for both classification and regression, and the core idea is the same in both: make a prediction from the k closest training points (neighbors). The difference lies in how the neighbors are combined. The KNN classifier takes a majority vote over the neighbors' class labels, assigning the most common class (an estimate of the conditional class probabilities), whereas KNN regression predicts the average of the neighbors' response values (an estimate of the conditional mean of the response).

library(class)
library(FNN)
## 
## Attaching package: 'FNN'
## The following objects are masked from 'package:class':
## 
##     knn, knn.cv
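To make the contrast concrete, here is a minimal sketch on simulated data (the variable names and data are illustrative, not part of the exercise): class::knn() returns a voted class label for each test point, while FNN::knn.reg() returns the mean response of the k neighbors.

set.seed(1)
x_train <- matrix(rnorm(100), ncol = 2)                 # 50 training points in 2D
x_test  <- matrix(rnorm(10), ncol = 2)                  # 5 test points
y_class <- factor(ifelse(x_train[, 1] > 0, "A", "B"))   # class labels
y_num   <- x_train[, 1] + rnorm(50, sd = 0.1)           # numeric response

class::knn(train = x_train, test = x_test, cl = y_class, k = 5)      # majority vote
FNN::knn.reg(train = x_train, test = x_test, y = y_num, k = 5)$pred  # neighbor average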

# Chapter 3, Exercise 10

# Carseats ships with ISLR2, which is already attached above, so no separate
# install.packages("ISLR") or library(ISLR) call is needed (attaching ISLR
# would only mask ISLR2's Auto and Credit data sets).
data(Carseats)

# Fit the full linear model
model_full <- lm(Sales ~ Price + Urban + US, data = Carseats)

# View the model summary
summary(model_full)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
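Price and US have highly significant coefficients, while Urban clearly does not (p = 0.936); this motivates the smaller model fitted below.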
summary(model_full)$coefficients
##                Estimate  Std. Error      t value     Pr(>|t|)
## (Intercept) 13.04346894 0.651012245  20.03567373 3.626602e-62
## Price       -0.05445885 0.005241855 -10.38923205 1.609917e-22
## UrbanYes    -0.02191615 0.271650277  -0.08067781 9.357389e-01
## USYes        1.20057270 0.259041508   4.63467306 4.860245e-06
coefs <- summary(model_full)$coefficients

intercept <- round(coefs["(Intercept)", "Estimate"], 2)
price_coef <- round(coefs["Price", "Estimate"], 2)
urban_coef <- round(coefs["UrbanYes", "Estimate"], 2)
us_coef <- round(coefs["USYes", "Estimate"], 2)

cat("Intercept:", intercept, 
    "- Expected Sales when Price = 0, Urban = 'No', US = 'No'\n")
## Intercept: 13.04 - Expected Sales when Price = 0, Urban = 'No', US = 'No'
cat("Price:", price_coef,
    "- For each $1 increase in Price, Sales decreases by", abs(price_coef), "units\n")
## Price: -0.05 - For each $1 increase in Price, Sales decreases by 0.05 units
cat("UrbanYes:", urban_coef, 
    "- Difference in Sales between Urban = 'Yes' and Urban = 'No', holding other variables constant\n")
## UrbanYes: -0.02 - Difference in Sales between Urban = 'Yes' and Urban = 'No', holding other variables constant
cat("USYes:", us_coef,
    "- Difference in Sales between US = 'Yes' and US = 'No', holding other variables constant\n")
## USYes: 1.2 - Difference in Sales between US = 'Yes' and US = 'No', holding other variables constant
coef(model_full)
## (Intercept)       Price    UrbanYes       USYes 
## 13.04346894 -0.05445885 -0.02191615  1.20057270
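Since Urban is not significant, refit the model without it: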
model_small <- lm(Sales ~ Price + US, data = Carseats)
summary(model_small)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
summary(model_full)$r.squared
## [1] 0.2392754
summary(model_small)$r.squared
## [1] 0.2392629
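The two R² values are essentially identical (0.23928 vs 0.23926), so dropping Urban costs nothing in fit; the adjusted R² values in the summaries above (0.2335 vs 0.2354) actually favor the smaller model, as expected when an unhelpful predictor is removed.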
confint(model_small, level = 0.95)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
par(mfrow = c(2, 2))
# Standard diagnostic plots: residuals vs fitted, normal Q-Q,
# scale-location, and residuals vs leverage
plot(model_small)
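Beyond reading the plots, outlier and leverage candidates can be checked numerically. A minimal sketch using common rules of thumb (|studentized residual| > 3; leverage well above the average (p + 1)/n):

# Count studentized residuals beyond the conventional |3| cutoff
sum(abs(rstudent(model_small)) > 3)

# Flag observations whose leverage exceeds three times the average (p + 1)/n
p <- length(coef(model_small)) - 1
n <- nrow(Carseats)
which(hatvalues(model_small) > 3 * (p + 1) / n)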

# Chapter 4, Exercise 12

# --- Part (a): Log odds in your model ---
# In the standard logistic regression model the log odds is β₀ + β₁x.
# This follows directly from the model equation and needs no computation.

# --- Part (b): Log odds in your friend's softmax model ---
# Taking the log of the ratio of the two class probabilities, the shared
# softmax denominator cancels, leaving
#   log[P(orange)/P(apple)] = (α_orange0 - α_apple0) + (α_orange1 - α_apple1)x

# --- Part (c): Finding coefficients in friend's model given your model ---
# Given β₀ = 2 and β₁ = -1
beta0 <- 2
beta1 <- -1

# The softmax parameterization is only identified up to differences: any
# choice with α_orange0 - α_apple0 = 2 and α_orange1 - α_apple1 = -1
# reproduces your model. One possible solution:
alpha_orange0_c <- 2
alpha_apple0_c <- 0   # intercept difference: 2 - 0 = 2 = β₀
alpha_orange1_c <- -1
alpha_apple1_c <- 0   # slope difference: -1 - 0 = -1 = β₁

cat("Part (c) - One possible solution for friend's coefficients:\n")
## Part (c) - One possible solution for friend's coefficients:
cat("α_orange0 =", alpha_orange0_c, "\n")
## α_orange0 = 2
cat("α_orange1 =", alpha_orange1_c, "\n")
## α_orange1 = -1
cat("α_apple0 =", alpha_apple0_c, "\n")
## α_apple0 = 0
cat("α_apple1 =", alpha_apple1_c, "\n\n")
## α_apple1 = 0
# An alternative choice with the same differences (2 and -1):
alpha_orange0_alt <- 0
alpha_apple0_alt <- -2
alpha_orange1_alt <- 0
alpha_apple1_alt <- 1

cat("Part (c) - Alternative solution for friend's coefficients:\n")
## Part (c) - Alternative solution for friend's coefficients:
cat("α_orange0 =", alpha_orange0_alt, "\n")
## α_orange0 = 0
cat("α_orange1 =", alpha_orange1_alt, "\n")
## α_orange1 = 0
cat("α_apple0 =", alpha_apple0_alt, "\n")
## α_apple0 = -2
cat("α_apple1 =", alpha_apple1_alt, "\n\n")
## α_apple1 = 1
# --- Part (d): Your model's coefficients given the friend's estimates ---
alpha_orange0 <- 1.2
alpha_orange1 <- -2
alpha_apple0 <- 3
alpha_apple1 <- 0.6

# Calculate your model's coefficients
your_beta0 <- alpha_orange0 - alpha_apple0
your_beta1 <- alpha_orange1 - alpha_apple1

cat("Part (d) - Your model coefficients:\n")
## Part (d) - Your model coefficients:
cat("β₀ =", your_beta0, "\n")
## β₀ = -1.8
cat("β₁ =", your_beta1, "\n\n")
## β₁ = -2.6
# --- Part (e): Agreement between models ---
# Function to calculate probabilities using your logistic regression model
your_model_prob <- function(x, beta0, beta1) {
  linear_pred <- beta0 + beta1 * x
  exp(linear_pred) / (1 + exp(linear_pred))
}

# Function to calculate probabilities using your friend's softmax model
friend_model_prob <- function(x, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1) {
  orange_score <- exp(alpha_orange0 + alpha_orange1 * x)
  apple_score <- exp(alpha_apple0 + alpha_apple1 * x)
  orange_score / (orange_score + apple_score)
}
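As an aside (not required by the exercise), exponentiating large linear scores can overflow in floating point; a numerically safer softmax subtracts the largest score before exponentiating. A hypothetical, equivalent variant:

# Overflow-safe softmax: identical result in exact arithmetic
friend_model_prob_stable <- function(x, a_o0, a_o1, a_a0, a_a1) {
  s_orange <- a_o0 + a_o1 * x
  s_apple  <- a_a0 + a_a1 * x
  m <- pmax(s_orange, s_apple)   # elementwise max of the two scores
  exp(s_orange - m) / (exp(s_orange - m) + exp(s_apple - m))
}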

# Generate 2000 test observations
set.seed(123)
test_data <- runif(2000, -5, 5)

# Calculate predictions from both models
your_preds <- ifelse(your_model_prob(test_data, your_beta0, your_beta1) > 0.5, "orange", "apple")
friend_preds <- ifelse(friend_model_prob(test_data, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1) > 0.5, "orange", "apple")

# Calculate agreement
agreement <- mean(your_preds == friend_preds)
cat("Part (e) - Fraction of agreement:", agreement, "\n\n")
## Part (e) - Fraction of agreement: 1
your_boundary <- -your_beta0/your_beta1
friend_boundary <- -(alpha_orange0 - alpha_apple0)/(alpha_orange1 - alpha_apple1)

cat("Decision boundary in your model: x =", your_boundary, "\n")
## Decision boundary in your model: x = -0.6923077
cat("Decision boundary in friend's model: x =", friend_boundary, "\n\n")
## Decision boundary in friend's model: x = -0.6923077
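The two boundaries coincide exactly because the models imply the same log odds (the softmax coefficient differences equal β₀ and β₁), which is why the agreement in part (e) is 1: both models put "orange" on the same side of x ≈ -0.692.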
# --- Visualization of probability curves ---
x_range <- seq(-5, 5, length.out = 100)
your_probs <- sapply(x_range, function(x) your_model_prob(x, your_beta0, your_beta1))
friend_probs <- sapply(x_range, function(x) 
  friend_model_prob(x, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1))


# Plot the probability curves
par(mfrow=c(1,1))
plot(x_range, your_probs, type = "l", col = "blue", lwd = 2,
     xlab = "X", ylab = "P(orange|X)", 
     main = "Probability of Orange by Model", ylim = c(0, 1))
lines(x_range, friend_probs, col = "red", lwd = 2)
abline(h = 0.5, lty = 2)
abline(v = your_boundary, col = "blue", lty = 3)
abline(v = friend_boundary, col = "red", lty = 3)
legend("topright", legend = c("Your Model", "Friend's Model", "Decision Boundary"), 
       col = c("blue", "red", "black"), lty = c(1, 1, 2))

# --- Demonstration that models are equivalent for binary classification ---
# Calculate absolute difference between probabilities at each point
prob_diff <- abs(your_probs - friend_probs)
max_diff <- max(prob_diff)
cat("Maximum difference in probabilities:", max_diff, "\n")
## Maximum difference in probabilities: 1.110223e-16
# Check if predictions match at all points
x_check <- seq(-5, 5, by = 0.01)
your_pred_check <- ifelse(your_model_prob(x_check, your_beta0, your_beta1) > 0.5, 1, 0)
friend_pred_check <- ifelse(friend_model_prob(x_check, alpha_orange0, alpha_orange1, 
                                            alpha_apple0, alpha_apple1) > 0.5, 1, 0)
all_equal <- all(your_pred_check == friend_pred_check)
cat("All predictions match:", all_equal)
## All predictions match: TRUE