Chapter 2: ex #10

library(ISLR2)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
data(Boston)
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7
ggpairs(Boston[, 1:6])

cor(Boston$crim, Boston)
##      crim         zn     indus        chas       nox         rm       age
## [1,]    1 -0.2004692 0.4065834 -0.05589158 0.4209717 -0.2192467 0.3527343
##             dis       rad       tax   ptratio     lstat       medv
## [1,] -0.3796701 0.6255051 0.5827643 0.2899456 0.4556215 -0.3883046
summary(Boston$crim)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08204  0.25651  3.61352  3.67708 88.97620
summary(Boston$tax)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   187.0   279.0   330.0   408.2   666.0   711.0
summary(Boston$ptratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.60   17.40   19.05   18.46   20.20   22.00
sum(Boston$chas == 1)
## [1] 35
median(Boston$ptratio)
## [1] 19.05
Boston[which.min(Boston$medv), ]
##        crim zn indus chas   nox    rm age    dis rad tax ptratio lstat medv
## 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 30.59    5
sum(Boston$rm > 7)
## [1] 64
sum(Boston$rm > 8)
## [1] 13
Boston[Boston$rm > 8, ]
##        crim zn indus chas    nox    rm  age    dis rad tax ptratio lstat medv
## 98  0.12083  0  2.89    0 0.4450 8.069 76.0 3.4952   2 276    18.0  4.21 38.7
## 164 1.51902  0 19.58    1 0.6050 8.375 93.9 2.1620   5 403    14.7  3.32 50.0
## 205 0.02009 95  2.68    0 0.4161 8.034 31.9 5.1180   4 224    14.7  2.88 50.0
## 225 0.31533  0  6.20    0 0.5040 8.266 78.3 2.8944   8 307    17.4  4.14 44.8
## 226 0.52693  0  6.20    0 0.5040 8.725 83.0 2.8944   8 307    17.4  4.63 50.0
## 227 0.38214  0  6.20    0 0.5040 8.040 86.5 3.2157   8 307    17.4  3.13 37.6
## 233 0.57529  0  6.20    0 0.5070 8.337 73.3 3.8384   8 307    17.4  2.47 41.7
## 234 0.33147  0  6.20    0 0.5070 8.247 70.4 3.6519   8 307    17.4  3.95 48.3
## 254 0.36894 22  5.86    0 0.4310 8.259  8.4 8.9067   7 330    19.1  3.54 42.8
## 258 0.61154 20  3.97    0 0.6470 8.704 86.9 1.8010   5 264    13.0  5.12 50.0
## 263 0.52014 20  3.97    0 0.6470 8.398 91.5 2.2885   5 264    13.0  5.91 48.8
## 268 0.57834 20  3.97    0 0.5750 8.297 67.0 2.4216   5 264    13.0  7.44 50.0
## 365 3.47428  0 18.10    1 0.7180 8.780 82.9 1.9047  24 666    20.2  5.29 21.9

Chapter 3: ex #2, #10

library(class)
library(FNN)
## 
## Attaching package: 'FNN'
## The following objects are masked from 'package:class':
## 
##     knn, knn.cv
data(Carseats)
install.packages("ISLR")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR)
## 
## Attaching package: 'ISLR'
## The following objects are masked from 'package:ISLR2':
## 
##     Auto, Credit
data(Carseats)
model_full <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(model_full)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
summary(model_full)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
summary(model_full)$coefficients
##                Estimate  Std. Error      t value     Pr(>|t|)
## (Intercept) 13.04346894 0.651012245  20.03567373 3.626602e-62
## Price       -0.05445885 0.005241855 -10.38923205 1.609917e-22
## UrbanYes    -0.02191615 0.271650277  -0.08067781 9.357389e-01
## USYes        1.20057270 0.259041508   4.63467306 4.860245e-06
coefs <- summary(model_full)$coefficients

intercept <- round(coefs["(Intercept)", "Estimate"], 2)
price_coef <- round(coefs["Price", "Estimate"], 2)
urban_coef <- round(coefs["UrbanYes", "Estimate"], 2)
us_coef <- round(coefs["USYes", "Estimate"], 2)

cat("Intercept:", intercept, 
    "- Expected Sales when Price = 0, Urban = 'No', US = 'No'\n")
## Intercept: 13.04 - Expected Sales when Price = 0, Urban = 'No', US = 'No'
cat("Price:", price_coef,
    "- For each $1 increase in Price, Sales decreases by", abs(price_coef), "units\n")
## Price: -0.05 - For each $1 increase in Price, Sales decreases by 0.05 units
cat("UrbanYes:", urban_coef, 
    "- Difference in Sales between Urban = 'Yes' and Urban = 'No', holding other variables constant\n")
## UrbanYes: -0.02 - Difference in Sales between Urban = 'Yes' and Urban = 'No', holding other variables constant
cat("USYes:", us_coef,
    "- Difference in Sales between US = 'Yes' and US = 'No', holding other variables constant\n")
## USYes: 1.2 - Difference in Sales between US = 'Yes' and US = 'No', holding other variables constant
coef(model_full)
## (Intercept)       Price    UrbanYes       USYes 
## 13.04346894 -0.05445885 -0.02191615  1.20057270
summary(model_full)$coefficients
##                Estimate  Std. Error      t value     Pr(>|t|)
## (Intercept) 13.04346894 0.651012245  20.03567373 3.626602e-62
## Price       -0.05445885 0.005241855 -10.38923205 1.609917e-22
## UrbanYes    -0.02191615 0.271650277  -0.08067781 9.357389e-01
## USYes        1.20057270 0.259041508   4.63467306 4.860245e-06
model_small <- lm(Sales ~ Price + US, data = Carseats)
summary(model_small)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
summary(model_full)$r.squared
## [1] 0.2392754
summary(model_small)$r.squared
## [1] 0.2392629
confint(model_small, level = 0.95)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
par(mfrow = c(2, 2))
plot(model_small)

Chapter 4: ex #12

beta0 <- 2
beta1 <- -1
alpha_orange0_c <- 2  # arbitrary choice
alpha_apple0_c <- 0   # arbitrary choice
alpha_orange1_c <- -1 # satisfies constraint
alpha_apple1_c <- 0   # arbitrary choice

cat("Part (c) - One possible solution for friend's coefficients:\n")
## Part (c) - One possible solution for friend's coefficients:
cat("α_orange0 =", alpha_orange0_c, "\n")
## α_orange0 = 2
cat("α_orange1 =", alpha_orange1_c, "\n")
## α_orange1 = -1
cat("α_apple0 =", alpha_apple0_c, "\n")
## α_apple0 = 0
cat("α_apple1 =", alpha_apple1_c, "\n\n")
## α_apple1 = 0
alpha_orange0_alt <- 0
alpha_apple0_alt <- -2
alpha_orange1_alt <- 0
alpha_apple1_alt <- 1

cat("Part (c) - Alternative solution for friend's coefficients:\n")
## Part (c) - Alternative solution for friend's coefficients:
cat("α_orange0 =", alpha_orange0_alt, "\n")
## α_orange0 = 0
cat("α_orange1 =", alpha_orange1_alt, "\n")
## α_orange1 = 0
cat("α_apple0 =", alpha_apple0_alt, "\n")
## α_apple0 = -2
cat("α_apple1 =", alpha_apple1_alt, "\n\n")
## α_apple1 = 1
alpha_orange0 <- 1.2
alpha_orange1 <- -2
alpha_apple0 <- 3
alpha_apple1 <- 0.6

# Calculate your model's coefficients
your_beta0 <- alpha_orange0 - alpha_apple0
your_beta1 <- alpha_orange1 - alpha_apple1

cat("Part (d) - Your model coefficients:\n")
## Part (d) - Your model coefficients:
cat("β₀ =", your_beta0, "\n")
## β₀ = -1.8
cat("β₁ =", your_beta1, "\n\n")
## β₁ = -2.6
your_model_prob <- function(x, beta0, beta1) {
  linear_pred <- beta0 + beta1 * x
  exp(linear_pred) / (1 + exp(linear_pred))
}
friend_model_prob <- function(x, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1) {
  orange_score <- exp(alpha_orange0 + alpha_orange1 * x)
  apple_score <- exp(alpha_apple0 + alpha_apple1 * x)
  orange_score / (orange_score + apple_score)
}
set.seed(123)
test_data <- runif(2000, -5, 5)
your_preds <- ifelse(your_model_prob(test_data, your_beta0, your_beta1) > 0.5, "orange", "apple")
friend_preds <- ifelse(friend_model_prob(test_data, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1) > 0.5, "orange", "apple")
agreement <- mean(your_preds == friend_preds)
cat("Part (e) - Fraction of agreement:", agreement, "\n\n")
## Part (e) - Fraction of agreement: 1
your_boundary <- -your_beta0/your_beta1
friend_boundary <- -(alpha_orange0 - alpha_apple0)/(alpha_orange1 - alpha_apple1)

cat("Decision boundary in your model: x =", your_boundary, "\n")
## Decision boundary in your model: x = -0.6923077
cat("Decision boundary in friend's model: x =", friend_boundary, "\n\n")
## Decision boundary in friend's model: x = -0.6923077
x_range <- seq(-5, 5, length.out = 100)
your_probs <- sapply(x_range, function(x) your_model_prob(x, your_beta0, your_beta1))
friend_probs <- sapply(x_range, function(x) 
  friend_model_prob(x, alpha_orange0, alpha_orange1, alpha_apple0, alpha_apple1))
plot_data <- data.frame(
  x = rep(x_range, 2),
  prob = c(your_probs, friend_probs),
  model = rep(c("Your Model", "Friend's Model"), each = length(x_range))
)
par(mfrow=c(1,1))
plot(x_range, your_probs, type = "l", col = "blue", lwd = 2,
     xlab = "X", ylab = "P(orange|X)", 
     main = "Probability of Orange by Model", ylim = c(0, 1))
lines(x_range, friend_probs, col = "red", lwd = 2)
abline(h = 0.5, lty = 2)
abline(v = your_boundary, col = "blue", lty = 3)
abline(v = friend_boundary, col = "red", lty = 3)
legend("topright", legend = c("Your Model", "Friend's Model", "Decision Boundary"), 
       col = c("blue", "red", "black"), lty = c(1, 1, 2))

prob_diff <- abs(your_probs - friend_probs)
max_diff <- max(prob_diff)
cat("Maximum difference in probabilities:", max_diff, "\n")
## Maximum difference in probabilities: 1.110223e-16
x_check <- seq(-5, 5, by = 0.01)
your_pred_check <- ifelse(your_model_prob(x_check, your_beta0, your_beta1) > 0.5, 1, 0)
friend_pred_check <- ifelse(friend_model_prob(x_check, alpha_orange0, alpha_orange1, 
                                            alpha_apple0, alpha_apple1) > 0.5, 1, 0)
all_equal <- all(your_pred_check == friend_pred_check)
cat("All predictions match:", all_equal)
## All predictions match: TRUE