# Chapter 2 Exercise 10

library(ISLR2)
data(Boston)

# Basic info: number of rows (census tracts) and columns (variables)
dim(Boston)
## [1] 506  13
?Boston

# Exploratory plots
# Columns are selected by name so the scatterplot matrix contains the
# variables discussed in the findings (rm, lstat, medv). Positional columns
# 1:6 stop at rm and include neither lstat nor medv.
pairs(Boston[, c("crim", "indus", "nox", "rm", "lstat", "medv")])

#There are both linear and non-linear relationships among the predictors. For example, rm (average number of rooms) tends to increase with medv (median home value), while lstat (percentage of lower-status population) shows a negative association with medv.

# Correlation of crim with every other column, then crim plotted against rad
cor(Boston[["crim"]], Boston[, setdiff(names(Boston), "crim")])
##              zn     indus        chas       nox         rm       age        dis
## [1,] -0.2004692 0.4065834 -0.05589158 0.4209717 -0.2192467 0.3527343 -0.3796701
##            rad       tax   ptratio     lstat       medv
## [1,] 0.6255051 0.5827643 0.2899456 0.4556215 -0.3883046
plot(Boston[["rad"]], Boston[["crim"]], xlab = "rad", ylab = "crim")

#The crime rate (crim) is positively related to rad (access to radial highways), tax, and ptratio (pupil-teacher ratio), with correlations of about 0.63, 0.58, and 0.29 respectively. Areas with higher crime tend to have better highway access, higher tax rates, and more pupils per teacher (i.e., fewer teachers per student).

# Five-number summaries (plus mean) for crime rate, tax rate and
# pupil-teacher ratio
summary(Boston[, c("crim", "tax", "ptratio")])
##       crim               tax           ptratio     
##  Min.   : 0.00632   Min.   :187.0   Min.   :12.60  
##  1st Qu.: 0.08205   1st Qu.:279.0   1st Qu.:17.40  
##  Median : 0.25651   Median :330.0   Median :19.05  
##  Mean   : 3.61352   Mean   :408.2   Mean   :18.46  
##  3rd Qu.: 3.67708   3rd Qu.:666.0   3rd Qu.:20.20  
##  Max.   :88.97620   Max.   :711.0   Max.   :22.00
# Row holding the maximum of each variable (which.max picks a single row)
Boston[which.max(Boston[["crim"]]), ]
##        crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 381 88.9762  0  18.1    0 0.671 6.968 91.9 1.4165  24 666    20.2 17.21 10.4
Boston[which.max(Boston[["tax"]]), ]
##        crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 489 0.15086  0 27.74    0 0.609 5.454 92.7 1.8209   4 711    20.1 18.06 15.2
Boston[which.max(Boston[["ptratio"]]), ]
##        crim zn indus chas   nox    rm  age     dis rad tax ptratio lstat medv
## 355 0.04301 80  1.91    0 0.413 5.663 21.9 10.5857   4 334      22  8.05 18.2
#The crim variable reaches as high as 88.98, indicating the presence of significant outliers. Tax values range from 187 to 711, and the pupil-teacher ratio varies between 12.6 and 22. Some tracts clearly deviate from the norm.


# Number of tracts with chas == 1, and the median of ptratio
length(which(Boston[["chas"]] == 1))
## [1] 35
#A total of 35 census tracts border the Charles River.

median(Boston[["ptratio"]])
## [1] 19.05
#The median pupil-teacher ratio across all tracts is 19.05.


# Row with the smallest medv, followed by a summary of every column so the
# row's values can be compared against the overall ranges
Boston[which.min(Boston[["medv"]]), ]
##        crim zn indus chas   nox    rm age    dis rad tax ptratio lstat medv
## 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 30.59    5
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00
#The census tract with the lowest medv is valued at $5,000. It also exhibits extreme values—very high crime and lstat, and a low number of rooms—suggesting poor living conditions and lower socioeconomic status.

# Count tracts whose average room count exceeds 7 and 8, then list the
# tracts above 8
rooms <- Boston[["rm"]]
sum(rooms > 7)
## [1] 64
sum(rooms > 8)
## [1] 13
Boston[which(rooms > 8), ]
##        crim zn indus chas    nox    rm  age    dis rad tax ptratio lstat medv
## 98  0.12083  0  2.89    0 0.4450 8.069 76.0 3.4952   2 276    18.0  4.21 38.7
## 164 1.51902  0 19.58    1 0.6050 8.375 93.9 2.1620   5 403    14.7  3.32 50.0
## 205 0.02009 95  2.68    0 0.4161 8.034 31.9 5.1180   4 224    14.7  2.88 50.0
## 225 0.31533  0  6.20    0 0.5040 8.266 78.3 2.8944   8 307    17.4  4.14 44.8
## 226 0.52693  0  6.20    0 0.5040 8.725 83.0 2.8944   8 307    17.4  4.63 50.0
## 227 0.38214  0  6.20    0 0.5040 8.040 86.5 3.2157   8 307    17.4  3.13 37.6
## 233 0.57529  0  6.20    0 0.5070 8.337 73.3 3.8384   8 307    17.4  2.47 41.7
## 234 0.33147  0  6.20    0 0.5070 8.247 70.4 3.6519   8 307    17.4  3.95 48.3
## 254 0.36894 22  5.86    0 0.4310 8.259  8.4 8.9067   7 330    19.1  3.54 42.8
## 258 0.61154 20  3.97    0 0.6470 8.704 86.9 1.8010   5 264    13.0  5.12 50.0
## 263 0.52014 20  3.97    0 0.6470 8.398 91.5 2.2885   5 264    13.0  5.91 48.8
## 268 0.57834 20  3.97    0 0.5750 8.297 67.0 2.4216   5 264    13.0  7.44 50.0
## 365 3.47428  0 18.10    1 0.7180 8.780 82.9 1.9047  24 666    20.2  5.29 21.9
#There are 64 tracts where the average number of rooms exceeds 7, and 13 where it exceeds 8. These areas generally show higher medv, lower crim, and overall better socioeconomic characteristics.

# Chapter 3 Exercise 2

# Load required libraries
library(class)
library(FNN)
## 
## Attaching package: 'FNN'
## The following objects are masked from 'package:class':
## 
##     knn, knn.cv
library(ISLR2)

# --- KNN Classification on iris ---
set.seed(123)
data(iris)

# Hold out 50 of the 150 flowers: 100 sampled rows train, the rest test
idx <- sample(seq_len(nrow(iris)), size = 100)
train_iris <- iris[idx, ]
test_iris <- iris[-idx, ]

# The four measurement columns are the features; Species is the label
feature_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

# k = 3 classifier. FNN is attached after class, so a bare knn() call already
# resolves to FNN's version; the namespace is spelled out to make that visible.
knn_pred <- FNN::knn(
  train = train_iris[, feature_cols],
  test = test_iris[, feature_cols],
  cl = train_iris[["Species"]],
  k = 3
)

# Share of test flowers whose predicted species equals the true species
mean(knn_pred == test_iris[["Species"]])
## [1] 0.96
#The classification model predicts the species of a flower based on the majority vote from its 3 nearest neighbors.



# --- KNN Regression on Boston ---
set.seed(123)
data(Boston)

# Train-test split: 400 sampled tracts train the model, the rest are held out
idx_boston <- sample(nrow(Boston), 400)
train_boston <- Boston[idx_boston, ]
test_boston <- Boston[-idx_boston, ]

# Choose predictors by name. The ISLR2 version of Boston has 13 columns with
# medv last, so the positional index -14 removed nothing and the response
# medv was silently passed to the model as a predictor.
predictor_cols <- setdiff(names(Boston), "medv")

# Fit KNN Regression (k = 3)
# NOTE(review): predictors are used on their raw scales, so wide-ranging
# columns such as tax dominate the distance; consider scale() if intended.
knn_reg <- knn.reg(train = train_boston[, predictor_cols],
                   test = test_boston[, predictor_cols],
                   y = train_boston$medv,
                   k = 3)

# Predictions and test-set MSE
head(knn_reg$pred)
mean((knn_reg$pred - test_boston$medv)^2)
## (Echoed output removed: the earlier values were produced with medv among
## the predictors. Re-run this section to regenerate them.)
#The regression model estimates medv (median home value) by taking the average value of the 3 nearest neighbors.

# Chapter 3 Exercise 10

# Load data and libraries
library(ISLR2)
data(Carseats)

# Full model: Sales explained by Price, Urban and US
mod_full <- lm(Sales ~ Price + Urban + US, data = Carseats)
# Reduced model: the same fit with the Urban term removed
mod_simple <- update(mod_full, . ~ . - Urban)

# Coefficient table and fit statistics for the full model
summary(mod_full)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
#b)Term | Estimate | Interpretation

#Intercept | 13.0435 → This is the predicted Sales when Price = 0, and both Urban and US are set to “No.” While not realistic in practice, it serves as a baseline value.

#Price | -0.0545 → For every $1 increase in price, Sales are expected to drop by 0.0545 thousand units (about 54 units, since Sales is recorded in thousands), assuming other factors remain unchanged.

#UrbanYes | -0.0219 → On average, stores in urban locations sell 0.0219 thousand (about 22) fewer units than non-urban stores, holding other variables constant. However, this effect is statistically insignificant (p = 0.936).

#USYes | +1.2006 → Stores located in the US sell, on average, 1.2 thousand (about 1,200) more units than those outside the US, assuming price and urban setting are constant. This effect is statistically significant.


#c)Model equation:
#Sales = 13.0435 − 0.0545 × Price − 0.0219 × UrbanYes + 1.2006 × USYes + ϵ
#Where:
#UrbanYes = 1 if the store is in an urban area, otherwise 0
#USYes = 1 if the store is in the US, otherwise 0


#d)Variable | p-value | Decision
#Price | < 2e-16 | Reject H₀ → Significant
#UrbanYes | 0.936 | Fail to reject H₀ → Not significant
#USYes | 4.86e-06 | Reject H₀ → Significant



# Coefficient table and fit statistics for the reduced model
summary(mod_simple)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
# Adjusted R-squared of each model, pulled from its summary object
summary(mod_full)[["adj.r.squared"]]
## [1] 0.2335123
summary(mod_simple)[["adj.r.squared"]]
## [1] 0.2354305
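#When comparing models, look at the adjusted R-squared values. Here the simpler model has the slightly higher adjusted R-squared (0.2354 vs. 0.2335) and a marginally lower residual standard error (2.469 vs. 2.472), so dropping Urban loses nothing. If two models perform similarly, prefer the simpler one with fewer variables.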



# 95% Confidence Intervals for the simpler model
confint(mod_simple)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
#The smaller model also provides 95% confidence intervals for each coefficient, showing the range within which the true values likely fall.
# Diagnostic plots
# par() returns the settings it replaces. They are restored after plotting so
# the 2x2 layout does not leak into plots drawn later in the session.
old_par <- par(mfrow = c(2, 2))
plot(mod_simple)
par(old_par)

# Chapter 4 Exercise 12

#a) log(Pr(Y=orange∣x)/Pr(Y=apple∣x))=β0+β1x


#b) log(Pr(Y=orange)/Pr(Y=apple))=(αorange0−αapple0)+(αorange1−αapple1)x


#c) Your model: 𝛽0=2, 𝛽1=−1

#Let’s find friend’s coefficients assuming softmax uses apple as the reference (arbitrary shift possible because softmax is invariant to adding same constant to both linear terms):

#One way to convert:
#Let: 𝛼orange0=2, 𝛼orange1=−1 𝛼apple0=0, 𝛼apple1=0

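#Then: log(Pr(Y=orange|x)/Pr(Y=apple|x)) = log(exp(2−x)/exp(0)) = 2−x, which matches β0+β1x = 2−x. So one valid choice is: αorange0=2, αorange1=−1, αapple0=0, αapple1=0. Only the differences αorange0−αapple0=2 and αorange1−αapple1=−1 are determined by the data; the individual coefficients are not unique.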


#d) 𝛼orange0=1.2, 𝛼orange1=−2 𝛼apple0=3, 𝛼apple1=0.6
#Convert to the model form (log odds): log(Pr(Y=orange|x)/Pr(Y=apple|x))=(αorange0−αapple0)+(αorange1−αapple1)x=(1.2−3)+(−2−0.6)x=−1.8−2.6x

#So the model estimates:
#𝛽0=−1.8 𝛽1=−2.6
set.seed(123)
# Evenly spaced grid of 1000 x values on [-5, 5]
x <- seq(from = -5, to = 5, length.out = 1000)

# Linear scores of the friend's softmax model, one per class
logit_o <- 1.2 - 2 * x
logit_a <- 3 + 0.6 * x

# Softmax probabilities: each class's exponentiated score over their sum
exp_o <- exp(logit_o)
exp_a <- exp(logit_a)
p_soft_o <- exp_o / (exp_o + exp_a)
p_soft_a <- 1 - p_soft_o

# Logistic model using the difference of the softmax coefficients
# (intercept 1.2 - 3 = -1.8, slope -2 - 0.6 = -2.6)
p_logit_o <- plogis(q = -1.8 - 2.6 * x)

# Predicted class under each model: "orange" when its probability exceeds 0.5
class_labels <- c("apple", "orange")
pred_soft <- class_labels[(p_soft_o > 0.5) + 1L]
pred_logit <- class_labels[(p_logit_o > 0.5) + 1L]

# Fraction of grid points where the two models predict the same class
mean(pred_soft == pred_logit)
## [1] 1
#This code computes the proportion of cases where your model and your friend’s model make the same prediction.