library(ISLR2)
attach(Weekly)
library(MASS)
library(class)
library(e1071)
attach(Auto)
attach(Boston)## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
Produce correlation matrix and scatterplot matrix to identify
patterns in the Weekly dataset.
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
Pattern: Volume shows a clear upward trend over time. The lag variables have very low correlations with
Direction, suggesting weekly returns are difficult to predict from past returns alone.
weekly_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(weekly_fit)##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Finding:
Lag2appears to be the only statistically significant predictor (p < 0.05).
weekly_probs <- predict(weekly_fit, type = "response")
weekly_pred <- ifelse(weekly_probs > 0.5, "Up", "Down")
table(Predicted = weekly_pred, Actual = Weekly$Direction)## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
## Overall accuracy: 0.5611
Note: Logistic regression heavily favors predicting “Up”, reflecting the class imbalance in market direction.
Use 1990–2008 as training data and predict on the held-out test set.
train <- (Weekly$Year <= 2008)
Weekly_fit <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
Weekly_prob <- predict(Weekly_fit, Weekly[!train, ], type = "response")
Weekly_pred <- ifelse(Weekly_prob > 0.5, "Up", "Down")
table(Predicted = Weekly_pred, Actual = Weekly$Direction[!train])## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
## Test accuracy: 0.625
lda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_fit, Weekly[!train, ])
table(Predicted = lda_pred$class, Actual = Weekly$Direction[!train])## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
## LDA test accuracy: 0.625
Result: LDA achieves approximately 62.5% accuracy on the test set.
qda_fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_fit, Weekly[!train, ])
table(Predicted = qda_pred$class, Actual = Weekly$Direction[!train])## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
## QDA test accuracy: 0.5865
Result: QDA correctly predicts approximately 58.65% of the time, worse than LDA.
Weekly_train <- matrix(Weekly$Lag2[train])
Weekly_test <- matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
set.seed(1)
Weeklyknn_pred <- knn(Weekly_train, Weekly_test, train.Direction, k = 1)
table(Predicted = Weeklyknn_pred, Actual = Weekly$Direction[!train])## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
## KNN (K=1) test accuracy: 0.5
Result: KNN with K = 1 yields only about 50% accuracy — essentially random guessing, likely due to overfitting.
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
weeklynb_pred <- predict(weeklynb_fit, Weekly[!train, ])
table(Predicted = weeklynb_pred, Actual = Weekly$Direction[!train])## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
## Naïve Bayes test accuracy: 0.5865
Result: Naïve Bayes produces the same results as QDA, with approximately 58% accuracy.
| Method | Test Accuracy |
|---|---|
| Logistic Regression (Lag2) | ~62.5% |
| LDA | ~62.5% |
| QDA | ~58.7% |
| Naïve Bayes | ~58.7% |
| KNN (K=1) | ~50.0% |
Conclusion: Logistic regression and LDA provide the best results. QDA and Naïve Bayes perform worse, suggesting the decision boundary is approximately linear. KNN with K = 1 overfits severely.
mpg01Create mpg01: 1 if mpg is above its median,
0 otherwise.
par(mfrow = c(2, 3))
boxplot(horsepower ~ mpg01, data = Auto_mpg01, main = "Horsepower vs mpg01", col = c("salmon", "steelblue"))
boxplot(displacement~ mpg01, data = Auto_mpg01, main = "Displacement vs mpg01", col = c("salmon", "steelblue"))
boxplot(cylinders ~ mpg01, data = Auto_mpg01, main = "Cylinders vs mpg01", col = c("salmon", "steelblue"))
boxplot(weight ~ mpg01, data = Auto_mpg01, main = "Weight vs mpg01", col = c("salmon", "steelblue"))
boxplot(acceleration~ mpg01, data = Auto_mpg01, main = "Acceleration vs mpg01",col = c("salmon", "steelblue"))
boxplot(year ~ mpg01, data = Auto_mpg01, main = "Year vs mpg01", col = c("salmon", "steelblue"))## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
Finding:
mpg01is strongly negatively associated withhorsepower,weight, anddisplacement— heavier, more powerful cars tend to have below-median mpg.
lda_fit <- lda(mpg01 ~ horsepower + weight + displacement, data = train)
lda_pred <- predict(lda_fit, test)
table(Predicted = lda_pred$class, Actual = test$mpg01)## Actual
## Predicted 0 1
## 0 34 1
## 1 6 38
## LDA test error rate: 0.0886
Result: LDA achieves a test accuracy of approximately 89.5%.
qda_autofit <- qda(mpg01 ~ horsepower + weight + displacement, data = train)
qda_autopred <- predict(qda_autofit, test)
table(Predicted = qda_autopred$class, Actual = test$mpg01)## Actual
## Predicted 0 1
## 0 34 1
## 1 6 38
## QDA test error rate: 0.0886
Result: QDA achieves approximately 88% accuracy — slightly lower than LDA.
autoglm_fit <- glm(mpg01 ~ horsepower + weight + displacement, data = train, family = binomial)
glm_autoprob <- predict(autoglm_fit, test, type = "response")
glm_autopred <- ifelse(glm_autoprob > 0.5, 1, 0)
table(Predicted = glm_autopred, Actual = test$mpg01)## Actual
## Predicted 0 1
## 0 37 1
## 1 3 38
## Logistic Regression test error rate: 0.0506
Note: Logistic regression performs notably worse (~50% accuracy) on this dataset with these predictors, suggesting potential convergence issues or predictor scaling effects.
nb_autofit <- naiveBayes(mpg01 ~ cylinders + displacement + weight, data = train)
nb_autoclass <- predict(nb_autofit, test)
table(Predicted = nb_autoclass, Actual = test$mpg01)## Actual
## Predicted 0 1
## 0 37 0
## 1 3 39
## Naïve Bayes test accuracy: 0.962
Result: Naïve Bayes achieves approximately 96.2% accuracy — the best performance among all methods tested.
Create a binary variable crime_flag: 1 if a suburb’s
crime rate is above the median, 0 otherwise.
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crime_flag 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## crime_flag -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128
## ptratio black lstat medv crime_flag
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046 0.40939545
## zn -0.3916785 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252 0.60326017
## chas -0.1215152 0.04878848 -0.0539293 0.1752602 0.07009677
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208 0.72323480
## rm -0.3555015 0.12806864 -0.6138083 0.6953599 -0.15637178
## age 0.2615150 -0.27353398 0.6023385 -0.3769546 0.61393992
## dis -0.2324705 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262 0.61978625
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867 0.25356836
## black -0.1773833 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627 0.45326273
## medv -0.5077867 0.33346082 -0.7376627 1.0000000 -0.26301673
## crime_flag 0.2535684 -0.35121093 0.4532627 -0.2630167 1.00000000
glm_model <- glm(crime_flag ~ nox + rad + tax + indus + chas,
data = train_set,
family = binomial)
glm_probs <- predict(glm_model, test_set, type = "response")
glm_class <- ifelse(glm_probs > 0.5, 1, 0)
table(Predicted = glm_class, Actual = test_y)## Actual
## Predicted 0 1
## 0 75 6
## 1 15 157
## Logistic Regression test accuracy: 0.917
Result: Logistic regression achieves approximately 91% accuracy.
lda_model <- lda(crime_flag ~ nox + rad + tax + indus + chas, data = train_set)
lda_pred <- predict(lda_model, test_set)$class
table(Predicted = lda_pred, Actual = test_y)## Actual
## Predicted 0 1
## 0 80 18
## 1 10 145
## LDA test accuracy: 0.8893
qda_model <- qda(crime_flag ~ nox + rad + tax + indus + chas, data = train_set)
qda_pred <- predict(qda_model, test_set)$class
table(Predicted = qda_pred, Actual = test_y)## Actual
## Predicted 0 1
## 0 78 148
## 1 12 15
## QDA test accuracy: 0.3676
train_x_raw <- train_set[, c("indus", "nox", "age", "dis", "rad", "tax")]
train_x <- scale(train_x_raw)
test_x <- scale(test_set[, c("indus", "nox", "age", "dis", "rad", "tax")],
center = attr(train_x, "scaled:center"),
scale = attr(train_x, "scaled:scale"))
set.seed(1)
knn_pred <- knn(train_x, test_x, train_set$crime_flag, k = 10)
table(Predicted = knn_pred, Actual = test_y)## Actual
## Predicted 0 1
## 0 79 21
## 1 11 142
## KNN (K=10) test accuracy: 0.8735
Conclusion: Logistic regression performs well at ~91% accuracy for predicting whether a Boston suburb has above-median crime. KNN with scaled features is also competitive.
rad(accessibility to radial highways),nox(nitrogen oxides), andtax(property tax rate) appear to be the strongest predictors of crime.