13. This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010. (a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
Yes. Volume rises steadily over time, so Volume and Year are strongly positively correlated; Today and the lag returns show essentially no linear relationship with the other variables.
library(ISLR)  # the Weekly data set also ships in ISLR2, as the question notes
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
# install.packages("pacman")  # run once if pacman is not already installed
library(pacman)
p_load(ggplot2, dplyr, stringr, GGally, ggstats, DataExplorer, class, MASS, ISLR)
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
plot(Weekly)  # scatterplot matrix of all variable pairs
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
create_report(Weekly)
## Output created: report.html
plot_missing(Weekly)
plot_histogram(Weekly)
plot_correlation(Weekly)
matrixThisIs <- cor(Weekly[, -9])  # drop Direction (column 9), which is a factor
matrixThisIs
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
max_correlation <- max(matrixThisIs[upper.tri(matrixThisIs)])
max_correlation
## [1] 0.8419416
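Which pair attains that maximum can be read off programmatically; a quick sketch using the objects above:
which(matrixThisIs == max_correlation, arr.ind = TRUE)  # the Year/Volume pair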
attach(Weekly)
plot(Volume, type = "l")  # weekly trading volume trends upward over the 21 years
(b) Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
logRegresWeek = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(logRegresWeek)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
coef(logRegresWeek)
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## 0.26686414 -0.04126894 0.05844168 -0.01606114 -0.02779021 -0.01447206
## Volume
## -0.02274153
**(b) cont. Do any of the predictors appear to be statistically significant? If so, which ones?**
Only Lag2 is statistically significant (p ≈ 0.030). Its coefficient is positive, so a higher return two weeks ago is associated with a higher probability that the market moves Up this week.
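A quick programmatic check (sketch) pulls the p-values straight from the summary table; only the intercept and Lag2 fall below 0.05:
coefs <- summary(logRegresWeek)$coefficients
rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.05]  # returns "(Intercept)" and "Lag2"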
(c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
logRegresWeek_probs <- predict(logRegresWeek, type = "response")
logRegresWeek_probs[1:10]
## 1 2 3 4 5 6 7 8
## 0.6086249 0.6010314 0.5875699 0.4816416 0.6169013 0.5684190 0.5786097 0.5151972
## 9 10
## 0.5715200 0.5554287
contrasts(Direction)
## Up
## Down 0
## Up 1
logRegresWeek_pred = rep("Down", 1089)
logRegresWeek_pred[logRegresWeek_probs > 0.5] = "Up"
table(logRegresWeek_pred, Direction)
## Direction
## logRegresWeek_pred Down Up
## Down 54 48
## Up 430 557
(54+557)/1089
## [1] 0.5610652
mean(logRegresWeek_pred == Direction)
## [1] 0.5610652
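For comparison, a naive rule that always predicts Up would be correct on the 605 Up weeks out of 1,089, as this sketch shows:
mean(Direction == "Up")  # about 0.556, barely below the model's 0.561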
So the model correctly classifies the weekly market direction (not the next day's volume) about 56% of the time on the very data it was trained on, barely better than always predicting Up; the confusion matrix shows it labels almost every week Up, misclassifying 430 of the 484 Down weeks.
(d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
train = (Year < 2009)
Weekly.2009 = Weekly[!train, ]
Direction.2009 = Direction[!train]
dim(Weekly.2009)
## [1] 104 9
logRegresWeek_L2 = glm(Direction ~ Lag2, data = Weekly, subset = train, family = binomial)
logRegresWeek_probs2 = predict(logRegresWeek_L2, Weekly.2009, type = "response")
logRegresWeek_pred2 = rep("Down", 104)
logRegresWeek_pred2[logRegresWeek_probs2 > 0.5] = "Up"
table(logRegresWeek_pred2, Direction.2009)
## Direction.2009
## logRegresWeek_pred2 Down Up
## Down 9 5
## Up 34 56
mean(logRegresWeek_pred2 == Direction.2009)
## [1] 0.625
(9+56)/104
## [1] 0.625
(e) Repeat (d) using LDA.
library(MASS)
lda.fit = lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
lda.pred = predict(lda.fit, Weekly.2009)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class = lda.pred$class
table(lda.class, Direction.2009)
## Direction.2009
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class == Direction.2009)
## [1] 0.625
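The confusion matrix is identical to the one from the Lag2-only logistic regression, which suggests the two classifiers agree on every test week; a quick sketch check of the agreement rate:
mean(lda.class == logRegresWeek_pred2)  # fraction of weeks where LDA and logistic agree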
(f) Repeat (d) using QDA.
qda.fit = qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class = predict(qda.fit, Weekly.2009)$class
table(qda.class, Direction.2009)
## Direction.2009
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class == Direction.2009)
## [1] 0.5865385
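QDA predicts Up for every week in the test period, so its 58.7% accuracy is just the share of Up weeks in 2009-2010, as this sketch confirms:
mean(Direction.2009 == "Up")  # equals the QDA accuracy above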
(g) Repeat (d) using KNN. (K = 2, 3, and 4 are tried below; the sketch after them adds K = 1 and 5.)
train.X <- data.frame(Lag2 = Weekly$Lag2[train])
test.X <- data.frame(Lag2 = Weekly$Lag2[!train])
train.Direction = Direction[train]
set.seed(997)
knn.pred <- knn(train.X, test.X, train.Direction, k = 2)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 21 28
## Up 22 33
mean(knn.pred == Weekly$Direction[!train])
## [1] 0.5192308
set.seed(997)
knn.pred <- knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 15 19
## Up 28 42
mean(knn.pred == Weekly$Direction[!train])
## [1] 0.5480769
set.seed(997)
knn.pred <- knn(train.X, test.X, train.Direction, k = 4)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 21 21
## Up 22 40
mean(knn.pred == Weekly$Direction[!train])
## [1] 0.5865385
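The three runs above can be consolidated into a sweep; this sketch also covers K = 1 and K = 5, resetting the seed before each call because knn() breaks distance ties at random:
for (k in 1:5) {
  set.seed(997)
  pred <- knn(train.X, test.X, train.Direction, k = k)
  cat("k =", k, "accuracy:", round(mean(pred == Direction.2009), 3), "\n")
}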
**(i) Which of these methods appears to provide the best results on this data?**
Logistic regression and LDA, each with Lag2 as the only predictor, tie for the best test accuracy at 62.5%.
(j) Experiment with different combinations of predictors for each of the methods. Here, LDA with both Lag1 and Lag2:
library(MASS)
lda.fit2 = lda(Direction ~ Lag1 + Lag2, data = Weekly, subset = train)
lda.fit2
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2
## Down 0.289444444 -0.03568254
## Up -0.009213235 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.3013148
## Lag2 0.2982579
lda.pred2 = predict(lda.fit2, Weekly.2009)
names(lda.pred2)
## [1] "class" "posterior" "x"
lda.class2 = lda.pred2$class
table(lda.class2, Direction.2009)
## Direction.2009
## lda.class2 Down Up
## Down 7 8
## Up 36 53
mean(lda.class2 == Direction.2009)
## [1] 0.5769231
Adding Lag1 lowers the test accuracy to 57.7%, so the Lag2-only logistic regression and LDA models remain the best performers.
14. In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
library(MASS)
library(ISLR)
library(class)
library(dplyr)
data("Auto")
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
data("Auto")
med_mpg <-median(Auto$mpg, na.rm = TRUE)
med_mpg
## [1] 22.75
mpg_lvl = ifelse(Auto$mpg > med_mpg, 1, 0)
Auto$mpg_lvl <- as.factor(mpg_lvl)
table(mpg_lvl)
## mpg_lvl
## 0 1
## 196 196
(b) Explore the data graphically in order to investigate the association between the mileage indicator and the other features.
boxplot(weight ~ mpg_lvl, data = Auto, main = "Weight")
boxplot(horsepower ~ mpg_lvl, data = Auto, main = "Horsepower")
boxplot(year ~ mpg_lvl, data = Auto, main = "Year")
boxplot(cylinders ~ mpg_lvl, data = Auto, main = "Cylinders")
boxplot(displacement ~ mpg_lvl, data = Auto, main = "Displacement")
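As a numeric companion to the boxplots (a quick sketch), the correlations of each feature with mpg confirm that weight, displacement, horsepower, and cylinders are strongly negatively associated with mileage:
cor(Auto[, c("mpg", "cylinders", "displacement", "horsepower", "weight", "year")])[, "mpg"]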
(c) Split the data into a training set and a test set.
set.seed(445)
train_idx <- sample(1:nrow(Auto), nrow(Auto) * 0.6, replace = FALSE)  # 60/40 split
testX <- -train_idx  # negative indices select the held-out rows
trainX3 <- Auto[train_idx, ]
testX3 <- Auto[testX, ]
mpg_lvl_test <- mpg_lvl[testX]
(d) Perform LDA on the training data in order to predict the mileage class using the variables that seemed most associated with it in (b). What is the test error of the model obtained?
lda.fit3 <- lda(mpg_lvl ~ weight + horsepower + year + displacement, data = trainX3)
lda.fit3
## Call:
## lda(mpg_lvl ~ weight + horsepower + year + displacement, data = trainX3)
##
## Prior probabilities of groups:
## 0 1
## 0.5148936 0.4851064
##
## Group means:
## weight horsepower year displacement
## 0 3596.471 128.5785 74.21488 269.9091
## 1 2353.298 78.7807 77.64035 114.2588
##
## Coefficients of linear discriminants:
## LD1
## weight -0.0008853028
## horsepower 0.0140164551
## year 0.1220238325
## displacement -0.0105400125
lda_pred = predict(lda.fit3, testX3)
names(lda_pred)
## [1] "class" "posterior" "x"
pred_lda3 <- lda_pred  # reuse the prediction object computed above
table(pred_lda3$class, mpg_lvl_test)
## mpg_lvl_test
## 0 1
## 0 66 3
## 1 9 79
mean(pred_lda3$class != mpg_lvl_test)
## [1] 0.07643312
The LDA test error rate is 7.64%.
(e) Repeat (d) using QDA.
qda_model = qda(mpg_lvl ~ weight + horsepower + year + displacement, data = trainX3)
qda_model
## Call:
## qda(mpg_lvl ~ weight + horsepower + year + displacement, data = trainX3)
##
## Prior probabilities of groups:
## 0 1
## 0.5148936 0.4851064
##
## Group means:
## weight horsepower year displacement
## 0 3596.471 128.5785 74.21488 269.9091
## 1 2353.298 78.7807 77.64035 114.2588
qda.class = predict(qda_model, testX3)$class
table(qda.class, mpg_lvl_test)
## mpg_lvl_test
## qda.class 0 1
## 0 70 5
## 1 5 77
mean(qda.class != mpg_lvl_test)
## [1] 0.06369427
The QDA test error rate is 6.37%.
(f) Repeat (d) using logistic regression.
glm_model <- glm(mpg_lvl ~ weight + horsepower + year + displacement, data = trainX3, family = binomial)
summary(glm_model)
##
## Call:
## glm(formula = mpg_lvl ~ weight + horsepower + year + displacement,
## family = binomial, data = trainX3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.619209 6.014234 -2.098 0.03589 *
## weight -0.002180 0.001161 -1.878 0.06037 .
## horsepower -0.055636 0.021473 -2.591 0.00957 **
## year 0.355223 0.090132 3.941 8.11e-05 ***
## displacement -0.019679 0.008905 -2.210 0.02712 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 325.57 on 234 degrees of freedom
## Residual deviance: 100.18 on 230 degrees of freedom
## AIC: 110.18
##
## Number of Fisher Scoring iterations: 8
probs <- predict(glm_model, testX3, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.65] <- 1  # note: a 0.65 cutoff is used here rather than the usual 0.5
table(pred.glm, mpg_lvl_test)
## mpg_lvl_test
## pred.glm 0 1
## 0 75 11
## 1 0 71
mean(pred.glm != mpg_lvl_test)
## [1] 0.07006369
The logistic regression test error rate is 7.0% with the 0.65 cutoff.
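For reference, a sketch of the conventional 0.5 cutoff on the same fitted probabilities; the 0.65 threshold above accepts more false negatives (11 high-mileage cars called low) in exchange for zero false positives:
pred_50 <- ifelse(probs > 0.5, 1, 0)  # standard 0.5 decision rule
mean(pred_50 != mpg_lvl_test)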
str(Auto)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ mpg_lvl : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
(h) Perform KNN on the training data to predict the mileage class, using the same four predictors (standardized). What test error do you obtain?
trainX4 <- scale(trainX3[, c("weight","horsepower","year","displacement")])
testX4 <- scale(testX3[, c("weight","horsepower","year","displacement")])
trainY4 <- trainX3$mpg_lvl
set.seed(997)
knn.pred = knn(trainX4, testX4, trainY4, k = 5)
table(knn.pred, mpg_lvl_test)
## mpg_lvl_test
## knn.pred 0 1
## 0 73 6
## 1 2 76
mean(knn.pred != mpg_lvl_test)
## [1] 0.05095541
The KNN (k = 5) test error is about 5.1%, the lowest of the methods tried here.
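One caveat: scale() above standardizes the training and test sets independently, so the two sets are not on a common scale. A leakage-free sketch reuses the training means and SDs for the test set:
vars <- c("weight", "horsepower", "year", "displacement")
train_sc <- scale(trainX3[, vars])
test_sc <- scale(testX3[, vars],
                 center = attr(train_sc, "scaled:center"),  # training means
                 scale = attr(train_sc, "scaled:scale"))    # training SDs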
16. Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors, and describe your findings.
library(MASS)
data("Boston")
attach(Boston)
med_crime <- median(Boston$crim, na.rm = TRUE)
med_crime
## [1] 0.25651
crime_lvl = ifelse(Boston$crim > med_crime, 1, 0)
Boston$crime_lvl <- as.factor(crime_lvl)
table(crime_lvl)
## crime_lvl
## 0 1
## 253 253
set.seed(997)
train_idx <- sample(1:nrow(Boston), nrow(Boston)*0.6)
train <- Boston[train_idx,]
test <- Boston[-train_idx,]
glm_Boston <- glm(crime_lvl ~ ., data = train, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
The warnings occur because crim is itself among the predictors and crime_lvl is defined directly from it, so the two classes are perfectly separable; the refit below uses a subset of predictors instead.
glm_probs <- predict(glm_Boston, test, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
table(Predicted = glm_pred, Actual = test$crime_lvl)
## Actual
## Predicted 0 1
## 0 103 1
## 1 3 96
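The near-perfect table above also reflects that leakage: crim is a predictor while crime_lvl is defined from it. A sketch that drops crim removes the most direct source of the separation warning:
glm_noleak <- glm(crime_lvl ~ . - crim, data = train, family = binomial)  # exclude crim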
glm_Boston <- glm(crime_lvl ~ zn + indus + dis + ptratio + rad + nox, data = train, family = binomial)
glm_probs <- predict(glm_Boston, test, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
table(Predicted = glm_pred, Actual = test$crime_lvl)
## Actual
## Predicted 0 1
## 0 87 10
## 1 19 87
mean(glm_pred != test$crime_lvl)
## [1] 0.1428571
lda_Boston <- lda(crime_lvl ~ zn + indus + dis + ptratio + rad + nox, data = train)
lda_pred <- predict(lda_Boston, test)
table(Predicted = lda_pred$class, Actual = test$crime_lvl)
## Actual
## Predicted 0 1
## 0 96 22
## 1 10 75
mean(lda_pred$class != test$crime_lvl)
## [1] 0.1576355
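The exercise also mentions naive Bayes, which is not fit above; a sketch using the e1071 package (an assumption: e1071 is not loaded anywhere else in this document):
p_load(e1071)  # assumed available; provides naiveBayes()
nb_Boston <- naiveBayes(crime_lvl ~ zn + indus + dis + ptratio + rad + nox, data = train)
nb_pred <- predict(nb_Boston, test)
mean(nb_pred != test$crime_lvl)  # naive Bayes test error on the same split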
# As in the Auto problem, train and test are standardized separately here;
# strictly, the test set should reuse the training means and SDs.
train.X <- scale(train[, c("zn", "indus", "dis", "ptratio", "rad", "nox")])
test.X <- scale(test[, c("zn", "indus", "dis", "ptratio", "rad", "nox")])
train.Y <- train$crime_lvl
# knn() breaks distance ties at random, so without a fixed seed these
# errors can shift slightly between runs.
for (k in c(1, 3, 5, 10)) {
  knn.pred <- knn(train.X, test.X, train.Y, k = k)
  err <- mean(knn.pred != test$crime_lvl)
  cat("Test error for k =", k, ":", round(err, 3), "\n")
}
## Test error for k = 1 : 0.039
## Test error for k = 3 : 0.039
## Test error for k = 5 : 0.074
## Test error for k = 10 : 0.079
On this split, KNN with k = 1 or 3 achieves the lowest test error (about 3.9%), well below logistic regression (14.3%) and LDA (15.8%) on the same predictor subset.