##Question 13 ##a)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
pairs(Weekly)
cor(subset(Weekly, select = -Direction))
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
#Answer: yes there seems to be a relationship between year and volume
##b
logit_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(logit_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
#answer: Lag2 has the highest significance level with a p-value of less than 0.05
##C
show_model_performance <- function(predicted_status, observed_status) {
confusion_matrix <- table(predicted_status,
observed_status,
dnn = c("predicted Status", "observed Status"))
print(confusion_matrix)
error_rate <- mean(predicted_status != observed_status)
cat("\n")
cat(" Error Rate:", 100 * error_rate, "%\n")
cat("Correctly Predicted:", 100 * (1-error_rate), "%\n")
cat("False Positive Rate:", 100 * confusion_matrix[2,1] / sum(confusion_matrix[,1]), "%\n")
cat("False Negative Rate:", 100 * confusion_matrix[1,2] / sum(confusion_matrix[,2]), "%\n")
}
predict_glm_direction <- function(model, newdata = NULL) {
predictions <- predict(model, newdata, type="response")
return(as.factor(ifelse(predictions < 0.5, "Down", "Up")))
}
predicted_direction <- predict_glm_direction(logit_model)
show_model_performance(predicted_direction, Weekly$Direction)
## observed Status
## predicted Status Down Up
## Down 54 48
## Up 430 557
##
## Error Rate: 43.89348 %
## Correctly Predicted: 56.10652 %
## False Positive Rate: 88.84298 %
## False Negative Rate: 7.933884 %
#Answer: the trend is relatively correct
##d)Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
train <- (Weekly$Year < 2009)
train_set <- Weekly[train,]
test_set <- Weekly[!train,]
logit_model <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
predicted_direction <- predict_glm_direction(logit_model, test_set)
show_model_performance(predicted_direction, test_set$Direction)
## observed Status
## predicted Status Down Up
## Down 9 5
## Up 34 56
##
## Error Rate: 37.5 %
## Correctly Predicted: 62.5 %
## False Positive Rate: 79.06977 %
## False Negative Rate: 8.196721 %
##e)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)
predictions <- predict(lda_model, test_set, type="response")
show_model_performance(predictions$class, test_set$Direction)
## observed Status
## predicted Status Down Up
## Down 9 5
## Up 34 56
##
## Error Rate: 37.5 %
## Correctly Predicted: 62.5 %
## False Positive Rate: 79.06977 %
## False Negative Rate: 8.196721 %
#Answer: the data could predict 62.5%
##f
qda_model <- qda(Direction ~ Lag2, data = Weekly, subset = train)
predictions <- predict(qda_model, test_set, type="response")
show_model_performance(predictions$class, test_set$Direction)
## observed Status
## predicted Status Down Up
## Down 0 0
## Up 43 61
##
## Error Rate: 41.34615 %
## Correctly Predicted: 58.65385 %
## False Positive Rate: 100 %
## False Negative Rate: 0 %
#answer: the data could correctly predict 58.65%
##g
library(class)
run_knn <- function(train, test, train_class, test_class, k) {
set.seed(12345)
predictions <- knn(train, test, train_class, k)
cat("KNN: k =", k, "\n")
show_model_performance(predictions, test_class)
}
train_matrix <- as.matrix(train_set$Lag2)
test_matrix <- as.matrix(test_set$Lag2)
run_knn(train_matrix, test_matrix, train_set$Direction, test_set$Direction, k = 1)
## KNN: k = 1
## observed Status
## predicted Status Down Up
## Down 21 29
## Up 22 32
##
## Error Rate: 49.03846 %
## Correctly Predicted: 50.96154 %
## False Positive Rate: 51.16279 %
## False Negative Rate: 47.54098 %
#answer: the data could correctly predict 50.96%
##h
train=(Weekly$Year<2009)
weekly09=Weekly[!train ,]
direction09=Weekly$Direction[!train]
dim(weekly09)
## [1] 104 9
glm_fit=glm(Direction~Lag2, data = Weekly,family=binomial ,subset=train)
glm_probability=predict (glm_fit,weekly09, type="response")
glm_prediction=rep("Down",104)
glm_prediction[glm_probability >.5]=" Up"
table(glm_prediction ,direction09)
## direction09
## glm_prediction Down Up
## Up 34 56
## Down 9 5
library(e1071)
library(ISLR2)
nbayes=naiveBayes(Direction~Lag2 ,data=Weekly ,subset=train)
nbayes
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
nbayes.class=predict(nbayes ,weekly09)
table(nbayes.class ,direction09)
## direction09
## nbayes.class Down Up
## Down 0 0
## Up 43 61
##i
#answer: logistic regression seems to correctly predict 62.5% of the data correctly
##j
logit_model <- glm(Direction ~ Lag1 * Lag2, data = Weekly, family = binomial, subset = train)
predicted_direction <- predict_glm_direction(logit_model, test_set)
show_model_performance(predicted_direction, test_set$Direction)
## observed Status
## predicted Status Down Up
## Down 7 8
## Up 36 53
##
## Error Rate: 42.30769 %
## Correctly Predicted: 57.69231 %
## False Positive Rate: 83.72093 %
## False Negative Rate: 13.11475 %
lda_model <- lda(Direction ~ Lag1 * Lag2, data = Weekly, subset = train)
predictions <- predict(lda_model, test_set, type="response")
show_model_performance(predictions$class, test_set$Direction)
## observed Status
## predicted Status Down Up
## Down 7 8
## Up 36 53
##
## Error Rate: 42.30769 %
## Correctly Predicted: 57.69231 %
## False Positive Rate: 83.72093 %
## False Negative Rate: 13.11475 %
qda_model <- qda(Direction ~ Lag2 + sqrt(abs(Lag2)), data = Weekly, subset = train)
predictions <- predict(qda_model, test_set, type="response")
show_model_performance(predictions$class, test_set$Direction)
## observed Status
## predicted Status Down Up
## Down 12 13
## Up 31 48
##
## Error Rate: 42.30769 %
## Correctly Predicted: 57.69231 %
## False Positive Rate: 72.09302 %
## False Negative Rate: 21.31148 %
run_knn(train_matrix, test_matrix, train_set$Direction, test_set$Direction, k = 10)
## KNN: k = 10
## observed Status
## predicted Status Down Up
## Down 18 21
## Up 25 40
##
## Error Rate: 44.23077 %
## Correctly Predicted: 55.76923 %
## False Positive Rate: 58.13953 %
## False Negative Rate: 34.42623 %
##Question 14) ##a)
Auto$mpg01 <- Auto$mpg > median(Auto$mpg)
head(Auto$mpg01, n = 20)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
##b)
cor(subset(Auto, select = -name))
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
pairs(Auto)
##c
train <- sample(nrow(Auto) * 0.7)
train_set <- Auto[train, ]
test_set <- Auto[-train, ]
##d
lda_model <- lda(mpg01 ~ cylinders + weight + displacement + horsepower,
data = Auto,
subset = train)
predictions <- predict(lda_model, test_set)
show_model_performance(predictions$class, test_set$mpg01)
## observed Status
## predicted Status FALSE TRUE
## FALSE 18 16
## TRUE 2 82
##
## Error Rate: 15.25424 %
## Correctly Predicted: 84.74576 %
## False Positive Rate: 10 %
## False Negative Rate: 16.32653 %
##e
qda_model <- qda(mpg01 ~ cylinders + weight + displacement + horsepower,
data = Auto,
subset = train)
predictions <- predict(qda_model, test_set)
show_model_performance(predictions$class, test_set$mpg01)
## observed Status
## predicted Status FALSE TRUE
## FALSE 18 19
## TRUE 2 79
##
## Error Rate: 17.79661 %
## Correctly Predicted: 82.20339 %
## False Positive Rate: 10 %
## False Negative Rate: 19.38776 %
#answer: the test rate is 17.79%
##f
logit_model <- glm(mpg01 ~ cylinders + weight + displacement + horsepower,
data = Auto,
family = binomial,
subset = train)
predictions <- predict(logit_model, test_set, type = "response")
show_model_performance(predictions > 0.5, test_set$mpg01)
## observed Status
## predicted Status FALSE TRUE
## FALSE 20 21
## TRUE 0 77
##
## Error Rate: 17.79661 %
## Correctly Predicted: 82.20339 %
## False Positive Rate: 0 %
## False Negative Rate: 21.42857 %
##g
library(e1071)
library(ISLR2)
nbayes=naiveBayes(cylinders ~ mpg01 ,data=Auto ,subset=train)
nbayes
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 3 4 5 6 8
## 0.010948905 0.416058394 0.003649635 0.233576642 0.335766423
##
## Conditional probabilities:
## mpg01
## Y FALSE TRUE
## 3 1.0000000 0.0000000
## 4 0.1578947 0.8421053
## 5 1.0000000 0.0000000
## 6 0.9687500 0.0312500
## 8 1.0000000 0.0000000
##h
vars <- c("cylinders", "weight", "displacement", "horsepower")
train_matrix <- as.matrix(train_set[, vars])
test_matrix <- as.matrix(test_set[, vars])
predictions <- knn(train_matrix, test_matrix, train_set$mpg01, 1)
run_knn(train_matrix, test_matrix, train_set$mpg01, test_set$mpg01, k = 1)
## KNN: k = 1
## observed Status
## predicted Status FALSE TRUE
## FALSE 18 24
## TRUE 2 74
##
## Error Rate: 22.0339 %
## Correctly Predicted: 77.9661 %
## False Positive Rate: 10 %
## False Negative Rate: 24.4898 %
run_knn(train_matrix, test_matrix, train_set$mpg01, test_set$mpg01, k = 10)
## KNN: k = 10
## observed Status
## predicted Status FALSE TRUE
## FALSE 20 22
## TRUE 0 76
##
## Error Rate: 18.64407 %
## Correctly Predicted: 81.35593 %
## False Positive Rate: 0 %
## False Negative Rate: 22.44898 %
run_knn(train_matrix, test_matrix, train_set$mpg01, test_set$mpg01, k = 100)
## KNN: k = 100
## observed Status
## predicted Status FALSE TRUE
## FALSE 20 25
## TRUE 0 73
##
## Error Rate: 21.18644 %
## Correctly Predicted: 78.81356 %
## False Positive Rate: 0 %
## False Negative Rate: 25.5102 %
##Question 16
#answer: We can say that the following variables will be good predictors