options(repos = "https://cran.r-project.org")
Chapter 4: Exercises 13 and 14
install.packages("corrplot")
## Installing package into 'C:/Users/admin/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'corrplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\admin\AppData\Local\Temp\RtmpOaYVia\downloaded_packages
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
library(MASS)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.2
library(class)
# Loading the data
data("Weekly")
# Printing a summary of the data
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Scatterplot matrix of all variables in Weekly
pairs(Weekly)
# Plotting Volume against Year
plot(Weekly$Year, Weekly$Volume,
     xlab = "Year",
     ylab = "Volume")
There appears to be a relationship between Year and Volume: as the years progress, average weekly trading volume increases.
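As a quick numeric check of this claim, the correlation can be computed directly; a sketch (output not reproduced here, and corrplot, loaded above but otherwise unused, provides the plot):
# Sketch: quantify the Year-Volume relationship and visualize all pairwise correlations
num_cols <- sapply(Weekly, is.numeric)  # drops the Direction factor
cor(Weekly$Year, Weekly$Volume)         # correlation between Year and Volume
corrplot(cor(Weekly[, num_cols]))       # correlation plot of the numeric columns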
# Performing logistic regression with the five lag variables and Volume
log_reg <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               family = "binomial", data = Weekly)
summary(log_reg)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
In the logistic regression model, only Lag2 is statistically significant, with a p-value of 0.0296; the intercept also appears significant (p-value = 0.0019), indicating a baseline tendency toward "Up" weeks. The remaining predictors have p-values above the conventional 0.05 significance level, suggesting they are not statistically significant.
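To make the Lag2 coefficient concrete, it can be read on the odds scale; a sketch (the 1.06 figure is just exp(0.05844)):
# Sketch: odds-ratio interpretation of the Lag2 coefficient.
# exp(0.05844) is about 1.06, so a one-unit increase in Lag2 multiplies
# the odds of an "Up" week by roughly 1.06, holding the other predictors fixed.
exp(coef(log_reg)["Lag2"])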
# Predicting probabilities on the full data set
prob_c <- predict(log_reg, type = "response")
# Converting probabilities to classes at the 0.5 threshold
pred_c <- ifelse(prob_c > 0.5, "Up", "Down")
# Creating a confusion matrix from the predictions
conf_matrix_c <- table(Predicted = pred_c, Actual = Weekly$Direction)
# Printing the confusion matrix
conf_matrix_c
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
# Accuracy calculation
accuracy_c <- sum(diag(conf_matrix_c)) / sum(conf_matrix_c)
accuracy_c
## [1] 0.5610652
The confusion matrix shows that the logistic regression model correctly predicted "Down" 54 times and "Up" 557 times, but it also produced 430 false positives (predicted "Up" when the market went down) and 48 false negatives (predicted "Down" when the market went up). The overall training accuracy is (54 + 557)/1089, approximately 56.1%, only slightly better than always predicting "Up" (605/1089, about 55.6%).
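The per-class rates behind this can be read off the confusion matrix directly; a sketch:
# Sketch: class-specific rates from the training confusion matrix
conf_matrix_c["Up", "Up"] / sum(conf_matrix_c[, "Up"])       # "Up" weeks caught: 557/605, about 92%
conf_matrix_c["Down", "Down"] / sum(conf_matrix_c[, "Down"]) # "Down" weeks caught: 54/484, about 11%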
# Creating training (1990-2008) and test (2009-2010) data sets
train_data <- Weekly[Weekly$Year <= 2008, ]
test_data <- Weekly[Weekly$Year > 2008, ]
# Fitting a logistic regression model with Lag2 as the only predictor
model_d <- glm(Direction ~ Lag2, family = binomial, data = train_data)
# Predicting probabilities for the test data
prob_d <- predict(model_d, newdata = test_data, type = "response")
# Converting probabilities to classes at the 0.5 threshold
pred_d <- ifelse(prob_d > 0.5, "Up", "Down")
# Creating a confusion matrix
conf_matrix_d <- table(Predicted = pred_d, Actual = test_data$Direction)
conf_matrix_d
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
# Accuracy calculation
accuracy <- sum(diag(conf_matrix_d)) / sum(conf_matrix_d)
accuracy
## [1] 0.625
The confusion matrix shows that, of the 104 test weeks, the model correctly predicted 56 "Up" weeks and 9 "Down" weeks, with 34 false positives and 5 false negatives, a modest improvement over the model trained and evaluated on the full data set. It is notably better at identifying upward weeks (56/61, about 91.8%) than downward weeks (9/43, about 20.9%). The overall test accuracy is 62.5%.
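The class-specific rates quoted above come straight from the confusion matrix; a sketch:
# Sketch: per-class accuracy on the test set
conf_matrix_d["Up", "Up"] / sum(conf_matrix_d[, "Up"])       # 56/61, about 91.8%
conf_matrix_d["Down", "Down"] / sum(conf_matrix_d[, "Down"]) # 9/43, about 20.9%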
# Fitting LDA with Lag2 on the training data
model_e <- lda(Direction ~ Lag2, data = train_data)
# Predicting classes for the test data
pred_e <- predict(model_e, newdata = test_data)$class
# Creating a confusion matrix
conf_matrix_e <- table(Predicted = pred_e, Actual = test_data$Direction)
# Printing the confusion matrix
conf_matrix_e
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
# Accuracy calculation
accuracy_e <- sum(diag(conf_matrix_e)) / sum(conf_matrix_e)
accuracy_e
## [1] 0.625
LDA gives the same result as logistic regression: the confusion matrices are identical, and both achieve a test accuracy of 62.5%.
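Identical confusion matrices do not by themselves guarantee identical predictions, so a direct check is possible; a sketch (output not shown):
# Sketch: fraction of test weeks on which logistic regression and LDA agree;
# a value of 1 would mean the two classifiers make the same call every week.
mean(pred_d == pred_e)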
# Fitting QDA with Lag2 on the training data
model_f <- qda(Direction ~ Lag2, data = train_data)
# Predicting classes for the test data
pred_f <- predict(model_f, newdata = test_data)$class
# Creating a confusion matrix
conf_matrix_f <- table(Predicted = pred_f, Actual = test_data$Direction)
# Printing the confusion matrix
conf_matrix_f
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
# Accuracy calculation
accuracy_f <- sum(diag(conf_matrix_f)) / sum(conf_matrix_f)
accuracy_f
## [1] 0.5865385
QDA is less accurate (58.65%) than the models above. It predicts "Up" for every test week, so its accuracy is simply the proportion of "Up" weeks in the test set (61/104).
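A quick way to confirm this is to tabulate the predictions and inspect the priors stored in the fitted model; a sketch:
# Sketch: confirm that QDA predicts "Up" for every test week
table(pred_f)
# Training class priors stored in the fitted qda object
model_f$prior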
# Combining Lag2 with the other predictors
train_features <- train_data[, c("Lag2", "Lag1", "Lag3", "Lag4", "Lag5", "Volume")]
test_features <- test_data[, c("Lag2", "Lag1", "Lag3", "Lag4", "Lag5", "Volume")]
# Fitting a KNN model with K = 1 using all six predictors
model_g <- knn(train = train_features, test = test_features,
               cl = train_data$Direction, k = 1)
# Creating a confusion matrix
conf_matrix_g <- table(Predicted = model_g, Actual = test_data$Direction)
conf_matrix_g
## Actual
## Predicted Down Up
## Down 21 32
## Up 22 29
# Accuracy calculation
accuracy_g <- sum(diag(conf_matrix_g)) / sum(conf_matrix_g)
accuracy_g
## [1] 0.4807692
The K-Nearest Neighbors (KNN) model with K = 1 achieved an accuracy of only 48.08%, below a coin flip and well below the 58.65% obtained by always predicting "Up".
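One plausible reason (an assumption, not something tested in the original analysis) is that KNN's Euclidean distance lets the unscaled Volume column dominate the Lag variables; a sketch of standardizing first, using the training means and standard deviations:
# Sketch: standardize features before KNN (hypothetical variant; results not shown)
train_scaled <- scale(train_features)
test_scaled <- scale(test_features,
                     center = attr(train_scaled, "scaled:center"),
                     scale = attr(train_scaled, "scaled:scale"))
knn_scaled <- knn(train = train_scaled, test = test_scaled,
                  cl = train_data$Direction, k = 1)
mean(knn_scaled == test_data$Direction)  # accuracy of the scaled-feature variant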
# Fitting a Naive Bayes model using Lag2
model_h <- naiveBayes(Direction ~ Lag2, data = train_data)
# Predicting classes for the test data
pred_h <- predict(model_h, newdata = test_data)
# Creating a confusion matrix
conf_matrix_h <- table(Predicted = pred_h, Actual = test_data$Direction)
conf_matrix_h
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
# Accuracy calculation
accuracy_h <- sum(diag(conf_matrix_h)) / sum(conf_matrix_h)
accuracy_h
## [1] 0.5865385
Naive Bayes is less accurate than LDA and logistic regression and matches QDA exactly (58.65%): like QDA, it predicts "Up" for every test week.
The methods with the highest accuracy are logistic regression and linear discriminant analysis, both at 62.5%.
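The test-set accuracies computed above can be collected in one table for comparison; a sketch:
# Sketch: side-by-side comparison of the five classifiers on the 2009-2010 test set
data.frame(Method = c("Logistic (Lag2)", "LDA (Lag2)", "QDA (Lag2)",
                      "KNN, K=1 (all predictors)", "Naive Bayes (Lag2)"),
           Accuracy = c(accuracy, accuracy_e, accuracy_f, accuracy_g, accuracy_h))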
# Trying a different predictor combination
predictors_combination <- c("Lag1", "Lag4")
# Subsetting the data
training_data_sub <- subset(train_data, select = c("Direction", predictors_combination))
testing_data_sub <- subset(test_data, select = c("Direction", predictors_combination))
# Fitting a logistic regression model
logistic_model <- glm(Direction ~ ., data = training_data_sub, family = "binomial")
# Fitting an LDA model
lda_model <- lda(Direction ~ ., data = training_data_sub)
# Fitting a QDA model
qda_model <- qda(Direction ~ ., data = training_data_sub)
# Fitting a KNN model
k <- 5  # value of K
knn_model <- knn(train = training_data_sub[, -1], test = testing_data_sub[, -1],
                 cl = training_data_sub$Direction, k = k)
# Predicting with the logistic regression model
logistic_pred <- ifelse(predict(logistic_model, newdata = testing_data_sub,
                                type = "response") > 0.5, "Up", "Down")
# Predicting with the LDA model
lda_pred <- predict(lda_model, newdata = testing_data_sub)$class
# Predicting with the QDA model
qda_pred <- predict(qda_model, newdata = testing_data_sub)$class
# Calculating the accuracy of each method
logistic_accuracy <- mean(logistic_pred == testing_data_sub$Direction)
lda_accuracy <- mean(lda_pred == testing_data_sub$Direction)
qda_accuracy <- mean(qda_pred == testing_data_sub$Direction)
knn_accuracy <- mean(knn_model == testing_data_sub$Direction)
# Printing the accuracies
cat("Logistic Regression Accuracy: ", logistic_accuracy)
## Logistic Regression Accuracy: 0.5865385
cat(" LDA Accuracy: ", lda_accuracy)
## LDA Accuracy: 0.5865385
cat(" QDA Accuracy: ", qda_accuracy )
## QDA Accuracy: 0.5480769
cat(" KNN Accuracy :" , knn_accuracy)
## KNN Accuracy : 0.5
This run used Lag1 and Lag4 as predictors of market direction. Logistic regression and LDA achieved comparable accuracies of approximately 58.65%, QDA was slightly lower at around 54.81%, and KNN with K = 5 reached about 50%, the weakest of the four.
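The same train/test split makes it easy to screen several predictor subsets at once; a sketch using logistic regression (the candidate sets here are illustrative, not from the original analysis):
# Sketch: test-set accuracy of logistic regression over a few candidate subsets
candidate_sets <- list(c("Lag2"), c("Lag1", "Lag2"), c("Lag1", "Lag4"), c("Lag2", "Volume"))
sapply(candidate_sets, function(vars) {
  f <- reformulate(vars, response = "Direction")
  m <- glm(f, family = binomial, data = train_data)
  p <- ifelse(predict(m, newdata = test_data, type = "response") > 0.5, "Up", "Down")
  mean(p == test_data$Direction)
})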
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
head(Auto)
library(dplyr)
# Calculating the median of mpg
mpg_median <- median(Auto$mpg)
# Creating the binary mpg01 variable: 1 if mpg is above its median, else 0
Auto <- Auto %>%
  mutate(mpg01 = ifelse(mpg > mpg_median, 1, 0))
# Printing the first few rows
head(Auto)
pairs(Auto)
# Boxplots of each variable against mpg01
par(mfrow = c(3, 3))
boxplot(mpg ~ mpg01, data = Auto)
boxplot(cylinders ~ mpg01, data = Auto)
boxplot(displacement ~ mpg01, data = Auto)
boxplot(horsepower ~ mpg01, data = Auto)
boxplot(weight ~ mpg01, data = Auto)
boxplot(acceleration ~ mpg01, data = Auto)
boxplot(year ~ mpg01, data = Auto)
boxplot(origin ~ mpg01, data = Auto)
Cylinders, displacement, horsepower, weight, acceleration, and year seem most likely to be useful in predicting mpg01.
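The boxplot reading can be cross-checked numerically; a sketch (output not reproduced):
# Sketch: correlation of each numeric variable with mpg01 (name is a factor, so it is dropped)
num_auto <- Auto[, sapply(Auto, is.numeric)]
sort(cor(num_auto)[, "mpg01"])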
set.seed(123)
# Splitting the data 70/30 into training and test sets
indices <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
train_data <- Auto[indices, ]
test_data <- Auto[-indices, ]
# Dimensions of each set
dim(train_data)
## [1] 274 10
dim(test_data)
## [1] 118 10
# Choosing the variables associated with mpg01
selected_vars <- c("cylinders", "displacement", "horsepower", "weight", "acceleration", "year")
# Building an LDA model
lda_model <- lda(mpg01 ~ ., data = train_data[, c("mpg01", selected_vars)])
# Predicting mpg01 on the test data
predictions <- predict(lda_model, newdata = test_data[, selected_vars])
# Test error
test_error <- mean(predictions$class != test_data$mpg01)
cat("Test Error:", test_error, "\n")
## Test Error: 0.1016949
# Using the same variables associated with mpg01
selected_vars <- c("cylinders", "displacement", "horsepower", "weight", "acceleration", "year")
# Building a QDA model
qda_model <- qda(mpg01 ~ ., data = train_data[, c("mpg01", selected_vars)])
# Predicting mpg01 on the test data
predictions_qda <- predict(qda_model, newdata = test_data[, selected_vars])
# Test error
test_error_qda <- mean(predictions_qda$class != test_data$mpg01)
cat("Test Error (QDA):", test_error_qda, "\n")
## Test Error (QDA): 0.1016949
# Building a logistic regression model
logistic_model <- glm(mpg01 ~ ., data = train_data[, c("mpg01", selected_vars)], family = binomial)
# Predicting probabilities of mpg01 = 1 on the test data
predictions_logistic <- predict(logistic_model, newdata = test_data[, selected_vars], type = "response")
# Converting predicted probabilities to class labels (0 or 1)
predictions_logistic_class <- ifelse(predictions_logistic > 0.5, 1, 0)
# Test error
test_error_logistic <- mean(predictions_logistic_class != test_data$mpg01)
cat("Test Error (Logistic Regression):", test_error_logistic, "\n")
## Test Error (Logistic Regression): 0.1186441
# Using the variables that seemed most associated with mpg01 (year is dropped here)
selected_vars <- c("cylinders", "displacement", "horsepower", "weight", "acceleration")
# Fitting a Naive Bayes model
nb_model <- naiveBayes(mpg01 ~ ., data = train_data[, c("mpg01", selected_vars)])
# Predicting mpg01 on the test data
predictions_nb <- predict(nb_model, newdata = test_data[, selected_vars])
# Test error
test_error_nb <- mean(predictions_nb != test_data$mpg01)
cat("Test Error (Naive Bayes):", test_error_nb)
## Test Error (Naive Bayes): 0.1016949
library(class)
# Using the variables that seemed most associated with mpg01
selected_vars <- c("cylinders", "displacement", "horsepower", "weight", "acceleration")
# Defining a function that runs KNN for a given value of K
perform_knn <- function(k_value) {
  # Training and predicting in one step
  knn_model <- knn(train = train_data[, selected_vars],
                   test = test_data[, selected_vars],
                   cl = train_data$mpg01,
                   k = k_value)
  # Test error
  test_error_knn <- mean(knn_model != test_data$mpg01)
  return(test_error_knn)
}
# Values of K to try
k_values <- c(1, 3, 5, 7, 9)
# Running KNN for each value of K and collecting the test errors
test_errors <- sapply(k_values, perform_knn)
# Printing the results
cat("Test Errors (KNN):\n")
## Test Errors (KNN):
print(test_errors)
## [1] 0.1694915 0.1271186 0.1101695 0.1016949 0.1101695
# Best value of K
best_k <- k_values[which.min(test_errors)]
cat("\nBest K value:", best_k, "\n")
##
## Best K value: 7
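A plot of the errors makes the choice of K easier to see; a sketch:
# Sketch: visualize test error as a function of K
plot(k_values, test_errors, type = "b",
     xlab = "K", ylab = "Test error", main = "KNN test error by K")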