Homework 3

##Question 13 ##a)

library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.1.3

head(Weekly)

##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down

pairs(Weekly)

cor(subset(Weekly, select = -Direction))

##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

#Answer: yes there seems to be a relationship between year and volume

##b

logit_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                   data = Weekly,
                   family = binomial)
summary(logit_model)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

#answer: Lag2 has the highest significance level with a p-value of less than 0.05

##C

show_model_performance <- function(predicted_status, observed_status) {
  confusion_matrix <- table(predicted_status, 
                            observed_status, 
                            dnn = c("predicted Status", "observed Status"))
  print(confusion_matrix)
  error_rate <- mean(predicted_status != observed_status)
  cat("\n")
  cat("         Error Rate:", 100 * error_rate, "%\n")
  cat("Correctly Predicted:", 100 * (1-error_rate), "%\n")
  cat("False Positive Rate:", 100 * confusion_matrix[2,1] / sum(confusion_matrix[,1]), "%\n")
  cat("False Negative Rate:", 100 * confusion_matrix[1,2] / sum(confusion_matrix[,2]), "%\n")  
}
predict_glm_direction <- function(model, newdata = NULL) {
  predictions <- predict(model, newdata, type="response")
  return(as.factor(ifelse(predictions < 0.5, "Down", "Up")))
}
predicted_direction <- predict_glm_direction(logit_model)
show_model_performance(predicted_direction, Weekly$Direction)

##                 observed Status
## predicted Status Down  Up
##             Down   54  48
##             Up    430 557
## 
##          Error Rate: 43.89348 %
## Correctly Predicted: 56.10652 %
## False Positive Rate: 88.84298 %
## False Negative Rate: 7.933884 %

#Answer: the trend is relatively correct

##d)Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).

train <- (Weekly$Year < 2009)
train_set <- Weekly[train,]
test_set <- Weekly[!train,]

logit_model <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)

predicted_direction <- predict_glm_direction(logit_model, test_set)
show_model_performance(predicted_direction, test_set$Direction)

##                 observed Status
## predicted Status Down Up
##             Down    9  5
##             Up     34 56
## 
##          Error Rate: 37.5 %
## Correctly Predicted: 62.5 %
## False Positive Rate: 79.06977 %
## False Negative Rate: 8.196721 %

##e)

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ISLR2':
## 
##     Boston

lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)
predictions <- predict(lda_model, test_set, type="response")
show_model_performance(predictions$class, test_set$Direction)

##                 observed Status
## predicted Status Down Up
##             Down    9  5
##             Up     34 56
## 
##          Error Rate: 37.5 %
## Correctly Predicted: 62.5 %
## False Positive Rate: 79.06977 %
## False Negative Rate: 8.196721 %

#Answer: the data could predict 62.5%

##f

qda_model <- qda(Direction ~ Lag2, data = Weekly, subset = train)
predictions <- predict(qda_model, test_set, type="response")
show_model_performance(predictions$class, test_set$Direction)

##                 observed Status
## predicted Status Down Up
##             Down    0  0
##             Up     43 61
## 
##          Error Rate: 41.34615 %
## Correctly Predicted: 58.65385 %
## False Positive Rate: 100 %
## False Negative Rate: 0 %

#answer: the data could correctly predict 58.65%

##g

library(class)
run_knn <- function(train, test, train_class, test_class, k) {
  set.seed(12345)
  predictions <- knn(train, test, train_class, k)
  cat("KNN: k =", k, "\n")
  show_model_performance(predictions, test_class)
}
train_matrix <- as.matrix(train_set$Lag2)
test_matrix <- as.matrix(test_set$Lag2)
run_knn(train_matrix, test_matrix, train_set$Direction, test_set$Direction, k = 1)

## KNN: k = 1 
##                 observed Status
## predicted Status Down Up
##             Down   21 29
##             Up     22 32
## 
##          Error Rate: 49.03846 %
## Correctly Predicted: 50.96154 %
## False Positive Rate: 51.16279 %
## False Negative Rate: 47.54098 %

#answer: the data could correctly predict 50.96%

##h

train=(Weekly$Year<2009)
weekly09=Weekly[!train ,]
direction09=Weekly$Direction[!train]
dim(weekly09)

## [1] 104   9

glm_fit=glm(Direction~Lag2, data = Weekly,family=binomial ,subset=train)
glm_probability=predict (glm_fit,weekly09, type="response")
glm_prediction=rep("Down",104)
glm_prediction[glm_probability >.5]=" Up"
table(glm_prediction ,direction09)

##               direction09
## glm_prediction Down Up
##            Up    34 56
##           Down    9  5

library(e1071)
library(ISLR2)
nbayes=naiveBayes(Direction~Lag2 ,data=Weekly ,subset=train)
nbayes

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Conditional probabilities:
##       Lag2
## Y             [,1]     [,2]
##   Down -0.03568254 2.199504
##   Up    0.26036581 2.317485

nbayes.class=predict(nbayes ,weekly09)
table(nbayes.class ,direction09)

##             direction09
## nbayes.class Down Up
##         Down    0  0
##         Up     43 61

##i

#answer: logistic regression seems to correctly predict 62.5% of the data correctly

##j

logit_model <- glm(Direction ~ Lag1 * Lag2, data = Weekly, family = binomial, subset = train)

predicted_direction <- predict_glm_direction(logit_model, test_set)
show_model_performance(predicted_direction, test_set$Direction)

##                 observed Status
## predicted Status Down Up
##             Down    7  8
##             Up     36 53
## 
##          Error Rate: 42.30769 %
## Correctly Predicted: 57.69231 %
## False Positive Rate: 83.72093 %
## False Negative Rate: 13.11475 %

lda_model <- lda(Direction ~ Lag1 * Lag2, data = Weekly, subset = train)
predictions <- predict(lda_model, test_set, type="response")
show_model_performance(predictions$class, test_set$Direction)

##                 observed Status
## predicted Status Down Up
##             Down    7  8
##             Up     36 53
## 
##          Error Rate: 42.30769 %
## Correctly Predicted: 57.69231 %
## False Positive Rate: 83.72093 %
## False Negative Rate: 13.11475 %

qda_model <- qda(Direction ~ Lag2 + sqrt(abs(Lag2)), data = Weekly, subset = train)
predictions <- predict(qda_model, test_set, type="response")
show_model_performance(predictions$class, test_set$Direction)

##                 observed Status
## predicted Status Down Up
##             Down   12 13
##             Up     31 48
## 
##          Error Rate: 42.30769 %
## Correctly Predicted: 57.69231 %
## False Positive Rate: 72.09302 %
## False Negative Rate: 21.31148 %

run_knn(train_matrix, test_matrix, train_set$Direction, test_set$Direction, k = 10)

## KNN: k = 10 
##                 observed Status
## predicted Status Down Up
##             Down   18 21
##             Up     25 40
## 
##          Error Rate: 44.23077 %
## Correctly Predicted: 55.76923 %
## False Positive Rate: 58.13953 %
## False Negative Rate: 34.42623 %

##Question 14) ##a)

Auto$mpg01 <- Auto$mpg > median(Auto$mpg)
head(Auto$mpg01, n = 20)

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE

##b)

cor(subset(Auto, select = -name))

##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000

pairs(Auto)

##c

train <- sample(nrow(Auto) * 0.7)
train_set <- Auto[train, ]
test_set <- Auto[-train, ]

##d

lda_model <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, 
                 data = Auto, 
                 subset = train)

predictions <- predict(lda_model, test_set)
show_model_performance(predictions$class, test_set$mpg01)

##                 observed Status
## predicted Status FALSE TRUE
##            FALSE    18   16
##            TRUE      2   82
## 
##          Error Rate: 15.25424 %
## Correctly Predicted: 84.74576 %
## False Positive Rate: 10 %
## False Negative Rate: 16.32653 %

##e

qda_model <- qda(mpg01 ~ cylinders + weight + displacement + horsepower, 
                 data = Auto, 
                 subset = train)

predictions <- predict(qda_model, test_set)
show_model_performance(predictions$class, test_set$mpg01)

##                 observed Status
## predicted Status FALSE TRUE
##            FALSE    18   19
##            TRUE      2   79
## 
##          Error Rate: 17.79661 %
## Correctly Predicted: 82.20339 %
## False Positive Rate: 10 %
## False Negative Rate: 19.38776 %

#answer: the test rate is 17.79%

##f

logit_model <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, 
                   data = Auto, 
                   family = binomial,
                   subset = train)

predictions <- predict(logit_model, test_set, type = "response")
show_model_performance(predictions > 0.5, test_set$mpg01)

##                 observed Status
## predicted Status FALSE TRUE
##            FALSE    20   21
##            TRUE      0   77
## 
##          Error Rate: 17.79661 %
## Correctly Predicted: 82.20339 %
## False Positive Rate: 0 %
## False Negative Rate: 21.42857 %

##g

library(e1071)
library(ISLR2)
nbayes=naiveBayes(cylinders ~ mpg01 ,data=Auto ,subset=train)
nbayes

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##           3           4           5           6           8 
## 0.010948905 0.416058394 0.003649635 0.233576642 0.335766423 
## 
## Conditional probabilities:
##    mpg01
## Y       FALSE      TRUE
##   3 1.0000000 0.0000000
##   4 0.1578947 0.8421053
##   5 1.0000000 0.0000000
##   6 0.9687500 0.0312500
##   8 1.0000000 0.0000000

##h

vars <- c("cylinders", "weight", "displacement", "horsepower")
train_matrix <- as.matrix(train_set[, vars])
test_matrix <- as.matrix(test_set[, vars])

predictions <- knn(train_matrix, test_matrix, train_set$mpg01, 1)

run_knn(train_matrix, test_matrix, train_set$mpg01, test_set$mpg01, k = 1)

## KNN: k = 1 
##                 observed Status
## predicted Status FALSE TRUE
##            FALSE    18   24
##            TRUE      2   74
## 
##          Error Rate: 22.0339 %
## Correctly Predicted: 77.9661 %
## False Positive Rate: 10 %
## False Negative Rate: 24.4898 %

run_knn(train_matrix, test_matrix, train_set$mpg01, test_set$mpg01, k = 10)

## KNN: k = 10 
##                 observed Status
## predicted Status FALSE TRUE
##            FALSE    20   22
##            TRUE      0   76
## 
##          Error Rate: 18.64407 %
## Correctly Predicted: 81.35593 %
## False Positive Rate: 0 %
## False Negative Rate: 22.44898 %

run_knn(train_matrix, test_matrix, train_set$mpg01, test_set$mpg01, k = 100)

## KNN: k = 100 
##                 observed Status
## predicted Status FALSE TRUE
##            FALSE    20   25
##            TRUE      0   73
## 
##          Error Rate: 21.18644 %
## Correctly Predicted: 78.81356 %
## False Positive Rate: 0 %
## False Negative Rate: 25.5102 %

##Question 16

#answer: We can say that the following variables will be good predictors

Homework 3

Adrian Joshua

2022-12-05