library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
as_tibble(Weekly)
## # A tibble: 1,089 x 9
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 1990 0.816 1.57 -3.94 -0.229 -3.48 0.155 -0.27 Down
## 2 1990 -0.27 0.816 1.57 -3.94 -0.229 0.149 -2.58 Down
## 3 1990 -2.58 -0.27 0.816 1.57 -3.94 0.160 3.51 Up
## 4 1990 3.51 -2.58 -0.27 0.816 1.57 0.162 0.712 Up
## 5 1990 0.712 3.51 -2.58 -0.27 0.816 0.154 1.18 Up
## 6 1990 1.18 0.712 3.51 -2.58 -0.27 0.154 -1.37 Down
## 7 1990 -1.37 1.18 0.712 3.51 -2.58 0.152 0.807 Up
## 8 1990 0.807 -1.37 1.18 0.712 3.51 0.132 0.041 Up
## 9 1990 0.041 0.807 -1.37 1.18 0.712 0.144 1.25 Up
## 10 1990 1.25 0.041 0.807 -1.37 1.18 0.134 -2.68 Down
## # ... with 1,079 more rows
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
There are more weeks ending Up (605) than Down (484), so overall we would expect the S&P 500 stock index to have increased between 1990 and 2010. This is confirmed by the fact that both the mean and the median of the weekly percentage return (Today) are positive.
The distributions of the weekly percentage return for Up and Down weeks are similar in shape, with means around 1.25% (Up) and -1.25% (Down). Most weekly percentage returns fall between -2% and 2%.
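As a quick check on those numbers, the per-direction means can be computed directly from the data (a minimal sketch using the dplyr verbs loaded above):
# Mean and spread of the weekly return (Today) within Up and Down weeks
Weekly %>%
  group_by(Direction) %>%
  summarise(mean_return = mean(Today),
            sd_return = sd(Today),
            n = n())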
glm_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = "binomial")
summary(glm_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Among the predictors, only Lag2 is statistically significant (p < 0.05) in explaining the variation of the response Direction.
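The p-values behind that statement can also be pulled out of the fitted model directly (a small sketch, not part of the original output):
# Wald-test p-values for each coefficient; among the lag and volume predictors only Lag2 is below 0.05
coef(summary(glm_fit))[, "Pr(>|z|)"]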
predicted <- ifelse(predict(glm_fit, type = "response") < 0.5, "Down", "Up") %>% factor()
table(predicted, Weekly$Direction)
##
## predicted Down Up
## Down 54 48
## Up 430 557
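From this confusion matrix, the overall training accuracy is (54 + 557) / 1089, roughly 0.561, which can be confirmed directly (a quick sketch):
# Fraction of training weeks classified correctly
mean(predicted == Weekly$Direction)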
train <- Weekly[Weekly$Year <= 2008, ]
test <- Weekly[Weekly$Year > 2008, ]
glm_fit <- glm(Direction ~ Lag2, data = train, family = "binomial")
glm_predicted <- ifelse(predict(glm_fit, newdata = test, type = "response") < 0.5, "Down", "Up") %>% factor()
table(glm_predicted, test$Direction)
##
## glm_predicted Down Up
## Down 9 5
## Up 34 56
#Accuracy
mean(glm_predicted == test$Direction)
## [1] 0.625
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
lda_fit <- lda(Direction ~ Lag2, data = train)
lda_predicted <- predict(lda_fit, newdata = test)$class
table(lda_predicted, test$Direction)
##
## lda_predicted Down Up
## Down 9 5
## Up 34 56
#Accuracy
mean(lda_predicted == test$Direction)
## [1] 0.625
qda_fit <- qda(Direction ~ Lag2, data = train)
qda_predicted <- predict(qda_fit, newdata = test)$class
table(qda_predicted, test$Direction)
##
## qda_predicted Down Up
## Down 0 0
## Up 43 61
#Accuracy
mean(qda_predicted == test$Direction)
## [1] 0.5865385
library(class)
train.X = data.frame(Lag2 = train$Lag2)
test.X = data.frame(Lag2 = test$Lag2)
train.Direction = train$Direction
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, test$Direction)
##
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred == test$Direction)
## [1] 0.5
On this held-out period (2009-2010), the linear classifiers outperform the others: both logistic regression and linear discriminant analysis attain an accuracy of 62.5%. Note that QDA predicts Up for every test week, so its 58.65% accuracy is simply the proportion of Up weeks in the test set.
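Assuming the prediction vectors created above are still in the workspace, the 2009-2010 test accuracies can be gathered in one call (a sketch, not part of the original output):
# Held-out accuracy for each classifier fitted on the 1990-2008 training data
sapply(list(logistic = glm_predicted,
            lda = lda_predicted,
            qda = qda_predicted,
            knn_k1 = knn.pred),
       function(pred) mean(pred == test$Direction))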
Logistic regression with the Lag2:Lag1 interaction
glm_fit <- glm(Direction ~ Lag2:Lag1, data = train, family = "binomial")
glm_predicted <- ifelse(predict(glm_fit, newdata = test, type = "response") < 0.5, "Down", "Up") %>% factor()
table(glm_predicted, test$Direction)
##
## glm_predicted Down Up
## Down 1 1
## Up 42 60
#Accuracy
mean(glm_predicted == test$Direction)
## [1] 0.5865385
LDA with the Lag2:Lag1 interaction
lda_fit <- lda(Direction ~ Lag2:Lag1, data = train)
lda_predicted <- predict(lda_fit, newdata = test)$class
table(lda_predicted, test$Direction)
##
## lda_predicted Down Up
## Down 0 1
## Up 43 60
#Accuracy
mean(lda_predicted == test$Direction)
## [1] 0.5769231
QDA with all the lag variables as predictors
qda_fit <- qda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, data = train)
qda_predicted <- predict(qda_fit, newdata = test)$class
table(qda_predicted, test$Direction)
##
## qda_predicted Down Up
## Down 10 23
## Up 33 38
#Accuracy
mean(qda_predicted == test$Direction)
## [1] 0.4615385
KNN with k = 3, 5, and 10
train.X = data.frame(Lag2 = train$Lag2)
test.X = data.frame(Lag2 = test$Lag2)
train.Direction = train$Direction
set.seed(1)
# k = 3
knn.pred = knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, test$Direction)
##
## knn.pred Down Up
## Down 16 20
## Up 27 41
mean(knn.pred == test$Direction)
## [1] 0.5480769
# k = 5
knn.pred = knn(train.X, test.X, train.Direction, k = 5)
table(knn.pred, test$Direction)
##
## knn.pred Down Up
## Down 15 20
## Up 28 41
mean(knn.pred == test$Direction)
## [1] 0.5384615
# k = 10
knn.pred = knn(train.X, test.X, train.Direction, k = 10)
table(knn.pred, test$Direction)
##
## knn.pred Down Up
## Down 19 19
## Up 24 42
mean(knn.pred == test$Direction)
## [1] 0.5865385
In the end, the initial linear models (logistic regression and LDA) with the single predictor Lag2 still attain the best prediction accuracy (62.5%).
library(ISLR)
as_tibble(Auto)
## # A tibble: 392 x 9
## mpg cylinders displacement horsepower weight acceleration year origin
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18 8 307 130 3504 12 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11 70 1
## 4 16 8 304 150 3433 12 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10 70 1
## 7 14 8 454 220 4354 9 70 1
## 8 14 8 440 215 4312 8.5 70 1
## 9 14 8 455 225 4425 10 70 1
## 10 15 8 390 190 3850 8.5 70 1
## # ... with 382 more rows, and 1 more variable: name <fct>
mpg01 = rep(0, length(Auto$mpg))
mpg01[Auto$mpg > median(Auto$mpg)] = 1
new_auto = data.frame(Auto, mpg01)
attach(new_auto)
## The following object is masked _by_ .GlobalEnv:
##
## mpg01
## The following object is masked from package:ggplot2:
##
## mpg
pairs(new_auto[,-9])
The variables most evidently associated with mpg01 are displacement, horsepower, weight, and acceleration. Since displacement, horsepower, and weight are strongly correlated with one another, using horsepower and weight (together with acceleration) keeps the model simple without losing much information.
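That reading of the pairs plot can be backed up numerically; a sketch of the correlations with mpg01 (column 9, the non-numeric name variable, is dropped):
# Correlation of each numeric variable with mpg01
round(cor(new_auto[, -9])[, "mpg01"], 2)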
auto_train_year = (year%%2 == 0)
auto_test_year = !auto_train_year
auto_train = new_auto[auto_train_year, ]
auto_test = new_auto[auto_test_year, ]
mpg01_test = mpg01[auto_test_year]
dim(auto_train)
## [1] 210 10
dim(auto_test)
## [1] 182 10
auto_lda_fit = lda(mpg01 ~ acceleration + weight + horsepower, data = auto_train)
auto_lda_pred = predict(auto_lda_fit, auto_test)$class
mean(auto_lda_pred != mpg01_test)
## [1] 0.1483516
Error of 14.84%.
auto_qda_fit = qda(mpg01 ~ acceleration + weight + horsepower, data = auto_train)
auto_qda_pred = predict(auto_qda_fit, auto_test)$class
mean(auto_qda_pred != mpg01_test)
## [1] 0.1483516
No change. Same error of 14.84%.
auto_glm_fit = glm(mpg01 ~ acceleration + weight + horsepower, data = auto_train,family = binomial)
auto_glm_pred_values = predict(auto_glm_fit, auto_test, type = "response")
auto_glm_pred = rep(0, length(auto_glm_pred_values))
auto_glm_pred[auto_glm_pred_values > 0.5] = 1
mean(auto_glm_pred != mpg01_test)
## [1] 0.1483516
No change. Same error of 14.84%.
train.X = cbind(acceleration, weight, horsepower)[auto_train_year, ]
test.X = cbind(acceleration, weight, horsepower)[auto_test_year, ]
train.mpg01 = mpg01[auto_train_year]
set.seed(1)
# k=1
knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
mean(knn.pred != mpg01_test)
## [1] 0.1593407
# k=5
knn.pred = knn(train.X, test.X, train.mpg01, k = 5)
mean(knn.pred != mpg01_test)
## [1] 0.1483516
# k=10
knn.pred = knn(train.X, test.X, train.mpg01, k = 10)
mean(knn.pred != mpg01_test)
## [1] 0.1483516
# k=20
knn.pred = knn(train.X, test.X, train.mpg01, k = 20)
mean(knn.pred != mpg01_test)
## [1] 0.1538462
# k=50
knn.pred = knn(train.X, test.X, train.mpg01, k = 50)
mean(knn.pred != mpg01_test)
## [1] 0.1428571
# k=100
knn.pred = knn(train.X, test.X, train.mpg01, k = 100)
mean(knn.pred != mpg01_test)
## [1] 0.1428571
# k=150
knn.pred = knn(train.X, test.X, train.mpg01, k = 150)
mean(knn.pred != mpg01_test)
## [1] 0.1923077
k = 50 and k = 100 give the smallest KNN test error, 14.29%, which is also the smallest error rate among all the classifiers tested on this split.
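Rather than repeating the block for every k, the sweep can be written as a single loop; a sketch over the same values of k used above (because knn breaks ties at random, the numbers may differ slightly from the individual runs):
# KNN test error for a range of k in one pass
ks <- c(1, 5, 10, 20, 50, 100, 150)
set.seed(1)
sapply(ks, function(k) {
  pred <- knn(train.X, test.X, train.mpg01, k = k)
  mean(pred != mpg01_test)
})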
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
detach(new_auto)
attach(Boston)
bos_lm <- lm(crim~., data = Boston)
summary(bos_lm)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.924 -2.120 -0.353 1.019 75.051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.033228 7.234903 2.354 0.018949 *
## zn 0.044855 0.018734 2.394 0.017025 *
## indus -0.063855 0.083407 -0.766 0.444294
## chas -0.749134 1.180147 -0.635 0.525867
## nox -10.313535 5.275536 -1.955 0.051152 .
## rm 0.430131 0.612830 0.702 0.483089
## age 0.001452 0.017925 0.081 0.935488
## dis -0.987176 0.281817 -3.503 0.000502 ***
## rad 0.588209 0.088049 6.680 6.46e-11 ***
## tax -0.003780 0.005156 -0.733 0.463793
## ptratio -0.271081 0.186450 -1.454 0.146611
## black -0.007538 0.003673 -2.052 0.040702 *
## lstat 0.126211 0.075725 1.667 0.096208 .
## medv -0.198887 0.060516 -3.287 0.001087 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4396
## F-statistic: 31.47 on 13 and 492 DF, p-value: < 2.2e-16
The predictors significant at the 5% level are zn, dis, rad, black, and medv.
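Those are the coefficients with p-values below 0.05; a sketch for extracting them programmatically from the fit rather than reading the table:
# Predictors with Wald p-values below 0.05 (intercept excluded)
pvals <- summary(bos_lm)$coefficients[, "Pr(>|t|)"]
names(pvals)[pvals < 0.05 & names(pvals) != "(Intercept)"]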
violent = rep(0, length(crim))
violent[crim > median(crim)] = 1
Boston = data.frame(Boston, violent)
trainb = 1:(dim(Boston)[1]/2)
testb = (dim(Boston)[1]/2 + 1):dim(Boston)[1]
Boston_train = Boston[trainb, ]
Boston_test = Boston[testb, ]
violent_test = violent[testb]
dim(Boston_train)
## [1] 253 15
dim(Boston_test)
## [1] 253 15
Logistic Regression
bos_glm_fit = glm(violent ~ zn + dis + rad + black + medv, data = Boston_train, family = binomial)
bos_glm_pred_values = predict(bos_glm_fit, Boston_test, type = "response")
bos_glm_pred = rep(0, length(bos_glm_pred_values))
bos_glm_pred[bos_glm_pred_values > 0.5] = 1
mean(bos_glm_pred == violent_test)
## [1] 0.8577075
LDA with the significant predictors only
bos_lda_fit = lda(violent ~ zn + dis + rad + black + medv, data = Boston_train)
bos_lda_pred = predict(bos_lda_fit, Boston_test)$class
mean(bos_lda_pred == violent_test)
## [1] 0.8814229
LDA with all predictors
bos_lda_fit = lda(violent ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
                    ptratio + black + lstat + medv, data = Boston_train)
bos_lda_pred = predict(bos_lda_fit, Boston_test)$class
mean(bos_lda_pred == violent_test)
## [1] 0.8656126
QDA
bos_qda_fit = qda(violent ~ zn + dis + rad + black + medv, data = Boston_train)
bos_qda_pred = predict(bos_qda_fit, Boston_test)$class
mean(bos_qda_pred == violent_test)
## [1] 0.4940711
KNN
bos_train_knn_X = cbind(zn, dis, rad, black, medv)[trainb, ]
bos_test_knn_X = cbind(zn, dis, rad, black, medv)[testb, ]
train_knn_violent = violent[trainb]
set.seed(1)
# k=1
knn.pred = knn(bos_train_knn_X, bos_test_knn_X, train_knn_violent, k = 1)
mean(knn.pred == violent_test)
## [1] 0.743083
# k=5
knn.pred = knn(bos_train_knn_X, bos_test_knn_X, train_knn_violent, k = 5)
mean(knn.pred == violent_test)
## [1] 0.7905138
# k=10
knn.pred = knn(bos_train_knn_X, bos_test_knn_X, train_knn_violent, k = 10)
mean(knn.pred == violent_test)
## [1] 0.7391304
# k=50
knn.pred = knn(bos_train_knn_X, bos_test_knn_X, train_knn_violent, k = 50)
mean(knn.pred == violent_test)
## [1] 0.7509881
# k=100
knn.pred = knn(bos_train_knn_X, bos_test_knn_X, train_knn_violent, k = 100)
mean(knn.pred == violent_test)
## [1] 0.5652174
The highest accuracy among the tested classifiers was 88.14%, attained by linear discriminant analysis using only the significant predictors (zn, dis, rad, black, medv).