#Beginning Question10
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
#(A.)
pairs(Weekly[,1:8])
#(Figure: scatterplot matrix of Year, the five lag variables, Volume, and Today; plot output not shown.)
cor(Weekly[,1:8])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
#Both the scatterplot matrix and the correlation matrix show essentially one strong relationship: a positive correlation of about 0.84 between Year and Volume. The lag variables are nearly uncorrelated with Today.
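#A quick way to visualize that relationship (an added sketch, not part of the original answer):
plot(Weekly$Year, Weekly$Volume, xlab="Year", ylab="Volume")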
#(B.) & (C.)
logistic_fit = glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data = Weekly, family = binomial)
summary(logistic_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
logistic_probs = predict(logistic_fit, type = "response")
logistic_preds = rep("Down", 1089)
logistic_preds[logistic_probs>0.5] = "Up"
attach(Weekly)
table(logistic_preds, Direction)
## Direction
## logistic_preds Down Up
## Down 54 48
## Up 430 557
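#The table above implies an overall fraction of correct training predictions of (54+557)/1089, about 56.1%; a direct check (added here, output not shown):
mean(logistic_preds==Direction)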
#(D.)
train = (Year<2009)
Test = Weekly[!train ,]
Test_Direction=Direction[!train]
logistic_fit2 = glm(Direction~Lag2, data = Weekly, family = binomial, subset= train)
logistic_probs2 = predict(logistic_fit2,Test, type = "response")
logistic_preds2 = rep("Down", 104)
logistic_preds2[logistic_probs2>0.5] = "Up"
table(logistic_preds2, Test_Direction)
## Test_Direction
## logistic_preds2 Down Up
## Down 9 5
## Up 34 56
#(E.)
library(MASS)
lda_fit = lda(Direction~Lag2, data = Weekly, subset = train)
lda_pred = predict(lda_fit, Test)
lda_class = lda_pred$class
table(lda_class,Test_Direction)
## Test_Direction
## lda_class Down Up
## Down 9 5
## Up 34 56
#(F.)
qda_fit = qda(Direction~Lag2, data = Weekly, subset = train)
qda_pred = predict(qda_fit,Test)
qda_class = qda_pred$class
table(qda_class, Test_Direction)
## Test_Direction
## qda_class Down Up
## Down 0 0
## Up 43 61
#QDA predicts Up for every week in the test set.
#(G.)
library(class)
set.seed(1)
train_X = as.matrix(Weekly[train,3])   #Lag2 only, as a one-column matrix
test_X = as.matrix(Weekly[!train,3])
train_direction = Direction[train]
knn_pred = knn(train_X, test_X, train_direction, k=1)
table(knn_pred, Test_Direction)
## Test_Direction
## knn_pred Down Up
## Down 21 30
## Up 22 31
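#(H.) Overall fraction correct on the held-out 2009-2010 data (a hedged recap added here; the values follow from the tables above):
mean(logistic_preds2==Test_Direction)  #logistic with Lag2: ~0.625
mean(lda_class==Test_Direction)        #LDA: ~0.625
mean(qda_class==Test_Direction)        #QDA: ~0.587
mean(knn_pred==Test_Direction)         #KNN with K=1: 0.5
#Logistic regression and LDA on Lag2 alone give the best test accuracy.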
#(I.)
knn_pred2 = knn(train_X, test_X, train_direction, k=3)
table(knn_pred2, Test_Direction)
## Test_Direction
## knn_pred2 Down Up
## Down 16 19
## Up 27 42
knn_pred3 = knn(train_X, test_X, train_direction, k=6)
table(knn_pred3, Test_Direction)
## Test_Direction
## knn_pred3 Down Up
## Down 15 20
## Up 28 41
lda_fit2 = lda(Direction~Lag1+Lag2+Lag3+Lag4+Lag5, data=Weekly, subset=train)
lda_pred2 = predict(lda_fit2,Test)
lda_class2 = lda_pred2$class
table(lda_class2, Test_Direction)
## Test_Direction
## lda_class2 Down Up
## Down 9 13
## Up 34 48
logistic_fit3 = glm(Direction~Lag2+ I(Lag2^2), data=Weekly, family=binomial, subset=train)
logistic_probs3 = predict(logistic_fit3,Test, type="response")
logistic_preds3 = rep("Down", 104)
logistic_preds3[logistic_probs3>0.5] = "Up"
logistic_fit4 = glm(Direction~Lag2 + I(Lag1^2), data=Weekly, family=binomial, subset=train)
logistic_probs4 = predict(logistic_fit4,Test, type="response")
logistic_preds4 = rep("Down", 104)
logistic_preds4[logistic_probs4>0.5] = "Up"
table(logistic_preds4,Test_Direction)
## Test_Direction
## logistic_preds4 Down Up
## Down 8 2
## Up 35 59
#The Lag2 + I(Lag1^2) model gets (8+59)/104, about 64.4%, correct on the test set, a slight improvement over the Lag2-only model.
#Ending Question10
#Beginning Question11
library(ISLR)
#(A.)
df = Auto
df$mpg01 = NA
median_mpg = median(df$mpg)
for(i in 1:dim(df)[1]){
  if(df$mpg[i] > median_mpg){
    df$mpg01[i] = 1
  } else {
    df$mpg01[i] = 0
  }
}
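#The loop above can be replaced by a single vectorized call (an equivalent alternative, added as a sketch):
df$mpg01 = ifelse(df$mpg > median_mpg, 1, 0)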
#Helper that moves the named columns to the end of the data frame.
movetolast = function(data, move) {
  data[c(setdiff(names(data), move), move)]
}
#(B.)
df = movetolast(df, c("name"))
pairs(df[,1:9])
#(Figure: scatterplot matrix of the Auto variables and mpg01; plot output not shown.)
cor(df[,1:9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
#mpg01 is strongly positively correlated with mpg (0.84, by construction) and strongly negatively correlated with cylinders, displacement, horsepower, and weight.
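#A graphical check of the strongest predictors (an added sketch; horsepower or cylinders could be substituted):
boxplot(displacement~mpg01, data=df, main="Displacement by mpg01")
boxplot(weight~mpg01, data=df, main="Weight by mpg01")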
#(C.)
require(caTools)
## Loading required package: caTools
## Warning: package 'caTools' was built under R version 3.6.3
set.seed(123)
sample_data = sample.split(df$mpg,SplitRatio =0.70)
train2 = subset(df, sample_data==TRUE)
test2 = subset(df, sample_data==FALSE)
#(D.)
lda_fit3 = lda(mpg01~cylinders+displacement+horsepower+weight, data=train2)
lda_pred3 = predict(lda_fit3,test2)
predictions = lda_pred3$class
actual = test2$mpg01
table(predictions, actual)
## actual
## predictions 0 1
## 0 48 4
## 1 8 44
mean(predictions!=actual)
## [1] 0.1153846
#The test error of this model is 11.5%.
#(E.)
qda_fit2 = qda(mpg01~cylinders+displacement+horsepower+weight, data=train2)
qda_pred2 = predict(qda_fit2,test2)
predictions = qda_pred2$class
table(predictions, actual)
## actual
## predictions 0 1
## 0 50 4
## 1 6 44
mean(predictions!=actual)
## [1] 0.09615385
#The QDA model has a test error of 9.6%.
#(F.)
logistic_fit5 = glm(mpg01~cylinders+displacement+horsepower+weight, data=train2, family=binomial)
logistic_probs5 = predict(logistic_fit5,test2, type="response")
logistic_preds5 = rep(0,length(test2$mpg01))
logistic_preds5[logistic_probs5>0.5] = 1
table(logistic_preds5, actual)
## actual
## logistic_preds5 0 1
## 0 50 4
## 1 6 44
mean(logistic_preds5!=actual)
## [1] 0.09615385
#The logistic model has a 9.6% test error rate.
#(G.)
train2_matrix = data.matrix(train2[,c("cylinders","displacement","weight","horsepower")])
test2_matrix = data.matrix(test2[,c("cylinders","displacement","weight","horsepower")])
train2_y = data.matrix(train2$mpg01)
test2_y = data.matrix(test2$mpg01)
knn_pred4 = knn(train2_matrix, test2_matrix, train2_y, k=1)
table(knn_pred4, test2_y)
## test2_y
## knn_pred4 0 1
## 0 45 9
## 1 11 39
mean(knn_pred4!=test2_y)
## [1] 0.1923077
#KNN with K=1 has a test error of about 19.2%.
knn_pred5 = knn(train2_matrix, test2_matrix, train2_y, k=3)
table(knn_pred5, test2_y)
## test2_y
## knn_pred5 0 1
## 0 45 5
## 1 11 43
mean(knn_pred5!=test2_y)
## [1] 0.1538462
#KNN with K=3 has a test error of about 15.4%.
knn_pred6 = knn(train2_matrix, test2_matrix, train2_y, k=10)
table(knn_pred6, test2_y)
## test2_y
## knn_pred6 0 1
## 0 45 4
## 1 11 44
mean(knn_pred6!=test2_y)
## [1] 0.1442308
#K=10 improves the test error slightly, to 14.4%.
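#A compact way to scan several values of K (an added sketch; knn breaks ties at random, so results depend on the seed):
set.seed(123)
for(k in c(1,3,5,10,20)){
  pred = knn(train2_matrix, test2_matrix, train2_y, k=k)
  cat("K =", k, "test error:", mean(pred!=test2_y), "\n")
}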
#Ending Question11
#Beginning Question13
library(ISLR)
library(MASS)
library(class)
require(caTools)
boston_df = Boston
median_crim =median(Boston$crim)
boston_df$crim01 = with(Boston, ifelse(crim > median_crim, 1, 0))
cor(boston_df$crim01,boston_df)
## crim zn indus chas nox rm age
## [1,] 0.4093955 -0.436151 0.6032602 0.07009677 0.7232348 -0.1563718 0.6139399
## dis rad tax ptratio black lstat medv
## [1,] -0.6163416 0.6197862 0.6087413 0.2535684 -0.3512109 0.4532627 -0.2630167
## crim01
## [1,] 1
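#Ranking predictors by absolute correlation with crim01 (an added sketch): nox, rad, dis, age, tax, and indus stand out.
sort(abs(cor(boston_df)[,"crim01"]), decreasing=TRUE)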
set.seed(123)
boston_sample = sample.split(boston_df$crim01, SplitRatio =0.80)
boston_train = subset(boston_df, boston_sample==TRUE)
boston_test = subset(boston_df, boston_sample==FALSE)
boston_lr = glm(crim01~.-chas-crim , data=boston_train, family=binomial)
summary(boston_lr)
##
## Call:
## glm(formula = crim01 ~ . - chas - crim, family = binomial, data = boston_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8978 -0.1705 -0.0002 0.0025 3.5340
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -39.431670 7.164710 -5.504 3.72e-08 ***
## zn -0.102660 0.038683 -2.654 0.007958 **
## indus -0.069107 0.047124 -1.467 0.142511
## nox 48.381844 8.123415 5.956 2.59e-09 ***
## rm 0.084088 0.798160 0.105 0.916096
## age 0.023671 0.013566 1.745 0.081014 .
## dis 0.820828 0.255752 3.209 0.001330 **
## rad 0.631930 0.164647 3.838 0.000124 ***
## tax -0.005362 0.002838 -1.889 0.058826 .
## ptratio 0.334091 0.133126 2.510 0.012087 *
## black -0.007506 0.005230 -1.435 0.151230
## lstat 0.076788 0.051698 1.485 0.137462
## medv 0.149509 0.077409 1.931 0.053430 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 560.06 on 403 degrees of freedom
## Residual deviance: 169.97 on 391 degrees of freedom
## AIC: 195.97
##
## Number of Fisher Scoring iterations: 9
boston_probs = predict(boston_lr, boston_test, type="response")
boston_preds = rep(0,length(boston_test$crim01))
boston_preds[boston_probs>0.5] = 1
actual = boston_test$crim01
table(boston_preds, actual)
## actual
## boston_preds 0 1
## 0 46 5
## 1 5 46
mean(boston_preds!=actual)
## [1] 0.09803922
#Test error rate of 9.8%.
#Refit using only the predictors that were clearly significant above (zn, nox, dis, rad, ptratio):
boston_lr2 = glm(crim01~zn+nox+dis+rad+ptratio, data=boston_train,family=binomial)
boston_probs2 = predict(boston_lr2, boston_test, type="response")
boston_preds2 = rep(0,length(boston_test$crim01))
boston_preds2[boston_probs2>0.5] = 1
actual =boston_test$crim01
table(boston_preds2, actual)
## actual
## boston_preds2 0 1
## 0 45 9
## 1 6 42
mean(boston_preds2!=actual)
## [1] 0.1470588
#Test error rises to 14.7% when using this subset.
boston_lda = lda(crim01~.-chas-crim ,data=boston_train)
boston_preds2 = predict(boston_lda, boston_test)
table(boston_preds2$class, actual)
## actual
## 0 1
## 0 49 12
## 1 2 39
mean(boston_preds2$class!=actual)
## [1] 0.1372549
#Test error rate of 13.7%.
boston_qda = qda(crim01~.-chas-crim , data=boston_train)
boston_preds3 = predict(boston_qda, boston_test)
table(boston_preds3$class, actual)
## actual
## 0 1
## 0 50 9
## 1 1 42
mean(boston_preds3$class!=actual)
## [1] 0.09803922
#Test error rate of 9.8%.
boston_train2 = data.matrix(subset(boston_train, select=-c(crim,chas)))
boston_test2 = data.matrix(subset(boston_test, select=-c(crim,chas)))
train2_y = data.matrix(boston_train[,15])
test2_y = data.matrix(boston_test[,15])
boston_knn1 = knn(boston_train2, boston_test2, train2_y,k=1)
table(boston_knn1, test2_y)
## test2_y
## boston_knn1 0 1
## 0 47 1
## 1 4 50
mean(boston_knn1!=test2_y)
## [1] 0.04901961
#Test error rate of 4.9%.
boston_knn2 = knn(boston_train2, boston_test2, train2_y, k=3)
table(boston_knn2, test2_y)
## test2_y
## boston_knn2 0 1
## 0 45 1
## 1 6 50
mean(boston_knn2!=test2_y)
## [1] 0.06862745
#Higher test error rate of 6.9%.
boston_knn3 = knn(boston_train2, boston_test2, train2_y, k=10)
table(boston_knn3, test2_y)
## test2_y
## boston_knn3 0 1
## 0 43 5
## 1 8 46
mean(boston_knn3!=test2_y)
## [1] 0.127451
#Test error rate of 12.7%.
#QDA and logistic regression (both 9.8%) perform better than LDA (13.7%), but KNN with K=1 performs best of all, with a 4.9% test error.
#Ending Question13