library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
lr.1 = glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,family = "binomial", data=Weekly)
summary(lr.1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
probs = predict(lr.1, type="response")
preds = rep("Down", 1089)
preds[probs > 0.5] = "Up"
table(preds, Weekly$Direction)
##
## preds Down Up
## Down 54 48
## Up 430 557
Formula to determine the percentage of current predictions: (54 + 557) / (54+ 48 + 430 + 557) = .5611 or 56.11%
This states that the weekly market is correctly predicted 56.11% of the time
Up Trends correctly predicts: (557) / (48 + 557) = .9207 or 92.07%
Down trend correctly predicts: (54) / (430 +54) = .1115 or 11.15%
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
train=Weekly$Year <= 2008
Weekly.test=Weekly[!train,]
logit.fit = glm(Direction ~ Lag2, family=binomial, data=Weekly, subset=train)
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
summary(logit.fit)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
glm.probs=predict(logit.fit,Weekly.test,type="response")
glm.pred=rep("Down",nrow(Weekly.test))
glm.pred[glm.probs > 0.50]="Up"
table(glm.pred,Weekly.test$Direction)
##
## glm.pred Down Up
## Down 9 5
## Up 34 56
mean(glm.pred==Weekly.test$Direction)
## [1] 0.625
library(caret)
library(MASS)
lda.fit = lda(Direction ~ Lag2, data=Weekly, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
lda.class=predict(lda.fit,Weekly.test)$class
table(lda.class,Weekly.test$Direction)
##
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class==Weekly.test$Direction)
## [1] 0.625
require(MASS)
library(caret)
qda.fit = qda(Direction ~ Lag2, data=Weekly, subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class=predict(qda.fit,Weekly.test)$class
table(qda.class,Weekly.test$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==Weekly.test$Direction)
## [1] 0.5865385
library(class)
train.X=Weekly[train,"Lag2",drop=F]
test.X=Weekly[!train,"Lag2",drop=F]
train.Direction=Weekly[train,"Direction",drop=T]
test.Direction=Weekly[!train,"Direction",drop=T]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,test.Direction)
## test.Direction
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred==test.Direction)
## [1] 0.5
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=4)
table(knn.pred,test.Direction)
## test.Direction
## knn.pred Down Up
## Down 20 17
## Up 23 44
mean(knn.pred==test.Direction)
## [1] 0.6153846
qda.fit = qda(Direction ~ Lag2, data=Weekly, subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class=predict(qda.fit,Weekly.test)$class
table(qda.class,Weekly.test$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==Weekly.test$Direction)
## [1] 0.5865385
train=Weekly$Year <= 2008
Weekly.test=Weekly[!train,]
logit.fit = glm(Direction ~ Lag1+Lag2+Volume, family=binomial, data=Weekly, subset=train)
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
summary(logit.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Volume, family = binomial,
## data = Weekly, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4681 -1.2581 0.9929 1.0840 1.5339
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.29792 0.09136 3.261 0.00111 **
## Lag1 -0.05975 0.02917 -2.048 0.04054 *
## Lag2 0.04774 0.02941 1.624 0.10446
## Volume -0.07093 0.05263 -1.348 0.17777
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1345.1 on 981 degrees of freedom
## AIC: 1353.1
##
## Number of Fisher Scoring iterations: 4
glm.probs=predict(logit.fit,Weekly.test,type="response")
glm.pred=rep("Down",nrow(Weekly.test))
glm.pred[glm.probs > 0.50]="Up"
table(glm.pred,Weekly.test$Direction)
##
## glm.pred Down Up
## Down 27 33
## Up 16 28
mean(glm.pred==Weekly.test$Direction)
## [1] 0.5288462
data(Auto)
library(MASS)
library(ISLR)
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg),1,0)
pairs(Auto[,-9])
train <- (Auto$year %% 2 == 0)
train.auto <- Auto[train,]
test.auto <- Auto[-train,]
autolda.fit <- lda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train.auto)
autolda.pred <- predict(autolda.fit, test.auto)
table(autolda.pred$class, test.auto$mpg01)
##
## 0 1
## 0 169 7
## 1 26 189
autolda.fit <- lda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train.auto)
autolda.pred <- predict(autolda.fit, test.auto)
table(autolda.pred$class, test.auto$mpg01)
##
## 0 1
## 0 169 7
## 1 26 189
mean(autolda.pred$class != test.auto$mpg01)
## [1] 0.08439898
autoqda.fit <- qda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train.auto)
autoqda.pred <- predict(autoqda.fit, test.auto)
table(autoqda.pred$class, test.auto$mpg01)
##
## 0 1
## 0 176 20
## 1 19 176
mean(autoqda.pred$class != test.auto$mpg01)
## [1] 0.09974425
auto.fit<-glm(mpg01~displacement+horsepower+weight+year+cylinders+origin, data=train.auto,family=binomial)
auto.probs = predict(auto.fit, test.auto, type = "response")
auto.pred = rep(0, length(auto.probs))
auto.pred[auto.probs > 0.5] = 1
table(auto.pred, test.auto$mpg01)
##
## auto.pred 0 1
## 0 174 12
## 1 21 184
mean(auto.pred != test.auto$mpg01)
## [1] 0.08439898
#K=1
library(ISLR)
set.seed(1)
train.Auto = train.auto[,c("horsepower","weight","acceleration")]
test.Auto = test.auto[,c("horsepower","weight","acceleration")]
knn.pred=knn(train.Auto,test.Auto,train.auto$mpg01,k=1)
table(knn.pred,test.auto$mpg01)
##
## knn.pred 0 1
## 0 179 13
## 1 16 183
mean(knn.pred==test.auto$mpg01)
## [1] 0.9258312
#k=2
knn.pred=knn(train.Auto,test.Auto,train.auto$mpg01,k=2)
table(knn.pred,test.auto$mpg01)
##
## knn.pred 0 1
## 0 168 19
## 1 27 177
mean(knn.pred==test.auto$mpg01)
## [1] 0.8823529
#k=3
knn.pred=knn(train.Auto,test.Auto,train.auto$mpg01,k=3)
table(knn.pred,test.auto$mpg01)
##
## knn.pred 0 1
## 0 168 15
## 1 27 181
mean(knn.pred==test.auto$mpg01)
## [1] 0.8925831
#k=5
knn.pred=knn(train.Auto,test.Auto,train.auto$mpg01,k=5)
table(knn.pred,test.auto$mpg01)
##
## knn.pred 0 1
## 0 166 15
## 1 29 181
mean(knn.pred==test.auto$mpg01)
## [1] 0.887468
#k=10
knn.pred=knn(train.Auto,test.Auto,train.auto$mpg01,k=10)
table(knn.pred,test.auto$mpg01)
##
## knn.pred 0 1
## 0 162 11
## 1 33 185
mean(knn.pred==test.auto$mpg01)
## [1] 0.887468
Accuracy .887
When k=1 it looks like it has the highest accuracy score with 92.5%
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
attach(Boston)
crime_rate <- rep(0, length(crim))
crime_rate[crim > median(crim)] <- 1
Boston= data.frame(Boston,crime_rate)
train = 1:(dim(Boston)[1]/2)
test = (dim(Boston)[1]/2 + 1):dim(Boston)[1]
Boston.train = Boston[train, ]
Boston.test = Boston[test, ]
crime_rate.test = crime_rate[test]
set.seed(1)
Boston.fit <-glm(crime_rate~ indus+nox+age+dis+rad+tax, data=Boston.train,family=binomial)
Boston.probs = predict(Boston.fit, Boston.test, type = "response")
Boston.pred = rep(0, length(Boston.probs))
Boston.pred[Boston.probs > 0.5] = 1
table(Boston.pred, crime_rate.test)
## crime_rate.test
## Boston.pred 0 1
## 0 75 8
## 1 15 155
mean(Boston.pred != crime_rate.test)
## [1] 0.09090909
Accuracy: .0909
LDA:
Boston.ldafit <-lda(crime_rate~ indus+nox+age+dis+rad+tax, data=Boston.train,family=binomial)
Bostonlda.pred = predict(Boston.ldafit, Boston.test)
table(Bostonlda.pred$class, crime_rate.test)
## crime_rate.test
## 0 1
## 0 81 18
## 1 9 145
mean(Bostonlda.pred$class != crime_rate.test)
## [1] 0.1067194
Accuracy: .1067
KNN:
#K=1
train.K=cbind(indus,nox,age,dis,rad,tax)[train,]
test.K=cbind(indus,nox,age,dis,rad,tax)[test,]
Bosknn.pred=knn(train.K, test.K, crime_rate.test, k=1)
table(Bosknn.pred,crime_rate.test)
## crime_rate.test
## Bosknn.pred 0 1
## 0 31 155
## 1 59 8
mean(Bosknn.pred !=crime_rate.test)
## [1] 0.8458498
#K=3
train.K=cbind(indus,nox,age,dis,rad,tax)[train,]
test.K=cbind(indus,nox,age,dis,rad,tax)[test,]
Bosknn.pred=knn(train.K, test.K, crime_rate.test, k=3)
table(Bosknn.pred,crime_rate.test)
## crime_rate.test
## Bosknn.pred 0 1
## 0 31 17
## 1 59 146
mean(Bosknn.pred !=crime_rate.test)
## [1] 0.3003953
#K=10
train.K=cbind(indus,nox,age,dis,rad,tax)[train,]
test.K=cbind(indus,nox,age,dis,rad,tax)[test,]
Bosknn.pred=knn(train.K, test.K, crime_rate.test, k=10)
table(Bosknn.pred,crime_rate.test)
## crime_rate.test
## Bosknn.pred 0 1
## 0 41 10
## 1 49 153
mean(Bosknn.pred !=crime_rate.test)
## [1] 0.2332016
Accuracy: .2332
Reviewing all of the accuracy scores the K nearest neighbors (KNN) with k=1 has the highest rate with 84.58%.
Logistic regression had the lowest accuracy score with 9.09%