Chapter 04 (page 168): 10, 11, 13)
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
Year and volume have a distinct pattern
pairs(Weekly)
Lag 2 is statistically significant
LOG.model = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(LOG.model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Accuracy = 56.1% 92% accuracy when the markets are up and 11% error when the markets are down.
LOG.probs = predict(LOG.model, type = "response")
LOGPRED = rep("Down", length(LOG.probs))
LOGPRED[LOG.probs > 0.5] = "Up"
table(LOGPRED, Weekly$Direction)
##
## LOGPRED Down Up
## Down 54 48
## Up 430 557
Accuracy = 62.5% = 65/104
trainset = Weekly[Weekly$Year<2009,]
testset = Weekly[Weekly$Year>2008,]
LOG.model2 = glm(Direction ~ Lag2, data= trainset, family = "binomial")
testprob = predict(LOG.model2, type = "response", newdata = testset)
testDir = Weekly$Direction[Weekly$Year>2008]
testPredic = rep("Down", length(testprob))
testPredic[testprob > 0.5] = "Up"
table(testPredic, testDir)
## testDir
## testPredic Down Up
## Down 9 5
## Up 34 56
Accuracy = 62.5% = 65/104
library(MASS)
LDAModel = lda(Direction ~ Lag2, data= trainset)
LDAPRED = predict(LDAModel, newdata=testset, type="response")
LDACLA = LDAPRED$class
table(LDACLA, testset$Direction)
##
## LDACLA Down Up
## Down 9 5
## Up 34 56
Accuracy = 58.6% = 61/104
QDAModel = qda(Direction ~ Lag2, data= trainset)
QDAPRED = predict(QDAModel, newdata=testset, type="response")
QDACLA = QDAPRED$class
table(QDACLA, testset$Direction)
##
## QDACLA Down Up
## Down 0 0
## Up 43 61
Accuracy = 50% = 61/104
library(class)
set.seed(1)
train.1 = cbind(trainset$Lag2)
test.1 = cbind(testset$Lag2)
train.P = cbind(trainset$Direction)
KNNPRED = knn(train.1, test.1, train.P, k=1)
table(KNNPRED, testset$Direction)
##
## KNNPRED Down Up
## 1 21 30
## 2 22 31
Logistic and LDA provide the best out of the others
data(Auto)
attach(Auto)
mpg01 = rep(0, length(mpg))
mpg01[mpg > median(mpg)] = 1
Auto = data.frame(Auto, mpg01)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
## mpg01
## Min. :0.0
## 1st Qu.:0.0
## Median :0.5
## Mean :0.5
## 3rd Qu.:1.0
## Max. :1.0
##
pairs(Auto)
Cylinders, Displacement, horsepower, weight
datacut = sample(1:dim(Auto)[1], size=dim(Auto)[1]*0.80)
trainauto = Auto[datacut,]
testauto = Auto[-datacut,]
Test error = 10.1%
LDAAUTO = lda(mpg01 ~ cylinders + weight + displacement + year, data = trainauto)
LDAPREDAUTO = predict(LDAAUTO, newdata=testauto, type="response")$class
table(LDAPREDAUTO, testauto$mpg01)
##
## LDAPREDAUTO 0 1
## 0 37 0
## 1 8 34
Test error = 12.6%
QDAAUTO = qda(mpg01 ~ cylinders + weight + displacement + year, data = trainauto)
QDAPREDAUTO = predict(QDAAUTO, newdata=testauto, type="response")$class
table(QDAPREDAUTO, testauto$mpg01)
##
## QDAPREDAUTO 0 1
## 0 39 4
## 1 6 30
Test error = 10.2%
LOGAUTO = glm(mpg01 ~ cylinders + weight + displacement + year, family="binomial", data=trainauto)
LOGPROBAUTO = predict(LOGAUTO,testauto, type="response")
LOGPREDAUTO = rep(0, length(LOGPROBAUTO))
LOGPREDAUTO[LOGPROBAUTO>0.5]=1
table(LOGPREDAUTO, testauto$mpg01)
##
## LOGPREDAUTO 0 1
## 0 41 1
## 1 4 33
Test error = 15.1%
trainXYZ = cbind(trainauto$cylinders, trainauto$weight, trainauto$displacement, trainauto$year)
testXYZ = cbind(testauto$cylinders, testauto$weight, testauto$displacement, testauto$year)
train.P1 = cbind(trainauto$mpg01)
set.seed(1)
KNNPREDAUTO = knn(trainXYZ, testXYZ, train.P1, k=1)
table(KNNPREDAUTO, testauto$mpg01)
##
## KNNPREDAUTO 0 1
## 0 37 4
## 1 8 30
Lowest error rate is for the logistic Reg which is 9.09%. KNN had the highest error rate with 62.84%
data("Boston")
attach(Boston)
crim01 = rep(0, length(crim))
crim01[crim > median(crim)] = 1
Boston = data.frame(Boston, crim01)
traincut = 1:(dim(Boston)[1]/2)
testcut = (dim(Boston)[1]/2 + 1):dim(Boston)[1]
trainBOS = Boston[traincut, ]
testBOS = Boston[testcut, ]
LOGBOS = glm(crim01 ~ indus+nox+age+dis+rad+tax, family="binomial", data=trainBOS)
LOGPROBBOS = predict(LOGBOS,testBOS, type="response")
LOGPREDBOS = rep(0, length(LOGPROBBOS))
LOGPREDBOS[LOGPROBBOS>0.5]=1
table(LOGPREDBOS, testBOS$crim01)
##
## LOGPREDBOS 0 1
## 0 75 8
## 1 15 155
mean(LOGPREDBOS !=testBOS$crim01)
## [1] 0.09090909
LDABOS = lda(crim01 ~ indus+nox+age+dis+rad+tax, data = trainBOS)
LDAPREDBOS = predict(LDABOS, newdata=testBOS, type="response")$class
table(LDAPREDBOS, testBOS$crim01)
##
## LDAPREDBOS 0 1
## 0 81 18
## 1 9 145
mean(LDAPREDBOS !=testBOS$crim01)
## [1] 0.1067194
trainXXX = cbind(trainBOS$indus, trainBOS$nox, trainBOS$age, trainBOS$dis, trainBOS$rad,trainBOS$tax)
testXXX = cbind(testBOS$indus, testBOS$nox, testBOS$age, testBOS$dis, testBOS$rad,testBOS$tax)
train.P2 = cbind(trainBOS$crim01)
set.seed(1)
KNNPREDBOS = knn(trainXXX, testXXX, train.P2, k=1)
table(KNNPREDBOS, testBOS$crim01)
##
## KNNPREDBOS 0 1
## 0 82 151
## 1 8 12
mean(KNNPREDBOS !=testBOS$crim01)
## [1] 0.6284585