data(Weekly)
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
It does not look like there are any relationships between the lag variables. As you have stated, if we could find such relationships we wouldn't be here right now; we would be on a beach somewhere. However, there does seem to be a trend of trading volume increasing over time (Volume and Year have a correlation of 0.84).
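A quick way to see that trend is to plot Volume against Year (a minimal sketch, assuming Weekly is loaded as above):
plot(Weekly$Year, Weekly$Volume,
     xlab = "Year", ylab = "Volume",
     main = "Weekly trading volume over time")  # volume rises steadily with year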
attach(Weekly)
glm10 = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data=Weekly, family='binomial' )
summary(glm10)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only predictor with a statistically significant p-value (0.0296).
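The p-values can also be pulled out of the fitted model directly (a small sketch using the glm10 object above):
coef(summary(glm10))[, "Pr(>|z|)"]  # Lag2 is the only coefficient below 0.05 (besides the intercept)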
probs10 = predict(glm10, type="response")
pred10 = rep("Down", length(probs10))
pred10[probs10 > 0.5] = "Up"
table(pred10, Direction)
## Direction
## pred10 Down Up
## Down 54 48
## Up 430 557
mean(pred10==Direction)
## [1] 0.5610652
The overall (training) accuracy is 56.1%, but the fraction of up weeks the model gets right is 557/(557 + 48) = 92.1%, a great number. The fraction of down weeks it gets right is only 54/(54 + 430) = 11.2%, a very bad number.
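Those two fractions can be read off the confusion matrix directly; a small sketch:
conf = table(pred10, Direction)
conf["Up", "Up"] / sum(conf[, "Up"])        # fraction of Up weeks predicted correctly, about 0.921
conf["Down", "Down"] / sum(conf[, "Down"])  # fraction of Down weeks predicted correctly, about 0.112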
train = (Year < 2009)
test0910 = Weekly[!train,]
direction0910 = Direction[!train]
glm10d = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
probs10d=predict(glm10d, test0910, type="response")
pred.glm10d = rep("Down", length(probs10d))
pred.glm10d[probs10d > 0.5] = "Up"
table(pred.glm10d, direction0910)
## direction0910
## pred.glm10d Down Up
## Down 9 5
## Up 34 56
mean(pred.glm10d==direction0910)
## [1] 0.625
The overall test accuracy is 62.5%. For weeks that go up, the model is right 56/(56 + 5) = 91.8% of the time; for weeks that go down, only 9/(9 + 34) = 20.9% of the time.
library(MASS)
lda.fit=lda(Direction ~ Lag2, data=Weekly, subset=train)
lda.pred=predict(lda.fit, test0910)
table(lda.pred$class, direction0910)
## direction0910
## Down Up
## Down 9 5
## Up 34 56
mean(lda.pred$class==direction0910)
## [1] 0.625
Because of the similarities between LDA and logistic regression, we get essentially the same results: overall test accuracy of 62.5%, with 56/(56 + 5) = 91.8% accuracy on weeks that go up and 9/(9 + 34) = 20.9% on weeks that go down.
qda.fit = qda(Direction ~ Lag2, data=Weekly, subset=train)
qda.class=predict(qda.fit,test0910)$class
table(qda.class,direction0910)
## direction0910
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==direction0910)
## [1] 0.5865385
The overall test accuracy is 58.65%. This model predicted Up every single week, so it was right every week the market went up and wrong every week it went down. Since more weeks went up than down, the accuracy still ends up above 50%.
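Because QDA predicts Up every time here, its accuracy is just the fraction of Up weeks in the test period, which we can verify directly:
mean(direction0910 == "Up")  # fraction of 2009-2010 weeks that went up; matches the QDA accuracy above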
library(class)
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.direction = Direction[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.direction,k=1)
table(knn.pred,direction0910)
## direction0910
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred==direction0910)
## [1] 0.5
The overall test accuracy of this model was 50%. For weeks that go up, the model is right 31/(30 + 31) = 50.8% of the time; for weeks that go down, 21/(21 + 22) = 48.8% of the time.
The methods that appear to provide the best results on this data are logistic regression and LDA.
library(class)
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.direction = Direction[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.direction,k=3)
table(knn.pred,direction0910)
## direction0910
## knn.pred Down Up
## Down 16 20
## Up 27 41
mean(knn.pred==direction0910)
## [1] 0.5480769
train = (Year < 2009)
test0910 = Weekly[!train,]
direction0910 = Direction[!train]
glm10d = glm(Direction ~ Lag2 + Lag1, data = Weekly, family = binomial, subset = train)
probs10d=predict(glm10d, test0910, type="response")
pred.glm10d = rep("Down", length(probs10d))
pred.glm10d[probs10d > 0.5] = "Up"
table(pred.glm10d, direction0910)
## direction0910
## pred.glm10d Down Up
## Down 7 8
## Up 36 53
mean(pred.glm10d==direction0910)
## [1] 0.5769231
qda.fit = qda(Direction ~ sqrt(abs(Lag2)), data=Weekly, subset=train)
qda.class=predict(qda.fit,test0910)$class
table(qda.class,direction0910)
## direction0910
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==direction0910)
## [1] 0.5865385
LDA and logistic regression (using Lag2 alone) still give the highest test accuracy.
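One further experiment would be to scan a range of K values for KNN on the same Lag2 split (a sketch reusing the objects defined above; the exact numbers depend on the seed and would need to be run to confirm):
set.seed(1)
for (k in 1:10) {
  knn.pred = knn(train.X, test.X, train.direction, k = k)
  cat("k =", k, "accuracy =", mean(knn.pred == direction0910), "\n")  # test accuracy for each k
}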
attach(Auto)
mpg01 = rep(0, length(mpg))
mpg01[mpg > median(mpg)] = 1
Auto = data.frame(Auto, mpg01)
plot(Auto)
boxplot(cylinders ~ mpg01)
boxplot(origin ~ mpg01)
boxplot(weight ~ mpg01)
boxplot(horsepower ~ mpg01)
boxplot(displacement ~ mpg01)
It looks like cylinders, origin, weight, horsepower, and displacement all have some association with mpg01.
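The boxplots can be backed up numerically with the correlation of each numeric variable against mpg01 (a sketch, assuming name is the only non-numeric column in Auto):
cor(Auto[, sapply(Auto, is.numeric)])["mpg01", ]  # correlation of every numeric column with mpg01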
train = (year %% 2 == 0)
Auto.train = Auto[train,]
Auto.test <- Auto[!train, ]
mpg01.test <- mpg01[!train]
lda.fit = lda(mpg01 ~ cylinders + origin + weight + horsepower + displacement, data = Auto, subset = train)
lda.pred = predict(lda.fit, Auto.test)
table(lda.pred$class, mpg01.test)
## mpg01.test
## 0 1
## 0 86 11
## 1 14 71
mean(lda.pred$class != mpg01.test)
## [1] 0.1373626
The test error rate is 13.7%.
qda.fit = qda(mpg01 ~ cylinders + origin + weight + horsepower + displacement, data = Auto, subset = train)
qda.pred=predict(qda.fit,Auto.test)
table(qda.pred$class,mpg01.test)
## mpg01.test
## 0 1
## 0 88 13
## 1 12 69
mean(qda.pred$class!=mpg01.test)
## [1] 0.1373626
The test error rate is again 13.7%, which is what we might have expected since LDA and QDA give similar results here.
glm.fit = glm(mpg01 ~ cylinders + origin + weight + horsepower + displacement, data = Auto, family = binomial, subset = train)
probs = predict(glm.fit, Auto.test, type = "response")
glm.pred = rep(0, length(probs))
glm.pred[probs > .5] = 1
table(glm.pred, mpg01.test)
## mpg01.test
## glm.pred 0 1
## 0 89 11
## 1 11 71
mean(glm.pred!=mpg01.test)
## [1] 0.1208791
The test error rate is 12.09%.
train.X = cbind(cylinders , origin , weight , horsepower , displacement)[train, ]
test.X = cbind(cylinders , origin , weight , horsepower , displacement)[!train, ]
train.mpg01 = mpg01[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 83 11
## 1 17 71
mean(knn.pred!=mpg01.test)
## [1] 0.1538462
knn.pred = knn(train.X, test.X, train.mpg01, k = 2)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 81 9
## 1 19 73
mean(knn.pred!=mpg01.test)
## [1] 0.1538462
knn.pred = knn(train.X, test.X, train.mpg01, k = 3)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 84 9
## 1 16 73
mean(knn.pred!=mpg01.test)
## [1] 0.1373626
knn.pred = knn(train.X, test.X, train.mpg01, k = 4)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 79 8
## 1 21 74
mean(knn.pred!=mpg01.test)
## [1] 0.1593407
knn.pred = knn(train.X, test.X, train.mpg01, k = 5)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 82 9
## 1 18 73
mean(knn.pred!=mpg01.test)
## [1] 0.1483516
The test error rates are: k = 1, 15.38%; k = 2, 15.38%; k = 3, 13.74%; k = 4, 15.93%; k = 5, 14.84%.
The best-performing K seems to be 3.
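The same comparison can be written as a loop instead of repeating the call (a sketch; because the seed is reset once at the top, tie-breaking may differ slightly from the individual runs above):
set.seed(1)
for (k in 1:5) {
  knn.pred = knn(train.X, test.X, train.mpg01, k = k)
  cat("k =", k, "test error =", mean(knn.pred != mpg01.test), "\n")  # test error for each k
}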
Power = function()
{
2^3
}
Power()
## [1] 8
Power2 = function(x,y)
{
x^y
}
Power2(3,8)
## [1] 6561
Power2(10,3)
## [1] 1000
Power2(8,17)
## [1] 2.2518e+15
Power2(131,3)
## [1] 2248091
Power3 = function(x,y)
{
result = x^y
return(result)
}
x = 1:10
plot(x, Power3(x,2), log="xy", xlab="Log X", ylab= "Log X^2", main = "Log X^2 vs Log X")
PlotPower = function(x,y)
{
plot(x, Power3(x,y))
}
PlotPower(1:10,3)
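A small variant of PlotPower (hypothetical, not part of the exercise code) passes extra arguments through to plot() so the axes can be drawn on a log scale, as in the earlier Power3 plot:
PlotPower2 = function(x, y, ...)
{
  plot(x, Power3(x, y), ...)  # forward any extra plot() arguments, e.g. log = "xy"
}
PlotPower2(1:10, 3, log = "xy")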