data(Weekly)
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
It does not look like there are any relationships between the lag variables. As you have stated, if we could find such relationships we wouldn't be here right now; we would be on a beach somewhere. However, there does seem to be a trend of trading volume increasing over time (Volume and Year have a correlation of 0.84).
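A quick way to see that trend is to plot Volume against Year (a minimal sketch, assuming Weekly is loaded as above):
plot(Weekly$Year, Weekly$Volume,
     xlab = "Year", ylab = "Volume",
     main = "Weekly trading volume over time")  # volume rises steadily with year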
attach(Weekly)
glm10 = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data=Weekly, family='binomial' )
summary(glm10)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only predictor with a statistically significant p-value (0.0296).
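The p-values can also be pulled out of the fitted model directly (a small sketch using the glm10 object above):
coef(summary(glm10))[, "Pr(>|z|)"]  # Lag2 is the only coefficient below 0.05 (besides the intercept)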
probs10 = predict(glm10, type="response")
pred10 = rep("Down", length(probs10))
pred10[probs10 > 0.5] = "Up"
table(pred10, Direction)
## Direction
## pred10 Down Up
## Down 54 48
## Up 430 557
mean(pred10==Direction)
## [1] 0.5610652
The overall (training) accuracy is 56.1%, but the fraction of up weeks the model gets right is 557/(557 + 48) = 92.1%, a great number. The fraction of down weeks it gets right is only 54/(54 + 430) = 11.2%, a very bad number.
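Those two fractions can be read off the confusion matrix directly; a small sketch:
conf = table(pred10, Direction)
conf["Up", "Up"] / sum(conf[, "Up"])        # fraction of Up weeks predicted correctly, about 0.921
conf["Down", "Down"] / sum(conf[, "Down"])  # fraction of Down weeks predicted correctly, about 0.112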
train = (Year < 2009)
test0910 = Weekly[!train,]
direction0910 = Direction[!train]
glm10d = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
probs10d=predict(glm10d, test0910, type="response")
pred.glm10d = rep("Down", length(probs10d))
pred.glm10d[probs10d > 0.5] = "Up"
table(pred.glm10d, direction0910)
## direction0910
## pred.glm10d Down Up
## Down 9 5
## Up 34 56
mean(pred.glm10d==direction0910)
## [1] 0.625
The overall test accuracy is 62.5%. For weeks that go up, the model is right 56/(56 + 5) = 91.8% of the time; for weeks that go down, only 9/(9 + 34) = 20.9% of the time.
library(MASS)
lda.fit=lda(Direction ~ Lag2, data=Weekly, subset=train)
lda.pred=predict(lda.fit, test0910)
table(lda.pred$class, direction0910)
## direction0910
## Down Up
## Down 9 5
## Up 34 56
mean(lda.pred$class==direction0910)
## [1] 0.625
Because of the similarities between LDA and logistic regression, we get essentially the same results: overall test accuracy of 62.5%, with 56/(56 + 5) = 91.8% accuracy on weeks that go up and 9/(9 + 34) = 20.9% on weeks that go down.
qda.fit = qda(Direction ~ Lag2, data=Weekly, subset=train)
qda.class=predict(qda.fit,test0910)$class
table(qda.class,direction0910)
## direction0910
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==direction0910)
## [1] 0.5865385
The overall test accuracy is 58.65%. This model predicted Up every single week, so it was right every week the market went up and wrong every week it went down. Since more weeks went up than down, the accuracy still ends up above 50%.
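Because QDA predicts Up every time here, its accuracy is just the fraction of Up weeks in the test period, which we can verify directly:
mean(direction0910 == "Up")  # fraction of 2009-2010 weeks that went up; matches the QDA accuracy above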
library(class)
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.direction = Direction[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.direction,k=1)
table(knn.pred,direction0910)
## direction0910
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred==direction0910)
## [1] 0.5
The overall test accuracy of this model was 50%. For weeks that go up, the model is right 31/(30 + 31) = 50.8% of the time; for weeks that go down, 21/(21 + 22) = 48.8% of the time.
The methods that appear to provide the best results on this data are logistic regression and LDA.
library(class)
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.direction = Direction[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.direction,k=3)
table(knn.pred,direction0910)
## direction0910
## knn.pred Down Up
## Down 16 20
## Up 27 41
mean(knn.pred==direction0910)
## [1] 0.5480769
train = (Year < 2009)
test0910 = Weekly[!train,]
direction0910 = Direction[!train]
glm10d = glm(Direction ~ Lag2 + Lag1, data = Weekly, family = binomial, subset = train)
probs10d=predict(glm10d, test0910, type="response")
pred.glm10d = rep("Down", length(probs10d))
pred.glm10d[probs10d > 0.5] = "Up"
table(pred.glm10d, direction0910)
## direction0910
## pred.glm10d Down Up
## Down 7 8
## Up 36 53
mean(pred.glm10d==direction0910)
## [1] 0.5769231
qda.fit = qda(Direction ~ sqrt(abs(Lag2)), data=Weekly, subset=train)
qda.class=predict(qda.fit,test0910)$class
table(qda.class,direction0910)
## direction0910
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==direction0910)
## [1] 0.5865385
LDA and logistic regression (using Lag2 alone) still give the highest test accuracy.
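One further experiment would be to scan a range of K values for KNN on the same Lag2 split (a sketch reusing the objects defined above; the exact numbers depend on the seed and would need to be run to confirm):
set.seed(1)
for (k in 1:10) {
  knn.pred = knn(train.X, test.X, train.direction, k = k)
  cat("k =", k, "accuracy =", mean(knn.pred == direction0910), "\n")  # test accuracy for each k
}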
attach(Auto)
mpg01 = rep(0, length(mpg))
mpg01[mpg > median(mpg)] = 1
Auto = data.frame(Auto, mpg01)
plot(Auto)
boxplot(cylinders ~ mpg01)
boxplot(origin ~ mpg01)
boxplot(weight ~ mpg01)
boxplot(horsepower ~ mpg01)
boxplot(displacement ~ mpg01)
It looks like cylinders, origin, weight, horsepower, and displacement all have some association with mpg01.
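The boxplots can be backed up numerically with the correlation of each numeric variable against mpg01 (a sketch, assuming name is the only non-numeric column in Auto):
cor(Auto[, sapply(Auto, is.numeric)])["mpg01", ]  # correlation of every numeric column with mpg01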
train = (year %% 2 == 0)
Auto.train = Auto[train,]
Auto.test <- Auto[!train, ]
mpg01.test <- mpg01[!train]
lda.fit = lda(mpg01 ~ cylinders + origin + weight + horsepower + displacement, data = Auto, subset = train)
lda.pred = predict(lda.fit, Auto.test)
table(lda.pred$class, mpg01.test)
## mpg01.test
## 0 1
## 0 86 11
## 1 14 71
mean(lda.pred$class != mpg01.test)
## [1] 0.1373626
The test error rate is 13.7%.
qda.fit = qda(mpg01 ~ cylinders + origin + weight + horsepower + displacement, data = Auto, subset = train)
qda.pred=predict(qda.fit,Auto.test)
table(qda.pred$class,mpg01.test)
## mpg01.test
## 0 1
## 0 88 13
## 1 12 69
mean(qda.pred$class!=mpg01.test)
## [1] 0.1373626
The test error rate is again 13.7%, which is what we might have expected since LDA and QDA give similar results here.
glm.fit = glm(mpg01 ~ cylinders + origin + weight + horsepower + displacement, data = Auto, family = binomial, subset = train)
probs = predict(glm.fit, Auto.test, type = "response")
glm.pred = rep(0, length(probs))
glm.pred[probs > .5] = 1
table(glm.pred, mpg01.test)
## mpg01.test
## glm.pred 0 1
## 0 89 11
## 1 11 71
mean(glm.pred!=mpg01.test)
## [1] 0.1208791
The test error rate is 12.09%.
train.X = cbind(cylinders , origin , weight , horsepower , displacement)[train, ]
test.X = cbind(cylinders , origin , weight , horsepower , displacement)[!train, ]
train.mpg01 = mpg01[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 83 11
## 1 17 71
mean(knn.pred!=mpg01.test)
## [1] 0.1538462
knn.pred = knn(train.X, test.X, train.mpg01, k = 2)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 81 9
## 1 19 73
mean(knn.pred!=mpg01.test)
## [1] 0.1538462
knn.pred = knn(train.X, test.X, train.mpg01, k = 3)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 84 9
## 1 16 73
mean(knn.pred!=mpg01.test)
## [1] 0.1373626
knn.pred = knn(train.X, test.X, train.mpg01, k = 4)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 79 8
## 1 21 74
mean(knn.pred!=mpg01.test)
## [1] 0.1593407
knn.pred = knn(train.X, test.X, train.mpg01, k = 5)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 82 9
## 1 18 73
mean(knn.pred!=mpg01.test)
## [1] 0.1483516
The test error rates are: k = 1, 15.38%; k = 2, 15.38%; k = 3, 13.74%; k = 4, 15.93%; k = 5, 14.84%.
The best-performing K seems to be 3.
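The same comparison can be written as a loop instead of repeating the call (a sketch; because the seed is reset once at the top, tie-breaking may differ slightly from the individual runs above):
set.seed(1)
for (k in 1:5) {
  knn.pred = knn(train.X, test.X, train.mpg01, k = k)
  cat("k =", k, "test error =", mean(knn.pred != mpg01.test), "\n")  # test error for each k
}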
Power = function()
{
2^3
}
Power()
## [1] 8
Power2 = function(x,y)
{
x^y
}
Power2(3,8)
## [1] 6561
Power2(10,3)
## [1] 1000
Power2(8,17)
## [1] 2.2518e+15
Power2(131,3)
## [1] 2248091
Power3 = function(x,y)
{
result = x^y
return(result)
}
x = 1:10
plot(x, Power3(x,2), log="xy", xlab="Log X", ylab= "Log X^2", main = "Log X^2 vs Log X")
PlotPower = function(x,y)
{
plot(x, Power3(x,y))
}
PlotPower(1:10,3)
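A small variant of PlotPower (hypothetical, not part of the exercise code) passes extra arguments through to plot() so the axes can be drawn on a log scale, as in the earlier Power3 plot:
PlotPower2 = function(x, y, ...)
{
  plot(x, Power3(x, y), ...)  # forward any extra plot() arguments, e.g. log = "xy"
}
PlotPower2(1:10, 3, log = "xy")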