Problem 13

part a
# a
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
plot(Weekly)

cor(Weekly[,-9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
plot(Weekly$Volume)

#### There may be a relationship between Year and Volume based on the scatterplot and the matrix.

Question 13

part b
glm.fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data = Weekly, family = binomial)

summary(glm.fits)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Lag 2 is the only significant predictor.

Question 13

part c
glm.probs = predict(glm.fits, type = 'response')
glm.pred = rep("Down", 1089)
glm.pred[glm.probs > 0.5] = "Up"

writeLines("Confusion Matrix and Overall Fraction of Correct Predictions:")
## Confusion Matrix and Overall Fraction of Correct Predictions:
table(glm.pred, Weekly$Direction)
##         
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557
mean(glm.pred == Weekly$Direction)
## [1] 0.5610652

The model predicted that the market would go up on 557 days and that it will go down on the 54 days. So, 56.11% of the responses in the market are correctly predicted.

Question 13

part d
train <- (Weekly$Year<2009)
weekly09 <- Weekly[!train ,]
direction09 <- Weekly$Direction[!train]
dim(weekly09)
## [1] 104   9
glm_fit <- glm(Direction~Lag2, data = Weekly,family=binomial ,subset=train)
glm_probability <- predict (glm_fit,weekly09, type="response")
glm_prediction <- rep("Down",104)
glm_prediction[glm_probability >.5]=" Up"
table(glm_prediction ,direction09)
##               direction09
## glm_prediction Down Up
##            Up    34 56
##           Down    9  5

The model indicates that the market would go up on the 56 days and down on 9 days. 62.5% of the model is predicting the response of the market correctly.

Question 13

part e
lda.fit <- lda(Direction~Lag2 ,data = Weekly ,subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162
lda.prediction <- predict(lda.fit , weekly09)
names(lda.prediction)
## [1] "class"     "posterior" "x"
lda.class <- lda.prediction$class
table(lda.class , direction09)
##          direction09
## lda.class Down Up
##      Down    9  5
##      Up     34 56

Question 13

part f
weekly.qda <- qda(Direction~Lag2 ,data=Weekly ,subset=train)
weekly.qda
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
class.qda <- predict(weekly.qda ,weekly09)$class
table(class.qda ,direction09)
##          direction09
## class.qda Down Up
##      Down    0  0
##      Up     43 61

Question 13

part g
train.X <- cbind(Weekly$Lag2)[train ,]
test.X <- cbind(Weekly$Lag2)[!train ,]
direction.train <- Weekly$Direction [train]
dim(train.X) <-  c(985,1)
dim(test.X) <- c(104,1)
set.seed(1)
knn.pred <- knn(train.X,test.X,direction.train ,k=1)
table(knn.pred ,direction09)
##         direction09
## knn.pred Down Up
##     Down   21 30
##     Up     22 31

Question 13

part h and i
n.bayes <- naiveBayes(Direction~Lag2 ,data=Weekly ,subset=train)
n.bayes
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Conditional probabilities:
##       Lag2
## Y             [,1]     [,2]
##   Down -0.03568254 2.199504
##   Up    0.26036581 2.317485

The regression modelhas the highest prediction out of all the other models, such, that model may be the best.

Question 14

a

mpg01 <- c(Auto$mpg > median(Auto$mpg))
Auto1 <- data.frame(Auto, mpg01)

Question 14

b

plot(Auto1)

boxplot(Auto1$mpg01 ~ Auto1$cylinders)

boxplot(Auto1$mpg01 ~ Auto1$displacement)

boxplot(Auto1$mpg01 ~ Auto1$horsepower)

boxplot(Auto1$mpg01 ~ Auto1$weight)

boxplot(Auto1$mpg01 ~ Auto1$acceleration)

boxplot(Auto1$mpg01 ~ Auto1$year)

boxplot(Auto1$mpg01 ~ Auto1$origin)

Weight, horsepower, displacment, acceleration may be the predictors for mpg01.

Question 14

c

set.seed(191)
Atrain = sample(nrow(Auto1), 314)

auto.train <- Auto1[Atrain,]
auto.test <- Auto1[!(as.numeric(rownames(Auto1)) %in% Atrain),]

Question 14

d

lda.auto <- lda(mpg01 ~ displacement + horsepower + weight + acceleration, data = auto.train)
lda.auto
## Call:
## lda(mpg01 ~ displacement + horsepower + weight + acceleration, 
##     data = auto.train)
## 
## Prior probabilities of groups:
##     FALSE      TRUE 
## 0.5191083 0.4808917 
## 
## Group means:
##       displacement horsepower   weight acceleration
## FALSE     276.0307   132.0798 3637.454     14.53988
## TRUE      115.7185    78.5894 2328.781     16.49073
## 
## Coefficients of linear discriminants:
##                        LD1
## displacement -0.0078245278
## horsepower    0.0036968317
## weight       -0.0009948677
## acceleration -0.0094873543
lda.pred <- predict(lda.auto, auto.test)
names(lda.pred)
## [1] "class"     "posterior" "x"
table(lda.pred$class, auto.test$mpg01)
##        
##         FALSE TRUE
##   FALSE    29    2
##   TRUE      4   48
1 - mean(lda.pred$class == auto.test$mpg01)
## [1] 0.07228916
The test error rate is 0.0722 or 7%.

Question 14

e

qda.auto <- qda(mpg01 ~ displacement + horsepower + acceleration + weight, data = auto.train)

qda.pred <- predict(qda.auto, auto.test)
names(qda.pred)
## [1] "class"     "posterior"
table(qda.pred$class, auto.test$mpg01)
##        
##         FALSE TRUE
##   FALSE    32    2
##   TRUE      1   48
1 - mean(qda.pred$class == auto.test$mpg01)
## [1] 0.03614458
The test error is 3.6%.

Question 14

f

lr.auto <- glm(mpg01 ~ displacement + horsepower + acceleration + weight, data = auto.train, family = binomial)
lr.auto
## 
## Call:  glm(formula = mpg01 ~ displacement + horsepower + acceleration + 
##     weight, family = binomial, data = auto.train)
## 
## Coefficients:
##  (Intercept)  displacement    horsepower  acceleration        weight  
##     12.96911      -0.01260      -0.05740      -0.11074      -0.00129  
## 
## Degrees of Freedom: 313 Total (i.e. Null);  309 Residual
## Null Deviance:       434.8 
## Residual Deviance: 172.2     AIC: 182.2
lr.probs = predict(lr.auto, auto.test, type = 'response')
lr.pred = rep("High", 83)
lr.pred[lr.probs > 0.5] = "Low"

mean(lr.pred == auto.test$mpg01)
## [1] 0
(5/83)*100
## [1] 6.024096
Error rate is 6%

Question 14

g

nb.auto <- naiveBayes(mpg01 ~ displacement + horsepower + weight + acceleration, data = auto.train)
nb.auto
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     FALSE      TRUE 
## 0.5191083 0.4808917 
## 
## Conditional probabilities:
##        displacement
## Y           [,1]     [,2]
##   FALSE 276.0307 92.00204
##   TRUE  115.7185 41.02814
## 
##        horsepower
## Y           [,1]     [,2]
##   FALSE 132.0798 39.10574
##   TRUE   78.5894 15.60482
## 
##        weight
## Y           [,1]     [,2]
##   FALSE 3637.454 699.0066
##   TRUE  2328.781 410.3934
## 
##        acceleration
## Y           [,1]     [,2]
##   FALSE 14.53988 2.809981
##   TRUE  16.49073 2.524001
nb.class <- predict(nb.auto, auto.test)
table(nb.class, auto.test$mpg01)
##         
## nb.class FALSE TRUE
##    FALSE    30    2
##    TRUE      3   48
1 - mean(nb.class == auto.test$mpg01)
## [1] 0.06024096
Error rate is 6%

Question 14

h

Question 16