The following are my answers to Questions 10, 11, and 12 from Chapter 4 of the ISLR book.
library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.3
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'readr' was built under R version 4.0.4
## Warning: package 'forcats' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(class)
corrplot <- function(numericdata) {
  # Correlation heatmap: melt the rounded correlation matrix into long form,
  # then tile it with a blue-white-red gradient centred at zero
  cormat <- round(cor(numericdata), 2)
  melted_cormat <- reshape2::melt(cormat)
  ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                         midpoint = 0, limit = c(-1, 1), space = "Lab",
                         name = "Pearson\nCorrelation") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}
corrplot(dplyr::select(Weekly, -Direction))
histogramplot <- function(numericdata) {
  # Despite the name, this draws a density curve for each variable,
  # one facet per column with free scales
  numericdata %>%
    gather() %>%
    ggplot(aes(value)) +
    facet_wrap(~key, scales = "free") +
    geom_density()
}
histogramplot(dplyr::select(Weekly, -Direction))
summary(Weekly)
##       Year           Lag1               Lag2               Lag3
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260
##       Lag4               Lag5              Volume            Today
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260
##  Direction
##  Down:484
##  Up  :605
The lag variables and Today all show roughly the same bell-shaped distribution. The only strong linear correlation is between Year and Volume: trading volume has grown over time, presumably because advancing technology makes trading easier.
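A quick numeric check of that Year-Volume relationship (a one-liner added here; it uses only the Weekly data already loaded):
# Pearson correlation between year and trading volume
cor(Weekly$Year, Weekly$Volume)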
logit <- Weekly %>%
  dplyr::select(-Today, -Year) %>%
  glm(Direction ~ ., data = ., family = "binomial")
summary(logit)
##
## Call:
## glm(formula = Direction ~ ., family = "binomial", data = .)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.6949  -1.2565   0.9913   1.0849   1.4579
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181
## Lag2         0.05844    0.02686   2.175   0.0296 *
## Lag3        -0.01606    0.02666  -0.602   0.5469
## Lag4        -0.02779    0.02646  -1.050   0.2937
## Lag5        -0.01447    0.02638  -0.549   0.5833
## Volume      -0.02274    0.03690  -0.616   0.5377
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The only predictor that is significant at the 0.05 level is Lag2.
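This can be confirmed programmatically; a small sketch that pulls the coefficient table from the fit above and keeps the rows with p-values below 0.05:
coefs <- summary(logit)$coefficients
coefs[coefs[, "Pr(>|z|)"] < 0.05, , drop = FALSE]  # the intercept and Lag2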
# Confusion matrix for the full-data logistic fit, using a 0.5 threshold
Truth <- Weekly$Direction
Probs <- predict(logit, type = "response")
pred <- rep("Down", times = nrow(Weekly))
pred[Probs > .5] <- "Up"
mat <- table(pred, Truth)
mat
##       Truth
## pred   Down  Up
##   Down   54  48
##   Up    430 557
(mat[1,1] + mat[2,2])/nrow(Weekly)
## [1] 0.5610652
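Since this diagonal-over-total computation recurs for every classifier below, here is a small helper that captures the pattern (my addition; the inline versions later on are left as they were):
# Overall accuracy: correct predictions (the diagonal) over all predictions
accuracy <- function(mat) sum(diag(mat)) / sum(mat)
accuracy(mat)  # same 0.5610652 as above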
The classifier is heavily over-predicting the upward trend and gets almost none of the down weeks right; 56.1% overall accuracy is barely better than always predicting "Up" (605/1089 ≈ 55.6%).
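The class-specific rates, computed from the matrix above, make this concrete:
mat["Up", "Up"] / sum(mat[, "Up"])       # ~92% of true Up weeks predicted Up
mat["Down", "Down"] / sum(mat[, "Down"]) # only ~11% of true Down weeks caught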
# Restrict to 2009-2010 and fit a Lag2-only logistic model on that subset
df <- Weekly %>%
  filter(Year == 2009 | Year == 2010)
logitd <- df %>%
  glm(Direction ~ Lag2, data = ., family = "binomial")
summary(logitd)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = .)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.5767  -1.3052   0.9242   1.0419   1.2978
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.32377    0.20136   1.608    0.108
## Lag2         0.08562    0.06707   1.277    0.202
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 141.04 on 103 degrees of freedom
## Residual deviance: 139.37 on 102 degrees of freedom
## AIC: 143.37
##
## Number of Fisher Scoring iterations: 4
Truth <- df$Direction
Probs <- predict(logitd, type = "response")
pred <- rep("Down", times = nrow(df))
pred[Probs > .5] <- "Up"
mat <- table(pred, Truth)
mat
##       Truth
## pred   Down Up
##   Down    8  4
##   Up     35 57
(mat[1,1] + mat[2,2])/nrow(df)
## [1] 0.625
ldae <- df %>%
  lda(Direction ~ Lag2, data = .)
ldae
## Call:
## lda(Direction ~ Lag2, data = .)
##
## Prior probabilities of groups:
##      Down        Up
## 0.4134615 0.5865385
##
## Group means:
##             Lag2
## Down -0.08904651
## Up    0.69591803
##
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.3262636
# predict() on an lda fit returns a list; $class holds the predicted labels
preds <- predict(ldae)
mat <- table(preds$class, Truth)
mat
##       Truth
##        Down Up
##   Down    8  4
##   Up     35 57
(mat[1,1] + mat[2,2])/nrow(df)
## [1] 0.625
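LDA produces exactly the same confusion matrix as the logistic regression above. With a single, roughly Gaussian predictor, the two linear classifiers end up with essentially the same decision boundary, so identical predictions are expected.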
qdaf <- df %>%
  qda(Direction ~ Lag2, data = .)
qdaf
## Call:
## qda(Direction ~ Lag2, data = .)
##
## Prior probabilities of groups:
##      Down        Up
## 0.4134615 0.5865385
##
## Group means:
##             Lag2
## Down -0.08904651
## Up    0.69591803
preds <- predict(qdaf)
mat <- table(preds$class, Truth)
mat
##       Truth
##        Down Up
##   Down    9  4
##   Up     34 57
(mat[1,1] + mat[2,2])/nrow(df)
## [1] 0.6346154
set.seed(123)
# 50/50 train/test split, then 1-nearest-neighbour using Lag2 alone
# (with a single predictor, KNN's usual need for scaling does not arise)
smp_size <- floor(0.50 * nrow(df))
train_ind <- sample(seq_len(nrow(df)), size = smp_size)
train.X <- df$Lag2[train_ind]
test.X <- df$Lag2[-train_ind]
train.Y <- df$Direction[train_ind]
test.Y <- df$Direction[-train_ind]
knn.pred <- knn(data.frame(train.X), data.frame(test.X), train.Y, k = 1)
mat <- table(knn.pred, test.Y)  # test predictions vs. held-out labels
mat
##         test.Y
## knn.pred Down Up
##     Down    7 12
##     Up     16 17
(mat[1,1] + mat[2,2])/length(test.Y)
## [1] 0.4615385
The method that gave the best result was QDA, with the highest correct-prediction rate (about 63.5%); logistic regression and LDA tied at 62.5%, and KNN with k = 1 did worst (about 46%).
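Collected in one place (the accuracy values are copied from the outputs above, rounded to three digits, not recomputed):
tibble(
  model    = c("logistic (Lag2)", "LDA (Lag2)", "QDA (Lag2)", "KNN (k = 1)"),
  accuracy = c(0.625, 0.625, 0.635, 0.462)
)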
# Fresh random split (no new seed, so this draw differs from the k = 1 run)
smp_size <- floor(0.50 * nrow(df))
train_ind <- sample(seq_len(nrow(df)), size = smp_size)
train.X <- df$Lag2[train_ind]
test.X <- df$Lag2[-train_ind]
train.Y <- df$Direction[train_ind]
test.Y <- df$Direction[-train_ind]
knn.pred <- knn(data.frame(train.X), data.frame(test.X), train.Y, k = 5)
mat <- table(knn.pred, test.Y)  # test predictions vs. held-out labels
mat
##         test.Y
## knn.pred Down Up
##     Down   10 14
##     Up     13 15
(mat[1,1] + mat[2,2])/length(test.Y)
## [1] 0.4807692
ldae <- df %>%
  lda(Direction ~ ., data = .)
ldae
## Call:
## lda(Direction ~ ., data = .)
##
## Prior probabilities of groups:
##      Down        Up
## 0.4134615 0.5865385
##
## Group means:
##          Year      Lag1        Lag2      Lag3      Lag4        Lag5   Volume
## Down 2009.465 0.2089767 -0.08904651 0.5855349 0.6180930 -0.08146512 5.111618
## Up   2009.525 0.5306230  0.69591803 0.2309344 0.1938361  0.60139344 5.034022
##          Today
## Down -2.357488
## Up    2.230082
##
## Coefficients of linear discriminants:
##               LD1
## Year   0.44977869
## Lag1   0.04901766
## Lag2   0.10000287
## Lag3   0.02479192
## Lag4   0.01118442
## Lag5   0.03928998
## Volume 0.07871424
## Today  0.51526811
preds <- predict(ldae)
mat <- table(preds$class, Truth)
mat
##       Truth
##        Down Up
##   Down   36  0
##   Up      7 61
(mat[1,1] + mat[2,2])/nrow(df)
## [1] 0.9326923
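The jump to 93% accuracy is not evidence of a better model: Direction is simply whether Today's return is positive, so including Today among the predictors leaks the response into the fit.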
cardf <- Auto %>%
  dplyr::select(-name) %>%
  mutate(mpg01 = if_else(mpg > median(mpg), 1, 0))
corrplot(cardf)
histogramplot(cardf)
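From the correlation plot, cylinders, displacement, horsepower, and weight show the strongest (negative) association with mpg, and hence with mpg01, so those four variables are used as predictors below.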
# 50/50 train/test split of the Auto data (note: train.X and test.X keep all
# columns, including mpg and mpg01; the model formulas below select predictors)
smp_size <- floor(0.50 * nrow(cardf))
train_ind <- sample(seq_len(nrow(cardf)), size = smp_size)
train.X <- cardf[train_ind, ]
test.X <- cardf[-train_ind, ]
train.Y <- cardf$mpg01[train_ind]
test.Y <- cardf$mpg01[-train_ind]
ldacars <- lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train.X)
ldacars
## Call:
## lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train.X)
##
## Prior probabilities of groups:
##         0         1
## 0.4540816 0.5459184
##
## Group means:
##   cylinders displacement horsepower   weight
## 0  6.651685     264.3708  124.97753 3548.685
## 1  4.186916     117.3364   80.11215 2358.654
##
## Coefficients of linear discriminants:
##                        LD1
## cylinders    -0.5744023711
## displacement -0.0027758506
## horsepower    0.0108161523
## weight       -0.0009902655
testpredict <- predict(ldacars, test.X)$class
mat <- table(testpredict, test.Y)
1 - (mat[1,1] + mat[2,2])/length(test.Y)  # test error rate
## [1] 0.09693878
Test error rate of about 0.097.

#### e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
qdacars <- qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train.X)
qdacars
## Call:
## qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train.X)
##
## Prior probabilities of groups:
##         0         1
## 0.4540816 0.5459184
##
## Group means:
##   cylinders displacement horsepower   weight
## 0  6.651685     264.3708  124.97753 3548.685
## 1  4.186916     117.3364   80.11215 2358.654
testpredict <- predict(qdacars, test.X)$class
mat <- table(testpredict, test.Y)
1 - (mat[1,1] + mat[2,2])/length(test.Y)
## [1] 0.1020408
Test error rate of about 0.102.

#### f) Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
logcars <- glm(mpg01 ~ cylinders + displacement + horsepower + weight, data = train.X, family = "binomial")
summary(logcars)
##
## Call:
## glm(formula = mpg01 ~ cylinders + displacement + horsepower +
## weight, family = "binomial", data = train.X)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.6574  -0.2590   0.1760   0.4568   3.1329
##
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)  11.5474104  2.3202958   4.977 6.47e-07 ***
## cylinders    -0.2820324  0.4652243  -0.606   0.5444
## displacement -0.0126407  0.0107870  -1.172   0.2413
## horsepower   -0.0247602  0.0189312  -1.308   0.1909
## weight       -0.0019241  0.0009547  -2.015   0.0439 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 270.06 on 195 degrees of freedom
## Residual deviance: 107.89 on 191 degrees of freedom
## AIC: 117.89
##
## Number of Fisher Scoring iterations: 7
testpredict <- predict(logcars,test.X, type = "response")
predicted.classes <- ifelse(testpredict > 0.5, 1, 0)
1 - mean(predicted.classes == test.Y)
## [1] 0.1071429
Test error rate of about 0.107.
set.seed(123)
# Restrict KNN to the four chosen predictors: train.X and test.X still carry
# mpg and mpg01, which must not be fed to the classifier as features
vars <- c("cylinders", "displacement", "horsepower", "weight")
knn.pred <- knn(train.X[, vars], test.X[, vars], train.Y, k = 5)
mat <- table(knn.pred, test.Y)
mat
##         test.Y
## knn.pred  0  1
##        0 91  6
##        1 16 83
1 - (mat[1,1] + mat[2,2])/length(test.Y)
## [1] 0.1122449
Of the values of k I tried, k = 5 gave the lowest error rate, so I will say k = 5 is the best here, with a test error rate of about 0.112.
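A sketch of how that comparison across k could be made explicit (not run above, so results will vary with the random split; it reuses the vars vector defined earlier, and scaling the predictors with scale() would normally be advisable since KNN is distance-based):
sapply(c(1, 3, 5, 10, 20), function(k) {
  pred <- knn(train.X[, vars], test.X[, vars], train.Y, k = k)
  mean(pred != test.Y)  # test error rate for this k
})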
Power <- function(num) {
  # Prints (rather than returns) the cube of its argument
  print(num ^ 3)
}
Power(2)
## [1] 8
Power2 <- function(x, a) {
  # Prints x raised to the power a
  print(x ^ a)
}
Power2(3,8)
## [1] 6561
Power2(10,3)
## [1] 1000
Power2(81,7)
## [1] 2.287679e+13
Power2(131,3)
## [1] 2248091
Power3 <- function(x, a) {
  # Returns x^a instead of printing it, so the value can be reused
  result <- x ^ a
  result
}
x <- 1:100
plot(x,Power3(x,2), xlab = "x", ylab = "x^2",main = "Using the Power3 Function", log="xy")
PlotPower <- function(x, a) {
  plot(x, Power3(x, a))
}
PlotPower(1:10, 3)
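One possible extension (my addition, not part of the exercise): forwarding extra arguments to plot() lets PlotPower reproduce the log-scaled version drawn earlier.
PlotPower2 <- function(x, a, ...) {
  # Same idea as PlotPower, but extra arguments (e.g. log = "xy") pass through
  plot(x, Power3(x, a), xlab = "x", ylab = paste0("x^", a), ...)
}
PlotPower2(1:10, 3, log = "xy")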