PROBLEM 1.
setwd('/Users/australialostawartoemus/Desktop/Statistical Modeling/Class Codes')
Weekly = read.csv("Weekly.csv")
Weekly$Direction <- ifelse(Weekly$Direction == "Up", 1, 0)
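As a quick sanity check (a sketch), the recode can be verified with a frequency table before modeling:
table(Weekly$Direction) # counts of Down (0) and Up (1) weeks after the recode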
library(ISLR)
##
## Attaching package: 'ISLR'
## The following object is masked _by_ '.GlobalEnv':
##
## Weekly
library(boot)
## Warning: package 'boot' was built under R version 4.3.3
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial(link = "logit"))
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial(link = "logit"),
## data = Weekly)
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.22122    0.06147   3.599 0.000319 ***
## Lag1        -0.03872    0.02622  -1.477 0.139672
## Lag2         0.06025    0.02655   2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
# Note: Direction was converted to binary 0/1 above so predictions can be compared numerically in part D.
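The recode is actually optional: glm() with family = binomial also accepts a two-level factor response and models the probability of the second level. A minimal sketch of that alternative (hypothetical, using the raw data before the recode above):
# glm(factor(Direction) ~ Lag1 + Lag2, data = read.csv("Weekly.csv"), family = binomial) # models P(Direction = "Up")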
random <- sample(1:nrow(Weekly), 1)  # index of one randomly chosen observation (Weekly has 1089 rows)
train1 <- (1:nrow(Weekly)) != random # training set: every observation except the held-out one
Weekly.1 <- Weekly[random, ]         # the single held-out observation
Direction.1 <- Weekly$Direction[random]
glm.fit1 <- glm(Direction ~ Lag1 + Lag2, data = Weekly, subset = train1, family = binomial(link = "logit"))
summary(glm.fit1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial(link = "logit"),
## data = Weekly, subset = train1)
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.21952    0.06149   3.570 0.000357 ***
## Lag1        -0.03802    0.02622  -1.450 0.147037
## Lag2         0.06028    0.02654   2.271 0.023130 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1495.0 on 1087 degrees of freedom
## Residual deviance: 1487.1 on 1085 degrees of freedom
## AIC: 1493.1
##
## Number of Fisher Scoring iterations: 4
glm.pred_1 <- predict(glm.fit1, Weekly.1, type = "response") # predicted probability that Direction = Up
glm.pred_1
##       625
## 0.5713126
Weekly.1
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 625 2002 -0.978 0.506 -1.573 -2.295 0.99 1.58852 -2.315 0
pred.1 <- ifelse(glm.pred_1 > 0.5, 1, 0) # classify as Up (1) when P(Up) > 0.5
table(Direction.1, pred.1)
##            pred.1
## Direction.1 1
##           0 1
No, the held-out observation was misclassified: the predicted probability of Up is about 0.571 > 0.5, so the model predicts Up, while the actual direction for week 625 was Down (0).
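Worth noting: by default, predict() on a fitted glm returns the linear predictor (log-odds), not a probability; type = "response" must be requested explicitly. A minimal sketch of the conversion between the two scales (eta is an illustrative name):
eta <- predict(glm.fit1, Weekly.1) # default type = "link": the log-odds, about 0.2872 here
plogis(eta)                        # logistic transform recovers P(Up), about 0.5713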
D.i.
posterior_n = list() # will hold a 0/1 misclassification indicator for each held-out observation
D.ii-iii.
for (x in 1:nrow(Weekly)) {
  train_n <- (1:nrow(Weekly)) != x # training data: every observation except the x-th
  Weekly.n <- Weekly[x, ]          # the held-out observation
  Direction.n <- Weekly$Direction[x]
  glm.fit_n <- glm(Direction ~ Lag1 + Lag2, data = Weekly, subset = train_n,
                   family = binomial(link = "logit"))
  glm.pred_n <- predict(glm.fit_n, Weekly.n, type = "response") # P(Up) for the held-out week
  pred <- 0                        # default prediction: Down
  if (glm.pred_n >= 0.5) pred <- 1 # predict Up when P(Up) is at least 0.5
  posterior_n[x] <- ifelse(pred == Direction.n, 0, 1) # 0 = correct, 1 = misclassified
}
D.iv.
head(posterior_n)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1
##
## [[3]]
## [1] 0
##
## [[4]]
## [1] 1
##
## [[5]]
## [1] 0
##
## [[6]]
## [1] 1
hand_LOOCV = mean(unlist(posterior_n)) # fraction of weeks misclassified = LOOCV error estimate
hand_LOOCV
## [1] 0.4499541
Since each element of posterior_n is a 0/1 misclassification indicator, its mean is the LOOCV estimate of the test error rate. The logistic regression using Lag1 and Lag2 therefore misclassifies the weekly market direction about 45% of the time, only modestly better than random guessing.
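As a cross-check, boot::cv.glm can reproduce this hand-rolled LOOCV when given a 0/1 misclassification cost function (a sketch reusing glm.fit from above; note it refits the model n times, so it is slow):
cost <- function(r, pi) mean(abs(r - pi) > 0.5) # count an error when |observed - fitted prob| > 0.5
cv.glm(Weekly, glm.fit, cost = cost)$delta[1]   # LOOCV misclassification rate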
PROBLEM 2.
set.seed(1)
x <- rnorm(100)
y <- x - 2 * x^2 + rnorm(100)
n = 100, since x, the predictor, contains 100 simulated observations. p = 2, since the model involves two predictors, X and X^2. In equation form:
Y = X - 2X^2 + ε, where ε ~ N(0, 1).
plot(x, y)
The plotted data trace a downward-opening parabola: y peaks near 0 around x = 0 and falls off sharply as |x| grows, matching the quadratic mean function above.
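To make that comparison explicit (a sketch), the true mean function can be overlaid on the scatterplot:
plot(x, y)
curve(x - 2 * x^2, add = TRUE) # the true mean function, drawn without noise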
library(boot)
Data = data.frame(x,y)
set.seed(3)
glm.fit1 <- glm(y ~ x, data = Data)
cv.err <- cv.glm(Data, glm.fit1)
cv.err$delta[1]
## [1] 7.288162
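A note on the output: cv.glm()$delta holds two numbers, the raw LOOCV estimate of the mean squared prediction error and a bias-corrected version; for LOOCV they are nearly identical, and only the first is reported here and below:
cv.err$delta # c(raw LOOCV MSE, bias-corrected MSE)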
glm.fit2 <- glm(y ~ poly(x, 2), data = Data)
cv.err2 <- cv.glm(Data, glm.fit2)
cv.err2$delta[1]
## [1] 0.9374236
glm.fit3 <- glm(y ~ poly(x, 3), data = Data)
cv.err3 <- cv.glm(Data, glm.fit3)
cv.err3$delta[1]
## [1] 0.9566218
glm.fit4 <- glm(y ~ poly(x, 4), data = Data)
cv.err4 <- cv.glm(Data, glm.fit4)
cv.err4$delta[1]
## [1] 0.9539049
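Since the four fits follow one pattern, they can also be collapsed into a single pass (a sketch that reproduces the same four numbers):
# LOOCV error for polynomial degrees 1 through 4
cv.errors <- sapply(1:4, function(d)
  cv.glm(Data, glm(y ~ poly(x, d), data = Data))$delta[1])
cv.errors
The degree-2 model attains the smallest LOOCV error (0.937), as expected since the data were generated from a quadratic.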
D.i.
set.seed(13)
glm.fit1 <- glm(y ~ x, data = Data)
cv.err <- cv.glm(Data, glm.fit1)
cv.err$delta[1]
## [1] 7.288162
glm.fit2 <- glm(y ~ poly(x, 2), data = Data)
cv.err2 <- cv.glm(Data, glm.fit2)
cv.err2$delta[1]
## [1] 0.9374236
glm.fit3 <- glm(y ~ poly(x, 3), data = Data)
cv.err3 <- cv.glm(Data, glm.fit3)
cv.err3$delta[1]
## [1] 0.9566218
glm.fit4 <- glm(y ~ poly(x, 4), data = Data)
cv.err4 <- cv.glm(Data, glm.fit4)
cv.err4$delta[1]
## [1] 0.9539049
Changing the seed does not change any of the four cv.error values because LOOCV is deterministic: every observation is held out exactly once, in a fixed order, so no random sampling is involved.
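The contrast with k-fold cross-validation makes the point (a sketch): with K = 10, cv.glm partitions the data at random, so the estimate does change with the seed:
set.seed(1)
cv.glm(Data, glm.fit2, K = 10)$delta[1] # 10-fold CV under one random partition
set.seed(2)
cv.glm(Data, glm.fit2, K = 10)$delta[1] # a different partition, a slightly different estimate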