PROBLEM 1.

setwd('/Users/australialostawartoemus/Desktop/Statistical Modeling/Class Codes')
Weekly = read.csv("Weekly.csv")

Weekly$Direction <- ifelse(Weekly$Direction == "Up", 1, 0)

library(ISLR)
## 
## Attaching package: 'ISLR'
## The following object is masked _by_ '.GlobalEnv':
## 
##     Weekly
library(boot)
## Warning: package 'boot' was built under R version 4.3.3
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial(link = "logit"))
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial(link = "logit"), 
##     data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22122    0.06147   3.599 0.000319 ***
## Lag1        -0.03872    0.02622  -1.477 0.139672    
## Lag2         0.06025    0.02655   2.270 0.023232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1488.2  on 1086  degrees of freedom
## AIC: 1494.2
## 
## Number of Fisher Scoring iterations: 4
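Written out, the fitted model says the log-odds of an up week are 0.22122 - 0.03872*Lag1 + 0.06025*Lag2. A minimal sketch of turning that into a probability (the helper name p_up is mine; the coefficients are copied from the summary above):

p_up <- function(lag1, lag2) plogis(0.22122 - 0.03872 * lag1 + 0.06025 * lag2)  # plogis() inverts the logit link
p_up(0, 0)  # ~0.555: fitted probability of an up week when both lags are zero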
# Direction was converted to binary 1/0 above so that predictions can be compared numerically
random <- sample(1:nrow(Weekly), 1)  # randomly choose one observation to hold out (Weekly has 1089 rows)

train1 <- (1:nrow(Weekly) != random)  # TRUE for all training data except the held-out observation

Weekly.1 <- Weekly[random, ]
Direction.1 <- Weekly$Direction[random]

glm.fit1 <- glm(Direction ~ Lag1 + Lag2, data = Weekly, subset = train1, family = binomial(link = "logit"))
summary(glm.fit1)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial(link = "logit"), 
##     data = Weekly, subset = train1)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.21952    0.06149   3.570 0.000357 ***
## Lag1        -0.03802    0.02622  -1.450 0.147037    
## Lag2         0.06028    0.02654   2.271 0.023130 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1495.0  on 1087  degrees of freedom
## Residual deviance: 1487.1  on 1085  degrees of freedom
## AIC: 1493.1
## 
## Number of Fisher Scoring iterations: 4
glm.pred_1 <- predict(glm.fit1, Weekly.1)  # note: predict() on a binomial glm returns the log-odds (link scale) by default
glm.pred_1
##       625 
## 0.2872087
Weekly.1
##     Year   Lag1  Lag2   Lag3   Lag4 Lag5  Volume  Today Direction
## 625 2002 -0.978 0.506 -1.573 -2.295 0.99 1.58852 -2.315         0
table(Weekly.1$Direction, glm.pred_1)
##    glm.pred_1
##     0.287208676475423
##   0                 1

No: the observation was misclassified. The value 0.287 printed above is on the log-odds (link) scale, not a probability. Converting it gives P(Up) = 1/(1 + e^(-0.287)) ≈ 0.571, which is above the 0.5 threshold, so the model predicted Up while the true direction was Down (0).
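To verify directly, the link-scale prediction printed above can be converted with plogis(), base R's logistic CDF:

plogis(0.2872087)  # ~0.571 > 0.5, so the predicted class is Up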

D.i.

posterior_n = list()  # will hold, for each observation, 1 if it is misclassified and 0 otherwise

D.ii-iii.

for (x in 1:1089) {
  train_n <- (1:1089 != x)  # all training data without observation x

  Weekly.n <- Weekly[x, ]
  Direction.n <- Weekly$Direction[x]

  glm.fit_n <- glm(Direction ~ Lag1 + Lag2, data = Weekly, subset = train_n,
                   family = binomial(link = "logit"))
  glm.pred_n <- predict(glm.fit_n, Weekly.n, type = "response")  # predicted probability of Up

  pred <- 0  # default prediction is Down (0)
  if (glm.pred_n >= .5) {pred <- 1}  # predicted probability of Up at least 50% -> predict Up
  if (pred == Direction.n) {posterior_n[x] <- 0} else {
    posterior_n[x] <- 1}  # record a 1 when observation x is misclassified
}

D.iv.

head(posterior_n)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 1
## 
## [[3]]
## [1] 0
## 
## [[4]]
## [1] 0
## 
## [[5]]
## [1] 0
## 
## [[6]]
## [1] 1
hand_LOOCV = mean(unlist(posterior_n))
hand_LOOCV
## [1] 0.4444444

Since each element of posterior_n is a 0/1 error indicator, its mean is the LOOCV estimate of the test error rate. An error of about 0.44 means the logistic regression on Lag1 and Lag2 misclassifies the weekly market direction roughly 44% of the time, only modestly better than random guessing.
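As a cross-check, boot::cv.glm can compute the same leave-one-out misclassification rate with a custom cost function. A sketch, assuming the binary-coded Weekly and the full-data glm.fit from above (this refits all 1,089 models, so it takes a moment):

cost <- function(obs, prob) mean(abs(obs - prob) > 0.5)  # 1 whenever the 0.5-threshold class is wrong
cv.glm(Weekly, glm.fit, cost)$delta[1]  # LOOCV misclassification rate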

PROBLEM 2.

set.seed(1)
x <- rnorm(100)
y <- x - 2 * x^2 + rnorm(100)

n = 100, since x contains 100 simulated observations. p = 2, since the model contains two predictors, x and x^2 (x is the independent variable; y is the response).

Y = X - 2X^2 + ε

plot(x, y)

The scatterplot shows a downward-opening parabola with its vertex near x = 0 and a peak value of roughly y = 0.
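To make the shape concrete, the true mean function from the generating equation can be overlaid on the scatterplot (a sketch; the curve is the noiseless part of Y = X - 2X^2 + ε):

plot(x, y)
curve(x - 2 * x^2, add = TRUE, col = "red")  # true mean function (vertex at x = 0.25)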

library(boot)
Data = data.frame(x,y)
set.seed(3)

glm.fit1 <- glm(y ~ x, data = Data)
cv.err <- cv.glm(Data, glm.fit1)
cv.err$delta[1]
## [1] 7.288162
glm.fit2 <- glm(y ~ poly(x, 2), data = Data)
cv.err2 <- cv.glm(Data, glm.fit2)
cv.err2$delta[1]
## [1] 0.9374236
glm.fit3 <- glm(y ~ poly(x, 3), data = Data)
cv.err3 <- cv.glm(Data, glm.fit3)
cv.err3$delta[1]
## [1] 0.9566218
glm.fit4 <- glm(y ~ poly(x, 4), data = Data)
cv.err4 <- cv.glm(Data, glm.fit4)
cv.err4$delta[1]
## [1] 0.9539049
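The four fits above repeat one pattern, so they could equally be collapsed into a single loop; a sketch that reproduces the same four values, assuming Data from above:

cv.errors <- sapply(1:4, function(d) {
  fit <- glm(y ~ poly(x, d), data = Data)
  cv.glm(Data, fit)$delta[1]  # default K = n, i.e. LOOCV
})
cv.errors  # errors for polynomial degrees 1 through 4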

D.i.

set.seed(13)

glm.fit1 <- glm(y ~ x, data = Data)
cv.err <- cv.glm(Data, glm.fit1)
cv.err$delta[1]
## [1] 7.288162
glm.fit2 <- glm(y ~ poly(x, 2), data = Data)
cv.err2 <- cv.glm(Data, glm.fit2)
cv.err2$delta[1]
## [1] 0.9374236
glm.fit3 <- glm(y ~ poly(x, 3), data = Data)
cv.err3 <- cv.glm(Data, glm.fit3)
cv.err3$delta[1]
## [1] 0.9566218
glm.fit4 <- glm(y ~ poly(x, 4), data = Data)
cv.err4 <- cv.glm(Data, glm.fit4)
cv.err4$delta[1]
## [1] 0.9539049

Changing the seed does not affect the cv.error values for the four models because LOOCV holds out each observation exactly once and fits on the remaining n - 1 points; there is no random partitioning, so the procedure is fully deterministic.
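By contrast, K-fold cross-validation with K < n does split the data at random, so its estimate would move with the seed. A sketch using 10 folds on the quadratic fit (glm.fit2 from above):

set.seed(3)
cv.glm(Data, glm.fit2, K = 10)$delta[1]
set.seed(13)
cv.glm(Data, glm.fit2, K = 10)$delta[1]  # generally differs from the line above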

  1. The quadratic model had the smallest LOOCV error, which I expected from the parabolic shape observed in (B): the data were generated by a quadratic function of x, so higher-degree polynomials add variance without reducing bias.