Question 1

# Load necessary packages
install.packages(c("psych", "tidyverse", "tidymodels", "vip", "ISLR2"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(ISLR2)
library(psych)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:psych':
## 
##     logit

This question should be answered with data mtcars.

data(mtcars)
describe(mtcars)
##      vars  n   mean     sd median trimmed    mad   min    max  range  skew
## mpg     1 32  20.09   6.03  19.20   19.70   5.41 10.40  33.90  23.50  0.61
## cyl     2 32   6.19   1.79   6.00    6.23   2.97  4.00   8.00   4.00 -0.17
## disp    3 32 230.72 123.94 196.30  222.52 140.48 71.10 472.00 400.90  0.38
## hp      4 32 146.69  68.56 123.00  141.19  77.10 52.00 335.00 283.00  0.73
## drat    5 32   3.60   0.53   3.70    3.58   0.70  2.76   4.93   2.17  0.27
## wt      6 32   3.22   0.98   3.33    3.15   0.77  1.51   5.42   3.91  0.42
## qsec    7 32  17.85   1.79  17.71   17.83   1.42 14.50  22.90   8.40  0.37
## vs      8 32   0.44   0.50   0.00    0.42   0.00  0.00   1.00   1.00  0.24
## am      9 32   0.41   0.50   0.00    0.38   0.00  0.00   1.00   1.00  0.36
## gear   10 32   3.69   0.74   4.00    3.62   1.48  3.00   5.00   2.00  0.53
## carb   11 32   2.81   1.62   2.00    2.65   1.48  1.00   8.00   7.00  1.05
##      kurtosis    se
## mpg     -0.37  1.07
## cyl     -1.76  0.32
## disp    -1.21 21.91
## hp      -0.14 12.12
## drat    -0.71  0.09
## wt      -0.02  0.17
## qsec     0.34  0.32
## vs      -2.00  0.09
## am      -1.92  0.09
## gear    -1.07  0.13
## carb     1.26  0.29
library(ggplot2)
ggplot(data = mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Load necessary packages
library(ISLR2)

# Load the Carseats dataset
data(Carseats)

# (a) Fit a multiple regression model to predict Sales using Price, Urban, and US
model <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(model)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
# (b) Interpretation of coefficients:
# - The coefficient for Price suggests that for a one-unit increase in Price, Sales decrease by the coefficient value, holding other variables constant.
# - The coefficients for Urban and US are for dummy variables. For example, if Urban is 1 (i.e., the store is located in an urban area), the Sales are expected to increase by the coefficient value, holding other variables constant.
# - For US, if the store is in the US, the coefficient indicates the additional sales compared to stores outside the US, holding other variables constant.

# (c) Model equation:
# Sales = β0 + β1 * Price + β2 * Urban + β3 * US + ε
# Where Urban and US are indicator variables.

# (d) Test for the null hypothesis H0: βj = 0
# We reject the null hypothesis for predictors with p-values < 0.05.

# (e) Fit a smaller model with predictors showing evidence of association with the outcome
smaller_model <- lm(Sales ~ Price + US, data = Carseats)
summary(smaller_model)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
# (f) Assess model fit
# Compare the R-squared values of both models to evaluate their fit to the data.

# (g) Obtain 95% confidence intervals for the coefficient(s)
confint(smaller_model)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
plot(smaller_model)

# (h) Check for outliers or high leverage observations
# Use diagnostic plots like residual plots, leverage plots, and Cook's distance to identify outliers or high leverage points.

Question 3

Follow the example on the following website:

https://rforanalytics.com/04-Part_3_logistic.html#using-logistic-regression

Starting from section 14.3, please replicate the result in section 14.3.3 by replacing the data in the example with the S&P500 daily index starting from 2019 to 2023. The S&P500 index can be downloaded from yahoo finance or any other sources available on the internet. Upload your link on RPubs to Tronclass.

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Download S&P 500 data from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = as.Date('2019-04-25'), to = as.Date('2023-04-25'))
## [1] "GSPC"
head(GSPC)
##            GSPC.Open GSPC.High GSPC.Low GSPC.Close GSPC.Volume GSPC.Adjusted
## 2019-04-25   2928.99   2933.10  2912.84    2926.17  3440010000       2926.17
## 2019-04-26   2925.81   2939.88  2917.56    2939.88  3264390000       2939.88
## 2019-04-29   2940.58   2949.52  2939.35    2943.03  3150390000       2943.03
## 2019-04-30   2937.14   2948.22  2924.11    2945.83  3939760000       2945.83
## 2019-05-01   2952.33   2954.13  2923.36    2923.73  3669330000       2923.73
## 2019-05-02   2922.16   2931.68  2900.50    2917.52  3802290000       2917.52
d_ex1 <- as.data.frame(GSPC)

d_ex1 <- na.omit(d_ex1)

d_ex1$Direction <- NA  # Create an empty Direction column first

for (i in 2:nrow(d_ex1)) {
  if (Cl(d_ex1)[i] > Cl(d_ex1)[i - 1]) {
    d_ex1$Direction[i] <- "Up"
  } else {
    d_ex1$Direction[i] <- "Down"
  }
}

d_ex1 <- na.omit(d_ex1)

idx1 <- c(1:round(nrow(d_ex1) * 0.7))  
d_train1 <- d_ex1[idx1, ]  
d_test1 <- d_ex1[-idx1, ]  

set.seed(999)

cntrl1 <- trainControl(method = "timeslice", initialWindow = 250, horizon = 30,
                       fixedWindow = TRUE)

prep1 <- c("center", "scale")

logit_ex1 <- train(Direction ~ ., data = d_train1, method = "glm", family = "binomial",
                   trControl = cntrl1, preProcess = prep1)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logit_ex1
## Generalized Linear Model 
## 
## 704 samples
##   6 predictor
##   2 classes: 'Down', 'Up' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Rolling Forecasting Origin Resampling (30 held-out with a fixed window) 
## Summary of sample sizes: 250, 250, 250, 250, 250, 250, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8626667  0.7174923
summary(logit_ex1$finalModel)
## 
## Call:
## NULL
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.1789     0.1774   1.008    0.313    
## GSPC.Open     -125.1807    12.2416 -10.226  < 2e-16 ***
## GSPC.High       12.9307    10.1545   1.273    0.203    
## GSPC.Low        32.2950     7.4115   4.357 1.32e-05 ***
## GSPC.Close      80.0818     9.2533   8.654  < 2e-16 ***
## GSPC.Volume      0.1649     0.1892   0.872    0.383    
## GSPC.Adjusted        NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 963.36  on 703  degrees of freedom
## Residual deviance: 413.56  on 698  degrees of freedom
## AIC: 425.56
## 
## Number of Fisher Scoring iterations: 7
vip(logit_ex1, geom = "point") + theme_minimal()