Q1. This question should be answered with data mtcars.

# Load necessary packages
install.packages(c("psych", "tidyverse", "tidymodels", "vip", "ISLR2"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(ISLR2)
library(psych)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:psych':
## 
##     logit
data(mtcars)
describe(mtcars)
##      vars  n   mean     sd median trimmed    mad   min    max  range  skew
## mpg     1 32  20.09   6.03  19.20   19.70   5.41 10.40  33.90  23.50  0.61
## cyl     2 32   6.19   1.79   6.00    6.23   2.97  4.00   8.00   4.00 -0.17
## disp    3 32 230.72 123.94 196.30  222.52 140.48 71.10 472.00 400.90  0.38
## hp      4 32 146.69  68.56 123.00  141.19  77.10 52.00 335.00 283.00  0.73
## drat    5 32   3.60   0.53   3.70    3.58   0.70  2.76   4.93   2.17  0.27
## wt      6 32   3.22   0.98   3.33    3.15   0.77  1.51   5.42   3.91  0.42
## qsec    7 32  17.85   1.79  17.71   17.83   1.42 14.50  22.90   8.40  0.37
## vs      8 32   0.44   0.50   0.00    0.42   0.00  0.00   1.00   1.00  0.24
## am      9 32   0.41   0.50   0.00    0.38   0.00  0.00   1.00   1.00  0.36
## gear   10 32   3.69   0.74   4.00    3.62   1.48  3.00   5.00   2.00  0.53
## carb   11 32   2.81   1.62   2.00    2.65   1.48  1.00   8.00   7.00  1.05
##      kurtosis    se
## mpg     -0.37  1.07
## cyl     -1.76  0.32
## disp    -1.21 21.91
## hp      -0.14 12.12
## drat    -0.71  0.09
## wt      -0.02  0.17
## qsec     0.34  0.32
## vs      -2.00  0.09
## am      -1.92  0.09
## gear    -1.07  0.13
## carb     1.26  0.29
library(ggplot2)
ggplot(data = mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

As we need to assess the relationship between mpg and other predictors, the visualization check will be implemented. Based on the plot, which predictors may have transformations like 𝑙𝑙𝑙𝑙𝑙𝑙(𝑥𝑥), √ 𝑥𝑥or 𝑥𝑥2. Comment on your findings.

pairs(mtcars)

Run the multiple regression on mpg across all predictors and show the estimated results.

model_all <- lm(mpg ~ ., data = mtcars)
summary(model_all)
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

Using car::vif (variance inflation factor) to estimate if there is any multicollinearity among predictors. Find predictors with vif higher than 10.

vif_results <- car::vif(model_all)
high_vif_predictors <- names(vif_results[vif_results > 10])
high_vif_predictors
## [1] "cyl"  "disp" "wt"

Rerun the multiple regressions by (1) excluding disp, and (2) excluding disp and cyl from predictors. Are there any improvements observed from the regression results?

model_excluding_disp <- lm(mpg ~ . - disp, data = mtcars)
summary(model_excluding_disp)
## 
## Call:
## lm(formula = mpg ~ . - disp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7863 -1.4055 -0.2635  1.2029  4.4753 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.55052   18.52585   0.677   0.5052  
## cyl          0.09627    0.99715   0.097   0.9240  
## hp          -0.01295    0.01834  -0.706   0.4876  
## drat         0.92864    1.60794   0.578   0.5694  
## wt          -2.62694    1.19800  -2.193   0.0392 *
## qsec         0.66523    0.69335   0.959   0.3478  
## vs           0.16035    2.07277   0.077   0.9390  
## am           2.47882    2.03513   1.218   0.2361  
## gear         0.74300    1.47360   0.504   0.6191  
## carb        -0.61686    0.60566  -1.018   0.3195  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.623 on 22 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8105 
## F-statistic: 15.73 on 9 and 22 DF,  p-value: 1.183e-07
model_excluding_disp_cyl <- lm(mpg ~ . - disp - cyl, data = mtcars)
summary(model_excluding_disp_cyl)
## 
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8187 -1.3903 -0.3045  1.2269  4.5183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 13.80810   12.88582   1.072   0.2950  
## hp          -0.01225    0.01649  -0.743   0.4650  
## drat         0.88894    1.52061   0.585   0.5645  
## wt          -2.60968    1.15878  -2.252   0.0342 *
## qsec         0.63983    0.62752   1.020   0.3185  
## vs           0.08786    1.88992   0.046   0.9633  
## am           2.42418    1.91227   1.268   0.2176  
## gear         0.69390    1.35294   0.513   0.6129  
## carb        -0.61286    0.59109  -1.037   0.3106  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8187 
## F-statistic:  18.5 on 8 and 23 DF,  p-value: 2.627e-08

Q2. This question should be answered using the data set Carseats which is available after you load up library(ISLR2).

# Load necessary packages
library(ISLR2)

# Load the Carseats dataset
data(Carseats)

# (a) Fit a multiple regression model to predict Sales using Price, Urban, and US
model <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(model)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
# (b) Interpretation of coefficients:
# - The coefficient for Price suggests that for a one-unit increase in Price, Sales decrease by the coefficient value, holding other variables constant.
# - The coefficients for Urban and US are for dummy variables. For example, if Urban is 1 (i.e., the store is located in an urban area), the Sales are expected to increase by the coefficient value, holding other variables constant.
# - For US, if the store is in the US, the coefficient indicates the additional sales compared to stores outside the US, holding other variables constant.

# (c) Model equation:
# Sales = β0 + β1 * Price + β2 * Urban + β3 * US + ε
# Where Urban and US are indicator variables.

# (d) Test for the null hypothesis H0: βj = 0
# We reject the null hypothesis for predictors with p-values < 0.05.

# (e) Fit a smaller model with predictors showing evidence of association with the outcome
smaller_model <- lm(Sales ~ Price + US, data = Carseats)
summary(smaller_model)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
# (f) Assess model fit
# Compare the R-squared values of both models to evaluate their fit to the data.

# (g) Obtain 95% confidence intervals for the coefficient(s)
confint(smaller_model)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
plot(smaller_model)

# (h) Check for outliers or high leverage observations
# Use diagnostic plots like residual plots, leverage plots, and Cook's distance to identify outliers or high leverage points.

Q3. Follow the example on the following website: https://rforanalytics.com/04-Part_3_logistic.html#using-logistic-regression

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Download S&P 500 data from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = as.Date('2019-04-25'), to = as.Date('2023-04-25'))
## [1] "GSPC"
head(GSPC)
##            GSPC.Open GSPC.High GSPC.Low GSPC.Close GSPC.Volume GSPC.Adjusted
## 2019-04-25   2928.99   2933.10  2912.84    2926.17  3440010000       2926.17
## 2019-04-26   2925.81   2939.88  2917.56    2939.88  3264390000       2939.88
## 2019-04-29   2940.58   2949.52  2939.35    2943.03  3150390000       2943.03
## 2019-04-30   2937.14   2948.22  2924.11    2945.83  3939760000       2945.83
## 2019-05-01   2952.33   2954.13  2923.36    2923.73  3669330000       2923.73
## 2019-05-02   2922.16   2931.68  2900.50    2917.52  3802290000       2917.52
d_ex1 <- as.data.frame(GSPC)

d_ex1 <- na.omit(d_ex1)

d_ex1$Direction <- NA  # Create an empty Direction column first

for (i in 2:nrow(d_ex1)) {
  if (Cl(d_ex1)[i] > Cl(d_ex1)[i - 1]) {
    d_ex1$Direction[i] <- "Up"
  } else {
    d_ex1$Direction[i] <- "Down"
  }
}

d_ex1 <- na.omit(d_ex1)

idx1 <- c(1:round(nrow(d_ex1) * 0.7))  
d_train1 <- d_ex1[idx1, ]  
d_test1 <- d_ex1[-idx1, ]  

set.seed(999)

cntrl1 <- trainControl(method = "timeslice", initialWindow = 250, horizon = 30,
                       fixedWindow = TRUE)

prep1 <- c("center", "scale")

logit_ex1 <- train(Direction ~ ., data = d_train1, method = "glm", family = "binomial",
                   trControl = cntrl1, preProcess = prep1)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logit_ex1
## Generalized Linear Model 
## 
## 704 samples
##   6 predictor
##   2 classes: 'Down', 'Up' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Rolling Forecasting Origin Resampling (30 held-out with a fixed window) 
## Summary of sample sizes: 250, 250, 250, 250, 250, 250, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8626667  0.7174923
summary(logit_ex1$finalModel)
## 
## Call:
## NULL
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.1789     0.1774   1.008    0.313    
## GSPC.Open     -125.1807    12.2416 -10.226  < 2e-16 ***
## GSPC.High       12.9307    10.1545   1.273    0.203    
## GSPC.Low        32.2950     7.4115   4.357 1.32e-05 ***
## GSPC.Close      80.0818     9.2533   8.654  < 2e-16 ***
## GSPC.Volume      0.1649     0.1892   0.872    0.383    
## GSPC.Adjusted        NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 963.36  on 703  degrees of freedom
## Residual deviance: 413.56  on 698  degrees of freedom
## AIC: 425.56
## 
## Number of Fisher Scoring iterations: 7
vip(logit_ex1, geom = "point") + theme_minimal()