Q1. This question should be answered with data mtcars.

(a) Descriptive statistics for mtcars dataset:

# Load required packages
install.packages(c("psych", "tidyverse", "tidymodels", "vip", "ISLR2"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(psych)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ ggplot2::%+%()    masks psych::%+%()
## ✖ scales::alpha()   masks ggplot2::alpha(), psych::alpha()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
library(ISLR2)
# Load mtcars dataset
data(mtcars)

# Generate descriptive statistics
describe(mtcars)
##      vars  n   mean     sd median trimmed    mad   min    max  range  skew
## mpg     1 32  20.09   6.03  19.20   19.70   5.41 10.40  33.90  23.50  0.61
## cyl     2 32   6.19   1.79   6.00    6.23   2.97  4.00   8.00   4.00 -0.17
## disp    3 32 230.72 123.94 196.30  222.52 140.48 71.10 472.00 400.90  0.38
## hp      4 32 146.69  68.56 123.00  141.19  77.10 52.00 335.00 283.00  0.73
## drat    5 32   3.60   0.53   3.70    3.58   0.70  2.76   4.93   2.17  0.27
## wt      6 32   3.22   0.98   3.33    3.15   0.77  1.51   5.42   3.91  0.42
## qsec    7 32  17.85   1.79  17.71   17.83   1.42 14.50  22.90   8.40  0.37
## vs      8 32   0.44   0.50   0.00    0.42   0.00  0.00   1.00   1.00  0.24
## am      9 32   0.41   0.50   0.00    0.38   0.00  0.00   1.00   1.00  0.36
## gear   10 32   3.69   0.74   4.00    3.62   1.48  3.00   5.00   2.00  0.53
## carb   11 32   2.81   1.62   2.00    2.65   1.48  1.00   8.00   7.00  1.05
##      kurtosis    se
## mpg     -0.37  1.07
## cyl     -1.76  0.32
## disp    -1.21 21.91
## hp      -0.14 12.12
## drat    -0.71  0.09
## wt      -0.02  0.17
## qsec     0.34  0.32
## vs      -2.00  0.09
## am      -1.92  0.09
## gear    -1.07  0.13
## carb     1.26  0.29

(b) Visualization Check

# Visualization check for relationships
pairs(mtcars)

## (c) Multiple Regression

# Based on the plots, predictors like cyl, disp, hp, wt, and qsec may benefit from transformations like log, sqrt, or squared transformations due to non-linear relationships with mpg.

# Multiple regression on mpg across all predictors
model <- lm(mpg ~ ., data = mtcars)
summary(model)
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

(d) Variance inflation factor

# Load required package
if (!require("car")) install.packages("car")
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:psych':
## 
##     logit
library(car)

# Calculate Variance Inflation Factor (VIF)
vif_values <- car::vif(model)

# Find predictors with VIF higher than 10
high_vif_predictors <- names(vif_values[vif_values > 10])

# Print predictors with VIF higher than 10
if (length(high_vif_predictors) > 0) {
  cat("Predictors with VIF higher than 10:", paste(high_vif_predictors, collapse = ", "))
} else {
  cat("No predictors have VIF higher than 10.")
}
## Predictors with VIF higher than 10: cyl, disp, wt

(e). Rerun the multiple regressions, (e-1) excluding disp

# Rerun multiple regression excluding disp
model_without_disp <- lm(mpg ~ . - disp, data = mtcars)
summary(model_without_disp)
## 
## Call:
## lm(formula = mpg ~ . - disp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7863 -1.4055 -0.2635  1.2029  4.4753 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.55052   18.52585   0.677   0.5052  
## cyl          0.09627    0.99715   0.097   0.9240  
## hp          -0.01295    0.01834  -0.706   0.4876  
## drat         0.92864    1.60794   0.578   0.5694  
## wt          -2.62694    1.19800  -2.193   0.0392 *
## qsec         0.66523    0.69335   0.959   0.3478  
## vs           0.16035    2.07277   0.077   0.9390  
## am           2.47882    2.03513   1.218   0.2361  
## gear         0.74300    1.47360   0.504   0.6191  
## carb        -0.61686    0.60566  -1.018   0.3195  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.623 on 22 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8105 
## F-statistic: 15.73 on 9 and 22 DF,  p-value: 1.183e-07

(e-2) excluding disp

# Rerun multiple regression excluding disp and cyl
model_without_disp_cyl <- lm(mpg ~ . - disp - cyl, data = mtcars)
summary(model_without_disp_cyl)
## 
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8187 -1.3903 -0.3045  1.2269  4.5183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 13.80810   12.88582   1.072   0.2950  
## hp          -0.01225    0.01649  -0.743   0.4650  
## drat         0.88894    1.52061   0.585   0.5645  
## wt          -2.60968    1.15878  -2.252   0.0342 *
## qsec         0.63983    0.62752   1.020   0.3185  
## vs           0.08786    1.88992   0.046   0.9633  
## am           2.42418    1.91227   1.268   0.2176  
## gear         0.69390    1.35294   0.513   0.6129  
## carb        -0.61286    0.59109  -1.037   0.3106  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8187 
## F-statistic:  18.5 on 8 and 23 DF,  p-value: 2.627e-08

Q2 Carseats

# Load required package and dataset
if (!require("ISLR2")) install.packages("ISLR2")
library(ISLR2)
data("Carseats")

(a) Fit multiple regression model

model <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(model)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

(b) Interpretation of coefficients

The coefficient for Price represents the change in Sales for a one-unit increase in Price, holding other variables constant.

The coefficients for Urban and US represent the difference in Sales between stores located in Urban areas and those in Rural areas (for Urban), and between stores in the US and those outside the US (for US), holding other variables constant.

(c) Model equation

Sales = β0 + β1Price + β2Urban_Yes + β3*US_Yes + ε

Where Urban_Yes and US_Yes are indicator variables taking the value 1 if the store is in an Urban area or in the US, respectively, and 0 otherwise.

(d) Rejecting null hypothesis for predictors

We can use the t-statistic and p-value from the summary output to determine if we can reject the null hypothesis (H0: β = 0) for each predictor.

(e) Fit smaller model with predictors showing evidence of association

smaller_model <- lm(Sales ~ Price + US, data = Carseats)
summary(smaller_model)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

(f) Model fit comparison

We can compare the R-squared values or other fit statistics between the two models to assess their fit to the data.

(g) Confidence intervals for coefficients

confint(smaller_model)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632

(h) Outliers and leverage observations

We can examine diagnostic plots such as residuals vs. fitted values plot, Cook’s distance, and leverage plots to identify outliers and high leverage observations.

Q3:

library(vip)
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## 
## Attaching package: 'TTR'
## The following object is masked from 'package:dials':
## 
##     momentum
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift
getSymbols("^GSPC", src = "yahoo", from = as.Date('2019-04-25'), to = as.Date('2023-04-25'))
## [1] "GSPC"
head(GSPC)
##            GSPC.Open GSPC.High GSPC.Low GSPC.Close GSPC.Volume GSPC.Adjusted
## 2019-04-25   2928.99   2933.10  2912.84    2926.17  3440010000       2926.17
## 2019-04-26   2925.81   2939.88  2917.56    2939.88  3264390000       2939.88
## 2019-04-29   2940.58   2949.52  2939.35    2943.03  3150390000       2943.03
## 2019-04-30   2937.14   2948.22  2924.11    2945.83  3939760000       2945.83
## 2019-05-01   2952.33   2954.13  2923.36    2923.73  3669330000       2923.73
## 2019-05-02   2922.16   2931.68  2900.50    2917.52  3802290000       2917.52
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logit_ex1
## Generalized Linear Model 
## 
## 704 samples
##   6 predictor
##   2 classes: 'Down', 'Up' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Rolling Forecasting Origin Resampling (30 held-out with a fixed window) 
## Summary of sample sizes: 250, 250, 250, 250, 250, 250, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8626667  0.7174923
summary(logit_ex1$finalModel)
## 
## Call:
## NULL
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.1789     0.1774   1.008    0.313    
## GSPC.Open     -125.1807    12.2416 -10.226  < 2e-16 ***
## GSPC.High       12.9307    10.1545   1.273    0.203    
## GSPC.Low        32.2950     7.4115   4.357 1.32e-05 ***
## GSPC.Close      80.0818     9.2533   8.654  < 2e-16 ***
## GSPC.Volume      0.1649     0.1892   0.872    0.383    
## GSPC.Adjusted        NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 963.36  on 703  degrees of freedom
## Residual deviance: 413.56  on 698  degrees of freedom
## AIC: 425.56
## 
## Number of Fisher Scoring iterations: 7
vip(logit_ex1, geom = "point") + theme_bw()