# Load necessary packages
install.packages(c("psych", "tidyverse", "tidymodels", "vip", "ISLR2",
                   "car", "quantmod", "caret"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(ISLR2)
library(psych)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:psych':
##
## logit
This question should be answered using the mtcars data set.
data(mtcars)
describe(mtcars)
## vars n mean sd median trimmed mad min max range skew
## mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61
## cyl 2 32 6.19 1.79 6.00 6.23 2.97 4.00 8.00 4.00 -0.17
## disp 3 32 230.72 123.94 196.30 222.52 140.48 71.10 472.00 400.90 0.38
## hp 4 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73
## drat 5 32 3.60 0.53 3.70 3.58 0.70 2.76 4.93 2.17 0.27
## wt 6 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42
## qsec 7 32 17.85 1.79 17.71 17.83 1.42 14.50 22.90 8.40 0.37
## vs 8 32 0.44 0.50 0.00 0.42 0.00 0.00 1.00 1.00 0.24
## am 9 32 0.41 0.50 0.00 0.38 0.00 0.00 1.00 1.00 0.36
## gear 10 32 3.69 0.74 4.00 3.62 1.48 3.00 5.00 2.00 0.53
## carb 11 32 2.81 1.62 2.00 2.65 1.48 1.00 8.00 7.00 1.05
## kurtosis se
## mpg -0.37 1.07
## cyl -1.76 0.32
## disp -1.21 21.91
## hp -0.14 12.12
## drat -0.71 0.09
## wt -0.02 0.17
## qsec 0.34 0.32
## vs -2.00 0.09
## am -1.92 0.09
## gear -1.07 0.13
## carb 1.26 0.29
library(ggplot2)
ggplot(data = mtcars, aes(x = mpg, y = disp)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
Since we need to assess the relationship between mpg and the other predictors, we first check the relationships visually. Based on the plot, which predictors might benefit from transformations such as log(x), √x, or x²? Comment on your findings.
pairs(mtcars)
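One way to judge candidate transformations is to compare mpg against a predictor on the raw and transformed scales. The sketch below uses disp as an example; the same check applies to other skewed predictors such as hp.
# If the point cloud straightens out on the log scale, log(disp) is a
# reasonable candidate transformation; curvature that persists may instead
# suggest a square-root or quadratic term.
par(mfrow = c(1, 2))
plot(mtcars$disp, mtcars$mpg, xlab = "disp", ylab = "mpg")
plot(log(mtcars$disp), mtcars$mpg, xlab = "log(disp)", ylab = "mpg")
par(mfrow = c(1, 1))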
Run a multiple regression of mpg on all predictors and show the estimated results.
model_all <- lm(mpg ~ ., data = mtcars)
summary(model_all)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
Use car::vif (variance inflation factor) to check for multicollinearity among the predictors. Find the predictors with VIF higher than 10.
vif_results <- car::vif(model_all)
high_vif_predictors <- names(vif_results[vif_results > 10])
high_vif_predictors
## [1] "cyl" "disp" "wt"
Rerun the multiple regression (1) excluding disp and (2) excluding both disp and cyl from the predictors. Are any improvements observed in the regression results?
model_excluding_disp <- lm(mpg ~ . - disp, data = mtcars)
summary(model_excluding_disp)
##
## Call:
## lm(formula = mpg ~ . - disp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7863 -1.4055 -0.2635 1.2029 4.4753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.55052 18.52585 0.677 0.5052
## cyl 0.09627 0.99715 0.097 0.9240
## hp -0.01295 0.01834 -0.706 0.4876
## drat 0.92864 1.60794 0.578 0.5694
## wt -2.62694 1.19800 -2.193 0.0392 *
## qsec 0.66523 0.69335 0.959 0.3478
## vs 0.16035 2.07277 0.077 0.9390
## am 2.47882 2.03513 1.218 0.2361
## gear 0.74300 1.47360 0.504 0.6191
## carb -0.61686 0.60566 -1.018 0.3195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.623 on 22 degrees of freedom
## Multiple R-squared: 0.8655, Adjusted R-squared: 0.8105
## F-statistic: 15.73 on 9 and 22 DF, p-value: 1.183e-07
model_excluding_disp_cyl <- lm(mpg ~ . - disp - cyl, data = mtcars)
summary(model_excluding_disp_cyl)
##
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8187 -1.3903 -0.3045 1.2269 4.5183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.80810 12.88582 1.072 0.2950
## hp -0.01225 0.01649 -0.743 0.4650
## drat 0.88894 1.52061 0.585 0.5645
## wt -2.60968 1.15878 -2.252 0.0342 *
## qsec 0.63983 0.62752 1.020 0.3185
## vs 0.08786 1.88992 0.046 0.9633
## am 2.42418 1.91227 1.268 0.2176
## gear 0.69390 1.35294 0.513 0.6129
## carb -0.61286 0.59109 -1.037 0.3106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared: 0.8655, Adjusted R-squared: 0.8187
## F-statistic: 18.5 on 8 and 23 DF, p-value: 2.627e-08
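As a quick sanity check (a sketch reusing the model_excluding_disp_cyl object fitted above), the VIFs can be recomputed on the reduced model to confirm that the multicollinearity has eased:
# Recompute VIFs for the reduced model; the remaining values should sit well below 10.
car::vif(model_excluding_disp_cyl)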
This question should be answered using the Carseats data set, which is available after you load library(ISLR2).
(a) Fit a multiple regression model to predict Sales using Price, Urban, and US.
(b) Provide an interpretation of each coefficient in the model. Be careful: some of the variables in the model are qualitative!
(c) Write out the model in equation form, being careful to handle the qualitative variables properly.
(d) For which of the predictors can you reject the null hypothesis H0: βj = 0?
(e) On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.
(f) How well do the models in (a) and (e) fit the data?
(g) Using the model from (e), obtain 95% confidence intervals for the coefficient(s).
(h) Is there evidence of outliers or high-leverage observations in the model from (e)?
# Load necessary packages
library(ISLR2)
# Load the Carseats dataset
data(Carseats)
# (a) Fit a multiple regression model to predict Sales using Price, Urban, and US
model <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(model)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
# (b) Interpretation of coefficients:
# - Price: holding the other variables constant, a one-unit increase in Price is
#   associated with a decrease of about 0.054 in Sales (Sales is in thousands of units).
# - UrbanYes: urban stores sell about 0.022 (thousand units) less than non-urban
#   stores, other variables held constant; this effect is not statistically significant.
# - USYes: US stores sell about 1.20 (thousand units) more than non-US stores,
#   other variables held constant.
# (c) Model equation:
# Sales = β0 + β1 * Price + β2 * 1[Urban = Yes] + β3 * 1[US = Yes] + ε
# where 1[Urban = Yes] and 1[US = Yes] are indicator (dummy) variables equal to 1
# when the store is urban / located in the US, and 0 otherwise.
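# Worked example (a sketch using the estimates from (a)): a non-urban US store
# with Price = 100 has predicted Sales of about
# 13.043 - 0.0545*100 - 0.0219*0 + 1.2006*1 ≈ 8.8 (thousand units).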
# (d) Test of the null hypothesis H0: βj = 0
# We reject the null hypothesis for Price and US (p-values < 0.05) but fail to
# reject it for Urban (p = 0.936).
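# A sketch (helper code not in the original answer): read the p-values off the
# coefficient table and flag the terms significant at the 5% level.
pvals <- summary(model)$coefficients[, "Pr(>|t|)"]
names(pvals)[pvals < 0.05]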
# (e) Fit a smaller model with predictors showing evidence of association with the outcome
smaller_model <- lm(Sales ~ Price + US, data = Carseats)
summary(smaller_model)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
# (f) Assess model fit
# Both models explain about 23.9% of the variance in Sales (R-squared = 0.2393).
# The smaller model fits no worse: its adjusted R-squared is slightly higher
# (0.2354 vs 0.2335) and its residual standard error slightly lower (2.469 vs
# 2.472), so dropping Urban costs essentially nothing.
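# A sketch comparing the two fits directly (uses the model and smaller_model
# objects fitted above): adjusted R-squared and residual standard error.
c(full = summary(model)$adj.r.squared, reduced = summary(smaller_model)$adj.r.squared)
c(full = summary(model)$sigma, reduced = summary(smaller_model)$sigma)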
# (g) Obtain 95% confidence intervals for the coefficient(s)
confint(smaller_model)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
# (h) Check for outliers or high leverage observations
# Use the standard diagnostic plots (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage) to identify outliers or high-leverage points.
plot(smaller_model)
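# A sketch of numeric checks (assumes smaller_model from (e)): studentized
# residuals beyond |3| suggest outliers; leverage above 2(p+1)/n suggests
# high-leverage observations.
stud_res <- rstudent(smaller_model)
lev <- hatvalues(smaller_model)
which(abs(stud_res) > 3)                                           # candidate outliers
which(lev > 2 * length(coef(smaller_model)) / nobs(smaller_model)) # high leverage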
Follow the example on the following website:
https://rforanalytics.com/04-Part_3_logistic.html#using-logistic-regression
Starting from section 14.3, replicate the result in section 14.3.3, replacing the data in the example with the S&P 500 daily index from 2019 to 2023. The S&P 500 index can be downloaded from Yahoo Finance or any other source available on the internet. Upload your RPubs link to Tronclass.
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Download S&P 500 data from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = as.Date('2019-04-25'), to = as.Date('2023-04-25'))
## [1] "GSPC"
head(GSPC)
## GSPC.Open GSPC.High GSPC.Low GSPC.Close GSPC.Volume GSPC.Adjusted
## 2019-04-25 2928.99 2933.10 2912.84 2926.17 3440010000 2926.17
## 2019-04-26 2925.81 2939.88 2917.56 2939.88 3264390000 2939.88
## 2019-04-29 2940.58 2949.52 2939.35 2943.03 3150390000 2943.03
## 2019-04-30 2937.14 2948.22 2924.11 2945.83 3939760000 2945.83
## 2019-05-01 2952.33 2954.13 2923.36 2923.73 3669330000 2923.73
## 2019-05-02 2922.16 2931.68 2900.50 2917.52 3802290000 2917.52
d_ex1 <- as.data.frame(GSPC)
d_ex1 <- na.omit(d_ex1)
d_ex1$Direction <- NA # create an empty Direction column first
# Label each day "Up" if its close exceeds the previous day's close, otherwise
# "Down"; the first row has no previous close and stays NA (dropped below).
for (i in 2:nrow(d_ex1)) {
  if (Cl(d_ex1)[i] > Cl(d_ex1)[i - 1]) {
    d_ex1$Direction[i] <- "Up"
  } else {
    d_ex1$Direction[i] <- "Down"
  }
}
d_ex1 <- na.omit(d_ex1)
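# For reference, a vectorized sketch of the same labelling, done on a fresh copy
# (d_alt is an illustrative name) so the pipeline above is left untouched:
d_alt <- na.omit(as.data.frame(GSPC))
d_alt$Direction <- c(NA, ifelse(diff(d_alt$GSPC.Close) > 0, "Up", "Down"))
d_alt <- na.omit(d_alt)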
idx1 <- c(1:round(nrow(d_ex1) * 0.7))
d_train1 <- d_ex1[idx1, ]
d_test1 <- d_ex1[-idx1, ]
set.seed(999)
cntrl1 <- trainControl(method = "timeslice", initialWindow = 250, horizon = 30,
fixedWindow = TRUE)
prep1 <- c("center", "scale")
logit_ex1 <- train(Direction ~ ., data = d_train1, method = "glm", family = "binomial",
trControl = cntrl1, preProcess = prep1)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (the same warning is repeated for each resampling window)
The warnings signal near-perfect separation in some training windows, which is expected here: Direction is derived from the close prices themselves, so the same-day price columns can reproduce the labels almost exactly.
logit_ex1
## Generalized Linear Model
##
## 704 samples
## 6 predictor
## 2 classes: 'Down', 'Up'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Rolling Forecasting Origin Resampling (30 held-out with a fixed window)
## Summary of sample sizes: 250, 250, 250, 250, 250, 250, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8626667 0.7174923
summary(logit_ex1$finalModel)
##
## Call:
## NULL
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1789 0.1774 1.008 0.313
## GSPC.Open -125.1807 12.2416 -10.226 < 2e-16 ***
## GSPC.High 12.9307 10.1545 1.273 0.203
## GSPC.Low 32.2950 7.4115 4.357 1.32e-05 ***
## GSPC.Close 80.0818 9.2533 8.654 < 2e-16 ***
## GSPC.Volume 0.1649 0.1892 0.872 0.383
## GSPC.Adjusted NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 963.36 on 703 degrees of freedom
## Residual deviance: 413.56 on 698 degrees of freedom
## AIC: 425.56
##
## Number of Fisher Scoring iterations: 7
vip(logit_ex1, geom = "point") + theme_minimal()
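To go one step beyond the in-sample summary, the fitted model can be scored on the held-out test set. This is a sketch that goes slightly beyond the replicated section 14.3.3: caret's predict() returns class labels for a train object, and confusionMatrix() tabulates them against the realized directions (pred1 is an illustrative name).
# Out-of-sample check on the 30% hold-out created earlier.
pred1 <- predict(logit_ex1, newdata = d_test1)
confusionMatrix(pred1, factor(d_test1$Direction, levels = c("Down", "Up")))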