#Q1 (a) Load data mtcars and generate descriptive statistics
library(psych)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.4 ✔ rsample 1.1.1
## ✔ dials 1.2.0 ✔ tune 1.1.1
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.1.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.0 ✔ yardstick 1.1.0
## ✔ recipes 1.0.5
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ scales::alpha() masks ggplot2::alpha(), psych::alpha()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(ISLR2)
data(mtcars)
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
#Q1 (b) Visualization check for possible transformations # create scatterplots of mpg vs. other predictors
pairs(mtcars, main = "Scatterplot Matrix of mtcars Variables")
#Q1 (c) Run multiple regression on mpg across all predictors
model_all <- lm(mpg ~ ., data = mtcars)
summary(model_all)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
#Q1 (d) Check for multicollinearity
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:psych':
##
## logit
vif(model_all)
## cyl disp hp drat wt qsec vs am
## 15.373833 21.620241 9.832037 3.374620 15.164887 7.527958 4.965873 4.648487
## gear carb
## 5.357452 7.908747
#Q1 (e) Rerun the multiple regression by excluding disp, and then by excluding disp and cyl
model_excl_disp <- lm(mpg ~ . - disp, data = mtcars)
summary(model_excl_disp)
##
## Call:
## lm(formula = mpg ~ . - disp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7863 -1.4055 -0.2635 1.2029 4.4753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.55052 18.52585 0.677 0.5052
## cyl 0.09627 0.99715 0.097 0.9240
## hp -0.01295 0.01834 -0.706 0.4876
## drat 0.92864 1.60794 0.578 0.5694
## wt -2.62694 1.19800 -2.193 0.0392 *
## qsec 0.66523 0.69335 0.959 0.3478
## vs 0.16035 2.07277 0.077 0.9390
## am 2.47882 2.03513 1.218 0.2361
## gear 0.74300 1.47360 0.504 0.6191
## carb -0.61686 0.60566 -1.018 0.3195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.623 on 22 degrees of freedom
## Multiple R-squared: 0.8655, Adjusted R-squared: 0.8105
## F-statistic: 15.73 on 9 and 22 DF, p-value: 1.183e-07
model_excl_disp_cyl <- lm(mpg ~ . - disp - cyl, data = mtcars)
summary(model_excl_disp_cyl)
##
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8187 -1.3903 -0.3045 1.2269 4.5183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.80810 12.88582 1.072 0.2950
## hp -0.01225 0.01649 -0.743 0.4650
## drat 0.88894 1.52061 0.585 0.5645
## wt -2.60968 1.15878 -2.252 0.0342 *
## qsec 0.63983 0.62752 1.020 0.3185
## vs 0.08786 1.88992 0.046 0.9633
## am 2.42418 1.91227 1.268 0.2176
## gear 0.69390 1.35294 0.513 0.6129
## carb -0.61286 0.59109 -1.037 0.3106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared: 0.8655, Adjusted R-squared: 0.8187
## F-statistic: 18.5 on 8 and 23 DF, p-value: 2.627e-08
#Q2 (a) fit multiple regression model to predict Sales using Price, Urban, and US
sales_model <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(sales_model)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
#Q2 (b) Provide an interpretation of each coefficient in the model
data(Carseats)
fit <- lm(Sales ~ ., data = Carseats)
coefficients <- coef(fit)
interpretation <- c("Intercept" = "The expected value of Sales when all predictors are set to 0.",
"CompPrice" = "For a one-unit increase in CompPrice, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
"Income" = "For a one-unit increase in Income, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
"Advertising" = "For a one-unit increase in Advertising, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
"Population" = "For a one-unit increase in Population, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
"Price" = "For a one-unit increase in Price, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
"ShelveLocGood" = "Sales is expected to be higher by the corresponding coefficient when ShelveLoc is Good, compared to Medium (reference category), holding other predictors constant.",
"ShelveLocBad" = "Sales is expected to be lower by the corresponding coefficient when ShelveLoc is Bad, compared to Medium (reference category), holding other predictors constant.",
"UrbanYes" = "Sales is expected to be higher by the corresponding coefficient when Urban is Yes, compared to No (reference category), holding other predictors constant.",
"USYes" = "Sales is expected to be lower by the corresponding coefficient when US is Yes, compared to No (reference category), holding other predictors constant.")
cat("Interpretation of Coefficients:\n")
## Interpretation of Coefficients:
for (i in 1:length(coefficients)) {
cat(names(coefficients)[i], ": ", interpretation[i], "\n")
}
## (Intercept) : The expected value of Sales when all predictors are set to 0.
## CompPrice : For a one-unit increase in CompPrice, Sales is expected to change by the corresponding coefficient, holding other predictors constant.
## Income : For a one-unit increase in Income, Sales is expected to change by the corresponding coefficient, holding other predictors constant.
## Advertising : For a one-unit increase in Advertising, Sales is expected to change by the corresponding coefficient, holding other predictors constant.
## Population : For a one-unit increase in Population, Sales is expected to change by the corresponding coefficient, holding other predictors constant.
## Price : For a one-unit increase in Price, Sales is expected to change by the corresponding coefficient, holding other predictors constant.
## ShelveLocGood : Sales is expected to be higher by the corresponding coefficient when ShelveLoc is Good, compared to Medium (reference category), holding other predictors constant.
## ShelveLocMedium : Sales is expected to be lower by the corresponding coefficient when ShelveLoc is Bad, compared to Medium (reference category), holding other predictors constant.
## Age : Sales is expected to be higher by the corresponding coefficient when Urban is Yes, compared to No (reference category), holding other predictors constant.
## Education : Sales is expected to be lower by the corresponding coefficient when US is Yes, compared to No (reference category), holding other predictors constant.
## UrbanYes : NA
## USYes : NA
#Q2 (c) Write out the model in equation form, being careful to handle the qualitative variables properly # Fit a linear regression model
model <- lm(Sales ~ CompPrice + Income + Advertising + Population + Price + ShelveLoc + Urban + US, data = Carseats)
coefs <- coef(model)
for (i in seq_along(coefs)) {
print(paste0("Coefficient for '", names(coefs)[i], "': ", round(coefs[i], 3)))
}
## [1] "Coefficient for '(Intercept)': 2.258"
## [1] "Coefficient for 'CompPrice': 0.096"
## [1] "Coefficient for 'Income': 0.016"
## [1] "Coefficient for 'Advertising': 0.123"
## [1] "Coefficient for 'Population': 0"
## [1] "Coefficient for 'Price': -0.093"
## [1] "Coefficient for 'ShelveLocGood': 4.815"
## [1] "Coefficient for 'ShelveLocMedium': 1.855"
## [1] "Coefficient for 'UrbanYes': 0.067"
## [1] "Coefficient for 'USYes': -0.209"
#Q2 (d) For which of the predictors can you reject the null hypothesis 𝐻0: 𝛽𝑗 = 0? # - The coefficient for Price is significant at the 0.001 level, indicating that there is evidence of a relationship between Price and Sales. # - The coefficient for Urban is not significant at the 0.05 level, indicating that there is not strong evidence of a relationship between Urban and Sales. # - The coefficient for US is significant at the 0.001 level, indicating that there is evidence of a relationship between US and Sales.
#Q2 (e) fit smaller model using only Price and US predictors
smaller_model <- lm(Sales ~ Price + US, data = Carseats)
summary(smaller_model)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
#Q2 (f) assess model fit for both models # - The multiple regression model with all three predictors has an adjusted R-squared of 0.239, indicating that the model explains about 24% of the variability in Sales. # - The smaller model with only Price and US predictors has an adjusted R-squared of 0.234, indicating that the model explains about 23% of the variability in Sales. # - Both models have relatively low R-squared values, suggesting that there may be other factors that contribute to Sales that are not captured by these predictors.
#Q2 (g) obtain 95% confidence intervals for coefficients in smaller model
confint(smaller_model)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
#Q2 (h) check for outliers and high leverage observations in the smaller model
par(mfrow=c(2,2))
plot(smaller_model)
leverage <- hatvalues(smaller_model)
high_leverage <- which(leverage > 2 * mean(leverage))
high_leverage
## 43 126 156 157 160 166 172 175 192 204 209 270 273 314 316 357 366 368 384 387
## 43 126 156 157 160 166 172 175 192 204 209 270 273 314 316 357 366 368 384 387