#Q1 (a) Load data mtcars and generate descriptive statistics

library(psych)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.4     ✔ rsample      1.1.1
## ✔ dials        1.2.0     ✔ tune         1.1.1
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.1.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.0     ✔ yardstick    1.1.0
## ✔ recipes      1.0.5     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ ggplot2::%+%()    masks psych::%+%()
## ✖ scales::alpha()   masks ggplot2::alpha(), psych::alpha()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(vip)

## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi

library(ISLR2)

Load mtcars data

data(mtcars)

Generate descriptive statistics

summary(mtcars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

#Q1 (b) Visualization check for possible transformations # create scatterplots of mpg vs. other predictors

pairs(mtcars, main = "Scatterplot Matrix of mtcars Variables")

#Q1 (c) Run multiple regression on mpg across all predictors

model_all <- lm(mpg ~ ., data = mtcars)
summary(model_all)

## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

#Q1 (d) Check for multicollinearity

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:psych':
## 
##     logit

vif(model_all)

##       cyl      disp        hp      drat        wt      qsec        vs        am 
## 15.373833 21.620241  9.832037  3.374620 15.164887  7.527958  4.965873  4.648487 
##      gear      carb 
##  5.357452  7.908747

#Q1 (e) Rerun the multiple regression by excluding disp, and then by excluding disp and cyl

model_excl_disp <- lm(mpg ~ . - disp, data = mtcars)
summary(model_excl_disp)

## 
## Call:
## lm(formula = mpg ~ . - disp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7863 -1.4055 -0.2635  1.2029  4.4753 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.55052   18.52585   0.677   0.5052  
## cyl          0.09627    0.99715   0.097   0.9240  
## hp          -0.01295    0.01834  -0.706   0.4876  
## drat         0.92864    1.60794   0.578   0.5694  
## wt          -2.62694    1.19800  -2.193   0.0392 *
## qsec         0.66523    0.69335   0.959   0.3478  
## vs           0.16035    2.07277   0.077   0.9390  
## am           2.47882    2.03513   1.218   0.2361  
## gear         0.74300    1.47360   0.504   0.6191  
## carb        -0.61686    0.60566  -1.018   0.3195  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.623 on 22 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8105 
## F-statistic: 15.73 on 9 and 22 DF,  p-value: 1.183e-07

model_excl_disp_cyl <- lm(mpg ~ . - disp - cyl, data = mtcars)
summary(model_excl_disp_cyl)

## 
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8187 -1.3903 -0.3045  1.2269  4.5183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 13.80810   12.88582   1.072   0.2950  
## hp          -0.01225    0.01649  -0.743   0.4650  
## drat         0.88894    1.52061   0.585   0.5645  
## wt          -2.60968    1.15878  -2.252   0.0342 *
## qsec         0.63983    0.62752   1.020   0.3185  
## vs           0.08786    1.88992   0.046   0.9633  
## am           2.42418    1.91227   1.268   0.2176  
## gear         0.69390    1.35294   0.513   0.6129  
## carb        -0.61286    0.59109  -1.037   0.3106  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8187 
## F-statistic:  18.5 on 8 and 23 DF,  p-value: 2.627e-08

#Q2 (a) fit multiple regression model to predict Sales using Price, Urban, and US

sales_model <- lm(Sales ~ Price + Urban + US, data = Carseats)

summarizing the model output

summary(sales_model)

## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

#Q2 (b) Provide an interpretation of each coefficient in the model

data(Carseats)

fit <- lm(Sales ~ ., data = Carseats)

Interpretation of coefficients

coefficients <- coef(fit)
interpretation <- c("Intercept" = "The expected value of Sales when all predictors are set to 0.",
                    "CompPrice" = "For a one-unit increase in CompPrice, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
                    "Income" = "For a one-unit increase in Income, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
                    "Advertising" = "For a one-unit increase in Advertising, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
                    "Population" = "For a one-unit increase in Population, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
                    "Price" = "For a one-unit increase in Price, Sales is expected to change by the corresponding coefficient, holding other predictors constant.",
                    "ShelveLocGood" = "Sales is expected to be higher by the corresponding coefficient when ShelveLoc is Good, compared to Medium (reference category), holding other predictors constant.",
                    "ShelveLocBad" = "Sales is expected to be lower by the corresponding coefficient when ShelveLoc is Bad, compared to Medium (reference category), holding other predictors constant.",
                    "UrbanYes" = "Sales is expected to be higher by the corresponding coefficient when Urban is Yes, compared to No (reference category), holding other predictors constant.",
                    "USYes" = "Sales is expected to be lower by the corresponding coefficient when US is Yes, compared to No (reference category), holding other predictors constant.")

Print interpretation of coefficients

cat("Interpretation of Coefficients:\n")

## Interpretation of Coefficients:

for (i in 1:length(coefficients)) {
  cat(names(coefficients)[i], ": ", interpretation[i], "\n")
}

## (Intercept) :  The expected value of Sales when all predictors are set to 0. 
## CompPrice :  For a one-unit increase in CompPrice, Sales is expected to change by the corresponding coefficient, holding other predictors constant. 
## Income :  For a one-unit increase in Income, Sales is expected to change by the corresponding coefficient, holding other predictors constant. 
## Advertising :  For a one-unit increase in Advertising, Sales is expected to change by the corresponding coefficient, holding other predictors constant. 
## Population :  For a one-unit increase in Population, Sales is expected to change by the corresponding coefficient, holding other predictors constant. 
## Price :  For a one-unit increase in Price, Sales is expected to change by the corresponding coefficient, holding other predictors constant. 
## ShelveLocGood :  Sales is expected to be higher by the corresponding coefficient when ShelveLoc is Good, compared to Medium (reference category), holding other predictors constant. 
## ShelveLocMedium :  Sales is expected to be lower by the corresponding coefficient when ShelveLoc is Bad, compared to Medium (reference category), holding other predictors constant. 
## Age :  Sales is expected to be higher by the corresponding coefficient when Urban is Yes, compared to No (reference category), holding other predictors constant. 
## Education :  Sales is expected to be lower by the corresponding coefficient when US is Yes, compared to No (reference category), holding other predictors constant. 
## UrbanYes :  NA 
## USYes :  NA

#Q2 (c) Write out the model in equation form, being careful to handle the qualitative variables properly # Fit a linear regression model

model <- lm(Sales ~ CompPrice + Income + Advertising + Population + Price + ShelveLoc + Urban + US, data = Carseats)

Print the coefficient estimates and their interpretations

coefs <- coef(model)
for (i in seq_along(coefs)) {
  print(paste0("Coefficient for '", names(coefs)[i], "': ", round(coefs[i], 3)))
}

## [1] "Coefficient for '(Intercept)': 2.258"
## [1] "Coefficient for 'CompPrice': 0.096"
## [1] "Coefficient for 'Income': 0.016"
## [1] "Coefficient for 'Advertising': 0.123"
## [1] "Coefficient for 'Population': 0"
## [1] "Coefficient for 'Price': -0.093"
## [1] "Coefficient for 'ShelveLocGood': 4.815"
## [1] "Coefficient for 'ShelveLocMedium': 1.855"
## [1] "Coefficient for 'UrbanYes': 0.067"
## [1] "Coefficient for 'USYes': -0.209"

#Q2 (d) For which of the predictors can you reject the null hypothesis 𝐻0: 𝛽𝑗 = 0? # - The coefficient for Price is significant at the 0.001 level, indicating that there is evidence of a relationship between Price and Sales. # - The coefficient for Urban is not significant at the 0.05 level, indicating that there is not strong evidence of a relationship between Urban and Sales. # - The coefficient for US is significant at the 0.001 level, indicating that there is evidence of a relationship between US and Sales.

#Q2 (e) fit smaller model using only Price and US predictors

smaller_model <- lm(Sales ~ Price + US, data = Carseats)

summarize the smaller model output

summary(smaller_model)

## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

#Q2 (f) assess model fit for both models # - The multiple regression model with all three predictors has an adjusted R-squared of 0.239, indicating that the model explains about 24% of the variability in Sales. # - The smaller model with only Price and US predictors has an adjusted R-squared of 0.234, indicating that the model explains about 23% of the variability in Sales. # - Both models have relatively low R-squared values, suggesting that there may be other factors that contribute to Sales that are not captured by these predictors.

#Q2 (g) obtain 95% confidence intervals for coefficients in smaller model

confint(smaller_model)

##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632

#Q2 (h) check for outliers and high leverage observations in the smaller model

par(mfrow=c(2,2))
plot(smaller_model)

leverage <- hatvalues(smaller_model)
high_leverage <- which(leverage > 2 * mean(leverage))
high_leverage

##  43 126 156 157 160 166 172 175 192 204 209 270 273 314 316 357 366 368 384 387 
##  43 126 156 157 160 166 172 175 192 204 209 270 273 314 316 357 366 368 384 387

SoftwarePackage_midterm

Narantuya

2023-04-16