Midterm

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#Q1 (a) Load data mtcars and generate descriptive statistics
library(psych)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ ggplot2::%+%()    masks psych::%+%()
## ✖ scales::alpha()   masks ggplot2::alpha(), psych::alpha()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(vip)

## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi

library(ISLR2)
library(faraway)

## 
## Attaching package: 'faraway'
## 
## The following object is masked from 'package:psych':
## 
##     logit

# load the data
data(mtcars)
# summary statistics
describe(mtcars)

##      vars  n   mean     sd median trimmed    mad   min    max  range  skew
## mpg     1 32  20.09   6.03  19.20   19.70   5.41 10.40  33.90  23.50  0.61
## cyl     2 32   6.19   1.79   6.00    6.23   2.97  4.00   8.00   4.00 -0.17
## disp    3 32 230.72 123.94 196.30  222.52 140.48 71.10 472.00 400.90  0.38
## hp      4 32 146.69  68.56 123.00  141.19  77.10 52.00 335.00 283.00  0.73
## drat    5 32   3.60   0.53   3.70    3.58   0.70  2.76   4.93   2.17  0.27
## wt      6 32   3.22   0.98   3.33    3.15   0.77  1.51   5.42   3.91  0.42
## qsec    7 32  17.85   1.79  17.71   17.83   1.42 14.50  22.90   8.40  0.37
## vs      8 32   0.44   0.50   0.00    0.42   0.00  0.00   1.00   1.00  0.24
## am      9 32   0.41   0.50   0.00    0.38   0.00  0.00   1.00   1.00  0.36
## gear   10 32   3.69   0.74   4.00    3.62   1.48  3.00   5.00   2.00  0.53
## carb   11 32   2.81   1.62   2.00    2.65   1.48  1.00   8.00   7.00  1.05
##      kurtosis    se
## mpg     -0.37  1.07
## cyl     -1.76  0.32
## disp    -1.21 21.91
## hp      -0.14 12.12
## drat    -0.71  0.09
## wt      -0.02  0.17
## qsec     0.34  0.32
## vs      -2.00  0.09
## am      -1.92  0.09
## gear    -1.07  0.13
## carb     1.26  0.29

# (b) Visualization check for possible transformations
# create scatterplots of mpg vs. other predictors
pairs(mtcars, main = "Scatterplot Matrix of mtcars Variables")

# based on the plot, it appears that there might be a non-linear relationship between mpg and disp, and between mpg and hp.
# (c) Run multiple regression on mpg across all predictors
model_all <- lm(mpg ~ ., data = mtcars)
summary(model_all)

## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

# (d) Check for multicollinearity
vif(model_all)

##       cyl      disp        hp      drat        wt      qsec        vs        am 
## 15.373833 21.620241  9.832037  3.374620 15.164887  7.527958  4.965873  4.648487 
##      gear      carb 
##  5.357452  7.908747

# the predictors "disp" and "cyl" have VIF values greater than 10, indicating high multicollinearity.
# (e) Rerun the multiple regression by excluding disp, and then by excluding disp and cyl
model_excl_disp <- lm(mpg ~ . - disp, data = mtcars)
summary(model_excl_disp)

## 
## Call:
## lm(formula = mpg ~ . - disp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7863 -1.4055 -0.2635  1.2029  4.4753 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.55052   18.52585   0.677   0.5052  
## cyl          0.09627    0.99715   0.097   0.9240  
## hp          -0.01295    0.01834  -0.706   0.4876  
## drat         0.92864    1.60794   0.578   0.5694  
## wt          -2.62694    1.19800  -2.193   0.0392 *
## qsec         0.66523    0.69335   0.959   0.3478  
## vs           0.16035    2.07277   0.077   0.9390  
## am           2.47882    2.03513   1.218   0.2361  
## gear         0.74300    1.47360   0.504   0.6191  
## carb        -0.61686    0.60566  -1.018   0.3195  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.623 on 22 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8105 
## F-statistic: 15.73 on 9 and 22 DF,  p-value: 1.183e-07

model_excl_disp_cyl <- lm(mpg ~ . - disp - cyl, data = mtcars)
summary(model_excl_disp_cyl)

## 
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8187 -1.3903 -0.3045  1.2269  4.5183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 13.80810   12.88582   1.072   0.2950  
## hp          -0.01225    0.01649  -0.743   0.4650  
## drat         0.88894    1.52061   0.585   0.5645  
## wt          -2.60968    1.15878  -2.252   0.0342 *
## qsec         0.63983    0.62752   1.020   0.3185  
## vs           0.08786    1.88992   0.046   0.9633  
## am           2.42418    1.91227   1.268   0.2176  
## gear         0.69390    1.35294   0.513   0.6129  
## carb        -0.61286    0.59109  -1.037   0.3106  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8187 
## F-statistic:  18.5 on 8 and 23 DF,  p-value: 2.627e-08

#Q2
# load libraries
library(ISLR2)
# (a) fit multiple regression model to predict Sales using Price, Urban, and US
sales_model <- lm(Sales ~ Price + Urban + US, data = Carseats)
# summarize the model output
summary(sales_model)

## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

# (b) interpret the coefficients in the model
# - The intercept represents the expected sales when all other predictors are equal to 0.
# - For a one unit increase in Price, we expect a decrease of 0.0545 units in Sales, all else held constant.
# - The Urban predictor is a binary variable: if a store is located in an urban area, Urban = 1, otherwise Urban = 0. Therefore, the coefficient for Urban represents the difference in expected Sales between urban and non-urban stores, when all other predictors are equal.
# - The US predictor is also a binary variable: if a store is located in the US, US = 1, otherwise US = 0. Therefore, the coefficient for US represents the difference in expected Sales between US and non-US stores, when all other predictors are equal.
# (c) write out the model in equation form
# Sales = 13.04 - 0.0545 * Price + 0.1122 * UrbanYes + 1.2006 * USYes
# (d) test for significance of coefficients
# - The coefficient for Price is significant at the 0.001 level, indicating that there is evidence of a relationship between Price and Sales.
# - The coefficient for Urban is not significant at the 0.05 level, indicating that there is not strong evidence of a relationship between Urban and Sales.
# - The coefficient for US is significant at the 0.001 level, indicating that there is evidence of a relationship between US and Sales.
# (e) fit smaller model using only Price and US predictors
smaller_model <- lm(Sales ~ Price + US, data = Carseats)
# summarize the smaller model output
summary(smaller_model)

## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

# test for significance of coefficients in the smaller model
# - The coefficient for Price is significant at the 0.001 level, indicating that there is evidence of a relationship between Price and Sales.
# - The coefficient for US is significant at the 0.001 level, indicating that there is evidence of a relationship between US and Sales.
# (f) assess model fit for both models
# - The multiple regression model with all three predictors has an adjusted R-squared of 0.239, indicating that the model explains about 24% of the variability in Sales.
# - The smaller model with only Price and US predictors has an adjusted R-squared of 0.234, indicating that the model explains about 23% of the variability in Sales.
# - Both models have relatively low R-squared values, suggesting that there may be other factors that contribute to Sales that are not captured by these predictors.
# (g) obtain 95% confidence intervals for coefficients in smaller model
confint(smaller_model)

##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632

# (h) check for outliers and high leverage observations in the smaller model
par(mfrow=c(2,2))
plot(smaller_model)

leverage <- hatvalues(smaller_model)
high_leverage <- which(leverage > 2 * mean(leverage))
high_leverage

##  43 126 156 157 160 166 172 175 192 204 209 270 273 314 316 357 366 368 384 387 
##  43 126 156 157 160 166 172 175 192 204 209 270 273 314 316 357 366 368 384 387

#Q3
library(quantmod)

## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Loading required package: TTR
## 
## Attaching package: 'TTR'
## 
## The following object is masked from 'package:dials':
## 
##     momentum
## 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:faraway':
## 
##     melanoma
## 
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(vip)
# Download S&P 500 data from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = as.Date('2019-04-25'), to = as.Date('2023-04-25'))

## [1] "GSPC"

head(GSPC)

##            GSPC.Open GSPC.High GSPC.Low GSPC.Close GSPC.Volume GSPC.Adjusted
## 2019-04-25   2928.99   2933.10  2912.84    2926.17  3440010000       2926.17
## 2019-04-26   2925.81   2939.88  2917.56    2939.88  3264390000       2939.88
## 2019-04-29   2940.58   2949.52  2939.35    2943.03  3150390000       2943.03
## 2019-04-30   2937.14   2948.22  2924.11    2945.83  3939760000       2945.83
## 2019-05-01   2952.33   2954.13  2923.36    2923.73  3669330000       2923.73
## 2019-05-02   2922.16   2931.68  2900.50    2917.52  3802290000       2917.52

d_ex1 <- as.data.frame(GSPC)
d_ex1 <- na.omit(d_ex1)
d_ex1$Direction <- NA  # Create an empty Direction column first
for (i in 2:nrow(d_ex1)) {
  if (Cl(d_ex1)[i] > Cl(d_ex1)[i - 1]) {
    d_ex1$Direction[i] <- "Up"
  } else {
    d_ex1$Direction[i] <- "Down"
  }
}
d_ex1 <- na.omit(d_ex1)
idx1 <- c(1:round(nrow(d_ex1) * 0.7))  
d_train1 <- d_ex1[idx1, ]  
d_test1 <- d_ex1[-idx1, ]  
set.seed(999)
cntrl1 <- trainControl(method = "timeslice", initialWindow = 250, horizon = 30,
                       fixedWindow = TRUE)
prep1 <- c("center", "scale")
logit_ex1 <- train(Direction ~ ., data = d_train1, method = "glm", family = "binomial",
                   trControl = cntrl1, preProcess = prep1)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

logit_ex1

## Generalized Linear Model 
## 
## 704 samples
##   6 predictor
##   2 classes: 'Down', 'Up' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Rolling Forecasting Origin Resampling (30 held-out with a fixed window) 
## Summary of sample sizes: 250, 250, 250, 250, 250, 250, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8626667  0.7174923

summary(logit_ex1$finalModel)

## 
## Call:
## NULL
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.1789     0.1774   1.008    0.313    
## GSPC.Open     -125.1807    12.2416 -10.226  < 2e-16 ***
## GSPC.High       12.9307    10.1545   1.273    0.203    
## GSPC.Low        32.2950     7.4115   4.357 1.32e-05 ***
## GSPC.Close      80.0818     9.2533   8.654  < 2e-16 ***
## GSPC.Volume      0.1649     0.1892   0.872    0.383    
## GSPC.Adjusted        NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 963.36  on 703  degrees of freedom
## Residual deviance: 413.56  on 698  degrees of freedom
## AIC: 425.56
## 
## Number of Fisher Scoring iterations: 7

vip(logit_ex1, geom = "point") + theme_minimal()

Midterm

MIsheel Unurjargal

2024-04-25

R Markdown