midterm

# Question1
library(psych)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ ggplot2::%+%()    masks psych::%+%()
## ✖ scales::alpha()   masks ggplot2::alpha(), psych::alpha()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

library(vip)

## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi

library(ISLR2)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:psych':
## 
##     logit

data(mtcars)
describe(mtcars)

##      vars  n   mean     sd median trimmed    mad   min    max  range  skew
## mpg     1 32  20.09   6.03  19.20   19.70   5.41 10.40  33.90  23.50  0.61
## cyl     2 32   6.19   1.79   6.00    6.23   2.97  4.00   8.00   4.00 -0.17
## disp    3 32 230.72 123.94 196.30  222.52 140.48 71.10 472.00 400.90  0.38
## hp      4 32 146.69  68.56 123.00  141.19  77.10 52.00 335.00 283.00  0.73
## drat    5 32   3.60   0.53   3.70    3.58   0.70  2.76   4.93   2.17  0.27
## wt      6 32   3.22   0.98   3.33    3.15   0.77  1.51   5.42   3.91  0.42
## qsec    7 32  17.85   1.79  17.71   17.83   1.42 14.50  22.90   8.40  0.37
## vs      8 32   0.44   0.50   0.00    0.42   0.00  0.00   1.00   1.00  0.24
## am      9 32   0.41   0.50   0.00    0.38   0.00  0.00   1.00   1.00  0.36
## gear   10 32   3.69   0.74   4.00    3.62   1.48  3.00   5.00   2.00  0.53
## carb   11 32   2.81   1.62   2.00    2.65   1.48  1.00   8.00   7.00  1.05
##      kurtosis    se
## mpg     -0.37  1.07
## cyl     -1.76  0.32
## disp    -1.21 21.91
## hp      -0.14 12.12
## drat    -0.71  0.09
## wt      -0.02  0.17
## qsec     0.34  0.32
## vs      -2.00  0.09
## am      -1.92  0.09
## gear    -1.07  0.13
## carb     1.26  0.29

pairs(mtcars)

#Based on the pairs plot, some predictor variables like horsepower, weight, and displacement show linear relationships with mpg, while others like cylinders and engine displacement display non-linear patterns. Outliers are also observed, particularly in horsepower vs. mpg and weight vs. mpg plots. Transformations may be needed for variables with non-linear patterns, and outliers should be carefully addressed.
model <- lm(mpg ~ ., data = mtcars)
summary(model)

## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

vif <- car::vif(model)
high_vif <- vif[vif > 10]
high_vif

##      cyl     disp       wt 
## 15.37383 21.62024 15.16489

model_excl_disp <- lm(mpg ~ . - disp, data = mtcars)
summary(model_excl_disp)

## 
## Call:
## lm(formula = mpg ~ . - disp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7863 -1.4055 -0.2635  1.2029  4.4753 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.55052   18.52585   0.677   0.5052  
## cyl          0.09627    0.99715   0.097   0.9240  
## hp          -0.01295    0.01834  -0.706   0.4876  
## drat         0.92864    1.60794   0.578   0.5694  
## wt          -2.62694    1.19800  -2.193   0.0392 *
## qsec         0.66523    0.69335   0.959   0.3478  
## vs           0.16035    2.07277   0.077   0.9390  
## am           2.47882    2.03513   1.218   0.2361  
## gear         0.74300    1.47360   0.504   0.6191  
## carb        -0.61686    0.60566  -1.018   0.3195  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.623 on 22 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8105 
## F-statistic: 15.73 on 9 and 22 DF,  p-value: 1.183e-07

model_excl_disp_cyl <- lm(mpg ~ . - disp - cyl, data = mtcars)
summary(model_excl_disp_cyl)

## 
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8187 -1.3903 -0.3045  1.2269  4.5183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 13.80810   12.88582   1.072   0.2950  
## hp          -0.01225    0.01649  -0.743   0.4650  
## drat         0.88894    1.52061   0.585   0.5645  
## wt          -2.60968    1.15878  -2.252   0.0342 *
## qsec         0.63983    0.62752   1.020   0.3185  
## vs           0.08786    1.88992   0.046   0.9633  
## am           2.42418    1.91227   1.268   0.2176  
## gear         0.69390    1.35294   0.513   0.6129  
## carb        -0.61286    0.59109  -1.037   0.3106  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.8187 
## F-statistic:  18.5 on 8 and 23 DF,  p-value: 2.627e-08

# Question2
library(ISLR2)
data(Carseats)

# (a) 
model <- lm(Sales ~ Price + Urban + US, data = Carseats)

# (b) Provide an interpretation of each coefficient in the model
summary(model)

## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

# (c) Write out the model in equation form
# Model equation: 
# Sales = β0 + β1 * Price + β2 * Urban_Yes + β3 * US_Yes + ε
# Where Urban_Yes and US_Yes are dummy variables representing Urban and US, respectively.

# (d) For which of the predictors can you reject the null hypothesis H0:β=0?
# Use the summary output to look at the p-values for each coefficient.
# If the p-value is less than the significance level (e.g., 0.05), reject the null hypothesis and conclude that the corresponding coefficient is significantly different from zero.

# (e) Fit a smaller model that only uses the predictors for which there is evidence of association with the outcome
smaller_model <- lm(Sales ~ Price + US, data = Carseats)
summary(smaller_model)

## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

# (f) How well do the models in (a) and (e) fit the data?
# Compare the R-squared values of the models to assess how well they fit the data.
# Higher R-squared values indicate better fit.
summary(model)$r.squared

## [1] 0.2392754

summary(smaller_model)$r.squared

## [1] 0.2392629

# (g) Using the model from (e), obtain 95% confidence intervals for the coefficient(s)
confint(smaller_model)

##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632

# (h) Is there evidence of outliers or high leverage observations in the model from (e)?
# Check for outliers or high leverage observations
plot(smaller_model, which = c(4, 1, 5))

#Question3 
# Load libraries
library(quantmod)

## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Loading required package: TTR
## 
## Attaching package: 'TTR'
## 
## The following object is masked from 'package:dials':
## 
##     momentum
## 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(TTR)
library(xts)

# Download S&P500 data from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = "2019-01-01", to = "2023-12-31")

## [1] "GSPC"

d_sp500 = Cl(GSPC)  # select close prices
colnames(d_sp500) = "Price"

# SMA
sma5 = lag(SMA(d_sp500, n = 5))  # notice the use of the lag function to take lagged values
# EMA
ema5 = lag(EMA(d_sp500, n = 5))
# MACD
macd1 = lag(MACD(d_sp500))
# RSI
rsi1 = lag(RSI(d_sp500, 5))
# log returns
ret1 = lag(dailyReturn(d_sp500, type = "log"))

# price direction indicator
dir = ifelse(d_sp500$Price >= lag(d_sp500$Price, 5), 1, 0)  # direction variable compared to 5 day before price

# Combine all the indicators and response variable in a data frame
d_ex1 = cbind(dir, ret1, sma5, ema5, macd1, rsi1)

# Change column names
colnames(d_ex1) = c("Direction", "Ret", "SMA", "EMA", "MACD", "Signal", "RSI")
# Using the quantmod package
chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and Indicators")

addTA(d_ex1[, 1], col = 1, legend = "Direction")  # Direction

# Plot each indicator separately
chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and Ret")

addTA(d_ex1[, "Ret"], on = NA, col = 2, legend = "Ret")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and SMA")

addTA(d_ex1[, "SMA"], on = NA, col = 3, legend = "SMA")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and EMA")

addTA(d_ex1[, "EMA"], on = NA, col = 4, legend = "EMA")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and MACD")

addTA(d_ex1[, "MACD"], on = NA, col = 5, legend = "MACD")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and Signal")

addTA(d_ex1[, "Signal"], on = NA, col = 6, legend = "Signal")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and RSI")

addTA(d_ex1[, "RSI"], on = NA, col = 7, legend = "RSI")

library(tidyr)
library(ggplot2)

# Create a dataset and convert data to long
d_plot = merge.xts(d_sp500, d_ex1)

# Remove NAs and then convert to long
d_plot = na.omit(d_plot)

# Convert to dataframe
d_plot = data.frame(Date = index(d_plot), coredata(d_plot))
d_plot_long = pivot_longer(d_plot, -c(Date, Direction), values_to = "value", names_to = "Indicator")

# Change direction to a factor
d_plot_long$Direction = as.factor(d_plot_long$Direction)

# Plot
(p2_ex = ggplot(d_plot_long, aes(Date, value, color = Indicator)) + geom_path(stat = "identity") +
    facet_grid(Indicator ~ ., scale = "free") + theme_minimal())

p2_ex = ggplot(d_plot_long, aes(value, Indicator, fill = Direction)) +
  geom_boxplot()

p2_ex + theme_minimal() + labs(title = "TA Indicators vs Price Direction") +
  scale_fill_manual(name = "Price Direction", values = c("purple", "lightblue"))

# remove NAs
d_ex1 = na.omit(d_ex1)
# convert to data frame
d_ex1 = as.data.frame(d_ex1)
# convert direction to a factor for classification

d_ex1$Direction = as.factor(d_ex1$Direction)

idx1 = c(1:round(nrow(d_ex1) * 0.7))  #create index for first 70% values to be in the testing set
d_train1 = d_ex1[idx1, ]  #training set
d_test1 = d_ex1[-idx1, ]  #testing set


library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(999)
# control
cntrl1 = trainControl(method = "timeslice", initialWindow = 250, horizon = 30,
                      fixedWindow = TRUE)
# preprocesing
prep1 = c("center", "scale")
# logistic regression
logit_ex1 = train(Direction ~ ., data = d_train1, method = "glm", family = "binomial",
                  trControl = cntrl1, preProcess = prep1)
logit_ex1  #final model accuracy

## Generalized Linear Model 
## 
## 857 samples
##   6 predictor
##   2 classes: '0', '1' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Rolling Forecasting Origin Resampling (30 held-out with a fixed window) 
## Summary of sample sizes: 250, 250, 250, 250, 250, 250, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8247982  0.6095506

summary(logit_ex1$finalModel)  #summary of the final model

## 
## Call:
## NULL
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.6702     0.1042   6.434 1.24e-10 ***
## Ret          -0.1949     0.1315  -1.482   0.1384    
## SMA         -66.6941     9.2266  -7.228 4.88e-13 ***
## EMA          66.5705     9.2163   7.223 5.08e-13 ***
## MACD          1.3864     0.5452   2.543   0.0110 *  
## Signal       -1.4401     0.4976  -2.894   0.0038 ** 
## RSI           1.7669     0.2008   8.800  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1143.29  on 856  degrees of freedom
## Residual deviance:  612.22  on 850  degrees of freedom
## AIC: 626.22
## 
## Number of Fisher Scoring iterations: 6

library(vip)
vip(logit_ex1, geom = "point") + theme_minimal()

midterm

2024-04-25