# Question1
library(psych)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.3.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ scales::alpha() masks ggplot2::alpha(), psych::alpha()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(ISLR2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:psych':
##
## logit
data(mtcars)
describe(mtcars)
## vars n mean sd median trimmed mad min max range skew
## mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61
## cyl 2 32 6.19 1.79 6.00 6.23 2.97 4.00 8.00 4.00 -0.17
## disp 3 32 230.72 123.94 196.30 222.52 140.48 71.10 472.00 400.90 0.38
## hp 4 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73
## drat 5 32 3.60 0.53 3.70 3.58 0.70 2.76 4.93 2.17 0.27
## wt 6 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42
## qsec 7 32 17.85 1.79 17.71 17.83 1.42 14.50 22.90 8.40 0.37
## vs 8 32 0.44 0.50 0.00 0.42 0.00 0.00 1.00 1.00 0.24
## am 9 32 0.41 0.50 0.00 0.38 0.00 0.00 1.00 1.00 0.36
## gear 10 32 3.69 0.74 4.00 3.62 1.48 3.00 5.00 2.00 0.53
## carb 11 32 2.81 1.62 2.00 2.65 1.48 1.00 8.00 7.00 1.05
## kurtosis se
## mpg -0.37 1.07
## cyl -1.76 0.32
## disp -1.21 21.91
## hp -0.14 12.12
## drat -0.71 0.09
## wt -0.02 0.17
## qsec 0.34 0.32
## vs -2.00 0.09
## am -1.92 0.09
## gear -1.07 0.13
## carb 1.26 0.29
pairs(mtcars)

# Based on the pairs plot, predictors such as weight, displacement, and horsepower show strong, roughly monotone (negative) relationships with mpg, while discrete predictors such as the number of cylinders, vs, am, gear, and carb show non-linear, step-like patterns. Potential outliers are also visible, particularly in the horsepower vs. mpg and weight vs. mpg panels. Transformations may be needed for predictors with curved relationships, and the outliers should be examined carefully.
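# As a hedged sketch of the kind of transformation that could help (not part of the fitted
# models below; output not shown), a log transform of a curved predictor such as hp could be
# compared with the untransformed fit. The object name fit_log_hp is hypothetical:
fit_log_hp <- lm(mpg ~ log(hp) + wt, data = mtcars)  # illustration only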
model <- lm(mpg ~ ., data = mtcars)
summary(model)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
vif <- car::vif(model)
high_vif <- vif[vif > 10]
high_vif
## cyl disp wt
## 15.37383 21.62024 15.16489
model_excl_disp <- lm(mpg ~ . - disp, data = mtcars)
summary(model_excl_disp)
##
## Call:
## lm(formula = mpg ~ . - disp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7863 -1.4055 -0.2635 1.2029 4.4753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.55052 18.52585 0.677 0.5052
## cyl 0.09627 0.99715 0.097 0.9240
## hp -0.01295 0.01834 -0.706 0.4876
## drat 0.92864 1.60794 0.578 0.5694
## wt -2.62694 1.19800 -2.193 0.0392 *
## qsec 0.66523 0.69335 0.959 0.3478
## vs 0.16035 2.07277 0.077 0.9390
## am 2.47882 2.03513 1.218 0.2361
## gear 0.74300 1.47360 0.504 0.6191
## carb -0.61686 0.60566 -1.018 0.3195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.623 on 22 degrees of freedom
## Multiple R-squared: 0.8655, Adjusted R-squared: 0.8105
## F-statistic: 15.73 on 9 and 22 DF, p-value: 1.183e-07
model_excl_disp_cyl <- lm(mpg ~ . - disp - cyl, data = mtcars)
summary(model_excl_disp_cyl)
##
## Call:
## lm(formula = mpg ~ . - disp - cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8187 -1.3903 -0.3045 1.2269 4.5183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.80810 12.88582 1.072 0.2950
## hp -0.01225 0.01649 -0.743 0.4650
## drat 0.88894 1.52061 0.585 0.5645
## wt -2.60968 1.15878 -2.252 0.0342 *
## qsec 0.63983 0.62752 1.020 0.3185
## vs 0.08786 1.88992 0.046 0.9633
## am 2.42418 1.91227 1.268 0.2176
## gear 0.69390 1.35294 0.513 0.6129
## carb -0.61286 0.59109 -1.037 0.3106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.566 on 23 degrees of freedom
## Multiple R-squared: 0.8655, Adjusted R-squared: 0.8187
## F-statistic: 18.5 on 8 and 23 DF, p-value: 2.627e-08
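# As a follow-up check (sketch; output not shown), the VIFs of the reduced model can be
# recomputed to confirm that the severe collinearity flagged above has been removed:
car::vif(model_excl_disp_cyl)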
# Question2
library(ISLR2)
data(Carseats)
# (a)
model <- lm(Sales ~ Price + Urban + US, data = Carseats)
# (b) Provide an interpretation of each coefficient in the model
summary(model)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
# (c) Write out the model in equation form
# Model equation:
# Sales = β0 + β1 * Price + β2 * UrbanYes + β3 * USYes + ε,
# where UrbanYes and USYes are dummy variables (1 = Yes, 0 = No) for Urban and US, respectively.
# With the estimates above: Sales ≈ 13.043 - 0.054 * Price - 0.022 * UrbanYes + 1.201 * USYes.
# Holding the other predictors fixed, sales fall by about 0.054 thousand units per one-dollar price increase, US stores sell about 1.2 thousand more units, and the Urban effect is negligible.
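# As a hedged illustration of using the fitted equation (output not shown), predicted Sales for
# a hypothetical store, e.g. Price = 120 in an urban US location, could be obtained with:
predict(model, newdata = data.frame(Price = 120, Urban = "Yes", US = "Yes"))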
# (d) For which of the predictors can you reject the null hypothesis H0: βj = 0?
# Compare each coefficient's p-value in the summary output with the significance level (e.g., 0.05).
# Price (p < 2e-16) and USYes (p = 4.86e-06) have p-values well below 0.05, so the null hypothesis is rejected for them.
# UrbanYes (p = 0.936) does not, so there is no evidence that Urban is associated with Sales.
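# A quick way to pull the p-values directly from the fit (sketch; output not shown):
summary(model)$coefficients[, "Pr(>|t|)"]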
# (e) Fit a smaller model that only uses the predictors for which there is evidence of association with the outcome
smaller_model <- lm(Sales ~ Price + US, data = Carseats)
summary(smaller_model)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
# (f) How well do the models in (a) and (e) fit the data?
# Compare the R-squared values: both models explain about 23.9% of the variance in Sales, so
# dropping Urban costs essentially nothing, and the smaller model has a slightly higher
# adjusted R-squared (0.2354 vs 0.2335). Overall, both fits are modest.
summary(model)$r.squared
## [1] 0.2392754
summary(smaller_model)$r.squared
## [1] 0.2392629
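# A formal comparison of the two nested models could also be made with a partial F-test
# (sketch; output not shown):
anova(smaller_model, model)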
# (g) Using the model from (e), obtain 95% confidence intervals for the coefficient(s)
confint(smaller_model)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
# (h) Is there evidence of outliers or high leverage observations in the model from (e)?
# Check for outliers or high leverage observations
plot(smaller_model, which = c(4, 1, 5))



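# Numeric checks to complement the plots (sketch; output not shown): studentized residuals
# beyond roughly +/- 3 point to outliers, and leverage above about 2 * (p + 1) / n flags
# high-leverage observations.
rstudent(smaller_model)[abs(rstudent(smaller_model)) > 3]
hatvalues(smaller_model)[hatvalues(smaller_model) > 2 * length(coef(smaller_model)) / nrow(Carseats)]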
# Question3
# Load libraries
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Loading required package: TTR
##
## Attaching package: 'TTR'
##
## The following object is masked from 'package:dials':
##
## momentum
##
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(TTR)
library(xts)
# Download S&P500 data from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = "2019-01-01", to = "2023-12-31")
## [1] "GSPC"
d_sp500 = Cl(GSPC) # select close prices
colnames(d_sp500) = "Price"
# SMA
sma5 = lag(SMA(d_sp500, n = 5)) # lag() shifts the indicator by one day so only information available before today is used as a predictor
# EMA
ema5 = lag(EMA(d_sp500, n = 5))
# MACD
macd1 = lag(MACD(d_sp500))
# RSI
rsi1 = lag(RSI(d_sp500, 5))
# log returns
ret1 = lag(dailyReturn(d_sp500, type = "log"))
# price direction indicator
dir = ifelse(d_sp500$Price >= lag(d_sp500$Price, 5), 1, 0) # direction indicator: 1 if today's price is at or above the price 5 days earlier, 0 otherwise
# Combine all the indicators and response variable in a data frame
d_ex1 = cbind(dir, ret1, sma5, ema5, macd1, rsi1)
# Change column names
colnames(d_ex1) = c("Direction", "Ret", "SMA", "EMA", "MACD", "Signal", "RSI")
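# Note that MACD() returns two columns (macd and signal), which is why seven names are assigned
# above. A quick structural sanity check (sketch; output not shown):
tail(d_ex1)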
# Using the quantmod package
chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and Indicators")

addTA(d_ex1[, 1], col = 1, legend = "Direction") # Direction

# Plot each indicator separately
chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and Ret")

addTA(d_ex1[, "Ret"], on = NA, col = 2, legend = "Ret")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and SMA")

addTA(d_ex1[, "SMA"], on = NA, col = 3, legend = "SMA")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and EMA")

addTA(d_ex1[, "EMA"], on = NA, col = 4, legend = "EMA")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and MACD")

addTA(d_ex1[, "MACD"], on = NA, col = 5, legend = "MACD")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and Signal")

addTA(d_ex1[, "Signal"], on = NA, col = 6, legend = "Signal")

chartSeries(d_sp500, theme = "white", name = "S&P500 Closing Prices and RSI")

addTA(d_ex1[, "RSI"], on = NA, col = 7, legend = "RSI")

library(tidyr)
library(ggplot2)
# Create a dataset and convert data to long
d_plot = merge.xts(d_sp500, d_ex1)
# Remove NAs and then convert to long
d_plot = na.omit(d_plot)
# Convert to dataframe
d_plot = data.frame(Date = index(d_plot), coredata(d_plot))
d_plot_long = pivot_longer(d_plot, -c(Date, Direction), values_to = "value", names_to = "Indicator")
# Change direction to a factor
d_plot_long$Direction = as.factor(d_plot_long$Direction)
# Plot
(p2_ex = ggplot(d_plot_long, aes(Date, value, color = Indicator)) + geom_path() +
    facet_grid(Indicator ~ ., scales = "free") + theme_minimal())

p2_ex = ggplot(d_plot_long, aes(value, Indicator, fill = Direction)) +
geom_boxplot()
p2_ex + theme_minimal() + labs(title = "TA Indicators vs Price Direction") +
scale_fill_manual(name = "Price Direction", values = c("purple", "lightblue"))

# remove NAs
d_ex1 = na.omit(d_ex1)
# convert to data frame
d_ex1 = as.data.frame(d_ex1)
# convert direction to a factor for classification
d_ex1$Direction = as.factor(d_ex1$Direction)
idx1 = c(1:round(nrow(d_ex1) * 0.7)) # index for the first 70% of observations, which form the training set
d_train1 = d_ex1[idx1, ] #training set
d_test1 = d_ex1[-idx1, ] #testing set
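# The split above is chronological because the rows are in date order, so the model is always
# trained on earlier data than it is tested on. A quick check (sketch; output not shown,
# assuming the xts dates survived as row names when converting to a data frame):
range(rownames(d_train1))
range(rownames(d_test1))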
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:purrr':
##
## lift
set.seed(999)
# control
cntrl1 = trainControl(method = "timeslice", initialWindow = 250, horizon = 30,
fixedWindow = TRUE)
# preprocesing
prep1 = c("center", "scale")
# logistic regression
logit_ex1 = train(Direction ~ ., data = d_train1, method = "glm", family = "binomial",
trControl = cntrl1, preProcess = prep1)
logit_ex1 # rolling-origin resampling accuracy of the fitted model
## Generalized Linear Model
##
## 857 samples
## 6 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Rolling Forecasting Origin Resampling (30 held-out with a fixed window)
## Summary of sample sizes: 250, 250, 250, 250, 250, 250, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8247982 0.6095506
summary(logit_ex1$finalModel) #summary of the final model
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6702 0.1042 6.434 1.24e-10 ***
## Ret -0.1949 0.1315 -1.482 0.1384
## SMA -66.6941 9.2266 -7.228 4.88e-13 ***
## EMA 66.5705 9.2163 7.223 5.08e-13 ***
## MACD 1.3864 0.5452 2.543 0.0110 *
## Signal -1.4401 0.4976 -2.894 0.0038 **
## RSI 1.7669 0.2008 8.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1143.29 on 856 degrees of freedom
## Residual deviance: 612.22 on 850 degrees of freedom
## AIC: 626.22
##
## Number of Fisher Scoring iterations: 6
library(vip)
vip(logit_ex1, geom = "point") + theme_minimal()

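# The held-out test set d_test1 is not evaluated above. As a hedged sketch (output not shown),
# out-of-sample accuracy could be assessed with caret's predict() and confusionMatrix():
test_pred1 = predict(logit_ex1, newdata = d_test1)
confusionMatrix(test_pred1, d_test1$Direction)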