# Install required packages (only when they are not already available)
# Guarding with requireNamespace() avoids re-downloading the package on every
# run of the script; the HTTPS mirror avoids fetching code over plain HTTP.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", repos = "https://cloud.r-project.org")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
 # Install ggplot2 only when it is missing, and fetch it over HTTPS.
 if (!requireNamespace("ggplot2", quietly = TRUE)) {
   install.packages("ggplot2", repos = "https://cloud.r-project.org")
 }
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
# Simulate data for BARIUM
# The seed is fixed so the simulated sample is reproducible. The rnorm() calls
# must stay in this order (trend, x1, x2, error term): each one consumes the
# random stream, so reordering them would change every simulated value.
set.seed(123)
n <- 100

# Regressors.
# NOTE(review): `trend` is drawn as i.i.d. normal noise; it is not computed
# from `time`, which is stored below as a separate column.
trend <- rnorm(n, mean = 0.5, sd = 0.1)
x1 <- rnorm(n, mean = 2, sd = 1)
x2 <- rnorm(n, mean = 1, sd = 0.5)

# Response: linear in the regressors plus normal noise with sd 0.5.
y <- 3 + 0.5 * trend + 0.8 * x1 - 0.4 * x2 + rnorm(n, sd = 0.5)

# Deterministic columns: observation index and a month factor cycling 1..12.
time <- seq_len(n)
monthly_dummy <- factor(rep_len(seq_len(12), n))

barium_data <- data.frame(
  time = time,
  trend = trend,
  x1 = x1,
  x2 = x2,
  monthly_dummy = monthly_dummy,
  y = y
)

# Exercise C2
# (i) Add a linear time trend and check significance
# OLS of y on trend, x1 and x2; the coefficient table printed by summary()
# gives the t-statistic and p-value for each regressor.
model1 <- lm(
  formula = y ~ trend + x1 + x2,
  data = barium_data
)
summary(model1)
## 
## Call:
## lm(formula = y ~ trend + x1 + x2, data = barium_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24569 -0.32696  0.02832  0.33516  1.26605 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.14013    0.35697   8.797 5.71e-14 ***
## trend        0.22275    0.58438   0.381    0.704    
## x1           0.82311    0.05473  15.040  < 2e-16 ***
## x2          -0.45739    0.11223  -4.075 9.46e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5258 on 96 degrees of freedom
## Multiple R-squared:  0.7143, Adjusted R-squared:  0.7053 
## F-statistic:    80 on 3 and 96 DF,  p-value: < 2.2e-16
# (ii) Test for joint significance of variables (excluding the trend)
# H0: the coefficients on x1 and x2 are both zero. The restricted model
# therefore keeps only the trend. (Dropping the trend instead, i.e. fitting
# y ~ x1 + x2, would test the trend alone and merely repeat its t-test from
# part (i).)
model2 <- lm(y ~ trend, data = barium_data)
# Restricted model first, so Df and Sum of Sq are reported as positive values.
anova(model2, model1)
# (Re-run the script to display the F-test table for this comparison.)
# (iii) Add monthly dummies and check for seasonality
# monthly_dummy is a factor, so lm() expands it into 11 indicator columns with
# month 1 as the reference level.
model3 <- lm(y ~ trend + x1 + x2 + monthly_dummy, data = barium_data)
# Seasonality is a joint hypothesis (all month effects are zero), so it is
# tested with a nested-model F-test against the model without the dummies;
# the individual t-tests in the summary do not answer that question.
anova(model1, model3)
summary(model3)
## 
## Call:
## lm(formula = y ~ trend + x1 + x2 + monthly_dummy, data = barium_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.29007 -0.33084  0.02293  0.36298  1.26445 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.70186    0.44819   6.028 4.14e-08 ***
## trend            0.43176    0.61444   0.703 0.484174    
## x1               0.82518    0.05820  14.178  < 2e-16 ***
## x2              -0.45454    0.12185  -3.730 0.000344 ***
## monthly_dummy2   0.48244    0.25771   1.872 0.064639 .  
## monthly_dummy3   0.35854    0.25400   1.412 0.161718    
## monthly_dummy4   0.18873    0.25160   0.750 0.455262    
## monthly_dummy5   0.36254    0.26485   1.369 0.174647    
## monthly_dummy6   0.36895    0.26222   1.407 0.163071    
## monthly_dummy7   0.42900    0.25913   1.656 0.101507    
## monthly_dummy8   0.05621    0.26987   0.208 0.835498    
## monthly_dummy9   0.52666    0.26186   2.011 0.047466 *  
## monthly_dummy10  0.46657    0.25951   1.798 0.075742 .  
## monthly_dummy11  0.36673    0.26379   1.390 0.168092    
## monthly_dummy12  0.32683    0.26806   1.219 0.226121    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5317 on 85 degrees of freedom
## Multiple R-squared:  0.7413, Adjusted R-squared:  0.6986 
## F-statistic: 17.39 on 14 and 85 DF,  p-value: < 2.2e-16
# Simulate data for VOLAT
# Re-seed so this dataset is reproducible on its own. The three series are
# drawn independently, in this order, reusing the sample size n defined above.
set.seed(123)
rsp500 <- rnorm(n, mean = 5, sd = 2)
pcip <- rnorm(n, mean = 0.3, sd = 0.1)
i3 <- rnorm(n, mean = 0.05, sd = 0.02)

volat_data <- data.frame(
  rsp500 = rsp500,
  pcip = pcip,
  i3 = i3
)

# Exercise C9
# (i) Fit the regression model
# OLS of rsp500 on pcip and i3.
model_volat <- lm(
  formula = rsp500 ~ pcip + i3,
  data = volat_data
)

# (ii) Report coefficients
# summary() prints estimates, standard errors, t-values and p-values.
summary(model_volat)
## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5775 -1.2295 -0.1682  1.1247  4.6838 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.0735     0.7584   8.009 2.57e-12 ***
## pcip         -0.8614     1.8997  -0.453    0.651    
## i3          -12.2795     9.6698  -1.270    0.207    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.827 on 97 degrees of freedom
## Multiple R-squared:  0.01877,    Adjusted R-squared:  -0.001466 
## F-statistic: 0.9276 on 2 and 97 DF,  p-value: 0.399
# (iii) Identify statistically significant variables
# Named logical vector: TRUE where the coefficient's p-value is below 0.05.
# The vector has one entry per row of the coefficient table, so it includes
# the intercept as well as the regressors.
significant_vars <-
  coef(summary(model_volat))[, "Pr(>|t|)"] < 0.05
significant_vars
## (Intercept)        pcip          i3 
##        TRUE       FALSE       FALSE
# (iv) Analyze predictability
# Fixed interpretive statement; it is not computed from the fitted model.
predictability <- paste(
  "Returns on the S&P 500 are predictable if significant variables",
  "are identified and have expected signs."
)
predictability
## [1] "Returns on the S&P 500 are predictable if significant variables are identified and have expected signs."