This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

#install.packages("zip")
#olsrr.install.packages("zip")
# install.packages("corrplot")
# install.packages("olsrr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.1.0     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Load the data set; read.csv() already returns a data frame.
df <- read.csv("mlr11.csv")
head(df)
# Response variable
y <- df$DJIA
# Candidate predictors, pulled out of df as standalone vectors
x_CRUDE <- df$CRUDE
x_INTEREST <- df$INTEREST
x_FOREIGN <- df$FOREIGN
x_GNP <- df$GNP
x_PURCHASE <- df$PURCHASE
x_CONSUMER <- df$CONSUMER
# Scatter plots of the response y against each candidate predictor.
# ggplot2 only needs to be attached once for all six plots.
library(ggplot2)

# y vs. x_CRUDE
ggplot(data = df, mapping = aes(x = x_CRUDE, y = y)) +
  geom_point()

# y vs. x_INTEREST
ggplot(data = df, mapping = aes(x = x_INTEREST, y = y)) +
  geom_point()

# y vs. x_FOREIGN
ggplot(data = df, mapping = aes(x = x_FOREIGN, y = y)) +
  geom_point()

# y vs. x_GNP
ggplot(data = df, mapping = aes(x = x_GNP, y = y)) +
  geom_point()

# y vs. x_PURCHASE
ggplot(data = df, mapping = aes(x = x_PURCHASE, y = y)) +
  geom_point()

# y vs. x_CONSUMER
ggplot(data = df, mapping = aes(x = x_CONSUMER, y = y)) +
  geom_point()

# Pearson correlations among all six predictors and the response,
# drawn as a colour-coded grid.
corr_matrix <- cor(
  cbind(x_CRUDE, x_INTEREST, x_FOREIGN, x_GNP, x_PURCHASE, x_CONSUMER, y),
  method = "pearson"
)
corrplot(corr_matrix, method = "color")

# Full model: regress y on all six candidate predictors.
mod <- lm(
  y ~ x_CRUDE + x_INTEREST + x_FOREIGN + x_GNP + x_PURCHASE + x_CONSUMER
)
# Keep the summary object; later code reads its residuals.
mod_summary <- summary(object = mod)
mod_summary
## 
## Call:
## lm(formula = y ~ x_CRUDE + x_INTEREST + x_FOREIGN + x_GNP + x_PURCHASE + 
##     x_CONSUMER)
## 
## Residuals:
##        1        2        3        4        5        6        7        8 
##  -24.444  -58.737    1.681   93.884   68.461  111.175 -108.784   19.527 
##        9       10       11       12 
##  -55.448 -162.523  -54.304  169.512 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  927.4178  5844.0938   0.159    0.880
## x_CRUDE        6.0855    30.8266   0.197    0.851
## x_INTEREST   -82.3673   101.1515  -0.814    0.452
## x_FOREIGN     11.5185    11.3482   1.015    0.357
## x_GNP         -0.2575     2.1585  -0.119    0.910
## x_PURCHASE   465.7685  2100.9056   0.222    0.833
## x_CONSUMER    -0.4294     5.0945  -0.084    0.936
## 
## Residual standard error: 143.7 on 5 degrees of freedom
## Multiple R-squared:  0.9529, Adjusted R-squared:  0.8964 
## F-statistic: 16.87 on 6 and 5 DF,  p-value: 0.003537
# Print the fitted full model (its call and coefficient estimates).
mod
## 
## Call:
## lm(formula = y ~ x_CRUDE + x_INTEREST + x_FOREIGN + x_GNP + x_PURCHASE + 
##     x_CONSUMER)
## 
## Coefficients:
## (Intercept)      x_CRUDE   x_INTEREST    x_FOREIGN        x_GNP   x_PURCHASE  
##    927.4178       6.0855     -82.3673      11.5185      -0.2575     465.7685  
##  x_CONSUMER  
##     -0.4294
# Lay the lm diagnostic plots for the full model out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(mod)

Calculate MSE

# In-sample mean squared residual: SSE / n.
# It divides by the number of observations, so it can only shrink as
# predictors are added, and it is not the residual "Mean Sq" reported by
# anova() below.
mean(mod_summary$residuals^2)
## [1] 8608.204
# Residual mean square: SSE / residual degrees of freedom (sigma^2).
# This is the regression MSE; it penalises model size and is the figure to
# compare across models with different numbers of predictors.
mod_summary$sigma^2
anova(mod)

All Subset Model

# Fit every subset of the full model's predictors and plot the fit
# criteria of the candidates.
all_subset <- ols_step_all_possible(model = mod)
plot(all_subset)

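Full Subset Model: k = 6 predictors give 2^k = 64 possible subsets; all-subsets regression fits the 2^k - 1 = 63 that contain at least one predictor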

# Print the table of all candidate subset models and their fit criteria.
all_subset

Stepwise Forward Regression

# Forward stepwise selection on the full model, using p-values as the
# entry criterion.
forward_model <- ols_step_forward_p(model = mod)
# forward_model <- ols_step_forward_aic(mod)
plot(forward_model)

# Model retained by the forward procedure
forward_model$model
## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
## (Intercept)   x_CONSUMER   x_INTEREST    x_FOREIGN  
##    1889.230       -1.872     -105.100        9.644
# Diagnostic plots for the selected model on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(forward_model$model)

Backward Stepwise Regression

# Backward stepwise elimination on the full model, using p-values as the
# removal criterion.
backward <- ols_step_backward_p(model = mod)
plot(backward)

# Model retained by the backward procedure
backward$model
## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
## (Intercept)   x_INTEREST    x_FOREIGN        x_GNP  
##   2127.2253     -67.8511      13.3487      -0.5984
# Diagnostic plots for the selected model on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(backward$model)

Hybrid Stepwise Regression

# Hybrid (both-direction) stepwise selection on the full model, using
# p-values to add and remove predictors.
both_model <- ols_step_both_p(model = mod)
plot(both_model)

# Model retained by the hybrid procedure
both_model$model
## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
## (Intercept)   x_CONSUMER   x_INTEREST    x_FOREIGN  
##    1889.230       -1.872     -105.100        9.644
# Diagnostic plots for the selected model on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(both_model$model)

#final_model <- train(mod,data=df, method = "lm", trControl = train_control)
# Take the model chosen by the hybrid stepwise procedure and summarise it.
new_model <- both_model$model
final_mod_summary <- summary(object = new_model)
final_mod_summary
## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -130.62  -67.72  -24.76   64.44  197.61 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1889.230    422.894   4.467  0.00209 **
## x_CONSUMER    -1.872      1.382  -1.354  0.21260   
## x_INTEREST  -105.100     21.039  -4.995  0.00106 **
## x_FOREIGN      9.644      2.904   3.321  0.01052 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 120.8 on 8 degrees of freedom
## Multiple R-squared:  0.9468, Adjusted R-squared:  0.9269 
## F-statistic: 47.46 on 3 and 8 DF,  p-value: 1.929e-05
# In-sample mean squared residual of the hybrid-selected model: SSE / n.
# It is not adjusted for the number of fitted coefficients.
mean(final_mod_summary$residuals^2)
## [1] 9727.438
# Residual mean square: SSE / residual degrees of freedom (sigma^2).
# This is the regression MSE and is comparable across models of
# different size.
final_mod_summary$sigma^2
#final_model
# summary(final_model)
# Take the model chosen by backward elimination and summarise it.
new_model_backward <- backward$model
final_mod_b_summary <- summary(object = new_model_backward)
final_mod_b_summary
## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -145.45  -54.09  -11.95   72.38  169.16 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 2127.2253   472.0506   4.506  0.00199 **
## x_INTEREST   -67.8511    18.9224  -3.586  0.00713 **
## x_FOREIGN     13.3487     4.4354   3.010  0.01682 * 
## x_GNP         -0.5984     0.3482  -1.718  0.12405   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.5 on 8 degrees of freedom
## Multiple R-squared:  0.9522, Adjusted R-squared:  0.9343 
## F-statistic: 53.16 on 3 and 8 DF,  p-value: 1.256e-05
# In-sample mean squared residual of the backward-selected model: SSE / n.
# It is not adjusted for the number of fitted coefficients.
mean(final_mod_b_summary$residuals^2)
## [1] 8734.273
# Residual mean square: SSE / residual degrees of freedom (sigma^2).
# This is the regression MSE and is comparable across models of
# different size.
final_mod_b_summary$sigma^2

Using K-Fold Cross Validation (Models chosen by subset regression + based on original ANOVA table)

# Fix the RNG state so the fold assignment is reproducible.
set.seed(125)
# Resampling scheme: 3-fold cross-validation, repeated 20 times.
# train_control <- trainControl(method = "repeatedcv", number = 6, repeats = 20)
train_control <- trainControl(
  method = "repeatedcv",
  number = 3,
  repeats = 20
)
# Cross-validate the linear regression DJIA ~ INTEREST + FOREIGN + GNP
# (the predictors kept by backward elimination).
model <- train(
  DJIA ~ INTEREST + FOREIGN + GNP,
  data = df,
  method = "lm",
  trControl = train_control
)
model
## Linear Regression 
## 
## 12 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 20 times) 
## Summary of sample sizes: 8, 9, 7, 8, 9, 7, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE     
##   162.859  0.9194827  134.7414
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Cross-validate the reduced linear regression DJIA ~ INTEREST + FOREIGN.
# Re-seed with the value used for the first candidate: otherwise train()
# continues from wherever the previous fit left the RNG and draws different
# folds, so the candidates' resampled errors are not measured on the same
# splits. Metrics rendered before this change will differ after a re-run.
set.seed(125)
model <- train(DJIA~INTEREST+FOREIGN, data = df,
               trControl = train_control,
               method = "lm")
model
## Linear Regression 
## 
## 12 samples
##  2 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 20 times) 
## Summary of sample sizes: 9, 7, 8, 8, 7, 9, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   174.7581  0.9372077  137.3012
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Cross-validate the linear regression DJIA ~ CONSUMER + INTEREST + FOREIGN
# (the predictors kept by forward and hybrid stepwise selection).
# Re-seed with the value used for the first candidate: otherwise train()
# continues from wherever the previous fit left the RNG and draws different
# folds, so the candidates' resampled errors are not measured on the same
# splits. Metrics rendered before this change will differ after a re-run.
set.seed(125)
model <- train(DJIA~CONSUMER+INTEREST+FOREIGN, data = df,
               trControl = train_control,
               method = "lm")
model
## Linear Regression 
## 
## 12 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 20 times) 
## Summary of sample sizes: 7, 9, 8, 8, 8, 8, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   168.0466  0.9355281  139.8708
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE