This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
#install.packages("zip")
#olsrr.install.packages("zip")
# install.packages("corrplot")
# install.packages("olsrr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.1.0 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Load the data set from "mlr11.csv" (path is relative to the working
# directory). read.csv() returns a data frame.
df <- read.csv("mlr11.csv")
# Show the first rows as a quick check of the import.
head(df)
# Pull the response and every candidate predictor out of `df` into
# free-standing vectors; the plots, the correlation matrix and the lm() call
# refer to the data through these names.
y <- df$DJIA # response
x_CRUDE <- df$CRUDE
x_INTEREST <- df$INTEREST
x_FOREIGN <- df$FOREIGN
x_GNP <- df$GNP
x_PURCHASE <- df$PURCHASE
x_CONSUMER <- df$CONSUMER
# Scatter plots of the response (DJIA) against each candidate predictor.
# ggplot2 only has to be attached once for the whole group of plots.
# Explicit axis labels are set so the figures show the variable names
# rather than the names of the helper vectors (`x_CRUDE`, `y`, ...).
library(ggplot2)

ggplot(df, aes(x = x_CRUDE, y = y)) +
  geom_point() +
  labs(x = "CRUDE", y = "DJIA")

ggplot(df, aes(x = x_INTEREST, y = y)) +
  geom_point() +
  labs(x = "INTEREST", y = "DJIA")

ggplot(df, aes(x = x_FOREIGN, y = y)) +
  geom_point() +
  labs(x = "FOREIGN", y = "DJIA")

ggplot(df, aes(x = x_GNP, y = y)) +
  geom_point() +
  labs(x = "GNP", y = "DJIA")

ggplot(df, aes(x = x_PURCHASE, y = y)) +
  geom_point() +
  labs(x = "PURCHASE", y = "DJIA")

ggplot(df, aes(x = x_CONSUMER, y = y)) +
  geom_point() +
  labs(x = "CONSUMER", y = "DJIA")
# Pearson correlation matrix of the six predictors and the response.
# cbind() names each column after the vector it was built from, and those
# names become the row and column labels of the correlation matrix.
corr_matrix <- cor(
  cbind(x_CRUDE, x_INTEREST, x_FOREIGN, x_GNP, x_PURCHASE, x_CONSUMER, y),
  method = "pearson"
)
# Visualise the matrix with corrplot's "color" method.
corrplot(corr_matrix, method = "color")
# Full multiple linear regression: the response `y` on all six candidate
# predictors. The variables are the free-standing vectors, so no `data`
# argument is passed.
mod <- lm(
  y ~ x_CRUDE + x_INTEREST + x_FOREIGN + x_GNP + x_PURCHASE + x_CONSUMER
)
# Keep the summary object; its residuals are reused for the MSE calculation.
mod_summary <- summary(mod)
print(mod_summary)
##
## Call:
## lm(formula = y ~ x_CRUDE + x_INTEREST + x_FOREIGN + x_GNP + x_PURCHASE +
## x_CONSUMER)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## -24.444 -58.737 1.681 93.884 68.461 111.175 -108.784 19.527
## 9 10 11 12
## -55.448 -162.523 -54.304 169.512
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 927.4178 5844.0938 0.159 0.880
## x_CRUDE 6.0855 30.8266 0.197 0.851
## x_INTEREST -82.3673 101.1515 -0.814 0.452
## x_FOREIGN 11.5185 11.3482 1.015 0.357
## x_GNP -0.2575 2.1585 -0.119 0.910
## x_PURCHASE 465.7685 2100.9056 0.222 0.833
## x_CONSUMER -0.4294 5.0945 -0.084 0.936
##
## Residual standard error: 143.7 on 5 degrees of freedom
## Multiple R-squared: 0.9529, Adjusted R-squared: 0.8964
## F-statistic: 16.87 on 6 and 5 DF, p-value: 0.003537
# Print the call and the estimated coefficients of the full model.
print(mod)
##
## Call:
## lm(formula = y ~ x_CRUDE + x_INTEREST + x_FOREIGN + x_GNP + x_PURCHASE +
## x_CONSUMER)
##
## Coefficients:
## (Intercept) x_CRUDE x_INTEREST x_FOREIGN x_GNP x_PURCHASE
## 927.4178 6.0855 -82.3673 11.5185 -0.2575 465.7685
## x_CONSUMER
## -0.4294
# Split the plotting device into a 2 x 2 grid so the plots drawn by
# plot(mod) for the full model share one figure.
par(mfrow = c(2, 2))
plot(mod)
Calculate the mean squared error (MSE) of the full model.
# MSE of the full model: the mean of its squared residuals, i.e. the sum of
# squared residuals divided by the number of observations.
mean(mod_summary[["residuals"]]^2)
## [1] 8608.204
# Analysis-of-variance table for the full model `mod`.
anova(mod)
# All-possible-subsets regression (olsrr) on the predictors of `mod`; the
# result is kept so it can be plotted here and displayed afterwards.
all_subset <- ols_step_all_possible(mod)
# Plot the all-subsets result using olsrr's plot method.
plot(all_subset)
Full subset regression: with k = 6 predictors there are 2^k = 2^6 = 64 possible models (counting the intercept-only model).
# Display the table of fitted subset models.
all_subset
# Forward selection (p-value based, olsrr) starting from the predictors
# of the full model `mod`.
forward_model <- ols_step_forward_p(mod)
# forward_model <- ols_step_forward_aic(mod)
plot(forward_model)
# The lm fit selected by the forward procedure.
print(forward_model[["model"]])
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Coefficients:
## (Intercept) x_CONSUMER x_INTEREST x_FOREIGN
## 1889.230 -1.872 -105.100 9.644
# 2 x 2 plotting grid for the plots of the forward-selected model.
par(mfrow = c(2, 2))
plot(forward_model[["model"]])
## Backward Regression
# Backward elimination (p-value based, olsrr) starting from the full
# model `mod`.
backward <- ols_step_backward_p(mod)
plot(backward)
# The lm fit that remains after elimination.
print(backward[["model"]])
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Coefficients:
## (Intercept) x_INTEREST x_FOREIGN x_GNP
## 2127.2253 -67.8511 13.3487 -0.5984
# 2 x 2 plotting grid for the plots of the backward-elimination model.
par(mfrow = c(2, 2))
plot(backward[["model"]])
# Stepwise selection in both directions (p-value based, olsrr) on the
# predictors of the full model `mod`.
both_model <- ols_step_both_p(mod)
plot(both_model)
# The lm fit selected by the stepwise procedure.
print(both_model[["model"]])
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Coefficients:
## (Intercept) x_CONSUMER x_INTEREST x_FOREIGN
## 1889.230 -1.872 -105.100 9.644
# 2 x 2 plotting grid for the plots of the stepwise-selected model.
par(mfrow = c(2, 2))
plot(both_model[["model"]])
#final_model <- train(mod,data=df, method = "lm", trControl = train_control)
# Take the stepwise-selected lm fit and summarise it; the summary object is
# kept because its residuals are reused for the MSE calculation.
new_model <- both_model[["model"]]
final_mod_summary <- summary(new_model)
print(final_mod_summary)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -130.62 -67.72 -24.76 64.44 197.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1889.230 422.894 4.467 0.00209 **
## x_CONSUMER -1.872 1.382 -1.354 0.21260
## x_INTEREST -105.100 21.039 -4.995 0.00106 **
## x_FOREIGN 9.644 2.904 3.321 0.01052 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 120.8 on 8 degrees of freedom
## Multiple R-squared: 0.9468, Adjusted R-squared: 0.9269
## F-statistic: 47.46 on 3 and 8 DF, p-value: 1.929e-05
# MSE of the stepwise-selected model: the mean of its squared residuals.
mean(final_mod_summary[["residuals"]]^2)
## [1] 9727.438
#final_model
# summary(final_model)
# Take the lm fit left by backward elimination and summarise it; the summary
# object is kept because its residuals are reused for the MSE calculation.
new_model_backward <- backward[["model"]]
final_mod_b_summary <- summary(new_model_backward)
print(final_mod_b_summary)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.45 -54.09 -11.95 72.38 169.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2127.2253 472.0506 4.506 0.00199 **
## x_INTEREST -67.8511 18.9224 -3.586 0.00713 **
## x_FOREIGN 13.3487 4.4354 3.010 0.01682 *
## x_GNP -0.5984 0.3482 -1.718 0.12405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 114.5 on 8 degrees of freedom
## Multiple R-squared: 0.9522, Adjusted R-squared: 0.9343
## F-statistic: 53.16 on 3 and 8 DF, p-value: 1.256e-05
# MSE of the backward-elimination model: the mean of its squared residuals.
mean(final_mod_b_summary[["residuals"]]^2)
## [1] 8734.273
# Fix the RNG state so the random fold assignment is reproducible.
set.seed(125)
# Resampling scheme used by the caret fits: repeated k-fold cross-validation
# with k = 3 folds, repeated 20 times.
# train_control <- trainControl(method = "repeatedcv", number = 6, repeats = 20)
train_control <- trainControl(
  method = "repeatedcv",
  number = 3,
  repeats = 20
)
# Cross-validate a linear regression (method = "lm") of DJIA on INTEREST,
# FOREIGN and GNP, the predictors retained by the backward elimination.
model <- train(
  DJIA ~ INTEREST + FOREIGN + GNP,
  data = df,
  trControl = train_control,
  method = "lm"
)
# Report the resampled RMSE, R-squared and MAE.
print(model)
## Linear Regression
##
## 12 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 20 times)
## Summary of sample sizes: 8, 9, 7, 8, 9, 7, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 162.859 0.9194827 134.7414
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Cross-validate a linear regression (method = "lm") of DJIA on INTEREST and
# FOREIGN only, using the resampling scheme in `train_control`. This
# overwrites the previous `model` object.
# NOTE(review): the RNG seed is not reset before this call, so the folds
# differ from those of the previous fit; consider calling set.seed() here if
# the fits are meant to be compared fold-for-fold.
model <- train(
  DJIA ~ INTEREST + FOREIGN,
  data = df,
  trControl = train_control,
  method = "lm"
)
# Report the resampled RMSE, R-squared and MAE.
print(model)
## Linear Regression
##
## 12 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 20 times)
## Summary of sample sizes: 9, 7, 8, 8, 7, 9, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 174.7581 0.9372077 137.3012
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Cross-validate a linear regression (method = "lm") of DJIA on CONSUMER,
# INTEREST and FOREIGN, the predictors chosen by the forward and stepwise
# selection, using the resampling scheme in `train_control`. This overwrites
# the previous `model` object.
# NOTE(review): the RNG seed is not reset before this call, so the folds
# differ from those of the earlier fits; consider calling set.seed() here if
# the fits are meant to be compared fold-for-fold.
model <- train(
  DJIA ~ CONSUMER + INTEREST + FOREIGN,
  data = df,
  trControl = train_control,
  method = "lm"
)
# Report the resampled RMSE, R-squared and MAE.
print(model)
## Linear Regression
##
## 12 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 20 times)
## Summary of sample sizes: 7, 9, 8, 8, 8, 8, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 168.0466 0.9355281 139.8708
##
## Tuning parameter 'intercept' was held constant at a value of TRUE