#install.packages(c("readxl","dplyr","tidyverse","ggplot2","visdat","reshape2","car","tseries","lmtest","Metrics","polycor"))
library("Metrics")
## Warning: package 'Metrics' was built under R version 4.3.3
library("lmtest")
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library("tseries")
## Warning: package 'tseries' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library("car")
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
library("reshape2")
## Warning: package 'reshape2' was built under R version 4.3.3
library("dplyr")
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("visdat")
## Warning: package 'visdat' was built under R version 4.3.3
library("ggplot2")
library("readxl")
## Warning: package 'readxl' was built under R version 4.3.3
library("polycor")
## Warning: package 'polycor' was built under R version 4.3.3
coca_cola <- read_excel("C:/Users/jmpo2/Downloads/cocacolasales.xlsx")
# Visualize the missing data to understand the data quality, we can see that in the whole database they aren´t nulls
vis_miss(coca_cola, warn_large_data = FALSE)

#Here we can observe the structure of the database Coca Cola, looking the columns and their data types, in order to identify if anything needs to be changed.It´s important to say that holiday_month is treat as binary, not numeric.
glimpse(coca_cola)
## Rows: 48
## Columns: 15
## $ tperiod <dttm> 2021-01-15, 2021-02-15, 2021-03-15, 2021-04-15, 20…
## $ sales_unitboxes <dbl> 5516689, 5387496, 5886747, 6389182, 6448275, 669794…
## $ consumer_sentiment <dbl> 38.06250, 37.49114, 38.50522, 37.84286, 38.03169, 3…
## $ CPI <dbl> 87.11010, 87.27538, 87.63072, 87.40384, 86.96737, 8…
## $ inflation_rate <dbl> -0.09, 0.19, 0.41, -0.26, -0.50, 0.17, 0.15, 0.21, …
## $ unemp_rate <dbl> 0.05230256, 0.05311320, 0.04608844, 0.05102038, 0.0…
## $ gdp_percapita <dbl> 11659.56, 11659.55, 11659.55, 11625.75, 11625.74, 1…
## $ itaee <dbl> 103.7654, 103.7654, 103.7654, 107.7518, 107.7518, 1…
## $ itaee_growth <dbl> 0.049716574, 0.049716574, 0.049716574, 0.031838981,…
## $ pop_density <dbl> 98.54185, 98.54186, 98.54187, 98.82843, 98.82844, 9…
## $ job_density <dbl> 18.26048, 18.46329, 18.64164, 18.67876, 18.67539, 1…
## $ pop_minwage <dbl> 9.657861, 9.657861, 9.657861, 9.594919, 9.594919, 9…
## $ exchange_rate <dbl> 14.69259, 14.92134, 15.22834, 15.22618, 15.26447, 1…
## $ max_temperature <dbl> 28, 31, 29, 32, 34, 32, 29, 29, 29, 29, 29, 26, 28,…
## $ holiday_month <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
head(coca_cola)
## # A tibble: 6 × 15
## tperiod sales_unitboxes consumer_sentiment CPI inflation_rate
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2021-01-15 00:00:00 5516689. 38.1 87.1 -0.09
## 2 2021-02-15 00:00:00 5387496. 37.5 87.3 0.19
## 3 2021-03-15 00:00:00 5886747. 38.5 87.6 0.41
## 4 2021-04-15 00:00:00 6389182. 37.8 87.4 -0.26
## 5 2021-05-15 00:00:00 6448275. 38.0 87.0 -0.5
## 6 2021-06-15 00:00:00 6697947. 39.1 87.1 0.17
## # ℹ 10 more variables: unemp_rate <dbl>, gdp_percapita <dbl>, itaee <dbl>,
## # itaee_growth <dbl>, pop_density <dbl>, job_density <dbl>,
## # pop_minwage <dbl>, exchange_rate <dbl>, max_temperature <dbl>,
## # holiday_month <dbl>
#In this chunk I change the tperiod to a sequence that starts in 2015.1 and ends in 2018.12, in order to facilitate the treatment of this column. Also, using the function head I showed the first 12 rows of the column tperiod, to confirm that they changed correctly.
n_rows <- nrow(coca_cola)
start_year <- 2015
start_month <- 1
new_tperiod <- numeric(n_rows) # Cambiar de character a numeric
for (i in 1:n_rows) {
year <- start_year + (start_month - 1) %/% 12
month <- (start_month - 1) %% 12 + 1
new_tperiod[i] <- year + month / 100 # Genera un valor en formato double
start_month <- start_month + 1
}
coca_cola$tperiod <- new_tperiod
head(coca_cola$tperiod, 12)
## [1] 2015.01 2015.02 2015.03 2015.04 2015.05 2015.06 2015.07 2015.08 2015.09
## [10] 2015.10 2015.11 2015.12
str(coca_cola)
## tibble [48 × 15] (S3: tbl_df/tbl/data.frame)
## $ tperiod : num [1:48] 2015 2015 2015 2015 2015 ...
## $ sales_unitboxes : num [1:48] 5516689 5387496 5886747 6389182 6448275 ...
## $ consumer_sentiment: num [1:48] 38.1 37.5 38.5 37.8 38 ...
## $ CPI : num [1:48] 87.1 87.3 87.6 87.4 87 ...
## $ inflation_rate : num [1:48] -0.09 0.19 0.41 -0.26 -0.5 0.17 0.15 0.21 0.37 0.51 ...
## $ unemp_rate : num [1:48] 0.0523 0.0531 0.0461 0.051 0.0552 ...
## $ gdp_percapita : num [1:48] 11660 11660 11660 11626 11626 ...
## $ itaee : num [1:48] 104 104 104 108 108 ...
## $ itaee_growth : num [1:48] 0.0497 0.0497 0.0497 0.0318 0.0318 ...
## $ pop_density : num [1:48] 98.5 98.5 98.5 98.8 98.8 ...
## $ job_density : num [1:48] 18.3 18.5 18.6 18.7 18.7 ...
## $ pop_minwage : num [1:48] 9.66 9.66 9.66 9.59 9.59 ...
## $ exchange_rate : num [1:48] 14.7 14.9 15.2 15.2 15.3 ...
## $ max_temperature : num [1:48] 28 31 29 32 34 32 29 29 29 29 ...
## $ holiday_month : num [1:48] 0 0 0 1 0 0 0 0 1 0 ...
# By analyzing the descriptive statistics we can see that the database contains information from January 2015 to December 2018. Also, that the average unit boxes that are sold are 6´473,691 and that the exchange rate varied a lot, being in a range of 14.69 - 21.39, a 45% change. Finally, it called my attention that job density was between 18.26 and 22.36, which means that when it was lower there was less people employed, which could impact the frequency which with they buy coke.
summary(coca_cola)
## tperiod sales_unitboxes consumer_sentiment CPI
## Min. :2015 Min. :5301755 Min. :28.67 Min. : 86.97
## 1st Qu.:2016 1st Qu.:6171767 1st Qu.:35.64 1st Qu.: 89.18
## Median :2017 Median :6461357 Median :36.76 Median : 92.82
## Mean :2017 Mean :6473691 Mean :37.15 Mean : 93.40
## 3rd Qu.:2017 3rd Qu.:6819782 3rd Qu.:38.14 3rd Qu.: 98.40
## Max. :2018 Max. :7963063 Max. :44.87 Max. :103.02
## inflation_rate unemp_rate gdp_percapita itaee
## Min. :-0.5000 Min. :0.03466 Min. :11559 Min. :103.8
## 1st Qu.: 0.1650 1st Qu.:0.04010 1st Qu.:11830 1st Qu.:111.5
## Median : 0.3850 Median :0.04369 Median :12014 Median :113.5
## Mean : 0.3485 Mean :0.04442 Mean :11979 Mean :113.9
## 3rd Qu.: 0.5575 3rd Qu.:0.04897 3rd Qu.:12162 3rd Qu.:117.1
## Max. : 1.7000 Max. :0.05517 Max. :12329 Max. :122.5
## itaee_growth pop_density job_density pop_minwage
## Min. :0.005571 Min. : 98.54 Min. :18.26 Min. : 9.398
## 1st Qu.:0.022376 1st Qu.: 99.61 1st Qu.:19.28 1st Qu.:10.794
## Median :0.029977 Median :100.67 Median :20.39 Median :11.139
## Mean :0.031736 Mean :100.65 Mean :20.38 Mean :11.116
## 3rd Qu.:0.043038 3rd Qu.:101.69 3rd Qu.:21.60 3rd Qu.:11.413
## Max. :0.056536 Max. :102.69 Max. :22.36 Max. :13.026
## exchange_rate max_temperature holiday_month
## Min. :14.69 Min. :26.00 Min. :0.00
## 1st Qu.:17.38 1st Qu.:29.00 1st Qu.:0.00
## Median :18.62 Median :30.00 Median :0.00
## Mean :18.18 Mean :30.50 Mean :0.25
## 3rd Qu.:19.06 3rd Qu.:32.25 3rd Qu.:0.25
## Max. :21.39 Max. :37.00 Max. :1.00
Correlation
#In this step I made a correlation map, in order to see how the different variables relate.
coca_cola_num <- coca_cola[, sapply(coca_cola, is.numeric)]
cor_matrix <- cor(coca_cola_num)
melted_cor_matrix <- melt(cor_matrix, varnames = c("Var1", "Var2"), value.name = "value")
ggplot(data = melted_cor_matrix, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white", linewidth = 2) +
scale_fill_distiller(palette = "Pastel2", direction = 1, limits = c(-1, 1)) +
geom_text(aes(label = round(value, 2)), size = 4) +
theme_minimal() +
labs(title = "Correlation Heatmap", fill = "Correlation") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

#We can observe that there is a positive relation between the maximum temperature and sales unit boxes, which means that when the temperature is higher, sales too.
ggplot(coca_cola, aes(x = max_temperature, y = sales_unitboxes)) +
geom_point(color = "blue", alpha = 0.6) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
theme_minimal() +
labs(title = "Scatterplot between Maximum temperature and Sales Unit boxes",
x = "Maximum temperature",
y = "Sales Unit boxes")
## `geom_smooth()` using formula = 'y ~ x'

# Create a scatterplot between 'inflation_rate' and 'sales_unitboxes' with a best fit line.
ggplot(coca_cola, aes(x = inflation_rate, y = sales_unitboxes)) +
geom_point(color = "blue", alpha = 0.6) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
theme_minimal() +
labs(title = "Scatterplot between Inflation Rate and Sales Unit Boxes with Best Fit Line",
x = "Inflation Rate",
y = "Sales Unit Boxes")
## `geom_smooth()` using formula = 'y ~ x'

# Create a scatterplot between 'itaee' and 'sales_unitboxes' with a best fit line.
ggplot(coca_cola, aes(x = itaee, y = sales_unitboxes)) +
geom_point(color = "blue", alpha = 0.6) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
theme_minimal() +
labs(title = "Scatterplot between itaee and Sales Unit Boxes with Best Fit Line",
x = "Itaee",
y = "Sales Unit Boxes")
## `geom_smooth()` using formula = 'y ~ x'

#Display a histogram of dependent variable
ggplot(coca_cola, aes(x = sales_unitboxes)) +
geom_histogram(binwidth = 120000, fill = "blue", color = "black", alpha = 0.7) +
theme_minimal() +
labs(title = "Histogram sales unit boxes",
x = "sales_unitboxes",
y = "Frequency")

# Transform the dependent variable sales_unitboxes using logarithms
coca_cola$log10_sales_unitboxes <- log10(coca_cola$sales_unitboxes)
ggplot(coca_cola, aes(x = log10_sales_unitboxes)) +
geom_histogram(binwidth = 0.03, fill = "blue", color = "black", alpha = 0.7) +
theme_minimal() +
labs(title = "Histogram - Log10 transformed sales unit boxes",
x = "Log10 of sales unit boxes",
y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5))

#Briefly describe at least 3 hypotheses which you would like to explore through regression analysis
#1. When the temperature is higher outside, the Coca Cola sales rise in Guadalajara
#2. Inflation rate and sales units boxes behave in an inverse way
#3. Usually sales unit boxes in Guadalajara metropolitan area are between six and seven million per month
Linear regression model
model1 <- lm(sales_unitboxes ~ lag(sales_unitboxes) + inflation_rate + exchange_rate + unemp_rate + itaee + consumer_sentiment + pop_density + max_temperature + holiday_month, data = coca_cola)
summary(model1)
##
## Call:
## lm(formula = sales_unitboxes ~ lag(sales_unitboxes) + inflation_rate +
## exchange_rate + unemp_rate + itaee + consumer_sentiment +
## pop_density + max_temperature + holiday_month, data = coca_cola)
##
## Residuals:
## Min 1Q Median 3Q Max
## -497157 -277500 33640 216889 805947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.999e+06 1.133e+07 0.883 0.38304
## lag(sales_unitboxes) 2.455e-01 1.113e-01 2.205 0.03375 *
## inflation_rate -1.071e+05 2.310e+05 -0.463 0.64572
## exchange_rate -2.232e+04 7.329e+04 -0.305 0.76245
## unemp_rate -1.979e+06 1.788e+07 -0.111 0.91245
## itaee 1.019e+05 3.415e+04 2.983 0.00503 **
## consumer_sentiment 4.468e+04 2.516e+04 1.776 0.08397 .
## pop_density -2.248e+05 1.388e+05 -1.620 0.11369
## max_temperature 1.558e+05 3.332e+04 4.677 3.81e-05 ***
## holiday_month 1.474e+05 1.289e+05 1.144 0.26014
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 364200 on 37 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6916, Adjusted R-squared: 0.6165
## F-statistic: 9.218 on 9 and 37 DF, p-value: 3.685e-07
# I make predictions in order to calculate the RMSE
model <- lm(sales_unitboxes ~ inflation_rate + max_temperature + I(max_temperature^2) + itaee, data = coca_cola)
predicted_values <- predict(model, newdata = coca_cola)
if (length(coca_cola$sales_unitboxes) == length(predicted_values)) {
error <- coca_cola$sales_unitboxes - predicted_values
} else {
warning("Las longitudes de los objetos no coinciden. Verifique los datos.")
}
# Calculate the residuals and RMSE: Root Mean Square Error - RMSE. The lower the RMSE the better the model's performance.
predicted_values <- predict(model1)
residuals <- coca_cola$sales_unitboxes - predicted_values
## Warning in coca_cola$sales_unitboxes - predicted_values: longer object length
## is not a multiple of shorter object length
rmse <- sqrt(mean(residuals^2))
print(rmse)
## [1] 469124.2
# Calculate the AIC: # Akaike Information Criterion (AIC) provides a means for model selection. The lower the AIC the better the quality of the regression model results.
aic_value <- AIC(model1)
print(aic_value)
## [1] 1347.853
Diagnostic Tests
#Multicollinearity
vif_values <- vif(model1)
VIF <- data.frame(
Explanatory_Variables = names(vif_values),
VIF = round(vif_values, 2)
)
VIF <- VIF[order(-VIF$VIF), ]
print(VIF)
## Explanatory_Variables VIF
## pop_density pop_density 10.69
## itaee itaee 8.42
## exchange_rate exchange_rate 4.41
## unemp_rate unemp_rate 3.76
## inflation_rate inflation_rate 2.79
## max_temperature max_temperature 2.67
## consumer_sentiment consumer_sentiment 1.87
## lag(sales_unitboxes) lag(sales_unitboxes) 1.57
## holiday_month holiday_month 1.12
# Heteroscedasticity
bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 7.3465, df = 9, p-value = 0.6011
# Normality of Regression Residuals: Jarque-Bera test
jb_test <- jarque.bera.test(residuals(model1))
print(jb_test)
##
## Jarque Bera Test
##
## data: residuals(model1)
## X-squared = 1.2032, df = 2, p-value = 0.5479
# Durbin-Watson residuals autocorrelation
dw_test <- dwtest(model1)
print(dw_test)
##
## Durbin-Watson test
##
## data: model1
## DW = 2.5586, p-value = 0.8948
## alternative hypothesis: true autocorrelation is greater than 0
Polynomial model
polynomial_model <- lm(sales_unitboxes ~ inflation_rate + I(max_temperature^2) + itaee, data = coca_cola)
summary(polynomial_model)
##
## Call:
## lm(formula = sales_unitboxes ~ inflation_rate + I(max_temperature^2) +
## itaee, data = coca_cola)
##
## Residuals:
## Min 1Q Median 3Q Max
## -966231 -249345 34027 246389 926188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2788220.6 1576277.2 -1.769 0.0838 .
## inflation_rate -397529.6 200069.1 -1.987 0.0532 .
## I(max_temperature^2) 1959.4 436.4 4.490 5.10e-05 ***
## itaee 66440.5 13769.8 4.825 1.71e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 405200 on 44 degrees of freedom
## Multiple R-squared: 0.5713, Adjusted R-squared: 0.5421
## F-statistic: 19.55 on 3 and 44 DF, p-value: 3.338e-08
# Predictions and residuals of polynomial model
predictions <- predict(polynomial_model, newdata = coca_cola)
residuals_poly <- coca_cola$sales_unitboxes - predictions
rmse <- sqrt(mean(residuals^2))
print(rmse)
## [1] 469124.2
# AIC of the polynomial model
AIC_polinomial <- AIC(polynomial_model)
print(AIC_polinomial)
## [1] 1381.597
Diagnostic tests
# : Multicollinearity
vif_values <- vif(polynomial_model)
VIF <- data.frame(
Explanatory_Variables = names(vif_values),
VIF = round(vif_values, 2)
)
VIF <- VIF[order(-VIF$VIF), ]
print(VIF)
## Explanatory_Variables VIF
## inflation_rate inflation_rate 1.74
## I(max_temperature^2) I(max_temperature^2) 1.48
## itaee itaee 1.23
# Heteroscedasticity
bptest(polynomial_model)
##
## studentized Breusch-Pagan test
##
## data: polynomial_model
## BP = 3.0024, df = 3, p-value = 0.3913
# Normality of Regression Residuals: Jarque-Bera test
jb_test <- jarque.bera.test(residuals(polynomial_model))
print(jb_test)
##
## Jarque Bera Test
##
## data: residuals(polynomial_model)
## X-squared = 0.10366, df = 2, p-value = 0.9495
Lasso Regression
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.3.3
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
Y <- coca_cola$sales_unitboxes
X <- model.matrix(sales_unitboxes ~ + itaee +
max_temperature, data = coca_cola)[,-1]
lasso_model <- glmnet(X, Y, alpha = 1, lambda = 0.1)
LASSO_pred <- predict(lasso_model, s = 0.1, newx = X)
coef_lasso <- as.data.frame(as.matrix(coef(lasso_model)))
coef_lasso$features <- rownames(coef_lasso)
rownames(coef_lasso) <- NULL
names(coef_lasso)[1] <- "Coeficient Estimate"
print(coef_lasso)
## Coeficient Estimate features
## 1 -4532602.75 (Intercept)
## 2 56466.27 itaee
## 3 150080.99 max_temperature
# Calculate the RMSE
LASSO_rmse <- rmse(Y, LASSO_pred)
print(LASSO_rmse)
## [1] 410568.3
# Calculate the AIC
n <- nrow(X)
p <- sum(coef_lasso$`Coeficient Estimate` != 0) - 1
RSS <- sum((Y - LASSO_pred)^2)
AIC_lasso <- n * log(RSS/n) + 2 * (p + 1)
print(AIC_lasso)
## [1] 1246.829
Diagnostic tests
# Multicollinearity
residuals_lasso <- Y - LASSO_pred[,1]
# Calculthe the VIF
vif_data <- as.data.frame(X)
vif_results <- vif(lm(Y ~ ., data = vif_data))
print(vif_results)
## itaee max_temperature
## 1.040671 1.040671
# Heteroscedasticity: Breusch-Pagan
bp_test <- bptest(lm(Y ~ X))
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: lm(Y ~ X)
## BP = 3.2082, df = 2, p-value = 0.2011
# Normality of Regression Residuals: Jarque-Bera
jb_test <- jarque.bera.test(residuals_lasso)
print(jb_test)
##
## Jarque Bera Test
##
## data: residuals_lasso
## X-squared = 0.17051, df = 2, p-value = 0.9183
RIDGE regression
Y <- coca_cola$sales_unitboxes
X <- model.matrix(sales_unitboxes ~ inflation_rate + max_temperature, data = coca_cola)[,-1]
ridge_model <- glmnet(X, Y, alpha = 0, lambda = 0.1)
Ridge_pred <- predict(ridge_model, s = 0.1, newx = X)
coef_ridge <- as.data.frame(as.matrix(coef(ridge_model)))
coef_ridge$features <- rownames(coef_ridge)
rownames(coef_ridge) <- NULL
names(coef_ridge)[1] <- "Coeficient Estimate"
print(coef_ridge)
## Coeficient Estimate features
## 1 2630303.15 (Intercept)
## 2 -41562.05 inflation_rate
## 3 126487.66 max_temperature
# Calculate the RMSE
Ridge_rmse <- rmse(Y, Ridge_pred)
print(Ridge_rmse)
## [1] 485973.6
#Calculate the AIC
n <- nrow(X) # número de observaciones
p <- ncol(X) # número de predictores
Ridge_pred <- predict(ridge_model, s = 0.1, newx = X)
RSS <- sum((Y - Ridge_pred)^2)
AIC_ridge <- n * log(RSS/n) + 2 * (p + 1) # +1 porque incluye el término de intercepto
print(AIC_ridge)
## [1] 1263.015
Diagnostic tests
residuals_ridge <- Y - Ridge_pred[,1]
# Calculate the VIF
vif_data <- as.data.frame(X)
vif_results <- vif(lm(Y ~ ., data = vif_data))
print(vif_results)
## inflation_rate max_temperature
## 1.45828 1.45828
# Heteroscedasticity: Breusch-Pagan
bp_test <- bptest(lm(Y ~ X)) # La función bptest requiere el modelo lineal como entrada
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: lm(Y ~ X)
## BP = 1.1321, df = 2, p-value = 0.5678
jb_test <- jarque.bera.test(residuals_ridge)
print(jb_test)
##
## Jarque Bera Test
##
## data: residuals_ridge
## X-squared = 0.62314, df = 2, p-value = 0.7323
Conclusions
# According to Linear Regression and Polynomial Model, maximum temperature and ITAEE affect Coca Cola sales with a confidence level of 99%.
# The regression model with the best performance was Lasso model, with an RMSE of 410568.3 and an AIC of 1246.829, making it the most reliable.
# In Lasso model I only included maximum temperature and ITAEE, and it was the one with best performance.
# With a p-value of 0.4117, we accept the null hypothesis that there is no heteroscedasticity, meaning the variance of the errors is constant.
# With a p-value of 0.91 in Lasso, we accept the null hypothesis that the residuals follow a normal distribution.
# Of all the variables, the only one that shows multicollinearity is job density, with a VIF of 10.69, which is why it was not used in the models.
# The model that presented the least heteroscedasticity was the linear regression model with a p-value = 0.6011.