#install.packages(c("readxl","dplyr","tidyverse","ggplot2","visdat","reshape2","car","tseries","lmtest","Metrics","polycor"))
library("Metrics")
## Warning: package 'Metrics' was built under R version 4.3.3
library("lmtest")
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library("tseries")
## Warning: package 'tseries' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library("car")
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
library("reshape2")
## Warning: package 'reshape2' was built under R version 4.3.3
library("dplyr")
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("visdat")
## Warning: package 'visdat' was built under R version 4.3.3
library("ggplot2")
library("readxl")
## Warning: package 'readxl' was built under R version 4.3.3
library("polycor")
## Warning: package 'polycor' was built under R version 4.3.3
coca_cola <- read_excel("C:/Users/jmpo2/Downloads/cocacolasales.xlsx")
# Visualize the missing data to understand the data quality, we can see that in the whole database they aren´t nulls 
vis_miss(coca_cola, warn_large_data = FALSE)

#Here we can observe the structure of the database Coca Cola, looking the columns and their data types, in order to identify if anything needs to be changed.It´s important to say that holiday_month is treat as binary, not numeric.
glimpse(coca_cola)
## Rows: 48
## Columns: 15
## $ tperiod            <dttm> 2021-01-15, 2021-02-15, 2021-03-15, 2021-04-15, 20…
## $ sales_unitboxes    <dbl> 5516689, 5387496, 5886747, 6389182, 6448275, 669794…
## $ consumer_sentiment <dbl> 38.06250, 37.49114, 38.50522, 37.84286, 38.03169, 3…
## $ CPI                <dbl> 87.11010, 87.27538, 87.63072, 87.40384, 86.96737, 8…
## $ inflation_rate     <dbl> -0.09, 0.19, 0.41, -0.26, -0.50, 0.17, 0.15, 0.21, …
## $ unemp_rate         <dbl> 0.05230256, 0.05311320, 0.04608844, 0.05102038, 0.0…
## $ gdp_percapita      <dbl> 11659.56, 11659.55, 11659.55, 11625.75, 11625.74, 1…
## $ itaee              <dbl> 103.7654, 103.7654, 103.7654, 107.7518, 107.7518, 1…
## $ itaee_growth       <dbl> 0.049716574, 0.049716574, 0.049716574, 0.031838981,…
## $ pop_density        <dbl> 98.54185, 98.54186, 98.54187, 98.82843, 98.82844, 9…
## $ job_density        <dbl> 18.26048, 18.46329, 18.64164, 18.67876, 18.67539, 1…
## $ pop_minwage        <dbl> 9.657861, 9.657861, 9.657861, 9.594919, 9.594919, 9…
## $ exchange_rate      <dbl> 14.69259, 14.92134, 15.22834, 15.22618, 15.26447, 1…
## $ max_temperature    <dbl> 28, 31, 29, 32, 34, 32, 29, 29, 29, 29, 29, 26, 28,…
## $ holiday_month      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
head(coca_cola)
## # A tibble: 6 × 15
##   tperiod             sales_unitboxes consumer_sentiment   CPI inflation_rate
##   <dttm>                        <dbl>              <dbl> <dbl>          <dbl>
## 1 2021-01-15 00:00:00        5516689.               38.1  87.1          -0.09
## 2 2021-02-15 00:00:00        5387496.               37.5  87.3           0.19
## 3 2021-03-15 00:00:00        5886747.               38.5  87.6           0.41
## 4 2021-04-15 00:00:00        6389182.               37.8  87.4          -0.26
## 5 2021-05-15 00:00:00        6448275.               38.0  87.0          -0.5 
## 6 2021-06-15 00:00:00        6697947.               39.1  87.1           0.17
## # ℹ 10 more variables: unemp_rate <dbl>, gdp_percapita <dbl>, itaee <dbl>,
## #   itaee_growth <dbl>, pop_density <dbl>, job_density <dbl>,
## #   pop_minwage <dbl>, exchange_rate <dbl>, max_temperature <dbl>,
## #   holiday_month <dbl>
#In this chunk I change the tperiod to a sequence that starts in 2015.1 and ends in 2018.12, in order to facilitate the treatment of this column. Also, using the function head I showed the first 12 rows of the column tperiod, to confirm that they changed correctly. 

n_rows <- nrow(coca_cola)
start_year <- 2015
start_month <- 1

new_tperiod <- numeric(n_rows)  # Cambiar de character a numeric
for (i in 1:n_rows) {
  year <- start_year + (start_month - 1) %/% 12
  month <- (start_month - 1) %% 12 + 1
  new_tperiod[i] <- year + month / 100  # Genera un valor en formato double
  start_month <- start_month + 1
}

coca_cola$tperiod <- new_tperiod
head(coca_cola$tperiod, 12)
##  [1] 2015.01 2015.02 2015.03 2015.04 2015.05 2015.06 2015.07 2015.08 2015.09
## [10] 2015.10 2015.11 2015.12
str(coca_cola)
## tibble [48 × 15] (S3: tbl_df/tbl/data.frame)
##  $ tperiod           : num [1:48] 2015 2015 2015 2015 2015 ...
##  $ sales_unitboxes   : num [1:48] 5516689 5387496 5886747 6389182 6448275 ...
##  $ consumer_sentiment: num [1:48] 38.1 37.5 38.5 37.8 38 ...
##  $ CPI               : num [1:48] 87.1 87.3 87.6 87.4 87 ...
##  $ inflation_rate    : num [1:48] -0.09 0.19 0.41 -0.26 -0.5 0.17 0.15 0.21 0.37 0.51 ...
##  $ unemp_rate        : num [1:48] 0.0523 0.0531 0.0461 0.051 0.0552 ...
##  $ gdp_percapita     : num [1:48] 11660 11660 11660 11626 11626 ...
##  $ itaee             : num [1:48] 104 104 104 108 108 ...
##  $ itaee_growth      : num [1:48] 0.0497 0.0497 0.0497 0.0318 0.0318 ...
##  $ pop_density       : num [1:48] 98.5 98.5 98.5 98.8 98.8 ...
##  $ job_density       : num [1:48] 18.3 18.5 18.6 18.7 18.7 ...
##  $ pop_minwage       : num [1:48] 9.66 9.66 9.66 9.59 9.59 ...
##  $ exchange_rate     : num [1:48] 14.7 14.9 15.2 15.2 15.3 ...
##  $ max_temperature   : num [1:48] 28 31 29 32 34 32 29 29 29 29 ...
##  $ holiday_month     : num [1:48] 0 0 0 1 0 0 0 0 1 0 ...
# By analyzing the descriptive statistics we can see that the database contains information from January 2015 to December 2018. Also, that the average unit boxes that are sold are 6´473,691 and that the exchange rate varied a lot, being in a range of 14.69 - 21.39, a 45% change. Finally, it called my attention that job density was between 18.26 and 22.36, which means that when it was lower there was less people employed, which could impact the frequency which with they buy coke. 
summary(coca_cola)
##     tperiod     sales_unitboxes   consumer_sentiment      CPI        
##  Min.   :2015   Min.   :5301755   Min.   :28.67      Min.   : 86.97  
##  1st Qu.:2016   1st Qu.:6171767   1st Qu.:35.64      1st Qu.: 89.18  
##  Median :2017   Median :6461357   Median :36.76      Median : 92.82  
##  Mean   :2017   Mean   :6473691   Mean   :37.15      Mean   : 93.40  
##  3rd Qu.:2017   3rd Qu.:6819782   3rd Qu.:38.14      3rd Qu.: 98.40  
##  Max.   :2018   Max.   :7963063   Max.   :44.87      Max.   :103.02  
##  inflation_rate      unemp_rate      gdp_percapita       itaee      
##  Min.   :-0.5000   Min.   :0.03466   Min.   :11559   Min.   :103.8  
##  1st Qu.: 0.1650   1st Qu.:0.04010   1st Qu.:11830   1st Qu.:111.5  
##  Median : 0.3850   Median :0.04369   Median :12014   Median :113.5  
##  Mean   : 0.3485   Mean   :0.04442   Mean   :11979   Mean   :113.9  
##  3rd Qu.: 0.5575   3rd Qu.:0.04897   3rd Qu.:12162   3rd Qu.:117.1  
##  Max.   : 1.7000   Max.   :0.05517   Max.   :12329   Max.   :122.5  
##   itaee_growth       pop_density      job_density     pop_minwage    
##  Min.   :0.005571   Min.   : 98.54   Min.   :18.26   Min.   : 9.398  
##  1st Qu.:0.022376   1st Qu.: 99.61   1st Qu.:19.28   1st Qu.:10.794  
##  Median :0.029977   Median :100.67   Median :20.39   Median :11.139  
##  Mean   :0.031736   Mean   :100.65   Mean   :20.38   Mean   :11.116  
##  3rd Qu.:0.043038   3rd Qu.:101.69   3rd Qu.:21.60   3rd Qu.:11.413  
##  Max.   :0.056536   Max.   :102.69   Max.   :22.36   Max.   :13.026  
##  exchange_rate   max_temperature holiday_month 
##  Min.   :14.69   Min.   :26.00   Min.   :0.00  
##  1st Qu.:17.38   1st Qu.:29.00   1st Qu.:0.00  
##  Median :18.62   Median :30.00   Median :0.00  
##  Mean   :18.18   Mean   :30.50   Mean   :0.25  
##  3rd Qu.:19.06   3rd Qu.:32.25   3rd Qu.:0.25  
##  Max.   :21.39   Max.   :37.00   Max.   :1.00

Correlation

#In this step I made a correlation map, in order to see how the different variables relate. 
coca_cola_num <- coca_cola[, sapply(coca_cola, is.numeric)]

cor_matrix <- cor(coca_cola_num)
melted_cor_matrix <- melt(cor_matrix, varnames = c("Var1", "Var2"), value.name = "value")

ggplot(data = melted_cor_matrix, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile(color = "white", linewidth = 2) +
  scale_fill_distiller(palette = "Pastel2", direction = 1, limits = c(-1, 1)) +
  geom_text(aes(label = round(value, 2)), size = 4) +
  theme_minimal() +
  labs(title = "Correlation Heatmap", fill = "Correlation") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

#We can observe that there is a positive relation between the maximum temperature and sales unit boxes, which means that when the temperature is higher, sales too. 
ggplot(coca_cola, aes(x = max_temperature, y = sales_unitboxes)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  theme_minimal() +
  labs(title = "Scatterplot between Maximum temperature and Sales Unit boxes",
       x = "Maximum temperature",
       y = "Sales Unit boxes")
## `geom_smooth()` using formula = 'y ~ x'

# Create a scatterplot between 'inflation_rate' and 'sales_unitboxes' with a best fit line. 
ggplot(coca_cola, aes(x = inflation_rate, y = sales_unitboxes)) +
  geom_point(color = "blue", alpha = 0.6) +  
  geom_smooth(method = "lm", color = "red", se = FALSE) +  
  theme_minimal() +
  labs(title = "Scatterplot between Inflation Rate and Sales Unit Boxes with Best Fit Line",
       x = "Inflation Rate",
       y = "Sales Unit Boxes")
## `geom_smooth()` using formula = 'y ~ x'

# Create a scatterplot between 'itaee' and 'sales_unitboxes' with a best fit line. 
ggplot(coca_cola, aes(x = itaee, y = sales_unitboxes)) +
  geom_point(color = "blue", alpha = 0.6) +  
  geom_smooth(method = "lm", color = "red", se = FALSE) +  
  theme_minimal() +
  labs(title = "Scatterplot between itaee and Sales Unit Boxes with Best Fit Line",
       x = "Itaee",
       y = "Sales Unit Boxes")
## `geom_smooth()` using formula = 'y ~ x'

#Display a histogram of dependent variable
ggplot(coca_cola, aes(x = sales_unitboxes)) +
  geom_histogram(binwidth = 120000, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Histogram sales unit boxes", 
       x = "sales_unitboxes", 
       y = "Frequency")

# Transform the dependent variable sales_unitboxes using logarithms
coca_cola$log10_sales_unitboxes <- log10(coca_cola$sales_unitboxes)

ggplot(coca_cola, aes(x = log10_sales_unitboxes)) +
  geom_histogram(binwidth = 0.03, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Histogram - Log10 transformed sales unit boxes", 
       x = "Log10 of sales unit boxes", 
       y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5))

#Briefly describe at least 3 hypotheses which you would like to explore through regression analysis
#1. When the temperature is higher outside, the Coca Cola sales rise in Guadalajara
#2. Inflation rate and sales units boxes behave in an inverse way 
#3. Usually sales unit boxes in Guadalajara metropolitan area are between six and seven million per month 

Linear regression model

model1 <- lm(sales_unitboxes ~ lag(sales_unitboxes) + inflation_rate + exchange_rate + unemp_rate + itaee + consumer_sentiment + pop_density + max_temperature + holiday_month, data = coca_cola)

summary(model1)
## 
## Call:
## lm(formula = sales_unitboxes ~ lag(sales_unitboxes) + inflation_rate + 
##     exchange_rate + unemp_rate + itaee + consumer_sentiment + 
##     pop_density + max_temperature + holiday_month, data = coca_cola)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -497157 -277500   33640  216889  805947 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           9.999e+06  1.133e+07   0.883  0.38304    
## lag(sales_unitboxes)  2.455e-01  1.113e-01   2.205  0.03375 *  
## inflation_rate       -1.071e+05  2.310e+05  -0.463  0.64572    
## exchange_rate        -2.232e+04  7.329e+04  -0.305  0.76245    
## unemp_rate           -1.979e+06  1.788e+07  -0.111  0.91245    
## itaee                 1.019e+05  3.415e+04   2.983  0.00503 ** 
## consumer_sentiment    4.468e+04  2.516e+04   1.776  0.08397 .  
## pop_density          -2.248e+05  1.388e+05  -1.620  0.11369    
## max_temperature       1.558e+05  3.332e+04   4.677 3.81e-05 ***
## holiday_month         1.474e+05  1.289e+05   1.144  0.26014    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 364200 on 37 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6916, Adjusted R-squared:  0.6165 
## F-statistic: 9.218 on 9 and 37 DF,  p-value: 3.685e-07
# I make predictions in order to calculate the RMSE
model <- lm(sales_unitboxes ~ inflation_rate + max_temperature + I(max_temperature^2) + itaee, data = coca_cola)

predicted_values <- predict(model, newdata = coca_cola)

if (length(coca_cola$sales_unitboxes) == length(predicted_values)) {
    error <- coca_cola$sales_unitboxes - predicted_values
} else {
    warning("Las longitudes de los objetos no coinciden. Verifique los datos.")
}
# Calculate the residuals and RMSE: Root Mean Square Error - RMSE. The lower the RMSE the better the model's performance.
predicted_values <- predict(model1)

residuals <- coca_cola$sales_unitboxes - predicted_values
## Warning in coca_cola$sales_unitboxes - predicted_values: longer object length
## is not a multiple of shorter object length
rmse <- sqrt(mean(residuals^2))
print(rmse)
## [1] 469124.2
# Calculate the AIC: # Akaike Information Criterion (AIC) provides a means for model selection. The lower the AIC the better the quality of the regression model results. 
aic_value <- AIC(model1)
print(aic_value)
## [1] 1347.853

Diagnostic Tests

#Multicollinearity
vif_values <- vif(model1)

VIF <- data.frame(
  Explanatory_Variables = names(vif_values),
  VIF = round(vif_values, 2)
)
VIF <- VIF[order(-VIF$VIF), ]
print(VIF)
##                      Explanatory_Variables   VIF
## pop_density                    pop_density 10.69
## itaee                                itaee  8.42
## exchange_rate                exchange_rate  4.41
## unemp_rate                      unemp_rate  3.76
## inflation_rate              inflation_rate  2.79
## max_temperature            max_temperature  2.67
## consumer_sentiment      consumer_sentiment  1.87
## lag(sales_unitboxes)  lag(sales_unitboxes)  1.57
## holiday_month                holiday_month  1.12
# Heteroscedasticity
bptest(model1) 
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 7.3465, df = 9, p-value = 0.6011
# Normality of Regression Residuals: Jarque-Bera test 
jb_test <- jarque.bera.test(residuals(model1))
print(jb_test)
## 
##  Jarque Bera Test
## 
## data:  residuals(model1)
## X-squared = 1.2032, df = 2, p-value = 0.5479
# Durbin-Watson residuals autocorrelation 
dw_test <- dwtest(model1)
print(dw_test)
## 
##  Durbin-Watson test
## 
## data:  model1
## DW = 2.5586, p-value = 0.8948
## alternative hypothesis: true autocorrelation is greater than 0

Polynomial model

polynomial_model <- lm(sales_unitboxes ~ inflation_rate + I(max_temperature^2) + itaee, data = coca_cola)
summary(polynomial_model)
## 
## Call:
## lm(formula = sales_unitboxes ~ inflation_rate + I(max_temperature^2) + 
##     itaee, data = coca_cola)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -966231 -249345   34027  246389  926188 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2788220.6  1576277.2  -1.769   0.0838 .  
## inflation_rate        -397529.6   200069.1  -1.987   0.0532 .  
## I(max_temperature^2)     1959.4      436.4   4.490 5.10e-05 ***
## itaee                   66440.5    13769.8   4.825 1.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 405200 on 44 degrees of freedom
## Multiple R-squared:  0.5713, Adjusted R-squared:  0.5421 
## F-statistic: 19.55 on 3 and 44 DF,  p-value: 3.338e-08
# Predictions and residuals of polynomial model 
predictions <- predict(polynomial_model, newdata = coca_cola)

residuals_poly <- coca_cola$sales_unitboxes - predictions

rmse <- sqrt(mean(residuals^2))
print(rmse)
## [1] 469124.2
# AIC of the polynomial model 
AIC_polinomial <- AIC(polynomial_model)
print(AIC_polinomial)
## [1] 1381.597

Diagnostic tests

# : Multicollinearity  
vif_values <- vif(polynomial_model)

VIF <- data.frame(
  Explanatory_Variables = names(vif_values),
  VIF = round(vif_values, 2)
)
VIF <- VIF[order(-VIF$VIF), ]
print(VIF)
##                      Explanatory_Variables  VIF
## inflation_rate              inflation_rate 1.74
## I(max_temperature^2)  I(max_temperature^2) 1.48
## itaee                                itaee 1.23
# Heteroscedasticity
bptest(polynomial_model) 
## 
##  studentized Breusch-Pagan test
## 
## data:  polynomial_model
## BP = 3.0024, df = 3, p-value = 0.3913
# Normality of Regression Residuals: Jarque-Bera test 
jb_test <- jarque.bera.test(residuals(polynomial_model))
print(jb_test)
## 
##  Jarque Bera Test
## 
## data:  residuals(polynomial_model)
## X-squared = 0.10366, df = 2, p-value = 0.9495

Lasso Regression

library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.3.3
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
Y <- coca_cola$sales_unitboxes
X <- model.matrix(sales_unitboxes ~  + itaee +
                   max_temperature, data = coca_cola)[,-1]

lasso_model <- glmnet(X, Y, alpha = 1, lambda = 0.1)

LASSO_pred <- predict(lasso_model, s = 0.1, newx = X)

coef_lasso <- as.data.frame(as.matrix(coef(lasso_model)))
coef_lasso$features <- rownames(coef_lasso)
rownames(coef_lasso) <- NULL
names(coef_lasso)[1] <- "Coeficient Estimate"

print(coef_lasso)
##   Coeficient Estimate        features
## 1         -4532602.75     (Intercept)
## 2            56466.27           itaee
## 3           150080.99 max_temperature
# Calculate the RMSE
LASSO_rmse <- rmse(Y, LASSO_pred)
print(LASSO_rmse)
## [1] 410568.3
# Calculate the AIC
n <- nrow(X)  
p <- sum(coef_lasso$`Coeficient Estimate` != 0) - 1 
RSS <- sum((Y - LASSO_pred)^2)
AIC_lasso <- n * log(RSS/n) + 2 * (p + 1)  

print(AIC_lasso)
## [1] 1246.829

Diagnostic tests

# Multicollinearity
residuals_lasso <- Y - LASSO_pred[,1]

# Calculthe the VIF 
vif_data <- as.data.frame(X)
vif_results <- vif(lm(Y ~ ., data = vif_data))
print(vif_results)
##           itaee max_temperature 
##        1.040671        1.040671
# Heteroscedasticity: Breusch-Pagan
bp_test <- bptest(lm(Y ~ X))
print(bp_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm(Y ~ X)
## BP = 3.2082, df = 2, p-value = 0.2011
# Normality of Regression Residuals: Jarque-Bera
jb_test <- jarque.bera.test(residuals_lasso)
print(jb_test)
## 
##  Jarque Bera Test
## 
## data:  residuals_lasso
## X-squared = 0.17051, df = 2, p-value = 0.9183

RIDGE regression

Y <- coca_cola$sales_unitboxes
X <- model.matrix(sales_unitboxes ~ inflation_rate + max_temperature, data = coca_cola)[,-1]

ridge_model <- glmnet(X, Y, alpha = 0, lambda = 0.1)

Ridge_pred <- predict(ridge_model, s = 0.1, newx = X)

coef_ridge <- as.data.frame(as.matrix(coef(ridge_model)))
coef_ridge$features <- rownames(coef_ridge)
rownames(coef_ridge) <- NULL
names(coef_ridge)[1] <- "Coeficient Estimate"

print(coef_ridge)
##   Coeficient Estimate        features
## 1          2630303.15     (Intercept)
## 2           -41562.05  inflation_rate
## 3           126487.66 max_temperature
# Calculate the RMSE
Ridge_rmse <- rmse(Y, Ridge_pred)
print(Ridge_rmse)
## [1] 485973.6
#Calculate the AIC
n <- nrow(X)  # número de observaciones
p <- ncol(X)  # número de predictores

Ridge_pred <- predict(ridge_model, s = 0.1, newx = X)

RSS <- sum((Y - Ridge_pred)^2)

AIC_ridge <- n * log(RSS/n) + 2 * (p + 1)  # +1 porque incluye el término de intercepto

print(AIC_ridge)
## [1] 1263.015

Diagnostic tests

residuals_ridge <- Y - Ridge_pred[,1]

# Calculate the VIF 
vif_data <- as.data.frame(X)
vif_results <- vif(lm(Y ~ ., data = vif_data))
print(vif_results)
##  inflation_rate max_temperature 
##         1.45828         1.45828
# Heteroscedasticity: Breusch-Pagan
bp_test <- bptest(lm(Y ~ X)) # La función bptest requiere el modelo lineal como entrada
print(bp_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm(Y ~ X)
## BP = 1.1321, df = 2, p-value = 0.5678
jb_test <- jarque.bera.test(residuals_ridge)
print(jb_test)
## 
##  Jarque Bera Test
## 
## data:  residuals_ridge
## X-squared = 0.62314, df = 2, p-value = 0.7323

Conclusions

# According to Linear Regression and Polynomial Model, maximum temperature and ITAEE affect Coca Cola sales with a confidence level of 99%.
# The regression model with the best performance was Lasso model, with an RMSE of 410568.3 and an AIC of 1246.829, making it the most reliable.
# In Lasso model I only included maximum temperature and ITAEE, and it was the one with best performance. 
# With a p-value of 0.4117, we accept the null hypothesis that there is no heteroscedasticity, meaning the variance of the errors is constant.
# With a p-value of 0.91 in Lasso, we accept the null hypothesis that the residuals follow a normal distribution. 
# Of all the variables, the only one that shows multicollinearity is job density, with a VIF of 10.69, which is why it was not used in the models.
# The model that presented the least heteroscedasticity was the linear regression model with a p-value = 0.6011.