# Loading
library(xts)
library(dplyr)
library(zoo)
library(tseries)
library(stats)
library(forecast)
library(astsa)
library(corrplot)
library(AER)
library(vars)
library(dynlm)
library(vars)
library(TSstudio)
library(tidyverse)
library(sarima)
library(dygraphs)
library(ggplot2)
library(reshape2)
library(car)
library(Metrics)

a) EXPLORATORY DATA ANALYSIS (20 pts):

# Load the monthly Coca-Cola sales data.
# The CSV location defaults to the original author's path, but it can be
# overridden with the COCA_COLA_SALES_CSV environment variable so the script
# can run on another machine without editing the code.
data_path <- Sys.getenv(
  "COCA_COLA_SALES_CSV",
  unset = "/Users/genarorodriguezalcantara/Desktop/Tec/Introduction to econometrics/EXAM/coca_cola_sales.csv"
)
# Fail early with an explicit message rather than a generic connection error.
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}
dataset <- read.csv(data_path)

# Show the column names, types and first values.
str(dataset)
## 'data.frame':    48 obs. of  15 variables:
##  $ tperiod           : chr  "15-ene" "15-feb" "15-mar" "15-abr" ...
##  $ sales_unitboxes   : num  5516689 5387496 5886747 6389182 6448275 ...
##  $ consumer_sentiment: num  38.1 37.5 38.5 37.8 38 ...
##  $ CPI               : num  87.1 87.3 87.6 87.4 87 ...
##  $ inflation_rate    : num  -0.09 0.19 0.41 -0.26 -0.5 0.17 0.15 0.21 0.37 0.51 ...
##  $ unemp_rate        : num  0.0523 0.0531 0.0461 0.051 0.0552 0.0507 0.0542 0.0547 0.0538 0.0539 ...
##  $ gdp_percapita     : num  11660 11660 11660 11626 11626 ...
##  $ itaee             : num  104 104 104 108 108 ...
##  $ itaee_growth      : num  0.0497 0.0497 0.0497 0.0318 0.0318 0.0318 0.0565 0.0565 0.0565 0.0056 ...
##  $ pop_density       : num  98.5 98.5 98.5 98.8 98.8 ...
##  $ job_density       : num  18.3 18.5 18.6 18.7 18.7 ...
##  $ pop_minwage       : num  9.66 9.66 9.66 9.59 9.59 ...
##  $ exchange_rate     : num  14.7 14.9 15.2 15.2 15.3 ...
##  $ max_temperature   : int  28 31 29 32 34 32 29 29 29 29 ...
##  $ holiday_month     : int  0 0 0 1 0 0 0 0 1 0 ...
# Number of rows (observations) in the data frame.
num_observations <- dim(dataset)[1]
print(num_observations)
## [1] 48
# Total number of NA cells across all columns.
missing_values <- length(which(is.na(dataset)))
print(missing_values)
## [1] 0
# Descriptive statistics for every column.
summary(object = dataset)
##    tperiod          sales_unitboxes   consumer_sentiment      CPI        
##  Length:48          Min.   :5301755   Min.   :28.67      Min.   : 86.97  
##  Class :character   1st Qu.:6171767   1st Qu.:35.64      1st Qu.: 89.18  
##  Mode  :character   Median :6461357   Median :36.76      Median : 92.82  
##                     Mean   :6473691   Mean   :37.15      Mean   : 93.40  
##                     3rd Qu.:6819782   3rd Qu.:38.14      3rd Qu.: 98.40  
##                     Max.   :7963063   Max.   :44.87      Max.   :103.02  
##  inflation_rate      unemp_rate      gdp_percapita       itaee      
##  Min.   :-0.5000   Min.   :0.03470   Min.   :11559   Min.   :103.8  
##  1st Qu.: 0.1650   1st Qu.:0.04010   1st Qu.:11830   1st Qu.:111.5  
##  Median : 0.3850   Median :0.04370   Median :12014   Median :113.5  
##  Mean   : 0.3485   Mean   :0.04442   Mean   :11979   Mean   :113.9  
##  3rd Qu.: 0.5575   3rd Qu.:0.04895   3rd Qu.:12162   3rd Qu.:117.1  
##  Max.   : 1.7000   Max.   :0.05520   Max.   :12329   Max.   :122.5  
##   itaee_growth      pop_density      job_density     pop_minwage    
##  Min.   :0.00560   Min.   : 98.54   Min.   :18.26   Min.   : 9.398  
##  1st Qu.:0.02237   1st Qu.: 99.61   1st Qu.:19.28   1st Qu.:10.794  
##  Median :0.02995   Median :100.67   Median :20.39   Median :11.139  
##  Mean   :0.03172   Mean   :100.65   Mean   :20.38   Mean   :11.116  
##  3rd Qu.:0.04300   3rd Qu.:101.69   3rd Qu.:21.60   3rd Qu.:11.413  
##  Max.   :0.05650   Max.   :102.69   Max.   :22.36   Max.   :13.026  
##  exchange_rate   max_temperature holiday_month 
##  Min.   :14.69   Min.   :26.00   Min.   :0.00  
##  1st Qu.:17.38   1st Qu.:29.00   1st Qu.:0.00  
##  Median :18.62   Median :30.00   Median :0.00  
##  Mean   :18.18   Mean   :30.50   Mean   :0.25  
##  3rd Qu.:19.06   3rd Qu.:32.25   3rd Qu.:0.25  
##  Max.   :21.39   Max.   :37.00   Max.   :1.00

Comments:

The dependent variable, “sales_unitboxes,” has a minimum value of 5,301,755 and a maximum value of 7,963,063. The “consumer_sentiment” variable ranges from 28.67 to 44.87, indicating the level of consumer sentiment. The “CPI” (Consumer Price Index) ranges from 86.97 to 103.02, representing the price level of a basket of consumer goods and services. The “inflation_rate” variable ranges from -0.5 to 1.7, indicating the rate of inflation (negative values correspond to periods of falling prices). The “unemp_rate” variable ranges from 0.0347 to 0.0552, representing the unemployment rate as a proportion (3.47% to 5.52%). The “gdp_percapita” variable ranges from 11,559 to 12,329, indicating the GDP per capita. Other variables such as “itaee,” “pop_density,” “job_density,” “pop_minwage,” “exchange_rate,” “max_temperature,” and “holiday_month” also have their respective summary statistics. The dataset contains 48 observations and no missing values.

b) DATA VISUALIZATION (20 pts):

Pair-wise graph (scatter plot)

# Scatter plot of sales against consumer sentiment; each point carries its
# period label, drawn slightly above the marker.
ggplot(
  data = dataset,
  mapping = aes(x = sales_unitboxes, y = consumer_sentiment)
) +
  geom_point() +
  geom_text(mapping = aes(label = tperiod), size = 3, hjust = 0.5, vjust = -0.5) +
  labs(
    title = "Pair-wise Graph: Sales Unit Boxes vs. Consumer Sentiment",
    x = "Sales Unit Boxes",
    y = "Consumer Sentiment"
  )

#### Data patterns: The x-axis shows “Sales Unit Boxes” (the number of unit boxes sold in each month) and the y-axis shows “Consumer Sentiment” (the level of consumer sentiment); each point is one month, labelled with its period. Ranges: Sales on the x-axis range from about 5.3 million to about 8.0 million unit boxes, and consumer sentiment on the y-axis ranges from about 28.7 to 44.9, consistent with the summary statistics above. Time trends: Because neither axis is time, this plot cannot by itself show whether sales or consumer sentiment increased over time; a time-series plot of each variable would be needed to assess that. Correlation: Any positive association between sales unit boxes and consumer sentiment should be treated as tentative, since in the regressions of section c) the coefficient on consumer_sentiment is positive but not statistically significant. If a relationship between the two variables were confirmed, it would be valuable for businesses and investors to understand market trends and consumer behavior and to make informed decisions regarding sales strategies and investments.

Frequency plot (histogram):

# Histogram of sales, using bins that are 500,000 unit boxes wide.
ggplot(data = dataset, mapping = aes(x = sales_unitboxes)) +
  geom_histogram(binwidth = 5e5) +
  labs(
    title = "Frequency Plot: Sales Unit Boxes",
    x = "Sales Unit Boxes",
    y = "Frequency"
  )

Correlation matrix plot (heatmap):

# Pairwise correlations between sales and three of the macro indicators.
cor_matrix <- cor(dataset[, c("sales_unitboxes", "consumer_sentiment", "CPI", "gdp_percapita")])

# Heatmap of the correlation matrix (melt() reshapes it to Var1/Var2/value).
# A diverging scale centred at 0 and fixed to [-1, 1] is used so that colour
# encodes both the sign and the strength of each correlation; an auto-scaled
# two-colour gradient would stretch to whatever range the data happen to have.
ggplot(data = melt(cor_matrix), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limits = c(-1, 1)
  ) +
  labs(x = "", y = "") +
  ggtitle("Correlation Matrix Plot")

#### Comments: Positive correlation: A positive correlation coefficient (close to +1) indicates a strong positive relationship between two variables. As one variable increases, the other variable tends to increase as well. Negative correlation: A negative correlation coefficient (close to -1) indicates a strong negative relationship between two variables. As one variable increases, the other variable tends to decrease. No correlation: A correlation coefficient close to 0 indicates no significant linear relationship between two variables.

Box plot

# Box plot of sales for holiday vs. non-holiday months.
# holiday_month is stored as an integer 0/1 flag, so it is converted to a
# factor here: with a continuous x, ggplot2 draws a single box for all rows
# and warns "Continuous x aesthetic" instead of drawing one box per group.
ggplot(dataset, aes(x = factor(holiday_month), y = sales_unitboxes)) +
  geom_boxplot() +
  labs(x = "Holiday Month", y = "Sales Unit Boxes") +
  ggtitle("Box Plot: Sales Unit Boxes by Holiday Month")

c) LINEAR REGRESSION MODEL SPECIFICATION (20 pts):

Hypotheses: Null Hypothesis (H0): There is no significant relationship between consumer sentiment and sales unit boxes. Alternative Hypothesis (HA): There is a significant relationship between consumer sentiment and sales unit boxes.

# Model 1: linear regression (lm) of sales_unitboxes on consumer sentiment,
# CPI, the inflation rate and GDP per capita.
# NOTE: the call is stored in the fitted object and echoed by summary(), so
# the formula is written inline rather than built in a separate variable.
model1 <- lm(sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + gdp_percapita, data = dataset)
# Print the residual quantiles, coefficient table and overall fit statistics.
summary(model1)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + 
##     gdp_percapita, data = dataset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1060991  -346279    45531   396795   933112 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        1163357.0  7274002.0   0.160  0.87368   
## consumer_sentiment   21558.8    35115.4   0.614  0.54249   
## CPI                  33575.0    46084.5   0.729  0.47023   
## inflation_rate     -662281.1   237122.7  -2.793  0.00776 **
## gdp_percapita          133.9      860.2   0.156  0.87701   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 545000 on 43 degrees of freedom
## Multiple R-squared:  0.2418, Adjusted R-squared:  0.1713 
## F-statistic: 3.429 on 4 and 43 DF,  p-value: 0.01612

Comments on results:

Model 1: The coefficient estimates for the variables are as follows: consumer_sentiment: 21558.8; CPI: 33575.0; inflation_rate: -662281.1; gdp_percapita: 133.9. The p-values for the variables are as follows: consumer_sentiment: 0.54249; CPI: 0.47023; inflation_rate: 0.00776; gdp_percapita: 0.87701. Only inflation_rate is statistically significant at the 5% level. The multiple R-squared value is 0.2418, indicating that the model explains 24.18% of the variance in the sales_unitboxes variable.

# Model 2: extends Model 1 with the two economic-activity regressors itaee and
# itaee_growth; the response and the remaining predictors are unchanged.
# NOTE: the call is stored in the fitted object and echoed by summary(), so
# the formula is written inline rather than built in a separate variable.
model2 <- lm(sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + gdp_percapita + itaee + itaee_growth, data = dataset)
# Print the residual quantiles, coefficient table and overall fit statistics.
summary(model2)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + 
##     gdp_percapita + itaee + itaee_growth, data = dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -868008 -339877   39144  334372 1006053 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -9419489    7055858  -1.335 0.189246    
## consumer_sentiment    27338      31252   0.875 0.386789    
## CPI                  -96181      54092  -1.778 0.082806 .  
## inflation_rate      -801003     217075  -3.690 0.000653 ***
## gdp_percapita          1027        814   1.262 0.214060    
## itaee                105892      30668   3.453 0.001301 ** 
## itaee_growth       -6992265    5529195  -1.265 0.213153    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 483300 on 41 degrees of freedom
## Multiple R-squared:  0.4316, Adjusted R-squared:  0.3484 
## F-statistic: 5.189 on 6 and 41 DF,  p-value: 0.0004756

Comments on results:

Model 2: The coefficient estimates for the variables are as follows: consumer_sentiment: 27338; CPI: -96181; inflation_rate: -801003; gdp_percapita: 1027; itaee: 105892; itaee_growth: -6992265. The p-values for the variables are as follows: consumer_sentiment: 0.386789; CPI: 0.082806; inflation_rate: 0.000653; gdp_percapita: 0.214060; itaee: 0.001301; itaee_growth: 0.213153. The multiple R-squared value is 0.4316, indicating that the model explains 43.16% of the variance in the sales_unitboxes variable.

Multicollinearity

# Variance inflation factor of each Model 1 regressor (multicollinearity check).
car::vif(mod = model1)
## consumer_sentiment                CPI     inflation_rate      gdp_percapita 
##           1.628463           8.596376           1.350306           7.393816
# Variance inflation factor of each Model 2 regressor (multicollinearity check).
car::vif(mod = model2)
## consumer_sentiment                CPI     inflation_rate      gdp_percapita 
##           1.640403          15.062517           1.439226           8.419600 
##              itaee       itaee_growth 
##           4.275842           1.312193

Heteroscedasticity

# Residuals-vs-fitted diagnostic (panel 1 of the lm plot method) for Model 1
# and then Model 2, used as a visual check of the error variance.
invisible(lapply(list(model1, model2), plot, which = 1))

# Akaike information criterion of each fitted model.
AIC(object = model1)
## [1] 1410.965
AIC(object = model2)
## [1] 1401.138
# In-sample RMSE: square root of the mean squared residual.
# Model 1
rmse_model1 <- sqrt(mean(residuals(model1)^2))

# Model 2
rmse_model2 <- sqrt(mean(residuals(model2)^2))

Interpret the regression results (model2)

  • Intercept: The intercept term in the regression equation is estimated to be -9419489. This represents the expected value of sales_unitboxes when all the independent variables are zero.

  • Consumer Sentiment: The coefficient estimate for consumer_sentiment is 27338 with a standard error of 31252. However, the p-value of 0.386789 suggests that consumer_sentiment is not statistically significant in explaining the variation in sales_unitboxes.

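  • CPI: The coefficient estimate for CPI is -96181 with a standard error of 54092. The p-value of 0.082806 indicates that CPI is not statistically significant at the 5% level, although it is marginally significant at the 10% level. Its variance inflation factor of about 15 in model2 also suggests that this estimate is affected by multicollinearity.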

  • Inflation Rate: The coefficient estimate for inflation_rate is -801003 with a standard error of 217075. The p-value of 0.000653 suggests that inflation_rate has a statistically significant negative impact on sales_unitboxes.

  • GDP per Capita: The coefficient estimate for gdp_percapita is 1027 with a standard error of 814. However, the p-value of 0.214060 indicates that gdp_percapita is not statistically significant in explaining the variation in sales_unitboxes.

  • ITAEE: The coefficient estimate for itaee is 105892 with a standard error of 30668. The p-value of 0.001301 suggests that itaee has a statistically significant positive impact on sales_unitboxes.

  • ITAEE Growth: The coefficient estimate for itaee_growth is -6992265 with a standard error of 5529195. However, the p-value of 0.213153 indicates that itaee_growth is not statistically significant in explaining the variation in sales_unitboxes.

Overall, the regression model suggests that inflation_rate and itaee are the significant variables in explaining the variation in sales_unitboxes. An increase in inflation_rate is associated with a decrease in sales_unitboxes, while an increase in itaee is associated with an increase in sales_unitboxes. However, it is important to note that the adjusted R-squared value of 0.3484 indicates that the model explains only about 34.84% of the variation in sales_unitboxes.

# Effects plots for the two significant predictors of model2 (inflation_rate
# and itaee): predicted sales as one predictor moves across its observed range
# while every other regressor is held at its sample mean.

# Baseline data: a copy of the data with these regressors fixed at their means.
mean_dataset <- dataset
mean_dataset$consumer_sentiment <- mean(dataset$consumer_sentiment)
mean_dataset$CPI <- mean(dataset$CPI)
mean_dataset$gdp_percapita <- mean(dataset$gdp_percapita)
mean_dataset$itaee_growth <- mean(dataset$itaee_growth)

# One grid point per row, so each grid can replace a whole column of the
# baseline data regardless of how many observations the dataset has.
n_grid <- nrow(mean_dataset)

# Evenly spaced grid over the observed range of inflation_rate
inflation_seq <- seq(min(dataset$inflation_rate), max(dataset$inflation_rate), length.out = n_grid)

# Evenly spaced grid over the observed range of itaee
itaee_seq <- seq(min(dataset$itaee), max(dataset$itaee), length.out = n_grid)

# Predict sales_unitboxes for varying inflation_rate, with itaee at its mean.
# The column is overwritten in place: data.frame(mean_dataset, inflation_rate =
# ...) would append a duplicate column that check.names renames to
# "inflation_rate.1", so predict() would silently keep using the observed
# inflation values instead of the grid.
inflation_newdata <- mean_dataset
inflation_newdata$itaee <- mean(dataset$itaee)
inflation_newdata$inflation_rate <- inflation_seq
inflation_predictions <- predict(model2, newdata = inflation_newdata)

# Predict sales_unitboxes for varying itaee, with inflation_rate at its mean
# (column overwritten in place for the same reason as above).
itaee_newdata <- mean_dataset
itaee_newdata$inflation_rate <- mean(dataset$inflation_rate)
itaee_newdata$itaee <- itaee_seq
itaee_predictions <- predict(model2, newdata = itaee_newdata)

# Create effects plots
plot(inflation_seq, inflation_predictions, type = "l", xlab = "Inflation Rate", ylab = "Sales Unit Boxes", main = "Effects Plot: Inflation Rate")

plot(itaee_seq, itaee_predictions, type = "l", xlab = "ITAEE", ylab = "Sales Unit Boxes", main = "Effects Plot: ITAEE")

# In-sample predictions: evaluate Model 2 on the data it was fitted to.
predictions <- predict(object = model2, newdata = dataset)

# Show the first six predicted values.
head(predictions, n = 6L)
##       1       2       3       4       5       6 
## 5933179 5677389 5494648 6547615 6787052 6265862