# Loading
library(xts)
library(dplyr)
library(zoo)
library(tseries)
library(stats)
library(forecast)
library(astsa)
library(corrplot)
library(AER)
library(vars)
library(dynlm)
library(vars)
library(TSstudio)
library(tidyverse)
library(sarima)
library(dygraphs)
library(ggplot2)
library(reshape2)
library(car)
library(Metrics)
# Load the Coca-Cola sales data.
# The CSV location can be overridden through the COCA_COLA_SALES_CSV
# environment variable so the script is not tied to one machine; when the
# variable is unset, the original absolute path is used unchanged.
sales_csv_path <- Sys.getenv(
  "COCA_COLA_SALES_CSV",
  unset = "/Users/genarorodriguezalcantara/Desktop/Tec/Introduction to econometrics/EXAM/coca_cola_sales.csv"
)
# Fail early with a clear message instead of read.csv()'s connection error.
if (!file.exists(sales_csv_path)) {
  stop("Sales data file not found: ", sales_csv_path, call. = FALSE)
}
dataset <- read.csv(sales_csv_path)
# Show column names, types and the first few values of each column.
str(dataset)
## 'data.frame': 48 obs. of 15 variables:
## $ tperiod : chr "15-ene" "15-feb" "15-mar" "15-abr" ...
## $ sales_unitboxes : num 5516689 5387496 5886747 6389182 6448275 ...
## $ consumer_sentiment: num 38.1 37.5 38.5 37.8 38 ...
## $ CPI : num 87.1 87.3 87.6 87.4 87 ...
## $ inflation_rate : num -0.09 0.19 0.41 -0.26 -0.5 0.17 0.15 0.21 0.37 0.51 ...
## $ unemp_rate : num 0.0523 0.0531 0.0461 0.051 0.0552 0.0507 0.0542 0.0547 0.0538 0.0539 ...
## $ gdp_percapita : num 11660 11660 11660 11626 11626 ...
## $ itaee : num 104 104 104 108 108 ...
## $ itaee_growth : num 0.0497 0.0497 0.0497 0.0318 0.0318 0.0318 0.0565 0.0565 0.0565 0.0056 ...
## $ pop_density : num 98.5 98.5 98.5 98.8 98.8 ...
## $ job_density : num 18.3 18.5 18.6 18.7 18.7 ...
## $ pop_minwage : num 9.66 9.66 9.66 9.59 9.59 ...
## $ exchange_rate : num 14.7 14.9 15.2 15.2 15.3 ...
## $ max_temperature : int 28 31 29 32 34 32 29 29 29 29 ...
## $ holiday_month : int 0 0 0 1 0 0 0 0 1 0 ...
# Size of the data: number of rows (first element of the dimensions).
num_observations <- dim(dataset)[1L]
print(num_observations)
## [1] 48
# Completeness: total count of NA cells across all columns.
na_mask <- is.na(dataset)
missing_values <- sum(na_mask)
print(missing_values)
## [1] 0
# Per-column descriptive statistics.
print(summary(dataset))
## tperiod sales_unitboxes consumer_sentiment CPI
## Length:48 Min. :5301755 Min. :28.67 Min. : 86.97
## Class :character 1st Qu.:6171767 1st Qu.:35.64 1st Qu.: 89.18
## Mode :character Median :6461357 Median :36.76 Median : 92.82
## Mean :6473691 Mean :37.15 Mean : 93.40
## 3rd Qu.:6819782 3rd Qu.:38.14 3rd Qu.: 98.40
## Max. :7963063 Max. :44.87 Max. :103.02
## inflation_rate unemp_rate gdp_percapita itaee
## Min. :-0.5000 Min. :0.03470 Min. :11559 Min. :103.8
## 1st Qu.: 0.1650 1st Qu.:0.04010 1st Qu.:11830 1st Qu.:111.5
## Median : 0.3850 Median :0.04370 Median :12014 Median :113.5
## Mean : 0.3485 Mean :0.04442 Mean :11979 Mean :113.9
## 3rd Qu.: 0.5575 3rd Qu.:0.04895 3rd Qu.:12162 3rd Qu.:117.1
## Max. : 1.7000 Max. :0.05520 Max. :12329 Max. :122.5
## itaee_growth pop_density job_density pop_minwage
## Min. :0.00560 Min. : 98.54 Min. :18.26 Min. : 9.398
## 1st Qu.:0.02237 1st Qu.: 99.61 1st Qu.:19.28 1st Qu.:10.794
## Median :0.02995 Median :100.67 Median :20.39 Median :11.139
## Mean :0.03172 Mean :100.65 Mean :20.38 Mean :11.116
## 3rd Qu.:0.04300 3rd Qu.:101.69 3rd Qu.:21.60 3rd Qu.:11.413
## Max. :0.05650 Max. :102.69 Max. :22.36 Max. :13.026
## exchange_rate max_temperature holiday_month
## Min. :14.69 Min. :26.00 Min. :0.00
## 1st Qu.:17.38 1st Qu.:29.00 1st Qu.:0.00
## Median :18.62 Median :30.00 Median :0.00
## Mean :18.18 Mean :30.50 Mean :0.25
## 3rd Qu.:19.06 3rd Qu.:32.25 3rd Qu.:0.25
## Max. :21.39 Max. :37.00 Max. :1.00
# Scatter plot of consumer sentiment against sales; each point is labelled
# with its period so individual observations can be identified.
ggplot(
  data = dataset,
  mapping = aes(x = sales_unitboxes, y = consumer_sentiment)
) +
  geom_point() +
  geom_text(
    mapping = aes(label = tperiod),
    size = 3,
    hjust = 0.5,
    vjust = -0.5
  ) +
  labs(x = "Sales Unit Boxes", y = "Consumer Sentiment") +
  ggtitle(label = "Pair-wise Graph: Sales Unit Boxes vs. Consumer Sentiment")
#### Data patterns: Sales Unit Boxes: The x-axis represents the “Sales
Unit Boxes” variable, which shows the number of unit boxes sold in each
period. The y-axis represents the “Consumer Sentiment” variable, which
indicates the level of consumer sentiment. Sales: The values on the
x-axis range from approximately 5.3 million to 8.0 million unit boxes
(see the summary above). Because each point is one period plotted
against sentiment rather than against time, the plot by itself does not
show a trend over time; the point labels identify the period. Consumer
Sentiment: The values on the y-axis range from about 28.7 to 44.9.
Correlation: The plot is used to judge whether there is a positive
association between sales unit boxes and consumer sentiment, that is,
whether periods with more units sold also tend to show higher consumer
sentiment. Any visual impression should be confirmed with the
correlation matrix and the regression models below, where the
consumer_sentiment coefficient is positive but not statistically
significant. This information can be valuable for businesses and
investors to understand market trends, consumer behavior, and make
informed decisions regarding sales strategies and investments.
# Distribution of sales, binned in steps of 500,000 unit boxes.
ggplot(data = dataset, mapping = aes(x = sales_unitboxes)) +
  geom_histogram(binwidth = 5e5) +
  labs(x = "Sales Unit Boxes", y = "Frequency") +
  ggtitle(label = "Frequency Plot: Sales Unit Boxes")
# Pairwise Pearson correlations between sales and three macro indicators.
cor_matrix <- cor(
  dataset[, c("sales_unitboxes", "consumer_sentiment", "CPI", "gdp_percapita")]
)
# Heat map of the correlation matrix. melt() reshapes the matrix into
# (Var1, Var2, value) rows, one per cell.
# A diverging scale centred at 0 with limits fixed to [-1, 1] is used so that
# colour encodes both the sign and the absolute strength of a correlation;
# a two-colour scale with data-driven limits would paint the weakest
# correlation in the matrix as fully "blue" regardless of its actual value.
ggplot(data = melt(cor_matrix), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    limits = c(-1, 1)
  ) +
  labs(x = "", y = "") +
  ggtitle("Correlation Matrix Plot")
#### Comments: Positive correlation: A positive correlation coefficient
(close to +1) indicates a strong positive relationship between two
variables. As one variable increases, the other variable tends to
increase as well. Negative correlation: A negative correlation
coefficient (close to -1) indicates a strong negative relationship
between two variables. As one variable increases, the other variable
tends to decrease. No correlation: A correlation coefficient close to 0
indicates no significant linear relationship between two variables.
# Box plot of sales split by the holiday indicator.
# holiday_month is stored as an integer (0/1). Mapping it directly to x makes
# ggplot treat it as continuous, which draws a single box and raises the
# "Continuous x aesthetic" warning. Converting it to a factor yields one box
# per level.
ggplot(dataset, aes(x = factor(holiday_month), y = sales_unitboxes)) +
  geom_boxplot() +
  labs(x = "Holiday Month", y = "Sales Unit Boxes") +
  ggtitle("Box Plot: Sales Unit Boxes by Holiday Month")
Hypotheses: Null Hypothesis (H0): There is no significant relationship between consumer sentiment and sales unit boxes. Alternative Hypothesis (HA): There is a significant relationship between consumer sentiment and sales unit boxes.
# Model 1: ordinary least squares regression of sales_unitboxes on
# consumer_sentiment, CPI, inflation_rate and gdp_percapita.
model1 <- lm(sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + gdp_percapita, data = dataset)
# Print the coefficient table, residual summary, R-squared and F-test.
summary(model1)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate +
## gdp_percapita, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1060991 -346279 45531 396795 933112
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1163357.0 7274002.0 0.160 0.87368
## consumer_sentiment 21558.8 35115.4 0.614 0.54249
## CPI 33575.0 46084.5 0.729 0.47023
## inflation_rate -662281.1 237122.7 -2.793 0.00776 **
## gdp_percapita 133.9 860.2 0.156 0.87701
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 545000 on 43 degrees of freedom
## Multiple R-squared: 0.2418, Adjusted R-squared: 0.1713
## F-statistic: 3.429 on 4 and 43 DF, p-value: 0.01612
Model 1: The coefficient estimates for the variables are as follows: consumer_sentiment: 21558.8 CPI: 33575.0 inflation_rate: -662281.1 gdp_percapita: 133.9 The p-values for the variables are as follows: consumer_sentiment: 0.54249 CPI: 0.47023 inflation_rate: 0.00776 gdp_percapita: 0.87701 The multiple R-squared value is 0.2418, indicating that the model explains 24.18% of the variance in the sales_unitboxes variable.
# Model 2: extends Model 1's specification with two additional regressors,
# itaee and itaee_growth.
model2 <- lm(sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + gdp_percapita + itaee + itaee_growth, data = dataset)
# Print the coefficient table, residual summary, R-squared and F-test.
summary(model2)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate +
## gdp_percapita + itaee + itaee_growth, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -868008 -339877 39144 334372 1006053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9419489 7055858 -1.335 0.189246
## consumer_sentiment 27338 31252 0.875 0.386789
## CPI -96181 54092 -1.778 0.082806 .
## inflation_rate -801003 217075 -3.690 0.000653 ***
## gdp_percapita 1027 814 1.262 0.214060
## itaee 105892 30668 3.453 0.001301 **
## itaee_growth -6992265 5529195 -1.265 0.213153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 483300 on 41 degrees of freedom
## Multiple R-squared: 0.4316, Adjusted R-squared: 0.3484
## F-statistic: 5.189 on 6 and 41 DF, p-value: 0.0004756
Model 2: The coefficient estimates for the variables are as follows: consumer_sentiment: 27338 CPI: -96181 inflation_rate: -801003 gdp_percapita: 1027 itaee: 105892 itaee_growth: -6992265 The p-values for the variables are as follows: consumer_sentiment: 0.386789 CPI: 0.082806 inflation_rate: 0.000653 gdp_percapita: 0.214060 itaee: 0.001301 itaee_growth: 0.213153 The multiple R-squared value is 0.4316, indicating that the model explains 43.16% of the variance in the sales_unitboxes variable.
# Multicollinearity check: variance inflation factor of each regressor,
# first for Model 1 and then for Model 2.
car::vif(model1)
## consumer_sentiment CPI inflation_rate gdp_percapita
## 1.628463 8.596376 1.350306 7.393816
car::vif(model2)
## consumer_sentiment CPI inflation_rate gdp_percapita
## 1.640403 15.062517 1.439226 8.419600
## itaee itaee_growth
## 4.275842 1.312193
# Residuals-vs-fitted diagnostic (first plot.lm panel) for each model in turn.
invisible(lapply(list(model1, model2), plot, which = 1))
# Akaike information criterion of each model.
stats::AIC(model1)
## [1] 1410.965
stats::AIC(model2)
## [1] 1401.138
# In-sample root mean squared error of each model, computed from the
# residuals stored on the fitted object.
rmse_values <- vapply(
  list(model1, model2),
  function(fit) sqrt(mean(fit$residuals^2)),
  numeric(1)
)
# For Model 1
rmse_model1 <- rmse_values[[1]]
# For Model 2
rmse_model2 <- rmse_values[[2]]
Intercept: The intercept term in the regression equation is estimated to be -9419489. This represents the expected value of sales_unitboxes when all the independent variables are zero.
Consumer Sentiment: The coefficient estimate for consumer_sentiment is 27338 with a standard error of 31252. However, the p-value of 0.386789 suggests that consumer_sentiment is not statistically significant in explaining the variation in sales_unitboxes.
CPI: The coefficient estimate for CPI is -96181 with a standard error of 54092. The p-value of 0.082806 indicates that CPI is not statistically significant at the 5% level (it is only marginally significant at the 10% level) in explaining the variation in sales_unitboxes.
Inflation Rate: The coefficient estimate for inflation_rate is -801003 with a standard error of 217075. The p-value of 0.000653 suggests that inflation_rate has a statistically significant negative impact on sales_unitboxes.
GDP per Capita: The coefficient estimate for gdp_percapita is 1027 with a standard error of 814. However, the p-value of 0.214060 indicates that gdp_percapita is not statistically significant in explaining the variation in sales_unitboxes.
ITAEE: The coefficient estimate for itaee is 105892 with a standard error of 30668. The p-value of 0.001301 suggests that itaee has a statistically significant positive impact on sales_unitboxes.
ITAEE Growth: The coefficient estimate for itaee_growth is -6992265 with a standard error of 5529195. However, the p-value of 0.213153 indicates that itaee_growth is not statistically significant in explaining the variation in sales_unitboxes.
Overall, the regression model suggests that inflation_rate and itaee are the significant variables in explaining the variation in sales_unitboxes. An increase in inflation_rate is associated with a decrease in sales_unitboxes, while an increase in itaee is associated with an increase in sales_unitboxes. However, it is important to note that the adjusted R-squared value of 0.3484 indicates that the model explains only about 34.84% of the variation in sales_unitboxes.
# Effects plots for inflation_rate and itaee based on Model 2: vary one
# predictor over its observed range while every other Model 2 predictor is
# held at its sample mean, and plot the predicted sales.
#
# Fix: the previous version built the prediction data with
# data.frame(mean_dataset, inflation_rate = inflation_seq). Because
# mean_dataset already has an inflation_rate column, data.frame() renamed the
# new one to inflation_rate.1, so predict() ignored the sequence and used the
# observed values instead (and likewise for itaee). Both prediction vectors
# were therefore identical and unrelated to the x values being plotted. The
# focal column is now overwritten in a copy of the baseline data, and the
# other focal predictor is held at its mean as well.

# Create a new dataset with mean values for other variables
mean_dataset <- dataset
mean_dataset$consumer_sentiment <- mean(dataset$consumer_sentiment)
mean_dataset$CPI <- mean(dataset$CPI)
mean_dataset$gdp_percapita <- mean(dataset$gdp_percapita)
mean_dataset$itaee_growth <- mean(dataset$itaee_growth)

# One grid point per row of the baseline data so the columns line up.
grid_size <- nrow(mean_dataset)

# Create a sequence of values for inflation_rate
inflation_seq <- seq(min(dataset$inflation_rate), max(dataset$inflation_rate), length.out = grid_size)
# Create a sequence of values for itaee
itaee_seq <- seq(min(dataset$itaee), max(dataset$itaee), length.out = grid_size)

# Predict sales_unitboxes for varying inflation_rate (itaee held at its mean)
inflation_grid <- mean_dataset
inflation_grid$itaee <- mean(dataset$itaee)
inflation_grid$inflation_rate <- inflation_seq
inflation_predictions <- predict(model2, newdata = inflation_grid)

# Predict sales_unitboxes for varying itaee (inflation_rate held at its mean)
itaee_grid <- mean_dataset
itaee_grid$inflation_rate <- mean(dataset$inflation_rate)
itaee_grid$itaee <- itaee_seq
itaee_predictions <- predict(model2, newdata = itaee_grid)

# Create effects plots
plot(inflation_seq, inflation_predictions, type = "l", xlab = "Inflation Rate", ylab = "Sales Unit Boxes", main = "Effects Plot: Inflation Rate")
plot(itaee_seq, itaee_predictions, type = "l", xlab = "ITAEE", ylab = "Sales Unit Boxes", main = "Effects Plot: ITAEE")
# In-sample predictions: evaluate Model 2 on the same rows it was fitted to.
predictions <- predict(object = model2, newdata = dataset)
# Display the first six predicted values.
head(predictions, n = 6L)
## 1 2 3 4 5 6
## 5933179 5677389 5494648 6547615 6787052 6265862
Comments:
The dependent variable, “sales_unitboxes,” has a minimum value of 5,301,755 and a maximum value of 7,963,063. The “consumer_sentiment” variable ranges from 28.67 to 44.87, indicating the level of consumer sentiment. The “CPI” (Consumer Price Index) ranges from 86.97 to 103.02, representing the general price level of goods and services. The “inflation_rate” variable ranges from -0.5 to 1.7, indicating the rate of inflation. The “unemp_rate” variable ranges from 0.0347 to 0.0552, representing the unemployment rate. The “gdp_percapita” variable ranges from 11,559 to 12,329, indicating the GDP per capita. Other variables such as “itaee,” “pop_density,” “job_density,” “pop_minwage,” “exchange_rate,” “max_temperature,” and “holiday_month” also have their respective summary statistics.