### loading libraries
library(foreign)
library(dplyr) # data manipulation
library(forcats) # to work with categorical variables
library(ggplot2) # data visualization
library(readr) # read specific csv files
library(janitor) # data exploration and cleaning
library(Hmisc) # several useful functions for data analysis
library(psych) # functions for multivariate analysis
library(naniar) # summaries and visualization of missing values NA's
library(dlookr) # summaries and visualization of missing values NA's
library(corrplot) # correlation plots
library(jtools) # presentation of regression analysis
library(lmtest) # diagnostic checks - linear regression analysis
library(car) # diagnostic checks - linear regression analysis
library(olsrr) # diagnostic checks - linear regression analysis
library(naniar) # identifying missing values
library(stargazer) # create publication quality tables
library(effects) # displays for linear and other regression models
library(tidyverse) # collection of R packages designed for data science
library(caret) # Classification and Regression Training
library(glmnet) # methods for prediction and plotting, and functions for cross-validation
- tperiod: date
- sales_unitboxes: dependent variable; Coca-Cola sales in unit boxes
- consumer_sentiment: how consumers feel about the state of the economy
- CPI: consumer price index (2018 = 100)
- inflation_rate: change in the consumer price index (2018 = 100)
- unemp_rate: percentage of the labor force that is unemployed
- gdp_percapita: gross domestic product divided by population
- itaee: Indicator of the State Economic Activity (ITAEE)
- itaee_growth: ITAEE’s growth rate
- pop_density: population per km2
- job_density: employed population per km2
- pop_minwage: population per km2 earning 1-2 minimum wages
- exchange_rate: exchange rate U.S. - MXN
- max_temperature: average max temperature
- holiday_month: 1 if the month includes a holiday week (public holiday, Easter holiday, or Christmas); 0 otherwise
# Load the Coca-Cola sales data set into the data frame `coca`.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs as-is on a machine where this file exists at this location.
coca <- read.csv("/Users/gabrielmedina/Downloads/coca_cola_sales.csv")
# Print the full data frame for a first visual inspection.
coca
## tperiod sales_unitboxes consumer_sentiment CPI inflation_rate
## 1 15/01/21 5516689 38.063 87.110 -0.09
## 2 15/02/21 5387496 37.491 87.275 0.19
## 3 15/03/21 5886747 38.505 87.631 0.41
## 4 15/04/21 6389182 37.843 87.404 -0.26
## 5 15/05/21 6448275 38.032 86.967 -0.50
## 6 15/06/21 6697947 39.112 87.113 0.17
## 7 15/07/21 6420091 38.132 87.241 0.15
## 8 15/08/21 6474440 37.384 87.425 0.21
## 9 15/09/21 6340781 37.449 87.752 0.37
## 10 15/10/21 6539561 37.813 88.204 0.51
## 11 15/11/21 6025373 38.183 88.685 0.55
## 12 15/12/21 6714438 38.369 89.047 0.41
## 13 16/01/21 5477874 38.182 89.386 0.38
## 14 16/02/21 5580397 36.732 89.778 0.44
## 15 16/03/21 6399322 36.814 89.910 0.15
## 16 16/04/21 6780480 36.714 89.625 -0.32
## 17 16/05/21 7423475 37.485 89.226 -0.45
## 18 16/06/21 7271309 38.349 89.324 0.11
## 19 16/07/21 6872616 36.506 89.557 0.26
## 20 16/08/21 6804384 35.655 89.809 0.28
## 21 16/09/21 6779166 34.755 90.358 0.61
## 22 16/10/21 6492389 35.032 90.906 0.61
## 23 16/11/21 6105159 34.875 91.617 0.78
## 24 16/12/21 6580560 35.478 92.039 0.46
## 25 17/01/21 5757061 28.668 93.604 1.70
## 26 17/02/21 5301755 31.516 94.145 0.58
## 27 17/03/21 6272641 33.795 94.722 0.61
## 28 17/04/21 6286247 34.935 94.839 0.12
## 29 17/05/21 7345037 35.873 94.725 -0.12
## 30 17/06/21 7211316 36.010 94.964 0.25
## 31 17/07/21 6329457 36.489 95.323 0.38
## 32 17/08/21 6865977 36.506 95.794 0.49
## 33 17/09/21 6219637 36.788 96.094 0.31
## 34 17/10/21 6182126 36.437 96.698 0.63
## 35 17/11/21 6498477 36.717 97.695 1.03
## 36 17/12/21 6590566 36.315 98.273 0.59
## 37 18/01/21 5705102 34.802 98.795 0.53
## 38 18/02/21 5568552 34.189 99.171 0.38
## 39 18/03/21 6882616 34.337 99.492 0.32
## 40 18/04/21 7121483 35.612 99.155 -0.34
## 41 18/05/21 7963063 36.648 98.994 -0.16
## 42 18/06/21 7330137 37.148 99.376 0.39
## 43 18/07/21 7130397 43.341 99.909 0.54
## 44 18/08/21 7457473 43.006 100.492 0.58
## 45 18/09/21 6264685 42.133 100.917 0.42
## 46 18/10/21 6347760 42.533 101.440 0.52
## 47 18/11/21 6140687 41.675 102.303 0.85
## 48 18/12/21 6556749 44.865 103.020 0.70
## unemp_rate gdp_percapita itaee itaee_growth pop_density job_density
## 1 0.0523 11659.56 103.7654 0.0497 98.5418 18.2605
## 2 0.0531 11659.55 103.7654 0.0497 98.5419 18.4633
## 3 0.0461 11659.55 103.7654 0.0497 98.5419 18.6416
## 4 0.0510 11625.75 107.7518 0.0318 98.8284 18.6788
## 5 0.0552 11625.74 107.7518 0.0318 98.8284 18.6754
## 6 0.0507 11625.74 107.7518 0.0318 98.8285 18.6467
## 7 0.0542 11591.89 110.5957 0.0565 99.1170 18.7028
## 8 0.0547 11591.89 110.5957 0.0565 99.1171 18.7835
## 9 0.0538 11591.89 110.5957 0.0565 99.1171 18.9389
## 10 0.0539 11558.59 111.7800 0.0056 99.4026 19.0979
## 11 0.0438 11558.59 111.7800 0.0056 99.4026 19.3272
## 12 0.0489 11558.59 111.7800 0.0056 99.4026 19.1579
## 13 0.0479 11987.32 108.7077 0.0476 99.6856 19.1579
## 14 0.0485 11987.32 108.7077 0.0476 99.6857 19.3712
## 15 0.0433 11987.32 108.7077 0.0476 99.6857 19.4560
## 16 0.0452 11953.72 111.7936 0.0375 99.9659 19.5872
## 17 0.0511 11953.71 111.7936 0.0375 99.9659 19.6069
## 18 0.0458 11953.71 111.7936 0.0375 99.9659 19.6768
## 19 0.0459 11919.98 113.7051 0.0281 100.2488 19.7453
## 20 0.0491 11919.98 113.7051 0.0281 100.2488 19.8668
## 21 0.0537 11919.97 113.7051 0.0281 100.2488 20.0734
## 22 0.0442 11886.91 117.0615 0.0472 100.5277 20.3076
## 23 0.0436 11886.91 117.0615 0.0472 100.5277 20.5067
## 24 0.0405 11886.91 117.0615 0.0472 100.5277 20.2683
## 25 0.0401 12137.86 113.2336 0.0416 100.8041 20.2683
## 26 0.0364 12137.86 113.2336 0.0416 100.8041 20.4683
## 27 0.0368 12137.86 113.2336 0.0416 100.8042 20.7349
## 28 0.0409 12105.16 112.6669 0.0078 101.0764 20.7453
## 29 0.0414 12105.16 112.6669 0.0078 101.0764 20.7721
## 30 0.0378 12105.16 112.6669 0.0078 101.0764 20.8715
## 31 0.0407 12072.12 116.3738 0.0235 101.3531 20.9045
## 32 0.0445 12072.12 116.3738 0.0235 101.3531 21.1465
## 33 0.0445 12072.12 116.3738 0.0235 101.3531 21.3258
## 34 0.0414 12039.80 119.7875 0.0233 101.6251 21.5634
## 35 0.0401 12039.80 119.7875 0.0233 101.6251 21.7130
## 36 0.0347 12039.80 119.7875 0.0233 101.6251 21.4366
## 37 0.0401 12329.05 115.6723 0.0215 101.8944 21.4366
## 38 0.0393 12329.05 115.6723 0.0215 101.8944 21.6969
## 39 0.0359 12329.04 115.6723 0.0215 101.8945 21.7603
## 40 0.0414 12296.98 117.3254 0.0413 102.1602 21.8253
## 41 0.0384 12296.98 117.3254 0.0413 102.1602 21.8741
## 42 0.0407 12296.97 117.3254 0.0413 102.1602 21.9094
## 43 0.0394 12264.69 118.9366 0.0220 102.4291 21.8432
## 44 0.0454 12264.69 118.9366 0.0220 102.4291 22.0394
## 45 0.0399 12264.69 118.9366 0.0220 102.4291 22.1380
## 46 0.0366 12233.00 122.4821 0.0225 102.6945 22.2484
## 47 0.0379 12233.00 122.4821 0.0225 102.6945 22.3622
## 48 0.0414 12232.99 122.4821 0.0225 102.6945 21.9749
## pop_minwage exchange_rate max_temperature holiday_month
## 1 9.6579 14.6926 28 0
## 2 9.6579 14.9213 31 0
## 3 9.6579 15.2283 29 0
## 4 9.5949 15.2262 32 1
## 5 9.5949 15.2645 34 0
## 6 9.5949 15.4830 32 0
## 7 9.3984 15.9396 29 0
## 8 9.3984 16.5368 29 0
## 9 9.3984 16.8578 29 1
## 10 10.6757 16.5640 29 0
## 11 10.6757 16.6357 29 0
## 12 10.6757 17.0666 26 1
## 13 11.3009 18.0728 28 0
## 14 11.3009 18.4731 31 0
## 15 11.3009 17.6490 32 1
## 16 10.8817 17.4877 33 0
## 17 10.8817 18.1542 35 0
## 18 10.8817 18.6530 33 0
## 19 10.8337 18.6014 31 0
## 20 10.8337 18.4749 32 0
## 21 10.8337 19.1924 33 1
## 22 10.9448 18.8924 29 0
## 23 10.9448 20.1185 29 0
## 24 10.9448 20.5206 28 1
## 25 11.3279 21.3853 29 0
## 26 11.3279 20.2905 30 0
## 27 11.3279 19.3010 31 0
## 28 11.2363 18.7875 33 1
## 29 11.2363 18.7557 36 0
## 30 11.2363 18.1326 35 0
## 31 11.0423 17.8283 29 0
## 32 11.0423 17.8070 29 0
## 33 11.0423 17.8357 30 1
## 34 11.2409 18.8161 30 0
## 35 11.2409 18.9158 30 0
## 36 11.2409 19.1812 27 1
## 37 12.7219 18.9074 27 0
## 38 12.7219 18.6449 29 0
## 39 12.7219 18.6308 33 1
## 40 13.0263 18.3872 33 0
## 41 13.0263 19.5910 37 0
## 42 13.0263 20.3032 35 0
## 43 12.2970 19.0095 31 0
## 44 12.2970 18.8575 29 0
## 45 12.2970 19.0154 28 1
## 46 11.6695 19.1859 28 0
## 47 11.6695 20.2612 28 0
## 48 11.6695 20.1112 26 1
# Show the structure of `coca`: number of rows/columns and the storage type of
# each column, to spot variables that need a type conversion.
str(coca)
## 'data.frame': 48 obs. of 15 variables:
## $ tperiod : chr "15/01/21" "15/02/21" "15/03/21" "15/04/21" ...
## $ sales_unitboxes : num 5516689 5387496 5886747 6389182 6448275 ...
## $ consumer_sentiment: num 38.1 37.5 38.5 37.8 38 ...
## $ CPI : num 87.1 87.3 87.6 87.4 87 ...
## $ inflation_rate : num -0.09 0.19 0.41 -0.26 -0.5 0.17 0.15 0.21 0.37 0.51 ...
## $ unemp_rate : num 0.0523 0.0531 0.0461 0.051 0.0552 0.0507 0.0542 0.0547 0.0538 0.0539 ...
## $ gdp_percapita : num 11660 11660 11660 11626 11626 ...
## $ itaee : num 104 104 104 108 108 ...
## $ itaee_growth : num 0.0497 0.0497 0.0497 0.0318 0.0318 0.0318 0.0565 0.0565 0.0565 0.0056 ...
## $ pop_density : num 98.5 98.5 98.5 98.8 98.8 ...
## $ job_density : num 18.3 18.5 18.6 18.7 18.7 ...
## $ pop_minwage : num 9.66 9.66 9.66 9.59 9.59 ...
## $ exchange_rate : num 14.7 14.9 15.2 15.2 15.3 ...
## $ max_temperature : int 28 31 29 32 34 32 29 29 29 29 ...
## $ holiday_month : int 0 0 0 1 0 0 0 0 1 0 ...
We begin to analyze the structure of the database and the nature of each of our variables. It is important to note that our dependent variable will be the sales of Coca Cola in units of boxes. You can see how there are some variables in incorrect formats such as period, maximum temperature and holidays, which must be a factor. Therefore, it is necessary to apply transformations to these variables.
# Count the total number of NA cells across the whole data frame.
sum(is.na(coca))
## [1] 0
No missing values found
# Convert tperiod from character to Date.
# The raw values look like "15/01/21" ... "18/12/21": the first field runs
# 15-18 and the second 01-12, i.e. two-digit year / month / day
# (2015-2018, monthly). The format string must therefore be "%y/%m/%d";
# a format that does not match the input makes as.Date() return NA for every row.
# NOTE(review): the day field is always 21 in the data shown — confirm this is
# intended and not an export artifact.
coca$tperiod <- as.Date(coca$tperiod, format = "%y/%m/%d")
# Confirm the column is now of class "Date".
class(coca$tperiod)
## [1] "Date"
# Descriptive statistics (min, quartiles, mean, max, NA count) for every column.
summary(coca)
## tperiod sales_unitboxes consumer_sentiment CPI
## Min. :NA Min. :5301755 Min. :28.67 Min. : 86.97
## 1st Qu.:NA 1st Qu.:6171767 1st Qu.:35.64 1st Qu.: 89.18
## Median :NA Median :6461357 Median :36.76 Median : 92.82
## Mean :NaN Mean :6473691 Mean :37.15 Mean : 93.40
## 3rd Qu.:NA 3rd Qu.:6819782 3rd Qu.:38.14 3rd Qu.: 98.40
## Max. :NA Max. :7963063 Max. :44.87 Max. :103.02
## NA's :48
## inflation_rate unemp_rate gdp_percapita itaee
## Min. :-0.5000 Min. :0.03470 Min. :11559 Min. :103.8
## 1st Qu.: 0.1650 1st Qu.:0.04010 1st Qu.:11830 1st Qu.:111.5
## Median : 0.3850 Median :0.04370 Median :12014 Median :113.5
## Mean : 0.3485 Mean :0.04442 Mean :11979 Mean :113.9
## 3rd Qu.: 0.5575 3rd Qu.:0.04895 3rd Qu.:12162 3rd Qu.:117.1
## Max. : 1.7000 Max. :0.05520 Max. :12329 Max. :122.5
##
## itaee_growth pop_density job_density pop_minwage
## Min. :0.00560 Min. : 98.54 Min. :18.26 Min. : 9.398
## 1st Qu.:0.02237 1st Qu.: 99.61 1st Qu.:19.28 1st Qu.:10.794
## Median :0.02995 Median :100.67 Median :20.39 Median :11.139
## Mean :0.03172 Mean :100.65 Mean :20.38 Mean :11.116
## 3rd Qu.:0.04300 3rd Qu.:101.69 3rd Qu.:21.60 3rd Qu.:11.413
## Max. :0.05650 Max. :102.69 Max. :22.36 Max. :13.026
##
## exchange_rate max_temperature holiday_month
## Min. :14.69 Min. :26.00 Min. :0.00
## 1st Qu.:17.38 1st Qu.:29.00 1st Qu.:0.00
## Median :18.62 Median :30.00 Median :0.00
## Mean :18.18 Mean :30.50 Mean :0.25
## 3rd Qu.:19.06 3rd Qu.:32.25 3rd Qu.:0.25
## Max. :21.39 Max. :37.00 Max. :1.00
##
The data set provides relevant information on several variables that can influence Coca-Cola unit-box sales. Notably, the dependent variable “sales_unitboxes” ranges from approximately 5.3 million to 7.96 million, with a mean of about 6.47 million. The 48 NA values reported for tperiod are not genuinely missing data: they result from parsing the dates with a format string that does not match the raw values.
# Scatter plot: sales (unit boxes) against consumer sentiment.
ggplot(coca, aes(consumer_sentiment, sales_unitboxes)) +
  geom_point(colour = "blue") +
  labs(
    title = "Scatter Plot: Sales Unit Boxes vs Consumer Sentiment",
    x = "Consumer Sentiment",
    y = "Sales Unit Boxes"
  )

# Scatter plot: sales (unit boxes) against the consumer price index.
ggplot(coca, aes(CPI, sales_unitboxes)) +
  geom_point(colour = "green") +
  labs(
    title = "Scatter Plot: Sales Unit Boxes vs CPI",
    x = "CPI",
    y = "Sales Unit Boxes"
  )
# Scatter plot: sales (unit boxes) against the inflation rate.
# Uses geom_point() like the other bivariate plots: the observations are not
# ordered by inflation rate, so connecting them with a line would draw a
# misleading path, and the title announces a scatter plot.
ggplot(data = coca, aes(x = inflation_rate, y = sales_unitboxes)) +
  geom_point(color = "red") +
  labs(x = "inflation rate", y = "Sales Unit Boxes") +
  ggtitle("Scatter Plot: Sales Unit vs inflation rate")
# holiday_month is a 0/1 indicator; store it as a factor so it is treated as
# a categorical variable in plots.
coca$holiday_month <- as.factor(coca$holiday_month)

# Bar plot of mean sales for holiday vs. non-holiday months.
# stat = "summary" with fun = "mean" makes each bar the group mean.
ggplot(data = coca, aes(x = holiday_month, y = sales_unitboxes, fill = holiday_month)) +
  geom_bar(stat = "summary", fun = "mean", position = "dodge") +
  labs(x = "Holiday Month", y = "Mean Sales Unit Boxes") +
  ggtitle("Bar Plot: Mean Sales Unit Boxes by Holiday Month")
In the first graph, a slight positive correlation can be seen between consumer sentiment and the company’s sales. The relationship between the CPI and the dependent variable also appears to be positive, with a low correlation coefficient. In the Sales Unit vs inflation rate graph, no clear correlation pattern is seen between these variables. Lastly, mean sales in holiday months are higher than in months without holidays, although the difference is not as large as one might expect. This gives us an idea of which variables have predictive potential for the dependent variable.
# One histogram per numeric predictor, using hist() defaults, to inspect the
# shape of each distribution (skewness, outliers) before modelling.
hist(coca$consumer_sentiment)
hist(coca$CPI)
hist(coca$inflation_rate)
hist(coca$unemp_rate)
hist(coca$itaee)
hist(coca$itaee_growth)
hist(coca$pop_density)
hist(coca$job_density)
hist(coca$pop_minwage)
hist(coca$max_temperature)
hist(coca$exchange_rate)
hist(coca$gdp_percapita)
These histogram graphs allow us to analyze the distribution of the data
for each variable, some such as CPI, Job_density and GDP_per_capita are
observed that show a bias in their data. Therefore, it may be
interesting to apply a normalization with logarithm.
# Histogram of the dependent variable with its mean marked in red.
# The x-axis upper limit is 8,000,000 because the maximum observed value is
# 7,963,063; a limit of 7,500,000 would clip the upper tail of the
# distribution out of the plot.
hist(coca$sales_unitboxes,
     main = "Sales by unitboxes",
     xlab = "Sales in Unitboxes",
     ylab = "Frequency",
     col = "blue",
     border = "black",
     xlim = c(5300000, 8000000),
     ylim = c(0, 10),
     breaks = 20)
# Vertical reference line at the sample mean of sales.
abline(v = mean(coca$sales_unitboxes), col = "red", lwd = 2)
Regarding the dependent variable, a normal distribution of the data is
seen.
# Drop the non-numeric columns (the date and the holiday factor) so that
# cor() receives numeric columns only.
coca_sub <- coca[, !colnames(coca) %in% c("tperiod", "holiday_month")]

# Pairwise correlation matrix of sales and all numeric predictors
# (cor() defaults).
cor_matrix <- cor(coca_sub)

library(corrplot)
# Upper-triangle correlation plot with variables ordered by hierarchical
# clustering and the coefficients printed in each cell.
corrplot(cor_matrix,
         type = "upper",
         order = "hclust",
         addCoef.col = "black",
         tl.cex = 0.7)
To better understand the correlation of all the variables, a graph and a
correlation matrix were made. This allows us to identify variables that
show high correlation with the dependent variable and that can
potentially be excellent predictors for a linear regression model.
Likewise, it allows identifying the high correlations between the
independent variables, which will cause multicollinearity problems. This
problem is inherent in the nature of the data, since they are
macroeconomic variables, each of which is related to some extent to
the others.
# Print the full numeric correlation matrix of the numeric columns.
cor(coca_sub)
## sales_unitboxes consumer_sentiment CPI
## sales_unitboxes 1.00000000 0.22670601 0.21286896
## consumer_sentiment 0.22670601 1.00000000 0.21765036
## CPI 0.21286896 0.21765036 1.00000000
## inflation_rate -0.33892955 -0.14477176 0.33311898
## unemp_rate -0.07511561 0.13112228 -0.79692370
## gdp_percapita 0.20993984 -0.02580430 0.89212873
## itaee 0.31776870 0.21036414 0.85296096
## itaee_growth -0.23644607 -0.17855702 -0.40215613
## pop_density 0.29642271 0.16559922 0.97873559
## job_density 0.28974759 0.13709293 0.97736968
## pop_minwage 0.27706139 -0.01774064 0.83370714
## exchange_rate 0.17754338 -0.20924928 0.67444061
## max_temperature 0.57154833 -0.23028372 -0.08988001
## inflation_rate unemp_rate gdp_percapita itaee
## sales_unitboxes -0.3389296 -0.07511561 0.2099398 0.3177687
## consumer_sentiment -0.1447718 0.13112228 -0.0258043 0.2103641
## CPI 0.3331190 -0.79692370 0.8921287 0.8529610
## inflation_rate 1.0000000 -0.37521243 0.2242198 0.4246480
## unemp_rate -0.3752124 1.00000000 -0.7959235 -0.6660985
## gdp_percapita 0.2242198 -0.79592354 1.0000000 0.6928207
## itaee 0.4246480 -0.66609852 0.6928207 1.0000000
## itaee_growth -0.1078732 0.32911242 -0.2432080 -0.3811852
## pop_density 0.3316906 -0.79011859 0.9107825 0.9072726
## job_density 0.3380064 -0.80507262 0.8962504 0.9031985
## pop_minwage 0.1913408 -0.73619228 0.9082651 0.6706054
## exchange_rate 0.5380607 -0.70976863 0.7536566 0.7558331
## max_temperature -0.5605895 0.02806429 0.1432913 -0.1976924
## itaee_growth pop_density job_density pop_minwage
## sales_unitboxes -0.236446066 0.29642271 0.28974759 0.27706139
## consumer_sentiment -0.178557021 0.16559922 0.13709293 -0.01774064
## CPI -0.402156127 0.97873559 0.97736968 0.83370714
## inflation_rate -0.107873184 0.33169057 0.33800639 0.19134077
## unemp_rate 0.329112420 -0.79011859 -0.80507262 -0.73619228
## gdp_percapita -0.243208039 0.91078246 0.89625044 0.90826505
## itaee -0.381185204 0.90727261 0.90319846 0.67060541
## itaee_growth 1.000000000 -0.40732835 -0.41345524 -0.31938333
## pop_density -0.407328345 1.00000000 0.99171505 0.85825240
## job_density -0.413455239 0.99171505 1.00000000 0.84843442
## pop_minwage -0.319383334 0.85825240 0.84843442 1.00000000
## exchange_rate -0.138833715 0.75707909 0.73809406 0.71055357
## max_temperature -0.001742395 -0.03432007 -0.01652593 0.13570714
## exchange_rate max_temperature
## sales_unitboxes 0.1775434 0.571548328
## consumer_sentiment -0.2092493 -0.230283717
## CPI 0.6744406 -0.089880006
## inflation_rate 0.5380607 -0.560589549
## unemp_rate -0.7097686 0.028064292
## gdp_percapita 0.7536566 0.143291270
## itaee 0.7558331 -0.197692379
## itaee_growth -0.1388337 -0.001742395
## pop_density 0.7570791 -0.034320074
## job_density 0.7380941 -0.016525934
## pop_minwage 0.7105536 0.135707144
## exchange_rate 1.0000000 -0.015224701
## max_temperature -0.0152247 1.000000000
All plots were explained.
According to the exploratory analysis, the following hypotheses are proposed…
HYPOTHESIS 1: A strong and positive relationship is expected between the ITAEE and Coca-Cola sales, since the ITAEE represents the State Economic Activity Indicator, an increase in this indicator generally indicates economic growth in the region. It is expected that in times of economic growth, people will have more disposable income to spend on products like Coca-Cola, which could lead to an increase in sales. Therefore, it is hypothesized that higher ITAEE is positively associated with Coca-Cola sales.
HYPOTHESIS 2: A positive relationship is expected between the maximum temperature and sales of Coca-Cola cases, since the maximum temperature can influence people’s consumption preferences. In warmer climates, people are more likely to reach for refreshing drinks like Coca-Cola. Therefore, it is hypothesized that an increase in maximum temperature is positively related to Coca-Cola case sales, as warmer weather may increase demand for soft drinks.
HYPOTHESIS 3: A positive relationship is expected between Consumer Sentiment and Coca-Cola case sales, as an increase in Consumer Sentiment could indicate greater confidence and optimism in the economy. When consumers feel more financially secure, they are more likely to spend on consumer products, such as Coca-Cola. Therefore, it is hypothesized that higher Consumer Sentiment is positively related to an increase in Coca-Cola case sales.
The variables chosen in the exploratory analysis of the data were used to propose the following model….
# Model 1: linear regression of sales (in levels) on consumer sentiment, CPI,
# inflation rate, GDP per capita and population density.
modelo1 <- lm(
  sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate +
    gdp_percapita + pop_density,
  data = coca
)
# Coefficient table, residual summary, R-squared and overall F-test.
summary(modelo1)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate +
## gdp_percapita + pop_density, data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -732291 -369568 -13513 339961 813988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.591e+07 2.003e+07 -4.289 0.000103 ***
## consumer_sentiment 2.185e+04 2.906e+04 0.752 0.456335
## CPI -2.212e+05 6.767e+04 -3.269 0.002158 **
## inflation_rate -7.530e+05 1.973e+05 -3.817 0.000437 ***
## gdp_percapita -1.093e+03 7.612e+02 -1.437 0.158260
## pop_density 1.248e+06 2.738e+05 4.558 4.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 451100 on 42 degrees of freedom
## Multiple R-squared: 0.4928, Adjusted R-squared: 0.4324
## F-statistic: 8.16 on 5 and 42 DF, p-value: 1.888e-05
In this model, the estimated coefficients provide information about how each independent variable affects Coca-Cola case unit sales. For example, the coefficient for “CPI” is -2.212e+05, which means that for every unit that the Consumer Price Index increases, Coca-Cola sales are expected to decrease on average by about 221,200 units of boxes.
Similarly, the coefficient for “inflation_rate” is -7.530e+05, indicating that for every unit the inflation rate increases, sales are expected to decrease on average by about 753,000 case units. On the other hand, the coefficient for “pop_density” is 1.248e+06, which suggests that for every unit increase in population density, sales are expected to increase on average by about 1,248,000 case units. “CPI”, “inflation_rate” and “pop_density” are significant, suggesting that they have an impact on sales, while “consumer_sentiment” and “gdp_percapita” are not significant in this context. The model as a whole has an adjusted R-squared of 0.4324, which means that about 43% of the variation in sales is explained by these independent variables.
With the findings in the exploratory analysis of the data, some variables were transformed…
# Model 2: log-transformed sales regressed on consumer sentiment, unemployment
# rate, squared maximum temperature, log(ITAEE) and log(GDP per capita).
# I() protects the square so `^` is arithmetic rather than formula syntax.
modelo_2 <- lm(
  log(sales_unitboxes) ~ consumer_sentiment + unemp_rate +
    I(max_temperature^2) + log(itaee) + log(gdp_percapita),
  data = coca
)
# Coefficient table, residual summary, R-squared and overall F-test.
summary(modelo_2)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + unemp_rate +
## I(max_temperature^2) + log(itaee) + log(gdp_percapita), data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.09700 -0.03929 0.01097 0.03200 0.11440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.277e+01 6.501e+00 3.503 0.00111 **
## consumer_sentiment 7.905e-03 3.080e-03 2.566 0.01393 *
## unemp_rate 1.255e+00 2.486e+00 0.505 0.61620
## I(max_temperature^2) 4.570e-04 5.524e-05 8.273 2.33e-10 ***
## log(itaee) 1.651e+00 3.086e-01 5.350 3.40e-06 ***
## log(gdp_percapita) -1.671e+00 7.371e-01 -2.266 0.02864 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05512 on 42 degrees of freedom
## Multiple R-squared: 0.6865, Adjusted R-squared: 0.6492
## F-statistic: 18.39 on 5 and 42 DF, p-value: 1.221e-09
As for this model…
Consumer Sentiment: The coefficient for “consumer_sentiment” is 7.905e-03, which means that for every unit increase in the consumer sentiment index, an increase of 0.0079 in the log of Coca-Cola case unit sales. This suggests that greater positive sentiment among consumers is associated with a proportionally small increase in Coca-Cola sales.
Unemployment Rate: The coefficient for “unemp_rate” is 1.255. Because unemp_rate is stored as a proportion (e.g., 0.0523), a one-unit increase corresponds to 100 percentage points; a one-percentage-point increase (0.01) is therefore associated with an increase of approximately 0.0126 in the log of case unit sales (roughly 1.3%). However, since the p-value associated with this coefficient is high (0.61620), we cannot consider this effect as statistically significant in this model.
Max Temperature^2: The coefficient for “I(max_temperature^2)” is 4.570e-04, which implies that for every one unit increase in the square of the maximum temperature, an increase of 0.000457 in the logarithm is expected of Coca-Cola case unit sales. This suggests that an increase in maximum temperature has a moderate positive effect on sales.
Log(ITAEE): The coefficient for “log(itaee)” is 1.651, which means that for each unit increase in the logarithm of the ITAEE variable (Indicator of the State Economic Activity), an increase of 1.651 is expected in the logarithm of Coca-Cola case unit sales. Because both variables are in logs, this can be read as an elasticity: a 1% increase in ITAEE is associated with an increase of roughly 1.65% in sales. This indicates that an increase in economic activity is positively related to sales.
Log(GDP per Capita): The coefficient for “log(gdp_percapita)” is -1.671, which implies that for each unit increase in the logarithm of GDP per capita, a decrease of 1.671 is expected in the logarithm of Coca-Cola case unit sales (that is, a 1% increase in GDP per capita is associated with a decrease of roughly 1.67% in sales). This suggests that higher GDP per capita is associated with a decrease in Coca-Cola sales.
The model indicates that consumer sentiment, maximum temperature, ITAEE (indicator of state economic activity) and the logarithm of GDP per capita have a statistically significant effect on Coca-Cola sales at the 5% level (the latter with a negative sign, p = 0.029), while the unemployment rate is not significant (p = 0.616). Furthermore, the model has an adjusted R-squared value of 0.6492, suggesting that these variables explain approximately 64.92% of the variability in the logarithm of Coca-Cola sales.
# Multicollinearity check for modelo1: variance inflation factor (VIF) of each
# predictor. Large values flag predictors that are nearly linear combinations
# of the others.
vif(modelo1)
## consumer_sentiment CPI inflation_rate gdp_percapita
## 1.628471 27.057491 1.364196 8.451460
## pop_density
## 28.803547
The VIF values for CPI (27.06) and pop_density (28.80) are greater than 10, and the value for gdp_percapita (8.45) is also high; therefore multicollinearity problems were found, that is, there is a high correlation between these independent variables.
# Heteroscedasticity check for modelo1: Breusch-Pagan test via bptest().
# A small p-value is evidence against constant error variance.
bptest(modelo1)
##
## studentized Breusch-Pagan test
##
## data: modelo1
## BP = 16.579, df = 5, p-value = 0.005372
The p-value is equal to 0.005372, which means it is significantly low and less than a conventional significance level, such as 0.05. This suggests that the error variance is not constant in the model, indicating the presence of heteroscedasticity.
# Normality of residuals for modelo1.
# Extract the residuals and apply the Shapiro-Wilk test; a p-value above the
# chosen significance level means normality is not rejected.
residuos1 <- residuals(modelo1)
shapiro.test(residuos1)
##
## Shapiro-Wilk normality test
##
## data: residuos1
## W = 0.96589, p-value = 0.1743
The p-value is equal to 0.1743, which is greater than a conventional significance level, such as 0.05. This suggests that the residuals follow an approximately normal distribution.
# Residuals of modelo1 in observation order (index plot): useful for spotting
# patterns over time, but it does not show the shape of the distribution.
plot(residuos1)
# Normal Q-Q plot with reference line: the appropriate visual companion to the
# Shapiro-Wilk test; points close to the line indicate approximate normality.
qqnorm(residuos1)
qqline(residuos1)
# Multicollinearity check for modelo_2: variance inflation factor (VIF) of
# each model term.
vif(modelo_2)
## consumer_sentiment unemp_rate I(max_temperature^2)
## 1.225057 3.296322 1.278276
## log(itaee) log(gdp_percapita)
## 2.601104 3.729142
No multicollinearity problems were found: all VIF values are below 5, so there is no high correlation between the independent variables.
# Heteroscedasticity check for modelo_2: Breusch-Pagan test via bptest().
# A small p-value is evidence against constant error variance.
bptest(modelo_2)
##
## studentized Breusch-Pagan test
##
## data: modelo_2
## BP = 3.2856, df = 5, p-value = 0.656
Since the p-value is greater than a conventional significance level, such as 0.05, we do not have strong statistical evidence to say that heteroscedasticity (non-constant variability) exists in the model residuals.
# Normality of residuals for modelo_2.
# Extract the residuals and apply the Shapiro-Wilk test; a p-value above the
# chosen significance level means normality is not rejected.
residuos2 <- residuals(modelo_2)
shapiro.test(residuos2)
##
## Shapiro-Wilk normality test
##
## data: residuos2
## W = 0.96638, p-value = 0.1824
Since the p-value is greater than a commonly used significance level, such as 0.05, we do not have sufficient evidence to reject the null hypothesis that the residuals follow a normal distribution. The residuals follow a normal distribution as seen in the following graph.
# Residuals of modelo_2 in observation order (index plot): useful for spotting
# patterns over time, but it does not show the shape of the distribution.
plot(residuos2)
# Normal Q-Q plot with reference line: this is the graph that actually lets
# the reader judge normality; points close to the line indicate approximate
# normality.
qqnorm(residuos2)
qqline(residuos2)
##### Select the regression model that better fits the data (e.g., AIC and/or RMSE).
# Akaike information criterion of modelo1 (response: sales_unitboxes in
# levels); lower is better among models fitted to the same response.
AIC(modelo1)
## [1] 1393.674
AIC(modelo_2)
## [1] -134.4258
# modelo_2 is fitted to log(sales_unitboxes) while modelo1 is fitted to
# sales_unitboxes, so the two raw AIC values are computed on different
# response scales and cannot be compared directly. Adding the Jacobian term
# 2 * sum(log(y)) expresses modelo_2's AIC on the original sales scale, which
# makes it comparable with AIC(modelo1).
AIC(modelo_2) + 2 * sum(log(coca$sales_unitboxes))
Given the R-squared values, the statistical significance of the predictor variables, the AIC and the diagnostic tests performed, Model 2 was chosen.
For model 2…
Consumer Sentiment: The coefficient for “consumer_sentiment” is 7.905e-03, which means that for every unit increase in the consumer sentiment index, an increase of 0.0079 in the log of Coca-Cola case unit sales. This suggests that greater positive sentiment among consumers is associated with a proportionally small increase in Coca-Cola sales.
Unemployment Rate: The coefficient for “unemp_rate” is 1.255. Because unemp_rate is stored as a proportion (e.g., 0.0523), a one-unit increase corresponds to 100 percentage points; a one-percentage-point increase (0.01) is therefore associated with an increase of approximately 0.0126 in the log of case unit sales (roughly 1.3%). However, since the p-value associated with this coefficient is high (0.61620), we cannot consider this effect as statistically significant in this model.
Max Temperature^2: The coefficient for “I(max_temperature^2)” is 4.570e-04, which implies that for every one unit increase in the square of the maximum temperature, an increase of 0.000457 in the logarithm is expected of Coca-Cola case unit sales. This suggests that an increase in maximum temperature has a moderate positive effect on sales.
Log(ITAEE): The coefficient for “log(itaee)” is 1.651, which means that for each unit increase in the logarithm of the ITAEE variable (Indicator of the State Economic Activity), an increase of 1.651 is expected in the logarithm of Coca-Cola case unit sales. Because both variables are in logs, this can be read as an elasticity: a 1% increase in ITAEE is associated with an increase of roughly 1.65% in sales. This indicates that an increase in economic activity is positively related to sales.
Log(GDP per Capita): The coefficient for “log(gdp_percapita)” is -1.671, which implies that for each unit increase in the logarithm of GDP per capita, a decrease of 1.671 is expected in the logarithm of Coca-Cola case unit sales (that is, a 1% increase in GDP per capita is associated with a decrease of roughly 1.67% in sales). This suggests that higher GDP per capita is associated with a decrease in Coca-Cola sales.
The model indicates that consumer sentiment, maximum temperature, ITAEE (indicator of state economic activity) and the logarithm of GDP per capita have a statistically significant effect on Coca-Cola sales at the 5% level (the latter with a negative sign, p = 0.029), while the unemployment rate is not significant (p = 0.616). Furthermore, the model has an adjusted R-squared value of 0.6492, suggesting that these variables explain approximately 64.92% of the variability in the logarithm of Coca-Cola sales.
library(effects)
# Compute the effect of every term in modelo_2.
efectos_modelo2 <- allEffects(modelo_2)
# Shrink axis label and tick text for base graphics.
# NOTE(review): presumably intended to make the multi-panel effects plot
# readable; confirm that these base-graphics settings actually affect the
# plot produced below, and that 0.1 is not too small to read.
par(cex.lab = 0.1, cex.axis = 0.1)
# Graph the effects
plot(efectos_modelo2)
The goal of the anterior effects plots is to visualize how the dependent
variable changes in response to changes in the independent variables
while keeping the other variables constant.
The graphs show how, for example, temperature, consumer sentiment and ITAEE have a marked positive effect on the dependent variable. That is, they show a strong positive association with the dependent variable (a regression on observational data does not by itself establish causality). This is consistent with the hypotheses raised at the beginning of this report. In contrast, GDP per capita appears to have a negative relationship with the dependent variable. Lastly, the unemployment rate appears to have a small positive relationship with the dependent variable (not statistically significant), which is interesting because it does not make much sense in a real context and contradicts the direction of other relationships such as consumer sentiment or ITAEE.
In this study, it was hypothesized that Coca-Cola sales may be statistically significantly related to three key predictor variables: temperature, the ITAEE index (State Economic Activity Indicator), and consumer sentiment. Through the analysis of a linear regression model, we have statistically verified these hypotheses.
Our findings indicate that all three predictor variables (temperature, ITAEE and consumer sentiment) have a statistically significant influence on Coca-Cola sales. In particular, greater economic activity as measured by the ITAEE index is positively associated with Coca-Cola sales, supporting our first hypothesis. Furthermore, an increase in temperature is positively related to an increase in Coca-Cola sales, supporting our second hypothesis, and more positive sentiment among consumers is positively associated with sales, supporting our third hypothesis.
These results underscore the importance of considering factors such as temperature, economic status, and consumer sentiment when analyzing and forecasting Coca-Cola sales. Consequently, these findings can be valuable for strategic planning and decision making in the beverage industry, providing solid statistical evidence of the influence of these variables on Coca-Cola sales.
COCACOLA. (2020) Annual Report. (2020). Coca-Cola.com. https://coca-cola.com