library(foreign)
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats) # to work with categorical variables
library(ggplot2) # data visualization
library(readr) # read specific csv files
library(janitor) # data exploration and cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(Hmisc) # several useful functions for data analysis
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych) # functions for multivariate analysis
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # summaries and visualization of missing values NA's
library(corrplot) # correlation plots
## corrplot 0.92 loaded
library(jtools) # presentation of regression analysis
##
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
##
## %nin%
library(lmtest) # diagnostic checks - linear regression analysis
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # diagnostic checks - linear regression analysis
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(olsrr) # diagnostic checks - linear regression analysis
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(naniar) # identifying missing values
library(stargazer) # create publication quality tables
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effects) # displays for linear and other regression models
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(tidyverse) # collection of R packages designed for data science
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # Classification and Regression Training
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet) # methods for prediction and plotting, and functions for cross-validation
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
library("readxl") # Read Excel
library(caret) # Imputation
library(missForest) # Imputation
library(mice) # Imputation
## Registered S3 methods overwritten by 'broom':
## method from
## tidy.glht jtools
## tidy.summary.glht jtools
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(Hmisc) # Imputation
coca <- read_excel("C:\\Users\\maria\\OneDrive\\Desktop\\AD 2023\\ECONOMETRICS\\coca_cola_sales.xlsx", sheet = 'cocacolasalesboxes')
coca
## # A tibble: 48 × 15
## tperiod sales_unitboxes consumer_sentiment CPI inflation_rate
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2021-01-15 00:00:00 5516689. 38.1 87.1 -0.09
## 2 2021-02-15 00:00:00 5387496. 37.5 87.3 0.19
## 3 2021-03-15 00:00:00 5886747. 38.5 87.6 0.41
## 4 2021-04-15 00:00:00 6389182. 37.8 87.4 -0.26
## 5 2021-05-15 00:00:00 6448275. 38.0 87.0 -0.5
## 6 2021-06-15 00:00:00 6697947. 39.1 87.1 0.17
## 7 2021-07-15 00:00:00 6420091. 38.1 87.2 0.15
## 8 2021-08-15 00:00:00 6474440. 37.4 87.4 0.21
## 9 2021-09-15 00:00:00 6340781. 37.4 87.8 0.37
## 10 2021-10-15 00:00:00 6539561. 37.8 88.2 0.51
## # ℹ 38 more rows
## # ℹ 10 more variables: unemp_rate <dbl>, gdp_percapita <dbl>, itaee <dbl>,
## # itaee_growth <dbl>, pop_density <dbl>, job_density <dbl>,
## # pop_minwage <dbl>, exchange_rate <dbl>, max_temperature <dbl>,
## # holiday_month <dbl>
# setting time series format
coca$tperiod <- as.Date(coca$tperiod,"%m/%d/%Y")
## Warning in as.POSIXlt.POSIXct(x, tz = tz): unknown timezone '%m/%d/%Y'
head(coca)
## # A tibble: 6 × 15
## tperiod sales_unitboxes consumer_sentiment CPI inflation_rate unemp_rate
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-01-15 5516689. 38.1 87.1 -0.09 0.0523
## 2 2021-02-15 5387496. 37.5 87.3 0.19 0.0531
## 3 2021-03-15 5886747. 38.5 87.6 0.41 0.0461
## 4 2021-04-15 6389182. 37.8 87.4 -0.26 0.0510
## 5 2021-05-15 6448275. 38.0 87.0 -0.5 0.0552
## 6 2021-06-15 6697947. 39.1 87.1 0.17 0.0507
## # ℹ 9 more variables: gdp_percapita <dbl>, itaee <dbl>, itaee_growth <dbl>,
## # pop_density <dbl>, job_density <dbl>, pop_minwage <dbl>,
## # exchange_rate <dbl>, max_temperature <dbl>, holiday_month <dbl>
coca
## # A tibble: 48 × 15
## tperiod sales_unitboxes consumer_sentiment CPI inflation_rate unemp_rate
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-01-15 5516689. 38.1 87.1 -0.09 0.0523
## 2 2021-02-15 5387496. 37.5 87.3 0.19 0.0531
## 3 2021-03-15 5886747. 38.5 87.6 0.41 0.0461
## 4 2021-04-15 6389182. 37.8 87.4 -0.26 0.0510
## 5 2021-05-15 6448275. 38.0 87.0 -0.5 0.0552
## 6 2021-06-15 6697947. 39.1 87.1 0.17 0.0507
## 7 2021-07-15 6420091. 38.1 87.2 0.15 0.0542
## 8 2021-08-15 6474440. 37.4 87.4 0.21 0.0547
## 9 2021-09-15 6340781. 37.4 87.8 0.37 0.0538
## 10 2021-10-15 6539561. 37.8 88.2 0.51 0.0539
## # ℹ 38 more rows
## # ℹ 9 more variables: gdp_percapita <dbl>, itaee <dbl>, itaee_growth <dbl>,
## # pop_density <dbl>, job_density <dbl>, pop_minwage <dbl>,
## # exchange_rate <dbl>, max_temperature <dbl>, holiday_month <dbl>
# Briefly describe the dataset. For example, what is the structure of the dataset? How many observations include the dataset? Is there any presence of missing values in the dataset?
# Display dataset
coca
## # A tibble: 48 × 15
## tperiod sales_unitboxes consumer_sentiment CPI inflation_rate unemp_rate
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-01-15 5516689. 38.1 87.1 -0.09 0.0523
## 2 2021-02-15 5387496. 37.5 87.3 0.19 0.0531
## 3 2021-03-15 5886747. 38.5 87.6 0.41 0.0461
## 4 2021-04-15 6389182. 37.8 87.4 -0.26 0.0510
## 5 2021-05-15 6448275. 38.0 87.0 -0.5 0.0552
## 6 2021-06-15 6697947. 39.1 87.1 0.17 0.0507
## 7 2021-07-15 6420091. 38.1 87.2 0.15 0.0542
## 8 2021-08-15 6474440. 37.4 87.4 0.21 0.0547
## 9 2021-09-15 6340781. 37.4 87.8 0.37 0.0538
## 10 2021-10-15 6539561. 37.8 88.2 0.51 0.0539
## # ℹ 38 more rows
## # ℹ 9 more variables: gdp_percapita <dbl>, itaee <dbl>, itaee_growth <dbl>,
## # pop_density <dbl>, job_density <dbl>, pop_minwage <dbl>,
## # exchange_rate <dbl>, max_temperature <dbl>, holiday_month <dbl>
#Identify missing values.
sum(is.na(coca))
## [1] 0
#No missing values
#48 rows and 15 columns, al with data type of double
glimpse(coca)
## Rows: 48
## Columns: 15
## $ tperiod <date> 2021-01-15, 2021-02-15, 2021-03-15, 2021-04-15, 20…
## $ sales_unitboxes <dbl> 5516689, 5387496, 5886747, 6389182, 6448275, 669794…
## $ consumer_sentiment <dbl> 38.06250, 37.49114, 38.50522, 37.84286, 38.03169, 3…
## $ CPI <dbl> 87.11010, 87.27538, 87.63072, 87.40384, 86.96737, 8…
## $ inflation_rate <dbl> -0.09, 0.19, 0.41, -0.26, -0.50, 0.17, 0.15, 0.21, …
## $ unemp_rate <dbl> 0.05230256, 0.05311320, 0.04608844, 0.05102038, 0.0…
## $ gdp_percapita <dbl> 11659.56, 11659.55, 11659.55, 11625.75, 11625.74, 1…
## $ itaee <dbl> 103.7654, 103.7654, 103.7654, 107.7518, 107.7518, 1…
## $ itaee_growth <dbl> 0.049716574, 0.049716574, 0.049716574, 0.031838981,…
## $ pop_density <dbl> 98.54185, 98.54186, 98.54187, 98.82843, 98.82844, 9…
## $ job_density <dbl> 18.26048, 18.46329, 18.64164, 18.67876, 18.67539, 1…
## $ pop_minwage <dbl> 9.657861, 9.657861, 9.657861, 9.594919, 9.594919, 9…
## $ exchange_rate <dbl> 14.69259, 14.92134, 15.22834, 15.22618, 15.26447, 1…
## $ max_temperature <dbl> 28, 31, 29, 32, 34, 32, 29, 29, 29, 29, 29, 26, 28,…
## $ holiday_month <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
#Descriptive statistics (mean, median, standard deviation, minimum, maximum)
summary(coca)
## tperiod sales_unitboxes consumer_sentiment CPI
## Min. :2021-01-15 Min. :5301755 Min. :28.67 Min. : 86.97
## 1st Qu.:2021-04-08 1st Qu.:6171767 1st Qu.:35.64 1st Qu.: 89.18
## Median :2021-07-01 Median :6461357 Median :36.76 Median : 92.82
## Mean :2021-07-02 Mean :6473691 Mean :37.15 Mean : 93.40
## 3rd Qu.:2021-09-24 3rd Qu.:6819782 3rd Qu.:38.14 3rd Qu.: 98.40
## Max. :2021-12-18 Max. :7963063 Max. :44.87 Max. :103.02
## inflation_rate unemp_rate gdp_percapita itaee
## Min. :-0.5000 Min. :0.03466 Min. :11559 Min. :103.8
## 1st Qu.: 0.1650 1st Qu.:0.04010 1st Qu.:11830 1st Qu.:111.5
## Median : 0.3850 Median :0.04369 Median :12014 Median :113.5
## Mean : 0.3485 Mean :0.04442 Mean :11979 Mean :113.9
## 3rd Qu.: 0.5575 3rd Qu.:0.04897 3rd Qu.:12162 3rd Qu.:117.1
## Max. : 1.7000 Max. :0.05517 Max. :12329 Max. :122.5
## itaee_growth pop_density job_density pop_minwage
## Min. :0.005571 Min. : 98.54 Min. :18.26 Min. : 9.398
## 1st Qu.:0.022376 1st Qu.: 99.61 1st Qu.:19.28 1st Qu.:10.794
## Median :0.029977 Median :100.67 Median :20.39 Median :11.139
## Mean :0.031736 Mean :100.65 Mean :20.38 Mean :11.116
## 3rd Qu.:0.043038 3rd Qu.:101.69 3rd Qu.:21.60 3rd Qu.:11.413
## Max. :0.056536 Max. :102.69 Max. :22.36 Max. :13.026
## exchange_rate max_temperature holiday_month
## Min. :14.69 Min. :26.00 Min. :0.00
## 1st Qu.:17.38 1st Qu.:29.00 1st Qu.:0.00
## Median :18.62 Median :30.00 Median :0.00
## Mean :18.18 Mean :30.50 Mean :0.25
## 3rd Qu.:19.06 3rd Qu.:32.25 3rd Qu.:0.25
## Max. :21.39 Max. :37.00 Max. :1.00
#Graph 1
# Histogram of dependent variable
hist(coca$sales_unitboxes, main = "Histogram of Sales")
hist(log(coca$sales_unitboxes) , main = "Histogram of Sales - Log")
#Graph 2
#Bar graph mapping Foreign Investment and GDP through the last years
graph<-coca %>% group_by(holiday_month)
graph
## # A tibble: 48 × 15
## # Groups: holiday_month [2]
## tperiod sales_unitboxes consumer_sentiment CPI inflation_rate unemp_rate
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-01-15 5516689. 38.1 87.1 -0.09 0.0523
## 2 2021-02-15 5387496. 37.5 87.3 0.19 0.0531
## 3 2021-03-15 5886747. 38.5 87.6 0.41 0.0461
## 4 2021-04-15 6389182. 37.8 87.4 -0.26 0.0510
## 5 2021-05-15 6448275. 38.0 87.0 -0.5 0.0552
## 6 2021-06-15 6697947. 39.1 87.1 0.17 0.0507
## 7 2021-07-15 6420091. 38.1 87.2 0.15 0.0542
## 8 2021-08-15 6474440. 37.4 87.4 0.21 0.0547
## 9 2021-09-15 6340781. 37.4 87.8 0.37 0.0538
## 10 2021-10-15 6539561. 37.8 88.2 0.51 0.0539
## # ℹ 38 more rows
## # ℹ 9 more variables: gdp_percapita <dbl>, itaee <dbl>, itaee_growth <dbl>,
## # pop_density <dbl>, job_density <dbl>, pop_minwage <dbl>,
## # exchange_rate <dbl>, max_temperature <dbl>, holiday_month <dbl>
ggplot(data=graph,aes(x=holiday_month,y=sales_unitboxes)) +
geom_bar(stat="identity") + coord_flip() +
labs(
y="Foreign Investment",
x="Year",
fill= 'GPD',
title="Foreign Investment and GDP")
#Graph 3
# Correlation matrix
library(ggplot2)
library(ggcorrplot)
corre<- round(cor(subset(coca,select = -c(tperiod))), 1)
corre
## sales_unitboxes consumer_sentiment CPI inflation_rate
## sales_unitboxes 1.0 0.2 0.2 -0.3
## consumer_sentiment 0.2 1.0 0.2 -0.1
## CPI 0.2 0.2 1.0 0.3
## inflation_rate -0.3 -0.1 0.3 1.0
## unemp_rate -0.1 0.1 -0.8 -0.4
## gdp_percapita 0.2 0.0 0.9 0.2
## itaee 0.3 0.2 0.9 0.4
## itaee_growth -0.2 -0.2 -0.4 -0.1
## pop_density 0.3 0.2 1.0 0.3
## job_density 0.3 0.1 1.0 0.3
## pop_minwage 0.3 0.0 0.8 0.2
## exchange_rate 0.2 -0.2 0.7 0.5
## max_temperature 0.6 -0.2 -0.1 -0.6
## holiday_month 0.0 0.1 0.1 0.0
## unemp_rate gdp_percapita itaee itaee_growth pop_density
## sales_unitboxes -0.1 0.2 0.3 -0.2 0.3
## consumer_sentiment 0.1 0.0 0.2 -0.2 0.2
## CPI -0.8 0.9 0.9 -0.4 1.0
## inflation_rate -0.4 0.2 0.4 -0.1 0.3
## unemp_rate 1.0 -0.8 -0.7 0.3 -0.8
## gdp_percapita -0.8 1.0 0.7 -0.2 0.9
## itaee -0.7 0.7 1.0 -0.4 0.9
## itaee_growth 0.3 -0.2 -0.4 1.0 -0.4
## pop_density -0.8 0.9 0.9 -0.4 1.0
## job_density -0.8 0.9 0.9 -0.4 1.0
## pop_minwage -0.7 0.9 0.7 -0.3 0.9
## exchange_rate -0.7 0.8 0.8 -0.1 0.8
## max_temperature 0.0 0.1 -0.2 0.0 0.0
## holiday_month 0.0 0.0 0.1 -0.1 0.0
## job_density pop_minwage exchange_rate max_temperature
## sales_unitboxes 0.3 0.3 0.2 0.6
## consumer_sentiment 0.1 0.0 -0.2 -0.2
## CPI 1.0 0.8 0.7 -0.1
## inflation_rate 0.3 0.2 0.5 -0.6
## unemp_rate -0.8 -0.7 -0.7 0.0
## gdp_percapita 0.9 0.9 0.8 0.1
## itaee 0.9 0.7 0.8 -0.2
## itaee_growth -0.4 -0.3 -0.1 0.0
## pop_density 1.0 0.9 0.8 0.0
## job_density 1.0 0.8 0.7 0.0
## pop_minwage 0.8 1.0 0.7 0.1
## exchange_rate 0.7 0.7 1.0 0.0
## max_temperature 0.0 0.1 0.0 1.0
## holiday_month 0.1 0.0 0.1 -0.2
## holiday_month
## sales_unitboxes 0.0
## consumer_sentiment 0.1
## CPI 0.1
## inflation_rate 0.0
## unemp_rate 0.0
## gdp_percapita 0.0
## itaee 0.1
## itaee_growth -0.1
## pop_density 0.0
## job_density 0.1
## pop_minwage 0.0
## exchange_rate 0.1
## max_temperature -0.2
## holiday_month 1.0
ggcorrplot(corre,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method="circle",
colors = c("pink", "white", "lightgreen"),
title="Correlogram",
ggtheme=theme_bw)
# In this matrix, we can appreciate the correlation among every variable. We will pay special attention to the ones related to our dependent variable. We can base our hypothesis on this graph and later on verify the statistical significance of each relation.
# We can see how umeployment rate has almost no relationship whatsoever the sales of unitboxes, while in the other hand Max Temperature seems to be the most correlated.
# Emphasis in noting that correlation does not mean causality. We are just cheking for a relationship between variables.
#Graph 4 Scatterplot
theme_set(theme_bw()) # pre-set the bw theme.
g <- ggplot(coca, aes(max_temperature, sales_unitboxes))
g + geom_point() +
geom_smooth(method="lm", se=F) +
labs(
y="Sales Unitboxes",
x="Max. Temperature",
title="Exportations vs Foreign Investment",
)
## `geom_smooth()` using formula = 'y ~ x'
# Graph 5 Counts Plot
theme_set(theme_bw()) # pre-set the bw theme.
g <- ggplot(coca, aes(CPI, sales_unitboxes))
g + geom_count(col="tomato3", show.legend=F) +
labs(
y="Sales Unitboxes",
x="Consumer Price Index",
title="Theft Danger & Foreign Investments")
# At first glance, this graph shows no correlation at all, it shows a heteroscedasticity behavior. We can assume these 2 varoables, sales and CPI do not intercorrelate.
Hypothesis 1: The sales of Coca Cola products increases when the temperature rises, meaning that they have a positive correlation.
Hypothesis 2: Then the inflation increases in the country, Coca Cola’s sales decreases, meaning that they have a negative correlation.
Hypothesis 3: Unemployment rate does not affect Coca Cola´s sales, meaning they do not have a significant correlation.
# Estimate 2 different multiple linear regression model specifications.
# Linear Regression
linear_regression <- lm(sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density + job_density + pop_minwage + exchange_rate + max_temperature, data=coca)
summary(linear_regression)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate +
## unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density +
## job_density + pop_minwage + exchange_rate + max_temperature,
## data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -557280 -245811 8601 197757 717948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -70533682 46993076 -1.501 0.142340
## consumer_sentiment 47082 28366 1.660 0.105887
## CPI -100726 97912 -1.029 0.310660
## inflation_rate -112101 265941 -0.422 0.675950
## unemp_rate -4743837 19670332 -0.241 0.810833
## gdp_percapita -2494 1484 -1.681 0.101676
## itaee 21189 85276 0.248 0.805217
## itaee_growth 1969921 5157483 0.382 0.704805
## pop_density 1118240 736253 1.519 0.137789
## job_density -347723 482784 -0.720 0.476157
## pop_minwage 161450 139429 1.158 0.254729
## exchange_rate 1878 111357 0.017 0.986640
## max_temperature 164835 38165 4.319 0.000123 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 370600 on 35 degrees of freedom
## Multiple R-squared: 0.7147, Adjusted R-squared: 0.6169
## F-statistic: 7.307 on 12 and 35 DF, p-value: 1.821e-06
RMSE(linear_regression$fitted.values, coca$sales_unitboxes)
## [1] 316438.2
#Linear regression with natural logarithm applied to the variables
linear_log<-lm(log(sales_unitboxes) ~ consumer_sentiment + CPI + inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density + job_density + pop_minwage + exchange_rate + max_temperature, data=coca)
summary(linear_log)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI +
## inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth +
## pop_density + job_density + pop_minwage + exchange_rate +
## max_temperature, data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.096099 -0.033915 0.001249 0.029926 0.106348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3512266 7.3811850 0.590 0.559311
## consumer_sentiment 0.0072486 0.0044554 1.627 0.112722
## CPI -0.0155279 0.0153791 -1.010 0.319581
## inflation_rate -0.0157285 0.0417712 -0.377 0.708788
## unemp_rate -0.8111869 3.0896118 -0.263 0.794434
## gdp_percapita -0.0003609 0.0002331 -1.548 0.130532
## itaee 0.0056921 0.0133942 0.425 0.673465
## itaee_growth 0.1924532 0.8100839 0.238 0.813598
## pop_density 0.1619959 0.1156430 1.401 0.170070
## job_density -0.0530332 0.0758307 -0.699 0.488948
## pop_minwage 0.0222791 0.0219000 1.017 0.315987
## exchange_rate -0.0016231 0.0174908 -0.093 0.926595
## max_temperature 0.0254728 0.0059945 4.249 0.000151 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05821 on 35 degrees of freedom
## Multiple R-squared: 0.7087, Adjusted R-squared: 0.6088
## F-statistic: 7.094 on 12 and 35 DF, p-value: 2.531e-06
RMSE(linear_log$fitted.values, coca$sales_unitboxes)
## [1] 6500729
# Polynomial Regression Model
polynomial_regression <- lm(log(sales_unitboxes) ~ consumer_sentiment + CPI + inflation_rate + unemp_rate + gdp_percapita + itaee + I(itaee^2)+ itaee_growth + pop_density + job_density + pop_minwage + exchange_rate + max_temperature, data=coca)
summary(polynomial_regression)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI +
## inflation_rate + unemp_rate + gdp_percapita + itaee + I(itaee^2) +
## itaee_growth + pop_density + job_density + pop_minwage +
## exchange_rate + max_temperature, data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.104158 -0.037441 0.001734 0.024955 0.089930
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.0186854 8.4798501 -0.592 0.5579
## consumer_sentiment 0.0125035 0.0050115 2.495 0.0176 *
## CPI -0.0026777 0.0160811 -0.167 0.8687
## inflation_rate -0.0047234 0.0404442 -0.117 0.9077
## unemp_rate -1.4422006 2.9804784 -0.484 0.6316
## gdp_percapita -0.0003123 0.0002249 -1.389 0.1739
## itaee 0.2471156 0.1209049 2.044 0.0488 *
## I(itaee^2) -0.0010615 0.0005286 -2.008 0.0526 .
## itaee_growth 0.6806546 0.8142529 0.836 0.4090
## pop_density 0.0989287 0.1152961 0.858 0.3969
## job_density -0.0423211 0.0729398 -0.580 0.5656
## pop_minwage 0.0092791 0.0219835 0.422 0.6756
## exchange_rate -0.0033056 0.0167998 -0.197 0.8452
## max_temperature 0.0255336 0.0057506 4.440 9.02e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05584 on 34 degrees of freedom
## Multiple R-squared: 0.7396, Adjusted R-squared: 0.64
## F-statistic: 7.426 on 13 and 34 DF, p-value: 1.299e-06
RMSE(polynomial_regression$fitted.values, coca$sales_unitboxes)
## [1] 6500729
# Evaluate each regression model using model diagnostics (e.g., multicollinearity and heteroscedasticity).
##HETEROSCEDASTICITY
# Using the BP Test, we can check out for heteroscedasticity (residuals with a NOT constant variance/unequal scatter) by looking at the P-VALUE it gave us.
#H0.Homoscedasticityt is present
#HA.Heteroscedasticty is present
# Its very important to mention that our models should have HOMOSCEDASTICITY for a model way more precise.
#If the P- Value is less than 5% or 0.05, we reject the null hypothesis which means that heteroscedasticity is present in the model
#Model 1 Evaluation
summary(linear_regression)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate +
## unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density +
## job_density + pop_minwage + exchange_rate + max_temperature,
## data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -557280 -245811 8601 197757 717948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -70533682 46993076 -1.501 0.142340
## consumer_sentiment 47082 28366 1.660 0.105887
## CPI -100726 97912 -1.029 0.310660
## inflation_rate -112101 265941 -0.422 0.675950
## unemp_rate -4743837 19670332 -0.241 0.810833
## gdp_percapita -2494 1484 -1.681 0.101676
## itaee 21189 85276 0.248 0.805217
## itaee_growth 1969921 5157483 0.382 0.704805
## pop_density 1118240 736253 1.519 0.137789
## job_density -347723 482784 -0.720 0.476157
## pop_minwage 161450 139429 1.158 0.254729
## exchange_rate 1878 111357 0.017 0.986640
## max_temperature 164835 38165 4.319 0.000123 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 370600 on 35 degrees of freedom
## Multiple R-squared: 0.7147, Adjusted R-squared: 0.6169
## F-statistic: 7.307 on 12 and 35 DF, p-value: 1.821e-06
RMSE(linear_regression$fitted.values, coca$sales_unitboxes)
## [1] 316438.2
# This model has an adjusted r2 of 0.6169 and a RMSE of 316,438.2
vif(linear_regression)
## consumer_sentiment CPI inflation_rate unemp_rate
## 2.298663 83.943670 3.674213 4.579567
## gdp_percapita itaee itaee_growth pop_density
## 47.586831 56.231040 1.946046 308.686168
## job_density pop_minwage exchange_rate max_temperature
## 124.998391 6.706682 10.947585 3.457748
#Shows a model with only 5 variables with multicolinearity.
bptest(linear_regression)
##
## studentized Breusch-Pagan test
##
## data: linear_regression
## BP = 11.905, df = 12, p-value = 0.4534
# p-value of 0.4534
AIC(linear_regression)
## [1] 1380.047
# 1380.047
histogram(polynomial_regression$residuals)
#Model 2 Evaluation
summary(linear_log)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI +
## inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth +
## pop_density + job_density + pop_minwage + exchange_rate +
## max_temperature, data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.096099 -0.033915 0.001249 0.029926 0.106348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3512266 7.3811850 0.590 0.559311
## consumer_sentiment 0.0072486 0.0044554 1.627 0.112722
## CPI -0.0155279 0.0153791 -1.010 0.319581
## inflation_rate -0.0157285 0.0417712 -0.377 0.708788
## unemp_rate -0.8111869 3.0896118 -0.263 0.794434
## gdp_percapita -0.0003609 0.0002331 -1.548 0.130532
## itaee 0.0056921 0.0133942 0.425 0.673465
## itaee_growth 0.1924532 0.8100839 0.238 0.813598
## pop_density 0.1619959 0.1156430 1.401 0.170070
## job_density -0.0530332 0.0758307 -0.699 0.488948
## pop_minwage 0.0222791 0.0219000 1.017 0.315987
## exchange_rate -0.0016231 0.0174908 -0.093 0.926595
## max_temperature 0.0254728 0.0059945 4.249 0.000151 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05821 on 35 degrees of freedom
## Multiple R-squared: 0.7087, Adjusted R-squared: 0.6088
## F-statistic: 7.094 on 12 and 35 DF, p-value: 2.531e-06
RMSE(linear_log$fitted.values, coca$sales_unitboxes)
## [1] 6500729
# This model has an adjusted r2 of 0.6088 and a RMSE of 6,500,729
vif(linear_log)
## consumer_sentiment CPI inflation_rate unemp_rate
## 2.298663 83.943670 3.674213 4.579567
## gdp_percapita itaee itaee_growth pop_density
## 47.586831 56.231040 1.946046 308.686168
## job_density pop_minwage exchange_rate max_temperature
## 124.998391 6.706682 10.947585 3.457748
#Shows a model with only 5 variables with multicolinearity.
bptest(linear_log)
##
## studentized Breusch-Pagan test
##
## data: linear_log
## BP = 12.013, df = 12, p-value = 0.4446
#p-value of 0.4446
AIC(linear_log)
## [1] -123.9445
# Aic of -123.9445
histogram(linear_log$residuals)
#Model 3 Evaluation
summary(polynomial_regression)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI +
## inflation_rate + unemp_rate + gdp_percapita + itaee + I(itaee^2) +
## itaee_growth + pop_density + job_density + pop_minwage +
## exchange_rate + max_temperature, data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.104158 -0.037441 0.001734 0.024955 0.089930
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.0186854 8.4798501 -0.592 0.5579
## consumer_sentiment 0.0125035 0.0050115 2.495 0.0176 *
## CPI -0.0026777 0.0160811 -0.167 0.8687
## inflation_rate -0.0047234 0.0404442 -0.117 0.9077
## unemp_rate -1.4422006 2.9804784 -0.484 0.6316
## gdp_percapita -0.0003123 0.0002249 -1.389 0.1739
## itaee 0.2471156 0.1209049 2.044 0.0488 *
## I(itaee^2) -0.0010615 0.0005286 -2.008 0.0526 .
## itaee_growth 0.6806546 0.8142529 0.836 0.4090
## pop_density 0.0989287 0.1152961 0.858 0.3969
## job_density -0.0423211 0.0729398 -0.580 0.5656
## pop_minwage 0.0092791 0.0219835 0.422 0.6756
## exchange_rate -0.0033056 0.0167998 -0.197 0.8452
## max_temperature 0.0255336 0.0057506 4.440 9.02e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05584 on 34 degrees of freedom
## Multiple R-squared: 0.7396, Adjusted R-squared: 0.64
## F-statistic: 7.426 on 13 and 34 DF, p-value: 1.299e-06
RMSE(polynomial_regression$fitted.values, coca$sales_unitboxes)
## [1] 6500729
# This model has an adjusted r2 of 0.64 and a RMSE of 6,500,729
vif(polynomial_regression)
## consumer_sentiment CPI inflation_rate unemp_rate
## 3.160313 99.735516 3.742934 4.631040
## gdp_percapita itaee I(itaee^2) itaee_growth
## 48.143239 4978.717962 4892.639049 2.136495
## pop_density job_density pop_minwage exchange_rate
## 333.424568 125.670507 7.343468 10.974883
## max_temperature
## 3.457844
#Shows a model with 6 variables with elevated coefficients of multicolinearity.
bptest(polynomial_regression)
##
## studentized Breusch-Pagan test
##
## data: polynomial_regression
## BP = 16.409, df = 13, p-value = 0.2277
#p-value of 0.2277
AIC(polynomial_regression)
## [1] -127.3247
# Aic of -127.3247
histogram(polynomial_regression$residuals)
#Looking at every model's BP Test we can see how:
# - Linear regression: P-Value = 0.4534 H0 - Homoscedasticityt is present
# - Linear regression with natural logarithm: P-Value =0.4446 HO - Homoscedasticityt is present
# - Polynomial regression: P-Value = 0.2277 H0 - Homoscedasticityt is present
##Next, we are going to check for Multicolineality.
##MULTICOLINEALITY
# We identify this phenomenon by checking the VIF function.
# In order to verify this, the values of the variables of our models should be less than 10 to the extent possible.
# After verifying all VIF tests, we can conclude that our model with the most Multicolineality is clearly the polynomial. Since the other two are pretty similar, we will decide our model based on R2, RMSE and AIC.
#Adjusted R-squared assesses the proportion of variation explained by regression models while considering model complexity.
#RMSE quantifies prediction errors, helping evaluate how closely model predictions align with observed values.
#AIC helps compare models by considering both fit and complexity, promoting the selection of simpler models with good fit.
# RMSE - Root Mean Square Error: The larger the difference indicates a larger gap between the predicted and observed values,
# which means poor regression model fit. In the same way, the smaller RMSE that indicates the better the model.
# model 1 with Adjusted R-squared of 0.6169, rmse of 316438.2 and a aic of 1380.047
# model 2 with Adjusted R-squared of 0.6088, rmse of 6500729 and a aic of -123.9445
#After analyzing each test, we coclude that the best model is model 1. This is due to the fact that it has lower RMSE an a bigger R2. Even tough the 2nd model was way lower AIC, model 1 still has a reasonable AIC.
#We will eliminate the variables that does not have a significant impact on the model using the correlation chart
corre<- round(cor(subset(coca,select = -c(tperiod))), 1)
corre
## sales_unitboxes consumer_sentiment CPI inflation_rate
## sales_unitboxes 1.0 0.2 0.2 -0.3
## consumer_sentiment 0.2 1.0 0.2 -0.1
## CPI 0.2 0.2 1.0 0.3
## inflation_rate -0.3 -0.1 0.3 1.0
## unemp_rate -0.1 0.1 -0.8 -0.4
## gdp_percapita 0.2 0.0 0.9 0.2
## itaee 0.3 0.2 0.9 0.4
## itaee_growth -0.2 -0.2 -0.4 -0.1
## pop_density 0.3 0.2 1.0 0.3
## job_density 0.3 0.1 1.0 0.3
## pop_minwage 0.3 0.0 0.8 0.2
## exchange_rate 0.2 -0.2 0.7 0.5
## max_temperature 0.6 -0.2 -0.1 -0.6
## holiday_month 0.0 0.1 0.1 0.0
## unemp_rate gdp_percapita itaee itaee_growth pop_density
## sales_unitboxes -0.1 0.2 0.3 -0.2 0.3
## consumer_sentiment 0.1 0.0 0.2 -0.2 0.2
## CPI -0.8 0.9 0.9 -0.4 1.0
## inflation_rate -0.4 0.2 0.4 -0.1 0.3
## unemp_rate 1.0 -0.8 -0.7 0.3 -0.8
## gdp_percapita -0.8 1.0 0.7 -0.2 0.9
## itaee -0.7 0.7 1.0 -0.4 0.9
## itaee_growth 0.3 -0.2 -0.4 1.0 -0.4
## pop_density -0.8 0.9 0.9 -0.4 1.0
## job_density -0.8 0.9 0.9 -0.4 1.0
## pop_minwage -0.7 0.9 0.7 -0.3 0.9
## exchange_rate -0.7 0.8 0.8 -0.1 0.8
## max_temperature 0.0 0.1 -0.2 0.0 0.0
## holiday_month 0.0 0.0 0.1 -0.1 0.0
## job_density pop_minwage exchange_rate max_temperature
## sales_unitboxes 0.3 0.3 0.2 0.6
## consumer_sentiment 0.1 0.0 -0.2 -0.2
## CPI 1.0 0.8 0.7 -0.1
## inflation_rate 0.3 0.2 0.5 -0.6
## unemp_rate -0.8 -0.7 -0.7 0.0
## gdp_percapita 0.9 0.9 0.8 0.1
## itaee 0.9 0.7 0.8 -0.2
## itaee_growth -0.4 -0.3 -0.1 0.0
## pop_density 1.0 0.9 0.8 0.0
## job_density 1.0 0.8 0.7 0.0
## pop_minwage 0.8 1.0 0.7 0.1
## exchange_rate 0.7 0.7 1.0 0.0
## max_temperature 0.0 0.1 0.0 1.0
## holiday_month 0.1 0.0 0.1 -0.2
## holiday_month
## sales_unitboxes 0.0
## consumer_sentiment 0.1
## CPI 0.1
## inflation_rate 0.0
## unemp_rate 0.0
## gdp_percapita 0.0
## itaee 0.1
## itaee_growth -0.1
## pop_density 0.0
## job_density 0.1
## pop_minwage 0.0
## exchange_rate 0.1
## max_temperature -0.2
## holiday_month 1.0
ggcorrplot(corre,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method="circle",
colors = c("pink", "white", "lightgreen"),
title="Correlogram",
ggtheme=theme_bw)
#We discovered that the variables with least correlation were: CPI, ITAEE GROTH. EXCHANGE RATE and GDP PER CAPTA
#Playing with the model, we removed from the model the variables: gdp per capita and CPI
#New model
linear_regression2 <- lm(sales_unitboxes ~ consumer_sentiment + inflation_rate + unemp_rate + itaee + itaee_growth + pop_density + job_density + pop_minwage + exchange_rate + max_temperature, data=coca)
summary(linear_regression2)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + inflation_rate +
## unemp_rate + itaee + itaee_growth + pop_density + job_density +
## pop_minwage + exchange_rate + max_temperature, data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603628 -185655 -8986 219853 762529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16220942 34124806 -0.475 0.63734
## consumer_sentiment 48051 26744 1.797 0.08055 .
## inflation_rate -18995 261485 -0.073 0.94248
## unemp_rate 2522844 19188626 0.131 0.89611
## itaee 149702 43043 3.478 0.00131 **
## itaee_growth -2578117 4519466 -0.570 0.57182
## pop_density 80573 428063 0.188 0.85173
## job_density -507636 445499 -1.139 0.26183
## pop_minwage 146556 138915 1.055 0.29826
## exchange_rate -62313 101709 -0.613 0.54385
## max_temperature 182950 35041 5.221 7.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 375900 on 37 degrees of freedom
## Multiple R-squared: 0.6897, Adjusted R-squared: 0.6058
## F-statistic: 8.224 on 10 and 37 DF, p-value: 7.935e-07
RMSE(linear_regression2$fitted.values, coca$sales_unitboxes)
## [1] 330024.9
### Diagnostic Tests: New Model
vif(linear_regression2)
## consumer_sentiment inflation_rate unemp_rate itaee
## 1.985985 3.452286 4.235522 13.923110
## itaee_growth pop_density job_density pop_minwage
## 1.452346 101.413775 103.444981 6.470196
## exchange_rate max_temperature
## 8.876062 2.832909
bptest(linear_regression2)
##
## studentized Breusch-Pagan test
##
## data: linear_regression2
## BP = 10.618, df = 10, p-value = 0.388
AIC(linear_regression2)
## [1] 1380.083
histogram(linear_regression2$residuals)
#Testing again our model, we can see how the multicolineality became significantly reduced.
# We have a total of 7 variables with no multicolineality
#We also have 1 variables with a small amount of multicolineality.
# And the other 2 Having their a coefficient of around 100.
#These changes to our model did not affect significantly our rmse, r1 nor AIC.
#Now that we have the best version of a model, we can evaluate our hypothesis and make our predictions.
#Interpret the regression results of selected regression model. That is, describe how much is the impact of the explanatory variables X’s on the dependent variable Y.
summary(linear_regression2)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + inflation_rate +
## unemp_rate + itaee + itaee_growth + pop_density + job_density +
## pop_minwage + exchange_rate + max_temperature, data = coca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603628 -185655 -8986 219853 762529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16220942 34124806 -0.475 0.63734
## consumer_sentiment 48051 26744 1.797 0.08055 .
## inflation_rate -18995 261485 -0.073 0.94248
## unemp_rate 2522844 19188626 0.131 0.89611
## itaee 149702 43043 3.478 0.00131 **
## itaee_growth -2578117 4519466 -0.570 0.57182
## pop_density 80573 428063 0.188 0.85173
## job_density -507636 445499 -1.139 0.26183
## pop_minwage 146556 138915 1.055 0.29826
## exchange_rate -62313 101709 -0.613 0.54385
## max_temperature 182950 35041 5.221 7.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 375900 on 37 degrees of freedom
## Multiple R-squared: 0.6897, Adjusted R-squared: 0.6058
## F-statistic: 8.224 on 10 and 37 DF, p-value: 7.935e-07
#We can observe that the variable with highest correlation is the mas. temperature. This is our independent variable with explains the most the behavior of our dependet variable (Coca Cola's Sales). It has a 99% certainty and shows a positive correlation. For every 1 change in the max. temperature, the sales increases by 182950.
#On the other hand, our seconf most explanatory variable is itaee (Indicator of the State Economic Activity - ITAEE). It also has a positive correlation with a significance of 0.01, showing a high statistical signifance. For this variable, For every 1 change in the ITAEE, the sales increases by 149702.
#Use the selected regression results to make predictions between the X’s regressors and the dependent variable (e.g., effects plots).
effect_plot(linear_regression2,pred=max_temperature,interval=TRUE)
effect_plot(linear_regression2,pred=itaee,interval=TRUE)
effect_plot(linear_regression2,pred=consumer_sentiment,interval=TRUE)
#We can observe the dependent variable predictions regarding the most statistaclly significant variables of the model. We can clearly observe the positive correlations among all of them and the next predictive values.
Hypothesis 1: The sales of Coca Cola products increases when the temperature rises, meaning that they have a positive correlation. Our hypothesis was indeed correct, their correlation was not just true, it was the most explanatory variable in the model.
Hypothesis 2: Then the inflation increases in the country, Coca Cola’s sales decreases, meaning that they have a negative correlation. Even though the visualization showed this behavior, the inflation was not a significant variable regarding Coca Cola’s Sales.
Hypothesis 3: Unemployment rate does not affect Coca Cola´s sales, meaning they do not have a significant correlation. Our hypothesis is true, unemployment was one of the most insignificant variables in the model.