Importing Libraries

library(foreign)
library(dplyr)        # data manipulation 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)      # to work with categorical variables
library(ggplot2)      # data visualization 
library(readr)        # read specific csv files
library(janitor)      # data exploration and cleaning 
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(Hmisc)        # several useful functions for data analysis 
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(psych)        # functions for multivariate analysis 
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(naniar)       # summaries and visualization of missing values NA's
library(corrplot)     # correlation plots
## corrplot 0.92 loaded
library(jtools)       # presentation of regression analysis 
## 
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
## 
##     %nin%
library(lmtest)       # diagnostic checks - linear regression analysis 
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)          # diagnostic checks - linear regression analysis
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(olsrr)        # diagnostic checks - linear regression analysis 
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(naniar)       # identifying missing values
library(stargazer)    # create publication quality tables
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effects)      # displays for linear and other regression models
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(tidyverse)    # collection of R packages designed for data science
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ stringr   1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()       masks ggplot2::%+%()
## ✖ psych::alpha()     masks ggplot2::alpha()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ car::recode()      masks dplyr::recode()
## ✖ purrr::some()      masks car::some()
## ✖ Hmisc::src()       masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)        # Classification and Regression Training 
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)       # methods for prediction and plotting, and functions for cross-validation
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7
library("readxl")     # Read Excel
library(caret)        # Imputation
library(missForest)   # Imputation
library(mice)         # Imputation
## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(Hmisc)        # Imputation

0. IMPORT DATABASE

coca <- read_excel("C:\\Users\\maria\\OneDrive\\Desktop\\AD 2023\\ECONOMETRICS\\coca_cola_sales.xlsx", sheet = 'cocacolasalesboxes')
coca
## # A tibble: 48 × 15
##    tperiod             sales_unitboxes consumer_sentiment   CPI inflation_rate
##    <dttm>                        <dbl>              <dbl> <dbl>          <dbl>
##  1 2021-01-15 00:00:00        5516689.               38.1  87.1          -0.09
##  2 2021-02-15 00:00:00        5387496.               37.5  87.3           0.19
##  3 2021-03-15 00:00:00        5886747.               38.5  87.6           0.41
##  4 2021-04-15 00:00:00        6389182.               37.8  87.4          -0.26
##  5 2021-05-15 00:00:00        6448275.               38.0  87.0          -0.5 
##  6 2021-06-15 00:00:00        6697947.               39.1  87.1           0.17
##  7 2021-07-15 00:00:00        6420091.               38.1  87.2           0.15
##  8 2021-08-15 00:00:00        6474440.               37.4  87.4           0.21
##  9 2021-09-15 00:00:00        6340781.               37.4  87.8           0.37
## 10 2021-10-15 00:00:00        6539561.               37.8  88.2           0.51
## # ℹ 38 more rows
## # ℹ 10 more variables: unemp_rate <dbl>, gdp_percapita <dbl>, itaee <dbl>,
## #   itaee_growth <dbl>, pop_density <dbl>, job_density <dbl>,
## #   pop_minwage <dbl>, exchange_rate <dbl>, max_temperature <dbl>,
## #   holiday_month <dbl>
# setting time series format 
coca$tperiod <- as.Date(coca$tperiod,"%m/%d/%Y")
## Warning in as.POSIXlt.POSIXct(x, tz = tz): unknown timezone '%m/%d/%Y'
head(coca)
## # A tibble: 6 × 15
##   tperiod    sales_unitboxes consumer_sentiment   CPI inflation_rate unemp_rate
##   <date>               <dbl>              <dbl> <dbl>          <dbl>      <dbl>
## 1 2021-01-15        5516689.               38.1  87.1          -0.09     0.0523
## 2 2021-02-15        5387496.               37.5  87.3           0.19     0.0531
## 3 2021-03-15        5886747.               38.5  87.6           0.41     0.0461
## 4 2021-04-15        6389182.               37.8  87.4          -0.26     0.0510
## 5 2021-05-15        6448275.               38.0  87.0          -0.5      0.0552
## 6 2021-06-15        6697947.               39.1  87.1           0.17     0.0507
## # ℹ 9 more variables: gdp_percapita <dbl>, itaee <dbl>, itaee_growth <dbl>,
## #   pop_density <dbl>, job_density <dbl>, pop_minwage <dbl>,
## #   exchange_rate <dbl>, max_temperature <dbl>, holiday_month <dbl>
coca
## # A tibble: 48 × 15
##    tperiod    sales_unitboxes consumer_sentiment   CPI inflation_rate unemp_rate
##    <date>               <dbl>              <dbl> <dbl>          <dbl>      <dbl>
##  1 2021-01-15        5516689.               38.1  87.1          -0.09     0.0523
##  2 2021-02-15        5387496.               37.5  87.3           0.19     0.0531
##  3 2021-03-15        5886747.               38.5  87.6           0.41     0.0461
##  4 2021-04-15        6389182.               37.8  87.4          -0.26     0.0510
##  5 2021-05-15        6448275.               38.0  87.0          -0.5      0.0552
##  6 2021-06-15        6697947.               39.1  87.1           0.17     0.0507
##  7 2021-07-15        6420091.               38.1  87.2           0.15     0.0542
##  8 2021-08-15        6474440.               37.4  87.4           0.21     0.0547
##  9 2021-09-15        6340781.               37.4  87.8           0.37     0.0538
## 10 2021-10-15        6539561.               37.8  88.2           0.51     0.0539
## # ℹ 38 more rows
## # ℹ 9 more variables: gdp_percapita <dbl>, itaee <dbl>, itaee_growth <dbl>,
## #   pop_density <dbl>, job_density <dbl>, pop_minwage <dbl>,
## #   exchange_rate <dbl>, max_temperature <dbl>, holiday_month <dbl>

1. EXPLORATORY DATA ANALYSIS

# Briefly describe the dataset. For example, what is the structure of the dataset? How many observations include the dataset? Is there any presence of missing values in the dataset? 

# Display dataset
coca
## # A tibble: 48 × 15
##    tperiod    sales_unitboxes consumer_sentiment   CPI inflation_rate unemp_rate
##    <date>               <dbl>              <dbl> <dbl>          <dbl>      <dbl>
##  1 2021-01-15        5516689.               38.1  87.1          -0.09     0.0523
##  2 2021-02-15        5387496.               37.5  87.3           0.19     0.0531
##  3 2021-03-15        5886747.               38.5  87.6           0.41     0.0461
##  4 2021-04-15        6389182.               37.8  87.4          -0.26     0.0510
##  5 2021-05-15        6448275.               38.0  87.0          -0.5      0.0552
##  6 2021-06-15        6697947.               39.1  87.1           0.17     0.0507
##  7 2021-07-15        6420091.               38.1  87.2           0.15     0.0542
##  8 2021-08-15        6474440.               37.4  87.4           0.21     0.0547
##  9 2021-09-15        6340781.               37.4  87.8           0.37     0.0538
## 10 2021-10-15        6539561.               37.8  88.2           0.51     0.0539
## # ℹ 38 more rows
## # ℹ 9 more variables: gdp_percapita <dbl>, itaee <dbl>, itaee_growth <dbl>,
## #   pop_density <dbl>, job_density <dbl>, pop_minwage <dbl>,
## #   exchange_rate <dbl>, max_temperature <dbl>, holiday_month <dbl>
#Identify missing values. 
sum(is.na(coca))
## [1] 0
#No missing values

#48 rows and 15 columns, al with data type of double
glimpse(coca)
## Rows: 48
## Columns: 15
## $ tperiod            <date> 2021-01-15, 2021-02-15, 2021-03-15, 2021-04-15, 20…
## $ sales_unitboxes    <dbl> 5516689, 5387496, 5886747, 6389182, 6448275, 669794…
## $ consumer_sentiment <dbl> 38.06250, 37.49114, 38.50522, 37.84286, 38.03169, 3…
## $ CPI                <dbl> 87.11010, 87.27538, 87.63072, 87.40384, 86.96737, 8…
## $ inflation_rate     <dbl> -0.09, 0.19, 0.41, -0.26, -0.50, 0.17, 0.15, 0.21, …
## $ unemp_rate         <dbl> 0.05230256, 0.05311320, 0.04608844, 0.05102038, 0.0…
## $ gdp_percapita      <dbl> 11659.56, 11659.55, 11659.55, 11625.75, 11625.74, 1…
## $ itaee              <dbl> 103.7654, 103.7654, 103.7654, 107.7518, 107.7518, 1…
## $ itaee_growth       <dbl> 0.049716574, 0.049716574, 0.049716574, 0.031838981,…
## $ pop_density        <dbl> 98.54185, 98.54186, 98.54187, 98.82843, 98.82844, 9…
## $ job_density        <dbl> 18.26048, 18.46329, 18.64164, 18.67876, 18.67539, 1…
## $ pop_minwage        <dbl> 9.657861, 9.657861, 9.657861, 9.594919, 9.594919, 9…
## $ exchange_rate      <dbl> 14.69259, 14.92134, 15.22834, 15.22618, 15.26447, 1…
## $ max_temperature    <dbl> 28, 31, 29, 32, 34, 32, 29, 29, 29, 29, 29, 26, 28,…
## $ holiday_month      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
#Descriptive statistics (mean, median, standard deviation, minimum, maximum)
summary(coca)
##     tperiod           sales_unitboxes   consumer_sentiment      CPI        
##  Min.   :2021-01-15   Min.   :5301755   Min.   :28.67      Min.   : 86.97  
##  1st Qu.:2021-04-08   1st Qu.:6171767   1st Qu.:35.64      1st Qu.: 89.18  
##  Median :2021-07-01   Median :6461357   Median :36.76      Median : 92.82  
##  Mean   :2021-07-02   Mean   :6473691   Mean   :37.15      Mean   : 93.40  
##  3rd Qu.:2021-09-24   3rd Qu.:6819782   3rd Qu.:38.14      3rd Qu.: 98.40  
##  Max.   :2021-12-18   Max.   :7963063   Max.   :44.87      Max.   :103.02  
##  inflation_rate      unemp_rate      gdp_percapita       itaee      
##  Min.   :-0.5000   Min.   :0.03466   Min.   :11559   Min.   :103.8  
##  1st Qu.: 0.1650   1st Qu.:0.04010   1st Qu.:11830   1st Qu.:111.5  
##  Median : 0.3850   Median :0.04369   Median :12014   Median :113.5  
##  Mean   : 0.3485   Mean   :0.04442   Mean   :11979   Mean   :113.9  
##  3rd Qu.: 0.5575   3rd Qu.:0.04897   3rd Qu.:12162   3rd Qu.:117.1  
##  Max.   : 1.7000   Max.   :0.05517   Max.   :12329   Max.   :122.5  
##   itaee_growth       pop_density      job_density     pop_minwage    
##  Min.   :0.005571   Min.   : 98.54   Min.   :18.26   Min.   : 9.398  
##  1st Qu.:0.022376   1st Qu.: 99.61   1st Qu.:19.28   1st Qu.:10.794  
##  Median :0.029977   Median :100.67   Median :20.39   Median :11.139  
##  Mean   :0.031736   Mean   :100.65   Mean   :20.38   Mean   :11.116  
##  3rd Qu.:0.043038   3rd Qu.:101.69   3rd Qu.:21.60   3rd Qu.:11.413  
##  Max.   :0.056536   Max.   :102.69   Max.   :22.36   Max.   :13.026  
##  exchange_rate   max_temperature holiday_month 
##  Min.   :14.69   Min.   :26.00   Min.   :0.00  
##  1st Qu.:17.38   1st Qu.:29.00   1st Qu.:0.00  
##  Median :18.62   Median :30.00   Median :0.00  
##  Mean   :18.18   Mean   :30.50   Mean   :0.25  
##  3rd Qu.:19.06   3rd Qu.:32.25   3rd Qu.:0.25  
##  Max.   :21.39   Max.   :37.00   Max.   :1.00

2. DATA VISUALIZATION

#Graph 1 
# Histogram of dependent variable
hist(coca$sales_unitboxes, main = "Histogram of Sales")

hist(log(coca$sales_unitboxes) , main = "Histogram of Sales - Log")

#Graph 2
#Bar graph mapping Foreign Investment and GDP through the last years 
graph<-coca %>% group_by(holiday_month)
graph
## # A tibble: 48 × 15
## # Groups:   holiday_month [2]
##    tperiod    sales_unitboxes consumer_sentiment   CPI inflation_rate unemp_rate
##    <date>               <dbl>              <dbl> <dbl>          <dbl>      <dbl>
##  1 2021-01-15        5516689.               38.1  87.1          -0.09     0.0523
##  2 2021-02-15        5387496.               37.5  87.3           0.19     0.0531
##  3 2021-03-15        5886747.               38.5  87.6           0.41     0.0461
##  4 2021-04-15        6389182.               37.8  87.4          -0.26     0.0510
##  5 2021-05-15        6448275.               38.0  87.0          -0.5      0.0552
##  6 2021-06-15        6697947.               39.1  87.1           0.17     0.0507
##  7 2021-07-15        6420091.               38.1  87.2           0.15     0.0542
##  8 2021-08-15        6474440.               37.4  87.4           0.21     0.0547
##  9 2021-09-15        6340781.               37.4  87.8           0.37     0.0538
## 10 2021-10-15        6539561.               37.8  88.2           0.51     0.0539
## # ℹ 38 more rows
## # ℹ 9 more variables: gdp_percapita <dbl>, itaee <dbl>, itaee_growth <dbl>,
## #   pop_density <dbl>, job_density <dbl>, pop_minwage <dbl>,
## #   exchange_rate <dbl>, max_temperature <dbl>, holiday_month <dbl>
ggplot(data=graph,aes(x=holiday_month,y=sales_unitboxes)) +
  geom_bar(stat="identity") + coord_flip() +
  labs( 
       y="Foreign Investment", 
       x="Year", 
       fill= 'GPD',
       title="Foreign Investment and GDP")

#Graph 3
# Correlation matrix
library(ggplot2)
library(ggcorrplot)


corre<- round(cor(subset(coca,select =  -c(tperiod))), 1)
corre
##                    sales_unitboxes consumer_sentiment  CPI inflation_rate
## sales_unitboxes                1.0                0.2  0.2           -0.3
## consumer_sentiment             0.2                1.0  0.2           -0.1
## CPI                            0.2                0.2  1.0            0.3
## inflation_rate                -0.3               -0.1  0.3            1.0
## unemp_rate                    -0.1                0.1 -0.8           -0.4
## gdp_percapita                  0.2                0.0  0.9            0.2
## itaee                          0.3                0.2  0.9            0.4
## itaee_growth                  -0.2               -0.2 -0.4           -0.1
## pop_density                    0.3                0.2  1.0            0.3
## job_density                    0.3                0.1  1.0            0.3
## pop_minwage                    0.3                0.0  0.8            0.2
## exchange_rate                  0.2               -0.2  0.7            0.5
## max_temperature                0.6               -0.2 -0.1           -0.6
## holiday_month                  0.0                0.1  0.1            0.0
##                    unemp_rate gdp_percapita itaee itaee_growth pop_density
## sales_unitboxes          -0.1           0.2   0.3         -0.2         0.3
## consumer_sentiment        0.1           0.0   0.2         -0.2         0.2
## CPI                      -0.8           0.9   0.9         -0.4         1.0
## inflation_rate           -0.4           0.2   0.4         -0.1         0.3
## unemp_rate                1.0          -0.8  -0.7          0.3        -0.8
## gdp_percapita            -0.8           1.0   0.7         -0.2         0.9
## itaee                    -0.7           0.7   1.0         -0.4         0.9
## itaee_growth              0.3          -0.2  -0.4          1.0        -0.4
## pop_density              -0.8           0.9   0.9         -0.4         1.0
## job_density              -0.8           0.9   0.9         -0.4         1.0
## pop_minwage              -0.7           0.9   0.7         -0.3         0.9
## exchange_rate            -0.7           0.8   0.8         -0.1         0.8
## max_temperature           0.0           0.1  -0.2          0.0         0.0
## holiday_month             0.0           0.0   0.1         -0.1         0.0
##                    job_density pop_minwage exchange_rate max_temperature
## sales_unitboxes            0.3         0.3           0.2             0.6
## consumer_sentiment         0.1         0.0          -0.2            -0.2
## CPI                        1.0         0.8           0.7            -0.1
## inflation_rate             0.3         0.2           0.5            -0.6
## unemp_rate                -0.8        -0.7          -0.7             0.0
## gdp_percapita              0.9         0.9           0.8             0.1
## itaee                      0.9         0.7           0.8            -0.2
## itaee_growth              -0.4        -0.3          -0.1             0.0
## pop_density                1.0         0.9           0.8             0.0
## job_density                1.0         0.8           0.7             0.0
## pop_minwage                0.8         1.0           0.7             0.1
## exchange_rate              0.7         0.7           1.0             0.0
## max_temperature            0.0         0.1           0.0             1.0
## holiday_month              0.1         0.0           0.1            -0.2
##                    holiday_month
## sales_unitboxes              0.0
## consumer_sentiment           0.1
## CPI                          0.1
## inflation_rate               0.0
## unemp_rate                   0.0
## gdp_percapita                0.0
## itaee                        0.1
## itaee_growth                -0.1
## pop_density                  0.0
## job_density                  0.1
## pop_minwage                  0.0
## exchange_rate                0.1
## max_temperature             -0.2
## holiday_month                1.0
ggcorrplot(corre, 
           hc.order = TRUE, 
           type = "lower", 
           lab = TRUE, 
           lab_size = 3, 
           method="circle", 
           colors = c("pink", "white", "lightgreen"), 
           title="Correlogram", 
           ggtheme=theme_bw)

# In this matrix, we can appreciate the correlation among every variable. We will pay special attention to the ones related to our dependent variable.   We can base our hypothesis on this graph and later on verify the statistical significance of each relation.
# We can see how umeployment rate has almost no relationship whatsoever the sales of unitboxes, while in the other hand Max Temperature seems to be the most correlated.

# Emphasis in noting that correlation does not mean causality. We are just cheking for a relationship between variables.



#Graph 4 Scatterplot

theme_set(theme_bw())  # pre-set the bw theme.
g <- ggplot(coca, aes(max_temperature, sales_unitboxes))
g + geom_point() + 
  geom_smooth(method="lm", se=F) +
  labs( 
       y="Sales Unitboxes", 
       x="Max. Temperature", 
       title="Exportations vs Foreign Investment", 
       )
## `geom_smooth()` using formula = 'y ~ x'

# Graph 5 Counts Plot
theme_set(theme_bw())  # pre-set the bw theme.
g <- ggplot(coca, aes(CPI, sales_unitboxes))
g + geom_count(col="tomato3", show.legend=F) +
  labs( 
       y="Sales Unitboxes", 
       x="Consumer Price Index", 
       title="Theft Danger & Foreign Investments")

# At first glance, this graph shows no correlation at all, it shows a heteroscedasticity behavior. We can assume these 2 varoables, sales and CPI do not intercorrelate.

3. HYPOTHESES STATEMENTS

4. LINEAR REGRESSION MODEL SPECIFICATION

# Estimate 2 different multiple linear regression model specifications. 

# Linear Regression 
linear_regression <- lm(sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + unemp_rate + gdp_percapita +  itaee + itaee_growth + pop_density + job_density + pop_minwage + exchange_rate +  max_temperature, data=coca) 

summary(linear_regression)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + 
##     unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density + 
##     job_density + pop_minwage + exchange_rate + max_temperature, 
##     data = coca)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -557280 -245811    8601  197757  717948 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -70533682   46993076  -1.501 0.142340    
## consumer_sentiment     47082      28366   1.660 0.105887    
## CPI                  -100726      97912  -1.029 0.310660    
## inflation_rate       -112101     265941  -0.422 0.675950    
## unemp_rate          -4743837   19670332  -0.241 0.810833    
## gdp_percapita          -2494       1484  -1.681 0.101676    
## itaee                  21189      85276   0.248 0.805217    
## itaee_growth         1969921    5157483   0.382 0.704805    
## pop_density          1118240     736253   1.519 0.137789    
## job_density          -347723     482784  -0.720 0.476157    
## pop_minwage           161450     139429   1.158 0.254729    
## exchange_rate           1878     111357   0.017 0.986640    
## max_temperature       164835      38165   4.319 0.000123 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 370600 on 35 degrees of freedom
## Multiple R-squared:  0.7147, Adjusted R-squared:  0.6169 
## F-statistic: 7.307 on 12 and 35 DF,  p-value: 1.821e-06
RMSE(linear_regression$fitted.values, coca$sales_unitboxes)
## [1] 316438.2
#Linear regression with natural logarithm applied to the variables
linear_log<-lm(log(sales_unitboxes) ~ consumer_sentiment + CPI + inflation_rate + unemp_rate + gdp_percapita +  itaee + itaee_growth + pop_density + job_density + pop_minwage + exchange_rate +  max_temperature, data=coca) 

summary(linear_log)
## 
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI + 
##     inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth + 
##     pop_density + job_density + pop_minwage + exchange_rate + 
##     max_temperature, data = coca)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.096099 -0.033915  0.001249  0.029926  0.106348 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.3512266  7.3811850   0.590 0.559311    
## consumer_sentiment  0.0072486  0.0044554   1.627 0.112722    
## CPI                -0.0155279  0.0153791  -1.010 0.319581    
## inflation_rate     -0.0157285  0.0417712  -0.377 0.708788    
## unemp_rate         -0.8111869  3.0896118  -0.263 0.794434    
## gdp_percapita      -0.0003609  0.0002331  -1.548 0.130532    
## itaee               0.0056921  0.0133942   0.425 0.673465    
## itaee_growth        0.1924532  0.8100839   0.238 0.813598    
## pop_density         0.1619959  0.1156430   1.401 0.170070    
## job_density        -0.0530332  0.0758307  -0.699 0.488948    
## pop_minwage         0.0222791  0.0219000   1.017 0.315987    
## exchange_rate      -0.0016231  0.0174908  -0.093 0.926595    
## max_temperature     0.0254728  0.0059945   4.249 0.000151 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05821 on 35 degrees of freedom
## Multiple R-squared:  0.7087, Adjusted R-squared:  0.6088 
## F-statistic: 7.094 on 12 and 35 DF,  p-value: 2.531e-06
RMSE(linear_log$fitted.values, coca$sales_unitboxes)
## [1] 6500729
# Polynomial Regression Model
polynomial_regression <- lm(log(sales_unitboxes) ~ consumer_sentiment  + CPI + inflation_rate + unemp_rate + gdp_percapita +  itaee + I(itaee^2)+ itaee_growth + pop_density + job_density + pop_minwage + exchange_rate +  max_temperature, data=coca) 
                              

summary(polynomial_regression)
## 
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI + 
##     inflation_rate + unemp_rate + gdp_percapita + itaee + I(itaee^2) + 
##     itaee_growth + pop_density + job_density + pop_minwage + 
##     exchange_rate + max_temperature, data = coca)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.104158 -0.037441  0.001734  0.024955  0.089930 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -5.0186854  8.4798501  -0.592   0.5579    
## consumer_sentiment  0.0125035  0.0050115   2.495   0.0176 *  
## CPI                -0.0026777  0.0160811  -0.167   0.8687    
## inflation_rate     -0.0047234  0.0404442  -0.117   0.9077    
## unemp_rate         -1.4422006  2.9804784  -0.484   0.6316    
## gdp_percapita      -0.0003123  0.0002249  -1.389   0.1739    
## itaee               0.2471156  0.1209049   2.044   0.0488 *  
## I(itaee^2)         -0.0010615  0.0005286  -2.008   0.0526 .  
## itaee_growth        0.6806546  0.8142529   0.836   0.4090    
## pop_density         0.0989287  0.1152961   0.858   0.3969    
## job_density        -0.0423211  0.0729398  -0.580   0.5656    
## pop_minwage         0.0092791  0.0219835   0.422   0.6756    
## exchange_rate      -0.0033056  0.0167998  -0.197   0.8452    
## max_temperature     0.0255336  0.0057506   4.440 9.02e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05584 on 34 degrees of freedom
## Multiple R-squared:  0.7396, Adjusted R-squared:   0.64 
## F-statistic: 7.426 on 13 and 34 DF,  p-value: 1.299e-06
RMSE(polynomial_regression$fitted.values, coca$sales_unitboxes)
## [1] 6500729

5. RESULTS ANALYSIS

# Evaluate each regression model using model diagnostics (e.g., multicollinearity and heteroscedasticity).

##HETEROSCEDASTICITY
# Using the BP Test, we can check out for heteroscedasticity (residuals with a NOT constant variance/unequal scatter) by looking at the P-VALUE it gave us. 
#H0.Homoscedasticityt is present
#HA.Heteroscedasticty is present
# Its very important to mention that our models should have HOMOSCEDASTICITY for a model way more precise.

#If the P- Value is less than 5% or 0.05, we reject the null hypothesis which means that heteroscedasticity is present in the model


#Model 1 Evaluation

summary(linear_regression)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + 
##     unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density + 
##     job_density + pop_minwage + exchange_rate + max_temperature, 
##     data = coca)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -557280 -245811    8601  197757  717948 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -70533682   46993076  -1.501 0.142340    
## consumer_sentiment     47082      28366   1.660 0.105887    
## CPI                  -100726      97912  -1.029 0.310660    
## inflation_rate       -112101     265941  -0.422 0.675950    
## unemp_rate          -4743837   19670332  -0.241 0.810833    
## gdp_percapita          -2494       1484  -1.681 0.101676    
## itaee                  21189      85276   0.248 0.805217    
## itaee_growth         1969921    5157483   0.382 0.704805    
## pop_density          1118240     736253   1.519 0.137789    
## job_density          -347723     482784  -0.720 0.476157    
## pop_minwage           161450     139429   1.158 0.254729    
## exchange_rate           1878     111357   0.017 0.986640    
## max_temperature       164835      38165   4.319 0.000123 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 370600 on 35 degrees of freedom
## Multiple R-squared:  0.7147, Adjusted R-squared:  0.6169 
## F-statistic: 7.307 on 12 and 35 DF,  p-value: 1.821e-06
RMSE(linear_regression$fitted.values, coca$sales_unitboxes)
## [1] 316438.2
# This model has an adjusted r2 of 0.6169 and a RMSE of 316,438.2

vif(linear_regression)
## consumer_sentiment                CPI     inflation_rate         unemp_rate 
##           2.298663          83.943670           3.674213           4.579567 
##      gdp_percapita              itaee       itaee_growth        pop_density 
##          47.586831          56.231040           1.946046         308.686168 
##        job_density        pop_minwage      exchange_rate    max_temperature 
##         124.998391           6.706682          10.947585           3.457748
#Shows a model with only 5 variables with multicolinearity.

bptest(linear_regression)
## 
##  studentized Breusch-Pagan test
## 
## data:  linear_regression
## BP = 11.905, df = 12, p-value = 0.4534
# p-value of 0.4534

AIC(linear_regression)
## [1] 1380.047
# 1380.047
histogram(polynomial_regression$residuals)

#Model 2 Evaluation
summary(linear_log)
## 
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI + 
##     inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth + 
##     pop_density + job_density + pop_minwage + exchange_rate + 
##     max_temperature, data = coca)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.096099 -0.033915  0.001249  0.029926  0.106348 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.3512266  7.3811850   0.590 0.559311    
## consumer_sentiment  0.0072486  0.0044554   1.627 0.112722    
## CPI                -0.0155279  0.0153791  -1.010 0.319581    
## inflation_rate     -0.0157285  0.0417712  -0.377 0.708788    
## unemp_rate         -0.8111869  3.0896118  -0.263 0.794434    
## gdp_percapita      -0.0003609  0.0002331  -1.548 0.130532    
## itaee               0.0056921  0.0133942   0.425 0.673465    
## itaee_growth        0.1924532  0.8100839   0.238 0.813598    
## pop_density         0.1619959  0.1156430   1.401 0.170070    
## job_density        -0.0530332  0.0758307  -0.699 0.488948    
## pop_minwage         0.0222791  0.0219000   1.017 0.315987    
## exchange_rate      -0.0016231  0.0174908  -0.093 0.926595    
## max_temperature     0.0254728  0.0059945   4.249 0.000151 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05821 on 35 degrees of freedom
## Multiple R-squared:  0.7087, Adjusted R-squared:  0.6088 
## F-statistic: 7.094 on 12 and 35 DF,  p-value: 2.531e-06
RMSE(linear_log$fitted.values, coca$sales_unitboxes)
## [1] 6500729
# This model has an adjusted r2 of 0.6088 and a RMSE of 6,500,729

vif(linear_log)
## consumer_sentiment                CPI     inflation_rate         unemp_rate 
##           2.298663          83.943670           3.674213           4.579567 
##      gdp_percapita              itaee       itaee_growth        pop_density 
##          47.586831          56.231040           1.946046         308.686168 
##        job_density        pop_minwage      exchange_rate    max_temperature 
##         124.998391           6.706682          10.947585           3.457748
#Shows a model with only 5 variables with multicolinearity.

bptest(linear_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  linear_log
## BP = 12.013, df = 12, p-value = 0.4446
#p-value of 0.4446

AIC(linear_log)
## [1] -123.9445
# Aic of -123.9445

histogram(linear_log$residuals)

#Model 3 Evaluation
summary(polynomial_regression)
## 
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI + 
##     inflation_rate + unemp_rate + gdp_percapita + itaee + I(itaee^2) + 
##     itaee_growth + pop_density + job_density + pop_minwage + 
##     exchange_rate + max_temperature, data = coca)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.104158 -0.037441  0.001734  0.024955  0.089930 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -5.0186854  8.4798501  -0.592   0.5579    
## consumer_sentiment  0.0125035  0.0050115   2.495   0.0176 *  
## CPI                -0.0026777  0.0160811  -0.167   0.8687    
## inflation_rate     -0.0047234  0.0404442  -0.117   0.9077    
## unemp_rate         -1.4422006  2.9804784  -0.484   0.6316    
## gdp_percapita      -0.0003123  0.0002249  -1.389   0.1739    
## itaee               0.2471156  0.1209049   2.044   0.0488 *  
## I(itaee^2)         -0.0010615  0.0005286  -2.008   0.0526 .  
## itaee_growth        0.6806546  0.8142529   0.836   0.4090    
## pop_density         0.0989287  0.1152961   0.858   0.3969    
## job_density        -0.0423211  0.0729398  -0.580   0.5656    
## pop_minwage         0.0092791  0.0219835   0.422   0.6756    
## exchange_rate      -0.0033056  0.0167998  -0.197   0.8452    
## max_temperature     0.0255336  0.0057506   4.440 9.02e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05584 on 34 degrees of freedom
## Multiple R-squared:  0.7396, Adjusted R-squared:   0.64 
## F-statistic: 7.426 on 13 and 34 DF,  p-value: 1.299e-06
RMSE(polynomial_regression$fitted.values, coca$sales_unitboxes)
## [1] 6500729
# This model has an adjusted r2 of 0.64 and a RMSE of 6,500,729

vif(polynomial_regression)
## consumer_sentiment                CPI     inflation_rate         unemp_rate 
##           3.160313          99.735516           3.742934           4.631040 
##      gdp_percapita              itaee         I(itaee^2)       itaee_growth 
##          48.143239        4978.717962        4892.639049           2.136495 
##        pop_density        job_density        pop_minwage      exchange_rate 
##         333.424568         125.670507           7.343468          10.974883 
##    max_temperature 
##           3.457844
#Shows a model with  6 variables with elevated coefficients of multicolinearity.

bptest(polynomial_regression)
## 
##  studentized Breusch-Pagan test
## 
## data:  polynomial_regression
## BP = 16.409, df = 13, p-value = 0.2277
#p-value of 0.2277

AIC(polynomial_regression)
## [1] -127.3247
# Aic of -127.3247

histogram(polynomial_regression$residuals)

#Looking at every model's BP Test we can see how:
#   - Linear regression: P-Value = 0.4534 H0 - Homoscedasticityt is present
#   - Linear regression with natural logarithm: P-Value =0.4446 HO - Homoscedasticityt is present
#   - Polynomial regression: P-Value = 0.2277 H0 - Homoscedasticityt is present

##Next, we are going to check for Multicolineality.

##MULTICOLINEALITY

# We identify this phenomenon by checking the VIF function.
# In order to verify this, the values of the variables of our models should be less than 10 to the extent possible.
# After verifying all VIF tests, we can conclude that our model with the most Multicolineality is clearly the polynomial. Since the other two are pretty similar, we will decide our model based on R2, RMSE and AIC.

#Adjusted R-squared assesses the proportion of variation explained by regression models while considering model complexity.
#RMSE quantifies prediction errors, helping evaluate how closely model predictions align with observed values.
#AIC helps compare models by considering both fit and complexity, promoting the selection of simpler models with good fit.

# RMSE - Root Mean Square Error: The larger the difference indicates a larger gap between the predicted and observed values, 
# which means poor regression model fit. In the same way, the smaller RMSE that indicates the better the model.



# model 1 with Adjusted R-squared of 0.6169, rmse of 316438.2 and a aic of 1380.047

# model 2 with Adjusted R-squared of 0.6088, rmse of 6500729 and a aic of -123.9445 


#After analyzing each test, we coclude that the best model is model 1. This is due to the fact that it has lower RMSE an a bigger R2. Even tough the 2nd model was way lower AIC, model 1 still has a reasonable AIC.

6. ADJUSTING MULTICOLINEARITY

#We will eliminate the variables that does not have a significant impact on the model using the correlation chart

corre<- round(cor(subset(coca,select =  -c(tperiod))), 1)
corre
##                    sales_unitboxes consumer_sentiment  CPI inflation_rate
## sales_unitboxes                1.0                0.2  0.2           -0.3
## consumer_sentiment             0.2                1.0  0.2           -0.1
## CPI                            0.2                0.2  1.0            0.3
## inflation_rate                -0.3               -0.1  0.3            1.0
## unemp_rate                    -0.1                0.1 -0.8           -0.4
## gdp_percapita                  0.2                0.0  0.9            0.2
## itaee                          0.3                0.2  0.9            0.4
## itaee_growth                  -0.2               -0.2 -0.4           -0.1
## pop_density                    0.3                0.2  1.0            0.3
## job_density                    0.3                0.1  1.0            0.3
## pop_minwage                    0.3                0.0  0.8            0.2
## exchange_rate                  0.2               -0.2  0.7            0.5
## max_temperature                0.6               -0.2 -0.1           -0.6
## holiday_month                  0.0                0.1  0.1            0.0
##                    unemp_rate gdp_percapita itaee itaee_growth pop_density
## sales_unitboxes          -0.1           0.2   0.3         -0.2         0.3
## consumer_sentiment        0.1           0.0   0.2         -0.2         0.2
## CPI                      -0.8           0.9   0.9         -0.4         1.0
## inflation_rate           -0.4           0.2   0.4         -0.1         0.3
## unemp_rate                1.0          -0.8  -0.7          0.3        -0.8
## gdp_percapita            -0.8           1.0   0.7         -0.2         0.9
## itaee                    -0.7           0.7   1.0         -0.4         0.9
## itaee_growth              0.3          -0.2  -0.4          1.0        -0.4
## pop_density              -0.8           0.9   0.9         -0.4         1.0
## job_density              -0.8           0.9   0.9         -0.4         1.0
## pop_minwage              -0.7           0.9   0.7         -0.3         0.9
## exchange_rate            -0.7           0.8   0.8         -0.1         0.8
## max_temperature           0.0           0.1  -0.2          0.0         0.0
## holiday_month             0.0           0.0   0.1         -0.1         0.0
##                    job_density pop_minwage exchange_rate max_temperature
## sales_unitboxes            0.3         0.3           0.2             0.6
## consumer_sentiment         0.1         0.0          -0.2            -0.2
## CPI                        1.0         0.8           0.7            -0.1
## inflation_rate             0.3         0.2           0.5            -0.6
## unemp_rate                -0.8        -0.7          -0.7             0.0
## gdp_percapita              0.9         0.9           0.8             0.1
## itaee                      0.9         0.7           0.8            -0.2
## itaee_growth              -0.4        -0.3          -0.1             0.0
## pop_density                1.0         0.9           0.8             0.0
## job_density                1.0         0.8           0.7             0.0
## pop_minwage                0.8         1.0           0.7             0.1
## exchange_rate              0.7         0.7           1.0             0.0
## max_temperature            0.0         0.1           0.0             1.0
## holiday_month              0.1         0.0           0.1            -0.2
##                    holiday_month
## sales_unitboxes              0.0
## consumer_sentiment           0.1
## CPI                          0.1
## inflation_rate               0.0
## unemp_rate                   0.0
## gdp_percapita                0.0
## itaee                        0.1
## itaee_growth                -0.1
## pop_density                  0.0
## job_density                  0.1
## pop_minwage                  0.0
## exchange_rate                0.1
## max_temperature             -0.2
## holiday_month                1.0
ggcorrplot(corre, 
           hc.order = TRUE, 
           type = "lower", 
           lab = TRUE, 
           lab_size = 3, 
           method="circle", 
           colors = c("pink", "white", "lightgreen"), 
           title="Correlogram", 
           ggtheme=theme_bw)

#We discovered that the variables with least correlation were: CPI, ITAEE GROTH. EXCHANGE RATE and GDP PER CAPTA


#Playing with the model, we removed from the model the variables: gdp per capita and CPI

#New model
linear_regression2 <- lm(sales_unitboxes ~ consumer_sentiment + inflation_rate + unemp_rate  +  itaee + itaee_growth + pop_density + job_density + pop_minwage + exchange_rate +  max_temperature, data=coca) 

summary(linear_regression2)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + inflation_rate + 
##     unemp_rate + itaee + itaee_growth + pop_density + job_density + 
##     pop_minwage + exchange_rate + max_temperature, data = coca)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603628 -185655   -8986  219853  762529 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -16220942   34124806  -0.475  0.63734    
## consumer_sentiment     48051      26744   1.797  0.08055 .  
## inflation_rate        -18995     261485  -0.073  0.94248    
## unemp_rate           2522844   19188626   0.131  0.89611    
## itaee                 149702      43043   3.478  0.00131 ** 
## itaee_growth        -2578117    4519466  -0.570  0.57182    
## pop_density            80573     428063   0.188  0.85173    
## job_density          -507636     445499  -1.139  0.26183    
## pop_minwage           146556     138915   1.055  0.29826    
## exchange_rate         -62313     101709  -0.613  0.54385    
## max_temperature       182950      35041   5.221 7.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 375900 on 37 degrees of freedom
## Multiple R-squared:  0.6897, Adjusted R-squared:  0.6058 
## F-statistic: 8.224 on 10 and 37 DF,  p-value: 7.935e-07
RMSE(linear_regression2$fitted.values, coca$sales_unitboxes)
## [1] 330024.9
### Diagnostic Tests: New Model
vif(linear_regression2)
## consumer_sentiment     inflation_rate         unemp_rate              itaee 
##           1.985985           3.452286           4.235522          13.923110 
##       itaee_growth        pop_density        job_density        pop_minwage 
##           1.452346         101.413775         103.444981           6.470196 
##      exchange_rate    max_temperature 
##           8.876062           2.832909
bptest(linear_regression2)
## 
##  studentized Breusch-Pagan test
## 
## data:  linear_regression2
## BP = 10.618, df = 10, p-value = 0.388
AIC(linear_regression2)
## [1] 1380.083
histogram(linear_regression2$residuals)

#Testing again our model, we can see how the multicolineality became significantly reduced.
# We have a total of 7 variables with no multicolineality
#We also have 1 variables with a small amount of multicolineality.
# And the other 2 Having their a coefficient of around 100.

#These changes to our model did not affect significantly our rmse, r1 nor AIC.

6. SELECTED MODEL AND PREDICIONS

#Now that we have the best version of a model, we can evaluate our hypothesis and make our predictions.

#Interpret the regression results of selected regression model. That is, describe how much is the impact of the explanatory variables X’s on the dependent variable Y.
summary(linear_regression2)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + inflation_rate + 
##     unemp_rate + itaee + itaee_growth + pop_density + job_density + 
##     pop_minwage + exchange_rate + max_temperature, data = coca)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603628 -185655   -8986  219853  762529 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -16220942   34124806  -0.475  0.63734    
## consumer_sentiment     48051      26744   1.797  0.08055 .  
## inflation_rate        -18995     261485  -0.073  0.94248    
## unemp_rate           2522844   19188626   0.131  0.89611    
## itaee                 149702      43043   3.478  0.00131 ** 
## itaee_growth        -2578117    4519466  -0.570  0.57182    
## pop_density            80573     428063   0.188  0.85173    
## job_density          -507636     445499  -1.139  0.26183    
## pop_minwage           146556     138915   1.055  0.29826    
## exchange_rate         -62313     101709  -0.613  0.54385    
## max_temperature       182950      35041   5.221 7.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 375900 on 37 degrees of freedom
## Multiple R-squared:  0.6897, Adjusted R-squared:  0.6058 
## F-statistic: 8.224 on 10 and 37 DF,  p-value: 7.935e-07
#We can observe that the variable with highest correlation is the mas. temperature. This is our independent variable with explains the most the behavior of our dependet variable (Coca Cola's Sales). It has a 99% certainty and shows a positive correlation. For every 1 change in the max. temperature, the sales increases by 182950.

#On the other hand, our seconf most explanatory variable is itaee (Indicator of the State Economic Activity - ITAEE). It also has a positive correlation with a significance of 0.01, showing a high statistical signifance. For this variable, For every 1 change in the ITAEE, the sales increases by 149702.


#Use the selected regression results to make predictions between the X’s regressors and the dependent variable (e.g., effects plots). 

effect_plot(linear_regression2,pred=max_temperature,interval=TRUE)

effect_plot(linear_regression2,pred=itaee,interval=TRUE)

effect_plot(linear_regression2,pred=consumer_sentiment,interval=TRUE)

#We can observe the dependent variable predictions regarding the most statistaclly significant variables of the model. We can clearly observe the positive correlations among all of them and the next predictive values.

3. HYPOTHESES STATEMENTS EVALUATION