# loading required libraries 
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(dplyr)
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(zoo)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(stats)
library(forecast)
library(astsa)
## 
## Attaching package: 'astsa'
## The following object is masked from 'package:forecast':
## 
##     gas
library(corrplot)
## corrplot 0.92 loaded
library(AER)
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: lmtest
## Loading required package: sandwich
## Loading required package: survival
library(vars)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: strucchange
## Loading required package: urca
library(dynlm)
library(vars)
library(TSstudio)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.3     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ stringr::boundary() masks strucchange::boundary()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks xts::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::last()       masks xts::last()
## ✖ car::recode()       masks dplyr::recode()
## ✖ MASS::select()      masks dplyr::select()
## ✖ purrr::some()       masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sarima)
## Loading required package: stats4
## 
## Attaching package: 'sarima'
## 
## The following object is masked from 'package:astsa':
## 
##     sarima
## 
## The following object is masked from 'package:stats':
## 
##     spectrum
library(dygraphs)
library(markdown)
library(foreign)
library(forcats)      
library(ggplot2)     
library(readr)        
library(janitor) 
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(Hmisc) 
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(psych)  
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following object is masked from 'package:astsa':
## 
##     scatter.hist
library(naniar)       
library(dlookr) 
## 
## Attaching package: 'dlookr'
## 
## The following object is masked from 'package:psych':
## 
##     describe
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:vars':
## 
##     normality
## 
## The following object is masked from 'package:base':
## 
##     transform
library(corrplot)  
library(jtools) 
## 
## Attaching package: 'jtools'
## 
## The following object is masked from 'package:Hmisc':
## 
##     %nin%
library(lmtest) 
library(car)  
library(olsrr)
## 
## Attaching package: 'olsrr'
## 
## The following object is masked from 'package:MASS':
## 
##     cement
## 
## The following object is masked from 'package:datasets':
## 
##     rivers
library(naniar)  
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effects) 
## Registered S3 method overwritten by 'survey':
##   method      from  
##   summary.pps dlookr
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(caret) 
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## The following object is masked from 'package:survival':
## 
##     cluster
library(glmnet)  
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(corrplot)
library(zoo)
# import dataset
df2<-read.csv("/Users/valeriacantulobo/Downloads/coca_cola_sales.csv")
#EXPLORATORY DATA ANALYSIS 

# Briefly describe the dataset. For example, what is the structure of the dataset? How many observations include the dataset? Is there any presence of missing values in the dataset? 

head(df2)
##   tperiod sales_unitboxes consumer_sentiment    CPI inflation_rate unemp_rate
## 1  15-Jan         5516689             38.063 87.110          -0.09     0.0523
## 2  15-Feb         5387496             37.491 87.275           0.19     0.0531
## 3  15-Mar         5886747             38.505 87.631           0.41     0.0461
## 4  15-Apr         6389182             37.843 87.404          -0.26     0.0510
## 5  15-May         6448275             38.032 86.967          -0.50     0.0552
## 6  15-Jun         6697947             39.112 87.113           0.17     0.0507
##   gdp_percapita    itaee itaee_growth pop_density job_density pop_minwage
## 1      11659.56 103.7654       0.0497     98.5418     18.2605      9.6579
## 2      11659.55 103.7654       0.0497     98.5419     18.4633      9.6579
## 3      11659.55 103.7654       0.0497     98.5419     18.6416      9.6579
## 4      11625.75 107.7518       0.0318     98.8284     18.6788      9.5949
## 5      11625.74 107.7518       0.0318     98.8284     18.6754      9.5949
## 6      11625.74 107.7518       0.0318     98.8285     18.6467      9.5949
##   exchange_rate max_temperature holiday_month
## 1       14.6926              28             0
## 2       14.9213              31             0
## 3       15.2283              29             0
## 4       15.2262              32             1
## 5       15.2645              34             0
## 6       15.4830              32             0
#observations
dim(df2)
## [1] 48 15
#structure
str(df2)
## 'data.frame':    48 obs. of  15 variables:
##  $ tperiod           : chr  "15-Jan" "15-Feb" "15-Mar" "15-Apr" ...
##  $ sales_unitboxes   : num  5516689 5387496 5886747 6389182 6448275 ...
##  $ consumer_sentiment: num  38.1 37.5 38.5 37.8 38 ...
##  $ CPI               : num  87.1 87.3 87.6 87.4 87 ...
##  $ inflation_rate    : num  -0.09 0.19 0.41 -0.26 -0.5 0.17 0.15 0.21 0.37 0.51 ...
##  $ unemp_rate        : num  0.0523 0.0531 0.0461 0.051 0.0552 0.0507 0.0542 0.0547 0.0538 0.0539 ...
##  $ gdp_percapita     : num  11660 11660 11660 11626 11626 ...
##  $ itaee             : num  104 104 104 108 108 ...
##  $ itaee_growth      : num  0.0497 0.0497 0.0497 0.0318 0.0318 0.0318 0.0565 0.0565 0.0565 0.0056 ...
##  $ pop_density       : num  98.5 98.5 98.5 98.8 98.8 ...
##  $ job_density       : num  18.3 18.5 18.6 18.7 18.7 ...
##  $ pop_minwage       : num  9.66 9.66 9.66 9.59 9.59 ...
##  $ exchange_rate     : num  14.7 14.9 15.2 15.2 15.3 ...
##  $ max_temperature   : int  28 31 29 32 34 32 29 29 29 29 ...
##  $ holiday_month     : int  0 0 0 1 0 0 0 0 1 0 ...
#missing values
sum(is.na(df2))
## [1] 0
#there are no missing values
colnames(df2)
##  [1] "tperiod"            "sales_unitboxes"    "consumer_sentiment"
##  [4] "CPI"                "inflation_rate"     "unemp_rate"        
##  [7] "gdp_percapita"      "itaee"              "itaee_growth"      
## [10] "pop_density"        "job_density"        "pop_minwage"       
## [13] "exchange_rate"      "max_temperature"    "holiday_month"
#- Include summary of descriptive statistics. What is the mean, min, and max values of the dependent variable? 
#dependent value: sales_unitboxes
summary(df2)
##    tperiod          sales_unitboxes   consumer_sentiment      CPI        
##  Length:48          Min.   :5301755   Min.   :28.67      Min.   : 86.97  
##  Class :character   1st Qu.:6171767   1st Qu.:35.64      1st Qu.: 89.18  
##  Mode  :character   Median :6461357   Median :36.76      Median : 92.82  
##                     Mean   :6473691   Mean   :37.15      Mean   : 93.40  
##                     3rd Qu.:6819782   3rd Qu.:38.14      3rd Qu.: 98.40  
##                     Max.   :7963063   Max.   :44.87      Max.   :103.02  
##  inflation_rate      unemp_rate      gdp_percapita       itaee      
##  Min.   :-0.5000   Min.   :0.03470   Min.   :11559   Min.   :103.8  
##  1st Qu.: 0.1650   1st Qu.:0.04010   1st Qu.:11830   1st Qu.:111.5  
##  Median : 0.3850   Median :0.04370   Median :12014   Median :113.5  
##  Mean   : 0.3485   Mean   :0.04442   Mean   :11979   Mean   :113.9  
##  3rd Qu.: 0.5575   3rd Qu.:0.04895   3rd Qu.:12162   3rd Qu.:117.1  
##  Max.   : 1.7000   Max.   :0.05520   Max.   :12329   Max.   :122.5  
##   itaee_growth      pop_density      job_density     pop_minwage    
##  Min.   :0.00560   Min.   : 98.54   Min.   :18.26   Min.   : 9.398  
##  1st Qu.:0.02237   1st Qu.: 99.61   1st Qu.:19.28   1st Qu.:10.794  
##  Median :0.02995   Median :100.67   Median :20.39   Median :11.139  
##  Mean   :0.03172   Mean   :100.65   Mean   :20.38   Mean   :11.116  
##  3rd Qu.:0.04300   3rd Qu.:101.69   3rd Qu.:21.60   3rd Qu.:11.413  
##  Max.   :0.05650   Max.   :102.69   Max.   :22.36   Max.   :13.026  
##  exchange_rate   max_temperature holiday_month 
##  Min.   :14.69   Min.   :26.00   Min.   :0.00  
##  1st Qu.:17.38   1st Qu.:29.00   1st Qu.:0.00  
##  Median :18.62   Median :30.00   Median :0.00  
##  Mean   :18.18   Mean   :30.50   Mean   :0.25  
##  3rd Qu.:19.06   3rd Qu.:32.25   3rd Qu.:0.25  
##  Max.   :21.39   Max.   :37.00   Max.   :1.00
# DATA VISUALIZATION 

#Show at least 4-5 plots including pair-wised graphs, frequency plots, correlation matrix plots, scatter plots, box plots, and / or histogram plots, etc.

#histogram of the dependent variable
hist(df2$sales_unitboxes) 

hist(log(df2$sales_unitboxes)) 

# plot 2: temperature and sales
plot1 <- ggplot(df2, aes(x = max_temperature, y = sales_unitboxes)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue")
  labs(x = "Weather", y = "Sales")
## $x
## [1] "Weather"
## 
## $y
## [1] "Sales"
## 
## attr(,"class")
## [1] "labels"
plot1
## `geom_smooth()` using formula = 'y ~ x'

#plot 3: inflation and sales
plot2 <- ggplot(df2, aes(x = inflation_rate, y = sales_unitboxes)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue")
  labs(x = "Inflation", y = "Sales")
## $x
## [1] "Inflation"
## 
## $y
## [1] "Sales"
## 
## attr(,"class")
## [1] "labels"
plot2
## `geom_smooth()` using formula = 'y ~ x'

#graph 4
numeric_df <- df2[, sapply(df2, is.numeric)]
cor_matrix <- cor(numeric_df)
corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust")

#Select 1-2 plots and briefly describe the data patterns. 

#The graph of the sales vs the temperature has a positive trend, so as the temperature increases the sales in coca cola increase as well

#the graph of sales vs inflation has a negative trend, so this is the contrary of the last graph when the inflation increases the sales decreases.
#LINEAR REGRESSION MODEL SPECIFICATION 

#Briefly describe the hypotheses statements. That is, what is the expected impact of the explanatory variable X on the dependent variable Y.

#HYPOTHESIS 1: with higher temperature (x) higher number of sales (y)

#With a regression model we will find out the relation between the two variables and how sales acts when the temperature increases

#HYPOTHESIS 2: with higher inflation (x) the number of sales decrease (y)

#With a regression model we will find out the relation between the two variables and how sales acts when the inflation decreases
#Estimate 2 different multiple linear regression model specifications. 

#Linear Regression Analysis 

#model 1
model1<-lm(sales_unitboxes ~ consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,data=df2)
summary(model1)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + 
##     unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density + 
##     job_density + pop_minwage + exchange_rate + max_temperature + 
##     holiday_month, data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -510267 -226241  -10274  198534  757335 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -94493166   49437038  -1.911   0.0644 .  
## consumer_sentiment     48099      28019   1.717   0.0951 .  
## CPI                  -143717     101440  -1.417   0.1656    
## inflation_rate         31729     282054   0.112   0.9111    
## unemp_rate         -10040675   19802582  -0.507   0.6154    
## gdp_percapita          -2555       1465  -1.744   0.0901 .  
## itaee                  15152      84276   0.180   0.8584    
## itaee_growth         4087812    5314504   0.769   0.4471    
## pop_density          1437972     761521   1.888   0.0675 .  
## job_density          -459448     482871  -0.951   0.3481    
## pop_minwage           182577     138433   1.319   0.1960    
## exchange_rate         -57940     117995  -0.491   0.6266    
## max_temperature       177016      38674   4.577 6.03e-05 ***
## holiday_month         199879     143614   1.392   0.1730    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 365700 on 34 degrees of freedom
## Multiple R-squared:  0.7301, Adjusted R-squared:  0.6269 
## F-statistic: 7.075 on 13 and 34 DF,  p-value: 2.23e-06
#model 2
model2<-lm(log(sales_unitboxes) ~ consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,data=df2)
summary(model2)
## 
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI + 
##     inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth + 
##     pop_density + job_density + pop_minwage + exchange_rate + 
##     max_temperature + holiday_month, data = df2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.080799 -0.036264  0.000068  0.032301  0.113440 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.0325512  7.6945736   0.004   0.9966    
## consumer_sentiment  0.0074322  0.0043610   1.704   0.0975 .  
## CPI                -0.0232788  0.0157885  -1.474   0.1496    
## inflation_rate      0.0101989  0.0439000   0.232   0.8177    
## unemp_rate         -1.7652579  3.0821512  -0.573   0.5706    
## gdp_percapita      -0.0003718  0.0002280  -1.631   0.1121    
## itaee               0.0046038  0.0131170   0.351   0.7278    
## itaee_growth        0.5739571  0.8271701   0.694   0.4925    
## pop_density         0.2196249  0.1185261   1.853   0.0726 .  
## job_density        -0.0731622  0.0751560  -0.973   0.3372    
## pop_minwage         0.0260853  0.0215462   1.211   0.2344    
## exchange_rate      -0.0124060  0.0183652  -0.676   0.5039    
## max_temperature     0.0276677  0.0060193   4.596 5.69e-05 ***
## holiday_month       0.0360324  0.0223527   1.612   0.1162    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05692 on 34 degrees of freedom
## Multiple R-squared:  0.7293, Adjusted R-squared:  0.6259 
## F-statistic: 7.048 on 13 and 34 DF,  p-value: 2.327e-06
#model 3
model3<-lm(log(sales_unitboxes) ~ max_temperature+I(max_temperature^2)+consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,data=df2)
summary(model3)
## 
## Call:
## lm(formula = log(sales_unitboxes) ~ max_temperature + I(max_temperature^2) + 
##     consumer_sentiment + CPI + inflation_rate + unemp_rate + 
##     gdp_percapita + itaee + itaee_growth + pop_density + job_density + 
##     pop_minwage + exchange_rate + max_temperature + holiday_month, 
##     data = df2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.068167 -0.033712 -0.002205  0.027976  0.126712 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           3.0536976  7.3459021   0.416   0.6803  
## max_temperature      -0.1786356  0.0882293  -2.025   0.0511 .
## I(max_temperature^2)  0.0033236  0.0014185   2.343   0.0253 *
## consumer_sentiment    0.0078694  0.0041030   1.918   0.0638 .
## CPI                  -0.0372613  0.0159940  -2.330   0.0261 *
## inflation_rate        0.0610605  0.0466217   1.310   0.1993  
## unemp_rate           -2.0846749  2.9000105  -0.719   0.4773  
## gdp_percapita        -0.0001563  0.0002332  -0.670   0.5074  
## itaee                 0.0097000  0.0125186   0.775   0.4439  
## itaee_growth          0.8083269  0.7838363   1.031   0.3099  
## pop_density           0.2004718  0.1116980   1.795   0.0819 .
## job_density          -0.0393874  0.0720922  -0.546   0.5885  
## pop_minwage           0.0181667  0.0205306   0.885   0.3826  
## exchange_rate        -0.0344324  0.0196547  -1.752   0.0891 .
## holiday_month         0.0417181  0.0211482   1.973   0.0570 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0535 on 33 degrees of freedom
## Multiple R-squared:  0.768,  Adjusted R-squared:  0.6695 
## F-statistic: 7.801 on 14 and 33 DF,  p-value: 6.755e-07
library(caret)
library(glmnet)
# LASSO
set.seed(123)
training.samples<-df2$sales_unitboxes %>%
  createDataPartition(p=0.75,list=FALSE)

train.data<-df2[training.samples, ] 
test.data<-df2[-training.samples, ]  

x<-model.matrix(log(sales_unitboxes) ~ consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,train.data)[,-1] 
y<-train.data$sales_unitboxes

set.seed(123) 
cv.lasso <- cv.glmnet(x,y,alpha=1)

lambda_min <- cv.lasso$lambda.min     

lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)

coef(lassomodel)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                              s0
## (Intercept)         3139745.539
## consumer_sentiment    43610.950
## CPI                       .    
## inflation_rate      -548715.710
## unemp_rate                .    
## gdp_percapita         -1041.962
## itaee                 73956.372
## itaee_growth       -2300874.776
## pop_density               .    
## job_density               .    
## pop_minwage           20108.452
## exchange_rate         54798.528
## max_temperature      156590.842
## holiday_month        132714.345
x.test<-model.matrix(log(sales_unitboxes) ~consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()

# Model Accuracy
data.frame(
  RMSE = RMSE(lassopredictions, test.data$sales_unitboxes),
  Rsquare = R2(lassopredictions, test.data$sales_unitboxes))
##       RMSE   Rsquare
## 1 501137.9 0.3671956
# Graph 
lbs_fun <- function(fit, offset_x=1, ...) {
  L <- length(fit$lambda)
  x <- log(fit$lambda[L])+ offset_x
  y <- fit$beta[, L]
  labs <- names(y)
  text(x, y, labels=labs, ...)
}

lasso<-glmnet(scale(x),y,alpha=1)

plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)

#RESULTS ANALYSIS

#Evaluate each regression model using model diagnostics (e.g., multicollinearity and heteroscedasticity).

#model 1

vif(model1)#multicollinearity
## consumer_sentiment                CPI     inflation_rate         unemp_rate 
##           2.302975          92.514952           4.243659           4.752364 
##      gdp_percapita              itaee       itaee_growth        pop_density 
##          47.611019          56.391338           2.117225         339.086814 
##        job_density        pop_minwage      exchange_rate    max_temperature 
##         128.394120           6.788227          12.621039           3.645717 
##      holiday_month 
##           1.387939
bptest(model1)#heteroscedasticity
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 15.379, df = 13, p-value = 0.2843
AIC(model1)#serial correlation
## [1] 1379.386
histogram(model1$residuals)

summary(model1)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate + 
##     unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density + 
##     job_density + pop_minwage + exchange_rate + max_temperature + 
##     holiday_month, data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -510267 -226241  -10274  198534  757335 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -94493166   49437038  -1.911   0.0644 .  
## consumer_sentiment     48099      28019   1.717   0.0951 .  
## CPI                  -143717     101440  -1.417   0.1656    
## inflation_rate         31729     282054   0.112   0.9111    
## unemp_rate         -10040675   19802582  -0.507   0.6154    
## gdp_percapita          -2555       1465  -1.744   0.0901 .  
## itaee                  15152      84276   0.180   0.8584    
## itaee_growth         4087812    5314504   0.769   0.4471    
## pop_density          1437972     761521   1.888   0.0675 .  
## job_density          -459448     482871  -0.951   0.3481    
## pop_minwage           182577     138433   1.319   0.1960    
## exchange_rate         -57940     117995  -0.491   0.6266    
## max_temperature       177016      38674   4.577 6.03e-05 ***
## holiday_month         199879     143614   1.392   0.1730    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 365700 on 34 degrees of freedom
## Multiple R-squared:  0.7301, Adjusted R-squared:  0.6269 
## F-statistic: 7.075 on 13 and 34 DF,  p-value: 2.23e-06
predicted_values <- predict(model1)
rmse <- sqrt(mean((df2$sales_unitboxes - predicted_values)^2))
rmse #the smaller RMSE that indicates the better the model.
## [1] 307789
#Model 2

vif(model2)#multicollinearity
## consumer_sentiment                CPI     inflation_rate         unemp_rate 
##           2.302975          92.514952           4.243659           4.752364 
##      gdp_percapita              itaee       itaee_growth        pop_density 
##          47.611019          56.391338           2.117225         339.086814 
##        job_density        pop_minwage      exchange_rate    max_temperature 
##         128.394120           6.788227          12.621039           3.645717 
##      holiday_month 
##           1.387939
bptest(model2)#heteroscedasticity
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 15.774, df = 13, p-value = 0.2615
AIC(model2)#serial correlation
## [1] -125.4803
histogram(model2$residuals)

summary(model2)
## 
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI + 
##     inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth + 
##     pop_density + job_density + pop_minwage + exchange_rate + 
##     max_temperature + holiday_month, data = df2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.080799 -0.036264  0.000068  0.032301  0.113440 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.0325512  7.6945736   0.004   0.9966    
## consumer_sentiment  0.0074322  0.0043610   1.704   0.0975 .  
## CPI                -0.0232788  0.0157885  -1.474   0.1496    
## inflation_rate      0.0101989  0.0439000   0.232   0.8177    
## unemp_rate         -1.7652579  3.0821512  -0.573   0.5706    
## gdp_percapita      -0.0003718  0.0002280  -1.631   0.1121    
## itaee               0.0046038  0.0131170   0.351   0.7278    
## itaee_growth        0.5739571  0.8271701   0.694   0.4925    
## pop_density         0.2196249  0.1185261   1.853   0.0726 .  
## job_density        -0.0731622  0.0751560  -0.973   0.3372    
## pop_minwage         0.0260853  0.0215462   1.211   0.2344    
## exchange_rate      -0.0124060  0.0183652  -0.676   0.5039    
## max_temperature     0.0276677  0.0060193   4.596 5.69e-05 ***
## holiday_month       0.0360324  0.0223527   1.612   0.1162    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05692 on 34 degrees of freedom
## Multiple R-squared:  0.7293, Adjusted R-squared:  0.6259 
## F-statistic: 7.048 on 13 and 34 DF,  p-value: 2.327e-06
predicted_values <- predict(model2)
rmse <- sqrt(mean((df2$sales_unitboxes - predicted_values)^2))
rmse #the smaller RMSE that indicates the better the model.
## [1] 6500729
#Model 3

#multicollinearity
vif(model3)
##      max_temperature I(max_temperature^2)   consumer_sentiment 
##           886.706958           894.657085             2.307749 
##                  CPI       inflation_rate           unemp_rate 
##           107.477224             5.418247             4.762888 
##        gdp_percapita                itaee         itaee_growth 
##            56.387163            58.146538             2.152274 
##          pop_density          job_density          pop_minwage 
##           340.912632           133.740944             6.977286 
##        exchange_rate        holiday_month 
##            16.364620             1.406456
#heteroscedasticity
bptest(model3)
## 
##  studentized Breusch-Pagan test
## 
## data:  model3
## BP = 13.841, df = 14, p-value = 0.4616
#serial correlation
AIC(model3)
## [1] -130.8671
histogram(model3$residuals)

summary(model3)
## 
## Call:
## lm(formula = log(sales_unitboxes) ~ max_temperature + I(max_temperature^2) + 
##     consumer_sentiment + CPI + inflation_rate + unemp_rate + 
##     gdp_percapita + itaee + itaee_growth + pop_density + job_density + 
##     pop_minwage + exchange_rate + max_temperature + holiday_month, 
##     data = df2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.068167 -0.033712 -0.002205  0.027976  0.126712 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           3.0536976  7.3459021   0.416   0.6803  
## max_temperature      -0.1786356  0.0882293  -2.025   0.0511 .
## I(max_temperature^2)  0.0033236  0.0014185   2.343   0.0253 *
## consumer_sentiment    0.0078694  0.0041030   1.918   0.0638 .
## CPI                  -0.0372613  0.0159940  -2.330   0.0261 *
## inflation_rate        0.0610605  0.0466217   1.310   0.1993  
## unemp_rate           -2.0846749  2.9000105  -0.719   0.4773  
## gdp_percapita        -0.0001563  0.0002332  -0.670   0.5074  
## itaee                 0.0097000  0.0125186   0.775   0.4439  
## itaee_growth          0.8083269  0.7838363   1.031   0.3099  
## pop_density           0.2004718  0.1116980   1.795   0.0819 .
## job_density          -0.0393874  0.0720922  -0.546   0.5885  
## pop_minwage           0.0181667  0.0205306   0.885   0.3826  
## exchange_rate        -0.0344324  0.0196547  -1.752   0.0891 .
## holiday_month         0.0417181  0.0211482   1.973   0.0570 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0535 on 33 degrees of freedom
## Multiple R-squared:  0.768,  Adjusted R-squared:  0.6695 
## F-statistic: 7.801 on 14 and 33 DF,  p-value: 6.755e-07
predicted_values <- predict(model3)
rmse <- sqrt(mean((df2$sales_unitboxes - predicted_values)^2))
rmse #the smaller RMSE that indicates the better the model.
## [1] 6500729
# Detect Heteroscedasticity 
boxplot(formula=sales_unitboxes ~ tperiod, data=df2, col=cm.colors(3))

#Checking all the factors of the models which are: 

# Model 1
# R^2 adjusted: 0.6269  -> means that approximately 62.7% of the variance in "sales_unitboxes" is explained by the model.
# RSME: 307789
# Multicollinearity: yes, present in CPI,  pop_density, job_density,  gdp_percapita, exchange_rate, and itaee
# Significant Variables: "max_temperature" is highly significant (99%), while several other variables show significance at different levels.
# Heteroscedasticity Test (Breusch-Pagan): Not significant with p value = 0.2843 (p > 0.05)

# Model 2
# R^2 adjusted: 0.6259  -> means that approximately 62.6% of the variance in "sales_unitboxes" is explained by the model.
# RSME: 6500729
# Multicollinearity: yes, present in CPI,  pop_density, job_density,  gdp_percapita, exchange_rate, and itaee
# Significant Variables: "max_temperature" is highly significant (99%), while several other variables show significance at different levels.
# Heteroscedasticity Test (Breusch-Pagan): Not significant with p value = 0.2615 (p > 0.05)

# Model 3
# R^2 adjusted: 0.6695  -> means that approximately 66.9% of the variance in "sales_unitboxes" is explained by the model.
# RSME: 6500729
# Multicollinearity: yes, present in max_temperature, job_density , gdp_percapita, I(max_temperature^2), itaee, exchange_rate , CPI,  pop_density 
# Significant Variables: "I(max_temperature^2)"  and "CPI" have  significant at 95%, 

# Heteroscedasticity Test (Breusch-Pagan): Not significant with p value = 0.4616 (p > 0.05)



# the model that was selected was numer 1 because it has the highest adjusted R^2 and the RMSE is lower than model 2, also because the model it self has a p value of 2.23e-06 making it highly significant and with the R^2 saying that approximately 62.7% of the variance in "sales_unitboxes" is explained in the model.
#eliminating multicolleanility from the model

# We use model 1 without the variables with higher multicollinearity and only with the variables that are better for the 3 hypotheses

model1_new<-lm(sales_unitboxes ~ consumer_sentiment+inflation_rate+unemp_rate+itaee_growth+job_density+pop_minwage+max_temperature+holiday_month,data=df2)
summary(model1_new)
## 
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + inflation_rate + 
##     unemp_rate + itaee_growth + job_density + pop_minwage + max_temperature + 
##     holiday_month, data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -921096 -275353   60421  253451  809968 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -5157882    2740952  -1.882 0.067344 .  
## consumer_sentiment    54843      28413   1.930 0.060880 .  
## inflation_rate         1068     245535   0.004 0.996550    
## unemp_rate         23704837   20504985   1.156 0.254693    
## itaee_growth       -3277396    4917279  -0.667 0.509012    
## job_density          212407     132612   1.602 0.117289    
## pop_minwage          -21655     131015  -0.165 0.869572    
## max_temperature      148443      34677   4.281 0.000117 ***
## holiday_month        119235     152334   0.783 0.438517    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 438300 on 39 degrees of freedom
## Multiple R-squared:  0.5553, Adjusted R-squared:  0.464 
## F-statistic: 6.087 on 8 and 39 DF,  p-value: 4.377e-05
#New VIF with the variables that are important and without multicollinearity (VIF lower than 10)
vif(model1_new)
## consumer_sentiment     inflation_rate         unemp_rate       itaee_growth 
##           1.648519           2.238626           3.547025           1.261742 
##        job_density        pop_minwage    max_temperature      holiday_month 
##           6.741088           4.232533           2.040329           1.087044
modelo1 <- lm(sales_unitboxes ~ consumer_sentiment+inflation_rate+unemp_rate+itaee_growth+job_density+pop_minwage+max_temperature+holiday_month,data=df2)

colors <- c("blue", "green", "red", "purple")

par(mfrow=c(2,2)) 

plot(modelo1, which = 1, col = colors[1], pch = 19)  

plot(modelo1, which = 2, col = colors[2], pch = 19)

plot(modelo1, which = 3, col = colors[3], pch = 19)

plot(modelo1, which = 5, col = colors[4], pch = 19)

#Interpret the regression results of selected regression model. That is, describe how much is the impact of the explanatory variables X’s on the dependent variable Y. 

#For the hypothesis 1:  with higher temperature (x) higher number of sales (y) we can see in the model that with the temperature increasing 1 unit the sales would increase 148443, and for the hypothesis 2 with an increase of 1 unit in inflation the sales increase in $1068. So both hypotheis are accepted by the model.