# loading required libraries
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(dplyr)
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(zoo)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(stats)
library(forecast)
library(astsa)
##
## Attaching package: 'astsa'
## The following object is masked from 'package:forecast':
##
## gas
library(corrplot)
## corrplot 0.92 loaded
library(AER)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: lmtest
## Loading required package: sandwich
## Loading required package: survival
library(vars)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: strucchange
## Loading required package: urca
library(dynlm)
library(vars)
library(TSstudio)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ stringr::boundary() masks strucchange::boundary()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks xts::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks xts::last()
## ✖ car::recode() masks dplyr::recode()
## ✖ MASS::select() masks dplyr::select()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sarima)
## Loading required package: stats4
##
## Attaching package: 'sarima'
##
## The following object is masked from 'package:astsa':
##
## sarima
##
## The following object is masked from 'package:stats':
##
## spectrum
library(dygraphs)
library(markdown)
library(foreign)
library(forcats)
library(ggplot2)
library(readr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## The following object is masked from 'package:car':
##
## logit
##
## The following object is masked from 'package:astsa':
##
## scatter.hist
library(naniar)
library(dlookr)
##
## Attaching package: 'dlookr'
##
## The following object is masked from 'package:psych':
##
## describe
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following object is masked from 'package:vars':
##
## normality
##
## The following object is masked from 'package:base':
##
## transform
library(corrplot)
library(jtools)
##
## Attaching package: 'jtools'
##
## The following object is masked from 'package:Hmisc':
##
## %nin%
library(lmtest)
library(car)
library(olsrr)
##
## Attaching package: 'olsrr'
##
## The following object is masked from 'package:MASS':
##
## cement
##
## The following object is masked from 'package:datasets':
##
## rivers
library(naniar)
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effects)
## Registered S3 method overwritten by 'survey':
## method from
## summary.pps dlookr
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
##
## The following object is masked from 'package:survival':
##
## cluster
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(corrplot)
library(zoo)
# import dataset
df2<-read.csv("/Users/valeriacantulobo/Downloads/coca_cola_sales.csv")
#EXPLORATORY DATA ANALYSIS
# Briefly describe the dataset. For example, what is the structure of the dataset? How many observations include the dataset? Is there any presence of missing values in the dataset?
head(df2)
## tperiod sales_unitboxes consumer_sentiment CPI inflation_rate unemp_rate
## 1 15-Jan 5516689 38.063 87.110 -0.09 0.0523
## 2 15-Feb 5387496 37.491 87.275 0.19 0.0531
## 3 15-Mar 5886747 38.505 87.631 0.41 0.0461
## 4 15-Apr 6389182 37.843 87.404 -0.26 0.0510
## 5 15-May 6448275 38.032 86.967 -0.50 0.0552
## 6 15-Jun 6697947 39.112 87.113 0.17 0.0507
## gdp_percapita itaee itaee_growth pop_density job_density pop_minwage
## 1 11659.56 103.7654 0.0497 98.5418 18.2605 9.6579
## 2 11659.55 103.7654 0.0497 98.5419 18.4633 9.6579
## 3 11659.55 103.7654 0.0497 98.5419 18.6416 9.6579
## 4 11625.75 107.7518 0.0318 98.8284 18.6788 9.5949
## 5 11625.74 107.7518 0.0318 98.8284 18.6754 9.5949
## 6 11625.74 107.7518 0.0318 98.8285 18.6467 9.5949
## exchange_rate max_temperature holiday_month
## 1 14.6926 28 0
## 2 14.9213 31 0
## 3 15.2283 29 0
## 4 15.2262 32 1
## 5 15.2645 34 0
## 6 15.4830 32 0
#observations
dim(df2)
## [1] 48 15
#structure
str(df2)
## 'data.frame': 48 obs. of 15 variables:
## $ tperiod : chr "15-Jan" "15-Feb" "15-Mar" "15-Apr" ...
## $ sales_unitboxes : num 5516689 5387496 5886747 6389182 6448275 ...
## $ consumer_sentiment: num 38.1 37.5 38.5 37.8 38 ...
## $ CPI : num 87.1 87.3 87.6 87.4 87 ...
## $ inflation_rate : num -0.09 0.19 0.41 -0.26 -0.5 0.17 0.15 0.21 0.37 0.51 ...
## $ unemp_rate : num 0.0523 0.0531 0.0461 0.051 0.0552 0.0507 0.0542 0.0547 0.0538 0.0539 ...
## $ gdp_percapita : num 11660 11660 11660 11626 11626 ...
## $ itaee : num 104 104 104 108 108 ...
## $ itaee_growth : num 0.0497 0.0497 0.0497 0.0318 0.0318 0.0318 0.0565 0.0565 0.0565 0.0056 ...
## $ pop_density : num 98.5 98.5 98.5 98.8 98.8 ...
## $ job_density : num 18.3 18.5 18.6 18.7 18.7 ...
## $ pop_minwage : num 9.66 9.66 9.66 9.59 9.59 ...
## $ exchange_rate : num 14.7 14.9 15.2 15.2 15.3 ...
## $ max_temperature : int 28 31 29 32 34 32 29 29 29 29 ...
## $ holiday_month : int 0 0 0 1 0 0 0 0 1 0 ...
#missing values
sum(is.na(df2))
## [1] 0
#there are no missing values
colnames(df2)
## [1] "tperiod" "sales_unitboxes" "consumer_sentiment"
## [4] "CPI" "inflation_rate" "unemp_rate"
## [7] "gdp_percapita" "itaee" "itaee_growth"
## [10] "pop_density" "job_density" "pop_minwage"
## [13] "exchange_rate" "max_temperature" "holiday_month"
#- Include summary of descriptive statistics. What is the mean, min, and max values of the dependent variable?
#dependent value: sales_unitboxes
summary(df2)
## tperiod sales_unitboxes consumer_sentiment CPI
## Length:48 Min. :5301755 Min. :28.67 Min. : 86.97
## Class :character 1st Qu.:6171767 1st Qu.:35.64 1st Qu.: 89.18
## Mode :character Median :6461357 Median :36.76 Median : 92.82
## Mean :6473691 Mean :37.15 Mean : 93.40
## 3rd Qu.:6819782 3rd Qu.:38.14 3rd Qu.: 98.40
## Max. :7963063 Max. :44.87 Max. :103.02
## inflation_rate unemp_rate gdp_percapita itaee
## Min. :-0.5000 Min. :0.03470 Min. :11559 Min. :103.8
## 1st Qu.: 0.1650 1st Qu.:0.04010 1st Qu.:11830 1st Qu.:111.5
## Median : 0.3850 Median :0.04370 Median :12014 Median :113.5
## Mean : 0.3485 Mean :0.04442 Mean :11979 Mean :113.9
## 3rd Qu.: 0.5575 3rd Qu.:0.04895 3rd Qu.:12162 3rd Qu.:117.1
## Max. : 1.7000 Max. :0.05520 Max. :12329 Max. :122.5
## itaee_growth pop_density job_density pop_minwage
## Min. :0.00560 Min. : 98.54 Min. :18.26 Min. : 9.398
## 1st Qu.:0.02237 1st Qu.: 99.61 1st Qu.:19.28 1st Qu.:10.794
## Median :0.02995 Median :100.67 Median :20.39 Median :11.139
## Mean :0.03172 Mean :100.65 Mean :20.38 Mean :11.116
## 3rd Qu.:0.04300 3rd Qu.:101.69 3rd Qu.:21.60 3rd Qu.:11.413
## Max. :0.05650 Max. :102.69 Max. :22.36 Max. :13.026
## exchange_rate max_temperature holiday_month
## Min. :14.69 Min. :26.00 Min. :0.00
## 1st Qu.:17.38 1st Qu.:29.00 1st Qu.:0.00
## Median :18.62 Median :30.00 Median :0.00
## Mean :18.18 Mean :30.50 Mean :0.25
## 3rd Qu.:19.06 3rd Qu.:32.25 3rd Qu.:0.25
## Max. :21.39 Max. :37.00 Max. :1.00
# DATA VISUALIZATION
#Show at least 4-5 plots including pair-wised graphs, frequency plots, correlation matrix plots, scatter plots, box plots, and / or histogram plots, etc.
#histogram of the dependent variable
hist(df2$sales_unitboxes)

hist(log(df2$sales_unitboxes))

# plot 2: temperature and sales
plot1 <- ggplot(df2, aes(x = max_temperature, y = sales_unitboxes)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue")
labs(x = "Weather", y = "Sales")
## $x
## [1] "Weather"
##
## $y
## [1] "Sales"
##
## attr(,"class")
## [1] "labels"
plot1
## `geom_smooth()` using formula = 'y ~ x'

#plot 3: inflation and sales
plot2 <- ggplot(df2, aes(x = inflation_rate, y = sales_unitboxes)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue")
labs(x = "Inflation", y = "Sales")
## $x
## [1] "Inflation"
##
## $y
## [1] "Sales"
##
## attr(,"class")
## [1] "labels"
plot2
## `geom_smooth()` using formula = 'y ~ x'

#graph 4
numeric_df <- df2[, sapply(df2, is.numeric)]
cor_matrix <- cor(numeric_df)
corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust")

#Select 1-2 plots and briefly describe the data patterns.
#The graph of the sales vs the temperature has a positive trend, so as the temperature increases the sales in coca cola increase as well
#the graph of sales vs inflation has a negative trend, so this is the contrary of the last graph when the inflation increases the sales decreases.
#LINEAR REGRESSION MODEL SPECIFICATION
#Briefly describe the hypotheses statements. That is, what is the expected impact of the explanatory variable X on the dependent variable Y.
#HYPOTHESIS 1: with higher temperature (x) higher number of sales (y)
#With a regression model we will find out the relation between the two variables and how sales acts when the temperature increases
#HYPOTHESIS 2: with higher inflation (x) the number of sales decrease (y)
#With a regression model we will find out the relation between the two variables and how sales acts when the inflation decreases
#Estimate 2 different multiple linear regression model specifications.
#Linear Regression Analysis
#model 1
model1<-lm(sales_unitboxes ~ consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,data=df2)
summary(model1)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate +
## unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density +
## job_density + pop_minwage + exchange_rate + max_temperature +
## holiday_month, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -510267 -226241 -10274 198534 757335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -94493166 49437038 -1.911 0.0644 .
## consumer_sentiment 48099 28019 1.717 0.0951 .
## CPI -143717 101440 -1.417 0.1656
## inflation_rate 31729 282054 0.112 0.9111
## unemp_rate -10040675 19802582 -0.507 0.6154
## gdp_percapita -2555 1465 -1.744 0.0901 .
## itaee 15152 84276 0.180 0.8584
## itaee_growth 4087812 5314504 0.769 0.4471
## pop_density 1437972 761521 1.888 0.0675 .
## job_density -459448 482871 -0.951 0.3481
## pop_minwage 182577 138433 1.319 0.1960
## exchange_rate -57940 117995 -0.491 0.6266
## max_temperature 177016 38674 4.577 6.03e-05 ***
## holiday_month 199879 143614 1.392 0.1730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 365700 on 34 degrees of freedom
## Multiple R-squared: 0.7301, Adjusted R-squared: 0.6269
## F-statistic: 7.075 on 13 and 34 DF, p-value: 2.23e-06
#model 2
model2<-lm(log(sales_unitboxes) ~ consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,data=df2)
summary(model2)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI +
## inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth +
## pop_density + job_density + pop_minwage + exchange_rate +
## max_temperature + holiday_month, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.080799 -0.036264 0.000068 0.032301 0.113440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0325512 7.6945736 0.004 0.9966
## consumer_sentiment 0.0074322 0.0043610 1.704 0.0975 .
## CPI -0.0232788 0.0157885 -1.474 0.1496
## inflation_rate 0.0101989 0.0439000 0.232 0.8177
## unemp_rate -1.7652579 3.0821512 -0.573 0.5706
## gdp_percapita -0.0003718 0.0002280 -1.631 0.1121
## itaee 0.0046038 0.0131170 0.351 0.7278
## itaee_growth 0.5739571 0.8271701 0.694 0.4925
## pop_density 0.2196249 0.1185261 1.853 0.0726 .
## job_density -0.0731622 0.0751560 -0.973 0.3372
## pop_minwage 0.0260853 0.0215462 1.211 0.2344
## exchange_rate -0.0124060 0.0183652 -0.676 0.5039
## max_temperature 0.0276677 0.0060193 4.596 5.69e-05 ***
## holiday_month 0.0360324 0.0223527 1.612 0.1162
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05692 on 34 degrees of freedom
## Multiple R-squared: 0.7293, Adjusted R-squared: 0.6259
## F-statistic: 7.048 on 13 and 34 DF, p-value: 2.327e-06
#model 3
model3<-lm(log(sales_unitboxes) ~ max_temperature+I(max_temperature^2)+consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,data=df2)
summary(model3)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ max_temperature + I(max_temperature^2) +
## consumer_sentiment + CPI + inflation_rate + unemp_rate +
## gdp_percapita + itaee + itaee_growth + pop_density + job_density +
## pop_minwage + exchange_rate + max_temperature + holiday_month,
## data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.068167 -0.033712 -0.002205 0.027976 0.126712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0536976 7.3459021 0.416 0.6803
## max_temperature -0.1786356 0.0882293 -2.025 0.0511 .
## I(max_temperature^2) 0.0033236 0.0014185 2.343 0.0253 *
## consumer_sentiment 0.0078694 0.0041030 1.918 0.0638 .
## CPI -0.0372613 0.0159940 -2.330 0.0261 *
## inflation_rate 0.0610605 0.0466217 1.310 0.1993
## unemp_rate -2.0846749 2.9000105 -0.719 0.4773
## gdp_percapita -0.0001563 0.0002332 -0.670 0.5074
## itaee 0.0097000 0.0125186 0.775 0.4439
## itaee_growth 0.8083269 0.7838363 1.031 0.3099
## pop_density 0.2004718 0.1116980 1.795 0.0819 .
## job_density -0.0393874 0.0720922 -0.546 0.5885
## pop_minwage 0.0181667 0.0205306 0.885 0.3826
## exchange_rate -0.0344324 0.0196547 -1.752 0.0891 .
## holiday_month 0.0417181 0.0211482 1.973 0.0570 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0535 on 33 degrees of freedom
## Multiple R-squared: 0.768, Adjusted R-squared: 0.6695
## F-statistic: 7.801 on 14 and 33 DF, p-value: 6.755e-07
library(caret)
library(glmnet)
# LASSO
set.seed(123)
training.samples<-df2$sales_unitboxes %>%
createDataPartition(p=0.75,list=FALSE)
train.data<-df2[training.samples, ]
test.data<-df2[-training.samples, ]
x<-model.matrix(log(sales_unitboxes) ~ consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,train.data)[,-1]
y<-train.data$sales_unitboxes
set.seed(123)
cv.lasso <- cv.glmnet(x,y,alpha=1)
lambda_min <- cv.lasso$lambda.min
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)
coef(lassomodel)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 3139745.539
## consumer_sentiment 43610.950
## CPI .
## inflation_rate -548715.710
## unemp_rate .
## gdp_percapita -1041.962
## itaee 73956.372
## itaee_growth -2300874.776
## pop_density .
## job_density .
## pop_minwage 20108.452
## exchange_rate 54798.528
## max_temperature 156590.842
## holiday_month 132714.345
x.test<-model.matrix(log(sales_unitboxes) ~consumer_sentiment+CPI+inflation_rate+unemp_rate+gdp_percapita+itaee+itaee_growth+pop_density+job_density+pop_minwage+exchange_rate+max_temperature+holiday_month,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(lassopredictions, test.data$sales_unitboxes),
Rsquare = R2(lassopredictions, test.data$sales_unitboxes))
## RMSE Rsquare
## 1 501137.9 0.3671956
# Graph
lbs_fun <- function(fit, offset_x=1, ...) {
L <- length(fit$lambda)
x <- log(fit$lambda[L])+ offset_x
y <- fit$beta[, L]
labs <- names(y)
text(x, y, labels=labs, ...)
}
lasso<-glmnet(scale(x),y,alpha=1)
plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)

#RESULTS ANALYSIS
#Evaluate each regression model using model diagnostics (e.g., multicollinearity and heteroscedasticity).
#model 1
vif(model1)#multicollinearity
## consumer_sentiment CPI inflation_rate unemp_rate
## 2.302975 92.514952 4.243659 4.752364
## gdp_percapita itaee itaee_growth pop_density
## 47.611019 56.391338 2.117225 339.086814
## job_density pop_minwage exchange_rate max_temperature
## 128.394120 6.788227 12.621039 3.645717
## holiday_month
## 1.387939
bptest(model1)#heteroscedasticity
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 15.379, df = 13, p-value = 0.2843
AIC(model1)#serial correlation
## [1] 1379.386
histogram(model1$residuals)

summary(model1)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + CPI + inflation_rate +
## unemp_rate + gdp_percapita + itaee + itaee_growth + pop_density +
## job_density + pop_minwage + exchange_rate + max_temperature +
## holiday_month, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -510267 -226241 -10274 198534 757335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -94493166 49437038 -1.911 0.0644 .
## consumer_sentiment 48099 28019 1.717 0.0951 .
## CPI -143717 101440 -1.417 0.1656
## inflation_rate 31729 282054 0.112 0.9111
## unemp_rate -10040675 19802582 -0.507 0.6154
## gdp_percapita -2555 1465 -1.744 0.0901 .
## itaee 15152 84276 0.180 0.8584
## itaee_growth 4087812 5314504 0.769 0.4471
## pop_density 1437972 761521 1.888 0.0675 .
## job_density -459448 482871 -0.951 0.3481
## pop_minwage 182577 138433 1.319 0.1960
## exchange_rate -57940 117995 -0.491 0.6266
## max_temperature 177016 38674 4.577 6.03e-05 ***
## holiday_month 199879 143614 1.392 0.1730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 365700 on 34 degrees of freedom
## Multiple R-squared: 0.7301, Adjusted R-squared: 0.6269
## F-statistic: 7.075 on 13 and 34 DF, p-value: 2.23e-06
predicted_values <- predict(model1)
rmse <- sqrt(mean((df2$sales_unitboxes - predicted_values)^2))
rmse #the smaller RMSE that indicates the better the model.
## [1] 307789
#Model 2
vif(model2)#multicollinearity
## consumer_sentiment CPI inflation_rate unemp_rate
## 2.302975 92.514952 4.243659 4.752364
## gdp_percapita itaee itaee_growth pop_density
## 47.611019 56.391338 2.117225 339.086814
## job_density pop_minwage exchange_rate max_temperature
## 128.394120 6.788227 12.621039 3.645717
## holiday_month
## 1.387939
bptest(model2)#heteroscedasticity
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 15.774, df = 13, p-value = 0.2615
AIC(model2)#serial correlation
## [1] -125.4803
histogram(model2$residuals)

summary(model2)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ consumer_sentiment + CPI +
## inflation_rate + unemp_rate + gdp_percapita + itaee + itaee_growth +
## pop_density + job_density + pop_minwage + exchange_rate +
## max_temperature + holiday_month, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.080799 -0.036264 0.000068 0.032301 0.113440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0325512 7.6945736 0.004 0.9966
## consumer_sentiment 0.0074322 0.0043610 1.704 0.0975 .
## CPI -0.0232788 0.0157885 -1.474 0.1496
## inflation_rate 0.0101989 0.0439000 0.232 0.8177
## unemp_rate -1.7652579 3.0821512 -0.573 0.5706
## gdp_percapita -0.0003718 0.0002280 -1.631 0.1121
## itaee 0.0046038 0.0131170 0.351 0.7278
## itaee_growth 0.5739571 0.8271701 0.694 0.4925
## pop_density 0.2196249 0.1185261 1.853 0.0726 .
## job_density -0.0731622 0.0751560 -0.973 0.3372
## pop_minwage 0.0260853 0.0215462 1.211 0.2344
## exchange_rate -0.0124060 0.0183652 -0.676 0.5039
## max_temperature 0.0276677 0.0060193 4.596 5.69e-05 ***
## holiday_month 0.0360324 0.0223527 1.612 0.1162
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05692 on 34 degrees of freedom
## Multiple R-squared: 0.7293, Adjusted R-squared: 0.6259
## F-statistic: 7.048 on 13 and 34 DF, p-value: 2.327e-06
predicted_values <- predict(model2)
rmse <- sqrt(mean((df2$sales_unitboxes - predicted_values)^2))
rmse #the smaller RMSE that indicates the better the model.
## [1] 6500729
#Model 3
#multicollinearity
vif(model3)
## max_temperature I(max_temperature^2) consumer_sentiment
## 886.706958 894.657085 2.307749
## CPI inflation_rate unemp_rate
## 107.477224 5.418247 4.762888
## gdp_percapita itaee itaee_growth
## 56.387163 58.146538 2.152274
## pop_density job_density pop_minwage
## 340.912632 133.740944 6.977286
## exchange_rate holiday_month
## 16.364620 1.406456
#heteroscedasticity
bptest(model3)
##
## studentized Breusch-Pagan test
##
## data: model3
## BP = 13.841, df = 14, p-value = 0.4616
#serial correlation
AIC(model3)
## [1] -130.8671
histogram(model3$residuals)

summary(model3)
##
## Call:
## lm(formula = log(sales_unitboxes) ~ max_temperature + I(max_temperature^2) +
## consumer_sentiment + CPI + inflation_rate + unemp_rate +
## gdp_percapita + itaee + itaee_growth + pop_density + job_density +
## pop_minwage + exchange_rate + max_temperature + holiday_month,
## data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.068167 -0.033712 -0.002205 0.027976 0.126712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0536976 7.3459021 0.416 0.6803
## max_temperature -0.1786356 0.0882293 -2.025 0.0511 .
## I(max_temperature^2) 0.0033236 0.0014185 2.343 0.0253 *
## consumer_sentiment 0.0078694 0.0041030 1.918 0.0638 .
## CPI -0.0372613 0.0159940 -2.330 0.0261 *
## inflation_rate 0.0610605 0.0466217 1.310 0.1993
## unemp_rate -2.0846749 2.9000105 -0.719 0.4773
## gdp_percapita -0.0001563 0.0002332 -0.670 0.5074
## itaee 0.0097000 0.0125186 0.775 0.4439
## itaee_growth 0.8083269 0.7838363 1.031 0.3099
## pop_density 0.2004718 0.1116980 1.795 0.0819 .
## job_density -0.0393874 0.0720922 -0.546 0.5885
## pop_minwage 0.0181667 0.0205306 0.885 0.3826
## exchange_rate -0.0344324 0.0196547 -1.752 0.0891 .
## holiday_month 0.0417181 0.0211482 1.973 0.0570 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0535 on 33 degrees of freedom
## Multiple R-squared: 0.768, Adjusted R-squared: 0.6695
## F-statistic: 7.801 on 14 and 33 DF, p-value: 6.755e-07
predicted_values <- predict(model3)
rmse <- sqrt(mean((df2$sales_unitboxes - predicted_values)^2))
rmse #the smaller RMSE that indicates the better the model.
## [1] 6500729
# Detect Heteroscedasticity
boxplot(formula=sales_unitboxes ~ tperiod, data=df2, col=cm.colors(3))

#Checking all the factors of the models which are:
# Model 1
# R^2 adjusted: 0.6269 -> means that approximately 62.7% of the variance in "sales_unitboxes" is explained by the model.
# RSME: 307789
# Multicollinearity: yes, present in CPI, pop_density, job_density, gdp_percapita, exchange_rate, and itaee
# Significant Variables: "max_temperature" is highly significant (99%), while several other variables show significance at different levels.
# Heteroscedasticity Test (Breusch-Pagan): Not significant with p value = 0.2843 (p > 0.05)
# Model 2
# R^2 adjusted: 0.6259 -> means that approximately 62.6% of the variance in "sales_unitboxes" is explained by the model.
# RSME: 6500729
# Multicollinearity: yes, present in CPI, pop_density, job_density, gdp_percapita, exchange_rate, and itaee
# Significant Variables: "max_temperature" is highly significant (99%), while several other variables show significance at different levels.
# Heteroscedasticity Test (Breusch-Pagan): Not significant with p value = 0.2615 (p > 0.05)
# Model 3
# R^2 adjusted: 0.6695 -> means that approximately 66.9% of the variance in "sales_unitboxes" is explained by the model.
# RSME: 6500729
# Multicollinearity: yes, present in max_temperature, job_density , gdp_percapita, I(max_temperature^2), itaee, exchange_rate , CPI, pop_density
# Significant Variables: "I(max_temperature^2)" and "CPI" have significant at 95%,
# Heteroscedasticity Test (Breusch-Pagan): Not significant with p value = 0.4616 (p > 0.05)
# the model that was selected was numer 1 because it has the highest adjusted R^2 and the RMSE is lower than model 2, also because the model it self has a p value of 2.23e-06 making it highly significant and with the R^2 saying that approximately 62.7% of the variance in "sales_unitboxes" is explained in the model.
#eliminating multicolleanility from the model
# We use model 1 without the variables with higher multicollinearity and only with the variables that are better for the 3 hypotheses
model1_new<-lm(sales_unitboxes ~ consumer_sentiment+inflation_rate+unemp_rate+itaee_growth+job_density+pop_minwage+max_temperature+holiday_month,data=df2)
summary(model1_new)
##
## Call:
## lm(formula = sales_unitboxes ~ consumer_sentiment + inflation_rate +
## unemp_rate + itaee_growth + job_density + pop_minwage + max_temperature +
## holiday_month, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -921096 -275353 60421 253451 809968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5157882 2740952 -1.882 0.067344 .
## consumer_sentiment 54843 28413 1.930 0.060880 .
## inflation_rate 1068 245535 0.004 0.996550
## unemp_rate 23704837 20504985 1.156 0.254693
## itaee_growth -3277396 4917279 -0.667 0.509012
## job_density 212407 132612 1.602 0.117289
## pop_minwage -21655 131015 -0.165 0.869572
## max_temperature 148443 34677 4.281 0.000117 ***
## holiday_month 119235 152334 0.783 0.438517
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 438300 on 39 degrees of freedom
## Multiple R-squared: 0.5553, Adjusted R-squared: 0.464
## F-statistic: 6.087 on 8 and 39 DF, p-value: 4.377e-05
#New VIF with the variables that are important and without multicollinearity (VIF lower than 10)
vif(model1_new)
## consumer_sentiment inflation_rate unemp_rate itaee_growth
## 1.648519 2.238626 3.547025 1.261742
## job_density pop_minwage max_temperature holiday_month
## 6.741088 4.232533 2.040329 1.087044
modelo1 <- lm(sales_unitboxes ~ consumer_sentiment+inflation_rate+unemp_rate+itaee_growth+job_density+pop_minwage+max_temperature+holiday_month,data=df2)
colors <- c("blue", "green", "red", "purple")
par(mfrow=c(2,2))
plot(modelo1, which = 1, col = colors[1], pch = 19)
plot(modelo1, which = 2, col = colors[2], pch = 19)
plot(modelo1, which = 3, col = colors[3], pch = 19)
plot(modelo1, which = 5, col = colors[4], pch = 19)

#Interpret the regression results of selected regression model. That is, describe how much is the impact of the explanatory variables X’s on the dependent variable Y.
#For the hypothesis 1: with higher temperature (x) higher number of sales (y) we can see in the model that with the temperature increasing 1 unit the sales would increase 148443, and for the hypothesis 2 with an increase of 1 unit in inflation the sales increase in $1068. So both hypotheis are accepted by the model.