library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble 3.2.1 ✔ dplyr 1.1.2
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ✔ purrr 1.0.1
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.3
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(readr)
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.2.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.3
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(effects)
## Warning: package 'effects' was built under R version 4.2.3
## Use the command
## lattice::trellis.par.set(effectsTheme())
## to customize lattice options for effects plots.
## See ?effectsTheme for details.
library(forcats)
library(ggplot2)
library(readr)
library(janitor)
## Warning: package 'janitor' was built under R version 4.2.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(foreign)
library(dplyr)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.3
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
## The following object is masked from 'package:car':
##
## logit
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar)
## Warning: package 'naniar' was built under R version 4.2.3
library(jtools)
## Warning: package 'jtools' was built under R version 4.2.3
##
## Attaching package: 'jtools'
##
## The following object is masked from 'package:Hmisc':
##
## %nin%
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.2.3
##
## Attaching package: 'olsrr'
##
## The following object is masked from 'package:datasets':
##
## rivers
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
dataframe <- read_excel("D:/JahirBotello/Documents/Carrera LIT/5to Semestre/1er Periodo/Economics/coca_cola_sales.xlsx")
View(dataframe)
str(dataframe)
## tibble [48 × 15] (S3: tbl_df/tbl/data.frame)
## $ tperiod : POSIXct[1:48], format: "2021-01-15" "2021-02-15" ...
## $ sales_unitboxes : num [1:48] 5516689 5387496 5886747 6389182 6448275 ...
## $ consumer_sentiment: num [1:48] 38.1 37.5 38.5 37.8 38 ...
## $ CPI : num [1:48] 87.1 87.3 87.6 87.4 87 ...
## $ inflation_rate : num [1:48] -0.09 0.19 0.41 -0.26 -0.5 0.17 0.15 0.21 0.37 0.51 ...
## $ unemp_rate : num [1:48] 0.0523 0.0531 0.0461 0.051 0.0552 ...
## $ gdp_percapita : num [1:48] 11660 11660 11660 11626 11626 ...
## $ itaee : num [1:48] 104 104 104 108 108 ...
## $ itaee_growth : num [1:48] 0.0497 0.0497 0.0497 0.0318 0.0318 ...
## $ pop_density : num [1:48] 98.5 98.5 98.5 98.8 98.8 ...
## $ job_density : num [1:48] 18.3 18.5 18.6 18.7 18.7 ...
## $ pop_minwage : num [1:48] 9.66 9.66 9.66 9.59 9.59 ...
## $ exchange_rate : num [1:48] 14.7 14.9 15.2 15.2 15.3 ...
## $ max_temperature : num [1:48] 28 31 29 32 34 32 29 29 29 29 ...
## $ holiday_month : num [1:48] 0 0 0 1 0 0 0 0 1 0 ...
# With this code we can show the structure of the dataset.
na_values <- colSums(is.na(dataframe))
na_values
## tperiod sales_unitboxes consumer_sentiment CPI
## 0 0 0 0
## inflation_rate unemp_rate gdp_percapita itaee
## 0 0 0 0
## itaee_growth pop_density job_density pop_minwage
## 0 0 0 0
## exchange_rate max_temperature holiday_month
## 0 0 0
# With this code we can see if there is any missing values in the data set, as we can see we are good to go because we have 0 missing values in every column.
dim(dataframe)
## [1] 48 15
# With this line of code we can see exactly how many rows(observations) and columns (variables) are on the dataset
summary(dataframe)
## tperiod sales_unitboxes consumer_sentiment
## Min. :2021-01-15 00:00:00 Min. :5301755 Min. :28.67
## 1st Qu.:2021-04-08 00:00:00 1st Qu.:6171767 1st Qu.:35.64
## Median :2021-07-01 12:00:00 Median :6461357 Median :36.76
## Mean :2021-07-02 00:00:00 Mean :6473691 Mean :37.15
## 3rd Qu.:2021-09-24 18:00:00 3rd Qu.:6819782 3rd Qu.:38.14
## Max. :2021-12-18 00:00:00 Max. :7963063 Max. :44.87
## CPI inflation_rate unemp_rate gdp_percapita
## Min. : 86.97 Min. :-0.5000 Min. :0.03466 Min. :11559
## 1st Qu.: 89.18 1st Qu.: 0.1650 1st Qu.:0.04010 1st Qu.:11830
## Median : 92.82 Median : 0.3850 Median :0.04369 Median :12014
## Mean : 93.40 Mean : 0.3485 Mean :0.04442 Mean :11979
## 3rd Qu.: 98.40 3rd Qu.: 0.5575 3rd Qu.:0.04897 3rd Qu.:12162
## Max. :103.02 Max. : 1.7000 Max. :0.05517 Max. :12329
## itaee itaee_growth pop_density job_density
## Min. :103.8 Min. :0.005571 Min. : 98.54 Min. :18.26
## 1st Qu.:111.5 1st Qu.:0.022376 1st Qu.: 99.61 1st Qu.:19.28
## Median :113.5 Median :0.029977 Median :100.67 Median :20.39
## Mean :113.9 Mean :0.031736 Mean :100.65 Mean :20.38
## 3rd Qu.:117.1 3rd Qu.:0.043038 3rd Qu.:101.69 3rd Qu.:21.60
## Max. :122.5 Max. :0.056536 Max. :102.69 Max. :22.36
## pop_minwage exchange_rate max_temperature holiday_month
## Min. : 9.398 Min. :14.69 Min. :26.00 Min. :0.00
## 1st Qu.:10.794 1st Qu.:17.38 1st Qu.:29.00 1st Qu.:0.00
## Median :11.139 Median :18.62 Median :30.00 Median :0.00
## Mean :11.116 Mean :18.18 Mean :30.50 Mean :0.25
## 3rd Qu.:11.413 3rd Qu.:19.06 3rd Qu.:32.25 3rd Qu.:0.25
## Max. :13.026 Max. :21.39 Max. :37.00 Max. :1.00
cor_plot<- cor(dataframe[, c('sales_unitboxes', 'consumer_sentiment', 'CPI', 'inflation_rate', 'unemp_rate', 'gdp_percapita', 'itaee', 'itaee_growth', 'pop_density', 'job_density', 'pop_minwage', 'exchange_rate', "max_temperature", "holiday_month")])
corrplot(cor_plot, method="circle")
##Cor Tests
cor.test(dataframe$sales_unitboxes, dataframe$consumer_sentiment)
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$consumer_sentiment
## t = 1.5788, df = 46, p-value = 0.1212
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06137265 0.47993397
## sample estimates:
## cor
## 0.2267155
cor.test(dataframe$sales_unitboxes, dataframe$CPI)
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$CPI
## t = 1.4776, df = 46, p-value = 0.1463
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0758564 0.4686554
## sample estimates:
## cor
## 0.2128663
cor.test(dataframe$sales_unitboxes, dataframe$inflation_rate) #
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$inflation_rate
## t = -2.4433, df = 46, p-value = 0.01845
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.56833286 -0.06063401
## sample estimates:
## cor
## -0.3389296
cor.test(dataframe$sales_unitboxes, dataframe$unemp_rate)
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$unemp_rate
## t = -0.51626, df = 46, p-value = 0.6081
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3524333 0.2128254
## sample estimates:
## cor
## -0.07589903
cor.test(dataframe$sales_unitboxes, dataframe$gdp_percapita)
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$gdp_percapita
## t = 1.4563, df = 46, p-value = 0.1521
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07890143 0.46626148
## sample estimates:
## cor
## 0.2099398
cor.test(dataframe$sales_unitboxes, dataframe$itaee) #
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$itaee
## t = 2.273, df = 46, p-value = 0.02774
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03697264 0.55205884
## sample estimates:
## cor
## 0.3177691
cor.test(dataframe$sales_unitboxes, dataframe$itaee_growth)
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$itaee_growth
## t = -1.6486, df = 46, p-value = 0.106
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.48760745 0.05138638
## sample estimates:
## cor
## -0.2361969
cor.test(dataframe$sales_unitboxes, dataframe$pop_density) #
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$pop_density
## t = 2.105, df = 46, p-value = 0.04078
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.0134141 0.5354560
## sample estimates:
## cor
## 0.296419
cor.test(dataframe$sales_unitboxes, dataframe$job_density) #
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$job_density
## t = 2.0533, df = 46, p-value = 0.04576
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.006118193 0.530231130
## sample estimates:
## cor
## 0.2897492
cor.test(dataframe$sales_unitboxes, dataframe$pop_minwage)
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$pop_minwage
## t = 1.9557, df = 46, p-value = 0.05659
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.007679065 0.520240281
## sample estimates:
## cor
## 0.2770601
cor.test(dataframe$sales_unitboxes, dataframe$exchange_rate)
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$exchange_rate
## t = 1.2236, df = 46, p-value = 0.2273
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1122551 0.4395058
## sample estimates:
## cor
## 0.1775424
cor.test(dataframe$sales_unitboxes, dataframe$max_temperature) #
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$max_temperature
## t = 4.7241, df = 46, p-value = 2.205e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3431381 0.7361368
## sample estimates:
## cor
## 0.5715483
cor.test(dataframe$sales_unitboxes, dataframe$holiday_month)
##
## Pearson's product-moment correlation
##
## data: dataframe$sales_unitboxes and dataframe$holiday_month
## t = 0.17612, df = 46, p-value = 0.861
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2600941 0.3078233
## sample estimates:
## cor
## 0.02595903
hist(dataframe$sales_unitboxes, main="Histogram of Sales", xlab="Values", ylab="Frecuency", col = "lightblue",breaks=5)
shapiro.test(dataframe$sales_unitboxes)
##
## Shapiro-Wilk normality test
##
## data: dataframe$sales_unitboxes
## W = 0.98099, p-value = 0.6203
dataframe$temp_interv <- cut(dataframe$max_temperature, breaks = c(20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40))
sales_temp <- aggregate(dataframe$sales_unitboxes, by=list(dataframe$temp_interv), FUN=median)
ggplot(sales_temp, aes(x=Group.1, y=x)) +
geom_bar(stat='identity', fill="skyblue") +
labs(title="Average Sales based on Temperature",
x="Temperature Intervals",
y="Sales in Pesos") +
theme_minimal()
dataframe$pop_interv <- cut(dataframe$pop_density, breaks = c(98, 98.5, 99, 99.5, 100, 100.5, 101, 101.5, 102, 102.5, 103))
sales_pop <- aggregate(dataframe$sales_unitboxes, by=list(dataframe$pop_interv), FUN=median)
ggplot(sales_pop, aes(x=Group.1, y=x)) +
geom_bar(stat='identity', fill="skyblue") +
labs(title="Average Sales based on Population Density",
x="Population Density Intervals",
y="Sales in Pesos") +
theme_minimal() + coord_flip()
dataframe$inf_interv <- cut(dataframe$inflation_rate, breaks = c(-0.2, -0.1, 0, 0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.6, 1.8, 2.0))
sales_infl <- aggregate(dataframe$sales_unitboxes, by=list(dataframe$inf_interv), FUN=median)
ggplot(sales_infl, aes(x=Group.1, y=x)) +
geom_bar(stat='identity', fill="skyblue") +
labs(title="Average Sales based on Inflation Rate",
x="Inflation Intervals",
y="Sales in Pesos") +
theme_minimal()
# In this case I selected the histogram of my dependent variable, creating a histogram of this variable and together with the shapiro test we can determine the normality of our dependent variable, this is very important because a lot of regresion models assume normality in the data.
#The correlation tests together with correlation plot help us understand which are the independent variables that have more correlation with the dependent variable, this helps us know which variables have more influence in explaining the change in the dependent variable.
# Our null hypothesis is that the model is not valid and it does not explain the change in the dependent variable.
# Alternative Hypothesis the model is indeed valid and it can explain the change in the dependent variable
Model1 <- lm(sales_unitboxes ~ inflation_rate + itaee + pop_density + max_temperature, data = dataframe)
summary(Model1)
##
## Call:
## lm(formula = sales_unitboxes ~ inflation_rate + itaee + pop_density +
## max_temperature, data = dataframe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -891518 -204374 11997 203201 1065038
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12534143 8019378 1.563 0.125387
## inflation_rate -393589 192625 -2.043 0.047182 *
## itaee 130171 31356 4.151 0.000153 ***
## pop_density -248395 112872 -2.201 0.033179 *
## max_temperature 139562 27836 5.014 9.68e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 392200 on 43 degrees of freedom
## Multiple R-squared: 0.6074, Adjusted R-squared: 0.5709
## F-statistic: 16.63 on 4 and 43 DF, p-value: 2.612e-08
avPlots(Model1)
RMSE_M1 <- RMSE(Model1$fitted.values, dataframe$sales_unitboxes)
r2M1 <- summary(Model1)$r.squared
AIC_M1 <- AIC(Model1)
RMSE_M1
## [1] 371205
r2M1
## [1] 0.6074375
AIC_M1
## [1] 1379.371
Model2 <- lm(dataframe$sales_unitboxes ~ dataframe$inflation_rate + dataframe$itaee + dataframe$pop_density + dataframe$job_density + dataframe$max_temperature)
summary(Model2)
##
## Call:
## lm(formula = dataframe$sales_unitboxes ~ dataframe$inflation_rate +
## dataframe$itaee + dataframe$pop_density + dataframe$job_density +
## dataframe$max_temperature)
##
## Residuals:
## Min 1Q Median 3Q Max
## -921411 -222994 9863 184134 1053800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24754545 28155298 -0.879 0.3843
## dataframe$inflation_rate -345419 193796 -1.782 0.0819 .
## dataframe$itaee 134973 31225 4.323 9.27e-05 ***
## dataframe$pop_density 215421 354090 0.608 0.5462
## dataframe$job_density -502938 364351 -1.380 0.1748
## dataframe$max_temperature 149117 28404 5.250 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 388100 on 42 degrees of freedom
## Multiple R-squared: 0.6245, Adjusted R-squared: 0.5798
## F-statistic: 13.97 on 5 and 42 DF, p-value: 4.739e-08
avPlots(Model2)
RMSE_M2 <- RMSE(Model2$fitted.values, dataframe$sales_unitboxes)
r2M2 <- summary(Model2)$r.squared
AIC_M2 <- AIC(Model2)
RMSE_M2
## [1] 363060.9
r2M2
## [1] 0.6244739
AIC_M2
## [1] 1379.241
vif(Model1)
## inflation_rate itaee pop_density max_temperature
## 1.720956 6.787535 6.477211 1.642192
bptest(Model1)
##
## studentized Breusch-Pagan test
##
## data: Model1
## BP = 2.8245, df = 4, p-value = 0.5876
shapiro.test(Model1$residuals)
##
## Shapiro-Wilk normality test
##
## data: Model1$residuals
## W = 0.98363, p-value = 0.7338
vif(Model2)
## dataframe$inflation_rate dataframe$itaee dataframe$pop_density
## 1.778626 6.872837 65.086343
## dataframe$job_density dataframe$max_temperature
## 64.899118 1.745881
bptest(Model2)
##
## studentized Breusch-Pagan test
##
## data: Model2
## BP = 3.9437, df = 5, p-value = 0.5575
shapiro.test(Model2$residuals)
##
## Shapiro-Wilk normality test
##
## data: Model2$residuals
## W = 0.97605, p-value = 0.4267
RMSE_M1
## [1] 371205
r2M1
## [1] 0.6074375
AIC_M1
## [1] 1379.371
RMSE_M2
## [1] 363060.9
r2M2
## [1] 0.6244739
AIC_M2
## [1] 1379.241
# Although the values of model 2 were slightly better, when conducting tests to analyze multicollinearity, heteroscedasticity, and the normality of residuals, I realized that model 2 has high multicollinearity in 2 of its variables. Because of this reason and since the results do not have much difference, I consider that the best model for making predictions is model 1.
# Based on the results of the summary and tests we did in Model 1 we can make this interpretation:
# - The inflation rate had a negative relation with the dependent variable, for each unit increase in inflation rate, sales_unitboxes had a decrease of 393,589 units and the variable was statistically significant with ("*")
# - The itaee had a positive relation with the dependent variable, for each unit increase in itaee, sales_unitboxes had an increase in 130,171 units and the variable was statistically significant with ("***")
# - The pop density had a negative relation with the dependent variable, for each unit increase in pop density, sales_unitboxes had a decrease in 248,395 units and this variable was statistically significant with ("*")
# - The max temperature had a positive relation with the dependent variable, for each unit increase in max temperature, sales_unitboxes had an increase of 139,562 units and this variable was statistically significant with ("***")
# - The model had a r^2 value of 61% which means that around 61% of the variance in sales_unitboxes can be explained by the four independent variables.
# - The AIC statistic has a relatively competitive value regarding the other model.
# - The Model had no problem with multicolinearity, heteroscedasticity or lack of normality in the data.
effect_plot(Model1, pred = inflation_rate, interval = TRUE)
effect_plot(Model1, pred = itaee, interval = TRUE)
effect_plot(Model1, pred = pop_density, interval = TRUE)
effect_plot(Model1, pred = max_temperature, interval = TRUE)