#1 CARGAR LOS PAQUETES NECESARIOS

rm(list = ls()) #se pone para limpiar la consola
knitr::opts_chunk$set(echo = TRUE) #instrucción para publicar
pacman::p_load(dplyr, ggplot2, lubridate, scales, forecast)

#2 CARGAR BASE DE DATOS

df<- read.csv("DATA/Sample - Superstore.csv")
head (df)
##   Row.ID       Order.ID Order.Date  Ship.Date      Ship.Mode Customer.ID
## 1      1 CA-2016-152156  11/8/2016 11/11/2016   Second Class    CG-12520
## 2      2 CA-2016-152156  11/8/2016 11/11/2016   Second Class    CG-12520
## 3      3 CA-2016-138688  6/12/2016  6/16/2016   Second Class    DV-13045
## 4      4 US-2015-108966 10/11/2015 10/18/2015 Standard Class    SO-20335
## 5      5 US-2015-108966 10/11/2015 10/18/2015 Standard Class    SO-20335
## 6      6 CA-2014-115812   6/9/2014  6/14/2014 Standard Class    BH-11710
##     Customer.Name   Segment       Country            City      State
## 1     Claire Gute  Consumer United States       Henderson   Kentucky
## 2     Claire Gute  Consumer United States       Henderson   Kentucky
## 3 Darrin Van Huff Corporate United States     Los Angeles California
## 4  Sean O'Donnell  Consumer United States Fort Lauderdale    Florida
## 5  Sean O'Donnell  Consumer United States Fort Lauderdale    Florida
## 6 Brosina Hoffman  Consumer United States     Los Angeles California
##   Postal.Code Region      Product.ID        Category Sub.Category
## 1       42420  South FUR-BO-10001798       Furniture    Bookcases
## 2       42420  South FUR-CH-10000454       Furniture       Chairs
## 3       90036   West OFF-LA-10000240 Office Supplies       Labels
## 4       33311  South FUR-TA-10000577       Furniture       Tables
## 5       33311  South OFF-ST-10000760 Office Supplies      Storage
## 6       90032   West FUR-FU-10001487       Furniture  Furnishings
##                                                       Product.Name    Sales
## 1                                Bush Somerset Collection Bookcase 261.9600
## 2      Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3        Self-Adhesive Address Labels for Typewriters by Universal  14.6200
## 4                    Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5                                   Eldon Fold 'N Roll Cart System  22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood  48.8600
##   Quantity Discount    Profit
## 1        2     0.00   41.9136
## 2        3     0.00  219.5820
## 3        2     0.00    6.8714
## 4        5     0.45 -383.0310
## 5        2     0.20    2.5164
## 6        7     0.00   14.1694

#3 Construcción de las variables para series temporales

# vamos ha hacer una prediccion de ventas mensuales
df_reg <- df %>% #llamar a df
  filter(State == "California") %>%
  select(Order.Date, Sales) %>% #elegir dos columnas
  mutate(Order.Date = mdy(Order.Date)) %>% #convertir a fecha
  mutate(Fecha = floor_date(Order.Date, unit = "month")) %>% #obtener los meses 
  group_by(Fecha) %>% #agrupar por cada mes
  summarise(Ventas = sum(Sales)) #sumar ventas agrupadas por mes
head(df_reg)
## # A tibble: 6 × 2
##   Fecha      Ventas
##   <date>      <dbl>
## 1 2014-01-01  2455.
## 2 2014-02-01   309.
## 3 2014-03-01  7239.
## 4 2014-04-01  8165.
## 5 2014-05-01  5960.
## 6 2014-06-01  4379.

4 CONVERTIR A SERIE DE TIEMPO

# Convertir a serie de tiempo con ts()
ts_data <- ts(df_reg$Ventas, #llamar la columna de ventas de df_reg
              # ingresar año y mes de inicio start=c(año,mes)
              start = c(year(min(df_reg$Fecha)), month(min(df_reg$Fecha))),
              #frecuencia es 12 por que son datos mensuales
              frequency = 12)
(ts_data)
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 2014  2455.185   308.702  7239.096  8165.181  5960.020  4379.324 16231.651
## 2015  5889.954  1198.714  5852.804  8469.552  8055.575  3018.538  4146.832
## 2016  2876.464  3085.794 15857.442  6297.218  5344.928 16120.423  8605.369
## 2017  5163.432  6864.883 11331.194  8784.206 10157.356  9086.966 12923.859
##            Aug       Sep       Oct       Nov       Dec
## 2014  5596.174  4646.597  4582.182 13690.062 18049.357
## 2015  8454.083  8603.863  9836.189  9392.437 15525.303
## 2016  9573.000 16153.979  7163.222 17896.488 22577.584
## 2017 14534.322 18866.852 15776.301 13580.951 19318.023

5 AJUSTAR EL MODELO SARIMA

#ajustar modelo sarima con funcion auto.arima()
fit_arima <- auto.arima(y = ts_data, #Llamar a serie de tiempo ts-data
                        seasonal = TRUE,  #Estacional = TRUE porque mensuales
                        trace = TRUE,  #Lista de modelos candidatos
                        ic = "aic",  #Criterio de informacion de Akaike
                        approximation = TRUE) 
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)(1,1,1)[12] with drift         : Inf
##  ARIMA(0,0,0)(0,1,0)[12] with drift         : 514.8832
##  ARIMA(1,0,0)(1,1,0)[12] with drift         : 495.5661
##  ARIMA(0,0,1)(0,1,1)[12] with drift         : 508.334
##  ARIMA(0,0,0)(0,1,0)[12]                    : 515.9887
##  ARIMA(1,0,0)(0,1,0)[12] with drift         : 517.6598
##  ARIMA(1,0,0)(1,1,1)[12] with drift         : 497.2381
##  ARIMA(1,0,0)(0,1,1)[12] with drift         : 509.7639
##  ARIMA(0,0,0)(1,1,0)[12] with drift         : 498.0364
##  ARIMA(2,0,0)(1,1,0)[12] with drift         : 488.968
##  ARIMA(2,0,0)(0,1,0)[12] with drift         : 518.9726
##  ARIMA(2,0,0)(1,1,1)[12] with drift         : 490.875
##  ARIMA(2,0,0)(0,1,1)[12] with drift         : 510.8245
##  ARIMA(3,0,0)(1,1,0)[12] with drift         : 482.7104
##  ARIMA(3,0,0)(0,1,0)[12] with drift         : 521.2036
##  ARIMA(3,0,0)(1,1,1)[12] with drift         : 484.306
##  ARIMA(3,0,0)(0,1,1)[12] with drift         : 513.9044
##  ARIMA(4,0,0)(1,1,0)[12] with drift         : 477.1084
##  ARIMA(4,0,0)(0,1,0)[12] with drift         : 523.3029
##  ARIMA(4,0,0)(1,1,1)[12] with drift         : 479.0217
##  ARIMA(4,0,0)(0,1,1)[12] with drift         : 515.2304
##  ARIMA(5,0,0)(1,1,0)[12] with drift         : 460.423
##  ARIMA(5,0,0)(0,1,0)[12] with drift         : 519.7894
##  ARIMA(5,0,0)(1,1,1)[12] with drift         : 456.9196
##  ARIMA(5,0,0)(0,1,1)[12] with drift         : 515.16
##  ARIMA(5,0,1)(1,1,1)[12] with drift         : Inf
##  ARIMA(4,0,1)(1,1,1)[12] with drift         : Inf
##  ARIMA(5,0,0)(1,1,1)[12]                    : 488.8122
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(5,0,0)(1,1,1)[12] with drift         : 714.7413
## 
##  Best model: ARIMA(5,0,0)(1,1,1)[12] with drift
fit_arima
## Series: ts_data 
## ARIMA(5,0,0)(1,1,1)[12] with drift 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4     ar5     sar1     sma1     drift
##       0.0075  -0.2666  0.2745  -0.2083  0.1331  -0.6193  -0.1311  161.0977
## s.e.  0.1719   0.1694  0.1688   0.1752  0.1923   0.3123   0.4735   30.6645
## 
## sigma^2 = 15124086:  log likelihood = -348.37
## AIC=714.74   AICc=721.66   BIC=728.99

6 OBTENER LOS VALORES DE LA PREDICCIÓN

# Predecir 12 valores con la funcion forecast
# La cantidad de valores a predecir depende del modelo elegido
prediccion_sarima <- forecast(fit_arima, h=12)

# Crear un data frame con los valores de las predicciones con intervalos de confianza del 95%
df_futuro_sarima <- data.frame(
  Fecha = seq.Date(from = max(df_reg$Fecha) + months(1), by = "month", length.out = 12),
  Ventas = prediccion_sarima$mean,
  Bajo = prediccion_sarima$lower[,2],
  Alto = prediccion_sarima$upper[,2]
)
# union de series de tiempo en una sola
df_total_sarima <- bind_rows(df_reg, df_futuro_sarima)
tail(df_total_sarima, 12)
## # A tibble: 12 × 4
##    Fecha      Ventas   Bajo   Alto
##    <date>      <dbl>  <dbl>  <dbl>
##  1 2018-01-01  8009.   386. 15631.
##  2 2018-02-01  7336.  -287. 14958.
##  3 2018-03-01 17552.  9664. 25441.
##  4 2018-04-01 11056.  2902. 19209.
##  5 2018-05-01  9930.  1714. 18147.
##  6 2018-06-01 16765.  8548. 24982.
##  7 2018-07-01 13143.  4829. 21458.
##  8 2018-08-01 14079.  5712. 22446.
##  9 2018-09-01 19754. 11386. 28122.
## 10 2018-10-01 13130.  4748. 21512.
## 11 2018-11-01 19526. 11126. 27927.
## 12 2018-12-01 24781. 16376. 33186.

#7 GRÁFICA

ggplot(df_total_sarima, aes(x = Fecha, y = Ventas)) +
  geom_point(data = df_reg) + #valores historicos
  geom_line(data = df_reg) + #linea de valores historicos
  geom_ribbon(data = df_futuro_sarima, aes(ymin = Bajo, ymax = Alto), fill = "lightblue",
              alpha = 0.8) + #intervalos de confianza
  geom_point(data = df_futuro_sarima, color = "red") + #valores predichos
  geom_line(data = df_futuro_sarima, color = "red", linetype = "dashed") + #linea de predicciones
  labs(title = "prediccion de ventas con arima enero-diciembre 2018",
       x = "Mes",
       y = "Ventas")