#Cargar los paquetes necesarios

2Cargar la base de datos

df <- read.csv("Sample - Superstore.csv")
head(df)
##   Row.ID       Order.ID Order.Date  Ship.Date      Ship.Mode Customer.ID
## 1      1 CA-2016-152156  11/8/2016 11/11/2016   Second Class    CG-12520
## 2      2 CA-2016-152156  11/8/2016 11/11/2016   Second Class    CG-12520
## 3      3 CA-2016-138688  6/12/2016  6/16/2016   Second Class    DV-13045
## 4      4 US-2015-108966 10/11/2015 10/18/2015 Standard Class    SO-20335
## 5      5 US-2015-108966 10/11/2015 10/18/2015 Standard Class    SO-20335
## 6      6 CA-2014-115812   6/9/2014  6/14/2014 Standard Class    BH-11710
##     Customer.Name   Segment       Country            City      State
## 1     Claire Gute  Consumer United States       Henderson   Kentucky
## 2     Claire Gute  Consumer United States       Henderson   Kentucky
## 3 Darrin Van Huff Corporate United States     Los Angeles California
## 4  Sean O'Donnell  Consumer United States Fort Lauderdale    Florida
## 5  Sean O'Donnell  Consumer United States Fort Lauderdale    Florida
## 6 Brosina Hoffman  Consumer United States     Los Angeles California
##   Postal.Code Region      Product.ID        Category Sub.Category
## 1       42420  South FUR-BO-10001798       Furniture    Bookcases
## 2       42420  South FUR-CH-10000454       Furniture       Chairs
## 3       90036   West OFF-LA-10000240 Office Supplies       Labels
## 4       33311  South FUR-TA-10000577       Furniture       Tables
## 5       33311  South OFF-ST-10000760 Office Supplies      Storage
## 6       90032   West FUR-FU-10001487       Furniture  Furnishings
##                                                       Product.Name    Sales
## 1                                Bush Somerset Collection Bookcase 261.9600
## 2      Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3        Self-Adhesive Address Labels for Typewriters by Universal  14.6200
## 4                    Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5                                   Eldon Fold 'N Roll Cart System  22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood  48.8600
##   Quantity Discount    Profit
## 1        2     0.00   41.9136
## 2        3     0.00  219.5820
## 3        2     0.00    6.8714
## 4        5     0.45 -383.0310
## 5        2     0.20    2.5164
## 6        7     0.00   14.1694

3 Construcccion de las variables para series temporales

#Vamos a hacer una prediccion de ventas mensuales 
df_reg <- df %>%     #Llmar a df 
  filter(State =="New York") %>%
  select(Order.Date, Sales) %>%    #Elegir dos columnas
  mutate(Order.Date = mdy(Order.Date)) %>%     #Convertir a fecha 
  mutate(Fecha = floor_date(Order.Date, unit = "month")) %>% #Obtener los meses
  group_by(Fecha) %>%    #Agrupar por cada mes 
  summarise(Ventas = sum(Sales))   #Sumar ventas agrupadas por mes 
head(df_reg)
## # A tibble: 6 × 2
##   Fecha       Ventas
##   <date>       <dbl>
## 1 2014-01-01    3.93
## 2 2014-02-01   65.0 
## 3 2014-03-01 4936.  
## 4 2014-04-01  491.  
## 5 2014-05-01 1169.  
## 6 2014-06-01 5743.

4 Convertir a serie de tiempo

#Convertir a serie de tiempo con ts()
ts_data <- ts(df_reg$Ventas,   #Llamar la columna ventas de df_reg
              #Ingresar año y mes de inicio start = c (Año, mes)
              start = c(year(min(df_reg$Fecha)), month(min(df_reg$Fecha))),
              #Frecuencia es 12 porque son datos mensuales 
              frequency = 12)
ts_data
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 2014     3.928    64.960  4936.218   490.672  1168.624  5743.314  1219.668
## 2015  1108.272   781.072  3732.811  1502.592  4803.414  6133.124  7345.490
## 2016  1065.994  2011.888  7909.336  5107.188  6984.044  6057.880  2513.984
## 2017   800.916   623.734  3915.022  1843.998  2923.318 10403.588  2384.300
##            Aug       Sep       Oct       Nov       Dec
## 2014   988.682 16816.423   721.082 20776.898 11858.018
## 2015  4161.610 16637.568  4534.558 19625.778  9954.398
## 2016  5015.506 10702.824  4461.106  6690.280 13324.072
## 2017  6669.652 14737.803 24725.044 18190.242  6705.378

5 Ajustar el modelo SARIMA

#Ajustar modelo SARIMA con funcion auto.arima()
fit_arima <- auto.arima(y = ts_data,  #Llamar a serie de tiempo ts_data
                        seasonal = TRUE,   #Estacional = TRUE por que son mensuales 
                        trace = TRUE,   #Lista de modelos candidatos
                        ic = "aic",    #Criterio de informacion de Akaite
                        approximation = TRUE) 
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)(1,1,1)[12] with drift         : Inf
##  ARIMA(0,0,0)(0,1,0)[12] with drift         : 517.3708
##  ARIMA(1,0,0)(1,1,0)[12] with drift         : Inf
##  ARIMA(0,0,1)(0,1,1)[12] with drift         : 516.3515
##  ARIMA(0,0,0)(0,1,0)[12]                    : 516.2053
##  ARIMA(0,0,0)(1,1,0)[12] with drift         : 524.1016
##  ARIMA(0,0,0)(0,1,1)[12] with drift         : 514.3596
##  ARIMA(0,0,0)(1,1,1)[12] with drift         : Inf
##  ARIMA(1,0,0)(0,1,1)[12] with drift         : 517.3766
##  ARIMA(1,0,1)(0,1,1)[12] with drift         : 519.314
##  ARIMA(0,0,0)(0,1,1)[12]                    : 513.8129
##  ARIMA(0,0,0)(1,1,1)[12]                    : Inf
##  ARIMA(0,0,0)(1,1,0)[12]                    : 522.7632
##  ARIMA(1,0,0)(0,1,1)[12]                    : 516.441
##  ARIMA(0,0,1)(0,1,1)[12]                    : 515.5567
##  ARIMA(1,0,1)(0,1,1)[12]                    : 518.1764
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(0,0,0)(0,1,1)[12]                    : 721.0813
## 
##  Best model: ARIMA(0,0,0)(0,1,1)[12]
fit_arima
## Series: ts_data 
## ARIMA(0,0,0)(0,1,1)[12] 
## 
## Coefficients:
##          sma1
##       -0.3744
## s.e.   0.2264
## 
## sigma^2 = 25623412:  log likelihood = -358.54
## AIC=721.08   AICc=721.44   BIC=724.25

6 Obtener los valores de la prediccion

#Vamos a predecir 12 valores con la funcion forescast(modelo, valores)
#La cantidad de valores a predecir depende del modelo elegido
prediccion_sarima <- forecast(fit_arima, h=12)

#Crear un data frame con los valores de las predicciones con intervalos de confianza del 95%
df_futuro_sarima <- data.frame(
  Fecha = seq.Date(from = max(df_reg$Fecha) + months(1), by = "month", length.out = 12),
  Ventas = prediccion_sarima$mean,
  Bajo = prediccion_sarima$lower[,2],
  Alto = prediccion_sarima$upper[,2]
)

#Union se series de tiempo en una sola
df_total_sarima <- bind_rows(df_reg, df_futuro_sarima)
tail(df_total_sarima) 
## # A tibble: 6 × 4
##   Fecha      Ventas   Bajo   Alto
##   <date>      <dbl>  <dbl>  <dbl>
## 1 2018-07-01  2820. -7103. 12743.
## 2 2018-08-01  5791. -4132. 15714.
## 3 2018-09-01 14055.  4132. 23978.
## 4 2018-10-01 16995.  7072. 26918.
## 5 2018-11-01 15725.  5802. 25648.
## 6 2018-12-01  8800. -1123. 18723.

7 Grafica

ggplot(df_total_sarima, aes(x = Fecha, y = Ventas)) +
  geom_point(data = df_reg) + #Valores historicos 
  geom_line(data = df_reg) + #Linea de valores historicos 
  geom_ribbon(data = df_futuro_sarima, aes(ymin = Bajo, ymax = Alto),
fill = "lightblue",alpha = 0.8) + #Intervalos de confianza 
  geom_point(data = df_futuro_sarima, color = "red") +  #valores predichos
  geom_line(data = df_futuro_sarima, color = "red", linetype = "dashed") +#linea de predicciones 
  labs(title = "Prediccion de ventas con ARIMA enero-junio 2018",
       x = "Mes",
       y = "Ventas")