#1 cargar los paquetes necesarios

rm(list = ls())# se pone parsalimpiar la consola
knitr::opts_chunk$set(echo = TRUE)# instruccon para publicar
pacman::p_load(dplyr, ggplot2, lubridate, scales, forecast)
               # forecast preedice los valores futuros
               
#2 cargar la base de datps
df<-read.csv("DATA1/Sample - Superstore.csv")
head(df)           
##   Row.ID       Order.ID Order.Date  Ship.Date      Ship.Mode Customer.ID
## 1      1 CA-2016-152156  11/8/2016 11/11/2016   Second Class    CG-12520
## 2      2 CA-2016-152156  11/8/2016 11/11/2016   Second Class    CG-12520
## 3      3 CA-2016-138688  6/12/2016  6/16/2016   Second Class    DV-13045
## 4      4 US-2015-108966 10/11/2015 10/18/2015 Standard Class    SO-20335
## 5      5 US-2015-108966 10/11/2015 10/18/2015 Standard Class    SO-20335
## 6      6 CA-2014-115812   6/9/2014  6/14/2014 Standard Class    BH-11710
##     Customer.Name   Segment       Country            City      State
## 1     Claire Gute  Consumer United States       Henderson   Kentucky
## 2     Claire Gute  Consumer United States       Henderson   Kentucky
## 3 Darrin Van Huff Corporate United States     Los Angeles California
## 4  Sean O'Donnell  Consumer United States Fort Lauderdale    Florida
## 5  Sean O'Donnell  Consumer United States Fort Lauderdale    Florida
## 6 Brosina Hoffman  Consumer United States     Los Angeles California
##   Postal.Code Region      Product.ID        Category Sub.Category
## 1       42420  South FUR-BO-10001798       Furniture    Bookcases
## 2       42420  South FUR-CH-10000454       Furniture       Chairs
## 3       90036   West OFF-LA-10000240 Office Supplies       Labels
## 4       33311  South FUR-TA-10000577       Furniture       Tables
## 5       33311  South OFF-ST-10000760 Office Supplies      Storage
## 6       90032   West FUR-FU-10001487       Furniture  Furnishings
##                                                       Product.Name    Sales
## 1                                Bush Somerset Collection Bookcase 261.9600
## 2      Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3        Self-Adhesive Address Labels for Typewriters by Universal  14.6200
## 4                    Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5                                   Eldon Fold 'N Roll Cart System  22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood  48.8600
##   Quantity Discount    Profit
## 1        2     0.00   41.9136
## 2        3     0.00  219.5820
## 3        2     0.00    6.8714
## 4        5     0.45 -383.0310
## 5        2     0.20    2.5164
## 6        7     0.00   14.1694

#3construccion de las variables para series temporales

#vamos hacer una prediccion de ventas mensuales
df_reg <- df %>% #llamar a df
  filter(State == "Texas") %>%
  select(Order.Date, Sales) %>%  #elegir dos columnas
  mutate(Order.Date = mdy(Order.Date)) %>%  #convertir a fecha
  mutate(Fecha = floor_date(Order.Date, unit = "month")) %>%  #obtener los meses
  group_by(Fecha) %>%   #agrupar por cada mes
  summarise(Ventas = sum(Sales)) #sumar ventas agrupadas por mes
head(df_reg)
## # A tibble: 6 × 2
##   Fecha      Ventas
##   <date>      <dbl>
## 1 2014-01-01  144. 
## 2 2014-02-01   74.4
## 3 2014-03-01 2896. 
## 4 2014-04-01 2723. 
## 5 2014-05-01 1955. 
## 6 2014-06-01 2586.

#4 Convertir a serie de tiempo

#convertir a serie de tiempo con ts()
ts_data<- ts(df_reg$Ventas,   #llamar la columna ventas de df_reg
             #ingresar año y mes de inicio start=c(año,mes)
             start = c(year(min(df_reg$Fecha)), month(min(df_reg$Fecha))), #frecuencia es de 12  porqe son datos mensuales
             frequency = 12)
(ts_data)
##             Jan        Feb        Mar        Apr        May        Jun
## 2014   144.1500    74.3640  2895.8180  2722.7720  1955.3570  2586.3196
## 2015  2084.2676   793.8940  2746.6240  3584.9320  3062.7860  1636.8280
## 2016   441.1880   331.4760  8013.8050  4520.6520  2388.3540   988.8860
## 2017  3210.4280  1399.6024  7950.3968   626.6676  3434.8752  3326.8122
##             Jul        Aug        Sep        Oct        Nov        Dec
## 2014  6443.6320   573.8060 19135.3578  3793.3100  6385.7722  3914.5180
## 2015   533.7600  4468.7232  2699.1140  4151.5910  4336.0800  4356.3592
## 2016  3387.7060  4847.5568  6652.3014   980.5160  5489.0868  3644.6240
## 2017  2154.7900  2566.7920  3939.9660  4143.6502  7968.4470  2699.3308

#5 ajustar el modelo SARIMA

#Ajustar modelo sarima con funcion auto.arima()
fit_arima <- auto.arima(y= ts_data,         #llam ar a serie de tiempo ts data
                        seasonal = TRUE,    #estacional = TRUE porque mensuales
                        trace = TRUE,       #lista de modelo candidatos
                        ic = "aic",         #criterio de informacion de Akaike
                        approximation = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)(1,0,1)[12] with non-zero mean : Inf
##  ARIMA(0,0,0)            with non-zero mean : 910.7992
##  ARIMA(1,0,0)(1,0,0)[12] with non-zero mean : 874.3151
##  ARIMA(0,0,1)(0,0,1)[12] with non-zero mean : 913.784
##  ARIMA(0,0,0)            with zero mean     : 949.6195
##  ARIMA(1,0,0)            with non-zero mean : 911.7766
##  ARIMA(1,0,0)(1,0,1)[12] with non-zero mean : 872.0449
##  ARIMA(1,0,0)(0,0,1)[12] with non-zero mean : 913.3394
##  ARIMA(0,0,0)(1,0,1)[12] with non-zero mean : 867.4026
##  ARIMA(0,0,0)(0,0,1)[12] with non-zero mean : 912.4456
##  ARIMA(0,0,0)(1,0,0)[12] with non-zero mean : 872.8962
##  ARIMA(0,0,1)(1,0,1)[12] with non-zero mean : 869.3639
##  ARIMA(1,0,1)(1,0,1)[12] with non-zero mean : Inf
##  ARIMA(0,0,0)(1,0,1)[12] with zero mean     : 905.8729
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(0,0,0)(1,0,1)[12] with non-zero mean : Inf
##  ARIMA(0,0,1)(1,0,1)[12] with non-zero mean : Inf
##  ARIMA(1,0,0)(1,0,1)[12] with non-zero mean : Inf
##  ARIMA(0,0,0)(1,0,0)[12] with non-zero mean : 911.8561
## 
##  Best model: ARIMA(0,0,0)(1,0,0)[12] with non-zero mean
fit_arima
## Series: ts_data 
## ARIMA(0,0,0)(1,0,0)[12] with non-zero mean 
## 
## Coefficients:
##         sar1       mean
##       0.2035  3588.0824
## s.e.  0.2046   516.8842
## 
## sigma^2 = 9493428:  log likelihood = -452.93
## AIC=911.86   AICc=912.4   BIC=917.47

6 obtener los valores de la prediccion

# predecir 12 valores con la funcion forecast
# La cantidad de valores a predecir depende del modelo elegido
prediccion_sarima <- forecast(fit_arima, h=12)

#crear un data frame con los valores de las prediccciones con intervalos de confianza del 95%
df_futuro_sarima<-data.frame(
  Fecha = seq.Date(from = max(df_reg$Fecha) + months(1), by = "month", length.out = 12),
  Ventas = prediccion_sarima$mean,
  Bajo = prediccion_sarima$lower[,2],
  Alto = prediccion_sarima$upper[,2]
)
# union de series de tiempo en una sola
df_total_sarima<- bind_rows(df_reg,
df_futuro_sarima)
tail(df_total_sarima ,12)
## # A tibble: 12 × 4
##    Fecha      Ventas   Bajo   Alto
##    <date>      <dbl>  <dbl>  <dbl>
##  1 2018-01-01  3511. -2528.  9550.
##  2 2018-02-01  3143. -2896.  9182.
##  3 2018-03-01  4476. -1563. 10515.
##  4 2018-04-01  2985. -3054.  9024.
##  5 2018-05-01  3557. -2482.  9596.
##  6 2018-06-01  3535. -2504.  9574.
##  7 2018-07-01  3296. -2743.  9335.
##  8 2018-08-01  3380. -2659.  9419.
##  9 2018-09-01  3660. -2379.  9699.
## 10 2018-10-01  3701. -2338.  9740.
## 11 2018-11-01  4480. -1559. 10518.
## 12 2018-12-01  3407. -2632.  9446.

#7 GRAFICA # Versión corregida

ggplot() +
  geom_point(data = df_reg, aes(x = Fecha, y = Ventas)) +  # Valores históricos
  geom_line(data = df_reg, aes(x = Fecha, y = Ventas)) +   # Línea de valores históricos
  geom_ribbon(data = df_futuro_sarima, aes(x = Fecha, ymin = Bajo, ymax = Alto), 
              fill = "lightblue", alpha = 0.8) +  # Intervalos de confianza
  geom_point(data = df_futuro_sarima, aes(x = Fecha, y = Ventas), color = "red") +  # Valores predichos
  geom_line(data = df_futuro_sarima, aes(x = Fecha, y = Ventas), 
            color = "red", linetype = "dashed") +  # Línea de predicciones
  labs(title = "Predicción de ventas con ARIMA enero-diciembre 2018",
       x = "Mes", y = "Ventas")

```