#Cargar los paquetes necesarios
df <- read.csv("Sample - Superstore.csv")
head(df)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 2 2 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 3 3 CA-2016-138688 6/12/2016 6/16/2016 Second Class DV-13045
## 4 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 5 5 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 6 6 CA-2014-115812 6/9/2014 6/14/2014 Standard Class BH-11710
## Customer.Name Segment Country City State
## 1 Claire Gute Consumer United States Henderson Kentucky
## 2 Claire Gute Consumer United States Henderson Kentucky
## 3 Darrin Van Huff Corporate United States Los Angeles California
## 4 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 5 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 6 Brosina Hoffman Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 42420 South FUR-BO-10001798 Furniture Bookcases
## 2 42420 South FUR-CH-10000454 Furniture Chairs
## 3 90036 West OFF-LA-10000240 Office Supplies Labels
## 4 33311 South FUR-TA-10000577 Furniture Tables
## 5 33311 South OFF-ST-10000760 Office Supplies Storage
## 6 90032 West FUR-FU-10001487 Furniture Furnishings
## Product.Name Sales
## 1 Bush Somerset Collection Bookcase 261.9600
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3 Self-Adhesive Address Labels for Typewriters by Universal 14.6200
## 4 Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5 Eldon Fold 'N Roll Cart System 22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood 48.8600
## Quantity Discount Profit
## 1 2 0.00 41.9136
## 2 3 0.00 219.5820
## 3 2 0.00 6.8714
## 4 5 0.45 -383.0310
## 5 2 0.20 2.5164
## 6 7 0.00 14.1694
#Vamos a hacer una prediccion de ventas mensuales
df_reg <- df %>% #Llmar a df
filter(State =="New York") %>%
select(Order.Date, Sales) %>% #Elegir dos columnas
mutate(Order.Date = mdy(Order.Date)) %>% #Convertir a fecha
mutate(Fecha = floor_date(Order.Date, unit = "month")) %>% #Obtener los meses
group_by(Fecha) %>% #Agrupar por cada mes
summarise(Ventas = sum(Sales)) #Sumar ventas agrupadas por mes
head(df_reg)
## # A tibble: 6 × 2
## Fecha Ventas
## <date> <dbl>
## 1 2014-01-01 3.93
## 2 2014-02-01 65.0
## 3 2014-03-01 4936.
## 4 2014-04-01 491.
## 5 2014-05-01 1169.
## 6 2014-06-01 5743.
#Convertir a serie de tiempo con ts()
ts_data <- ts(df_reg$Ventas, #Llamar la columna ventas de df_reg
#Ingresar año y mes de inicio start = c (Año, mes)
start = c(year(min(df_reg$Fecha)), month(min(df_reg$Fecha))),
#Frecuencia es 12 porque son datos mensuales
frequency = 12)
ts_data
## Jan Feb Mar Apr May Jun Jul
## 2014 3.928 64.960 4936.218 490.672 1168.624 5743.314 1219.668
## 2015 1108.272 781.072 3732.811 1502.592 4803.414 6133.124 7345.490
## 2016 1065.994 2011.888 7909.336 5107.188 6984.044 6057.880 2513.984
## 2017 800.916 623.734 3915.022 1843.998 2923.318 10403.588 2384.300
## Aug Sep Oct Nov Dec
## 2014 988.682 16816.423 721.082 20776.898 11858.018
## 2015 4161.610 16637.568 4534.558 19625.778 9954.398
## 2016 5015.506 10702.824 4461.106 6690.280 13324.072
## 2017 6669.652 14737.803 24725.044 18190.242 6705.378
#Ajustar modelo SARIMA con funcion auto.arima()
fit_arima <- auto.arima(y = ts_data, #Llamar a serie de tiempo ts_data
seasonal = TRUE, #Estacional = TRUE por que son mensuales
trace = TRUE, #Lista de modelos candidatos
ic = "aic", #Criterio de informacion de Akaite
approximation = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2)(1,1,1)[12] with drift : Inf
## ARIMA(0,0,0)(0,1,0)[12] with drift : 517.3708
## ARIMA(1,0,0)(1,1,0)[12] with drift : Inf
## ARIMA(0,0,1)(0,1,1)[12] with drift : 516.3515
## ARIMA(0,0,0)(0,1,0)[12] : 516.2053
## ARIMA(0,0,0)(1,1,0)[12] with drift : 524.1016
## ARIMA(0,0,0)(0,1,1)[12] with drift : 514.3596
## ARIMA(0,0,0)(1,1,1)[12] with drift : Inf
## ARIMA(1,0,0)(0,1,1)[12] with drift : 517.3766
## ARIMA(1,0,1)(0,1,1)[12] with drift : 519.314
## ARIMA(0,0,0)(0,1,1)[12] : 513.8129
## ARIMA(0,0,0)(1,1,1)[12] : Inf
## ARIMA(0,0,0)(1,1,0)[12] : 522.7632
## ARIMA(1,0,0)(0,1,1)[12] : 516.441
## ARIMA(0,0,1)(0,1,1)[12] : 515.5567
## ARIMA(1,0,1)(0,1,1)[12] : 518.1764
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(0,0,0)(0,1,1)[12] : 721.0813
##
## Best model: ARIMA(0,0,0)(0,1,1)[12]
fit_arima
## Series: ts_data
## ARIMA(0,0,0)(0,1,1)[12]
##
## Coefficients:
## sma1
## -0.3744
## s.e. 0.2264
##
## sigma^2 = 25623412: log likelihood = -358.54
## AIC=721.08 AICc=721.44 BIC=724.25
#Vamos a predecir 12 valores con la funcion forescast(modelo, valores)
#La cantidad de valores a predecir depende del modelo elegido
prediccion_sarima <- forecast(fit_arima, h=12)
#Crear un data frame con los valores de las predicciones con intervalos de confianza del 95%
df_futuro_sarima <- data.frame(
Fecha = seq.Date(from = max(df_reg$Fecha) + months(1), by = "month", length.out = 12),
Ventas = prediccion_sarima$mean,
Bajo = prediccion_sarima$lower[,2],
Alto = prediccion_sarima$upper[,2]
)
#Union se series de tiempo en una sola
df_total_sarima <- bind_rows(df_reg, df_futuro_sarima)
tail(df_total_sarima)
## # A tibble: 6 × 4
## Fecha Ventas Bajo Alto
## <date> <dbl> <dbl> <dbl>
## 1 2018-07-01 2820. -7103. 12743.
## 2 2018-08-01 5791. -4132. 15714.
## 3 2018-09-01 14055. 4132. 23978.
## 4 2018-10-01 16995. 7072. 26918.
## 5 2018-11-01 15725. 5802. 25648.
## 6 2018-12-01 8800. -1123. 18723.
ggplot(df_total_sarima, aes(x = Fecha, y = Ventas)) +
geom_point(data = df_reg) + #Valores historicos
geom_line(data = df_reg) + #Linea de valores historicos
geom_ribbon(data = df_futuro_sarima, aes(ymin = Bajo, ymax = Alto),
fill = "lightblue",alpha = 0.8) + #Intervalos de confianza
geom_point(data = df_futuro_sarima, color = "red") + #valores predichos
geom_line(data = df_futuro_sarima, color = "red", linetype = "dashed") +#linea de predicciones
labs(title = "Prediccion de ventas con ARIMA enero-junio 2018",
x = "Mes",
y = "Ventas")