#1 cargar los paquetes necesarios
rm(list = ls())# se pone parsalimpiar la consola
knitr::opts_chunk$set(echo = TRUE)# instruccon para publicar
pacman::p_load(dplyr, ggplot2, lubridate, scales, forecast)
# forecast preedice los valores futuros
#2 cargar la base de datps
df<-read.csv("DATA1/Sample - Superstore.csv")
head(df)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 2 2 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 3 3 CA-2016-138688 6/12/2016 6/16/2016 Second Class DV-13045
## 4 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 5 5 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 6 6 CA-2014-115812 6/9/2014 6/14/2014 Standard Class BH-11710
## Customer.Name Segment Country City State
## 1 Claire Gute Consumer United States Henderson Kentucky
## 2 Claire Gute Consumer United States Henderson Kentucky
## 3 Darrin Van Huff Corporate United States Los Angeles California
## 4 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 5 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 6 Brosina Hoffman Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 42420 South FUR-BO-10001798 Furniture Bookcases
## 2 42420 South FUR-CH-10000454 Furniture Chairs
## 3 90036 West OFF-LA-10000240 Office Supplies Labels
## 4 33311 South FUR-TA-10000577 Furniture Tables
## 5 33311 South OFF-ST-10000760 Office Supplies Storage
## 6 90032 West FUR-FU-10001487 Furniture Furnishings
## Product.Name Sales
## 1 Bush Somerset Collection Bookcase 261.9600
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3 Self-Adhesive Address Labels for Typewriters by Universal 14.6200
## 4 Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5 Eldon Fold 'N Roll Cart System 22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood 48.8600
## Quantity Discount Profit
## 1 2 0.00 41.9136
## 2 3 0.00 219.5820
## 3 2 0.00 6.8714
## 4 5 0.45 -383.0310
## 5 2 0.20 2.5164
## 6 7 0.00 14.1694
#3construccion de las variables para series temporales
#vamos hacer una prediccion de ventas mensuales
df_reg <- df %>% #llamar a df
filter(State == "Texas") %>%
select(Order.Date, Sales) %>% #elegir dos columnas
mutate(Order.Date = mdy(Order.Date)) %>% #convertir a fecha
mutate(Fecha = floor_date(Order.Date, unit = "month")) %>% #obtener los meses
group_by(Fecha) %>% #agrupar por cada mes
summarise(Ventas = sum(Sales)) #sumar ventas agrupadas por mes
head(df_reg)
## # A tibble: 6 × 2
## Fecha Ventas
## <date> <dbl>
## 1 2014-01-01 144.
## 2 2014-02-01 74.4
## 3 2014-03-01 2896.
## 4 2014-04-01 2723.
## 5 2014-05-01 1955.
## 6 2014-06-01 2586.
#4 Convertir a serie de tiempo
#convertir a serie de tiempo con ts()
ts_data<- ts(df_reg$Ventas, #llamar la columna ventas de df_reg
#ingresar año y mes de inicio start=c(año,mes)
start = c(year(min(df_reg$Fecha)), month(min(df_reg$Fecha))), #frecuencia es de 12 porqe son datos mensuales
frequency = 12)
(ts_data)
## Jan Feb Mar Apr May Jun
## 2014 144.1500 74.3640 2895.8180 2722.7720 1955.3570 2586.3196
## 2015 2084.2676 793.8940 2746.6240 3584.9320 3062.7860 1636.8280
## 2016 441.1880 331.4760 8013.8050 4520.6520 2388.3540 988.8860
## 2017 3210.4280 1399.6024 7950.3968 626.6676 3434.8752 3326.8122
## Jul Aug Sep Oct Nov Dec
## 2014 6443.6320 573.8060 19135.3578 3793.3100 6385.7722 3914.5180
## 2015 533.7600 4468.7232 2699.1140 4151.5910 4336.0800 4356.3592
## 2016 3387.7060 4847.5568 6652.3014 980.5160 5489.0868 3644.6240
## 2017 2154.7900 2566.7920 3939.9660 4143.6502 7968.4470 2699.3308
#5 ajustar el modelo SARIMA
#Ajustar modelo sarima con funcion auto.arima()
fit_arima <- auto.arima(y= ts_data, #llam ar a serie de tiempo ts data
seasonal = TRUE, #estacional = TRUE porque mensuales
trace = TRUE, #lista de modelo candidatos
ic = "aic", #criterio de informacion de Akaike
approximation = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2)(1,0,1)[12] with non-zero mean : Inf
## ARIMA(0,0,0) with non-zero mean : 910.7992
## ARIMA(1,0,0)(1,0,0)[12] with non-zero mean : 874.3151
## ARIMA(0,0,1)(0,0,1)[12] with non-zero mean : 913.784
## ARIMA(0,0,0) with zero mean : 949.6195
## ARIMA(1,0,0) with non-zero mean : 911.7766
## ARIMA(1,0,0)(1,0,1)[12] with non-zero mean : 872.0449
## ARIMA(1,0,0)(0,0,1)[12] with non-zero mean : 913.3394
## ARIMA(0,0,0)(1,0,1)[12] with non-zero mean : 867.4026
## ARIMA(0,0,0)(0,0,1)[12] with non-zero mean : 912.4456
## ARIMA(0,0,0)(1,0,0)[12] with non-zero mean : 872.8962
## ARIMA(0,0,1)(1,0,1)[12] with non-zero mean : 869.3639
## ARIMA(1,0,1)(1,0,1)[12] with non-zero mean : Inf
## ARIMA(0,0,0)(1,0,1)[12] with zero mean : 905.8729
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(0,0,0)(1,0,1)[12] with non-zero mean : Inf
## ARIMA(0,0,1)(1,0,1)[12] with non-zero mean : Inf
## ARIMA(1,0,0)(1,0,1)[12] with non-zero mean : Inf
## ARIMA(0,0,0)(1,0,0)[12] with non-zero mean : 911.8561
##
## Best model: ARIMA(0,0,0)(1,0,0)[12] with non-zero mean
fit_arima
## Series: ts_data
## ARIMA(0,0,0)(1,0,0)[12] with non-zero mean
##
## Coefficients:
## sar1 mean
## 0.2035 3588.0824
## s.e. 0.2046 516.8842
##
## sigma^2 = 9493428: log likelihood = -452.93
## AIC=911.86 AICc=912.4 BIC=917.47
# predecir 12 valores con la funcion forecast
# La cantidad de valores a predecir depende del modelo elegido
prediccion_sarima <- forecast(fit_arima, h=12)
#crear un data frame con los valores de las prediccciones con intervalos de confianza del 95%
df_futuro_sarima<-data.frame(
Fecha = seq.Date(from = max(df_reg$Fecha) + months(1), by = "month", length.out = 12),
Ventas = prediccion_sarima$mean,
Bajo = prediccion_sarima$lower[,2],
Alto = prediccion_sarima$upper[,2]
)
# union de series de tiempo en una sola
df_total_sarima<- bind_rows(df_reg,
df_futuro_sarima)
tail(df_total_sarima ,12)
## # A tibble: 12 × 4
## Fecha Ventas Bajo Alto
## <date> <dbl> <dbl> <dbl>
## 1 2018-01-01 3511. -2528. 9550.
## 2 2018-02-01 3143. -2896. 9182.
## 3 2018-03-01 4476. -1563. 10515.
## 4 2018-04-01 2985. -3054. 9024.
## 5 2018-05-01 3557. -2482. 9596.
## 6 2018-06-01 3535. -2504. 9574.
## 7 2018-07-01 3296. -2743. 9335.
## 8 2018-08-01 3380. -2659. 9419.
## 9 2018-09-01 3660. -2379. 9699.
## 10 2018-10-01 3701. -2338. 9740.
## 11 2018-11-01 4480. -1559. 10518.
## 12 2018-12-01 3407. -2632. 9446.
#7 GRAFICA # Versión corregida
ggplot() +
geom_point(data = df_reg, aes(x = Fecha, y = Ventas)) + # Valores históricos
geom_line(data = df_reg, aes(x = Fecha, y = Ventas)) + # Línea de valores históricos
geom_ribbon(data = df_futuro_sarima, aes(x = Fecha, ymin = Bajo, ymax = Alto),
fill = "lightblue", alpha = 0.8) + # Intervalos de confianza
geom_point(data = df_futuro_sarima, aes(x = Fecha, y = Ventas), color = "red") + # Valores predichos
geom_line(data = df_futuro_sarima, aes(x = Fecha, y = Ventas),
color = "red", linetype = "dashed") + # Línea de predicciones
labs(title = "Predicción de ventas con ARIMA enero-diciembre 2018",
x = "Mes", y = "Ventas")
```