#1 CARGAR LOS PAQUETES NECESARIOS
rm(list = ls()) #se pone para limpiar la consola
knitr::opts_chunk$set(echo = TRUE) #instrucción para publicar
pacman::p_load(dplyr, ggplot2, lubridate, scales, forecast)
#2 CARGAR BASE DE DATOS
df<- read.csv("DATA/Sample - Superstore.csv")
head (df)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 2 2 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 3 3 CA-2016-138688 6/12/2016 6/16/2016 Second Class DV-13045
## 4 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 5 5 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 6 6 CA-2014-115812 6/9/2014 6/14/2014 Standard Class BH-11710
## Customer.Name Segment Country City State
## 1 Claire Gute Consumer United States Henderson Kentucky
## 2 Claire Gute Consumer United States Henderson Kentucky
## 3 Darrin Van Huff Corporate United States Los Angeles California
## 4 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 5 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 6 Brosina Hoffman Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 42420 South FUR-BO-10001798 Furniture Bookcases
## 2 42420 South FUR-CH-10000454 Furniture Chairs
## 3 90036 West OFF-LA-10000240 Office Supplies Labels
## 4 33311 South FUR-TA-10000577 Furniture Tables
## 5 33311 South OFF-ST-10000760 Office Supplies Storage
## 6 90032 West FUR-FU-10001487 Furniture Furnishings
## Product.Name Sales
## 1 Bush Somerset Collection Bookcase 261.9600
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3 Self-Adhesive Address Labels for Typewriters by Universal 14.6200
## 4 Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5 Eldon Fold 'N Roll Cart System 22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood 48.8600
## Quantity Discount Profit
## 1 2 0.00 41.9136
## 2 3 0.00 219.5820
## 3 2 0.00 6.8714
## 4 5 0.45 -383.0310
## 5 2 0.20 2.5164
## 6 7 0.00 14.1694
#3 Construcción de las variables para series temporales
# vamos ha hacer una prediccion de ventas mensuales
df_reg <- df %>% #llamar a df
filter(State == "California") %>%
select(Order.Date, Sales) %>% #elegir dos columnas
mutate(Order.Date = mdy(Order.Date)) %>% #convertir a fecha
mutate(Fecha = floor_date(Order.Date, unit = "month")) %>% #obtener los meses
group_by(Fecha) %>% #agrupar por cada mes
summarise(Ventas = sum(Sales)) #sumar ventas agrupadas por mes
head(df_reg)
## # A tibble: 6 × 2
## Fecha Ventas
## <date> <dbl>
## 1 2014-01-01 2455.
## 2 2014-02-01 309.
## 3 2014-03-01 7239.
## 4 2014-04-01 8165.
## 5 2014-05-01 5960.
## 6 2014-06-01 4379.
# Convertir a serie de tiempo con ts()
ts_data <- ts(df_reg$Ventas, #llamar la columna de ventas de df_reg
# ingresar año y mes de inicio start=c(año,mes)
start = c(year(min(df_reg$Fecha)), month(min(df_reg$Fecha))),
#frecuencia es 12 por que son datos mensuales
frequency = 12)
(ts_data)
## Jan Feb Mar Apr May Jun Jul
## 2014 2455.185 308.702 7239.096 8165.181 5960.020 4379.324 16231.651
## 2015 5889.954 1198.714 5852.804 8469.552 8055.575 3018.538 4146.832
## 2016 2876.464 3085.794 15857.442 6297.218 5344.928 16120.423 8605.369
## 2017 5163.432 6864.883 11331.194 8784.206 10157.356 9086.966 12923.859
## Aug Sep Oct Nov Dec
## 2014 5596.174 4646.597 4582.182 13690.062 18049.357
## 2015 8454.083 8603.863 9836.189 9392.437 15525.303
## 2016 9573.000 16153.979 7163.222 17896.488 22577.584
## 2017 14534.322 18866.852 15776.301 13580.951 19318.023
#ajustar modelo sarima con funcion auto.arima()
fit_arima <- auto.arima(y = ts_data, #Llamar a serie de tiempo ts-data
seasonal = TRUE, #Estacional = TRUE porque mensuales
trace = TRUE, #Lista de modelos candidatos
ic = "aic", #Criterio de informacion de Akaike
approximation = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2)(1,1,1)[12] with drift : Inf
## ARIMA(0,0,0)(0,1,0)[12] with drift : 514.8832
## ARIMA(1,0,0)(1,1,0)[12] with drift : 495.5661
## ARIMA(0,0,1)(0,1,1)[12] with drift : 508.334
## ARIMA(0,0,0)(0,1,0)[12] : 515.9887
## ARIMA(1,0,0)(0,1,0)[12] with drift : 517.6598
## ARIMA(1,0,0)(1,1,1)[12] with drift : 497.2381
## ARIMA(1,0,0)(0,1,1)[12] with drift : 509.7639
## ARIMA(0,0,0)(1,1,0)[12] with drift : 498.0364
## ARIMA(2,0,0)(1,1,0)[12] with drift : 488.968
## ARIMA(2,0,0)(0,1,0)[12] with drift : 518.9726
## ARIMA(2,0,0)(1,1,1)[12] with drift : 490.875
## ARIMA(2,0,0)(0,1,1)[12] with drift : 510.8245
## ARIMA(3,0,0)(1,1,0)[12] with drift : 482.7104
## ARIMA(3,0,0)(0,1,0)[12] with drift : 521.2036
## ARIMA(3,0,0)(1,1,1)[12] with drift : 484.306
## ARIMA(3,0,0)(0,1,1)[12] with drift : 513.9044
## ARIMA(4,0,0)(1,1,0)[12] with drift : 477.1084
## ARIMA(4,0,0)(0,1,0)[12] with drift : 523.3029
## ARIMA(4,0,0)(1,1,1)[12] with drift : 479.0217
## ARIMA(4,0,0)(0,1,1)[12] with drift : 515.2304
## ARIMA(5,0,0)(1,1,0)[12] with drift : 460.423
## ARIMA(5,0,0)(0,1,0)[12] with drift : 519.7894
## ARIMA(5,0,0)(1,1,1)[12] with drift : 456.9196
## ARIMA(5,0,0)(0,1,1)[12] with drift : 515.16
## ARIMA(5,0,1)(1,1,1)[12] with drift : Inf
## ARIMA(4,0,1)(1,1,1)[12] with drift : Inf
## ARIMA(5,0,0)(1,1,1)[12] : 488.8122
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(5,0,0)(1,1,1)[12] with drift : 714.7413
##
## Best model: ARIMA(5,0,0)(1,1,1)[12] with drift
fit_arima
## Series: ts_data
## ARIMA(5,0,0)(1,1,1)[12] with drift
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 sar1 sma1 drift
## 0.0075 -0.2666 0.2745 -0.2083 0.1331 -0.6193 -0.1311 161.0977
## s.e. 0.1719 0.1694 0.1688 0.1752 0.1923 0.3123 0.4735 30.6645
##
## sigma^2 = 15124086: log likelihood = -348.37
## AIC=714.74 AICc=721.66 BIC=728.99
# Predecir 12 valores con la funcion forecast
# La cantidad de valores a predecir depende del modelo elegido
prediccion_sarima <- forecast(fit_arima, h=12)
# Crear un data frame con los valores de las predicciones con intervalos de confianza del 95%
df_futuro_sarima <- data.frame(
Fecha = seq.Date(from = max(df_reg$Fecha) + months(1), by = "month", length.out = 12),
Ventas = prediccion_sarima$mean,
Bajo = prediccion_sarima$lower[,2],
Alto = prediccion_sarima$upper[,2]
)
# union de series de tiempo en una sola
df_total_sarima <- bind_rows(df_reg, df_futuro_sarima)
tail(df_total_sarima, 12)
## # A tibble: 12 × 4
## Fecha Ventas Bajo Alto
## <date> <dbl> <dbl> <dbl>
## 1 2018-01-01 8009. 386. 15631.
## 2 2018-02-01 7336. -287. 14958.
## 3 2018-03-01 17552. 9664. 25441.
## 4 2018-04-01 11056. 2902. 19209.
## 5 2018-05-01 9930. 1714. 18147.
## 6 2018-06-01 16765. 8548. 24982.
## 7 2018-07-01 13143. 4829. 21458.
## 8 2018-08-01 14079. 5712. 22446.
## 9 2018-09-01 19754. 11386. 28122.
## 10 2018-10-01 13130. 4748. 21512.
## 11 2018-11-01 19526. 11126. 27927.
## 12 2018-12-01 24781. 16376. 33186.
#7 GRÁFICA
ggplot(df_total_sarima, aes(x = Fecha, y = Ventas)) +
geom_point(data = df_reg) + #valores historicos
geom_line(data = df_reg) + #linea de valores historicos
geom_ribbon(data = df_futuro_sarima, aes(ymin = Bajo, ymax = Alto), fill = "lightblue",
alpha = 0.8) + #intervalos de confianza
geom_point(data = df_futuro_sarima, color = "red") + #valores predichos
geom_line(data = df_futuro_sarima, color = "red", linetype = "dashed") + #linea de predicciones
labs(title = "prediccion de ventas con arima enero-diciembre 2018",
x = "Mes",
y = "Ventas")