df <- read_excel("Historical Product Demand.xlsx") %>%
clean_names()
df$date <- as.Date(df$date)
str(df)
## tibble [1,715 × 5] (S3: tbl_df/tbl/data.frame)
## $ product_code : chr [1:1715] "Product_0002" "Product_0002" "Product_0002" "Product_0002" ...
## $ warehouse : chr [1:1715] "Whse_S" "Whse_S" "Whse_S" "Whse_S" ...
## $ product_category: chr [1:1715] "Category_005" "Category_005" "Category_005" "Category_005" ...
## $ date : Date[1:1715], format: "2012-03-01" "2012-05-02" ...
## $ order_demand : num [1:1715] 50000 50000 30000 10000 30000 50000 15000 20000 25000 30000 ...
desc_prod <- df %>%
group_by(product_code) %>%
summarise(
media_demanda = mean(order_demand, na.rm = TRUE),
sd_demanda = sd(order_demand, na.rm = TRUE),
mediana = median(order_demand, na.rm = TRUE)
)
desc_prod
## # A tibble: 4 × 4
## product_code media_demanda sd_demanda mediana
## <chr> <dbl> <dbl> <dbl>
## 1 Product_0001 758. 1026. 300
## 2 Product_0002 19458. 35296. 10000
## 3 Product_0003 355. 412. 300
## 4 Product_0004 369. 557. 300
df_mes <- df %>%
mutate(mes = floor_date(date, "month")) %>%
group_by(mes, product_code) %>%
summarise(demanda_mensual = sum(order_demand, na.rm = TRUE))
## `summarise()` has grouped output by 'mes'. You can override using the `.groups`
## argument.
df_mes
## # A tibble: 239 × 3
## # Groups: mes [61]
## mes product_code demanda_mensual
## <date> <chr> <dbl>
## 1 2011-12-01 Product_0001 300
## 2 2011-12-01 Product_0002 221000
## 3 2012-01-01 Product_0001 9700
## 4 2012-01-01 Product_0002 65000
## 5 2012-01-01 Product_0003 400
## 6 2012-01-01 Product_0004 300
## 7 2012-02-01 Product_0001 13000
## 8 2012-02-01 Product_0002 19000
## 9 2012-02-01 Product_0003 1200
## 10 2012-02-01 Product_0004 1000
## # ℹ 229 more rows
g1 <- df_mes %>%
filter(product_code == "Product_0002") %>%
ggplot(aes(x = mes, y = demanda_mensual)) +
geom_line(color = "blue", linewidth = 1.2) +
geom_point(color = "darkblue", size = 2) +
theme_minimal() +
labs(
title = "Demanda mensual del Producto 0002",
x = "Mes",
y = "Demanda"
)
g1
df_prod2 <- df %>%
filter(product_code == "Product_0002")
df_prod2 %>% summarise(registros = n())
## # A tibble: 1 × 1
## registros
## <int>
## 1 450
ggplot(df_prod2, aes(x = warehouse, y = order_demand, fill = warehouse)) +
geom_boxplot(alpha = 0.7) +
theme_minimal() +
labs(
title = "Demanda por almacén - Product_0002",
x = "Almacén",
y = "Demanda"
)
anova_prod2 <- aov(order_demand ~ warehouse, data = df_prod2)
summary(anova_prod2)
## Df Sum Sq Mean Sq F value Pr(>F)
## warehouse 1 3.137e+10 3.137e+10 26.62 3.73e-07 ***
## Residuals 448 5.280e+11 1.179e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova_prod2)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = order_demand ~ warehouse, data = df_prod2)
##
## $warehouse
## diff lwr upr p adj
## Whse_S-Whse_C 21427.89 13265.32 29590.45 4e-07
Al resultar un valor p del análisis de varianza de \(3.73\times 10^{-7}\), este número es demasiado pequeño para rechazar la hipótesis nula, la cuál supone que, en efecto, las diferencias en la demanda entre cada almacén son significativas y no se deben al azar.