Прочтём таблицы и зададим англоязычные имена столбцам:
library(tidyverse)
library(readxl)
library(magrittr)
remains = read_xlsx("remains.xlsx")
colnames(remains) = c("id", "size", "free_count", "self_cost", "sell_cost")
print(remains)
## # A tibble: 10,328 x 5
## id size free_count self_cost sell_cost
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 010112159012 0100 M 118 1211 2846
## 2 010112159052 01P0 L 247 465 1311
## 3 010112159052 01P0 M 134 763 1564
## 4 010112159052 01P0 S 93 762 1631
## 5 010112159052 58P0 XXL 20 342 698
## 6 010113295022 6000 M 121 341 706
## 7 010113295032 5000 XL 202 1194 2758
## 8 010113295102 91M0 L 363 620 1612
## 9 010113295102 91M0 M 317 1254 2558
## 10 010113295102 91M0 XL 267 895 2363
## # ... with 10,318 more rows
sells = read_xlsx("sells.xlsx")
colnames(sells) = c("id_old", "size", "mounth", "sell_count")
print(sells)
## # A tibble: 60,074 x 4
## id_old size mounth sell_count
## <chr> <chr> <dbl> <dbl>
## 1 02025727Y052D0/55 XS 9 88
## 2 02025727Y052D0/55 S 6 128
## 3 020245D5Z022/55D0 M 7 83
## 4 020245D5Z022/55D0 M 9 68
## 5 020245D5Z022/55D0 L 6 74
## 6 020245D5Z022/55D0 L 7 104
## 7 020245D5Z022/55D0 L 10 66
## 8 02025727Y052D0/50 XS 11 93
## 9 02025727Y052D0/55 S 8 67
## 10 220235D4S02151D0 M 11 65
## # ... with 60,064 more rows
В соотвествии с требованиями преобразуем старые артикулы к новым и удалим пробел из новых:
# prepare ids
new.id = sells$id_old
for (index in seq_along(new.id)) {
id = new.id[index]
if (grepl("/", id)) {
pair = strsplit(id, "/")[[1]]
if (nchar(pair[1]) == 12) {
id = paste0(pair[1], pair[2])
} else if (nchar(pair[1]) == 14) {
left12 = substr(pair[1], 1, 12)
last2 = substr(pair[1], 13, 14)
right = substr(pair[2], 1, 2)
id = paste0(left12, right, last2)
} else {
id = pair[1]
}
}
new.id[index] = id
}
sells$id_old = new.id
colnames(sells)[1] = "id"
remains$id = str_remove(remains$id, " ")
print(sells)
## # A tibble: 60,074 x 4
## id size mounth sell_count
## <chr> <chr> <dbl> <dbl>
## 1 02025727Y05255D0 XS 9 88
## 2 02025727Y05255D0 S 6 128
## 3 020245D5Z02255D0 M 7 83
## 4 020245D5Z02255D0 M 9 68
## 5 020245D5Z02255D0 L 6 74
## 6 020245D5Z02255D0 L 7 104
## 7 020245D5Z02255D0 L 10 66
## 8 02025727Y05250D0 XS 11 93
## 9 02025727Y05255D0 S 8 67
## 10 220235D4S02151D0 M 11 65
## # ... with 60,064 more rows
Вставим недостающие строки и сольём таблицы:
sell_unique = unique(sells$id)
remains_unique = unique(remains$id)
diffs = setdiff(sell_unique, remains_unique)
sizes = sapply(diffs, function(d) sells[sells$id == d, 2][[1]][[1]])
remains = rbind(remains, tibble(id = diffs, size = sizes, free_count = 0, self_cost = 200,
sell_cost = 1000))
merged = merge(sells, remains)
Общая сумма продаж в рублях за июль:
sum_july = merged %>% filter(mounth == 7) %>% summarise(s = sum(sell_count * sell_cost))
print(sum_july[[1]])
## [1] 1544113364
Общая сумма прибыли в рублях в октябре:
sum_october = merged %>% filter(mounth == 10) %>% summarise(s = sum(sell_count *
(sell_cost - self_cost)))
print(sum_october[[1]])
## [1] 929184850
5-й артикул в рейтинге продаж в августе:
august = merged %>% filter(mounth == 8) %>% group_by(id) %>% summarise(r = sum(sell_count *
sell_cost)) %>% arrange(-r)
print(august[5, 1])
## # A tibble: 1 x 1
## id
## <chr>
## 1 42025327F5637500
Остаток на складе ТОП-10 артикулов в августе:
remain = sapply(august$id[1:10], function(d) merged$free_count[merged$id == d][[1]][[1]])
print(sum(remain))
## [1] 3368
График распределения количества буквенных размеров:
tb = merged %>% select(id, size) %>% filter(str_detect(size, "[XSML]")) %>% group_by(size) %>%
summarise(count = n())
ggplot(tb, aes(x = factor(size), y = count)) + geom_col() + theme_bw() + labs(x = "size",
title = "Count of products by size") + annotate("text", x = 1:9, y = tb$count +
150, label = as.character(tb$count), size = 5)