Data analytics sobes

Чтение и обработка данных
Задача 1
Задача 2
Задача 3
Задача 4
Задача 5

Чтение и обработка данных

Прочтём таблицы и зададим англоязычные имена столбцам:

library(tidyverse)
library(readxl)
library(magrittr)

remains = read_xlsx("remains.xlsx")
colnames(remains) = c("id", "size", "free_count", "self_cost", "sell_cost")

print(remains)

## # A tibble: 10,328 x 5
##    id                size  free_count self_cost sell_cost
##    <chr>             <chr>      <dbl>     <dbl>     <dbl>
##  1 010112159012 0100 M            118      1211      2846
##  2 010112159052 01P0 L            247       465      1311
##  3 010112159052 01P0 M            134       763      1564
##  4 010112159052 01P0 S             93       762      1631
##  5 010112159052 58P0 XXL           20       342       698
##  6 010113295022 6000 M            121       341       706
##  7 010113295032 5000 XL           202      1194      2758
##  8 010113295102 91M0 L            363       620      1612
##  9 010113295102 91M0 M            317      1254      2558
## 10 010113295102 91M0 XL           267       895      2363
## # ... with 10,318 more rows

sells = read_xlsx("sells.xlsx")
colnames(sells) = c("id_old", "size", "mounth", "sell_count")

print(sells)

## # A tibble: 60,074 x 4
##    id_old            size  mounth sell_count
##    <chr>             <chr>  <dbl>      <dbl>
##  1 02025727Y052D0/55 XS         9         88
##  2 02025727Y052D0/55 S          6        128
##  3 020245D5Z022/55D0 M          7         83
##  4 020245D5Z022/55D0 M          9         68
##  5 020245D5Z022/55D0 L          6         74
##  6 020245D5Z022/55D0 L          7        104
##  7 020245D5Z022/55D0 L         10         66
##  8 02025727Y052D0/50 XS        11         93
##  9 02025727Y052D0/55 S          8         67
## 10 220235D4S02151D0  M         11         65
## # ... with 60,064 more rows

В соотвествии с требованиями преобразуем старые артикулы к новым и удалим пробел из новых:

# prepare ids

new.id = sells$id_old

for (index in seq_along(new.id)) {
    
    id = new.id[index]
    
    if (grepl("/", id)) {
        pair = strsplit(id, "/")[[1]]
        
        if (nchar(pair[1]) == 12) {
            id = paste0(pair[1], pair[2])
        } else if (nchar(pair[1]) == 14) {
            
            left12 = substr(pair[1], 1, 12)
            last2 = substr(pair[1], 13, 14)
            right = substr(pair[2], 1, 2)
            
            id = paste0(left12, right, last2)
            
        } else {
            id = pair[1]
        }
        
    }
    
    new.id[index] = id
}


sells$id_old = new.id
colnames(sells)[1] = "id"

remains$id = str_remove(remains$id, " ")

print(sells)

## # A tibble: 60,074 x 4
##    id               size  mounth sell_count
##    <chr>            <chr>  <dbl>      <dbl>
##  1 02025727Y05255D0 XS         9         88
##  2 02025727Y05255D0 S          6        128
##  3 020245D5Z02255D0 M          7         83
##  4 020245D5Z02255D0 M          9         68
##  5 020245D5Z02255D0 L          6         74
##  6 020245D5Z02255D0 L          7        104
##  7 020245D5Z02255D0 L         10         66
##  8 02025727Y05250D0 XS        11         93
##  9 02025727Y05255D0 S          8         67
## 10 220235D4S02151D0 M         11         65
## # ... with 60,064 more rows

Вставим недостающие строки и сольём таблицы:

sell_unique = unique(sells$id)
remains_unique = unique(remains$id)

diffs = setdiff(sell_unique, remains_unique)

sizes = sapply(diffs, function(d) sells[sells$id == d, 2][[1]][[1]])

remains = rbind(remains, tibble(id = diffs, size = sizes, free_count = 0, self_cost = 200, 
    sell_cost = 1000))



merged = merge(sells, remains)

Задача 1

Общая сумма продаж в рублях за июль:

sum_july = merged %>% filter(mounth == 7) %>% summarise(s = sum(sell_count * sell_cost))

print(sum_july[[1]])

## [1] 1544113364

Задача 2

Общая сумма прибыли в рублях в октябре:

sum_october = merged %>% filter(mounth == 10) %>% summarise(s = sum(sell_count * 
    (sell_cost - self_cost)))

print(sum_october[[1]])

## [1] 929184850

Задача 3

5-й артикул в рейтинге продаж в августе:

august = merged %>% filter(mounth == 8) %>% group_by(id) %>% summarise(r = sum(sell_count * 
    sell_cost)) %>% arrange(-r)

print(august[5, 1])

## # A tibble: 1 x 1
##   id              
##   <chr>           
## 1 42025327F5637500

Задача 4

Остаток на складе ТОП-10 артикулов в августе:

remain = sapply(august$id[1:10], function(d) merged$free_count[merged$id == d][[1]][[1]])

print(sum(remain))

## [1] 3368

Задача 5

График распределения количества буквенных размеров:

tb = merged %>% select(id, size) %>% filter(str_detect(size, "[XSML]")) %>% group_by(size) %>% 
    summarise(count = n())

ggplot(tb, aes(x = factor(size), y = count)) + geom_col() + theme_bw() + labs(x = "size", 
    title = "Count of products by size") + annotate("text", x = 1:9, y = tb$count + 
    150, label = as.character(tb$count), size = 5)