# Build a reproducible synthetic sales data set: a catalogue of randomly named
# products with normally distributed prices, and `nrecords` sales of those
# products to a fixed set of customers. The result is written to "sales.csv".
set.seed(21821)
nrecords <- 250
customers <- c("Misha","Yulia","Andrew","Yaroslav", "Arina", "Phil", "Dima")
prod_name <- c("phone","iguana", "bike", "robot","pen","mouse")
prod_size <- c("big", "small", "tiny")
prod_adj <- c("beautiful","smart","expensive")
prod_color <- c("green","black")

# Full product name = size + adjective + colour + category, space separated.
# The draws deliberately stay in a per-record loop: this keeps the order of
# random number generator calls, and therefore the generated data, unchanged.
products_full_name <- character(nrecords)
for (i in seq_len(nrecords)) {
  products_full_name[i] <- paste(
    sample(prod_size, 1),
    sample(prod_adj, 1),
    sample(prod_color, 1),
    sample(prod_name, 1)
  )
}

# One price per distinct product name.
price <- rnorm(length(unique(products_full_name)), mean = 100, sd = 25)
products <- data.frame(name = unique(products_full_name), price = price)

# Sales: distinct random dates, a random product and a customer drawn with
# unequal probabilities (one weight per entry of `customers`).
Dates <- sample(as.Date(1:1000, origin = "2015-01-01"), nrecords)
sales <- data.frame(
  id = seq_len(nrecords),
  date = Dates,
  name = sample(products$name, nrecords, replace = TRUE),
  customer = sample(customers, nrecords, replace = TRUE,
                    prob = c(0.1, 0.15, 0.15, 0.1, 0.1, 0.3, 0.25))
)

# Attach the price of each sold product.
sales <- merge(sales, products, by = "name", sort = FALSE)

# Product category = last word of the full name. The pattern removes the three
# leading words together with the whitespace that follows each of them, so the
# category carries no leading space.
sales$short_name <- sub("^\\s*(\\w+\\s+){3}", "", sales$name)
sales <- sales[c("id","date","name","short_name","customer","price")]
write.csv(sales,"sales.csv")
str(sales)
## 'data.frame': 250 obs. of 6 variables:
## $ id : int 1 32 180 117 2 26 170 3 199 4 ...
## $ date : Date, format: "2015-04-06" "2015-08-27" ...
## $ name : Factor w/ 97 levels "big beautiful black bike",..: 84 84 84 84 61 3 3 3 3 54 ...
## $ short_name: chr "iguana" "iguana" "iguana" "iguana" ...
## $ customer : Factor w/ 7 levels "Andrew","Arina",..: 6 5 3 5 7 5 3 1 5 5 ...
## $ price : num 114 114 114 114 119 ...
# Number of distinct products that appear in the sales records.
length(unique(sales[["name"]]))
## [1] 93
# Number of distinct customers.
length(unique(sales[["customer"]]))
## [1] 7
# Number of sales records (one id per row).
nrow(sales)
## [1] 250
Сумарна виручка магазину
# Total revenue: the sum of prices over all sales records.
sum(sales[["price"]])
## [1] 25597.75
Що ми продаємо? Який розподіл цін?
library(ggplot2)
# Price distribution at two bin widths: coarse (25) first, then finer (10).
ggplot(data = sales, mapping = aes(x = price)) +
  geom_histogram(binwidth = 25)
ggplot(data = sales, mapping = aes(x = price)) +
  geom_histogram(binwidth = 10)
# Box plot of price per product category; box widths vary with the number of
# observations in each category (varwidth).
ggplot(sales, aes(short_name, price)) +
  geom_boxplot(varwidth = TRUE, fill = "plum") +
  labs(title="Ціна за категоріями",
       subtitle="розподіл цін за категоріями товарів",
       caption="Автор: Олексій",
       x="категорія товару",
       y="ціна")
# Individual sales by product category, coloured by customer. Points are
# jittered horizontally only (height = 0) so every point stays at its actual
# price; without it geom_jitter() also perturbs the y axis.
ggplot(sales, aes(short_name, price, color = factor(customer))) +
  geom_jitter(width = 0.2, height = 0) +
  labs(title="Ціни - категорії - покупці",
       subtitle="розподіл цін за категоріями товарів",
       caption="Автор: Олексій",
       x="категорія товару",
       y="ціна", color = "покупець")
# Number of sales per customer, rendered as a table.
customer_data <- setNames(
  aggregate(id ~ customer, data = sales, FUN = length),
  c("customer", "transaction_amount")
)
library(knitr)
kable(customer_data, format = "html")
| customer | transaction_amount |
|---|---|
| Andrew | 35 |
| Arina | 23 |
| Dima | 45 |
| Misha | 15 |
| Phil | 72 |
| Yaroslav | 22 |
| Yulia | 38 |
# Number of sales per product category, rendered as a table.
category_data <- setNames(
  aggregate(id ~ short_name, data = sales, FUN = length),
  c("short_name", "transaction_amount")
)
library(knitr)
kable(category_data, format = "html")
| short_name | transaction_amount |
|---|---|
| bike | 41 |
| iguana | 46 |
| mouse | 37 |
| pen | 44 |
| phone | 40 |
| robot | 42 |
Зробимо візуалізацію
# Bar chart of the number of sales per product category. The counts are
# already aggregated in `category_data`, so bar heights are taken as-is.
ggplot(data = category_data,
       mapping = aes(x = short_name, y = transaction_amount)) +
  geom_col(width = 0.5, fill = "tomato3") +
  labs(
    title = "Стовпчикова діаграма",
    subtitle = "кількість покупок",
    caption = "Автор: Олексій",
    x = "категорія товару",
    y = "кількість"
  )
# Same bar chart of sales per category, drawn with horizontal bars.
ggplot(data = category_data,
       mapping = aes(x = short_name, y = transaction_amount)) +
  geom_col(width = 0.5, fill = "tomato3") +
  labs(
    title = "Стовпчикова діаграма",
    subtitle = "кількість покупок",
    caption = "Автор: Олексій",
    x = "категорія товару",
    y = "кількість"
  ) +
  coord_flip()
# Total amount paid by each customer, rendered as a table.
customer_payment <- setNames(
  aggregate(price ~ customer, data = sales, FUN = sum),
  c("customer_name", "payments_total")
)
kable(customer_payment, format = "html")
| customer_name | payments_total |
|---|---|
| Andrew | 3602.952 |
| Arina | 2332.482 |
| Dima | 4434.267 |
| Misha | 1431.462 |
| Phil | 7381.784 |
| Yaroslav | 2469.839 |
| Yulia | 3944.969 |
# Pie chart of revenue share per customer: a single stacked bar drawn in
# polar coordinates, with the payment total mapped to the angle.
ggplot(customer_payment,
       aes(x = "", y = payments_total, fill = customer_name)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0)
# Total revenue per product category, rendered as a table.
item_payment <- setNames(
  aggregate(price ~ short_name, data = sales, FUN = sum),
  c("item_name", "payments_total")
)
kable(item_payment, format = "html")
| item_name | payments_total |
|---|---|
| bike | 4212.576 |
| iguana | 5109.380 |
| mouse | 3717.163 |
| pen | 4541.305 |
| phone | 4044.412 |
| robot | 3972.919 |
# Revenue per individual product (full name), sorted from highest to lowest;
# the ten best-earning products are shown.
items <- setNames(
  aggregate(price ~ name, data = sales, FUN = sum),
  c("item_name", "payments_total")
)
revenue_rank <- order(items$payments_total, decreasing = TRUE)
items <- items[revenue_rank, ]
kable(head(items, n = 10), format = "html")
| | item_name | payments_total |
|---|---|---|
| 18 | big expensive green bike | 642.5962 |
| 60 | small smart green mouse | 621.3535 |
| 11 | big beautiful green phone | 593.4394 |
| 3 | big beautiful black mouse | 587.2089 |
| 29 | big smart green pen | 576.6494 |
| 4 | big beautiful black pen | 566.3509 |
| 45 | small expensive black mouse | 553.2182 |
| 56 | small smart black pen | 544.1727 |
| 13 | big expensive black iguana | 496.9611 |
| 41 | small beautiful green phone | 490.2600 |
# Number of sales per individual product (full name), sorted from most to
# least sold; the fifteen most frequently sold products are shown.
items <- setNames(
  aggregate(price ~ name, data = sales, FUN = length),
  c("item_name", "total")
)
count_rank <- order(items$total, decreasing = TRUE)
items <- items[count_rank, ]
kable(head(items, n = 15), format = "html")
| | item_name | total |
|---|---|---|
| 11 | big beautiful green phone | 7 |
| 18 | big expensive green bike | 7 |
| 29 | big smart green pen | 6 |
| 1 | big beautiful black bike | 5 |
| 4 | big beautiful black pen | 5 |
| 37 | small beautiful green bike | 5 |
| 45 | small expensive black mouse | 5 |
| 56 | small smart black pen | 5 |
| 60 | small smart green mouse | 5 |
| 69 | tiny beautiful green bike | 5 |
| 84 | tiny expensive green robot | 5 |
| 3 | big beautiful black mouse | 4 |
| 13 | big expensive black iguana | 4 |
| 14 | big expensive black mouse | 4 |
| 15 | big expensive black pen | 4 |
# Mean price paid per sale, for each customer.
average_customer <- setNames(
  aggregate(price ~ customer, data = sales, FUN = mean),
  c("customer", "mean")
)
average_customer
## customer mean
## 1 Andrew 102.94149
## 2 Arina 101.41224
## 3 Dima 98.53927
## 4 Misha 95.43082
## 5 Phil 102.52477
## 6 Yaroslav 112.26542
## 7 Yulia 103.81497
Отже, ми хочемо обчислити середні витрати кожного покупця (це вже зроблено); тепер обчислимо середні витрати загалом. Після цього обчислимо відхилення кожного покупця від середнього і виведемо його на графік.
# Standardise each customer's mean: subtract the mean of the customer means,
# divide by their standard deviation, and round to two decimals.
average_customer$mean_z <- local({
  avg <- average_customer$mean
  round((avg - mean(avg)) / sd(avg), 2)
})
# Label customers by the sign of the standardised value (zero counts as above).
average_customer$customer_type <- ifelse(
  average_customer$mean_z >= 0, "above", "below"
)
average_customer
## customer mean mean_z customer_type
## 1 Andrew 102.94149 0.10 above
## 2 Arina 101.41224 -0.19 below
## 3 Dima 98.53927 -0.74 below
## 4 Misha 95.43082 -1.34 below
## 5 Phil 102.52477 0.02 above
## 6 Yaroslav 112.26542 1.88 above
## 7 Yulia 103.81497 0.27 above
# Diverging horizontal bars: standardised mean per customer, filled by
# whether the customer is above or below the average.
ggplot(data = average_customer,
       mapping = aes(x = customer, y = mean_z, label = customer_type)) +
  geom_col(mapping = aes(fill = customer_type), width = 0.5) +
  labs(
    title = "Відхилення для покупців",
    subtitle = "нормалізований чек",
    caption = "Автор: Олексій",
    x = "покупець",
    y = "відхилення",
    fill = "середній чек"
  ) +
  coord_flip()
Ще один цікавий графік
# Number of sales per customer as horizontal bars, stacked by product category.
ggplot(data = sales, mapping = aes(x = customer)) +
  geom_bar(mapping = aes(fill = short_name), width = 0.5) +
  labs(
    title = "Гістограма",
    subtitle = "категорії товарів",
    x = "покупець",
    y = "кількість",
    fill = "категорія товарів"
  ) +
  coord_flip()
# Every sale as a point: date on x, full product name on y, coloured by
# customer and shaped by product category.
ggplot(data = sales, mapping = aes(x = date, y = name)) +
  geom_point(mapping = aes(color = customer, shape = short_name))
Потрібно якось зрозуміти, як відображати часові дані.
# Derive calendar keys from the sale date: year ("YYYY") and month ("YYYY-MM").
sales$year <- format(sales$date, "%Y")
sales$month <- format(sales$date, "%Y-%m")
# Total revenue per year.
year_payment <- setNames(
  aggregate(price ~ year, data = sales, FUN = sum),
  c("year", "sum")
)
year_payment
## year sum
## 1 2015 9384.951
## 2 2016 8976.493
## 3 2017 7236.310
# Total revenue per calendar month ("YYYY-MM").
month_payment <- setNames(
  aggregate(price ~ month, data = sales, FUN = sum),
  c("month", "sum")
)
month_payment
## month sum
## 1 2015-01 1163.6315
## 2 2015-02 472.5188
## 3 2015-03 673.3094
## 4 2015-04 946.9831
## 5 2015-05 859.0457
## 6 2015-06 728.9208
## 7 2015-07 736.8629
## 8 2015-08 1344.7883
## 9 2015-09 666.3613
## 10 2015-10 649.8126
## 11 2015-11 643.7412
## 12 2015-12 498.9753
## 13 2016-01 885.2285
## 14 2016-02 541.4691
## 15 2016-03 410.4911
## 16 2016-04 1188.5442
## 17 2016-05 593.5449
## 18 2016-06 472.5366
## 19 2016-07 909.4994
## 20 2016-08 1011.5961
## 21 2016-09 957.8938
## 22 2016-10 925.2607
## 23 2016-11 609.6757
## 24 2016-12 470.7532
## 25 2017-01 511.0675
## 26 2017-02 577.1886
## 27 2017-03 1537.8640
## 28 2017-04 588.3977
## 29 2017-05 1150.0917
## 30 2017-06 679.5739
## 31 2017-07 994.0519
## 32 2017-08 435.0942
## 33 2017-09 762.9810
# Split the "YYYY-MM" key: keep the part before the dash as the year.
month_payment$year <- sub("-\\w*$", "", month_payment$month)
# First plot: revenue against the full "YYYY-MM" key.
ggplot(data = month_payment, mapping = aes(x = month, y = sum)) +
  geom_point()
# Replace the key with the month number only (the part after the dash), then
# plot again: points from different years now share the same x position.
month_payment$month <- sub("^\\w*-", "", month_payment$month)
ggplot(data = month_payment, mapping = aes(x = month, y = sum)) +
  geom_point()
Перша можлива діаграма виручки помісячно за роками.
# Monthly revenue as bars stacked by year. `year` is a character column, so it
# is already discrete; fill is mapped once, at the plot level, instead of
# twice with two different expressions.
ggplot(month_payment, aes(x = month, y = sum, fill = year)) +
  geom_col()
Другий варіант
# Monthly revenue with one bar per year placed side by side. `year` is a
# character column, so it is already discrete; fill is mapped once, at the
# plot level, instead of twice with two different expressions.
ggplot(month_payment, aes(x = month, y = sum, fill = year)) +
  geom_col(position = "dodge")