# install tidyverse for import and wrangling
#ggplot for visualization
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data <- read_csv("2019 Winter Data Science Intern Challenge Data Set - Sheet1.csv")
## Rows: 5000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): payment_method, created_at
## dbl (5): order_id, shop_id, user_id, order_amount, total_items
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
anyNA(data)
## [1] FALSE
summary(data)
## order_id shop_id user_id order_amount
## Min. : 1 Min. : 1.00 Min. :607.0 Min. : 90
## 1st Qu.:1251 1st Qu.: 24.00 1st Qu.:775.0 1st Qu.: 163
## Median :2500 Median : 50.00 Median :849.0 Median : 284
## Mean :2500 Mean : 50.08 Mean :849.1 Mean : 3145
## 3rd Qu.:3750 3rd Qu.: 75.00 3rd Qu.:925.0 3rd Qu.: 390
## Max. :5000 Max. :100.00 Max. :999.0 Max. :704000
## total_items payment_method created_at
## Min. : 1.000 Length:5000 Length:5000
## 1st Qu.: 1.000 Class :character Class :character
## Median : 2.000 Mode :character Mode :character
## Mean : 8.787
## 3rd Qu.: 3.000
## Max. :2000.000
# Order amount and total items Very high compared to Medians #
# AOV Calculated
mean(data$order_amount)
## [1] 3145.128
median(data$order_amount)
## [1] 284
# Calculate AOV per shop to find any outliers or mistakes
aov_pershop <- data %>% group_by(shop_id) %>%
summarize(aov_per_shop = mean(order_amount)) %>%
arrange(desc(aov_per_shop))
head(aov_pershop, 10)
## # A tibble: 10 × 2
## shop_id aov_per_shop
## <dbl> <dbl>
## 1 42 235101.
## 2 78 49213.
## 3 50 404.
## 4 90 403.
## 5 38 391.
## 6 81 384
## 7 6 384.
## 8 89 379.
## 9 33 376.
## 10 51 362.
# plot aov per shop to see if shop 42 and 78 can be deemed outliers
ggplot(aov_pershop) +
aes(x = "", y = aov_pershop$aov_per_shop) +
geom_boxplot() +
coord_trans(y = "log10") +
scale_y_continuous(breaks=c(150, 250, 350, 500, 20000)) +
ylab("Average Order Value by shop") +
xlab("")+
theme_bw()
## Warning: Use of `aov_pershop$aov_per_shop` is discouraged. Use `aov_per_shop`
## instead.
