# install tidyverse for import and wrangling
#ggplot for visualization
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
data <- read_csv("2019 Winter Data Science Intern Challenge Data Set - Sheet1.csv")
## Rows: 5000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): payment_method, created_at
## dbl (5): order_id, shop_id, user_id, order_amount, total_items
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
anyNA(data)
## [1] FALSE
summary(data)
##     order_id       shop_id          user_id       order_amount   
##  Min.   :   1   Min.   :  1.00   Min.   :607.0   Min.   :    90  
##  1st Qu.:1251   1st Qu.: 24.00   1st Qu.:775.0   1st Qu.:   163  
##  Median :2500   Median : 50.00   Median :849.0   Median :   284  
##  Mean   :2500   Mean   : 50.08   Mean   :849.1   Mean   :  3145  
##  3rd Qu.:3750   3rd Qu.: 75.00   3rd Qu.:925.0   3rd Qu.:   390  
##  Max.   :5000   Max.   :100.00   Max.   :999.0   Max.   :704000  
##   total_items       payment_method      created_at       
##  Min.   :   1.000   Length:5000        Length:5000       
##  1st Qu.:   1.000   Class :character   Class :character  
##  Median :   2.000   Mode  :character   Mode  :character  
##  Mean   :   8.787                                        
##  3rd Qu.:   3.000                                        
##  Max.   :2000.000
# Order amount and total items Very high compared to Medians #

# AOV Calculated
mean(data$order_amount)
## [1] 3145.128
median(data$order_amount)
## [1] 284
# Calculate AOV per shop to find any outliers or mistakes
aov_pershop <- data %>% group_by(shop_id) %>%
  summarize(aov_per_shop = mean(order_amount)) %>%
  arrange(desc(aov_per_shop))
head(aov_pershop, 10)
## # A tibble: 10 × 2
##    shop_id aov_per_shop
##      <dbl>        <dbl>
##  1      42      235101.
##  2      78       49213.
##  3      50         404.
##  4      90         403.
##  5      38         391.
##  6      81         384 
##  7       6         384.
##  8      89         379.
##  9      33         376.
## 10      51         362.
# plot aov per shop to see if shop 42 and 78 can be deemed outliers
ggplot(aov_pershop) +
  aes(x = "", y = aov_pershop$aov_per_shop) +
  geom_boxplot() +
  coord_trans(y = "log10") +
  scale_y_continuous(breaks=c(150, 250, 350, 500, 20000)) +
  ylab("Average Order Value by shop") +
  xlab("")+
  theme_bw()
## Warning: Use of `aov_pershop$aov_per_shop` is discouraged. Use `aov_per_shop`
## instead.