library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(writexl)

# 2. Data import
# ./, ../, 
bikes_tbl <- read_excel("./bikes.xlsx") # fast key: alt+-
bikeshops_tbl <- read_excel("./bikeshops.xlsx")
orderlines_tbl <- read_excel("./orderlines.xlsx")
## New names:
## • `` -> `...1`
# Examine data:
bikes_tbl
## # A tibble: 97 × 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # ℹ 87 more rows
head(bikes_tbl)
## # A tibble: 6 × 4
##   bike.id model                          description                price
##     <dbl> <chr>                          <chr>                      <dbl>
## 1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
## 2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
## 3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
## 4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
## 5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
## 6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
# Import csv file:
bike_orderlines_tbl <- read_csv("./bike_orderlines.csv")
## Rows: 15644 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): model, category_1, category_2, frame_material, bikeshop_name, city...
## dbl  (5): order_id, order_line, quantity, price, total_price
## dttm (1): order_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Joining data: 
orderlines_bikes_tbl <- left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id"))

bike_orderlines_bikeshops_joined <- left_join(orderlines_bikes_tbl, bikeshops_tbl, 
                                               by = c('customer.id' = 'bikeshop.id'))


# %>% is called pipe: fast key: ctl + shift + m 
bike_orderlines_bikeshops_joined <- left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id")) %>% 
                                    left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))

# Wrangling data: decompose description into three columns: category.1, category.2 and frame.material
bike_orderlines_wrangled_tbl <- bike_orderlines_bikeshops_joined %>% 
                                  separate(description, 
                                           into = c('category.1', 'category.2', 'frame.material'), 
                                           sep  = ' - ') %>% 
                                  separate(location, 
                                           into = c('city', 'state'), 
                                           sep  = ', ',
                                           remove = FALSE) %>% 
# create calculated columns 
                                  mutate(total.price = price * quantity) %>% 
# Reorganize columns                                  
                                  select(-...1, -location) %>% 
# Reorder columns                                  
                                  select(contains('date'), contains('id'), 
                                         contains('order'), 
                                         quantity, price, total.price, 
                                         everything()) %>% 
# Rename columns
                                  rename(order_date = order.date) %>%
                                  set_names(names(.) %>% str_replace_all("\\.", "_"))
# save the file as RDS
saveRDS(bike_orderlines_wrangled_tbl, './bike_orderlines.rds')

# dplyr and tidyr
# pull() vs. select()
bike_orderlines_wrangled_tbl %>% 
            # select(total_price)
              pull(total_price) %>% 
              mean()
## [1] 4540.548
# select_if
bike_orderlines_wrangled_tbl %>% 
          # select_if(is.character)
            select_if(is.numeric)
## # A tibble: 15,644 × 7
##    order_id customer_id product_id order_line quantity price total_price
##       <dbl>       <dbl>      <dbl>      <dbl>    <dbl> <dbl>       <dbl>
##  1        1           2         48          1        1  6070        6070
##  2        1           2         52          2        1  5970        5970
##  3        2          10         76          1        1  2770        2770
##  4        2          10         52          2        1  5970        5970
##  5        3           6          2          1        1 10660       10660
##  6        3           6         50          2        1  3200        3200
##  7        3           6          1          3        1 12790       12790
##  8        3           6          4          4        1  5330        5330
##  9        3           6         34          5        1  1570        1570
## 10        4          22         26          1        1  4800        4800
## # ℹ 15,634 more rows
# arrange() and desc()
bikes_tbl %>% 
  select(model, price) %>% 
  arrange(desc(price))
## # A tibble: 97 × 2
##    model                          price
##    <chr>                          <dbl>
##  1 Supersix Evo Black Inc.        12790
##  2 Scalpel-Si Black Inc.          12790
##  3 Habit Hi-Mod Black Inc.        12250
##  4 F-Si Black Inc.                11190
##  5 Supersix Evo Hi-Mod Team       10660
##  6 Synapse Hi-Mod Disc Black Inc.  9590
##  7 Scalpel-Si Race                 9060
##  8 F-Si Hi-Mod Team                9060
##  9 Trigger Carbon 1                8200
## 10 Supersix Evo Hi-Mod Dura Ace 1  7990
## # ℹ 87 more rows
# filter()  
bikes_tbl %>% 
  select(model, price) %>% 
  filter(price > mean(price))
## # A tibble: 35 × 2
##    model                          price
##    <chr>                          <dbl>
##  1 Supersix Evo Black Inc.        12790
##  2 Supersix Evo Hi-Mod Team       10660
##  3 Supersix Evo Hi-Mod Dura Ace 1  7990
##  4 Supersix Evo Hi-Mod Dura Ace 2  5330
##  5 Supersix Evo Hi-Mod Utegra      4260
##  6 CAAD12 Black Inc                5860
##  7 CAAD12 Disc Dura Ace            4260
##  8 Synapse Hi-Mod Disc Black Inc.  9590
##  9 Synapse Hi-Mod Disc Red         7460
## 10 Synapse Hi-Mod Dura Ace         5860
## # ℹ 25 more rows
bikes_tbl %>% 
  select(model, price) %>% 
  filter((price > 5000) & (price < 10000)) %>%    
  arrange(desc(price))
## # A tibble: 22 × 2
##    model                          price
##    <chr>                          <dbl>
##  1 Synapse Hi-Mod Disc Black Inc.  9590
##  2 Scalpel-Si Race                 9060
##  3 F-Si Hi-Mod Team                9060
##  4 Trigger Carbon 1                8200
##  5 Supersix Evo Hi-Mod Dura Ace 1  7990
##  6 Jekyll Carbon 1                 7990
##  7 Synapse Hi-Mod Disc Red         7460
##  8 Scalpel-Si Hi-Mod 1             7460
##  9 Habit Carbon 1                  7460
## 10 Slice Hi-Mod Black Inc.         7000
## # ℹ 12 more rows
bikes_tbl %>% 
  select(model, price) %>% 
  filter(price > 6000, 
         model %>% str_detect("Supersix"))
## # A tibble: 3 × 2
##   model                          price
##   <chr>                          <dbl>
## 1 Supersix Evo Black Inc.        12790
## 2 Supersix Evo Hi-Mod Team       10660
## 3 Supersix Evo Hi-Mod Dura Ace 1  7990
# Filtering one or more conditions using == and %in%
bike_orderlines_wrangled_tbl %>% 
  filter(category_2 %in% c("Over Mountain", "Trail", "Endurance Road")) %>% 
  View()

# slice()
bikes_tbl %>% 
  arrange(desc(price)) %>% 
  # slice(1:5)
  slice((nrow(.)-4):nrow(.))
## # A tibble: 5 × 4
##   bike.id model      description                 price
##     <dbl> <chr>      <chr>                       <dbl>
## 1      93 Trail 5    Mountain - Sport - Aluminum   815
## 2      94 Catalyst 1 Mountain - Sport - Aluminum   705
## 3      95 Catalyst 2 Mountain - Sport - Aluminum   585
## 4      96 Catalyst 3 Mountain - Sport - Aluminum   480
## 5      97 Catalyst 4 Mountain - Sport - Aluminum   415
# distinct(): extract unique values from data
bike_orderlines_wrangled_tbl %>% 
     distinct(category_1, category_2) %>% 
     View()
  
# mutate(): add new columns in our data
bike_orderlines_wrangled_tbl %>% 
  mutate(total_price_log = log(total_price)) %>% 
  mutate(total_price_sqrt = total_price^0.5) %>% 
  View()

# Binning with ntile()
bike_orderlines_wrangled_tbl %>% 
  mutate(total_price_binned = ntile(total_price, 3)) %>% 
  View()
  
# case_when(): provide flexible conditions for grouping (binning)
bike_orderlines_wrangled_tbl %>% 
  mutate(total_price_binned = ntile(total_price, 3)) %>% 
  mutate(total_price_binned2 = case_when(
                                         total_price > quantile(total_price, 0.75) ~ "High",
                                         total_price > quantile(total_price, 0.25) ~ "Medium", 
                                         TRUE ~ "Low"
  )) %>% 
  View()

# Grouping and summarizing with group_by() and summarize()
bike_orderlines_wrangled_tbl %>% 
  summarise(revenue = sum(total_price))
## # A tibble: 1 × 1
##    revenue
##      <dbl>
## 1 71032330
bike_orderlines_wrangled_tbl %>% 
  group_by(category_1) %>% 
  summarise(revenue = sum(total_price)) %>% 
  ungroup() %>% 
  arrange(desc(revenue))
## # A tibble: 2 × 2
##   category_1  revenue
##   <chr>         <dbl>
## 1 Mountain   39154735
## 2 Road       31877595
# 
bike_orderlines_wrangled_tbl %>%  
  group_by(category_1, category_2, frame_material) %>% 
  summarise(revenue = sum(total_price)) %>% 
  ungroup() %>% 
  arrange(desc(revenue))
## `summarise()` has grouped output by 'category_1', 'category_2'. You can
## override using the `.groups` argument.
## # A tibble: 13 × 4
##    category_1 category_2         frame_material  revenue
##    <chr>      <chr>              <chr>             <dbl>
##  1 Mountain   Cross Country Race Carbon         15906070
##  2 Road       Elite Road         Carbon          9696870
##  3 Road       Endurance Road     Carbon          8768610
##  4 Mountain   Over Mountain      Carbon          7571270
##  5 Road       Elite Road         Aluminum        5637795
##  6 Mountain   Trail              Carbon          4835850
##  7 Mountain   Trail              Aluminum        4537610
##  8 Road       Triathalon         Carbon          4053750
##  9 Mountain   Cross Country Race Aluminum        3318560
## 10 Road       Cyclocross         Carbon          2108120
## 11 Mountain   Sport              Aluminum        1932755
## 12 Road       Endurance Road     Aluminum        1612450
## 13 Mountain   Fat Bike           Aluminum        1052620
# summarize_all()

# Q1: What are the unique categories of products?
bike_orderlines_wrangled_tbl %>% distinct(category_1)
## # A tibble: 2 × 1
##   category_1
##   <chr>     
## 1 Mountain  
## 2 Road
bike_orderlines_wrangled_tbl %>% distinct(category_2)
## # A tibble: 9 × 1
##   category_2        
##   <chr>             
## 1 Over Mountain     
## 2 Trail             
## 3 Elite Road        
## 4 Endurance Road    
## 5 Sport             
## 6 Cross Country Race
## 7 Cyclocross        
## 8 Triathalon        
## 9 Fat Bike
bike_orderlines_wrangled_tbl %>% distinct(frame_material)
## # A tibble: 2 × 1
##   frame_material
##   <chr>         
## 1 Carbon        
## 2 Aluminum
# Q2: Which product categories have the largest sales?
# category_1
bike_orderlines_wrangled_tbl %>% 
  select(category_1, total_price) %>% 
  group_by(category_1) %>% 
  summarise(sales = sum(total_price)) %>% 
  ungroup() %>% 
  rename(Sales = sales) %>% 
  # format dollars
  mutate(Sales1 = Sales %>% scales::dollar())
## # A tibble: 2 × 3
##   category_1    Sales Sales1     
##   <chr>         <dbl> <chr>      
## 1 Mountain   39154735 $39,154,735
## 2 Road       31877595 $31,877,595
# Q3: Renaming the columns
bike_orderlines_wrangled_tbl <- bike_orderlines_wrangled_tbl %>%
  rename('Prime Category' = category_1,
         'Secondary Category' = category_2,
         'Frame Material' = frame_material
  )

bike_orderlines_wrangled_tbl %>% distinct('Prime Category')
## # A tibble: 1 × 1
##   `"Prime Category"`
##   <chr>             
## 1 Prime Category
bike_orderlines_wrangled_tbl %>% distinct('Secondary Category')
## # A tibble: 1 × 1
##   `"Secondary Category"`
##   <chr>                 
## 1 Secondary Category
bike_orderlines_wrangled_tbl %>% distinct('Frame Material')
## # A tibble: 1 × 1
##   `"Frame Material"`
##   <chr>             
## 1 Frame Material
# Q4: Group
category_sales_tbl <- bike_orderlines_wrangled_tbl %>%
  group_by(`Prime Category`, `Secondary Category`, `Frame Material`) %>%
  summarise(Sales = sum(total_price), .groups = "drop") %>%
  mutate(Sales = scales::dollar(Sales))

print(category_sales_tbl)
## # A tibble: 13 × 4
##    `Prime Category` `Secondary Category` `Frame Material` Sales      
##    <chr>            <chr>                <chr>            <chr>      
##  1 Mountain         Cross Country Race   Aluminum         $3,318,560 
##  2 Mountain         Cross Country Race   Carbon           $15,906,070
##  3 Mountain         Fat Bike             Aluminum         $1,052,620 
##  4 Mountain         Over Mountain        Carbon           $7,571,270 
##  5 Mountain         Sport                Aluminum         $1,932,755 
##  6 Mountain         Trail                Aluminum         $4,537,610 
##  7 Mountain         Trail                Carbon           $4,835,850 
##  8 Road             Cyclocross           Carbon           $2,108,120 
##  9 Road             Elite Road           Aluminum         $5,637,795 
## 10 Road             Elite Road           Carbon           $9,696,870 
## 11 Road             Endurance Road       Aluminum         $1,612,450 
## 12 Road             Endurance Road       Carbon           $8,768,610 
## 13 Road             Triathalon           Carbon           $4,053,750