# Setup - Install packages manually before knitting
# install.packages(c("readxl", "tidyverse", "writexl", "lubridate"))

# Load required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(writexl)
library(lubridate)

# 2. Data import
# ./, ../, 
bikes_tbl <- read_excel("./bikes.xlsx") # fast key: alt+-
bikeshops_tbl <- read_excel("./bikeshops.xlsx")
orderlines_tbl <- read_excel("./orderlines.xlsx")
## New names:
## • `` -> `...1`
# Examine data:
bikes_tbl
## # A tibble: 97 × 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # ℹ 87 more rows
head(bikes_tbl)
## # A tibble: 6 × 4
##   bike.id model                          description                price
##     <dbl> <chr>                          <chr>                      <dbl>
## 1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
## 2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
## 3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
## 4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
## 5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
## 6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
# Import csv file:
bike_orderlines_tbl <- read_csv("./bike_orderlines.csv")
## Rows: 15644 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): model, category_1, category_2, frame_material, bikeshop_name, city...
## dbl  (5): order_id, order_line, quantity, price, total_price
## dttm (1): order_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Joining data: 
orderlines_bikes_tbl <- left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id"))

bike_orderlines_bikeshops_joined <- left_join(orderlines_bikes_tbl, bikeshops_tbl, 
                                               by = c('customer.id' = 'bikeshop.id'))


# %>% is called pipe: fast key: ctl + shift + m 
bike_orderlines_bikeshops_joined <- left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id")) %>% 
                                    left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))

# Wrangling data: decompose description into three columns: category.1, category.2 and frame.material
bike_orderlines_wrangled_tbl <- bike_orderlines_bikeshops_joined %>% 
                                  separate(description, 
                                           into = c('category.1', 'category.2', 'frame.material'), 
                                           sep  = ' - ') %>% 
                                  separate(location, 
                                           into = c('city', 'state'), 
                                           sep  = ', ',
                                           remove = FALSE) %>% 
# create calculated columns 
                                  mutate(total.price = price * quantity) %>% 
# Reorganize columns                                  
                                  select(-...1, -location) %>% 
# Reorder columns                                  
                                  select(contains('date'), contains('id'), 
                                         contains('order'), 
                                         quantity, price, total.price, 
                                         everything()) %>% 
# Rename columns
                                  rename(order_date = order.date) %>%
                                  set_names(names(.) %>% str_replace_all("\\.", "_"))
# save the file as RDS
saveRDS(bike_orderlines_wrangled_tbl, './bike_orderlines.rds')

# dplyr and tidyr
# pull() vs. select()
bike_orderlines_wrangled_tbl %>% 
            # select(total_price)
              pull(total_price) %>% 
              mean()
## [1] 4540.548
# select_if
bike_orderlines_wrangled_tbl %>% 
          # select_if(is.character)
            select_if(is.numeric)
## # A tibble: 15,644 × 7
##    order_id customer_id product_id order_line quantity price total_price
##       <dbl>       <dbl>      <dbl>      <dbl>    <dbl> <dbl>       <dbl>
##  1        1           2         48          1        1  6070        6070
##  2        1           2         52          2        1  5970        5970
##  3        2          10         76          1        1  2770        2770
##  4        2          10         52          2        1  5970        5970
##  5        3           6          2          1        1 10660       10660
##  6        3           6         50          2        1  3200        3200
##  7        3           6          1          3        1 12790       12790
##  8        3           6          4          4        1  5330        5330
##  9        3           6         34          5        1  1570        1570
## 10        4          22         26          1        1  4800        4800
## # ℹ 15,634 more rows
# arrange() and desc()
bikes_tbl %>% 
  select(model, price) %>% 
  arrange(desc(price))
## # A tibble: 97 × 2
##    model                          price
##    <chr>                          <dbl>
##  1 Supersix Evo Black Inc.        12790
##  2 Scalpel-Si Black Inc.          12790
##  3 Habit Hi-Mod Black Inc.        12250
##  4 F-Si Black Inc.                11190
##  5 Supersix Evo Hi-Mod Team       10660
##  6 Synapse Hi-Mod Disc Black Inc.  9590
##  7 Scalpel-Si Race                 9060
##  8 F-Si Hi-Mod Team                9060
##  9 Trigger Carbon 1                8200
## 10 Supersix Evo Hi-Mod Dura Ace 1  7990
## # ℹ 87 more rows
# filter()  
bikes_tbl %>% 
  select(model, price) %>% 
  filter(price > mean(price))
## # A tibble: 35 × 2
##    model                          price
##    <chr>                          <dbl>
##  1 Supersix Evo Black Inc.        12790
##  2 Supersix Evo Hi-Mod Team       10660
##  3 Supersix Evo Hi-Mod Dura Ace 1  7990
##  4 Supersix Evo Hi-Mod Dura Ace 2  5330
##  5 Supersix Evo Hi-Mod Utegra      4260
##  6 CAAD12 Black Inc                5860
##  7 CAAD12 Disc Dura Ace            4260
##  8 Synapse Hi-Mod Disc Black Inc.  9590
##  9 Synapse Hi-Mod Disc Red         7460
## 10 Synapse Hi-Mod Dura Ace         5860
## # ℹ 25 more rows
bikes_tbl %>% 
  select(model, price) %>% 
  filter((price > 5000) & (price < 10000)) %>%    
  arrange(desc(price))
## # A tibble: 22 × 2
##    model                          price
##    <chr>                          <dbl>
##  1 Synapse Hi-Mod Disc Black Inc.  9590
##  2 Scalpel-Si Race                 9060
##  3 F-Si Hi-Mod Team                9060
##  4 Trigger Carbon 1                8200
##  5 Supersix Evo Hi-Mod Dura Ace 1  7990
##  6 Jekyll Carbon 1                 7990
##  7 Synapse Hi-Mod Disc Red         7460
##  8 Scalpel-Si Hi-Mod 1             7460
##  9 Habit Carbon 1                  7460
## 10 Slice Hi-Mod Black Inc.         7000
## # ℹ 12 more rows
bikes_tbl %>% 
  select(model, price) %>% 
  filter(price > 6000, 
         model %>% str_detect("Supersix"))
## # A tibble: 3 × 2
##   model                          price
##   <chr>                          <dbl>
## 1 Supersix Evo Black Inc.        12790
## 2 Supersix Evo Hi-Mod Team       10660
## 3 Supersix Evo Hi-Mod Dura Ace 1  7990
# Filtering one or more conditions using == and %in%
bike_orderlines_wrangled_tbl %>% 
  filter(category_2 %in% c("Over Mountain", "Trail", "Endurance Road")) %>% 
  View()

# slice()
bikes_tbl %>% 
  arrange(desc(price)) %>% 
  # slice(1:5)
  slice((nrow(.)-4):nrow(.))
## # A tibble: 5 × 4
##   bike.id model      description                 price
##     <dbl> <chr>      <chr>                       <dbl>
## 1      93 Trail 5    Mountain - Sport - Aluminum   815
## 2      94 Catalyst 1 Mountain - Sport - Aluminum   705
## 3      95 Catalyst 2 Mountain - Sport - Aluminum   585
## 4      96 Catalyst 3 Mountain - Sport - Aluminum   480
## 5      97 Catalyst 4 Mountain - Sport - Aluminum   415
# distinct(): extract unique values from data
bike_orderlines_wrangled_tbl %>% 
     distinct(category_1, category_2) %>% 
     View()
  
# mutate(): add new columns in our data
bike_orderlines_wrangled_tbl %>% 
  mutate(total_price_log = log(total_price)) %>% 
  mutate(total_price_sqrt = total_price^0.5) %>% 
  View()

# Binning with ntile()
bike_orderlines_wrangled_tbl %>% 
  mutate(total_price_binned = ntile(total_price, 3)) %>% 
  View()
  
# case_when(): provide flexible conditions for grouping (binning)
bike_orderlines_wrangled_tbl %>% 
  mutate(total_price_binned = ntile(total_price, 3)) %>% 
  mutate(total_price_binned2 = case_when(
                                         total_price > quantile(total_price, 0.75) ~ "High",
                                         total_price > quantile(total_price, 0.25) ~ "Medium", 
                                         TRUE ~ "Low"
  )) %>% 
  View()

# Grouping and summarizing with group_by() and summarize()
bike_orderlines_wrangled_tbl %>% 
  summarise(revenue = sum(total_price))
## # A tibble: 1 × 1
##    revenue
##      <dbl>
## 1 71032330
bike_orderlines_wrangled_tbl %>% 
  group_by(category_1) %>% 
  summarise(revenue = sum(total_price)) %>% 
  ungroup() %>% 
  arrange(desc(revenue))
## # A tibble: 2 × 2
##   category_1  revenue
##   <chr>         <dbl>
## 1 Mountain   39154735
## 2 Road       31877595
# 
bike_orderlines_wrangled_tbl %>%  
  group_by(category_1, category_2, frame_material) %>% 
  summarise(revenue = sum(total_price)) %>% 
  ungroup() %>% 
  arrange(desc(revenue))
## `summarise()` has grouped output by 'category_1', 'category_2'. You can
## override using the `.groups` argument.
## # A tibble: 13 × 4
##    category_1 category_2         frame_material  revenue
##    <chr>      <chr>              <chr>             <dbl>
##  1 Mountain   Cross Country Race Carbon         15906070
##  2 Road       Elite Road         Carbon          9696870
##  3 Road       Endurance Road     Carbon          8768610
##  4 Mountain   Over Mountain      Carbon          7571270
##  5 Road       Elite Road         Aluminum        5637795
##  6 Mountain   Trail              Carbon          4835850
##  7 Mountain   Trail              Aluminum        4537610
##  8 Road       Triathalon         Carbon          4053750
##  9 Mountain   Cross Country Race Aluminum        3318560
## 10 Road       Cyclocross         Carbon          2108120
## 11 Mountain   Sport              Aluminum        1932755
## 12 Road       Endurance Road     Aluminum        1612450
## 13 Mountain   Fat Bike           Aluminum        1052620
# summarize_all()


# Q1: What are the unique categories of products? 
bike_orderlines_wrangled_tbl %>% distinct(category_1)
## # A tibble: 2 × 1
##   category_1
##   <chr>     
## 1 Mountain  
## 2 Road
bike_orderlines_wrangled_tbl %>% distinct(category_2)
## # A tibble: 9 × 1
##   category_2        
##   <chr>             
## 1 Over Mountain     
## 2 Trail             
## 3 Elite Road        
## 4 Endurance Road    
## 5 Sport             
## 6 Cross Country Race
## 7 Cyclocross        
## 8 Triathalon        
## 9 Fat Bike
bike_orderlines_wrangled_tbl %>% distinct(frame_material)
## # A tibble: 2 × 1
##   frame_material
##   <chr>         
## 1 Carbon        
## 2 Aluminum
# Q2: Which product categories have the largest sales? 
# category_1
bike_orderlines_wrangled_tbl %>% 
                             select(category_1, total_price) %>% 
                             group_by(category_1) %>% 
                             summarise(sales = sum(total_price)) %>%
                             ungroup() %>% 
                             rename(`Primary Category` = category_1, 
                                     Sales = sales) %>% 
                             # format dollars
                             mutate(Sales1 = Sales %>% scales::dollar())
## # A tibble: 2 × 3
##   `Primary Category`    Sales Sales1     
##   <chr>                 <dbl> <chr>      
## 1 Mountain           39154735 $39,154,735
## 2 Road               31877595 $31,877,595
# Generate total sales by year
# check the data structure
str(bike_orderlines_wrangled_tbl)
## tibble [15,644 × 15] (S3: tbl_df/tbl/data.frame)
##  $ order_date    : POSIXct[1:15644], format: "2011-01-07" "2011-01-07" ...
##  $ order_id      : num [1:15644] 1 1 2 2 3 3 3 3 3 4 ...
##  $ customer_id   : num [1:15644] 2 2 10 10 6 6 6 6 6 22 ...
##  $ product_id    : num [1:15644] 48 52 76 52 2 50 1 4 34 26 ...
##  $ order_line    : num [1:15644] 1 2 1 2 1 2 3 4 5 1 ...
##  $ quantity      : num [1:15644] 1 1 1 1 1 1 1 1 1 1 ...
##  $ price         : num [1:15644] 6070 5970 2770 5970 10660 ...
##  $ total_price   : num [1:15644] 6070 5970 2770 5970 10660 ...
##  $ model         : chr [1:15644] "Jekyll Carbon 2" "Trigger Carbon 2" "Beast of the East 1" "Trigger Carbon 2" ...
##  $ category_1    : chr [1:15644] "Mountain" "Mountain" "Mountain" "Mountain" ...
##  $ category_2    : chr [1:15644] "Over Mountain" "Over Mountain" "Trail" "Over Mountain" ...
##  $ frame_material: chr [1:15644] "Carbon" "Carbon" "Aluminum" "Carbon" ...
##  $ bikeshop_name : chr [1:15644] "Ithaca Mountain Climbers" "Ithaca Mountain Climbers" "Kansas City 29ers" "Kansas City 29ers" ...
##  $ city          : chr [1:15644] "Ithaca" "Ithaca" "Kansas City" "Kansas City" ...
##  $ state         : chr [1:15644] "NY" "NY" "KS" "KS" ...
bike_sales_y <- bike_orderlines_wrangled_tbl %>%
                select(order_date, total_price) %>% 
                # change order_date into ymd format
                mutate(order_date = ymd(order_date)) %>% 
                mutate(year = year(order_date)) %>% 
                # group by year
                group_by(year) %>% 
                summarise(sales = sum(total_price)) %>% 
                ungroup()

# Compute total sales by month and year
bike_orderlines_wrangled_tbl %>% 
     select(order_date, total_price) %>% 
     mutate(order_date = ymd(order_date)) %>% 
     mutate(year  = year(order_date),
            month = month(order_date, label = TRUE)) %>% 
     group_by(year, month) %>% 
     summarise(sales = sum(total_price)) %>% 
     ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 60 × 3
##     year month   sales
##    <dbl> <ord>   <dbl>
##  1  2011 Jan    483015
##  2  2011 Feb   1162075
##  3  2011 Mar    659975
##  4  2011 Apr   1827140
##  5  2011 May    844170
##  6  2011 Jun   1413445
##  7  2011 Jul   1194430
##  8  2011 Aug    679790
##  9  2011 Sep    814720
## 10  2011 Oct    734920
## # ℹ 50 more rows
# Time series aggregation: floor_date
# floor_date() is to reduce a date to the nearest unit. 
# similar to ceiling_date()
bike_sales_m <- bike_orderlines_wrangled_tbl %>% 
                select(order_date, total_price) %>% 
                mutate(order_date = ymd(order_date)) %>% 
                mutate(year_month = floor_date(order_date, unit = "month")) %>%
                group_by(year_month) %>% 
                summarise(sales = sum(total_price))

# Measure the change such as annual change in numbers or percentage
# dplyr::lag()
# lag is to align past observations with future observations
bike_sales_y %>% 
    mutate(sales_lag_1 = lag(sales, n = 1)) %>% 
# How to handle the NA values? 
# Replace NA with sales in year 2011
    mutate(sales_lag_1 = case_when(
              is.na(sales_lag_1) ~ sales,
              TRUE ~ sales_lag_1
    )) %>% 
    # calculate difference and percentage change by year
    mutate(diff_1 = sales - sales_lag_1) %>% 
    mutate(pct_diff_1 = diff_1 / sales_lag_1) %>% 
    mutate(pct_diff_1_chr = scales::percent(pct_diff_1))
## # A tibble: 5 × 6
##    year    sales sales_lag_1   diff_1 pct_diff_1 pct_diff_1_chr
##   <dbl>    <dbl>       <dbl>    <dbl>      <dbl> <chr>         
## 1  2011 11292885    11292885        0     0      0.0%          
## 2  2012 12163075    11292885   870190     0.0771 7.7%          
## 3  2013 16480775    12163075  4317700     0.355  35.5%         
## 4  2014 13924085    16480775 -2556690    -0.155  -15.5%        
## 5  2015 17171510    13924085  3247425     0.233  23.3%
# Calculate monthly change
# Try to write a function to do the same job again!
calculate_pct_diff <- function(data){
  
  data %>% 
    mutate(sales_lag_1 = lag(sales, n = 1)) %>% 
    # How to handle the NA values? 
    # Replace NA with sales in year 2011
    mutate(sales_lag_1 = case_when(
      is.na(sales_lag_1) ~ sales,
      TRUE ~ sales_lag_1
    )) %>% 
    # calculate difference and percentage change by year
    mutate(diff_1 = sales - sales_lag_1) %>% 
    mutate(pct_diff_1 = diff_1 / sales_lag_1) %>% 
    mutate(pct_diff_1_chr = scales::percent(pct_diff_1))

}
  
calculate_pct_diff(bike_sales_m)
## # A tibble: 60 × 6
##    year_month   sales sales_lag_1  diff_1 pct_diff_1 pct_diff_1_chr
##    <date>       <dbl>       <dbl>   <dbl>      <dbl> <chr>         
##  1 2011-01-01  483015      483015       0     0      0.000%        
##  2 2011-02-01 1162075      483015  679060     1.41   140.588%      
##  3 2011-03-01  659975     1162075 -502100    -0.432  -43.207%      
##  4 2011-04-01 1827140      659975 1167165     1.77   176.850%      
##  5 2011-05-01  844170     1827140 -982970    -0.538  -53.798%      
##  6 2011-06-01 1413445      844170  569275     0.674  67.436%       
##  7 2011-07-01 1194430     1413445 -219015    -0.155  -15.495%      
##  8 2011-08-01  679790     1194430 -514640    -0.431  -43.087%      
##  9 2011-09-01  814720      679790  134930     0.198  19.849%       
## 10 2011-10-01  734920      814720  -79800    -0.0979 -9.795%       
## # ℹ 50 more rows
# dplyr::first()
bike_sales_y %>% 
   mutate(sales_2011 = first(sales))
## # A tibble: 5 × 3
##    year    sales sales_2011
##   <dbl>    <dbl>      <dbl>
## 1  2011 11292885   11292885
## 2  2012 12163075   11292885
## 3  2013 16480775   11292885
## 4  2014 13924085   11292885
## 5  2015 17171510   11292885
# dplyr::cumsum()
bike_sales_y %>% 
   mutate(cumulative_sales = cumsum(sales))
## # A tibble: 5 × 3
##    year    sales cumulative_sales
##   <dbl>    <dbl>            <dbl>
## 1  2011 11292885         11292885
## 2  2012 12163075         23455960
## 3  2013 16480775         39936735
## 4  2014 13924085         53860820
## 5  2015 17171510         71032330
# library(ggplot2)
# Try to generate a plot of total sales by year
bike_sales_y %>% 
          # prepare a clean canvas
          ggplot(aes(x = year, y = sales, color = sales)) +
          geom_point(size = 5) +
          geom_line(linewidth = 2) +
          geom_smooth(method = "lm", formula = 'y ~ x', se = FALSE) +
          # formatting y axis to numbers
          # change y axis starting from zero
          expand_limits(y = c(0, 20e6)) +
          scale_colour_continuous(low = "red", high = "black",
                                  labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
          scale_y_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
          labs(
            title = "Revenue",
            subtitle = "Sales are trending up and to the right!",
            x = "year",
            y = "Sales (Millions)",
            color = "Rev ($M)",
            caption = "Total sales from 2011 to 2015"
          )
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# Bar plot
# Generate total sales by category 2
revenue_by_category2_tbl <- bike_orderlines_wrangled_tbl %>% 
                          select(category_2, total_price) %>% 
                          group_by(category_2) %>% 
                          summarise(revenue = sum(total_price)) %>% 
                          ungroup()
# Bar plot
revenue_by_category2_tbl %>%
        mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>%
        ggplot(aes(x = category_2, y = revenue)) +
        geom_col(fill = "pink") +
        coord_flip()