Load libraries and Import dataset

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readxl)
bike_orderlines <- read_excel("bike_orderlines.xlsx")

1.Fix typos found in Feature Engineering. #Fix errors

bike_orderlines <- bike_orderlines %>%
  mutate(model = case_when(
    model == "CAAD Disc Ultegra" ~ "CAAD12 Disc Ultegra",
    model == "Syapse Carbon Tiagra" ~ "Synapse Carbon Tiagra",
    model == "Supersix Evo Hi-Mod Utegra" ~ "Supersix Evo Hi-Mod Ultegra",
    TRUE ~ model
  ))
glimpse(bike_orderlines)
## Rows: 15,644
## Columns: 13
## $ order_date     <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2011-0…
## $ order_id       <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7…
## $ order_line     <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3, 4, 1…
## $ quantity       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1…
## $ price          <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330, 1570,…
## $ total_price    <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330, 1570,…
## $ model          <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast of the Ea…
## $ category_1     <chr> "Mountain", "Mountain", "Mountain", "Mountain", "Road",…
## $ category_2     <chr> "Over Mountain", "Over Mountain", "Trail", "Over Mounta…
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carbon", "Ca…
## $ bikeshop_name  <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Climbers",…
## $ city           <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City", "Loui…
## $ state          <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY", "KY", "…

2.Which month has the highest bike sales? #sales by month

bike_orderlines %>%
  select(order_date, total_price) %>%
  mutate(order_date = ymd(order_date)) %>%
  mutate(month = month(order_date, abbr = FALSE, label = TRUE)) %>%
  group_by(month) %>%
  summarize(sales = sum(total_price)) %>%
  ungroup() %>%
  mutate(month = month %>% as_factor(),
         sales = sales %>% scales::dollar(),
         month = month %>% str_to_title())
## # A tibble: 12 × 2
##    month     sales     
##    <chr>     <chr>     
##  1 January   $4,089,460
##  2 February  $5,343,295
##  3 March     $7,282,280
##  4 April     $8,386,170
##  5 May       $7,935,055
##  6 June      $7,813,105
##  7 July      $7,602,005
##  8 August    $5,346,125
##  9 September $5,556,055
## 10 October   $4,394,300
## 11 November  $4,169,755
## 12 December  $3,114,725

3.Median Orderline #Evaluate “Black Inc”.

bike_orderlines %>%
  select(model, total_price) %>%
  mutate(black = model %>% str_to_lower() %>% str_detect("black inc")) %>%
  group_by(black) %>%
  summarize(median = median(total_price)) %>%
  ungroup() %>%
  mutate(median = median %>% scales::dollar()) %>%
  rename(`Black Inc` = black,
         `Median Orderline` = median) 
## # A tibble: 2 × 2
##   `Black Inc` `Median Orderline`
##   <lgl>       <chr>             
## 1 FALSE       $2,880            
## 2 TRUE        $12,250

#Evaluate “Ultegra”.

bike_orderlines %>%
  select(model, total_price) %>%
  mutate(ultegra = model %>% str_to_lower() %>% str_detect("ultegra")) %>%
  group_by(ultegra) %>%
  summarize(median = median(total_price)) %>%
  ungroup() %>%
  mutate(median = median %>% scales::dollar()) %>%
  rename(`Ultegra` = ultegra,
         `Median Orderline` = median) 
## # A tibble: 2 × 2
##   Ultegra `Median Orderline`
##   <lgl>   <chr>             
## 1 FALSE   $3,200            
## 2 TRUE    $3,200

#Evaluate “Disc” option.

bike_orderlines %>%
  select(model, total_price) %>%
  mutate(disc = model %>% str_to_lower() %>% str_detect("disc")) %>%
  group_by(disc) %>%
  summarize(median = median(total_price)) %>%
  ungroup() %>%
  mutate(median = median %>% scales::dollar()) %>%
  rename(`Disc` = disc,
         `Median Orderline` = median) 
## # A tibble: 2 × 2
##   Disc  `Median Orderline`
##   <lgl> <chr>             
## 1 FALSE $3,200            
## 2 TRUE  $2,660

4.What are the average, min, and max prices by Base Model? ## ‘summarise()‘ has grouped output by ’category_1’, ’category_2’.

bike_orderlines %>%
  select(model, category_1, category_2, price) %>%
  separate(model, 
           into = str_c("model_", 1:7),
           sep = " ", 
           remove = T,
           fill = "right",
           extra = "drop") %>%
  mutate(model_base = case_when(
    # fix - supersix evo
    str_detect(str_to_lower(model_1), "supersix") ~ str_c(model_1, model_2, sep = " "),
    # fix - beast of the east
    str_detect(str_to_lower(model_1), "beast") ~ str_c(model_1, model_2, model_3, model_4, sep = " "),
    # fix - fat CAAD
    str_detect(str_to_lower(model_1), "fat") ~ str_c(model_1, model_2, sep = " "),
    # fix - bad habit
    str_detect(str_to_lower(model_1), "bad") ~ str_c(model_1, model_2, sep = " "),
    # fix - scalpel 29
    str_detect(str_to_lower(model_2), "29") ~ str_c(model_1, model_2, sep = " "),
    # catch-all
    TRUE ~ model_1)
  ) %>%
  mutate(model_1 = model_1 %>% str_trim()) %>%
  select(category_1, category_2, model_base, price) %>%
  group_by(category_1, category_2,model_base) %>%
  summarize(mean = round(mean(price),0) %>% scales::dollar(),
            min = min(price) %>% scales::dollar(),
            max = max(price) %>% scales::dollar()) %>%
  ungroup() %>%
  arrange(desc(mean)) %>%
  rename(
    `Category 1` = category_1,
    `Category 2` = category_2,
    `Model Base` = model_base,
    `Mean Price` = mean,
    `Min Price` = min, 
    `Max Price` = max
  ) 
## `summarise()` has grouped output by 'category_1', 'category_2'. You can
## override using the `.groups` argument.
## # A tibble: 18 × 6
##    `Category 1` `Category 2`   `Model Base` `Mean Price` `Min Price` `Max Price`
##    <chr>        <chr>          <chr>        <chr>        <chr>       <chr>      
##  1 Mountain     Cross Country… Scalpel-Si   $6,695       $3,200      $12,790    
##  2 Mountain     Sport          Catalyst     $541         $415        $705       
##  3 Mountain     Over Mountain  Jekyll       $5,042       $3,200      $7,990     
##  4 Road         Elite Road     Supersix Evo $4,978       $1,840      $12,790    
##  5 Mountain     Over Mountain  Trigger      $4,970       $3,200      $8,200     
##  6 Mountain     Trail          Habit        $4,611       $1,950      $12,250    
##  7 Mountain     Cross Country… F-Si         $4,504       $1,840      $11,190    
##  8 Mountain     Cross Country… Scalpel 29   $4,499       $3,200      $6,390     
##  9 Mountain     Fat Bike       Fat CAAD1    $3,730       $3,730      $3,730     
## 10 Road         Triathalon     Slice        $3,527       $1,950      $7,000     
## 11 Road         Endurance Road Synapse      $3,080       $870        $9,590     
## 12 Mountain     Trail          Bad Habit    $2,954       $2,660      $3,200     
## 13 Road         Elite Road     CAAD12       $2,926       $1,680      $5,860     
## 14 Road         Cyclocross     SuperX       $2,339       $1,750      $3,500     
## 15 Mountain     Trail          Beast of th… $2,194       $1,620      $2,770     
## 16 Mountain     Fat Bike       Fat CAAD2    $2,130       $2,130      $2,130     
## 17 Mountain     Sport          Trail        $1,153       $815        $1,520     
## 18 Road         Elite Road     CAAD8        $1,136       $815        $1,410