Midterm_HectorRomero

Loading libraries, read the files

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'lubridate'
## 
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Fix typos found in Feature Engineering.

## Correction 1
bikes_corrected <- bike_orderlines %>% 
  mutate(model = case_when(
    model == "CAAD Disc Ultegra" ~"CAAD12 Disc Ultegra", 
    TRUE ~ model))
# Correction 2
bikes_corrected <- bikes_corrected %>% 
  mutate(model = case_when(
    model == "Syapse Carbon Tiagra" ~"Synapse Carbon Tiagra", 
    TRUE ~ model))
# Correction 3
bikes_corrected <- bikes_corrected %>% 
  mutate(model = case_when(
    model == "Supersix Evo Hi-Mod Utegra" ~"Supersix Evo Hi-Mod Ultegra", 
    TRUE ~ model))

glimpse(bikes_corrected)

## Rows: 15,644
## Columns: 13
## $ order_date     <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2011-0…
## $ order_id       <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7…
## $ order_line     <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3, 4, 1…
## $ quantity       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1…
## $ price          <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330, 1570,…
## $ total_price    <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330, 1570,…
## $ model          <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast of the Ea…
## $ category_1     <chr> "Mountain", "Mountain", "Mountain", "Mountain", "Road",…
## $ category_2     <chr> "Over Mountain", "Over Mountain", "Trail", "Over Mounta…
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carbon", "Ca…
## $ bikeshop_name  <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Climbers",…
## $ city           <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City", "Loui…
## $ state          <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY", "KY", "…

Which month has the highest bike sales? The month with the highest sales is the month of April with over 8.3 million dollars in sales. Also the month of December has the lowest amount of sales with only 3.1 million dollars in sales.

bike_sales_m_tbl <- bikes_corrected %>% 
  select(order_date, total_price) %>% 
  mutate(order_date = ymd(order_date)) %>% 
  mutate(Month = month(order_date, label = TRUE)) %>% 
  group_by(Month) %>% 
  summarize(Sales = sum(total_price)) %>%
  mutate(Sales = scales::dollar(Sales))

bike_sales_m_tbl %>%  View()

What is the median orderline sales value by Bike Attribute? Median sales by ‘Black Inc’ attribute

bike_sales_blackInc <- bikes_corrected %>%
  mutate(BlackInc = str_detect(model, "Black Inc")) %>%
  group_by(BlackInc) %>%
  summarize(MeanOrderline = mean(total_price)) %>%
  mutate(MeanOrderline = scales::dollar(MeanOrderline))

bike_sales_blackInc

## # A tibble: 2 × 2
##   BlackInc MeanOrderline
##   <lgl>    <chr>        
## 1 FALSE    $4,037.61    
## 2 TRUE     $13,154.58

#Bikes that contain the 'Black Inc' feature tend to have a higher price when compared to the median price of the other products.

## Median sales by 'Ultegra' attribute
bike_sales_ultegra <- bikes_corrected %>%
  mutate(Ultegra = str_detect(model, "Ultegra")) %>%
  group_by(Ultegra) %>%
  summarize(MeanOrderline = mean(total_price)) %>%
  mutate(MeanOrderline = scales::dollar(MeanOrderline))

bike_sales_ultegra

## # A tibble: 2 × 2
##   Ultegra MeanOrderline
##   <lgl>   <chr>        
## 1 FALSE   $4,601.17    
## 2 TRUE    $4,175.90

#  Bikes that contain the 'Ultegra' feature tend to be similarly priced to the median of the other products. 
#  Those these bikes have a slightly lower median by comparison, with about a a 425$ difference.

## Median sales by 'Disc' attribute
bike_sales_disc <- bikes_corrected %>%
  mutate(Disc = str_detect(model, "Disc")) %>%
  group_by(Disc) %>%
  summarize(MeanOrderline = mean(total_price)) %>%
  mutate(MeanOrderline = scales::dollar(MeanOrderline))

bike_sales_disc

## # A tibble: 2 × 2
##   Disc  MeanOrderline
##   <lgl> <chr>        
## 1 FALSE $4,544.44    
## 2 TRUE  $4,511.86

# Bikes that contain the 'Disc' feature tend to have a similar median when compared to the other products offered. 
# The difference between medians as almost insignificant.

What are the average, min, and max prices by Base Model?

bikes_corrected %>% select(category_1, category_2, model, total_price) %>% 
  mutate(Model_Base = str_extract(model, "^[a-zA-Z\\s-]+")) %>%
  group_by(category_1, category_2, Model_Base) %>%
  summarize(
    Mean_Price = scales::dollar(mean(total_price)),
    Max_Price = scales::dollar(max(total_price)),
    Min_Price = scales::dollar(min(total_price))
  ) %>% View()

## `summarise()` has grouped output by 'category_1', 'category_2'. You can
## override using the `.groups` argument.

#As displayed, the bikes with the highest price don't necessary tend  to the highest means.
#Also some models present a very large spread between the min and max price. For example the "Scapel" base model who has a spread of almost 60k between the min and max price.

Midterm_HectorRomero_111035121

Hector Romero 111035121

2023-11-14