checkpoint_3_forcats

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.1     √ purrr   0.3.2
## √ tibble  2.1.1     √ dplyr   0.8.1
## √ tidyr   0.8.3     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 3.5.3
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.5.3
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
## Loading required package: PerformanceAnalytics
## Warning: package 'PerformanceAnalytics' was built under R version 3.5.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Warning: package 'quantmod' was built under R version 3.5.3
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 3.5.3
## Version 0.4-0 included new data defaults. See ?getSymbols.
bike_orderlines_tbl <- read_rds("D:/hw0527/HW0527/data_wrangled_student/bike_orderlines.rds")

bike_orderlines_tbl
## # A tibble: 15,644 x 13
##    order_date          order_id order_line quantity price total_price model
##    <dttm>                 <dbl>      <dbl>    <dbl> <dbl>       <dbl> <chr>
##  1 2011-01-07 00:00:00        1          1        1  6070        6070 Jeky~
##  2 2011-01-07 00:00:00        1          2        1  5970        5970 Trig~
##  3 2011-01-10 00:00:00        2          1        1  2770        2770 Beas~
##  4 2011-01-10 00:00:00        2          2        1  5970        5970 Trig~
##  5 2011-01-10 00:00:00        3          1        1 10660       10660 Supe~
##  6 2011-01-10 00:00:00        3          2        1  3200        3200 Jeky~
##  7 2011-01-10 00:00:00        3          3        1 12790       12790 Supe~
##  8 2011-01-10 00:00:00        3          4        1  5330        5330 Supe~
##  9 2011-01-10 00:00:00        3          5        1  1570        1570 Syna~
## 10 2011-01-11 00:00:00        4          1        1  4800        4800 Syna~
## # ... with 15,634 more rows, and 6 more variables: category_1 <chr>,
## #   category_2 <chr>, frame_material <chr>, bikeshop_name <chr>,
## #   city <chr>, state <chr>
# 2.0 Motivating Example -----

# Manipulation
sales_by_cat_2_tbl <- bike_orderlines_tbl %>%
    
    select(category_2, total_price) %>%
    
    group_by(category_2) %>%
    summarize(sales = sum(total_price)) %>%
    ungroup() %>%
    
    arrange(desc(sales)) %>%
    
    mutate(category_2 = category_2 %>% as_factor() %>% fct_rev())
# Plotting
sales_by_cat_2_tbl %>%
    ggplot(aes(x = sales, y = category_2)) +
    geom_point(size = 5, color = "#2c3e50") +
    labs(title = "Sales By Category 2") +
    scale_x_continuous(labels = scales::dollar_format()) +
    theme_tq() +
    expand_limits(x = 0)

plot_sales <- function(data) {
    data %>%
        ggplot(aes(x = sales, y = category_2)) +
        geom_point(size = 5, color = "#2c3e50") +
        labs(title = "Sales By Category 2") +
        scale_x_continuous(labels = scales::dollar_format()) +
        theme_tq() +
        expand_limits(x = 0)
}

sales_by_cat_2_tbl %>%
    plot_sales()

# 3.1 Inspecting Factors ----

# Vector
sales_by_cat_2_tbl %>% pull(category_2) %>% levels()
## [1] "Fat Bike"           "Sport"              "Cyclocross"        
## [4] "Triathalon"         "Over Mountain"      "Trail"             
## [7] "Endurance Road"     "Elite Road"         "Cross Country Race"
sales_by_cat_2_tbl %>% pull(category_2) %>% as.numeric()
## [1] 9 8 7 6 5 4 3 2 1
# Tibble

sales_by_cat_2_tbl %>%
    mutate(category_2 = category_2 %>% fct_rev() %>% fct_rev()) %>%
    mutate(
        label = category_2 %>% as.character(),
        value = category_2 %>% as.numeric()
    )
## # A tibble: 9 x 4
##   category_2            sales label              value
##   <fct>                 <dbl> <chr>              <dbl>
## 1 Cross Country Race 19224630 Cross Country Race     9
## 2 Elite Road         15334665 Elite Road             8
## 3 Endurance Road     10381060 Endurance Road         7
## 4 Trail               9373460 Trail                  6
## 5 Over Mountain       7571270 Over Mountain          5
## 6 Triathalon          4053750 Triathalon             4
## 7 Cyclocross          2108120 Cyclocross             3
## 8 Sport               1932755 Sport                  2
## 9 Fat Bike            1052620 Fat Bike               1
# 3.2 Creating Factors: as_factor() vs as.factor() ----

sales_by_cat_2_tbl %>%
    mutate(
        category_2           = as.character(category_2),
        category_2_as_factor = as_factor(category_2) %>% as.numeric(),
        category_2_as.factor = as.factor(category_2) %>% as.numeric()
    )
## # A tibble: 9 x 4
##   category_2            sales category_2_as_factor category_2_as.factor
##   <chr>                 <dbl>                <dbl>                <dbl>
## 1 Cross Country Race 19224630                    1                    1
## 2 Elite Road         15334665                    2                    3
## 3 Endurance Road     10381060                    3                    4
## 4 Trail               9373460                    4                    8
## 5 Over Mountain       7571270                    5                    6
## 6 Triathalon          4053750                    6                    9
## 7 Cyclocross          2108120                    7                    2
## 8 Sport               1932755                    8                    7
## 9 Fat Bike            1052620                    9                    5
# 3.3 Reordering Factors: fct_reorder() and fct_rev() ----

sales_by_cat_2_tbl %>%
    arrange(desc(sales)) %>%
    mutate(sales_negative = -sales) %>%
    mutate(
        category_2 = category_2 %>% fct_reorder(sales_negative),
        values     = category_2 %>% as.numeric()) %>%
    
    plot_sales()

# 3.4 Time-Based Reordering: fct_reorder2() ----


sales_by_cat_2_q_tbl <- bike_orderlines_tbl %>%
    
    mutate(order_date = order_date %>% floor_date("quarter") %>% ymd()) %>%
    
    group_by(category_2, order_date) %>%
    summarise(sales = sum(total_price)) %>%
    ungroup()

sales_by_cat_2_q_tbl
## # A tibble: 180 x 3
##    category_2         order_date   sales
##    <chr>              <date>       <dbl>
##  1 Cross Country Race 2011-01-01  610060
##  2 Cross Country Race 2011-04-01 1083310
##  3 Cross Country Race 2011-07-01  609770
##  4 Cross Country Race 2011-10-01  614110
##  5 Cross Country Race 2012-01-01  731330
##  6 Cross Country Race 2012-04-01 1097010
##  7 Cross Country Race 2012-07-01 1000220
##  8 Cross Country Race 2012-10-01  532240
##  9 Cross Country Race 2013-01-01 1017470
## 10 Cross Country Race 2013-04-01 1503950
## # ... with 170 more rows
sales_by_cat_2_q_tbl %>%
    
    mutate(category_2 = category_2 %>% fct_reorder2(order_date, sales)) %>%
    
    ggplot(aes(x = order_date, y = sales, color = category_2)) +
    geom_point() +
    geom_line() +
    facet_wrap(~ category_2) +
    
    theme_tq() +
    scale_color_tq() +
    scale_y_continuous(labels = scales::dollar_format(scale = 1e-6, suffix = "M"))

# 3.5 Creating "Other" Category - fct_lump() & fct_relevel() ----

sales_by_cat_2_tbl %>%
    
    mutate(category_2 = category_2 %>% fct_lump(n = 6, 
                                                w = sales, 
                                                other_level = "All Other Bike Categories")) %>%
    
    group_by(category_2) %>%
    summarize(sales = sum(sales)) %>%
    
    mutate(category_2 = category_2 %>% fct_relevel("All Other Bike Categories", after = 0)) %>%
    
    plot_sales()

ggplot2_geometries_1

bike_orderlines_tbl <- read_rds("D:/hw0527/HW0527/data_wrangled_student/bike_orderlines.rds")

glimpse(bike_orderlines_tbl)
## Observations: 15,644
## Variables: 13
## $ order_date     <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10...
## $ order_id       <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6,...
## $ order_line     <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2,...
## $ quantity       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,...
## $ price          <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ total_price    <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ model          <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast o...
## $ category_1     <chr> "Mountain", "Mountain", "Mountain", "Mountain",...
## $ category_2     <chr> "Over Mountain", "Over Mountain", "Trail", "Ove...
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carb...
## $ bikeshop_name  <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Cl...
## $ city           <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City...
## $ state          <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY",...
# 1.0 Point / Scatter Plots ----
# - Great for Continuous vs Continuous
# - Also good for Lollipop Charts (more on this in advanced plots)

# Goal: Explain relationship between order value and quantity of bikes sold

# Data Manipulation
order_value_tbl <- bike_orderlines_tbl %>%
    
    select(order_id, order_line, total_price, quantity) %>%
    
    group_by(order_id) %>%
    summarize(
        total_quantity = sum(quantity),
        total_price    = sum(total_price)
    ) %>%
    ungroup()

# Scatter Plot
order_value_tbl %>%
    
    ggplot(aes(x = total_quantity, y = total_price)) +
    
    geom_point(alpha = 0.5, size = 2) +
    geom_smooth(method = "lm", se = FALSE)

# 2.0 Line Plots ----
# - Great for time series

# Goal: Describe revenue by Month, expose cyclic nature

# Data Manipulation
revenue_by_month_tbl <- bike_orderlines_tbl %>%
    
    select(order_date, total_price) %>%
    
    mutate(year_month = floor_date(order_date, "months") %>% ymd()) %>%
    
    group_by(year_month) %>%
    summarize(revenue = sum(total_price)) %>%
    ungroup()

# Line Plot
revenue_by_month_tbl %>%
    
    ggplot(aes(year_month, revenue)) +
    
    geom_line(size = 0.5, linetype = 1) +
    geom_smooth(method = "loess", span = 0.2)

# 3.0 Bar / Column Plots ----
# - Great for categories

# Goal: Sales by Descriptive Category

# Data Manipulation
revenue_by_category_2_tbl <- bike_orderlines_tbl %>%
    
    select(category_2, total_price) %>%
    
    group_by(category_2) %>%
    summarize(revenue = sum(total_price)) %>%
    ungroup()

# Bar Plot
revenue_by_category_2_tbl %>%
    
    mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>%
    
    ggplot(aes(category_2, revenue)) +
    
    geom_col(fill = "#2c3e50") + 
    coord_flip()

# 4.0 Histogram / Density Plots ----
# - Great for inspecting the distribution of a variable


# Goal: Unit price of bicycles
# Histogram

bike_orderlines_tbl %>%
    
    distinct(model, price) %>%
    
    ggplot(aes(price)) +
    
    geom_histogram(bins = 25, fill = "purple", color = "white")

# Goal: Unit price of bicylce, segmenting by frame material
# Histogram
bike_orderlines_tbl %>%
    
    distinct(price, model, frame_material) %>%
    
    ggplot(aes(price, fill = frame_material)) +
    
    geom_histogram() +
    
    facet_wrap(~ frame_material, ncol = 1) +
    
    scale_fill_tq() +
    theme_tq()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density

bike_orderlines_tbl %>%
    
    distinct(price, model, frame_material) %>%
    
    ggplot(aes(price, fill = frame_material)) +
    
    geom_density(alpha = 0.5) +
    # facet_wrap(~ frame_material, ncol = 1) +
    
    scale_fill_tq() +
    theme_tq() +
    theme(legend.position = "bottom")

# 5.0 Box Plot / Violin Plot ----
# - Great for comparing distributions


# Goal: Unit price of model, segmenting by category 2

# Data Manipulation
unit_price_by_cat_2_tbl <- bike_orderlines_tbl %>%
    
    select(category_2, model, price) %>%
    distinct() %>%
    
    mutate(category_2 = as_factor(category_2) %>% fct_reorder(price))

# Box Plot
unit_price_by_cat_2_tbl %>%
    
    ggplot(aes(category_2, price)) +
    
    geom_boxplot() +
    coord_flip() +
    theme_tq()

# Violin Plot & Jitter Plot

unit_price_by_cat_2_tbl %>%
    
    ggplot(aes(category_2, price)) +
    
    geom_jitter(width = 0.15, color = "#2c3e50") +
    geom_violin(alpha = 0.5) +
    
    coord_flip() +
    theme_tq()

# 6.0 Adding Text & Labels ----

# Goal: Exposing sales over time, highlighting outlier

# Data Manipulation

revenue_by_year_tbl <- bike_orderlines_tbl %>%
    
    select(order_date, total_price) %>%
    
    mutate(year = year(order_date)) %>%
    
    group_by(year) %>%
    summarize(revenue = sum(total_price)) %>%
    ungroup()

# Adding text to bar chart
# Filtering labels to highlight a point

revenue_by_year_tbl %>%
    
    # mutate(revenue_text = scales::dollar(revenue, scale = 1e-6, suffix = "M")) %>%
    
    ggplot(aes(year, revenue)) +
    
    geom_col(fill = "#2c3e50") +
    geom_smooth(method = "lm", se = FALSE) +
    
    geom_text(aes(label =  scales::dollar(revenue, scale = 1e-6, suffix = "M")), 
              vjust = 1.5, color = "white") +
    
    geom_label(label =  "Major Demand This Year",
               vjust = -0.5, 
               size  = 5,
               fill  = "#1f78b4",
               color = "white",
               fontface = "italic",
               data = revenue_by_year_tbl %>%
                   filter(year %in% c(2013))) + 
    
    expand_limits(y = 2e7) +
    theme_tq()