checkpoint_3_forcats
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.1 √ purrr 0.3.2
## √ tibble 2.1.1 √ dplyr 0.8.1
## √ tidyr 0.8.3 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 3.5.3
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.5.3
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
## Loading required package: PerformanceAnalytics
## Warning: package 'PerformanceAnalytics' was built under R version 3.5.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Warning: package 'quantmod' was built under R version 3.5.3
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 3.5.3
## Version 0.4-0 included new data defaults. See ?getSymbols.
bike_orderlines_tbl <- read_rds("D:/hw0527/HW0527/data_wrangled_student/bike_orderlines.rds")
bike_orderlines_tbl
## # A tibble: 15,644 x 13
## order_date order_id order_line quantity price total_price model
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2011-01-07 00:00:00 1 1 1 6070 6070 Jeky~
## 2 2011-01-07 00:00:00 1 2 1 5970 5970 Trig~
## 3 2011-01-10 00:00:00 2 1 1 2770 2770 Beas~
## 4 2011-01-10 00:00:00 2 2 1 5970 5970 Trig~
## 5 2011-01-10 00:00:00 3 1 1 10660 10660 Supe~
## 6 2011-01-10 00:00:00 3 2 1 3200 3200 Jeky~
## 7 2011-01-10 00:00:00 3 3 1 12790 12790 Supe~
## 8 2011-01-10 00:00:00 3 4 1 5330 5330 Supe~
## 9 2011-01-10 00:00:00 3 5 1 1570 1570 Syna~
## 10 2011-01-11 00:00:00 4 1 1 4800 4800 Syna~
## # ... with 15,634 more rows, and 6 more variables: category_1 <chr>,
## # category_2 <chr>, frame_material <chr>, bikeshop_name <chr>,
## # city <chr>, state <chr>
# 2.0 Motivating Example -----
# Manipulation
sales_by_cat_2_tbl <- bike_orderlines_tbl %>%
select(category_2, total_price) %>%
group_by(category_2) %>%
summarize(sales = sum(total_price)) %>%
ungroup() %>%
arrange(desc(sales)) %>%
mutate(category_2 = category_2 %>% as_factor() %>% fct_rev())
# Plotting
sales_by_cat_2_tbl %>%
ggplot(aes(x = sales, y = category_2)) +
geom_point(size = 5, color = "#2c3e50") +
labs(title = "Sales By Category 2") +
scale_x_continuous(labels = scales::dollar_format()) +
theme_tq() +
expand_limits(x = 0)

plot_sales <- function(data) {
data %>%
ggplot(aes(x = sales, y = category_2)) +
geom_point(size = 5, color = "#2c3e50") +
labs(title = "Sales By Category 2") +
scale_x_continuous(labels = scales::dollar_format()) +
theme_tq() +
expand_limits(x = 0)
}
sales_by_cat_2_tbl %>%
plot_sales()

# 3.1 Inspecting Factors ----
# Vector
sales_by_cat_2_tbl %>% pull(category_2) %>% levels()
## [1] "Fat Bike" "Sport" "Cyclocross"
## [4] "Triathalon" "Over Mountain" "Trail"
## [7] "Endurance Road" "Elite Road" "Cross Country Race"
sales_by_cat_2_tbl %>% pull(category_2) %>% as.numeric()
## [1] 9 8 7 6 5 4 3 2 1
# Tibble
sales_by_cat_2_tbl %>%
mutate(category_2 = category_2 %>% fct_rev() %>% fct_rev()) %>%
mutate(
label = category_2 %>% as.character(),
value = category_2 %>% as.numeric()
)
## # A tibble: 9 x 4
## category_2 sales label value
## <fct> <dbl> <chr> <dbl>
## 1 Cross Country Race 19224630 Cross Country Race 9
## 2 Elite Road 15334665 Elite Road 8
## 3 Endurance Road 10381060 Endurance Road 7
## 4 Trail 9373460 Trail 6
## 5 Over Mountain 7571270 Over Mountain 5
## 6 Triathalon 4053750 Triathalon 4
## 7 Cyclocross 2108120 Cyclocross 3
## 8 Sport 1932755 Sport 2
## 9 Fat Bike 1052620 Fat Bike 1
# 3.2 Creating Factors: as_factor() vs as.factor() ----
sales_by_cat_2_tbl %>%
mutate(
category_2 = as.character(category_2),
category_2_as_factor = as_factor(category_2) %>% as.numeric(),
category_2_as.factor = as.factor(category_2) %>% as.numeric()
)
## # A tibble: 9 x 4
## category_2 sales category_2_as_factor category_2_as.factor
## <chr> <dbl> <dbl> <dbl>
## 1 Cross Country Race 19224630 1 1
## 2 Elite Road 15334665 2 3
## 3 Endurance Road 10381060 3 4
## 4 Trail 9373460 4 8
## 5 Over Mountain 7571270 5 6
## 6 Triathalon 4053750 6 9
## 7 Cyclocross 2108120 7 2
## 8 Sport 1932755 8 7
## 9 Fat Bike 1052620 9 5
# 3.3 Reordering Factors: fct_reorder() and fct_rev() ----
sales_by_cat_2_tbl %>%
arrange(desc(sales)) %>%
mutate(sales_negative = -sales) %>%
mutate(
category_2 = category_2 %>% fct_reorder(sales_negative),
values = category_2 %>% as.numeric()) %>%
plot_sales()

# 3.4 Time-Based Reordering: fct_reorder2() ----
sales_by_cat_2_q_tbl <- bike_orderlines_tbl %>%
mutate(order_date = order_date %>% floor_date("quarter") %>% ymd()) %>%
group_by(category_2, order_date) %>%
summarise(sales = sum(total_price)) %>%
ungroup()
sales_by_cat_2_q_tbl
## # A tibble: 180 x 3
## category_2 order_date sales
## <chr> <date> <dbl>
## 1 Cross Country Race 2011-01-01 610060
## 2 Cross Country Race 2011-04-01 1083310
## 3 Cross Country Race 2011-07-01 609770
## 4 Cross Country Race 2011-10-01 614110
## 5 Cross Country Race 2012-01-01 731330
## 6 Cross Country Race 2012-04-01 1097010
## 7 Cross Country Race 2012-07-01 1000220
## 8 Cross Country Race 2012-10-01 532240
## 9 Cross Country Race 2013-01-01 1017470
## 10 Cross Country Race 2013-04-01 1503950
## # ... with 170 more rows
sales_by_cat_2_q_tbl %>%
mutate(category_2 = category_2 %>% fct_reorder2(order_date, sales)) %>%
ggplot(aes(x = order_date, y = sales, color = category_2)) +
geom_point() +
geom_line() +
facet_wrap(~ category_2) +
theme_tq() +
scale_color_tq() +
scale_y_continuous(labels = scales::dollar_format(scale = 1e-6, suffix = "M"))

# 3.5 Creating "Other" Category - fct_lump() & fct_relevel() ----
sales_by_cat_2_tbl %>%
mutate(category_2 = category_2 %>% fct_lump(n = 6,
w = sales,
other_level = "All Other Bike Categories")) %>%
group_by(category_2) %>%
summarize(sales = sum(sales)) %>%
mutate(category_2 = category_2 %>% fct_relevel("All Other Bike Categories", after = 0)) %>%
plot_sales()

ggplot2_geometries_1
bike_orderlines_tbl <- read_rds("D:/hw0527/HW0527/data_wrangled_student/bike_orderlines.rds")
glimpse(bike_orderlines_tbl)
## Observations: 15,644
## Variables: 13
## $ order_date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10...
## $ order_id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6,...
## $ order_line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2,...
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,...
## $ price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ total_price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ model <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast o...
## $ category_1 <chr> "Mountain", "Mountain", "Mountain", "Mountain",...
## $ category_2 <chr> "Over Mountain", "Over Mountain", "Trail", "Ove...
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carb...
## $ bikeshop_name <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Cl...
## $ city <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City...
## $ state <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY",...
# 1.0 Point / Scatter Plots ----
# - Great for Continuous vs Continuous
# - Also good for Lollipop Charts (more on this in advanced plots)
# Goal: Explain relationship between order value and quantity of bikes sold
# Data Manipulation
order_value_tbl <- bike_orderlines_tbl %>%
select(order_id, order_line, total_price, quantity) %>%
group_by(order_id) %>%
summarize(
total_quantity = sum(quantity),
total_price = sum(total_price)
) %>%
ungroup()
# Scatter Plot
order_value_tbl %>%
ggplot(aes(x = total_quantity, y = total_price)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", se = FALSE)

# 2.0 Line Plots ----
# - Great for time series
# Goal: Describe revenue by Month, expose cyclic nature
# Data Manipulation
revenue_by_month_tbl <- bike_orderlines_tbl %>%
select(order_date, total_price) %>%
mutate(year_month = floor_date(order_date, "months") %>% ymd()) %>%
group_by(year_month) %>%
summarize(revenue = sum(total_price)) %>%
ungroup()
# Line Plot
revenue_by_month_tbl %>%
ggplot(aes(year_month, revenue)) +
geom_line(size = 0.5, linetype = 1) +
geom_smooth(method = "loess", span = 0.2)

# 3.0 Bar / Column Plots ----
# - Great for categories
# Goal: Sales by Descriptive Category
# Data Manipulation
revenue_by_category_2_tbl <- bike_orderlines_tbl %>%
select(category_2, total_price) %>%
group_by(category_2) %>%
summarize(revenue = sum(total_price)) %>%
ungroup()
# Bar Plot
revenue_by_category_2_tbl %>%
mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>%
ggplot(aes(category_2, revenue)) +
geom_col(fill = "#2c3e50") +
coord_flip()

# 4.0 Histogram / Density Plots ----
# - Great for inspecting the distribution of a variable
# Goal: Unit price of bicycles
# Histogram
bike_orderlines_tbl %>%
distinct(model, price) %>%
ggplot(aes(price)) +
geom_histogram(bins = 25, fill = "purple", color = "white")

# Goal: Unit price of bicylce, segmenting by frame material
# Histogram
bike_orderlines_tbl %>%
distinct(price, model, frame_material) %>%
ggplot(aes(price, fill = frame_material)) +
geom_histogram() +
facet_wrap(~ frame_material, ncol = 1) +
scale_fill_tq() +
theme_tq()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density
bike_orderlines_tbl %>%
distinct(price, model, frame_material) %>%
ggplot(aes(price, fill = frame_material)) +
geom_density(alpha = 0.5) +
# facet_wrap(~ frame_material, ncol = 1) +
scale_fill_tq() +
theme_tq() +
theme(legend.position = "bottom")

# 5.0 Box Plot / Violin Plot ----
# - Great for comparing distributions
# Goal: Unit price of model, segmenting by category 2
# Data Manipulation
unit_price_by_cat_2_tbl <- bike_orderlines_tbl %>%
select(category_2, model, price) %>%
distinct() %>%
mutate(category_2 = as_factor(category_2) %>% fct_reorder(price))
# Box Plot
unit_price_by_cat_2_tbl %>%
ggplot(aes(category_2, price)) +
geom_boxplot() +
coord_flip() +
theme_tq()

# Violin Plot & Jitter Plot
unit_price_by_cat_2_tbl %>%
ggplot(aes(category_2, price)) +
geom_jitter(width = 0.15, color = "#2c3e50") +
geom_violin(alpha = 0.5) +
coord_flip() +
theme_tq()

# 6.0 Adding Text & Labels ----
# Goal: Exposing sales over time, highlighting outlier
# Data Manipulation
revenue_by_year_tbl <- bike_orderlines_tbl %>%
select(order_date, total_price) %>%
mutate(year = year(order_date)) %>%
group_by(year) %>%
summarize(revenue = sum(total_price)) %>%
ungroup()
# Adding text to bar chart
# Filtering labels to highlight a point
revenue_by_year_tbl %>%
# mutate(revenue_text = scales::dollar(revenue, scale = 1e-6, suffix = "M")) %>%
ggplot(aes(year, revenue)) +
geom_col(fill = "#2c3e50") +
geom_smooth(method = "lm", se = FALSE) +
geom_text(aes(label = scales::dollar(revenue, scale = 1e-6, suffix = "M")),
vjust = 1.5, color = "white") +
geom_label(label = "Major Demand This Year",
vjust = -0.5,
size = 5,
fill = "#1f78b4",
color = "white",
fontface = "italic",
data = revenue_by_year_tbl %>%
filter(year %in% c(2013))) +
expand_limits(y = 2e7) +
theme_tq()
