library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.1 √ purrr 0.3.2
## √ tibble 2.1.1 √ dplyr 0.8.1
## √ tidyr 0.8.3 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 3.5.3
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.5.3
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
## Loading required package: PerformanceAnalytics
## Warning: package 'PerformanceAnalytics' was built under R version 3.5.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Warning: package 'quantmod' was built under R version 3.5.3
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 3.5.3
## Version 0.4-0 included new data defaults. See ?getSymbols.
# Read the bike order-lines table from its RDS file, then echo it to the
# console for a quick look at the columns.
bike_orderlines_tbl <- "data_wrangled_student/bike_orderlines.rds" %>%
  read_rds()
bike_orderlines_tbl
## # A tibble: 15,644 x 13
## order_date order_id order_line quantity price total_price model
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2011-01-07 00:00:00 1 1 1 6070 6070 Jeky~
## 2 2011-01-07 00:00:00 1 2 1 5970 5970 Trig~
## 3 2011-01-10 00:00:00 2 1 1 2770 2770 Beas~
## 4 2011-01-10 00:00:00 2 2 1 5970 5970 Trig~
## 5 2011-01-10 00:00:00 3 1 1 10660 10660 Supe~
## 6 2011-01-10 00:00:00 3 2 1 3200 3200 Jeky~
## 7 2011-01-10 00:00:00 3 3 1 12790 12790 Supe~
## 8 2011-01-10 00:00:00 3 4 1 5330 5330 Supe~
## 9 2011-01-10 00:00:00 3 5 1 1570 1570 Syna~
## 10 2011-01-11 00:00:00 4 1 1 4800 4800 Syna~
## # ... with 15,634 more rows, and 6 more variables: category_1 <chr>,
## # category_2 <chr>, frame_material <chr>, bikeshop_name <chr>,
## # city <chr>, state <chr>
執行tidyverse、tidyquant
用read_rds 讀取bike_orderlines.rds 資料
# Total sales per `category_2`, sorted from the largest to the smallest total.
# `as_factor()` takes its levels from the (sorted) row order and `fct_rev()`
# then flips them, so the best-selling category becomes the last level.
sales_by_cat_2_tbl <- local({
  price_by_cat <- select(bike_orderlines_tbl, category_2, total_price)
  cat_totals <- ungroup(
    summarize(group_by(price_by_cat, category_2), sales = sum(total_price))
  )
  ranked <- arrange(cat_totals, desc(sales))
  mutate(ranked, category_2 = fct_rev(as_factor(category_2)))
})
用select函數選擇需要的欄位
用mutate函數新增新的欄位資料
用summarize函數總整所有資料
用group_by函數 - 群組起來
用arrange函數將資料排列整理
desc函數 - 由大到小排列
fct_rev() - 反轉因子層級(levels)的順序
# Dot plot: one point per category, sales on the x axis (dollar labels),
# with the x axis forced to include zero.
ggplot(sales_by_cat_2_tbl, aes(x = sales, y = category_2)) +
  geom_point(color = "#2c3e50", size = 5) +
  expand_limits(x = 0) +
  scale_x_continuous(labels = scales::dollar_format()) +
  labs(title = "Sales By Category 2") +
  theme_tq()
#' Dot plot of sales per secondary category
#'
#' Reusable version of the "Sales By Category 2" chart: one point per row,
#' `sales` on a dollar-formatted x axis that always includes zero.
#'
#' @param data A data frame with `sales` and `category_2` columns.
#' @return A ggplot object.
plot_sales <- function(data) {
  sales_dots <- geom_point(size = 5, color = "#2c3e50")
  dollar_axis <- scale_x_continuous(labels = scales::dollar_format())

  ggplot(data, aes(x = sales, y = category_2)) +
    sales_dots +
    labs(title = "Sales By Category 2") +
    dollar_axis +
    theme_tq() +
    expand_limits(x = 0)
}
# Draw the same chart through the reusable helper.
plot_sales(sales_by_cat_2_tbl)
用geom_point函數畫點
theme_tq() - 美化圖表
plot_sales <- function(data) - 設定函數方便使用
# Inspect the factor levels of category_2 (their order drives the plot axis).
levels(pull(sales_by_cat_2_tbl, category_2))
## [1] "Fat Bike" "Sport" "Cyclocross"
## [4] "Triathalon" "Over Mountain" "Trail"
## [7] "Endurance Road" "Elite Road" "Cross Country Race"
# Inspect the integer code stored behind each category_2 value.
as.numeric(pull(sales_by_cat_2_tbl, category_2))
## [1] 9 8 7 6 5 4 3 2 1
pull函數 - 將資料拉出
as.numeric() - 數字形式
# Show each category next to its text label and its underlying integer code.
# (The former `fct_rev() %>% fct_rev()` step reversed the levels twice, which
# leaves the factor unchanged, so it has been dropped.)
sales_by_cat_2_tbl %>%
  mutate(
    label = category_2 %>% as.character(),
    value = category_2 %>% as.numeric()
  )
## # A tibble: 9 x 4
## category_2 sales label value
## <fct> <dbl> <chr> <dbl>
## 1 Cross Country Race 19224630 Cross Country Race 9
## 2 Elite Road 15334665 Elite Road 8
## 3 Endurance Road 10381060 Endurance Road 7
## 4 Trail 9373460 Trail 6
## 5 Over Mountain 7571270 Over Mountain 5
## 6 Triathalon 4053750 Triathalon 4
## 7 Cyclocross 2108120 Cyclocross 3
## 8 Sport 1932755 Sport 2
## 9 Fat Bike 1052620 Fat Bike 1
用mutate函數新增新的欄位資料
label為文字形式
value為數字形式
# Compare the integer codes produced by forcats::as_factor() and
# base::as.factor() once category_2 has been turned back into plain text.
sales_by_cat_2_tbl %>%
  mutate(category_2 = as.character(category_2)) %>%
  mutate(
    category_2_as_factor = as.numeric(as_factor(category_2)),
    category_2_as.factor = as.numeric(as.factor(category_2))
  )
## # A tibble: 9 x 4
## category_2 sales category_2_as_factor category_2_as.factor
## <chr> <dbl> <dbl> <dbl>
## 1 Cross Country Race 19224630 1 1
## 2 Elite Road 15334665 2 3
## 3 Endurance Road 10381060 3 4
## 4 Trail 9373460 4 8
## 5 Over Mountain 7571270 5 6
## 6 Triathalon 4053750 6 9
## 7 Cyclocross 2108120 7 2
## 8 Sport 1932755 8 7
## 9 Fat Bike 1052620 9 5
用mutate函數新增新的欄位資料
category_2為文字形式
category_2_as_factor、category_2_as.factor為數字形式
as_factor - 以原本的排列作排列
as.factor - 以英文字母排列
# Re-level category_2 by descending sales and plot the result.
# `.desc = TRUE` gives the same level order as reordering on a negated helper
# column, without adding a throw-away `sales_negative` column to the table.
sales_by_cat_2_tbl %>%
  arrange(desc(sales)) %>%
  mutate(
    category_2 = category_2 %>% fct_reorder(sales, .desc = TRUE),
    values = category_2 %>% as.numeric()
  ) %>%
  plot_sales()
用arrange函數將資料排列整理
desc函數 - 由大到小排列
用mutate函數新增新的欄位資料
fct_reorder函數 - 針對因子排序
category_2為因子(factor)形式,values為其對應的數字編碼
# Quarterly sales per category_2.
# order_date is a date-time; floor it to the first day of its quarter and
# convert to Date with as_date() instead of re-parsing its printed form
# through ymd().
sales_by_cat_2_q_tbl <- bike_orderlines_tbl %>%
  mutate(order_date = order_date %>% floor_date("quarter") %>% as_date()) %>%
  group_by(category_2, order_date) %>%
  summarize(sales = sum(total_price)) %>%
  ungroup()
sales_by_cat_2_q_tbl
## # A tibble: 180 x 3
## category_2 order_date sales
## <chr> <date> <dbl>
## 1 Cross Country Race 2011-01-01 610060
## 2 Cross Country Race 2011-04-01 1083310
## 3 Cross Country Race 2011-07-01 609770
## 4 Cross Country Race 2011-10-01 614110
## 5 Cross Country Race 2012-01-01 731330
## 6 Cross Country Race 2012-04-01 1097010
## 7 Cross Country Race 2012-07-01 1000220
## 8 Cross Country Race 2012-10-01 532240
## 9 Cross Country Race 2013-01-01 1017470
## 10 Cross Country Race 2013-04-01 1503950
## # ... with 170 more rows
# One small panel per category: quarterly sales as points joined by lines,
# y axis labelled in millions of dollars. Category levels are reordered with
# fct_reorder2() on (order_date, sales) before plotting.
sales_by_cat_2_q_tbl %>%
  mutate(category_2 = fct_reorder2(category_2, order_date, sales)) %>%
  ggplot(aes(x = order_date, y = sales, color = category_2)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ category_2) +
  scale_y_continuous(
    labels = scales::dollar_format(scale = 1e-6, suffix = "M")
  ) +
  scale_color_tq() +
  theme_tq()
用mutate函數新增新的欄位資料
用summarize函數總整所有資料
用group_by函數 - 群組起來
用geom_point函數畫點
用geom_line函數畫線
facet_wrap函數 - 把資料分成很多小圖
theme_tq() 、scale_color_tq()美化圖表
# Keep the 6 categories with the largest sales weight, merge the rest into
# "All Other Bike Categories", re-total sales, move the merged level to the
# front of the level order, and plot.
sales_by_cat_2_tbl %>%
  mutate(
    category_2 = fct_lump(
      category_2,
      n = 6,
      w = sales,
      other_level = "All Other Bike Categories"
    )
  ) %>%
  group_by(category_2) %>%
  summarize(sales = sum(sales)) %>%
  mutate(
    category_2 = fct_relevel(category_2, "All Other Bike Categories", after = 0)
  ) %>%
  plot_sales()
用mutate函數新增新的欄位資料
用summarize函數總整所有資料
用group_by函數 - 群組起來
fct_lump函數 - 把較小的類別合併成一個「其他」類別
fct_relevel函數 - 把資料重新排序
library(tidyverse)
library(lubridate)
library(tidyquant)
# Reload the order-lines table and list every column with its type and first
# few values.
bike_orderlines_tbl <- "data_wrangled_student/bike_orderlines.rds" %>%
  read_rds()
bike_orderlines_tbl %>% glimpse()
## Observations: 15,644
## Variables: 13
## $ order_date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10...
## $ order_id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6,...
## $ order_line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2,...
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,...
## $ price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ total_price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ model <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast o...
## $ category_1 <chr> "Mountain", "Mountain", "Mountain", "Mountain",...
## $ category_2 <chr> "Over Mountain", "Over Mountain", "Trail", "Ove...
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carb...
## $ bikeshop_name <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Cl...
## $ city <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City...
## $ state <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY",...
執行tidyverse、tidyquant、lubridate
用read_rds 讀取bike_orderlines.rds 資料
# Collapse order lines to one row per order: total units and total value.
order_value_tbl <- local({
  line_items <- select(
    bike_orderlines_tbl,
    order_id, order_line, total_price, quantity
  )
  per_order <- summarize(
    group_by(line_items, order_id),
    total_quantity = sum(quantity),
    total_price = sum(total_price)
  )
  ungroup(per_order)
})
用select函數選擇需要的欄位
用mutate函數新增新的欄位資料
用summarize函數總整所有資料
用group_by函數 - 群組起來
# Scatter plot of order value against order size, with a straight-line (lm)
# fit drawn without its confidence band.
ggplot(order_value_tbl, aes(x = total_quantity, y = total_price)) +
  geom_point(size = 2, alpha = 0.5) +
  geom_smooth(se = FALSE, method = "lm")
用aes設定X與Y軸
用geom_point函數畫點
alpha - 透明度
geom_smooth函數畫平滑線(lm - 回歸)
# Monthly revenue.
# order_date is a date-time; floor it to the first day of its month and
# convert to Date with as_date() instead of re-parsing its printed form
# through ymd().
revenue_by_month_tbl <- bike_orderlines_tbl %>%
  select(order_date, total_price) %>%
  mutate(year_month = floor_date(order_date, "months") %>% as_date()) %>%
  group_by(year_month) %>%
  summarize(revenue = sum(total_price)) %>%
  ungroup()
用mutate函數新增新的欄位資料
用summarize函數總整所有資料
用group_by函數 - 群組起來
floor_date函數 - 將日期無條件捨去至該期間(此處為月份)的第一天
# Monthly revenue as a thin solid line, overlaid with a loess smoother
# (span = 0.2).
ggplot(revenue_by_month_tbl, aes(x = year_month, y = revenue)) +
  geom_line(linetype = 1, size = 0.5) +
  geom_smooth(span = 0.2, method = "loess")
用aes設定X與Y軸
用geom_line函數畫線
geom_smooth函數畫平滑線(loess - 曲線)
# Total revenue for each category_2.
revenue_by_category_2_tbl <- local({
  price_by_cat <- select(bike_orderlines_tbl, category_2, total_price)
  ungroup(
    summarize(group_by(price_by_cat, category_2), revenue = sum(total_price))
  )
})
用select函數選擇需要的欄位
用summarize函數總整所有資料
用group_by函數 - 群組起來
# Horizontal bar chart of revenue per category, bars ordered by revenue.
revenue_by_category_2_tbl %>%
  mutate(category_2 = fct_reorder(as_factor(category_2), revenue)) %>%
  ggplot(aes(x = category_2, y = revenue)) +
  geom_col(fill = "#2c3e50") +
  coord_flip()
用mutate函數新增新的欄位資料
geom_col函數畫欄
coord_flip()函數 - X與Y軸對調
# Histogram of unit price over distinct (price, model, frame_material) rows,
# one stacked panel per frame material. The bin count is left at the
# geom_histogram() default.
ggplot(
  distinct(bike_orderlines_tbl, price, model, frame_material),
  aes(x = price, fill = frame_material)
) +
  geom_histogram() +
  facet_wrap(~ frame_material, ncol = 1) +
  theme_tq() +
  scale_fill_tq()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
distinct函數 - 把重複資料刪除取唯一一個
用aes設定X與Y軸
geom_histogram()畫直方圖
scale_fill_tq()、theme_tq() - 用來美化圖表
facet_wrap函數 - 把資料分成很多小圖
# Overlaid, semi-transparent density curves of unit price per frame material,
# with the legend placed under the plot.
# Optional: add facet_wrap(~ frame_material, ncol = 1) to split the curves
# into one panel per material.
ggplot(
  distinct(bike_orderlines_tbl, price, model, frame_material),
  aes(x = price, fill = frame_material)
) +
  geom_density(alpha = 0.5) +
  scale_fill_tq() +
  theme_tq() +
  theme(legend.position = "bottom")
distinct函數 - 把重複資料刪除取唯一一個
用aes設定X與Y軸
geom_density畫曲線分布圖
scale_fill_tq()、theme_tq() - 用來美化圖表
# Distinct (category_2, model, price) rows, with category_2 turned into a
# factor whose levels are ordered by price via fct_reorder().
unit_price_by_cat_2_tbl <- local({
  model_prices <- distinct(select(bike_orderlines_tbl, category_2, model, price))
  mutate(model_prices, category_2 = fct_reorder(as_factor(category_2), price))
})
distinct函數 - 把重複資料刪除取唯一一個
用select函數選擇需要的欄位
用mutate函數新增新的欄位資料
# Horizontal box plot of unit price for each category.
ggplot(unit_price_by_cat_2_tbl, aes(x = category_2, y = price)) +
  geom_boxplot() +
  theme_tq() +
  coord_flip()
用aes設定X與Y軸
geom_boxplot畫盒狀圖
coord_flip()函數 - X與Y軸對調
theme_tq()美化圖表
# Jittered price points per category with a semi-transparent violin drawn on
# top of them, flipped so the categories run down the vertical axis.
ggplot(unit_price_by_cat_2_tbl, aes(x = category_2, y = price)) +
  geom_jitter(color = "#2c3e50", width = 0.15) +
  geom_violin(alpha = 0.5) +
  theme_tq() +
  coord_flip()
用aes設定X與Y軸
coord_flip()函數 - X與Y軸對調
theme_tq()美化圖表
# Total revenue per calendar year of the order date.
revenue_by_year_tbl <- local({
  dated_prices <- select(bike_orderlines_tbl, order_date, total_price)
  with_year <- mutate(dated_prices, year = year(order_date))
  ungroup(summarize(group_by(with_year, year), revenue = sum(total_price)))
})
用select函數選擇需要的欄位
用mutate函數新增新的欄位資料
用summarize函數總整所有資料
用group_by函數 - 群組起來
# Yearly revenue bars with a linear trend line. Each bar carries its revenue
# in millions of dollars as white text, and the 2013 bar gets a call-out
# label above it. The y axis is stretched to at least 2e7.
ggplot(revenue_by_year_tbl, aes(x = year, y = revenue)) +
  geom_col(fill = "#2c3e50") +
  geom_smooth(se = FALSE, method = "lm") +
  geom_text(
    aes(label = scales::dollar(revenue, scale = 1e-6, suffix = "M")),
    color = "white",
    vjust = 1.5
  ) +
  geom_label(
    # Only the 2013 row is passed in, so a single label is drawn.
    data = filter(revenue_by_year_tbl, year == 2013),
    label = "Major Demand This Year",
    fill = "#1f78b4",
    color = "white",
    fontface = "italic",
    size = 5,
    vjust = -0.5
  ) +
  theme_tq() +
  expand_limits(y = 2e7)
用aes設定X與Y軸
geom_col函數畫欄
geom_smooth函數畫平滑線
用geom_text函數把文字標示在圖表上
用geom_label函數修改標籤資料
theme_tq()美化圖表