library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.1 √ purrr 0.3.2
## √ tibble 2.1.1 √ dplyr 0.8.0.1
## √ tidyr 0.8.3 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.5.3
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
#library(tidyquant)
library(readxl)
## Warning: package 'readxl' was built under R version 3.5.3
library(writexl)
## Warning: package 'writexl' was built under R version 3.5.3
讀excel檔的程式readxl、writexl
bikes_tbl <- read_excel("bikes.xlsx")
bikeshops_tbl <- read_excel("bikeshops.xlsx")
orderlines_tbl <- read_excel("orderlines.xlsx")
## New names:
## * `` -> ...1
bikes_tbl
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # ... with 87 more rows
orderlines_tbl %>% glimpse()
## Observations: 15,644
## Variables: 7
## $ ...1 <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",...
## $ order.id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6,...
## $ order.line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3,...
## $ order.date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2...
## $ customer.id <dbl> 2, 2, 10, 10, 6, 6, 6, 6, 6, 22, 8, 8, 8, 8, 16, 1...
## $ product.id <dbl> 48, 52, 76, 52, 2, 50, 1, 4, 34, 26, 96, 66, 35, 7...
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,...
讀excel檔用read_excel
讀取bikes.xlsx、bikeshops.xlsx和
orderlines_tbl %>% glimpse() - 透過一個管道將orderlines_tbl丟到glimpse()中
bikes_tbl - 腳踏車型號資料
bikeshops_tbl - 腳踏車商家資料
orderlines_tbl - 腳踏車訂單編號資料
left_join(orderlines_tbl, bikes_tbl, by = c("product.id"="bike.id"))
## # A tibble: 15,644 x 10
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # ... with 15,634 more rows, and 4 more variables: quantity <dbl>,
## # model <chr>, description <chr>, price <dbl>
bikes_orderlines_joined_tbl<-orderlines_tbl %>% left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>%
left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))
bikes_orderlines_joined_tbl
## # A tibble: 15,644 x 12
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # ... with 15,634 more rows, and 6 more variables: quantity <dbl>,
## # model <chr>, description <chr>, price <dbl>, bikeshop.name <chr>,
## # location <chr>
orderlines_tbl %>% left_join(bikes_tbl, by = c("product.id"="bike.id")) %>%
left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))
## # A tibble: 15,644 x 12
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # ... with 15,634 more rows, and 6 more variables: quantity <dbl>,
## # model <chr>, description <chr>, price <dbl>, bikeshop.name <chr>,
## # location <chr>
將訂單編號和腳踏車型號2個資料連結
再將第一個連接好的資料跟腳踏車商家資料連結
orderlines_tbl %>% left_join(bikes_tbl, by = c(“product.id”=“bike.id”)) %>% left_join(bikeshops_tbl, by = c(“customer.id” = “bikeshop.id”)) - 透過一個管道將orderlines_tbl丟到left_join中
bike_orderlines_wrangled_tbl <- bikes_orderlines_joined_tbl %>%
separate(description,
into = c("category.1", "category.2", "frame.material"),
sep = " - ",
remove = TRUE) %>%
separate(location,
into = c("city", "state"),
sep = ", ",
remove = FALSE) %>%
mutate(total.price = price * quantity) %>%
select(-...1, -location) %>%
select(-ends_with(".id")) %>%
bind_cols(bikes_orderlines_joined_tbl %>% select(order.id)) %>%
select(contains("date"), contains("id"), contains("order"),
quantity, price, total.price,
everything()) %>%
rename(order_date = order.date) %>%
set_names(names(.) %>% str_replace_all("\\.", "_"))
bike_orderlines_wrangled_tbl %>% glimpse()
## Observations: 15,644
## Variables: 13
## $ order_date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10...
## $ order_id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6,...
## $ order_line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2,...
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,...
## $ price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ total_price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ model <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast o...
## $ category_1 <chr> "Mountain", "Mountain", "Mountain", "Mountain",...
## $ category_2 <chr> "Over Mountain", "Over Mountain", "Trail", "Ove...
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carb...
## $ bikeshop_name <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Cl...
## $ city <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City...
## $ state <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY",...
將資料中description的部分分為category.1、category.2、frame.material 3個部分
將資料中location的部分分為city(城市)、state(州) 2個部分
將價格部份延長計算price * quantity(用mutate函數)
選擇要刪除的部分
ends_with(“.id”) - 用ends_with函數將所有名稱改成.id
重新將欄命名,包含date、id、order等
將order_date = order.date重新命名
將有.的部分改用_取代重新命名
bike_orderlines_wrangled_tbl %>% glimpse() - 透過一個管道將bike_orderlines_wrangled_tbl丟到glimpse()中