library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.1       √ purrr   0.3.2  
## √ tibble  2.1.1       √ dplyr   0.8.0.1
## √ tidyr   0.8.3       √ stringr 1.4.0  
## √ readr   1.3.1       √ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.5.3
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
#library(tidyquant)

library(readxl)
## Warning: package 'readxl' was built under R version 3.5.3
library(writexl)
## Warning: package 'writexl' was built under R version 3.5.3

安裝並執行tidyverse、lubridate、tidyquant、readxl、writexl

讀excel檔的程式readxl、writexl

bikes_tbl <- read_excel("bikes.xlsx")
bikeshops_tbl <- read_excel("bikeshops.xlsx")
orderlines_tbl <- read_excel("orderlines.xlsx")
## New names:
## * `` -> ...1
bikes_tbl
## # A tibble: 97 x 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # ... with 87 more rows
orderlines_tbl %>% glimpse()
## Observations: 15,644
## Variables: 7
## $ ...1        <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",...
## $ order.id    <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6,...
## $ order.line  <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3,...
## $ order.date  <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2...
## $ customer.id <dbl> 2, 2, 10, 10, 6, 6, 6, 6, 6, 22, 8, 8, 8, 8, 16, 1...
## $ product.id  <dbl> 48, 52, 76, 52, 2, 50, 1, 4, 34, 26, 96, 66, 35, 7...
## $ quantity    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,...

讀取excel資料檔

讀excel檔用read_excel

讀取bikes.xlsx、bikeshops.xlsx和

orderlines_tbl %>% glimpse() - 透過一個管道將orderlines_tbl丟到glimpse()中

bikes_tbl - 腳踏車型號資料

bikeshops_tbl - 腳踏車商家資料

orderlines_tbl - 腳踏車訂單編號資料

left_join(orderlines_tbl, bikes_tbl, by = c("product.id"="bike.id"))
## # A tibble: 15,644 x 10
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # ... with 15,634 more rows, and 4 more variables: quantity <dbl>,
## #   model <chr>, description <chr>, price <dbl>
bikes_orderlines_joined_tbl<-orderlines_tbl %>% left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>% 
    left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))

bikes_orderlines_joined_tbl
## # A tibble: 15,644 x 12
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # ... with 15,634 more rows, and 6 more variables: quantity <dbl>,
## #   model <chr>, description <chr>, price <dbl>, bikeshop.name <chr>,
## #   location <chr>
orderlines_tbl %>% left_join(bikes_tbl, by = c("product.id"="bike.id")) %>% 
                   left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))
## # A tibble: 15,644 x 12
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # ... with 15,634 more rows, and 6 more variables: quantity <dbl>,
## #   model <chr>, description <chr>, price <dbl>, bikeshop.name <chr>,
## #   location <chr>

連結資料

將訂單編號和腳踏車型號2個資料連結

再將第一個連接好的資料跟腳踏車商家資料連結

orderlines_tbl %>% left_join(bikes_tbl, by = c(“product.id”=“bike.id”)) %>% left_join(bikeshops_tbl, by = c(“customer.id” = “bikeshop.id”)) - 透過一個管道將orderlines_tbl丟到left_join中

bike_orderlines_wrangled_tbl <- bikes_orderlines_joined_tbl %>%
  
  separate(description,
           into = c("category.1", "category.2", "frame.material"),
           sep = " - ",
           remove = TRUE) %>%
  
  separate(location,
           into = c("city", "state"),
           sep  = ", ",
           remove = FALSE) %>%
  
  mutate(total.price = price * quantity) %>%
  

  select(-...1, -location) %>%
  select(-ends_with(".id")) %>%
  
  bind_cols(bikes_orderlines_joined_tbl %>% select(order.id)) %>%
  

  select(contains("date"), contains("id"), contains("order"),
         quantity, price, total.price,
         everything()) %>%
  

  rename(order_date = order.date) %>%
  set_names(names(.) %>% str_replace_all("\\.", "_")) 

bike_orderlines_wrangled_tbl %>% glimpse()
## Observations: 15,644
## Variables: 13
## $ order_date     <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10...
## $ order_id       <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6,...
## $ order_line     <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2,...
## $ quantity       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,...
## $ price          <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ total_price    <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ model          <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast o...
## $ category_1     <chr> "Mountain", "Mountain", "Mountain", "Mountain",...
## $ category_2     <chr> "Over Mountain", "Over Mountain", "Trail", "Ove...
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carb...
## $ bikeshop_name  <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Cl...
## $ city           <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City...
## $ state          <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY",...

處理資料

將資料中description的部分分為category.1、category.2、frame.material 3個部分

將資料中location的部分分為city(城市)、state(州) 2個部分

將價格部份延長計算price * quantity(用mutate函數)

選擇要刪除的部分

ends_with(“.id”) - 用ends_with函數將所有名稱改成.id

重新將欄命名,包含date、id、order等

將order_date = order.date重新命名

將有.的部分改用_取代重新命名

bike_orderlines_wrangled_tbl %>% glimpse() - 透過一個管道將bike_orderlines_wrangled_tbl丟到glimpse()中