library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.1 √ purrr 0.3.2
## √ tibble 2.1.1 √ dplyr 0.8.0.1
## √ tidyr 0.8.3 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.5.3
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(readxl)
## Warning: package 'readxl' was built under R version 3.5.3
library(writexl)
## Warning: package 'writexl' was built under R version 3.5.3
?read_excel()
## starting httpd help server ... done
bikes_tbl <- read_excel("bikes.xlsx")
bikeshops_tbl <- read_excel("bikeshops.xlsx")
orderlines_tbl <- read_excel("orderlines.xlsx")
## New names:
## * `` -> ...1
bikes_tbl
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # ... with 87 more rows
glimpse(bikes_tbl)
## Observations: 97
## Variables: 4
## $ bike.id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ model <chr> "Supersix Evo Black Inc.", "Supersix Evo Hi-Mod Te...
## $ description <chr> "Road - Elite Road - Carbon", "Road - Elite Road -...
## $ price <dbl> 12790, 10660, 7990, 5330, 4260, 3940, 3200, 2660, ...
bikeshops_tbl
## # A tibble: 30 x 3
## bikeshop.id bikeshop.name location
## <dbl> <chr> <chr>
## 1 1 Pittsburgh Mountain Machines Pittsburgh, PA
## 2 2 Ithaca Mountain Climbers Ithaca, NY
## 3 3 Columbus Race Equipment Columbus, OH
## 4 4 Detroit Cycles Detroit, MI
## 5 5 Cincinnati Speed Cincinnati, OH
## 6 6 Louisville Race Equipment Louisville, KY
## 7 7 Nashville Cruisers Nashville, TN
## 8 8 Denver Bike Shop Denver, CO
## 9 9 Minneapolis Bike Shop Minneapolis, MN
## 10 10 Kansas City 29ers Kansas City, KS
## # ... with 20 more rows
orderlines_tbl
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # ... with 15,634 more rows, and 1 more variable: quantity <dbl>
?left_join
orderlines_tbl
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # ... with 15,634 more rows, and 1 more variable: quantity <dbl>
bikes_tbl
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # ... with 87 more rows
left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id"))
## # A tibble: 15,644 x 10
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # ... with 15,634 more rows, and 4 more variables: quantity <dbl>,
## # model <chr>, description <chr>, price <dbl>
bike_orderlines_joined_tbl <- orderlines_tbl %>%
left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>%
left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))
bike_orderlines_joined_tbl
## # A tibble: 15,644 x 12
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # ... with 15,634 more rows, and 6 more variables: quantity <dbl>,
## # model <chr>, description <chr>, price <dbl>, bikeshop.name <chr>,
## # location <chr>
bike_orderlines_joined_tbl %>% glimpse()
## Observations: 15,644
## Variables: 12
## $ ...1 <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10...
## $ order.id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, ...
## $ order.line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, ...
## $ order.date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10,...
## $ customer.id <dbl> 2, 2, 10, 10, 6, 6, 6, 6, 6, 22, 8, 8, 8, 8, 16,...
## $ product.id <dbl> 48, 52, 76, 52, 2, 50, 1, 4, 34, 26, 96, 66, 35,...
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ...
## $ model <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast of...
## $ description <chr> "Mountain - Over Mountain - Carbon", "Mountain -...
## $ price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330...
## $ bikeshop.name <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Cli...
## $ location <chr> "Ithaca, NY", "Ithaca, NY", "Kansas City, KS", "...
bike_orderlines_wrangled_tbl <- bike_orderlines_joined_tbl %>%
# Separate description into category.1, category.2, and frame.material
separate(description,
into = c("category.1", "category.2", "frame.material"),
sep = " - ",
remove = TRUE) %>%
# Separate location into city and state
separate(location,
into = c("city", "state"),
sep = ", ",
remove = FALSE) %>%
# price extended
mutate(total.price = price * quantity) %>%
# Reorganize
select(-...1, -location) %>%
select(-ends_with(".id")) %>%
bind_cols(bike_orderlines_joined_tbl %>% select(order.id)) %>%
# Reorder columns
select(contains("date"), contains("id"), contains("order"),
quantity, price, total.price,
everything()) %>%
# Renaming columns
rename(order_date = order.date) %>%
set_names(names(.) %>% str_replace_all("\\.", "_"))
bike_orderlines_wrangled_tbl %>% glimpse()
## Observations: 15,644
## Variables: 13
## $ order_date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10...
## $ order_id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6,...
## $ order_line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2,...
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,...
## $ price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ total_price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ model <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast o...
## $ category_1 <chr> "Mountain", "Mountain", "Mountain", "Mountain",...
## $ category_2 <chr> "Over Mountain", "Over Mountain", "Trail", "Ove...
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carb...
## $ bikeshop_name <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Cl...
## $ city <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City...
## $ state <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY",...