library(readxl)
library(tidyverse)
## ── Attaching packages ───────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(lubridate)
library(broom)
bike_orderlines_tbl <- read_rds("data_wrangled_student/bike_orderlines.rds")
glimpse(bike_orderlines_tbl)
## Observations: 15,644
## Variables: 13
## $ order_date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, …
## $ order_id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6…
## $ order_line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3…
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1…
## $ price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330,…
## $ total_price <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330,…
## $ model <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast of …
## $ category_1 <chr> "Mountain", "Mountain", "Mountain", "Mountain", "…
## $ category_2 <chr> "Over Mountain", "Over Mountain", "Trail", "Over …
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carbon…
## $ bikeshop_name <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Clim…
## $ city <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City",…
## $ state <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY", "…
# 1.0 PRIMER ON PURRR ----
# Programmatically getting Excel files into R
excel_paths_tbl <- fs::dir_info("data/bike_sales")
paths_chr <- excel_paths_tbl %>%
pull(path)
# What Not To Do: Don't use for loops
excel_list <- list()
for (path in paths_chr) {
excel_list[[path]] <- read_excel(path)
}
## New names:
## * `` -> ...1
excel_list
## $`data/bike_sales/bikes.xlsx`
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # … with 87 more rows
##
## $`data/bike_sales/bikeshops.xlsx`
## # A tibble: 30 x 3
## bikeshop.id bikeshop.name location
## <dbl> <chr> <chr>
## 1 1 Pittsburgh Mountain Machines Pittsburgh, PA
## 2 2 Ithaca Mountain Climbers Ithaca, NY
## 3 3 Columbus Race Equipment Columbus, OH
## 4 4 Detroit Cycles Detroit, MI
## 5 5 Cincinnati Speed Cincinnati, OH
## 6 6 Louisville Race Equipment Louisville, KY
## 7 7 Nashville Cruisers Nashville, TN
## 8 8 Denver Bike Shop Denver, CO
## 9 9 Minneapolis Bike Shop Minneapolis, MN
## 10 10 Kansas City 29ers Kansas City, KS
## # … with 20 more rows
##
## $`data/bike_sales/orderlines.xlsx`
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
# What to Do: Use map()
?map
# Method 1: function name
excel_list_2 <- paths_chr %>%
map(read_excel) %>%
set_names(paths_chr)
## New names:
## * `` -> ...1
# Method 2: Anonymous Function
paths_chr %>%
map(~ read_excel(.))
## New names:
## * `` -> ...1
## [[1]]
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # … with 87 more rows
##
## [[2]]
## # A tibble: 30 x 3
## bikeshop.id bikeshop.name location
## <dbl> <chr> <chr>
## 1 1 Pittsburgh Mountain Machines Pittsburgh, PA
## 2 2 Ithaca Mountain Climbers Ithaca, NY
## 3 3 Columbus Race Equipment Columbus, OH
## 4 4 Detroit Cycles Detroit, MI
## 5 5 Cincinnati Speed Cincinnati, OH
## 6 6 Louisville Race Equipment Louisville, KY
## 7 7 Nashville Cruisers Nashville, TN
## 8 8 Denver Bike Shop Denver, CO
## 9 9 Minneapolis Bike Shop Minneapolis, MN
## 10 10 Kansas City 29ers Kansas City, KS
## # … with 20 more rows
##
## [[3]]
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
# Method 3: Function specified with function()
paths_chr %>%
map(function(x) read_excel(path = x))
## New names:
## * `` -> ...1
## [[1]]
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # … with 87 more rows
##
## [[2]]
## # A tibble: 30 x 3
## bikeshop.id bikeshop.name location
## <dbl> <chr> <chr>
## 1 1 Pittsburgh Mountain Machines Pittsburgh, PA
## 2 2 Ithaca Mountain Climbers Ithaca, NY
## 3 3 Columbus Race Equipment Columbus, OH
## 4 4 Detroit Cycles Detroit, MI
## 5 5 Cincinnati Speed Cincinnati, OH
## 6 6 Louisville Race Equipment Louisville, KY
## 7 7 Nashville Cruisers Nashville, TN
## 8 8 Denver Bike Shop Denver, CO
## 9 9 Minneapolis Bike Shop Minneapolis, MN
## 10 10 Kansas City 29ers Kansas City, KS
## # … with 20 more rows
##
## [[3]]
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
# Reading Excel Sheets
excel_sheets("data/bike_sales/bikes.xlsx") %>%
map(~ read_excel(path = "data/bike_sales/bikes.xlsx", sheet = .))
## [[1]]
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # … with 87 more rows
# 2.0 MAPPING DATA FRAMES ----
# 2.1 Column-wise Map ----
bike_orderlines_tbl %>% is.list()
## [1] TRUE
bike_orderlines_tbl %>%
map(~ class(.))
## $order_date
## [1] "POSIXct" "POSIXt"
##
## $order_id
## [1] "numeric"
##
## $order_line
## [1] "numeric"
##
## $quantity
## [1] "numeric"
##
## $price
## [1] "numeric"
##
## $total_price
## [1] "numeric"
##
## $model
## [1] "character"
##
## $category_1
## [1] "character"
##
## $category_2
## [1] "character"
##
## $frame_material
## [1] "character"
##
## $bikeshop_name
## [1] "character"
##
## $city
## [1] "character"
##
## $state
## [1] "character"
# 2.2 Map Variants ----
?map
# Character map
bike_orderlines_tbl %>%
map_chr(~ class(.)[1])
## order_date order_id order_line quantity price
## "POSIXct" "numeric" "numeric" "numeric" "numeric"
## total_price model category_1 category_2 frame_material
## "numeric" "character" "character" "character" "character"
## bikeshop_name city state
## "character" "character" "character"
# Data Frame map
bike_orderlines_tbl %>%
map_df(~ class(.)[1]) %>%
gather()
## # A tibble: 13 x 2
## key value
## <chr> <chr>
## 1 order_date POSIXct
## 2 order_id numeric
## 3 order_line numeric
## 4 quantity numeric
## 5 price numeric
## 6 total_price numeric
## 7 model character
## 8 category_1 character
## 9 category_2 character
## 10 frame_material character
## 11 bikeshop_name character
## 12 city character
## 13 state character
bike_orderlines_tbl %>%
map_df(~ sum(is.na(.)) / length(.)) %>%
gather()
## # A tibble: 13 x 2
## key value
## <chr> <dbl>
## 1 order_date 0
## 2 order_id 0
## 3 order_line 0
## 4 quantity 0
## 5 price 0
## 6 total_price 0
## 7 model 0
## 8 category_1 0
## 9 category_2 0
## 10 frame_material 0
## 11 bikeshop_name 0
## 12 city 0
## 13 state 0
# 2.3 Row-wise Map ----
excel_tbl <- excel_paths_tbl %>%
select(path) %>%
mutate(data = path %>% map(read_excel))
## New names:
## * `` -> ...1
excel_list
## $`data/bike_sales/bikes.xlsx`
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # … with 87 more rows
##
## $`data/bike_sales/bikeshops.xlsx`
## # A tibble: 30 x 3
## bikeshop.id bikeshop.name location
## <dbl> <chr> <chr>
## 1 1 Pittsburgh Mountain Machines Pittsburgh, PA
## 2 2 Ithaca Mountain Climbers Ithaca, NY
## 3 3 Columbus Race Equipment Columbus, OH
## 4 4 Detroit Cycles Detroit, MI
## 5 5 Cincinnati Speed Cincinnati, OH
## 6 6 Louisville Race Equipment Louisville, KY
## 7 7 Nashville Cruisers Nashville, TN
## 8 8 Denver Bike Shop Denver, CO
## 9 9 Minneapolis Bike Shop Minneapolis, MN
## 10 10 Kansas City 29ers Kansas City, KS
## # … with 20 more rows
##
## $`data/bike_sales/orderlines.xlsx`
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
excel_tbl
## # A tibble: 3 x 2
## path data
## <fs::path> <list>
## 1 data/bike_sales/bikes.xlsx <tibble [97 × 4]>
## 2 data/bike_sales/bikeshops.xlsx <tibble [30 × 3]>
## 3 data/bike_sales/orderlines.xlsx <tibble [15,644 × 7]>
# 3.0 NESTED DATA ----
# Unnest
excel_tbl
## # A tibble: 3 x 2
## path data
## <fs::path> <list>
## 1 data/bike_sales/bikes.xlsx <tibble [97 × 4]>
## 2 data/bike_sales/bikeshops.xlsx <tibble [30 × 3]>
## 3 data/bike_sales/orderlines.xlsx <tibble [15,644 × 7]>
excel_tbl$data
## [[1]]
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # … with 87 more rows
##
## [[2]]
## # A tibble: 30 x 3
## bikeshop.id bikeshop.name location
## <dbl> <chr> <chr>
## 1 1 Pittsburgh Mountain Machines Pittsburgh, PA
## 2 2 Ithaca Mountain Climbers Ithaca, NY
## 3 3 Columbus Race Equipment Columbus, OH
## 4 4 Detroit Cycles Detroit, MI
## 5 5 Cincinnati Speed Cincinnati, OH
## 6 6 Louisville Race Equipment Louisville, KY
## 7 7 Nashville Cruisers Nashville, TN
## 8 8 Denver Bike Shop Denver, CO
## 9 9 Minneapolis Bike Shop Minneapolis, MN
## 10 10 Kansas City 29ers Kansas City, KS
## # … with 20 more rows
##
## [[3]]
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
excel_tbl$data[[3]]
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
excel_tbl_unnested <- excel_tbl %>%
unnest(data, .id = "ID")
excel_tbl_unnested
## # A tibble: 15,771 x 16
## path ID bike.id model description price bikeshop.id
## <fs::path> <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 data/bike… 1 1 Supe… Road - Eli… 12790 NA
## 2 data/bike… 1 2 Supe… Road - Eli… 10660 NA
## 3 data/bike… 1 3 Supe… Road - Eli… 7990 NA
## 4 data/bike… 1 4 Supe… Road - Eli… 5330 NA
## 5 data/bike… 1 5 Supe… Road - Eli… 4260 NA
## 6 data/bike… 1 6 Supe… Road - Eli… 3940 NA
## 7 data/bike… 1 7 Supe… Road - Eli… 3200 NA
## 8 data/bike… 1 8 Supe… Road - Eli… 2660 NA
## 9 data/bike… 1 9 Supe… Road - Eli… 2240 NA
## 10 data/bike… 1 10 Supe… Road - Eli… 1840 NA
## # … with 15,761 more rows, and 9 more variables: bikeshop.name <chr>,
## # location <chr>, ...1 <chr>, order.id <dbl>, order.line <dbl>,
## # order.date <dttm>, customer.id <dbl>, product.id <dbl>, quantity <dbl>
# Nest
excel_tbl_nested <- excel_tbl_unnested %>%
group_by(ID, path) %>%
nest()
excel_tbl_nested$data
## [[1]]
## # A tibble: 97 x 14
## bike.id model description price bikeshop.id bikeshop.name location ...1
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 Supe… Road - Eli… 12790 NA <NA> <NA> <NA>
## 2 2 Supe… Road - Eli… 10660 NA <NA> <NA> <NA>
## 3 3 Supe… Road - Eli… 7990 NA <NA> <NA> <NA>
## 4 4 Supe… Road - Eli… 5330 NA <NA> <NA> <NA>
## 5 5 Supe… Road - Eli… 4260 NA <NA> <NA> <NA>
## 6 6 Supe… Road - Eli… 3940 NA <NA> <NA> <NA>
## 7 7 Supe… Road - Eli… 3200 NA <NA> <NA> <NA>
## 8 8 Supe… Road - Eli… 2660 NA <NA> <NA> <NA>
## 9 9 Supe… Road - Eli… 2240 NA <NA> <NA> <NA>
## 10 10 Supe… Road - Eli… 1840 NA <NA> <NA> <NA>
## # … with 87 more rows, and 6 more variables: order.id <dbl>,
## # order.line <dbl>, order.date <dttm>, customer.id <dbl>,
## # product.id <dbl>, quantity <dbl>
##
## [[2]]
## # A tibble: 30 x 14
## bike.id model description price bikeshop.id bikeshop.name location ...1
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 NA <NA> <NA> NA 1 Pittsburgh M… Pittsbu… <NA>
## 2 NA <NA> <NA> NA 2 Ithaca Mount… Ithaca,… <NA>
## 3 NA <NA> <NA> NA 3 Columbus Rac… Columbu… <NA>
## 4 NA <NA> <NA> NA 4 Detroit Cycl… Detroit… <NA>
## 5 NA <NA> <NA> NA 5 Cincinnati S… Cincinn… <NA>
## 6 NA <NA> <NA> NA 6 Louisville R… Louisvi… <NA>
## 7 NA <NA> <NA> NA 7 Nashville Cr… Nashvil… <NA>
## 8 NA <NA> <NA> NA 8 Denver Bike … Denver,… <NA>
## 9 NA <NA> <NA> NA 9 Minneapolis … Minneap… <NA>
## 10 NA <NA> <NA> NA 10 Kansas City … Kansas … <NA>
## # … with 20 more rows, and 6 more variables: order.id <dbl>,
## # order.line <dbl>, order.date <dttm>, customer.id <dbl>,
## # product.id <dbl>, quantity <dbl>
##
## [[3]]
## # A tibble: 15,644 x 14
## bike.id model description price bikeshop.id bikeshop.name location ...1
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 NA <NA> <NA> NA NA <NA> <NA> 1
## 2 NA <NA> <NA> NA NA <NA> <NA> 2
## 3 NA <NA> <NA> NA NA <NA> <NA> 3
## 4 NA <NA> <NA> NA NA <NA> <NA> 4
## 5 NA <NA> <NA> NA NA <NA> <NA> 5
## 6 NA <NA> <NA> NA NA <NA> <NA> 6
## 7 NA <NA> <NA> NA NA <NA> <NA> 7
## 8 NA <NA> <NA> NA NA <NA> <NA> 8
## 9 NA <NA> <NA> NA NA <NA> <NA> 9
## 10 NA <NA> <NA> NA NA <NA> <NA> 10
## # … with 15,634 more rows, and 6 more variables: order.id <dbl>,
## # order.line <dbl>, order.date <dttm>, customer.id <dbl>,
## # product.id <dbl>, quantity <dbl>
# Mapping Nested List Columns
x <- rep(NA, 5)
x
## [1] NA NA NA NA NA
!is.na(x) %>% all()
## [1] FALSE
y <- c(1:4, NA_real_)
y
## [1] 1 2 3 4 NA
!is.na(y) %>% all()
## [1] TRUE
excel_tbl_nested$data[[3]] %>%
select_if(~ !is.na(.) %>% all())
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48
## 2 2 1 2 2011-01-07 00:00:00 2 52
## 3 3 2 1 2011-01-10 00:00:00 10 76
## 4 4 2 2 2011-01-10 00:00:00 10 52
## 5 5 3 1 2011-01-10 00:00:00 6 2
## 6 6 3 2 2011-01-10 00:00:00 6 50
## 7 7 3 3 2011-01-10 00:00:00 6 1
## 8 8 3 4 2011-01-10 00:00:00 6 4
## 9 9 3 5 2011-01-10 00:00:00 6 34
## 10 10 4 1 2011-01-11 00:00:00 22 26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
excel_tbl_nested
## # A tibble: 3 x 3
## ID path data
## <chr> <fs::path> <list>
## 1 1 data/bike_sales/bikes.xlsx <tibble [97 × 14]>
## 2 2 data/bike_sales/bikeshops.xlsx <tibble [30 × 14]>
## 3 3 data/bike_sales/orderlines.xlsx <tibble [15,644 × 14]>
# Method 1: Creating a function outside of purrr::map()
# Step 1: Create a function that can be mapped to one element
select_non_na_columns <- function(data) {
data %>%
select_if(~ !is.na(.) %>% all())
}
# Step 2: Extract an element, and test the function
excel_tbl_nested$data[[1]] %>%
select_non_na_columns()
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # … with 87 more rows
# Step 3: Use mutate() + map()
excel_tbl_nested_fixed <- excel_tbl_nested %>%
mutate(data_fixed = data %>% map(select_non_na_columns))
excel_tbl_nested_fixed$data[[1]]
## # A tibble: 97 x 14
## bike.id model description price bikeshop.id bikeshop.name location ...1
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 Supe… Road - Eli… 12790 NA <NA> <NA> <NA>
## 2 2 Supe… Road - Eli… 10660 NA <NA> <NA> <NA>
## 3 3 Supe… Road - Eli… 7990 NA <NA> <NA> <NA>
## 4 4 Supe… Road - Eli… 5330 NA <NA> <NA> <NA>
## 5 5 Supe… Road - Eli… 4260 NA <NA> <NA> <NA>
## 6 6 Supe… Road - Eli… 3940 NA <NA> <NA> <NA>
## 7 7 Supe… Road - Eli… 3200 NA <NA> <NA> <NA>
## 8 8 Supe… Road - Eli… 2660 NA <NA> <NA> <NA>
## 9 9 Supe… Road - Eli… 2240 NA <NA> <NA> <NA>
## 10 10 Supe… Road - Eli… 1840 NA <NA> <NA> <NA>
## # … with 87 more rows, and 6 more variables: order.id <dbl>,
## # order.line <dbl>, order.date <dttm>, customer.id <dbl>,
## # product.id <dbl>, quantity <dbl>
# 4.0 MODELING WITH PURRR ----
# 4.1 Time Series Plot ----
# - What if we wanted to approximate the 3 month rolling average with a line?
# - We can use a smoother
# Code comes from 04_functions_iteration/01_functional_programming
rolling_avg_3_tbl <- bike_orderlines_tbl %>%
select(order_date, category_1, category_2, total_price) %>%
mutate(order_date = ymd(order_date)) %>%
mutate(month_end = ceiling_date(order_date, unit = "month") - period(1, unit = "days")) %>%
group_by(category_1, category_2, month_end) %>%
summarise(
total_price = sum(total_price)
) %>%
mutate(rolling_avg_3 = rollmean(total_price, k = 3, na.pad = TRUE, align = "right")) %>%
ungroup() %>%
mutate(category_2 = as_factor(category_2) %>% fct_reorder2(month_end, total_price))
rolling_avg_3_tbl %>%
ggplot(aes(month_end, total_price, color = category_2)) +
# Geometries
geom_point() +
geom_line(aes(y = rolling_avg_3), color = "blue", linetype = 1) +
facet_wrap(~ category_2, scales = "free_y") +
# Add Loess Smoother
geom_smooth(method = "loess", se = FALSE, span = 0.2, color = "black") +
# Formatting
theme_tq() +
scale_color_tq() +
scale_y_continuous(labels = scales::dollar_format(scale = 1e-3, suffix = "K"))
## Warning: Removed 2 rows containing missing values (geom_path).

# 4.2 Modeling Primer ----
# Data Preparation
sales_by_m_cross_country_tbl <- rolling_avg_3_tbl %>%
filter(category_2 == "Cross Country Race") %>%
select(month_end, total_price) %>%
mutate(month_end_num = as.numeric(month_end))
sales_by_m_cross_country_tbl %>%
ggplot(aes(month_end_num, total_price)) +
geom_point() +
geom_smooth(method = "loess", span = 0.2, se = FALSE)

# Making a loess model
?loess
fit_loess_cross_country <- sales_by_m_cross_country_tbl %>%
loess(total_price ~ month_end_num, data = ., span = 0.2)
fit_loess_cross_country
## Call:
## loess(formula = total_price ~ month_end_num, data = ., span = 0.2)
##
## Number of Observations: 60
## Equivalent Number of Parameters: 14.69
## Residual Standard Error: 89070
# Working With Broom
fit_loess_cross_country %>%
broom::augment() %>%
# Visualizing results
ggplot(aes(month_end_num, total_price)) +
geom_point() +
geom_line(aes(y = .fitted), color = "blue")

# 4.3 Step 1: Function To Return Fitted Results ----
rolling_avg_3_tbl_nested <- rolling_avg_3_tbl %>%
group_by(category_1, category_2) %>%
nest()
rolling_avg_3_tbl_nested$data[[1]]
## # A tibble: 60 x 3
## month_end total_price rolling_avg_3
## <date> <dbl> <dbl>
## 1 2011-01-31 143660 NA
## 2 2011-02-28 324400 NA
## 3 2011-03-31 142000 203353.
## 4 2011-04-30 498580 321660
## 5 2011-05-31 220310 286963.
## 6 2011-06-30 364420 361103.
## 7 2011-07-31 307300 297343.
## 8 2011-08-31 110600 260773.
## 9 2011-09-30 191870 203257.
## 10 2011-10-31 196440 166303.
## # … with 50 more rows
data <- rolling_avg_3_tbl_nested$data[[1]]
tidy_loess <- function(data, span = 0.2) {
data_formatted <- data %>%
select(month_end, total_price) %>%
mutate(month_end_num = as.numeric(month_end))
fit_loess <- loess(formula = total_price ~ month_end_num,
data = data_formatted,
span = span)
output_tbl <- fit_loess %>%
broom::augment() %>%
select(.fitted)
return(output_tbl)
}
# 4.4 Step 2: Test Function on Single Element ----
rolling_avg_3_tbl_nested$data[[2]] %>%
tidy_loess()
## # A tibble: 58 x 1
## .fitted
## <dbl>
## 1 39170.
## 2 36927.
## 3 33231.
## 4 28581.
## 5 20731.
## 6 12102.
## 7 10050.
## 8 10024.
## 9 8033.
## 10 6620.
## # … with 48 more rows
# 4.5 Step 3: Map Function to All Categories ----
# Map Functions
loess_tbl_nested <- rolling_avg_3_tbl_nested %>%
mutate(fitted = data %>% map(tidy_loess))
loess_tbl_nested$fitted[[1]]
## # A tibble: 60 x 1
## .fitted
## <dbl>
## 1 176998.
## 2 239802.
## 3 286279.
## 4 311685.
## 5 313621.
## 6 298642.
## 7 261073.
## 8 221223.
## 9 201690.
## 10 187415.
## # … with 50 more rows
loess_tbl_nested %>%
unnest()
## # A tibble: 538 x 6
## category_1 category_2 month_end total_price rolling_avg_3 .fitted
## <chr> <fct> <date> <dbl> <dbl> <dbl>
## 1 Mountain Cross Country R… 2011-01-31 143660 NA 176998.
## 2 Mountain Cross Country R… 2011-02-28 324400 NA 239802.
## 3 Mountain Cross Country R… 2011-03-31 142000 203353. 286279.
## 4 Mountain Cross Country R… 2011-04-30 498580 321660 311685.
## 5 Mountain Cross Country R… 2011-05-31 220310 286963. 313621.
## 6 Mountain Cross Country R… 2011-06-30 364420 361103. 298642.
## 7 Mountain Cross Country R… 2011-07-31 307300 297343. 261073.
## 8 Mountain Cross Country R… 2011-08-31 110600 260773. 221223.
## 9 Mountain Cross Country R… 2011-09-30 191870 203257. 201690.
## 10 Mountain Cross Country R… 2011-10-31 196440 166303. 187415.
## # … with 528 more rows
# Visualize Results
loess_tbl_nested %>%
unnest() %>%
ggplot(aes(month_end, total_price, color = category_2)) +
# Geometries
geom_point() +
geom_line(aes(y = .fitted), color = "blue", size = 2) +
geom_smooth(method = "loess", span = 0.2) +
facet_wrap(~ category_2, scales = "free_y")
