library(readxl)
library(tidyverse)
## ── Attaching packages ───────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.1     ✔ dplyr   0.8.1
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidyquant)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(lubridate)
library(broom)

bike_orderlines_tbl <- read_rds("data_wrangled_student/bike_orderlines.rds")

glimpse(bike_orderlines_tbl)
## Observations: 15,644
## Variables: 13
## $ order_date     <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, …
## $ order_id       <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6…
## $ order_line     <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3…
## $ quantity       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1…
## $ price          <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330,…
## $ total_price    <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330,…
## $ model          <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast of …
## $ category_1     <chr> "Mountain", "Mountain", "Mountain", "Mountain", "…
## $ category_2     <chr> "Over Mountain", "Over Mountain", "Trail", "Over …
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carbon…
## $ bikeshop_name  <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Clim…
## $ city           <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City",…
## $ state          <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY", "…
# 1.0 PRIMER ON PURRR ----
# Programmatically getting Excel files into R
excel_paths_tbl <- fs::dir_info("data/bike_sales")

paths_chr <- excel_paths_tbl %>%
    pull(path)


# What Not To Do: Don't use for loops
excel_list <- list()
for (path in paths_chr) {
    excel_list[[path]] <- read_excel(path)
}
## New names:
## * `` -> ...1
excel_list
## $`data/bike_sales/bikes.xlsx`
## # A tibble: 97 x 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # … with 87 more rows
## 
## $`data/bike_sales/bikeshops.xlsx`
## # A tibble: 30 x 3
##    bikeshop.id bikeshop.name                location       
##          <dbl> <chr>                        <chr>          
##  1           1 Pittsburgh Mountain Machines Pittsburgh, PA 
##  2           2 Ithaca Mountain Climbers     Ithaca, NY     
##  3           3 Columbus Race Equipment      Columbus, OH   
##  4           4 Detroit Cycles               Detroit, MI    
##  5           5 Cincinnati Speed             Cincinnati, OH 
##  6           6 Louisville Race Equipment    Louisville, KY 
##  7           7 Nashville Cruisers           Nashville, TN  
##  8           8 Denver Bike Shop             Denver, CO     
##  9           9 Minneapolis Bike Shop        Minneapolis, MN
## 10          10 Kansas City 29ers            Kansas City, KS
## # … with 20 more rows
## 
## $`data/bike_sales/orderlines.xlsx`
## # A tibble: 15,644 x 7
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
# What to Do: Use map()
?map

# Method 1: function name
excel_list_2 <- paths_chr %>%
    map(read_excel) %>%
    set_names(paths_chr)
## New names:
## * `` -> ...1
# Method 2: Anonymous Function
paths_chr %>%
    map(~ read_excel(.))
## New names:
## * `` -> ...1
## [[1]]
## # A tibble: 97 x 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # … with 87 more rows
## 
## [[2]]
## # A tibble: 30 x 3
##    bikeshop.id bikeshop.name                location       
##          <dbl> <chr>                        <chr>          
##  1           1 Pittsburgh Mountain Machines Pittsburgh, PA 
##  2           2 Ithaca Mountain Climbers     Ithaca, NY     
##  3           3 Columbus Race Equipment      Columbus, OH   
##  4           4 Detroit Cycles               Detroit, MI    
##  5           5 Cincinnati Speed             Cincinnati, OH 
##  6           6 Louisville Race Equipment    Louisville, KY 
##  7           7 Nashville Cruisers           Nashville, TN  
##  8           8 Denver Bike Shop             Denver, CO     
##  9           9 Minneapolis Bike Shop        Minneapolis, MN
## 10          10 Kansas City 29ers            Kansas City, KS
## # … with 20 more rows
## 
## [[3]]
## # A tibble: 15,644 x 7
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
# Method 3: Function specified with function()
paths_chr %>%
    map(function(x) read_excel(path = x))
## New names:
## * `` -> ...1
## [[1]]
## # A tibble: 97 x 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # … with 87 more rows
## 
## [[2]]
## # A tibble: 30 x 3
##    bikeshop.id bikeshop.name                location       
##          <dbl> <chr>                        <chr>          
##  1           1 Pittsburgh Mountain Machines Pittsburgh, PA 
##  2           2 Ithaca Mountain Climbers     Ithaca, NY     
##  3           3 Columbus Race Equipment      Columbus, OH   
##  4           4 Detroit Cycles               Detroit, MI    
##  5           5 Cincinnati Speed             Cincinnati, OH 
##  6           6 Louisville Race Equipment    Louisville, KY 
##  7           7 Nashville Cruisers           Nashville, TN  
##  8           8 Denver Bike Shop             Denver, CO     
##  9           9 Minneapolis Bike Shop        Minneapolis, MN
## 10          10 Kansas City 29ers            Kansas City, KS
## # … with 20 more rows
## 
## [[3]]
## # A tibble: 15,644 x 7
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
# Reading Excel Sheets

excel_sheets("data/bike_sales/bikes.xlsx") %>%
    map(~ read_excel(path = "data/bike_sales/bikes.xlsx", sheet = .))
## [[1]]
## # A tibble: 97 x 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # … with 87 more rows
# 2.0 MAPPING DATA FRAMES ----

# 2.1 Column-wise Map ----

bike_orderlines_tbl %>% is.list()
## [1] TRUE
bike_orderlines_tbl %>%
    map(~ class(.))
## $order_date
## [1] "POSIXct" "POSIXt" 
## 
## $order_id
## [1] "numeric"
## 
## $order_line
## [1] "numeric"
## 
## $quantity
## [1] "numeric"
## 
## $price
## [1] "numeric"
## 
## $total_price
## [1] "numeric"
## 
## $model
## [1] "character"
## 
## $category_1
## [1] "character"
## 
## $category_2
## [1] "character"
## 
## $frame_material
## [1] "character"
## 
## $bikeshop_name
## [1] "character"
## 
## $city
## [1] "character"
## 
## $state
## [1] "character"
# 2.2 Map Variants ----

?map

# Character map
bike_orderlines_tbl %>%
    map_chr(~ class(.)[1])
##     order_date       order_id     order_line       quantity          price 
##      "POSIXct"      "numeric"      "numeric"      "numeric"      "numeric" 
##    total_price          model     category_1     category_2 frame_material 
##      "numeric"    "character"    "character"    "character"    "character" 
##  bikeshop_name           city          state 
##    "character"    "character"    "character"
# Data Frame map
bike_orderlines_tbl %>%
    map_df(~ class(.)[1]) %>%
    gather()
## # A tibble: 13 x 2
##    key            value    
##    <chr>          <chr>    
##  1 order_date     POSIXct  
##  2 order_id       numeric  
##  3 order_line     numeric  
##  4 quantity       numeric  
##  5 price          numeric  
##  6 total_price    numeric  
##  7 model          character
##  8 category_1     character
##  9 category_2     character
## 10 frame_material character
## 11 bikeshop_name  character
## 12 city           character
## 13 state          character
bike_orderlines_tbl %>%
    map_df(~ sum(is.na(.)) / length(.)) %>%
    gather()
## # A tibble: 13 x 2
##    key            value
##    <chr>          <dbl>
##  1 order_date         0
##  2 order_id           0
##  3 order_line         0
##  4 quantity           0
##  5 price              0
##  6 total_price        0
##  7 model              0
##  8 category_1         0
##  9 category_2         0
## 10 frame_material     0
## 11 bikeshop_name      0
## 12 city               0
## 13 state              0
# 2.3 Row-wise Map ----

excel_tbl <- excel_paths_tbl %>%
    select(path) %>%
    mutate(data = path %>% map(read_excel))
## New names:
## * `` -> ...1
excel_list
## $`data/bike_sales/bikes.xlsx`
## # A tibble: 97 x 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # … with 87 more rows
## 
## $`data/bike_sales/bikeshops.xlsx`
## # A tibble: 30 x 3
##    bikeshop.id bikeshop.name                location       
##          <dbl> <chr>                        <chr>          
##  1           1 Pittsburgh Mountain Machines Pittsburgh, PA 
##  2           2 Ithaca Mountain Climbers     Ithaca, NY     
##  3           3 Columbus Race Equipment      Columbus, OH   
##  4           4 Detroit Cycles               Detroit, MI    
##  5           5 Cincinnati Speed             Cincinnati, OH 
##  6           6 Louisville Race Equipment    Louisville, KY 
##  7           7 Nashville Cruisers           Nashville, TN  
##  8           8 Denver Bike Shop             Denver, CO     
##  9           9 Minneapolis Bike Shop        Minneapolis, MN
## 10          10 Kansas City 29ers            Kansas City, KS
## # … with 20 more rows
## 
## $`data/bike_sales/orderlines.xlsx`
## # A tibble: 15,644 x 7
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
excel_tbl
## # A tibble: 3 x 2
##   path                            data                 
##   <fs::path>                      <list>               
## 1 data/bike_sales/bikes.xlsx      <tibble [97 × 4]>    
## 2 data/bike_sales/bikeshops.xlsx  <tibble [30 × 3]>    
## 3 data/bike_sales/orderlines.xlsx <tibble [15,644 × 7]>
# 3.0 NESTED DATA ----

# Unnest
excel_tbl
## # A tibble: 3 x 2
##   path                            data                 
##   <fs::path>                      <list>               
## 1 data/bike_sales/bikes.xlsx      <tibble [97 × 4]>    
## 2 data/bike_sales/bikeshops.xlsx  <tibble [30 × 3]>    
## 3 data/bike_sales/orderlines.xlsx <tibble [15,644 × 7]>
excel_tbl$data
## [[1]]
## # A tibble: 97 x 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # … with 87 more rows
## 
## [[2]]
## # A tibble: 30 x 3
##    bikeshop.id bikeshop.name                location       
##          <dbl> <chr>                        <chr>          
##  1           1 Pittsburgh Mountain Machines Pittsburgh, PA 
##  2           2 Ithaca Mountain Climbers     Ithaca, NY     
##  3           3 Columbus Race Equipment      Columbus, OH   
##  4           4 Detroit Cycles               Detroit, MI    
##  5           5 Cincinnati Speed             Cincinnati, OH 
##  6           6 Louisville Race Equipment    Louisville, KY 
##  7           7 Nashville Cruisers           Nashville, TN  
##  8           8 Denver Bike Shop             Denver, CO     
##  9           9 Minneapolis Bike Shop        Minneapolis, MN
## 10          10 Kansas City 29ers            Kansas City, KS
## # … with 20 more rows
## 
## [[3]]
## # A tibble: 15,644 x 7
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
excel_tbl$data[[3]]
## # A tibble: 15,644 x 7
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
excel_tbl_unnested <- excel_tbl %>%
    unnest(data, .id = "ID") 

excel_tbl_unnested
## # A tibble: 15,771 x 16
##    path       ID    bike.id model description price bikeshop.id
##    <fs::path> <chr>   <dbl> <chr> <chr>       <dbl>       <dbl>
##  1 data/bike… 1           1 Supe… Road - Eli… 12790          NA
##  2 data/bike… 1           2 Supe… Road - Eli… 10660          NA
##  3 data/bike… 1           3 Supe… Road - Eli…  7990          NA
##  4 data/bike… 1           4 Supe… Road - Eli…  5330          NA
##  5 data/bike… 1           5 Supe… Road - Eli…  4260          NA
##  6 data/bike… 1           6 Supe… Road - Eli…  3940          NA
##  7 data/bike… 1           7 Supe… Road - Eli…  3200          NA
##  8 data/bike… 1           8 Supe… Road - Eli…  2660          NA
##  9 data/bike… 1           9 Supe… Road - Eli…  2240          NA
## 10 data/bike… 1          10 Supe… Road - Eli…  1840          NA
## # … with 15,761 more rows, and 9 more variables: bikeshop.name <chr>,
## #   location <chr>, ...1 <chr>, order.id <dbl>, order.line <dbl>,
## #   order.date <dttm>, customer.id <dbl>, product.id <dbl>, quantity <dbl>
# Nest

excel_tbl_nested <- excel_tbl_unnested %>%
    group_by(ID, path) %>%
    nest()

excel_tbl_nested$data
## [[1]]
## # A tibble: 97 x 14
##    bike.id model description price bikeshop.id bikeshop.name location ...1 
##      <dbl> <chr> <chr>       <dbl>       <dbl> <chr>         <chr>    <chr>
##  1       1 Supe… Road - Eli… 12790          NA <NA>          <NA>     <NA> 
##  2       2 Supe… Road - Eli… 10660          NA <NA>          <NA>     <NA> 
##  3       3 Supe… Road - Eli…  7990          NA <NA>          <NA>     <NA> 
##  4       4 Supe… Road - Eli…  5330          NA <NA>          <NA>     <NA> 
##  5       5 Supe… Road - Eli…  4260          NA <NA>          <NA>     <NA> 
##  6       6 Supe… Road - Eli…  3940          NA <NA>          <NA>     <NA> 
##  7       7 Supe… Road - Eli…  3200          NA <NA>          <NA>     <NA> 
##  8       8 Supe… Road - Eli…  2660          NA <NA>          <NA>     <NA> 
##  9       9 Supe… Road - Eli…  2240          NA <NA>          <NA>     <NA> 
## 10      10 Supe… Road - Eli…  1840          NA <NA>          <NA>     <NA> 
## # … with 87 more rows, and 6 more variables: order.id <dbl>,
## #   order.line <dbl>, order.date <dttm>, customer.id <dbl>,
## #   product.id <dbl>, quantity <dbl>
## 
## [[2]]
## # A tibble: 30 x 14
##    bike.id model description price bikeshop.id bikeshop.name location ...1 
##      <dbl> <chr> <chr>       <dbl>       <dbl> <chr>         <chr>    <chr>
##  1      NA <NA>  <NA>           NA           1 Pittsburgh M… Pittsbu… <NA> 
##  2      NA <NA>  <NA>           NA           2 Ithaca Mount… Ithaca,… <NA> 
##  3      NA <NA>  <NA>           NA           3 Columbus Rac… Columbu… <NA> 
##  4      NA <NA>  <NA>           NA           4 Detroit Cycl… Detroit… <NA> 
##  5      NA <NA>  <NA>           NA           5 Cincinnati S… Cincinn… <NA> 
##  6      NA <NA>  <NA>           NA           6 Louisville R… Louisvi… <NA> 
##  7      NA <NA>  <NA>           NA           7 Nashville Cr… Nashvil… <NA> 
##  8      NA <NA>  <NA>           NA           8 Denver Bike … Denver,… <NA> 
##  9      NA <NA>  <NA>           NA           9 Minneapolis … Minneap… <NA> 
## 10      NA <NA>  <NA>           NA          10 Kansas City … Kansas … <NA> 
## # … with 20 more rows, and 6 more variables: order.id <dbl>,
## #   order.line <dbl>, order.date <dttm>, customer.id <dbl>,
## #   product.id <dbl>, quantity <dbl>
## 
## [[3]]
## # A tibble: 15,644 x 14
##    bike.id model description price bikeshop.id bikeshop.name location ...1 
##      <dbl> <chr> <chr>       <dbl>       <dbl> <chr>         <chr>    <chr>
##  1      NA <NA>  <NA>           NA          NA <NA>          <NA>     1    
##  2      NA <NA>  <NA>           NA          NA <NA>          <NA>     2    
##  3      NA <NA>  <NA>           NA          NA <NA>          <NA>     3    
##  4      NA <NA>  <NA>           NA          NA <NA>          <NA>     4    
##  5      NA <NA>  <NA>           NA          NA <NA>          <NA>     5    
##  6      NA <NA>  <NA>           NA          NA <NA>          <NA>     6    
##  7      NA <NA>  <NA>           NA          NA <NA>          <NA>     7    
##  8      NA <NA>  <NA>           NA          NA <NA>          <NA>     8    
##  9      NA <NA>  <NA>           NA          NA <NA>          <NA>     9    
## 10      NA <NA>  <NA>           NA          NA <NA>          <NA>     10   
## # … with 15,634 more rows, and 6 more variables: order.id <dbl>,
## #   order.line <dbl>, order.date <dttm>, customer.id <dbl>,
## #   product.id <dbl>, quantity <dbl>
# Mapping Nested List Columns

x <- rep(NA, 5)
x
## [1] NA NA NA NA NA
!is.na(x) %>% all()
## [1] FALSE
y <- c(1:4, NA_real_)
y
## [1]  1  2  3  4 NA
!is.na(y) %>% all()
## [1] TRUE
excel_tbl_nested$data[[3]] %>%
    select_if(~ !is.na(.) %>% all())
## # A tibble: 15,644 x 7
##    ...1  order.id order.line order.date          customer.id product.id
##    <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>
##  1 1            1          1 2011-01-07 00:00:00           2         48
##  2 2            1          2 2011-01-07 00:00:00           2         52
##  3 3            2          1 2011-01-10 00:00:00          10         76
##  4 4            2          2 2011-01-10 00:00:00          10         52
##  5 5            3          1 2011-01-10 00:00:00           6          2
##  6 6            3          2 2011-01-10 00:00:00           6         50
##  7 7            3          3 2011-01-10 00:00:00           6          1
##  8 8            3          4 2011-01-10 00:00:00           6          4
##  9 9            3          5 2011-01-10 00:00:00           6         34
## 10 10           4          1 2011-01-11 00:00:00          22         26
## # … with 15,634 more rows, and 1 more variable: quantity <dbl>
excel_tbl_nested
## # A tibble: 3 x 3
##   ID    path                            data                  
##   <chr> <fs::path>                      <list>                
## 1 1     data/bike_sales/bikes.xlsx      <tibble [97 × 14]>    
## 2 2     data/bike_sales/bikeshops.xlsx  <tibble [30 × 14]>    
## 3 3     data/bike_sales/orderlines.xlsx <tibble [15,644 × 14]>
# Method 1: Creating a function outside of purrr::map()

# Step 1: Create a function that can be mapped to one element 
select_non_na_columns <- function(data) {
    
    data %>%
        select_if(~ !is.na(.) %>% all())
    
}

# Step 2: Extract an element, and test the function
excel_tbl_nested$data[[1]] %>%
    select_non_na_columns()
## # A tibble: 97 x 4
##    bike.id model                          description                price
##      <dbl> <chr>                          <chr>                      <dbl>
##  1       1 Supersix Evo Black Inc.        Road - Elite Road - Carbon 12790
##  2       2 Supersix Evo Hi-Mod Team       Road - Elite Road - Carbon 10660
##  3       3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon  7990
##  4       4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon  5330
##  5       5 Supersix Evo Hi-Mod Utegra     Road - Elite Road - Carbon  4260
##  6       6 Supersix Evo Red               Road - Elite Road - Carbon  3940
##  7       7 Supersix Evo Ultegra 3         Road - Elite Road - Carbon  3200
##  8       8 Supersix Evo Ultegra 4         Road - Elite Road - Carbon  2660
##  9       9 Supersix Evo 105               Road - Elite Road - Carbon  2240
## 10      10 Supersix Evo Tiagra            Road - Elite Road - Carbon  1840
## # … with 87 more rows
# Step 3: Use mutate() + map()
excel_tbl_nested_fixed <- excel_tbl_nested %>%
    mutate(data_fixed = data %>% map(select_non_na_columns))

excel_tbl_nested_fixed$data[[1]]
## # A tibble: 97 x 14
##    bike.id model description price bikeshop.id bikeshop.name location ...1 
##      <dbl> <chr> <chr>       <dbl>       <dbl> <chr>         <chr>    <chr>
##  1       1 Supe… Road - Eli… 12790          NA <NA>          <NA>     <NA> 
##  2       2 Supe… Road - Eli… 10660          NA <NA>          <NA>     <NA> 
##  3       3 Supe… Road - Eli…  7990          NA <NA>          <NA>     <NA> 
##  4       4 Supe… Road - Eli…  5330          NA <NA>          <NA>     <NA> 
##  5       5 Supe… Road - Eli…  4260          NA <NA>          <NA>     <NA> 
##  6       6 Supe… Road - Eli…  3940          NA <NA>          <NA>     <NA> 
##  7       7 Supe… Road - Eli…  3200          NA <NA>          <NA>     <NA> 
##  8       8 Supe… Road - Eli…  2660          NA <NA>          <NA>     <NA> 
##  9       9 Supe… Road - Eli…  2240          NA <NA>          <NA>     <NA> 
## 10      10 Supe… Road - Eli…  1840          NA <NA>          <NA>     <NA> 
## # … with 87 more rows, and 6 more variables: order.id <dbl>,
## #   order.line <dbl>, order.date <dttm>, customer.id <dbl>,
## #   product.id <dbl>, quantity <dbl>
# 4.0 MODELING WITH PURRR ----

# 4.1 Time Series Plot ----
#  - What if we wanted to approximate the 3 month rolling average with a line?
#  - We can use a smoother

# Code comes from 04_functions_iteration/01_functional_programming
rolling_avg_3_tbl <- bike_orderlines_tbl %>%
    select(order_date, category_1, category_2, total_price) %>%
    
    mutate(order_date = ymd(order_date)) %>%
    mutate(month_end = ceiling_date(order_date, unit = "month") - period(1, unit = "days")) %>%
    
    group_by(category_1, category_2, month_end) %>%
    summarise(
        total_price = sum(total_price)
    ) %>%
    mutate(rolling_avg_3 = rollmean(total_price, k = 3, na.pad = TRUE, align = "right")) %>%
    ungroup() %>%
    
    mutate(category_2 = as_factor(category_2) %>% fct_reorder2(month_end, total_price)) 

rolling_avg_3_tbl %>%
    
    ggplot(aes(month_end, total_price, color = category_2)) +
    
    # Geometries
    geom_point() +
    geom_line(aes(y = rolling_avg_3), color = "blue", linetype = 1) +
    facet_wrap(~ category_2, scales = "free_y") +
    
    # Add Loess Smoother
    geom_smooth(method = "loess", se = FALSE, span = 0.2, color = "black") +
    
    # Formatting
    theme_tq() +
    scale_color_tq() +
    scale_y_continuous(labels = scales::dollar_format(scale = 1e-3, suffix = "K"))
## Warning: Removed 2 rows containing missing values (geom_path).

# 4.2 Modeling Primer ----

# Data Preparation

sales_by_m_cross_country_tbl <- rolling_avg_3_tbl %>%
    
    filter(category_2 == "Cross Country Race") %>%
    
    select(month_end, total_price) %>%
    
    mutate(month_end_num = as.numeric(month_end))


sales_by_m_cross_country_tbl %>%
    
    ggplot(aes(month_end_num, total_price)) +
    geom_point() +
    geom_smooth(method = "loess", span = 0.2, se = FALSE)

# Making a loess model
?loess

fit_loess_cross_country <- sales_by_m_cross_country_tbl %>%
    
    loess(total_price ~ month_end_num, data = ., span = 0.2)

fit_loess_cross_country
## Call:
## loess(formula = total_price ~ month_end_num, data = ., span = 0.2)
## 
## Number of Observations: 60 
## Equivalent Number of Parameters: 14.69 
## Residual Standard Error: 89070
# Working With Broom

fit_loess_cross_country %>%
    broom::augment() %>%
    
    
    # Visualizing results
    ggplot(aes(month_end_num, total_price)) +
    geom_point() +
    geom_line(aes(y = .fitted), color = "blue")

# 4.3 Step 1: Function To Return Fitted Results ----

rolling_avg_3_tbl_nested <- rolling_avg_3_tbl %>%
    group_by(category_1, category_2) %>%
    nest()

rolling_avg_3_tbl_nested$data[[1]]
## # A tibble: 60 x 3
##    month_end  total_price rolling_avg_3
##    <date>           <dbl>         <dbl>
##  1 2011-01-31      143660           NA 
##  2 2011-02-28      324400           NA 
##  3 2011-03-31      142000       203353.
##  4 2011-04-30      498580       321660 
##  5 2011-05-31      220310       286963.
##  6 2011-06-30      364420       361103.
##  7 2011-07-31      307300       297343.
##  8 2011-08-31      110600       260773.
##  9 2011-09-30      191870       203257.
## 10 2011-10-31      196440       166303.
## # … with 50 more rows
data <- rolling_avg_3_tbl_nested$data[[1]]

tidy_loess <- function(data, span = 0.2) {
    
    data_formatted <- data %>%
        select(month_end, total_price) %>%
        mutate(month_end_num = as.numeric(month_end))
    
    fit_loess <- loess(formula = total_price ~ month_end_num, 
                       data    = data_formatted, 
                       span    = span)
    
    output_tbl <- fit_loess %>%
        broom::augment() %>%
        select(.fitted)
    
    return(output_tbl)
    
}


# 4.4 Step 2: Test Function on Single Element ----
rolling_avg_3_tbl_nested$data[[2]] %>%
    tidy_loess()
## # A tibble: 58 x 1
##    .fitted
##      <dbl>
##  1  39170.
##  2  36927.
##  3  33231.
##  4  28581.
##  5  20731.
##  6  12102.
##  7  10050.
##  8  10024.
##  9   8033.
## 10   6620.
## # … with 48 more rows
# 4.5 Step 3: Map Function to All Categories ----

# Map Functions

loess_tbl_nested <- rolling_avg_3_tbl_nested %>%
    mutate(fitted = data %>% map(tidy_loess))

loess_tbl_nested$fitted[[1]]
## # A tibble: 60 x 1
##    .fitted
##      <dbl>
##  1 176998.
##  2 239802.
##  3 286279.
##  4 311685.
##  5 313621.
##  6 298642.
##  7 261073.
##  8 221223.
##  9 201690.
## 10 187415.
## # … with 50 more rows
loess_tbl_nested %>%
    unnest()
## # A tibble: 538 x 6
##    category_1 category_2       month_end  total_price rolling_avg_3 .fitted
##    <chr>      <fct>            <date>           <dbl>         <dbl>   <dbl>
##  1 Mountain   Cross Country R… 2011-01-31      143660           NA  176998.
##  2 Mountain   Cross Country R… 2011-02-28      324400           NA  239802.
##  3 Mountain   Cross Country R… 2011-03-31      142000       203353. 286279.
##  4 Mountain   Cross Country R… 2011-04-30      498580       321660  311685.
##  5 Mountain   Cross Country R… 2011-05-31      220310       286963. 313621.
##  6 Mountain   Cross Country R… 2011-06-30      364420       361103. 298642.
##  7 Mountain   Cross Country R… 2011-07-31      307300       297343. 261073.
##  8 Mountain   Cross Country R… 2011-08-31      110600       260773. 221223.
##  9 Mountain   Cross Country R… 2011-09-30      191870       203257. 201690.
## 10 Mountain   Cross Country R… 2011-10-31      196440       166303. 187415.
## # … with 528 more rows
# Visualize Results

loess_tbl_nested %>%
    
    unnest() %>%
    
    ggplot(aes(month_end, total_price, color = category_2)) +
    
    # Geometries
    geom_point() +
    geom_line(aes(y = .fitted), color = "blue", size = 2) +
    geom_smooth(method = "loess", span = 0.2) +
    facet_wrap(~ category_2, scales = "free_y")