1. Install packages

install.packages(“readxl”) install.packages(“tidyverse”) # Only run once library(tidyverse)

library(readxl) library(writexl)

2. Data import

./, ../,

bikes_tbl <- read_excel(“./bikes.xlsx”) # fast key: alt+- bikeshops_tbl <- read_excel(“./bikeshops.xlsx”) orderlines_tbl <- read_excel(“./orderlines.xlsx”)

Examine data:

bikes_tbl head(bikes_tbl)

Import csv file:

bike_orderlines_tbl <- read_csv(“./bike_orderlines.csv”)

Joining data:

orderlines_bikes_tbl <- left_join(orderlines_tbl, bikes_tbl, by = c(“product.id” = “bike.id”))

bike_orderlines_bikeshops_joined <- left_join(orderlines_bikes_tbl, bikeshops_tbl, by = c(‘customer.id’ = ‘bikeshop.id’))

%>% is called pipe: fast key: ctl + shift + m

bike_orderlines_bikeshops_joined <- left_join(orderlines_tbl, bikes_tbl, by = c(“product.id” = “bike.id”)) %>% left_join(bikeshops_tbl, by = c(“customer.id” = “bikeshop.id”))

Wrangling data: decompose description into three columns: category.1, category.2 and frame.material

bike_orderlines_wrangled_tbl <- bike_orderlines_bikeshops_joined %>% separate(description, into = c(‘category.1’, ‘category.2’, ‘frame.material’), sep = ’ - ‘) %>% separate(location, into = c(’city’, ‘state’), sep = ‘,’, remove = FALSE) %>% # create calculated columns mutate(total.price = price * quantity) %>% # Reorganize columns
select(-…1, -location) %>% # Reorder columns
select(contains(‘date’), contains(‘id’), contains(‘order’), quantity, price, total.price, everything()) %>% # Rename columns rename(order_date = order.date) %>% set_names(names(.) %>% str_replace_all(“\.”, “_“)) # save the file as RDS saveRDS(bike_orderlines_wrangled_tbl, ‘./bike_orderlines.rds’)

dplyr and tidyr

pull() vs. select()

bike_orderlines_wrangled_tbl %>% # select(total_price) pull(total_price) %>% mean() # select_if bike_orderlines_wrangled_tbl %>% # select_if(is.character) select_if(is.numeric)

arrange() and desc()

bikes_tbl %>% select(model, price) %>% arrange(desc(price))

filter()

bikes_tbl %>% select(model, price) %>% filter(price > mean(price))

bikes_tbl %>% select(model, price) %>% filter((price > 5000) & (price < 10000)) %>%
arrange(desc(price))

bikes_tbl %>% select(model, price) %>% filter(price > 6000, model %>% str_detect(“Supersix”))

Filtering one or more conditions using == and %in%

bike_orderlines_wrangled_tbl %>% filter(category_2 %in% c(“Over Mountain”, “Trail”, “Endurance Road”)) %>% View()

slice()

bikes_tbl %>% arrange(desc(price)) %>% # slice(1:5) slice((nrow(.)-4):nrow(.))

distinct(): extract unique values from data

bike_orderlines_wrangled_tbl %>% distinct(category_1, category_2) %>% View()

mutate(): add new columns in our data

bike_orderlines_wrangled_tbl %>% mutate(total_price_log = log(total_price)) %>% mutate(total_price_sqrt = total_price^0.5) %>% View()

Binning with ntile()

bike_orderlines_wrangled_tbl %>% mutate(total_price_binned = ntile(total_price, 3)) %>% View()

case_when(): provide flexible conditions for grouping (binning)

bike_orderlines_wrangled_tbl %>% mutate(total_price_binned = ntile(total_price, 3)) %>% mutate(total_price_binned2 = case_when( total_price > quantile(total_price, 0.75) ~ “High”, total_price > quantile(total_price, 0.25) ~ “Medium”, TRUE ~ “Low” )) %>% View()

Grouping and summarizing with group_by() and summarize()

bike_orderlines_wrangled_tbl %>% summarise(revenue = sum(total_price))

bike_orderlines_wrangled_tbl %>% group_by(category_1) %>% summarise(revenue = sum(total_price)) %>% ungroup() %>% arrange(desc(revenue))

bike_orderlines_wrangled_tbl %>%
group_by(category_1, category_2, frame_material) %>% summarise(revenue = sum(total_price)) %>% ungroup() %>% arrange(desc(revenue))

summarize_all()

Q1: What are the unique categories of products?

bike_orderlines_wrangled_tbl %>% distinct(category_1) bike_orderlines_wrangled_tbl %>% distinct(category_2) bike_orderlines_wrangled_tbl %>% distinct(frame_material)

Q2: Which product categories have the largest sales?

category_1

bike_orderlines_wrangled_tbl %>% select(category_1, total_price) %>% group_by(category_1) %>% summarise(sales = sum(total_price)) %>% ungroup() %>% rename(Primary Category = category_1, Sales = sales) %>% # format dollars mutate(Sales1 = Sales %>% scales::dollar())