R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

1. Load up libraries —-

library(readxl) # Load up libraries:pacman if (!require(“pacman”)) install.packages(“pacman”)

library(tidyverse) # library(tidyquant) library(readxl) # library(writexl)

2. Import data —-

readxl::read_excel() ?read_excel

path1 = “./bikes.xlsx”

bikes_tbl <- read_excel(path1) bikeshop_tbl <- read_excel(path = “./bikeshops.xlsx”) orderlines_tbl <- read_excel(path = “./orderlines.xlsx”) bike_orderlines_tbl <- read_excel(path = “./bike_orderlines.xlsx”)

3. Examining data —-

bikes_tbl str(bikes_tbl) glimpse(bikes_tbl)

4. Introductio to dplyr —-

select() —-

bike_orderlines_tbl %>% select(starts_with(“order_”))

Reduce columns, rearrange columns

bike_orderlines_tbl %>% select(bikeshop_name:state, everything())

pull() —-

bike_orderlines_tbl %>% pull(total_price) %>% mean() ## select_if() —- bike_orderlines_tbl %>% select_if(is.numeric)

arrange() —-

bikes_tbl %>% select(model, price) %>% arrange(desc(price))

filter() —-

bikes_tbl %>% select(model, price) %>% filter(price > mean(price))

bikes_tbl %>% select(model, price) %>% filter((price > 5000) | (price < 1000)) %>% arrange(desc(price)) %>% View()

bikes_tbl %>% select(model, price) %>% filter((price > 6000), model %>% str_detect(“Supersix”)) %>% View()

Adding Flag

bike_orderlines_tbl %>% filter(category_2 == “Over Mountain”) %>% View()

slice() —-

bikes_tbl %>% arrange(price) %>% slice(1:5) %>% View()

distinct() —-

bike_orderlines_tbl %>% distinct(category_1, category_2) %>% View()

mutate() —-

bike_orderlines_tbl %>% select(order_date, model, quantity, price) %>% mutate(total_price = quantity * price) %>% View()

ntile() —-

bike_orderlines_tbl %>% mutate(total_price_binned = ntile(total_price, 3)) %>% View() ## case_when() —- ### Numeric to categorical bike_orderlines_tbl %>% mutate(total_price_binned = ntile(total_price, 3)) %>% mutate(total_price_binned2 = case_when( total_price > quantile(total_price, 0.75) ~ “High”, total_price > quantile(total_price, 0.25) ~ “Medium”, TRUE ~ “Low” )) %>% View()

Text to Categorical

left_join() —-

?left_join bike_order <- left_join(orderlines_tbl, bikes_tbl, by = c(“product.id” = “bike.id”))

Week 4 —-

Here we use separate() to help us to create the tibble data with the same

format as the bike_orderlines_tbl.

separate() —-

bike_order_1 <- bike_order %>% separate(description, into = c(“category 1”, “category 2”, “frame_material”), sep = ” - “) %>% left_join(bikeshop_tbl, by = c(”customer.id” = “bikeshop.id”)) %>% separate(location, into = c(“city”, “state”), sep = “,”) %>% mutate(total_price = quantity * price) %>% select(-1) %>% select(order.date, order.id, order.line, everything()) %>% select(1:3, quantity, price, total_price, everything()) %>% select(1:6, model:state)

group_by() and summarize() —-

Basics

bike_orderlines_tbl %>% summarise( revenue = sum(total_price) )

bike_orderlines_tbl %>% group_by(category_1, category_2, frame_material) %>% summarise( revenue = sum(total_price) ) %>% ungroup() %>% arrange(desc(revenue))

bike_orderlines_tbl %>% group_by(category_1, category_2) %>% summarise( count = n(), avg = mean(total_price), sd = sd(total_price) )

summarize_all() —-

bike_orderlines_missing <- bike_orderlines_tbl %>% mutate(total_price = c(rep(NA, 4), total_price[5:nrow(.)] ))

bike_orderlines_missing %>% summarize_all(~ sum(is.na(.)) / length(.))

rename() and set_names() —-

rename()

bike_order_1 %>% rename(order_date = order.date) %>% rename(Bikeshop name = bikeshop.name) %>% View() # set_names(): change all column names at once bike_order_1 %>% set_names(names(.) %>% str_replace(“_“,”.”)) %>% View()

Reshaping (Pivoting) Data with spread() and gather() —-

bikeshop_revenue_tbl <- bike_orderlines_tbl %>% select(bikeshop_name, category_1, total_price) %>% group_by(bikeshop_name, category_1) %>% summarise(sales = sum(total_price)) %>% ungroup() %>% arrange(desc(sales))

spread() —-

Long to wide

bikeshop_revenue_formatted <- bikeshop_revenue_tbl %>% spread(key = category_1, value = sales)

gather() —-

Wide to long

bikeshop_revenue_formatted %>% gather(key = “category_1”, value = “sales”, Mountain, Road)

Binding Data by Row or by Column with bind_rows() and bind_col() —-

bind_cols() —-

bike_orderlines_tbl %>% select(-contains(“order”)) %>% bind_cols(bike_orderlines_tbl %>% select(order_id)) %>% View()

bind_rows() —-

Separate & Unite —-

NA values —-

example of NA

create data frame

df <- data.frame(points=c(10, NA, 15, 15, 14, 16), assists=c(4, NA, 4, NA, 9, 3), rebounds=c(NA, 5, 10, 7, 7, NA))

view data frame

df

drop rows with missing values in any column

df %>% drop_na()

drop rows with missing values in rebounds column

df %>% drop_na(rebounds)

df %>% replace_na(list(rebounds = 0)) %>% str() df %>% replace_na(list(rebounds = 0L)) %>% str()

Week 5&6 Intro to ggplot2 —-

library(lubridate)

1.1 Date and Datetime

order_date_tbl <- bike_orderlines_tbl %>% select(order_date)

order_date_tbl %>% pull(order_date) %>% class()

1.2 Date classes

order_date_tbl %>% mutate(order_date_chr = as.character(order_date)) %>% mutate(order_date_chr2 = order_date_chr %>% str_c(” 00:00:00”)) %>% mutate(order_date_date = order_date_chr %>% ymd()) %>% mutate(order_date_dttm = order_date_chr2 %>% ymd_hms())

1.3 Conversion

“06/01/18” %>% mdy()

“06/01/18 12:30:15” %>% mdy_hms()

“2011-01-01” %>% ymd() %>% year()
ddays(1)

2.0 Time-based Data grouping

bike_sales_y_tbl <- bike_orderlines_tbl %>% select(order_date, total_price) %>% mutate(year = year(order_date)) %>% group_by(year) %>% summarize(sales = sum(total_price)) %>% ungroup()

bike_sales_m_tbl <- bike_orderlines_tbl %>% select(order_date, total_price) %>% mutate(order_date = ymd(order_date)) %>% mutate(year = year(order_date), month = month(order_date, label = TRUE)) %>% group_by(year, month) %>% summarize(sales = sum(total_price))

Floor date

bike_orderlines_tbl %>% select(order_date, total_price) %>% mutate(order_date = ymd(order_date)) %>% mutate(year_month = floor_date(order_date, unit = “month”)) %>% group_by(year_month) %>% summarize(sales = sum(total_price))

2.1 Filtering Date Ranges

bike_orderlines_tbl %>% mutate(order_date = ymd(order_date)) %>% filter(year(order_date) %in% c(2012, 2013))

week 7 —-

measuring change

bike_sales_y_tbl %>% mutate(sales_lag_1 = lag(sales, n = 1)) %>% ## How to handle NA values mutate(sales_lag_1 = case_when( is.na(sales_lag_1) ~ sales, TRUE ~ sales_lag_1)) %>% ## compute yearly change in percentage mutate(diff_1 = sales - sales_lag_1) %>% mutate(pct_diff_1 = diff_1 / sales_lag_1) %>% mutate(pct_diff_1_chr = scales::percent(pct_diff_1))

Measure change: calculate monthly sales

How to generate function to calcualte sales change

calculate_pct_diff <- function(data){ data %>% mutate(sales_lag_1 = lag(sales, n = 1)) %>% mutate(sales_lag_1 = case_when( is.na(sales_lag_1) ~ sales, TRUE ~ sales_lag_1)) %>% mutate(diff_1 = sales - sales_lag_1) %>% mutate(pct_diff_1 = diff_1 / sales_lag_1) %>% mutate(pct_diff_1_chr = scales::percent(pct_diff_1)) }

bike_sales_m_tbl %>% calculate_pct_diff() %>% View()

Cumulative calculation: cumsum()

bike_sales_y_tbl %>% mutate(cumulative_sales = cumsum(sales)) %>% mutate(cumulative_sales_pct = cumulative_sales / sum(sales))

ggplot

revenue_by_year_tbl <- bike_orderlines_tbl %>% select(order_date, total_price) %>% mutate(year = year(order_date)) %>% group_by(year) %>% summarize(revenue = sum(total_price)) %>% ungroup()

g <- revenue_by_year_tbl %>% # create a canvas ggplot(aes(x = year, y = revenue)) + # adding geometries geom_point(size = 5) + geom_line(size = 1) + geom_smooth(method = “lm”, se = FALSE) + labs( title = ‘Revenue’, subtitle = ‘Sales are trending up’, x = ““, y =”Sales (Millions)” ) + scale_y_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = “M”)) + theme_bw() + expand_limits(y = 0)

g

View(g)

scatter plot

order_value_tbl <- bike_orderlines_tbl %>% select(order_id, order_line, total_price, quantity) %>% group_by(order_id) %>% summarize( total_quantity = sum(quantity), total_price = sum(total_price) ) %>% ungroup()

generate scatter plot

order_value_tbl %>% ggplot(aes(x = total_quantity, y = total_price)) + geom_point(alpha = 1, size = 2)