Introduction

ASK

PREPARE + PROCESS

# I started by loading the tidyverse library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# I read the files into R, assigning each to an object
Q1 <- read.csv("Divvy_Trips_2019_Q1.csv")
Q2 <- read.csv("Divvy_Trips_2019_Q2.csv")
Q3 <- read.csv("Divvy_Trips_2019_Q3.csv")
Q4 <- read.csv("Divvy_Trips_2019_Q4.csv")
# When making sure all variables are consistent across the four files
names(Q1)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"
# ... I found Q2 had inconsistent variable names, and to solve this I took the names of Q1 and then passed as the names for Q2
Proper_Column_Names <- names(Q1)
names(Q2) <- Proper_Column_Names
# I also had to convert data types because the start_time and end_time variables were of character type and not POSIXct
# To do this, I assigned the 4 quarters into 1 list
my_list <- list(Q1, Q2, Q3, Q4)
# I then wrote this function to focus on this specific task of correcting the data types:
cleaning_function <- function(df) {
        
        df[, 2:3] <- lapply(df[, 2:3], function(x) as.POSIXct(x, format = "%Y-%m-%d %H:%M:%S"))
        
        df[, 5] <- gsub(",", "", df[, 5])
        
        df[, 5] <- as.numeric(df[, 5])
        
        return(df)
}
# Lastly, I used lapply to return a list of the same length with the proper data types
my_list <- lapply(my_list, cleaning_function)
# This function filters each data frame by user type
filter_and_assign <- function(df) {
        
        customer_data <- df[df$usertype == "Customer", ]
        subscriber_data <- df[df$usertype == "Subscriber", ]
        
        return(list(customer = customer_data, subscriber = subscriber_data))
}

# Applying the function to my list
my_list2 <- lapply(my_list, filter_and_assign)
Observations
  1. During the cleaning stage I noticed quite a few NAs in the gender variable, but I decided to keep them because they might give us some insights on how each user type used the service in the context of data privacy.
  2. I ended up with a tidy data set, which is an object of type list with 4 elements, one for each quarter, and within each element there are 2 data frames, 1 for subscribers and 1 for customers.

ANALYZE

# This function calculates the number of trips per quarter by user as well as average time on bike
summary_stats_function <- function(x) {
        
        Q1SubData <- my_list2[[1]]$subscriber %>% summarise(mean(tripduration, na.rm = T),
                                                                 trip_id = n())
        Q2SubData <- my_list2[[2]]$subscriber %>% summarise(mean(tripduration, na.rm = T),
                                                                 trip_id = n())
        Q3SubData <- my_list2[[3]]$subscriber %>% summarise(mean(tripduration, na.rm = T),
                                                                 trip_id = n())
        Q4SubData <- my_list2[[4]]$subscriber %>% summarise(mean(tripduration, na.rm = T),
                                                                 trip_id = n())
        
        SubData <- rbind(Q1SubData, Q2SubData, Q3SubData, Q4SubData)
        
        Q1CustData <- my_list2[[1]]$customer %>% summarise(mean(tripduration, na.rm = T),
                                                                trip_id = n())
        Q2CustData <- my_list2[[2]]$customer %>% summarise(mean(tripduration, na.rm = T),
                                                                trip_id = n())
        Q3CustData <- my_list2[[3]]$customer %>% summarise(mean(tripduration, na.rm = T),
                                                                trip_id = n())
        Q4CustData <- my_list2[[4]]$customer %>% summarise(mean(tripduration, na.rm = T),
                                                                trip_id = n())
        
        CustData <- rbind(Q1CustData, Q2CustData, Q3CustData, Q4CustData)
        
        return(list(SubData, CustData))
   
}
# Applying the function to object my_list2 (which was created in previous code chunk)
my_list3 <- lapply(my_list2, summary_stats_function)

# Extracting results into a data frame
Summmary_Stats <- my_list3[[1]]
Summmary_Stats <- as.data.frame(Summmary_Stats)
# How many minutes each user type spent on bikes on average across the 4 quarters
Summmary_Stats %>% mutate(
    subscribers_average_time_on_bike_per_quarter = mean.tripduration..na.rm...T./60,
    customers_average_time_on_bike_per_quarter = mean.tripduration..na.rm...T..1/60
) %>% select(5:6)
##   subscribers_average_time_on_bike_per_quarter
## 1                                     13.89112
## 2                                     14.04721
## 3                                     15.60901
## 4                                     12.49013
##   customers_average_time_on_bike_per_quarter
## 1                                   61.92896
## 2                                   48.51607
## 3                                   60.38972
## 4                                   61.09094
# How many bike trips by each user type per quarter
Summmary_Stats %>% mutate(
        subscribers_trips_per_quarter = trip_id,
        customers_trips_per_quarter = trip_id.1) %>% select(5:6)
##   subscribers_trips_per_quarter customers_trips_per_quarter
## 1                        341906                       23163
## 2                        848577                      259586
## 3                       1149024                      491694
## 4                        597860                      106194
Observations
  1. During Q1, subscribers completed 341 906 rides and customers 23 163. In Q2, activity increased to 848 577 vs. 259 586. In Q3, subscribers exceeded 1 million rides (1 149 625) while customers’ numbers continue to go up to 491 694. Lastly, for Q4 there was a drop in activity with 597 860 subscriber rides against 106 194 by customers.
  2. Although subscribers used the service more heavily than customers, on average their trips were much shorter. On average subscribers cycled for 13.9825 minutes whereas customers spent close to 1 hour (57.97 minutes).
  • Next I created a couple of new variables that I thought would be relevant in order to understand user behavior better. The idea was to leverage R’s lubridate package and create a couple of new variables based on the time data. By focusing on the time of day and days of the week when rides took place I would be able to show how the number of trips evolved during the year, month by month. I also wanted to find out which days of the week and time of the day were most popular among the two user types. To do this, I wrote a function to create new variables (month, weekday, minutes on the bike) and then flattened the list into one big data frame.
# Loading the lubridate package
library(lubridate)
# This function is a bit clunky, but it worked
timedate_func <- function(x) {
        
        Q1SubDataTime <- my_list2[[1]]$subscriber %>% mutate(
                Month_var = month(start_time),
                Weekday_var = weekdays(start_time),
                Diff_01 = difftime(end_time, start_time),
                MinutesOnBike = as.numeric(Diff_01)
        )
        
        Q2SubDataTime <- my_list2[[2]]$subscriber %>% mutate(
                Month_var = month(start_time),
                Weekday_var = weekdays(start_time),
                Diff_01 = difftime(end_time, start_time),
                MinutesOnBike = as.numeric(Diff_01)
        )
        
        Q3SubDataTime <- my_list2[[3]]$subscriber %>% mutate(
                Month_var = month(start_time),
                Weekday_var = weekdays(start_time),
                Diff_01 = difftime(end_time, start_time),
                MinutesOnBike = as.numeric(Diff_01)
        )
        
        Q4SubDataTime <- my_list2[[4]]$subscriber %>% mutate(
                Month_var = month(start_time),
                Weekday_var = weekdays(start_time),
                Diff_01 = difftime(end_time, start_time),
                MinutesOnBike = as.numeric(Diff_01)
        )
        
        Q1CustDataTime <- my_list2[[1]]$customer %>% mutate(
                Month_var = month(start_time),
                Weekday_var = weekdays(start_time),
                Diff_01 = difftime(end_time, start_time),
                MinutesOnBike = as.numeric(Diff_01)
        )
        
        Q2CustDataTime <- my_list2[[2]]$customer %>% mutate(
                Month_var = month(start_time),
                Weekday_var = weekdays(start_time),
                Diff_01 = difftime(end_time, start_time),
                MinutesOnBike = as.numeric(Diff_01)
        )
        
        Q3CustDataTime <- my_list2[[3]]$customer %>% mutate(
                Month_var = month(start_time),
                Weekday_var = weekdays(start_time),
                Diff_01 = difftime(end_time, start_time),
                MinutesOnBike = as.numeric(Diff_01)
        )
        
        Q4CustDataTime <- my_list2[[4]]$customer %>% mutate(
                Month_var = month(start_time),
                Weekday_var = weekdays(start_time),
                Diff_01 = difftime(end_time, start_time),
                MinutesOnBike = as.numeric(Diff_01)
        )
        
        return(list(Q1SubDataTime, Q2SubDataTime, Q3SubDataTime, Q4SubDataTime,
                    Q1CustDataTime, Q2CustDataTime, Q3CustDataTime, Q4CustDataTime))
}
# Applying the function the list created in the previous phase
my_list4 <- lapply(my_list2, timedate_func)
# Counting weekdays
count_weekday_occurrences <- function(x) {
    table(x$Weekday_var)
}
# Getting the first element of list
my_list5 <- my_list4[[1]]
# Making a data frame 
count_weekdays <- lapply(my_list5, count_weekday_occurrences)
results_weekdays <- do.call(rbind, count_weekdays)
results_weekdays <- as.data.frame(results_weekdays)
results_weekdays$Legend <- c("Q1_Subs", "Q2_Subs", "Q3_Subs", "Q4_Subs", "Q1_Cust", "Q2_Cust", "Q3_Cust", "Q4_Cust")
correct_column_order <- c("Legend", "Monday", "Tuesday", "Wednesday",
                           "Thursday", "Friday", "Saturday", "Sunday")
results_weekdays <- results_weekdays[, correct_column_order]
# The data frame with favorite days of the week by quarter
results_weekdays
##    Legend Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 1 Q1_Subs  48507   58277     57925    63983  59672    29309  24218
## 2 Q2_Subs 128866  139521    145303   137949 135404    85884  75650
## 3 Q3_Subs 180984  189449    196437   188017 170830   121040 102267
## 4 Q4_Subs 100423  109778     94612    96966  91060    50930  54091
## 5 Q1_Cust   1892    2728      2489     2920   3375     5993   3764
## 6 Q2_Cust  27577   25566     24901    24771  36608    64831  55332
## 7 Q3_Cust  59230   48320     52128    61475  68255   116538  85748
## 8 Q4_Cust  12790   12041     10227    12206  12903    20694  25333
# This function counts rides according to 4 time periods
time_period_analysis <- function(x) {

        hour_component <- hour(x$start_time)
        
        time_intervals <- cut(hour_component,
                              breaks = c(-Inf, 6, 12, 18, 24),
                              labels = c("00:00-06:00", "06:01-12:00", "12:01-18:00", "18:01-24:00"),
                              include.lowest = TRUE)
        
        counts <- table(time_intervals)
        
        return(counts)
}
# Flattening the list into one big data frame with all the data
big_dataframe <- bind_rows(my_list5, .id = "id")
# Getting summary statistics on monthly rides for subscribers
subscriber_monthly_rides <- big_dataframe %>%
        filter(usertype == "Subscriber") %>%
        group_by(Month_var) %>% 
        summarise(total_trips = n_distinct(trip_id))
# Summary stats on monthly rides for customers
customer_monthly_rides <- big_dataframe %>%         filter(usertype == "Customer") %>%
        group_by(Month_var) %>% 
        summarise(total_trips = n_distinct(trip_id))
# Removing NAs before plotting
subscriber_monthly_rides <- subscriber_monthly_rides[- 13, ]
customer_monthly_rides <- customer_monthly_rides[- 13, ]

SHARE

Comparing monthly use throughout the year

# Loading the patchwork library (to patch the two plots)
library(patchwork)
# Creating monthly variables
month_names <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# Building the subscribers' plot
subs_plot <- ggplot(subscriber_monthly_rides, aes(x=Month_var, y=total_trips)) +
geom_area(colour = "black", fill = "blue", alpha = .2) +
        scale_y_continuous(labels = scales::comma_format()) +
        geom_point(size = 4, shape = 22, fill = "darkblue") +
        ggtitle("Monthly rides by subscribers") +
        ylab(NULL) +
        scale_x_continuous(breaks = 1:12, labels = month_names) +
        xlab(NULL) +
        theme_minimal()
# Building the customers' plot
custs_plot <- ggplot(customer_monthly_rides, aes(x=Month_var, y=total_trips)) +
        geom_area(colour = "black", fill = "darkorange", alpha = .2) +
        scale_y_continuous(labels = scales::comma_format()) +
        geom_point(size = 4, shape = 22, fill = "darkorange") +
        ggtitle("and customers") +
        ylab(NULL) +
        scale_x_continuous(breaks = 1:12, labels = month_names) +
        xlab(NULL) +
        theme_minimal()
# Patching the two plots
subs_plot/custs_plot

Comparing favorite times to cycle

# I started by applying the function created before to the data
timeday_counts <- lapply(my_list5, time_period_analysis)
# I then coerced the list into a data frame format
timeday_counts <- as.data.frame(timeday_counts)
# Created a vector with the variables I wanted to keep
columns_to_keep_time_period <- c("time_intervals", "Freq", "Freq.1", "Freq.2", "Freq.3", "Freq.4", "Freq.5", "Freq.6", "Freq.7")
# Subsetted the data frame with the said vector
timeday_counts <- timeday_counts[, columns_to_keep_time_period]
# Dividing the data into subscribers and customers
subscribers_timeday <- timeday_counts %>% select(1:5)
customers_timeday <- timeday_counts %>% select(c(1, 6:9))
# Renaming column names
subscribers_timeday <- rename(subscribers_timeday, "Q1" = "Freq", "Q2" = "Freq.1", "Q3" = "Freq.2", "Q4" = "Freq.3")
customers_timeday <- rename(customers_timeday, "Q1" = "Freq.4", "Q2" = "Freq.5", "Q3" = "Freq.6", "Q4" = "Freq.7")
# The results for subscribers
subscribers_timeday
##   time_intervals     Q1     Q2     Q3     Q4
## 1    00:00-06:00  21267  47333  69438  37780
## 2    06:01-12:00 121361 285239 377246 217415
## 3    12:01-18:00 161187 395836 523775 273210
## 4    18:01-24:00  38076 120169 178565  69455
# The results for customers
customers_timeday
##   time_intervals    Q1     Q2     Q3    Q4
## 1    00:00-06:00   651   7412  16969  3694
## 2    06:01-12:00  6086  69906 127589 34176
## 3    12:01-18:00 14119 141360 257506 56142
## 4    18:01-24:00  2305  40908  89630 12182
# Calculating percentages
percentage_subs <- subscribers_timeday %>%
    mutate(
        Q1_pct = Q1 / sum(Q1) * 100,
        Q2_pct = Q2 / sum(Q2) * 100,
        Q3_pct = Q3 / sum(Q3) * 100,
        Q4_pct = Q4 / sum(Q4) * 100
    ) %>% select(c(1, 6:9))
percentage_custs <- customers_timeday %>%
    mutate(
        Q1_pct = Q1 / sum(Q1) * 100,
        Q2_pct = Q2 / sum(Q2) * 100,
        Q3_pct = Q3 / sum(Q3) * 100,
        Q4_pct = Q4 / sum(Q4) * 100
    ) %>% select(c(1, 6:9))
# Making a table with gt package
library(gt)
subs_perc_plot <- percentage_subs %>%
    gt() %>%
    tab_header(
        title = "Subscriber distribution by time of day and quarter"
    ) %>%
    fmt_number(
        columns = c("Q1_pct", "Q2_pct","Q3_pct","Q4_pct"),
        decimals = 1
    ) %>% cols_label(
        time_intervals = "Time Intervals",
        Q1_pct = "Q1 (%)",
        Q2_pct = "Q2 (%)",
        Q3_pct = "Q3 (%)",
        Q4_pct = "Q4 (%)"
    ) %>%
    data_color(
        columns = c(Q1_pct, Q2_pct, Q3_pct, Q4_pct),
        colors = scales::col_numeric(
            palette = c("lightblue", "darkblue"),
            domain = NULL
        )
    ) %>%
    tab_style(
        style = cell_borders(
            sides = "all",
            color = "gray",
            weight = px(1)
        ),
        locations = cells_body()
    ) %>%
    tab_options(
        table.font.size = px(14),
        heading.align = "center",
        column_labels.font.size = px(16),
        column_labels.font.weight = "bold",
        table.border.top.width = px(2),
        table.border.top.color = "black",
        table.border.bottom.width = px(2),
        table.border.bottom.color = "black",
        data_row.padding = px(8))
## Warning: Since gt v0.9.0, the `colors` argument has been deprecated.
## • Please use the `fn` argument instead.
## This warning is displayed once every 8 hours.
custs_perc_plot <- percentage_custs %>%
    gt() %>%
    tab_header(
        title = "Customer distribution by time of day and quarter"
    ) %>%
    fmt_number(
        columns = c("Q1_pct", "Q2_pct","Q3_pct","Q4_pct"),
        decimals = 1
    ) %>% cols_label(
        time_intervals = "Time Intervals",
        Q1_pct = "Q1 (%)",
        Q2_pct = "Q2 (%)",
        Q3_pct = "Q3 (%)",
        Q4_pct = "Q4 (%)"
    ) %>%
    data_color(
        columns = c(Q1_pct, Q2_pct, Q3_pct, Q4_pct),
        colors = scales::col_numeric(
            palette = c("orange", "brown"),
            domain = NULL
        )
    ) %>%
    tab_style(
        style = cell_borders(
            sides = "all",
            color = "gray",
            weight = px(1)
        ),
        locations = cells_body()
    ) %>%
    tab_options(
        table.font.size = px(14),
        heading.align = "center",
        column_labels.font.size = px(16),
        column_labels.font.weight = "bold",
        table.border.top.width = px(2),
        table.border.top.color = "black",
        table.border.bottom.width = px(2),
        table.border.bottom.color = "black",
        data_row.padding = px(8))
subs_perc_plot
Subscriber distribution by time of day and quarter
Time Intervals Q1 (%) Q2 (%) Q3 (%) Q4 (%)
00:00-06:00 6.2 5.6 6.0 6.3
06:01-12:00 35.5 33.6 32.8 36.4
12:01-18:00 47.1 46.6 45.6 45.7
18:01-24:00 11.1 14.2 15.5 11.6
custs_perc_plot
Customer distribution by time of day and quarter
Time Intervals Q1 (%) Q2 (%) Q3 (%) Q4 (%)
00:00-06:00 2.8 2.9 3.5 3.5
06:01-12:00 26.3 26.9 25.9 32.2
12:01-18:00 61.0 54.5 52.4 52.9
18:01-24:00 10.0 15.8 18.2 11.5

ACT

  1. Implementing a campaign to convert casual customers into subscribers by using the data about time of day and days of the week.
  1. Exploring the hypothesis that subscribers are using the app to get to work and if so, trying to optimize that experience with a rewards scheme.

  2. Refining the analysis further - for example by finding out the stations that saw the most traffic, the male to female ratio among customers and subscribers, average age of the two user types, etc.