CapstoneR

###Import libraries for tidyverse here skimr janitor stringr lubridate kimisc

library(tidyverse)
library(here)
library(skimr)
library(janitor)
library(stringr)
library(lubridate)
library(kimisc)

##Import the csv file

The cyclistic bike data is imported after being exported from Postgresql.

bike_data <- read.csv("cyclistic_bike_data.csv")

bike_data$started_at <- as.POSIXct(bike_data$started_at)
bike_data$ended_at <- as.POSIXct(bike_data$ended_at)
bike_data$member_casual <- as.factor(bike_data$member_casual)
bike_data$day_of_week <- as.factor(bike_data$day_of_week)
bike_data$month <- as.factor(bike_data$month)
bike_data$rideable_type <- as.factor(bike_data$rideable_type)
bike_data$duration <-  difftime(bike_data$ended_at, bike_data$started_at, units='mins')

##Cleaning Features, started_at and ended_at, will be converted to R’s date time format and the field, duration, will be the difference in minutes between ended_at and started_at.

Features, rideable_type, day_of_week, member_casual, and month will be converted into factors.

skim_without_charts(bike_data)

Data summary
Name	bike_data
Number of rows	4849115
Number of columns	18
_______________________
Column type frequency:
character	5
difftime	1
factor	4
numeric	6
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
ride_id	1	16	16	4849115
start_station_name	1	3	53	785
start_station_id	1	1	36	1296
end_station_name	1	10	53	780
end_station_id	1	1	36	1295

Variable type: difftime

skim_variable	n_missing	complete_rate	min	max	median	n_unique
duration	0	1	0 mins	55944.15 mins	12.82 mins	24468

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
rideable_type	1	FALSE	3	cla: 2743244, doc: 1075881, ele: 1029990
member_casual	1	FALSE	2	mem: 2670580, cas: 2178535
month	1	FALSE	12	Sep: 1119381, Jul: 692309, Aug: 674385, Jun: 608774
day_of_week	1	FALSE	7	sat: 875131, sun: 754975, fri: 696431, wed: 653104

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	1	41.90	0.04	41.65	41.88	41.90	41.93	42.06
start_lng	1	-87.64	0.02	-87.83	-87.66	-87.64	-87.63	-87.53
end_lat	1	41.90	0.04	41.65	41.88	41.90	41.93	42.17
end_lng	1	-87.64	0.02	-87.83	-87.66	-87.64	-87.63	-87.52
hour_of_day	1	14.30	4.90	0.00	11.00	15.00	18.00	23.00
week_of_year	1	29.67	10.43	1.00	23.00	31.00	37.00	53.00

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2020-09-01 00:00:07	2021-09-30 23:59:44	2021-06-09 16:12:42	4152439
ended_at	0	1	2020-09-01 00:04:43	2021-10-01 18:39:55	2021-06-09 16:34:33	4136546

A summary of the csv shows that there are 4,849,115 records (or data points) with no missing or empty values. There are 15 features (or columns).

glimpse(bike_data)

## Rows: 4,849,115
## Columns: 18
## $ ride_id            <chr> "4D03447AB226B3EA", "C8A8B54B3C6DA039", "F436EFBA9D~
## $ rideable_type      <fct> docked_bike, classic_bike, classic_bike, classic_bi~
## $ started_at         <dttm> 2020-09-30 11:30:18, 2021-05-03 19:16:02, 2021-05-~
## $ ended_at           <dttm> 2020-09-30 11:38:12, 2021-05-03 19:23:34, 2021-05-~
## $ start_station_name <chr> "Christiana Ave & Lawrence Ave", "Lincoln Ave & Bel~
## $ start_station_id   <chr> "474", "TA1309000026", "TA1309000026", "KA150300000~
## $ end_station_name   <chr> "Rockwell St & Eastwood Ave", "Southport Ave & Wave~
## $ end_station_id     <chr> "478", "13235", "13235", "KA1503000031", "13235", "~
## $ start_lat          <dbl> 41.96835, 41.95600, 41.95600, 41.80241, 41.95600, 4~
## $ start_lng          <dbl> -87.71183, -87.68016, -87.68016, -87.58692, -87.680~
## $ end_lat            <dbl> 41.96590, 41.94815, 41.94815, 41.77372, 41.94815, 4~
## $ end_lng            <dbl> -87.69364, -87.66394, -87.66394, -87.60563, -87.663~
## $ member_casual      <fct> member, member, member, member, casual, casual, cas~
## $ duration           <drtn> 7.900000 mins, 7.533333 mins, 10.233333 mins, 17.4~
## $ month              <fct> Sep, May, May, May, May, May, May, May, May, Apr, M~
## $ day_of_week        <fct> wednesday, monday   , thursday , sunday   , tuesday~
## $ hour_of_day        <int> 11, 19, 7, 8, 17, 17, 17, 21, 19, 0, 15, 19, 14, 17~
## $ week_of_year       <int> 40, 18, 20, 20, 18, 18, 19, 21, 19, 16, 19, 21, 20,~

##Understanding the Data

casual_pct <- round(nrow(bike_data %>% filter(member_casual=='casual'))/nrow(bike_data),2)
member_pct <- round(nrow(bike_data %>% filter(member_casual=='member'))/nrow(bike_data),2)

The rideshare program offers two customer options, members and casual riders. Members pay an annual membership fee while casual riders buy either a day or hourly pass. Data collected from the past 12 months show that 45% of the riders are casual while 55% are from members.

ggplot(data = bike_data) +
  geom_bar(mapping = aes(x = member_casual, y=after_stat(count), fill=member_casual)) +
 labs(title="Comparison of Riders: Casual vs. Members",
       subtitle=paste0("pct of casual riders ", casual_pct*100, "%     pct of subscribed members ", member_pct*100, "%"))

riders <- bike_data %>% 
  select(member_casual) %>% 
  group_by(member_casual) %>% 
  summarise(type_no = n() ) 
riders$riders_pct <- riders$type_no / sum(riders$type_no)
riders

## # A tibble: 2 x 3
##   member_casual type_no riders_pct
##   <fct>           <int>      <dbl>
## 1 casual        2178535      0.449
## 2 member        2670580      0.551

##Duration

month_data <- bike_data %>% 
                select(member_casual, month) %>% 
                group_by(member_casual,month) %>% 
                summarise(per_month = n(), .groups = 'keep')

daily_data <- bike_data %>% 
                select(member_casual, day_of_week) %>% 
                group_by(member_casual,day_of_week) %>% 
                summarise(per_day_of_week = n(), .groups = 'keep')

hourly_data <- bike_data %>% 
                select(member_casual, hour_of_day) %>% 
                group_by(member_casual, hour_of_day) %>% 
                summarise(per_hour_of_day = n(), .groups = 'keep')

weekly_data <- bike_data %>% 
                select(member_casual, week_of_year) %>% 
                group_by(member_casual, week_of_year) %>% 
                summarise(week_no_of_year = n(), .groups = 'keep') 
weekly_data

## # A tibble: 106 x 3
## # Groups:   member_casual, week_of_year [106]
##    member_casual week_of_year week_no_of_year
##    <fct>                <int>           <int>
##  1 casual                   1            4605
##  2 casual                   2            3918
##  3 casual                   3            3253
##  4 casual                   4            1728
##  5 casual                   5            1104
##  6 casual                   6             884
##  7 casual                   7             626
##  8 casual                   8            5999
##  9 casual                   9            9181
## 10 casual                  10           20336
## # ... with 96 more rows

###Round Trips

Analyzing the behavior of casual vs member riders, we see that casual riders are more than twice as likely as members to take round trips, returning the bike back to its station of origin.

bike_data %>% 
          select(start_station_id, end_station_id, member_casual ) %>% 
          group_by(member_casual) %>% 
          filter(start_station_id == end_station_id ) %>% 
          summarise(round_trips = n(), .groups = "keep")

## # A tibble: 2 x 2
## # Groups:   member_casual [2]
##   member_casual round_trips
##   <fct>               <int>
## 1 casual             263722
## 2 member             123985

bike_data %>% 
          select(start_station_id, end_station_id, member_casual ) %>% 
          group_by(member_casual) %>% 
          filter(start_station_id != end_station_id ) %>% 
          summarise(non_round_trips = n(), .groups = "keep")

## # A tibble: 2 x 2
## # Groups:   member_casual [2]
##   member_casual non_round_trips
##   <fct>                   <int>
## 1 casual                1914813
## 2 member                2546595

###Stations

The most popular station for the casual rider is Streeter Dr & Grand Avenue. That station is more than twice as popular as the second most station, Lake Shore Dr & Monroe St.

bike_data %>% 
          select(start_station_name,member_casual, start_lat, start_lng ) %>% 
          filter(member_casual=='casual') %>% 
          group_by(member_casual,start_station_name, start_lat, start_lng ) %>% 
          summarise(stations = n(), .groups = "keep") %>% 
          arrange(desc(stations)) %>% 
          head(25)

## # A tibble: 25 x 5
## # Groups:   member_casual, start_station_name, start_lat, start_lng [25]
##    member_casual start_station_name         start_lat start_lng stations
##    <fct>         <chr>                          <dbl>     <dbl>    <int>
##  1 casual        Streeter Dr & Grand Ave         41.9     -87.6    55857
##  2 casual        Lake Shore Dr & Monroe St       41.9     -87.6    22903
##  3 casual        Theater on the Lake             41.9     -87.6    20793
##  4 casual        Michigan Ave & Oak St           41.9     -87.6    15854
##  5 casual        Millennium Park                 41.9     -87.6    15421
##  6 casual        Lake Shore Dr & North Blvd      41.9     -87.6    15045
##  7 casual        Wells St & Concord Ln           41.9     -87.6    14915
##  8 casual        Indiana Ave & Roosevelt Rd      41.9     -87.6    14353
##  9 casual        Clark St & Lincoln Ave          41.9     -87.6    14029
## 10 casual        Dusable Harbor                  41.9     -87.6    13168
## # ... with 15 more rows

For members, the most popular starting station is Clar St & Elm. There isn’t a huge difference between the top and second most stations like we saw for the casual riders.

bike_data %>% 
          select(start_station_name,member_casual, start_lat, start_lng ) %>% 
          filter(member_casual=='member') %>% 
          group_by(member_casual,start_station_name, start_lat, start_lng ) %>% 
          summarise(stations = n(), .groups = "keep") %>% 
          arrange(desc(stations)) %>% 
          head(25)

## # A tibble: 25 x 5
## # Groups:   member_casual, start_station_name, start_lat, start_lng [25]
##    member_casual start_station_name       start_lat start_lng stations
##    <fct>         <chr>                        <dbl>     <dbl>    <int>
##  1 member        Clark St & Elm St             41.9     -87.6    21166
##  2 member        Wells St & Concord Ln         41.9     -87.6    18581
##  3 member        Theater on the Lake           41.9     -87.6    16755
##  4 member        Wells St & Elm St             41.9     -87.6    16745
##  5 member        Dearborn St & Erie St         41.9     -87.6    14814
##  6 member        Clark St & Armitage Ave       41.9     -87.6    14572
##  7 member        Wells St & Huron St           41.9     -87.6    14520
##  8 member        Streeter Dr & Grand Ave       41.9     -87.6    14321
##  9 member        Clark St & Lincoln Ave        41.9     -87.6    13856
## 10 member        Kingsbury St & Kinzie St      41.9     -87.6    13484
## # ... with 15 more rows

###Rider Usage over Time

MONTHLY

There is a strong seasonal component with both the casual and member riders. There’s a steady rise from March to August which is followed by a spike in September and then a decline for the rest of year.

member_monthly_data <- bike_data %>% 
  select(member_casual, started_at) %>% 
  count(strftime(started_at, format="%m"),member_casual)

names(member_monthly_data)[1] <- "month"
member_monthly_data

##    month member_casual      n
## 1     01        casual  14690
## 2     01        member  68819
## 3     02        casual   8613
## 4     02        member  34383
## 5     03        casual  75641
## 6     03        member 130048
## 7     04        casual 120420
## 8     04        member 177783
## 9     05        casual 216829
## 10    05        member 234164
## 11    06        casual 304189
## 12    06        member 304585
## 13    07        casual 369407
## 14    07        member 322902
## 15    08        casual 341469
## 16    08        member 332916
## 17    09        casual 507607
## 18    09        member 611774
## 19    10        casual 122328
## 20    10        member 215072
## 21    11        casual  72850
## 22    11        member 149085
## 23    12        casual  24492
## 24    12        member  89049

member_monthly_data %>% 
  select(month, member_casual, n) %>% 
  ggplot(aes(month, n)) +
  geom_point(color='blue') +
  facet_wrap(~member_casual)

WEEKLY

Weekly ridership confirms that the month of September was the busiest month for both the casual and member riders as the busiest weeks for all riders were weeks 35 to 38. Week 37 was the busiest for members while week 35 was the busiest for casual riders.

We see the same seasonal pattern in the weekly data as we did in the monthly data.

member_weekly_data <- bike_data %>% 
  select(member_casual, started_at) %>% 
  count(strftime(started_at, format="%W"),member_casual) %>% 
  arrange(desc(n))

names(member_weekly_data)[1] <- "week"
member_weekly_data %>% 
  head(8)

##   week member_casual      n
## 1   37        member 151043
## 2   38        member 144036
## 3   35        member 134342
## 4   36        member 132683
## 5   35        casual 127613
## 6   37        casual 124903
## 7   36        casual 119670
## 8   38        casual 113075

member_weekly_data %>% 
  select(week, member_casual, n) %>% 
  ggplot(aes(week, n)) +
  geom_point(color='darkgreen') +
  facet_wrap(~member_casual) +
  scale_x_discrete(breaks = seq(10, 60, by = 10))

DAILY

A comparison of daily usage shows spikes on Saturday and Sunday for the casual riders with the lowest usage on Monday through Thursday. The daily usage pattern for members shows a steady increase from Sunday though Wednesday where it peaks, and then a decline/leveling off for the rest of the week.

Daily usage is the first real differentiation that we see between the two rider groups.

member_daily_data <- bike_data %>% 
  select(member_casual, day_of_week) %>% 
  count(day_of_week,member_casual)

member_daily_data %>% 
  select(day_of_week, member_casual, n) %>% 
  ggplot(aes(day_of_week, n)) +
  geom_point(color='red') +
  facet_wrap(~member_casual)

HOURLY

The hourly pattern is the same for both classes of riders. However, we see a much larger spike for member riders at 5:00 PM. Demand peaks at that time of the day and then sharply declines as the evening wears on. Demand is closely associated with the availability of sunlight.

member_hour_data <- bike_data %>% 
  select(member_casual, started_at) %>% 
  count(strftime(started_at, format="%H"),member_casual)
names(member_hour_data)[1] <- 'hour_of_day'

member_hour_data %>% 
  select(hour_of_day, member_casual, n) %>% 
  ggplot(aes(hour_of_day, n)) +
  geom_point(color='darkblue') +
  facet_wrap(~member_casual) +
   theme(axis.text.x = element_text(angle = 90))

DURATION

What is the length of the ride for each class? The boxplot below shows that the average duration for the casual rider is more than double of that for the member rider.

member_duration %>% 
        ggplot(aes(x=member_casual, y=avg_duration, fill=member_casual)) +
          geom_boxplot() +
          scale_y_continuous()

member_duration %>% 
   group_by(member_casual) %>% 
  summarise(avg_duration = mean(avg_duration))

## # A tibble: 2 x 2
##   member_casual avg_duration 
##   <fct>         <drtn>       
## 1 casual        33.06502 mins
## 2 member        13.92385 mins

Looking at the daily duration, Sundays and Saturdays see the longest rides in particular by the casual rider. In fact, the most use for the longest period of time is by the casual rider on Sundays.

  ggplot(data=member_duration, aes(reorder(day_of_week, -avg_duration), avg_duration, fill=member_casual)) +
  geom_col() +
  scale_y_continuous() +
  facet_wrap(~member_casual) +
  xlab("Days of the week") +
  ylab("Average Duration")

BIKE TYPEs

There are three types of rideable bikes. 57% of the bikes are the classic styled bikes, 22% are docked bikes, and 21% are electrical bikes.

docked_bike_pct <- round(nrow(bike_data %>% filter(rideable_type =='docked_bike'))/nrow(bike_data),2)
classic_bike_pct <- round(nrow(bike_data %>% filter(rideable_type =='classic_bike'))/nrow(bike_data),2)
electric_bike_pct <- round(nrow(bike_data %>% filter(rideable_type =='electric_bike'))/nrow(bike_data),2)

bikes_pcts <- c(docked_bike_pct, classic_bike_pct,electric_bike_pct)
bike_types <- c('docked_bike','classic_bike','electric_bike')
bikes_df <- data.frame(bike_types,bikes_pcts)
bikes_df

##      bike_types bikes_pcts
## 1   docked_bike       0.22
## 2  classic_bike       0.57
## 3 electric_bike       0.21

ggplot(data = bikes_df, aes(x = bike_types, y=bikes_pcts,  fill=bike_types)) +
  geom_col(color='black') +
  labs(title="Percentage of Rideable Types",
        subtitle=paste0("Classic Bike: ", classic_bike_pct*100, "% ",
                        "  Docked Bike: ", docked_bike_pct*100, "%  ",
                        "  Electric Bike: ",electric_bike_pct*100, "%" )) +
  ylab("") +
  xlab("")

bike_types <- bike_data %>% 
                     group_by(member_casual, day_of_week, rideable_type) %>% 
                    summarise(avg_duration = mean(duration), no_of_rides = n(), .groups = 'keep')

When looking at the preferences for each rider class, there isn’t a difference among their preferences except that members may want electrical bikes a bit more, but both groups have a strong preference for classic bikes.

bike_types %>% 
  #filter(member_casual == 'casual') %>% 
  ggplot(aes(x=day_of_week, y=no_of_rides, fill=member_casual))  +
    geom_col() +
    facet_wrap(~rideable_type)

However, for docked_bikes, the casual riders take them out for much longer ride times including Sundays and Saturdays.

bike_types %>% 
  #filter(member_casual == 'casual') %>% 
  ggplot(aes(x=day_of_week, y=avg_duration, fill=member_casual))  +
    geom_col() +
    scale_y_continuous() +
    facet_wrap(~rideable_type)

CapstoneR

John K. Hancock

10/7/2021