###Import libraries for tidyverse here skimr janitor stringr lubridate kimisc
library(tidyverse)
library(here)
library(skimr)
library(janitor)
library(stringr)
library(lubridate)
library(kimisc)
##Import the csv file
The cyclistic bike data is imported after being exported from Postgresql.
bike_data <- read.csv("cyclistic_bike_data.csv")
bike_data$started_at <- as.POSIXct(bike_data$started_at)
bike_data$ended_at <- as.POSIXct(bike_data$ended_at)
bike_data$member_casual <- as.factor(bike_data$member_casual)
bike_data$day_of_week <- as.factor(bike_data$day_of_week)
bike_data$month <- as.factor(bike_data$month)
bike_data$rideable_type <- as.factor(bike_data$rideable_type)
bike_data$duration <- difftime(bike_data$ended_at, bike_data$started_at, units='mins')
##Cleaning Features, started_at and ended_at, will be converted to R’s date time format and the field, duration, will be the difference in minutes between ended_at and started_at.
Features, rideable_type, day_of_week, member_casual, and month will be converted into factors.
skim_without_charts(bike_data)
| Name | bike_data |
| Number of rows | 4849115 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| difftime | 1 |
| factor | 4 |
| numeric | 6 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1 | 16 | 16 | 0 | 4849115 | 0 |
| start_station_name | 0 | 1 | 3 | 53 | 0 | 785 | 0 |
| start_station_id | 0 | 1 | 1 | 36 | 0 | 1296 | 0 |
| end_station_name | 0 | 1 | 10 | 53 | 0 | 780 | 0 |
| end_station_id | 0 | 1 | 1 | 36 | 0 | 1295 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| duration | 0 | 1 | 0 mins | 55944.15 mins | 12.82 mins | 24468 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| rideable_type | 0 | 1 | FALSE | 3 | cla: 2743244, doc: 1075881, ele: 1029990 |
| member_casual | 0 | 1 | FALSE | 2 | mem: 2670580, cas: 2178535 |
| month | 0 | 1 | FALSE | 12 | Sep: 1119381, Jul: 692309, Aug: 674385, Jun: 608774 |
| day_of_week | 0 | 1 | FALSE | 7 | sat: 875131, sun: 754975, fri: 696431, wed: 653104 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.04 | 41.65 | 41.88 | 41.90 | 41.93 | 42.06 |
| start_lng | 0 | 1 | -87.64 | 0.02 | -87.83 | -87.66 | -87.64 | -87.63 | -87.53 |
| end_lat | 0 | 1 | 41.90 | 0.04 | 41.65 | 41.88 | 41.90 | 41.93 | 42.17 |
| end_lng | 0 | 1 | -87.64 | 0.02 | -87.83 | -87.66 | -87.64 | -87.63 | -87.52 |
| hour_of_day | 0 | 1 | 14.30 | 4.90 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
| week_of_year | 0 | 1 | 29.67 | 10.43 | 1.00 | 23.00 | 31.00 | 37.00 | 53.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2020-09-01 00:00:07 | 2021-09-30 23:59:44 | 2021-06-09 16:12:42 | 4152439 |
| ended_at | 0 | 1 | 2020-09-01 00:04:43 | 2021-10-01 18:39:55 | 2021-06-09 16:34:33 | 4136546 |
A summary of the csv shows that there are 4,849,115 records (or data points) with no missing or empty values. There are 15 features (or columns).
glimpse(bike_data)
## Rows: 4,849,115
## Columns: 18
## $ ride_id <chr> "4D03447AB226B3EA", "C8A8B54B3C6DA039", "F436EFBA9D~
## $ rideable_type <fct> docked_bike, classic_bike, classic_bike, classic_bi~
## $ started_at <dttm> 2020-09-30 11:30:18, 2021-05-03 19:16:02, 2021-05-~
## $ ended_at <dttm> 2020-09-30 11:38:12, 2021-05-03 19:23:34, 2021-05-~
## $ start_station_name <chr> "Christiana Ave & Lawrence Ave", "Lincoln Ave & Bel~
## $ start_station_id <chr> "474", "TA1309000026", "TA1309000026", "KA150300000~
## $ end_station_name <chr> "Rockwell St & Eastwood Ave", "Southport Ave & Wave~
## $ end_station_id <chr> "478", "13235", "13235", "KA1503000031", "13235", "~
## $ start_lat <dbl> 41.96835, 41.95600, 41.95600, 41.80241, 41.95600, 4~
## $ start_lng <dbl> -87.71183, -87.68016, -87.68016, -87.58692, -87.680~
## $ end_lat <dbl> 41.96590, 41.94815, 41.94815, 41.77372, 41.94815, 4~
## $ end_lng <dbl> -87.69364, -87.66394, -87.66394, -87.60563, -87.663~
## $ member_casual <fct> member, member, member, member, casual, casual, cas~
## $ duration <drtn> 7.900000 mins, 7.533333 mins, 10.233333 mins, 17.4~
## $ month <fct> Sep, May, May, May, May, May, May, May, May, Apr, M~
## $ day_of_week <fct> wednesday, monday , thursday , sunday , tuesday~
## $ hour_of_day <int> 11, 19, 7, 8, 17, 17, 17, 21, 19, 0, 15, 19, 14, 17~
## $ week_of_year <int> 40, 18, 20, 20, 18, 18, 19, 21, 19, 16, 19, 21, 20,~
##Understanding the Data
casual_pct <- round(nrow(bike_data %>% filter(member_casual=='casual'))/nrow(bike_data),2)
member_pct <- round(nrow(bike_data %>% filter(member_casual=='member'))/nrow(bike_data),2)
The rideshare program offers two customer options, members and casual riders. Members pay an annual membership fee while casual riders buy either a day or hourly pass. Data collected from the past 12 months show that 45% of the riders are casual while 55% are from members.
ggplot(data = bike_data) +
geom_bar(mapping = aes(x = member_casual, y=after_stat(count), fill=member_casual)) +
labs(title="Comparison of Riders: Casual vs. Members",
subtitle=paste0("pct of casual riders ", casual_pct*100, "% pct of subscribed members ", member_pct*100, "%"))
riders <- bike_data %>%
select(member_casual) %>%
group_by(member_casual) %>%
summarise(type_no = n() )
riders$riders_pct <- riders$type_no / sum(riders$type_no)
riders
## # A tibble: 2 x 3
## member_casual type_no riders_pct
## <fct> <int> <dbl>
## 1 casual 2178535 0.449
## 2 member 2670580 0.551
##Duration
month_data <- bike_data %>%
select(member_casual, month) %>%
group_by(member_casual,month) %>%
summarise(per_month = n(), .groups = 'keep')
daily_data <- bike_data %>%
select(member_casual, day_of_week) %>%
group_by(member_casual,day_of_week) %>%
summarise(per_day_of_week = n(), .groups = 'keep')
hourly_data <- bike_data %>%
select(member_casual, hour_of_day) %>%
group_by(member_casual, hour_of_day) %>%
summarise(per_hour_of_day = n(), .groups = 'keep')
weekly_data <- bike_data %>%
select(member_casual, week_of_year) %>%
group_by(member_casual, week_of_year) %>%
summarise(week_no_of_year = n(), .groups = 'keep')
weekly_data
## # A tibble: 106 x 3
## # Groups: member_casual, week_of_year [106]
## member_casual week_of_year week_no_of_year
## <fct> <int> <int>
## 1 casual 1 4605
## 2 casual 2 3918
## 3 casual 3 3253
## 4 casual 4 1728
## 5 casual 5 1104
## 6 casual 6 884
## 7 casual 7 626
## 8 casual 8 5999
## 9 casual 9 9181
## 10 casual 10 20336
## # ... with 96 more rows
###Round Trips
Analyzing the behavior of casual vs member riders, we see that casual riders are more than twice as likely as members to take round trips, returning the bike back to its station of origin.
bike_data %>%
select(start_station_id, end_station_id, member_casual ) %>%
group_by(member_casual) %>%
filter(start_station_id == end_station_id ) %>%
summarise(round_trips = n(), .groups = "keep")
## # A tibble: 2 x 2
## # Groups: member_casual [2]
## member_casual round_trips
## <fct> <int>
## 1 casual 263722
## 2 member 123985
bike_data %>%
select(start_station_id, end_station_id, member_casual ) %>%
group_by(member_casual) %>%
filter(start_station_id != end_station_id ) %>%
summarise(non_round_trips = n(), .groups = "keep")
## # A tibble: 2 x 2
## # Groups: member_casual [2]
## member_casual non_round_trips
## <fct> <int>
## 1 casual 1914813
## 2 member 2546595
###Stations
The most popular station for the casual rider is Streeter Dr & Grand Avenue. That station is more than twice as popular as the second most station, Lake Shore Dr & Monroe St.Â
bike_data %>%
select(start_station_name,member_casual, start_lat, start_lng ) %>%
filter(member_casual=='casual') %>%
group_by(member_casual,start_station_name, start_lat, start_lng ) %>%
summarise(stations = n(), .groups = "keep") %>%
arrange(desc(stations)) %>%
head(25)
## # A tibble: 25 x 5
## # Groups: member_casual, start_station_name, start_lat, start_lng [25]
## member_casual start_station_name start_lat start_lng stations
## <fct> <chr> <dbl> <dbl> <int>
## 1 casual Streeter Dr & Grand Ave 41.9 -87.6 55857
## 2 casual Lake Shore Dr & Monroe St 41.9 -87.6 22903
## 3 casual Theater on the Lake 41.9 -87.6 20793
## 4 casual Michigan Ave & Oak St 41.9 -87.6 15854
## 5 casual Millennium Park 41.9 -87.6 15421
## 6 casual Lake Shore Dr & North Blvd 41.9 -87.6 15045
## 7 casual Wells St & Concord Ln 41.9 -87.6 14915
## 8 casual Indiana Ave & Roosevelt Rd 41.9 -87.6 14353
## 9 casual Clark St & Lincoln Ave 41.9 -87.6 14029
## 10 casual Dusable Harbor 41.9 -87.6 13168
## # ... with 15 more rows
For members, the most popular starting station is Clar St & Elm. There isn’t a huge difference between the top and second most stations like we saw for the casual riders.
bike_data %>%
select(start_station_name,member_casual, start_lat, start_lng ) %>%
filter(member_casual=='member') %>%
group_by(member_casual,start_station_name, start_lat, start_lng ) %>%
summarise(stations = n(), .groups = "keep") %>%
arrange(desc(stations)) %>%
head(25)
## # A tibble: 25 x 5
## # Groups: member_casual, start_station_name, start_lat, start_lng [25]
## member_casual start_station_name start_lat start_lng stations
## <fct> <chr> <dbl> <dbl> <int>
## 1 member Clark St & Elm St 41.9 -87.6 21166
## 2 member Wells St & Concord Ln 41.9 -87.6 18581
## 3 member Theater on the Lake 41.9 -87.6 16755
## 4 member Wells St & Elm St 41.9 -87.6 16745
## 5 member Dearborn St & Erie St 41.9 -87.6 14814
## 6 member Clark St & Armitage Ave 41.9 -87.6 14572
## 7 member Wells St & Huron St 41.9 -87.6 14520
## 8 member Streeter Dr & Grand Ave 41.9 -87.6 14321
## 9 member Clark St & Lincoln Ave 41.9 -87.6 13856
## 10 member Kingsbury St & Kinzie St 41.9 -87.6 13484
## # ... with 15 more rows
###Rider Usage over Time
MONTHLY
There is a strong seasonal component with both the casual and member riders. There’s a steady rise from March to August which is followed by a spike in September and then a decline for the rest of year.
member_monthly_data <- bike_data %>%
select(member_casual, started_at) %>%
count(strftime(started_at, format="%m"),member_casual)
names(member_monthly_data)[1] <- "month"
member_monthly_data
## month member_casual n
## 1 01 casual 14690
## 2 01 member 68819
## 3 02 casual 8613
## 4 02 member 34383
## 5 03 casual 75641
## 6 03 member 130048
## 7 04 casual 120420
## 8 04 member 177783
## 9 05 casual 216829
## 10 05 member 234164
## 11 06 casual 304189
## 12 06 member 304585
## 13 07 casual 369407
## 14 07 member 322902
## 15 08 casual 341469
## 16 08 member 332916
## 17 09 casual 507607
## 18 09 member 611774
## 19 10 casual 122328
## 20 10 member 215072
## 21 11 casual 72850
## 22 11 member 149085
## 23 12 casual 24492
## 24 12 member 89049
member_monthly_data %>%
select(month, member_casual, n) %>%
ggplot(aes(month, n)) +
geom_point(color='blue') +
facet_wrap(~member_casual)
WEEKLY
Weekly ridership confirms that the month of September was the busiest month for both the casual and member riders as the busiest weeks for all riders were weeks 35 to 38. Week 37 was the busiest for members while week 35 was the busiest for casual riders.
We see the same seasonal pattern in the weekly data as we did in the monthly data.
member_weekly_data <- bike_data %>%
select(member_casual, started_at) %>%
count(strftime(started_at, format="%W"),member_casual) %>%
arrange(desc(n))
names(member_weekly_data)[1] <- "week"
member_weekly_data %>%
head(8)
## week member_casual n
## 1 37 member 151043
## 2 38 member 144036
## 3 35 member 134342
## 4 36 member 132683
## 5 35 casual 127613
## 6 37 casual 124903
## 7 36 casual 119670
## 8 38 casual 113075
member_weekly_data %>%
select(week, member_casual, n) %>%
ggplot(aes(week, n)) +
geom_point(color='darkgreen') +
facet_wrap(~member_casual) +
scale_x_discrete(breaks = seq(10, 60, by = 10))
DAILY
A comparison of daily usage shows spikes on Saturday and Sunday for the casual riders with the lowest usage on Monday through Thursday. The daily usage pattern for members shows a steady increase from Sunday though Wednesday where it peaks, and then a decline/leveling off for the rest of the week.
Daily usage is the first real differentiation that we see between the two rider groups.
member_daily_data <- bike_data %>%
select(member_casual, day_of_week) %>%
count(day_of_week,member_casual)
member_daily_data %>%
select(day_of_week, member_casual, n) %>%
ggplot(aes(day_of_week, n)) +
geom_point(color='red') +
facet_wrap(~member_casual)
HOURLY
The hourly pattern is the same for both classes of riders. However, we see a much larger spike for member riders at 5:00 PM. Demand peaks at that time of the day and then sharply declines as the evening wears on. Demand is closely associated with the availability of sunlight.
member_hour_data <- bike_data %>%
select(member_casual, started_at) %>%
count(strftime(started_at, format="%H"),member_casual)
names(member_hour_data)[1] <- 'hour_of_day'
member_hour_data %>%
select(hour_of_day, member_casual, n) %>%
ggplot(aes(hour_of_day, n)) +
geom_point(color='darkblue') +
facet_wrap(~member_casual) +
theme(axis.text.x = element_text(angle = 90))
DURATION
What is the length of the ride for each class? The boxplot below shows that the average duration for the casual rider is more than double of that for the member rider.
member_duration %>%
ggplot(aes(x=member_casual, y=avg_duration, fill=member_casual)) +
geom_boxplot() +
scale_y_continuous()
member_duration %>%
group_by(member_casual) %>%
summarise(avg_duration = mean(avg_duration))
## # A tibble: 2 x 2
## member_casual avg_duration
## <fct> <drtn>
## 1 casual 33.06502 mins
## 2 member 13.92385 mins
Looking at the daily duration, Sundays and Saturdays see the longest rides in particular by the casual rider. In fact, the most use for the longest period of time is by the casual rider on Sundays.
ggplot(data=member_duration, aes(reorder(day_of_week, -avg_duration), avg_duration, fill=member_casual)) +
geom_col() +
scale_y_continuous() +
facet_wrap(~member_casual) +
xlab("Days of the week") +
ylab("Average Duration")
BIKE TYPEs
There are three types of rideable bikes. 57% of the bikes are the classic styled bikes, 22% are docked bikes, and 21% are electrical bikes.
docked_bike_pct <- round(nrow(bike_data %>% filter(rideable_type =='docked_bike'))/nrow(bike_data),2)
classic_bike_pct <- round(nrow(bike_data %>% filter(rideable_type =='classic_bike'))/nrow(bike_data),2)
electric_bike_pct <- round(nrow(bike_data %>% filter(rideable_type =='electric_bike'))/nrow(bike_data),2)
bikes_pcts <- c(docked_bike_pct, classic_bike_pct,electric_bike_pct)
bike_types <- c('docked_bike','classic_bike','electric_bike')
bikes_df <- data.frame(bike_types,bikes_pcts)
bikes_df
## bike_types bikes_pcts
## 1 docked_bike 0.22
## 2 classic_bike 0.57
## 3 electric_bike 0.21
ggplot(data = bikes_df, aes(x = bike_types, y=bikes_pcts, fill=bike_types)) +
geom_col(color='black') +
labs(title="Percentage of Rideable Types",
subtitle=paste0("Classic Bike: ", classic_bike_pct*100, "% ",
" Docked Bike: ", docked_bike_pct*100, "% ",
" Electric Bike: ",electric_bike_pct*100, "%" )) +
ylab("") +
xlab("")
bike_types <- bike_data %>%
group_by(member_casual, day_of_week, rideable_type) %>%
summarise(avg_duration = mean(duration), no_of_rides = n(), .groups = 'keep')
When looking at the preferences for each rider class, there isn’t a difference among their preferences except that members may want electrical bikes a bit more, but both groups have a strong preference for classic bikes.
bike_types %>%
#filter(member_casual == 'casual') %>%
ggplot(aes(x=day_of_week, y=no_of_rides, fill=member_casual)) +
geom_col() +
facet_wrap(~rideable_type)
However, for docked_bikes, the casual riders take them out for much longer ride times including Sundays and Saturdays.
bike_types %>%
#filter(member_casual == 'casual') %>%
ggplot(aes(x=day_of_week, y=avg_duration, fill=member_casual)) +
geom_col() +
scale_y_continuous() +
facet_wrap(~rideable_type)