Find answers to the following questions: How do annual members and casual riders use Cyclistic bikes differently? Why would casual rider buy Cyclistic annual memberships? How can Cyclistic use digital media to influence casual riders to become members?
Primary stakeholder: Lily Moreno, Director of Marketing Secondary stakeholders: Cyclistic Executive Team
Cyclistic provided data - We will look at all 2022 trip data.
library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
library(dplyr)
library(stringr)
library(data.table)
library("lubridate")
## Loading required package: timechange
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
Read each CSV file
trips_03_2022 <- fread("~/cyclistic_2022/202203-divvy-tripdata.csv")
trips_02_2022 <- fread("~/cyclistic_2022/202202-divvy-tripdata.csv")
trips_01_2022 <- fread("~/cyclistic_2022/202201-divvy-tripdata.csv")
trips_04_2022 <- fread("~/cyclistic_2022/202204-divvy-tripdata.csv")
trips_05_2022 <- fread("~/cyclistic_2022/202205-divvy-tripdata.csv")
trips_06_2022 <- fread("~/cyclistic_2022/202206-divvy-tripdata.csv")
trips_07_2022 <- fread("~/cyclistic_2022/202207-divvy-tripdata.csv")
trips_08_2022 <- fread("~/cyclistic_2022/202208-divvy-tripdata.csv")
trips_09_2022 <- fread("~/cyclistic_2022/202209-divvy-tripdata.csv")
trips_10_2022 <- fread("~/cyclistic_2022/202210-divvy-tripdata.csv")
trips_11_2022 <- fread("~/cyclistic_2022/202211-divvy-tripdata.csv")
trips_12_2022 <- fread("~/cyclistic_2022/202212-divvy-tripdata.csv")
Each CSV file has 13 variables. Check that all variables are in the right order and have the same format using the glimpse function on each file (January data is shown as an example.)
glimpse(trips_01_2022)
## Rows: 103,770
## Columns: 13
## $ ride_id <chr> "C2F7DD78E82EC875", "A6CF8980A652D272", "BD0F91DFF7…
## $ rideable_type <chr> "electric_bike", "electric_bike", "classic_bike", "…
## $ started_at <dttm> 2022-01-13 11:59:47, 2022-01-10 08:41:56, 2022-01-…
## $ ended_at <dttm> 2022-01-13 12:02:44, 2022-01-10 08:46:17, 2022-01-…
## $ start_station_name <chr> "Glenwood Ave & Touhy Ave", "Glenwood Ave & Touhy A…
## $ start_station_id <chr> "525", "525", "TA1306000016", "KA1504000151", "TA13…
## $ end_station_name <chr> "Clark St & Touhy Ave", "Clark St & Touhy Ave", "Gr…
## $ end_station_id <chr> "RP-007", "RP-007", "TA1307000001", "TA1309000021",…
## $ start_lat <dbl> 42.01280, 42.01276, 41.92560, 41.98359, 41.87785, 4…
## $ start_lng <dbl> -87.66591, -87.66597, -87.65371, -87.66915, -87.624…
## $ end_lat <dbl> 42.01256, 42.01256, 41.92533, 41.96151, 41.88462, 4…
## $ end_lng <dbl> -87.67437, -87.67437, -87.66580, -87.67139, -87.627…
## $ member_casual <chr> "casual", "casual", "member", "casual", "member", "…
Create a single dataset using rbind function.
cyclistic_trips_2022 <- rbind(trips_01_2022,
trips_02_2022,
trips_03_2022,
trips_04_2022,
trips_05_2022,
trips_06_2022,
trips_07_2022,
trips_08_2022,
trips_09_2022,
trips_10_2022,
trips_11_2022,
trips_12_2022)
Eliminate any potential duplicates in ride_id. This is primary unique ID.
cyclistic_trips_2022 %>%
distinct(ride_id, .keep_all = TRUE)
## ride_id rideable_type started_at ended_at
## 1: C2F7DD78E82EC875 electric_bike 2022-01-13 11:59:47 2022-01-13 12:02:44
## 2: A6CF8980A652D272 electric_bike 2022-01-10 08:41:56 2022-01-10 08:46:17
## 3: BD0F91DFF741C66D classic_bike 2022-01-25 04:53:40 2022-01-25 04:58:01
## 4: CBB80ED419105406 classic_bike 2022-01-04 00:18:04 2022-01-04 00:33:00
## 5: DDC963BFDDA51EEA classic_bike 2022-01-20 01:31:10 2022-01-20 01:37:12
## ---
## 5667713: 43ABEE85B6E15DCA classic_bike 2022-12-05 06:51:04 2022-12-05 06:54:48
## 5667714: F041C89A3D1F0270 electric_bike 2022-12-14 17:06:28 2022-12-14 17:19:27
## 5667715: A2BECB88430BE156 classic_bike 2022-12-08 16:27:47 2022-12-08 16:32:20
## 5667716: 37B392960E566F58 classic_bike 2022-12-28 09:37:38 2022-12-28 09:41:34
## 5667717: 2DD1587210BA45AE classic_bike 2022-12-09 00:27:25 2022-12-09 00:35:28
## start_station_name start_station_id
## 1: Glenwood Ave & Touhy Ave 525
## 2: Glenwood Ave & Touhy Ave 525
## 3: Sheffield Ave & Fullerton Ave TA1306000016
## 4: Clark St & Bryn Mawr Ave KA1504000151
## 5: Michigan Ave & Jackson Blvd TA1309000002
## ---
## 5667713: Sangamon St & Washington Blvd 13409
## 5667714: Bernard St & Elston Ave 18016
## 5667715: Wacker Dr & Washington St KA1503000072
## 5667716: Sangamon St & Washington Blvd 13409
## 5667717: Southport Ave & Waveland Ave 13235
## end_station_name end_station_id start_lat start_lng
## 1: Clark St & Touhy Ave RP-007 42.01280 -87.66591
## 2: Clark St & Touhy Ave RP-007 42.01276 -87.66597
## 3: Greenview Ave & Fullerton Ave TA1307000001 41.92560 -87.65371
## 4: Paulina St & Montrose Ave TA1309000021 41.98359 -87.66915
## 5: State St & Randolph St TA1305000029 41.87785 -87.62408
## ---
## 5667713: Peoria St & Jackson Blvd 13158 41.88316 -87.65110
## 5667714: Seeley Ave & Roscoe St 13144 41.94998 -87.71402
## 5667715: Green St & Madison St TA1307000120 41.88314 -87.63724
## 5667716: Peoria St & Jackson Blvd 13158 41.88316 -87.65110
## 5667717: Seeley Ave & Roscoe St 13144 41.94815 -87.66394
## end_lat end_lng member_casual
## 1: 42.01256 -87.67437 casual
## 2: 42.01256 -87.67437 casual
## 3: 41.92533 -87.66580 member
## 4: 41.96151 -87.67139 casual
## 5: 41.88462 -87.62783 member
## ---
## 5667713: 41.87764 -87.64962 member
## 5667714: 41.94340 -87.67962 member
## 5667715: 41.88186 -87.64926 member
## 5667716: 41.87764 -87.64962 member
## 5667717: 41.94340 -87.67962 casual
Find and remove any entries with NA values
sapply(cyclistic_trips_2022, function(x) sum(is.na(x)))
## ride_id rideable_type started_at ended_at
## 0 0 0 0
## start_station_name start_station_id end_station_name end_station_id
## 0 0 0 0
## start_lat start_lng end_lat end_lng
## 0 0 5858 5858
## member_casual
## 0
# Entries with NA value for end_lat and end_lng found. All end station data is also blank.
na_rows <- subset(cyclistic_trips_2022, is.na(cyclistic_trips_2022$end_lat))
# Remove entries with NA values
cyclistic_trips_2022_clean <- na.omit(cyclistic_trips_2022)
Check for any possible unexpected results in category columns. Confirm “casual” and “member” are the only two options in the member_casual column
unique(cyclistic_trips_2022_clean$member_casual)
## [1] "casual" "member"
See all rideable type options
unique(cyclistic_trips_2022_clean$rideable_type)
## [1] "electric_bike" "classic_bike" "docked_bike"
Create ride_length column that shows time elapsed between when the ride started and ended.
cyclistic_trips_2022_clean$ride_length <- difftime(cyclistic_trips_2022_clean$ended_at, cyclistic_trips_2022_clean$started_at)
range(cyclistic_trips_2022_clean$ride_length)
## Time differences in secs
## [1] -621201 2061244
Convert ride_length into hours and minutes
cyclistic_trips_2022_clean = cyclistic_trips_2022_clean %>%
mutate(hours = hour(seconds_to_period(cyclistic_trips_2022_clean$ride_length)),
minutes = minute(seconds_to_period(cyclistic_trips_2022_clean$ride_length)),)
Add date, month, and day of the week columns
cyclistic_trips_2022_clean$date <- as.Date(cyclistic_trips_2022_clean$started_at)
cyclistic_trips_2022_clean$month <- format(as.Date(cyclistic_trips_2022_clean$date), "%m")
cyclistic_trips_2022_clean$day_of_week <- format(as.Date(cyclistic_trips_2022_clean$date), "%A")
Remove entries with a negative trip lengths and all docked bikes. Docked bikes have been taken out of circulation so we remove them from our analysis.
trip_data = cyclistic_trips_2022_clean[!(cyclistic_trips_2022_clean$rideable_type == "docked_bike" | cyclistic_trips_2022_clean$ride_length<0),]
Remove any rows with empty string for the start_station_name, end station, or trip id
trip_data = trip_data[!trip_data$ride_id=="",]
trip_data = trip_data[!trip_data$start_station_name=="",]
trip_data = trip_data[!trip_data$end_station_name=="",]
trip_data$day_of_week <- factor(trip_data$day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday", "Sunday"))
#Average ride length vs. day of the week
trip_data %>%
group_by(member_casual, day_of_week) %>%
summarise(average_duration = mean(minutes), .groups = "drop") %>%
ggplot(aes(x = day_of_week, y = average_duration, fill = member_casual)) +
geom_col(width = 0.4, position = position_dodge(width = 0.5)) +
labs(title = "Average ride length vs. day of the week", x = 'Day of Week', y = 'Average Trip Duration')
#Total Ride Count vs. day of the week
trip_data %>%
group_by(member_casual, day_of_week) %>%
summarise(number_of_rides = n(), .groups = "drop") %>%
ggplot(aes(x = day_of_week, y = format(number_of_rides, scientific = FALSE), fill = member_casual)) +
geom_col(width = 0.4, position = position_dodge(width = 0.5)) +
labs(title = "Total Ride Count vs. day of the week", x = 'Day of Week', y = 'Total Rides')
# AVG by month
trip_data %>%
group_by(member_casual, month) %>%
summarise(average_duration = mean(minutes), .groups = "drop") %>%
ggplot(aes(x = month, y = average_duration, fill = member_casual)) +
geom_col(width = 0.4, position = position_dodge(width = 0.5)) +
labs(title = "Average ride length vs. Month", x = 'Month', y = 'Average Trip Duration')
# Total by month
trip_data %>%
group_by(member_casual, month) %>%
summarise(number_of_rides = n(), .groups = "drop") %>%
ggplot(aes(x = month, y = format(number_of_rides, scientific = FALSE), fill = member_casual)) +
geom_col(width = 0.4, position = position_dodge(width = 0.5)) +
labs(title = "Total Ride Count vs. Month", x = 'Month', y = 'Total Rides')
#Find top 5 start stations by ride count for members and casual customers
station_rides_count <- trip_data %>% group_by(start_station_name, member_casual) %>%
summarise(total_count=n(), .groups='drop') %>%
as.data.frame()
top_stations <- station_rides_count %>% # Top N highest values by group
arrange(desc(total_count)) %>%
group_by(member_casual) %>%
slice(1:5)
top_stations
## # A tibble: 10 × 3
## # Groups: member_casual [2]
## start_station_name member_casual total_count
## <chr> <chr> <int>
## 1 Streeter Dr & Grand Ave casual 43478
## 2 DuSable Lake Shore Dr & Monroe St casual 23380
## 3 Michigan Ave & Oak St casual 19589
## 4 DuSable Lake Shore Dr & North Blvd casual 19095
## 5 Millennium Park casual 18244
## 6 Kingsbury St & Kinzie St member 23523
## 7 Clark St & Elm St member 20581
## 8 Wells St & Concord Ln member 19674
## 9 Clinton St & Washington Blvd member 18828
## 10 Loomis St & Lexington St member 18252