Setting up my R environment by loading the “tidyverse” package.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Importing the datasets from 2019 and 2020. These datasets will be used for data cleaning and analysis.
dv2019 <- read_csv("C:/Users/asus/OneDrive/Documents/GOOGLE CERTIFICATE DATA ANALYST/Divvy_Trips_2019_Q1/Divvy_Trips_2019_Q1.csv")
## Rows: 365069 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): start_time, end_time, from_station_name, to_station_name, usertype,...
## dbl (6): trip_id, bikeid, from_station_id, to_station_id, birthyear, day_of_...
## num (1): tripduration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dv2020 <- read_csv("C:/Users/asus/OneDrive/Documents/GOOGLE CERTIFICATE DATA ANALYST/Divvy_Trips_2020_Q1/Divvy_Trips_2020_Q1.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 426887 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, started_at, ended_at, start_station_name, ...
## dbl (7): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## time (1): ride_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Checking the column names of the datasets to identify differences in variable names between the datasets before combining them.
colnames(dv2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
## [13] "ride_length" "day_of_week"
colnames(dv2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
Renaming column names in 2019 dataset to match 2020 dataset structure to ensures consistency between the datasets and allows them to be combined later.
dv2019 <- dv2019 %>%
rename(ride_id = trip_id,
rideable_type = bikeid,
started_at = start_time,
ended_at = end_time,
start_station_name = from_station_name,
start_station_id = from_station_id,
end_station_name = to_station_name,
end_station_id = to_station_id,
member_casual = usertype)
Checking the data type of each column in the data frame to check for inconsistencies.
str(dv2019)
## spc_tbl_ [365,069 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : num [1:365069] 21742443 21742444 21742445 21742446 21742447 ...
## $ started_at : chr [1:365069] "1/1/2019 0:04" "1/1/2019 0:08" "1/1/2019 0:13" "1/1/2019 0:13" ...
## $ ended_at : chr [1:365069] "1/1/2019 0:11" "1/1/2019 0:15" "1/1/2019 0:27" "1/1/2019 0:43" ...
## $ rideable_type : num [1:365069] 2167 4386 1524 252 1170 ...
## $ tripduration : num [1:365069] 390 441 829 1783 364 ...
## $ start_station_id : num [1:365069] 199 44 15 123 173 98 98 211 150 268 ...
## $ start_station_name: chr [1:365069] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ end_station_id : num [1:365069] 84 624 644 176 35 49 49 142 148 141 ...
## $ end_station_name : chr [1:365069] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ member_casual : chr [1:365069] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:365069] "Male" "Female" "Female" "Male" ...
## $ birthyear : num [1:365069] 1989 1990 1994 1993 1994 ...
## $ ride_length : chr [1:365069] "0:06:30" "0:07:21" "0:13:49" "0:29:43" ...
## $ day_of_week : num [1:365069] 3 3 3 3 3 3 3 3 3 3 ...
## - attr(*, "spec")=
## .. cols(
## .. trip_id = col_double(),
## .. start_time = col_character(),
## .. end_time = col_character(),
## .. bikeid = col_double(),
## .. tripduration = col_number(),
## .. from_station_id = col_double(),
## .. from_station_name = col_character(),
## .. to_station_id = col_double(),
## .. to_station_name = col_character(),
## .. usertype = col_character(),
## .. gender = col_character(),
## .. birthyear = col_double(),
## .. ride_length = col_character(),
## .. day_of_week = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(dv2020)
## spc_tbl_ [426,887 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:426887] "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
## $ rideable_type : chr [1:426887] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : chr [1:426887] "1/21/2020 20:06" "1/30/2020 14:22" "1/9/2020 19:29" "1/6/2020 16:17" ...
## $ ended_at : chr [1:426887] "1/21/2020 20:14" "1/30/2020 14:26" "1/9/2020 19:32" "1/6/2020 16:25" ...
## $ start_station_name: chr [1:426887] "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
## $ start_station_id : num [1:426887] 239 234 296 51 66 212 96 96 212 38 ...
## $ end_station_name : chr [1:426887] "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
## $ end_station_id : num [1:426887] 326 318 117 24 212 96 212 212 96 100 ...
## $ start_lat : num [1:426887] 42 42 41.9 41.9 41.9 ...
## $ start_lng : num [1:426887] -87.7 -87.7 -87.6 -87.6 -87.6 ...
## $ end_lat : num [1:426887] 42 42 41.9 41.9 41.9 ...
## $ end_lng : num [1:426887] -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ member_casual : chr [1:426887] "member" "member" "member" "member" ...
## $ ride_length : 'hms' num [1:426887] 00:07:31 00:03:43 00:02:51 00:08:49 ...
## ..- attr(*, "units")= chr "secs"
## $ day_of_week : num [1:426887] 3 5 5 2 5 6 6 6 6 6 ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_character(),
## .. ended_at = col_character(),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character(),
## .. ride_length = col_time(format = ""),
## .. day_of_week = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
The data types of selected variables for both datasets are adjusted to ensure consistency.
dv2020 <- dv2020 %>%
mutate(ride_length = as.difftime(ride_length))
dv2019 <- dv2019 %>%
mutate(ride_id = as.character(ride_id),
rideable_type = as.character(rideable_type),
ride_length = as.difftime(ride_length))
Combining both datasets into one data frame.
all_trips <- bind_rows(dv2019,dv2020)
Removing colummns that are not requird for the analysis.
all_trips <- all_trips %>%
select(-c(start_lat,start_lng,
end_lat,end_lng,
birthyear,gender,
tripduration))
Examining the structure and basic characteristics of the dataset-checking column names, number of rows, dimensions, previewing the first and last few rows, and reviewing the data structure and summary statistics.
colnames(all_trips)
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "member_casual"
## [10] "ride_length" "day_of_week"
nrow(all_trips)
## [1] 791956
dim(all_trips)
## [1] 791956 11
head(all_trips)
## # A tibble: 6 × 11
## ride_id started_at ended_at rideable_type start_station_id start_station_name
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 21742443 1/1/2019 … 1/1/201… 2167 199 Wabash Ave & Gran…
## 2 21742444 1/1/2019 … 1/1/201… 4386 44 State St & Randol…
## 3 21742445 1/1/2019 … 1/1/201… 1524 15 Racine Ave & 18th…
## 4 21742446 1/1/2019 … 1/1/201… 252 123 California Ave & …
## 5 21742447 1/1/2019 … 1/1/201… 1170 173 Mies van der Rohe…
## 6 21742448 1/1/2019 … 1/1/201… 2437 98 LaSalle St & Wash…
## # ℹ 5 more variables: end_station_id <dbl>, end_station_name <chr>,
## # member_casual <chr>, ride_length <time>, day_of_week <dbl>
tail(all_trips)
## # A tibble: 6 × 11
## ride_id started_at ended_at rideable_type start_station_id start_station_name
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 6F4D221… 3/10/2020… 3/10/20… docked_bike 675 HQ QR
## 2 ADDAA33… 3/10/2020… 3/10/20… docked_bike 675 HQ QR
## 3 82B10FA… 3/7/2020 … 3/7/202… docked_bike 161 Rush St & Superio…
## 4 AA0D5AA… 3/1/2020 … 3/1/202… docked_bike 141 Clark St & Lincol…
## 5 3296360… 3/7/2020 … 3/7/202… docked_bike 672 Franklin St & Ill…
## 6 064EC76… 3/8/2020 … 3/8/202… docked_bike 110 Dearborn St & Eri…
## # ℹ 5 more variables: end_station_id <dbl>, end_station_name <chr>,
## # member_casual <chr>, ride_length <time>, day_of_week <dbl>
str(all_trips)
## tibble [791,956 × 11] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:791956] "21742443" "21742444" "21742445" "21742446" ...
## $ started_at : chr [1:791956] "1/1/2019 0:04" "1/1/2019 0:08" "1/1/2019 0:13" "1/1/2019 0:13" ...
## $ ended_at : chr [1:791956] "1/1/2019 0:11" "1/1/2019 0:15" "1/1/2019 0:27" "1/1/2019 0:43" ...
## $ rideable_type : chr [1:791956] "2167" "4386" "1524" "252" ...
## $ start_station_id : num [1:791956] 199 44 15 123 173 98 98 211 150 268 ...
## $ start_station_name: chr [1:791956] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ end_station_id : num [1:791956] 84 624 644 176 35 49 49 142 148 141 ...
## $ end_station_name : chr [1:791956] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ member_casual : chr [1:791956] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ ride_length : 'hms' num [1:791956] 00:06:30 00:07:21 00:13:49 00:29:43 ...
## ..- attr(*, "units")= chr "secs"
## $ day_of_week : num [1:791956] 3 3 3 3 3 3 3 3 3 3 ...
summary(all_trips)
## ride_id started_at ended_at rideable_type
## Length:791956 Length:791956 Length:791956 Length:791956
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_station_id start_station_name end_station_id end_station_name
## Min. : 2.0 Length:791956 Min. : 2.0 Length:791956
## 1st Qu.: 77.0 Class :character 1st Qu.: 77.0 Class :character
## Median :174.0 Mode :character Median :174.0 Mode :character
## Mean :204.4 Mean :204.4
## 3rd Qu.:291.0 3rd Qu.:291.0
## Max. :675.0 Max. :675.0
## NA's :1
## member_casual ride_length day_of_week
## Length:791956 Min. :00:00:00.000000 Min. :1.00
## Class :character 1st Qu.:00:05:28.000000 1st Qu.:3.00
## Mode :character Median :00:08:57.000000 Median :4.00
## Mean :00:14:22.601495 Mean :3.99
## 3rd Qu.:00:15:09.000000 3rd Qu.:5.00
## Max. :99:16:27.000000 Max. :7.00
## NA's :405
Counting the number of observations of each user type which should only have two categories: member and casual.
table(all_trips$member_casual)
##
## casual Customer member Subscriber
## 48480 23163 378407 341906
Changing the user type labels to make the categories consistent.
all_trips<-all_trips %>%
mutate(member_casual = recode(member_casual,
"Subscriber" = "member",
"Customer" = "casual"))
Removing data consisting test rides and negative ride length (incorrect data).
all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length<0),]
Calculating mean, median, maximum and minimum value of ride length according to user types.
rl_mean_member_casual <- all_trips_v2 %>%
aggregate(ride_length ~ member_casual,FUN = mean)
rl_mean_member_casual
## member_casual ride_length
## 1 casual 2679.6586 secs
## 2 member 696.0927 secs
rl_median_member_casual <- all_trips_v2 %>%
aggregate(ride_length ~ member_casual,FUN = median)
rl_median_member_casual
## member_casual ride_length
## 1 casual 1390
## 2 member 508
rl_max_member_casual <- all_trips_v2 %>%
aggregate(ride_length ~ member_casual,FUN = max)
rl_max_member_casual
## member_casual ride_length
## 1 casual 357387 secs
## 2 member 347120 secs
rl_min_member_casual <- all_trips_v2 %>%
aggregate(ride_length ~ member_casual,FUN = min)
rl_min_member_casual
## member_casual ride_length
## 1 casual 2 secs
## 2 member 1 secs
Changing numeric codes of day of the week (1-7) to actual day names.
all_trips_v2 <- all_trips_v2 %>%
mutate(started_at = mdy_hm(started_at)) %>%
mutate(day_of_week = recode(day_of_week,
"1" = "Sunday",
"2" = "Monday",
"3" = "Tuesday",
"4" = "Wednesday",
"5" = "Thursday",
"6" = "Friday",
"7" = "Saturday"))
head(all_trips_v2$day_of_week)
## [1] "Tuesday" "Tuesday" "Tuesday" "Tuesday" "Tuesday" "Tuesday"
Calculating mean ride length for each combination of user type and day of the week.
aggregate(all_trips_v2$ride_length ~
all_trips_v2$member_casual + all_trips_v2$day_of_week,
FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual Friday 2679.4308 secs
## 2 member Friday 688.3539 secs
## 3 casual Monday 2227.5281 secs
## 4 member Monday 671.8463 secs
## 5 casual Saturday 2578.1526 secs
## 6 member Saturday 761.6883 secs
## 7 casual Sunday 2898.9683 secs
## 8 member Sunday 780.7767 secs
## 9 casual Thursday 2658.6866 secs
## 10 member Thursday 675.2459 secs
## 11 casual Tuesday 2384.9871 secs
## 12 member Tuesday 685.3928 secs
## 13 casual Wednesday 2953.6225 secs
## 14 member Wednesday 684.2431 secs
Converting to numeric allows calculations like mean, median and create charts for analysis.
all_trips_v2$ride_length <- as.numeric(all_trips_v2$ride_length,
units = "secs")
These visualizations contrast the total number of rides with the average ride duration for each user segment. The data reveals a significant inverse relationship:
all_trips_v2 %>%
drop_na(member_casual) %>%
group_by(member_casual) %>%
summarise(number_of_rides = n(),
average_duration = round(mean(ride_length,na.rm=TRUE))) %>%
arrange(member_casual) %>%
ggplot(aes(x = member_casual,y = average_duration,fill = member_casual))+
geom_col()+
geom_text(aes(label = average_duration),vjust = 1.5,color = "black")+
labs(title = "Average Duration Vs. Usertype",
subtitle = "Casual Riders have higher average tripduration than Annual Members by 3.8x",
x = "User Type",
y = "Average Duration (in seconds)")+
scale_fill_manual(name = "User Type",
values = c("casual"="pink","member"="yellow"),
labels = c("Casual Riders","Annual Members"))
all_trips_v2 %>%
drop_na(member_casual) %>%
group_by(member_casual) %>%
summarise(number_of_rides = n(),
average_duration = mean(ride_length,na.rm = TRUE)) %>%
arrange(member_casual) %>%
ggplot(aes(x = member_casual,y = number_of_rides,fill = member_casual))+
geom_col()+
geom_text(aes(label = scales::comma(number_of_rides)),vjust = 1.5,color = "black")+
labs(title = "Total Rides by User Type",
subtitle = "Annual Members have significantly higher number of rides than Casual Riders by 10.6x.",
x = "User Type",
y = "Number of Rides")+
scale_fill_manual(name="User Type",
values=c("casual"="lightblue","member"="lightgreen"),
labels=c("Casual Riders","Annual Members"))
The data reveals a distinct difference in usage patterns between Annual Members and Casual Riders.
all_trips_v2 %>%
mutate(weekday = wday(started_at,label = TRUE)) %>%
drop_na(weekday,member_casual) %>%
group_by(member_casual,weekday) %>%
summarise(number_of_rides = n(),
average_duration = mean(ride_length,na.rm = TRUE),.groups = "drop") %>%
arrange(member_casual,weekday) %>%
ggplot(aes(x = weekday,y = number_of_rides,fill = member_casual))+
geom_col(position = "dodge")+
labs(title = "Weekly Usage Pattern: Member vs. Casual",
subtitle = "Members peak during the work week; Casual riders surge on weekends",
x = "Day of the Week",
y = "Number of Rides")+
scale_fill_manual(name = "User Type",
values = c("casual"="blue","member"="green"),
labels = c("Casual Riders","Annual Members") )
Based on the analysis of Cyclistic’s 2019 and 2020 trip data, I recommend the following strategies to drive Casual-to-Member conversions:
Start a “Weekend Member” pass. Since the data shows Casual Riders thrive on weekends and has 3x longer average usage duration, offering a membership that focuses on weekend leisure benefits could connect with users who are not ready for a full annual commitment.
Gamification️. Collaborate with the marketing team to build “Member-Only Scenic Routes” within the app. Offer digital rewards or points for completing specific scenic paths. These points could eventually be redeemed for a discount on a full Annual Membership.
Exporting the average duration and total number of rides grouping it by week day, user types and time of the day.
counts <- all_trips_v2 %>%
drop_na() %>%
mutate(hour_of_day = hour(started_at)) %>%
group_by(day_of_week,member_casual,hour_of_day) %>%
summarise(mean_ride_length = mean(ride_length,na.rm=TRUE),
number_of_rides = n(),.groups = "drop")
write.csv(counts,"C:/Users/asus/OneDrive/Documents/GOOGLE CERTIFICATE DATA ANALYST/avg_ride_length.csv")