Evaluation of the data used:
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(here,
tidyverse,
janitor, # Cleaning column names
scales, # Transform axis scales
ggrepel, # Optimise plot label separation
plyr,
readr,
skimr,
dplyr,
tibble,
lubridate,
ggplot2)
# Import Data and bind it in a single file
# X202101_divvy_tripdata <- read.csv("./202101-divvy-tripdata.csv")
# X202102_divvy_tripdata <- read.csv("./202102-divvy-tripdata.csv")
# X202103_divvy_tripdata <- read.csv("./202103-divvy-tripdata.csv")
# X202104_divvy_tripdata <- read.csv("./202104-divvy-tripdata.csv")
# X202105_divvy_tripdata <- read.csv("./202105-divvy-tripdata.csv")
# X202106_divvy_tripdata <- read.csv("./202106-divvy-tripdata.csv")
# X202107_divvy_tripdata <- read.csv("./202107-divvy-tripdata.csv")
# X202108_divvy_tripdata <- read.csv("./202108-divvy-tripdata.csv")
# X202109_divvy_tripdata <- read.csv("./202109-divvy-tripdata.csv")
# X202110_divvy_tripdata <- read.csv("./202110-divvy-tripdata.csv")
# X202111_divvy_tripdata <- read.csv("./202111-divvy-tripdata.csv")
# X202112_divvy_tripdata <- read.csv("./202112-divvy-tripdata.csv")
# X2021_divvy_tripdata <- rbind(X202101_divvy_tripdata,X202102_divvy_tripdata,X202103_divvy_tripdata,X202104_divvy_tripdata,X202105_divvy_tripdata,X202106_divvy_tripdata,X202107_divvy_tripdata,X202108_divvy_tripdata,X202109_divvy_tripdata,X202110_divvy_tripdata,X202111_divvy_tripdata,X202112_divvy_tripdata)
# Save the combined file so it can be loaded directly in future runs
# write.csv(X2021_divvy_tripdata,"2021_divvy_tripdata.csv")
# Load the pre-combined 2021 trip file into a single tibble.
library(readr)
X2021_divvy_tripdata <- read_csv(file = "2021_divvy_tripdata.csv")
## New names:
## * `` -> ...1
## Rows: 5595063 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (5): ...1, start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-column overview of the raw data (types, missing values, ranges).
# The data frame is passed directly rather than piped so the summary
# reports its name.
library(skimr)
skimr::skim_without_charts(X2021_divvy_tripdata)
| Name | X2021_divvy_tripdata |
| Number of rows | 5595063 |
| Number of columns | 14 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5595063 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 690809 | 0.88 | 3 | 53 | 0 | 847 | 0 |
| start_station_id | 690806 | 0.88 | 3 | 36 | 0 | 834 | 0 |
| end_station_name | 739170 | 0.87 | 10 | 53 | 0 | 844 | 0 |
| end_station_id | 739170 | 0.87 | 3 | 36 | 0 | 832 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| ...1 | 0 | 1 | 2797532.00 | 1615155.71 | 1.00 | 1398766.50 | 2797532.00 | 4196297.50 | 5595063.00 |
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 4771 | 1 | 41.90 | 0.05 | 41.39 | 41.88 | 41.90 | 41.93 | 42.17 |
| end_lng | 4771 | 1 | -87.65 | 0.03 | -88.97 | -87.66 | -87.64 | -87.63 | -87.49 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-01-01 00:02:05 | 2021-12-31 23:59:48 | 2021-08-01 01:52:11 | 4677998 |
| ended_at | 0 | 1 | 2021-01-01 00:08:39 | 2022-01-03 17:32:18 | 2021-08-01 02:21:55 | 4671372 |
# Prepare the trip data for analysis ----
# plyr is attached after the tidyverse in the package-loading step and masks
# several dplyr verbs (summarise, mutate, arrange, ...). Detach it so the
# dplyr versions are the ones used below.
library(dplyr, warn.conflicts = FALSE)
# Suppress summarise info
options(dplyr.summarise.inform = FALSE)
library(lubridate)
# Only detach when plyr is actually attached, so re-running this section in
# the same session does not stop with an error.
if ("package:plyr" %in% search()) {
  detach("package:plyr", character.only = TRUE)
}
library(ggplot2)
library(scales)

# read_csv() already parses started_at / ended_at as date-times. Re-parsing a
# POSIXct column with ymd_hms() round-trips it through as.character(), which
# can drop the time part of exact-midnight values and turn them into NA.
# Parse only when a column is not a date-time yet (e.g. data loaded with
# read.csv(), where the columns arrive as text).
X2021_divvy_tripdata <- X2021_divvy_tripdata %>%
  mutate(across(
    c(started_at, ended_at),
    ~ if (is.POSIXct(.x)) .x else ymd_hms(.x)
  ))

# Derived columns:
#   ride_length - ride duration in minutes (ended_at minus started_at)
#   date_month  - day of the month of the ride start
#   month_year  - month label of the ride start
#   day_week    - weekday label of the ride start
X2021_divvy_tripdata <- X2021_divvy_tripdata %>%
  mutate(
    ride_length = as.numeric(difftime(ended_at, started_at, units = "mins")),
    date_month = mday(started_at),
    month_year = month(started_at, label = TRUE),
    day_week = wday(started_at, label = TRUE)
  )
# Ride counts by rider type: members account for more rides than casual riders.
member_types <- X2021_divvy_tripdata %>%
  count(member_casual, name = "Count")
member_types
## # A tibble: 2 x 2
## member_casual Count
## <chr> <int>
## 1 casual 2529005
## 2 member 3066058
# Bar chart of the two counts, with the y axis shown in millions.
member_types_graph <- ggplot(
  data = member_types,
  mapping = aes(
    x = member_casual,
    y = Count,
    fill = member_casual,
    label = member_casual
  )
) +
  geom_col() +
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  labs(
    title = "Comparison between Casuals & Members",
    x = "",
    y = "Count"
  )
member_types_graph
# Total and mean ride duration for each rider type.
memberstatistics <- X2021_divvy_tripdata %>%
  group_by(member_casual) %>%
  summarise(
    total_ridetime = sum(ride_length),
    average_ride = mean(ride_length)
  )

# Total ride time: casual riders exceed members.
total_ridesgraph <- ggplot(
  memberstatistics,
  aes(x = member_casual, y = total_ridetime, fill = member_casual)
) +
  geom_col() +
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  labs(
    title = "Total ride time (Million Minutes) in the year of 2021",
    subtitle = "Comparison between Casuals & Members",
    x = "",
    y = "Total Ridetime"
  )
total_ridesgraph

# Average ride time: about 14 minutes for a member versus about 32 minutes
# for a casual rider.
average_ridesgraph <- ggplot(
  memberstatistics,
  aes(x = member_casual, y = average_ride, fill = member_casual)
) +
  geom_col() +
  scale_y_continuous(labels = unit_format(unit = "minutes", scale = 1)) +
  labs(
    title = "Average ride time in the year of 2021",
    subtitle = "Comparison between Casuals & Members",
    x = "",
    y = "Average Ridetime"
  )
average_ridesgraph
# Total ride time per month and rider type. In the winter months members
# accumulate more ride time than casual riders; in summer casual riders reach
# close to three times the members' total.
# Note: `total_rides` holds summed ride time, not a number of rides.
monthstatistics <- X2021_divvy_tripdata %>%
  group_by(month_year, member_casual) %>%
  summarise(total_rides = sum(ride_length))
# Largest monthly totals first.
arrange(monthstatistics, desc(total_rides))
## # A tibble: 24 x 3
## # Groups: month_year [12]
## month_year member_casual total_rides
## <ord> <chr> <dbl>
## 1 Jul casual 14495092.
## 2 Jun casual 13760209.
## 3 Aug casual 11879300.
## 4 Sep casual 10121597.
## 5 May casual 9822147.
## 6 Oct casual 7376149.
## 7 Aug member 5523020.
## 8 Jul member 5415982.
## 9 Sep member 5387437.
## 10 Jun member 5268063.
## # ... with 14 more rows
# One line per rider type across the twelve months.
monthstatistics_graph <- ggplot(
  monthstatistics,
  aes(
    x = month_year,
    y = total_rides,
    group = member_casual,
    color = member_casual
  )
) +
  geom_line() +
  scale_y_continuous(
    labels = unit_format(unit = "M minutes", scale = 1e-6)
  ) +
  labs(
    title = "Ride time (Million Minutes) of users per month [2021]",
    subtitle = "Comparison between Casuals & Members",
    x = "",
    y = "Total Ride time"
  )
monthstatistics_graph
# Mean ride time per month and rider type.
avgmonthstatistics <- X2021_divvy_tripdata %>%
  group_by(month_year, member_casual) %>%
  summarise(average_ride = mean(ride_length))
# Longest monthly averages first.
arrange(avgmonthstatistics, desc(average_ride))
## # A tibble: 24 x 3
## # Groups: month_year [12]
## month_year member_casual average_ride
## <ord> <chr> <dbl>
## 1 Feb casual 49.4
## 2 May casual 38.2
## 3 Mar casual 38.2
## 4 Apr casual 38.0
## 5 Jun casual 37.1
## 6 Jul casual 32.8
## 7 Aug casual 28.8
## 8 Oct casual 28.7
## 9 Sep casual 27.8
## 10 Jan casual 25.7
## # ... with 14 more rows
# Casual riders' average ride time stays well above the members' all year and
# peaks in February.
avgmonthstatistics_graph <- ggplot(
  avgmonthstatistics,
  aes(
    x = month_year,
    y = average_ride,
    group = member_casual,
    color = member_casual
  )
) +
  geom_line() +
  scale_y_continuous(labels = unit_format(unit = "minutes")) +
  labs(
    title = "Average ride time per month [2021]",
    subtitle = "Comparison between Casuals & Members",
    x = "",
    y = "Average Ridetime"
  )
avgmonthstatistics_graph
# Number of rides per month in 2021, separated by rider type.
# Each row of the trip data is one ride, so tally() counts rides, not distinct
# users; the chart labels say "rides" accordingly.
usermonthstatistics <- X2021_divvy_tripdata %>%
  group_by(month_year, member_casual) %>%
  tally(name = "Count")
usermonthstatistics_graph <- usermonthstatistics %>%
  ggplot(aes(
    x = month_year,
    y = Count,
    group = member_casual,
    color = member_casual
  )) +
  geom_line() +
  scale_y_continuous(labels = unit_format(unit = "")) +
  labs(
    title = "Number of rides per month [2021]",
    subtitle = "Comparison between Casuals & Members",
    x = "",
    y = "Number of rides"
  )
usermonthstatistics_graph
library(dplyr)
# Mean ride time per day of the week and rider type.
# Note: `average_rides` holds the mean ride time, not a number of rides.
daystatistics <- X2021_divvy_tripdata %>%
  group_by(day_week, member_casual) %>%
  summarise(average_rides = mean(ride_length))
# Observations recorded in the original analysis: Sunday stands out for both
# members and casual riders, and casual riders have more ride time than
# members even on weekdays. (This chart shows average ride time, not ride
# counts.)
daystatistics_graph <- daystatistics %>%
  ggplot(aes(
    x = day_week,
    y = average_rides,
    group = member_casual,
    color = member_casual
  )) +
  geom_line() +
  scale_y_continuous(labels = unit_format(unit = "minutes")) +
  # The y axis shows average ride time in minutes; it was previously
  # mislabelled "Users Count".
  labs(
    title = "Average ride time per day of the week [2021]",
    subtitle = "Comparison between Casuals & Members",
    x = "",
    y = "Average Ridetime"
  )
daystatistics_graph
# Total ride time per day of the month (1-31) and rider type, pooled over all
# months of the year. Grouping is dropped from the result.
datestatistics <- X2021_divvy_tripdata %>%
  group_by(date_month, member_casual) %>%
  summarise(
    total_rides = sum(ride_length),
    .groups = "drop"
  )
datestatistics_graph <- ggplot(
  datestatistics,
  aes(
    x = date_month,
    y = total_rides,
    group = member_casual,
    color = member_casual
  )
) +
  geom_line() +
  scale_y_continuous(
    labels = unit_format(unit = "M minutes", scale = 1e-6)
  ) +
  labs(
    title = "Ride time of users according to date of the month [2021]",
    subtitle = "Comparison between Casuals & Members",
    x = "",
    y = "Total Ride time"
  )
datestatistics_graph
# Bike usage per month: number of rides for each month, rider type and bike
# type.
bike_monthstatistics <- X2021_divvy_tripdata %>%
  group_by(month_year, member_casual, rideable_type) %>%
  tally(name = "count_bike")
# Stacked bars per month (one colour per bike type), one panel per rider type.
# The plotted value is a ride count, so the title, y label and axis unit refer
# to rides rather than ride time / minutes. The `shape` aesthetic was dropped
# because bars do not use it.
bike_monthstatistics_graph <- bike_monthstatistics %>%
  ggplot(aes(x = month_year, y = count_bike, fill = rideable_type)) +
  geom_col() +
  facet_wrap(~member_casual) +
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Number of rides by bike type per month [2021]",
    subtitle = "Comparison between Casuals & Members",
    x = "",
    y = "Number of rides"
  )
bike_monthstatistics_graph
# The most popular starting points.
# Count rides per rider type and start station, most frequent first. Rides
# with no recorded start station are excluded up front. The result stays
# grouped by rider type so the "top 5" below is taken per rider type.
start_station <- X2021_divvy_tripdata %>%
  filter(!is.na(start_station_name)) %>%
  count(member_casual, start_station_name, name = "Count", sort = TRUE) %>%
  group_by(member_casual)
# Five busiest start stations for each rider type (ties are kept).
top_starting_station <- start_station %>%
  slice_max(Count, n = 5)
# Bars ordered from most to least used station; a station that is in the top
# five of both rider types appears as one stacked bar.
top_starting_stationgraph <- top_starting_station %>%
  ggplot(aes(
    x = reorder(start_station_name, -Count),
    y = Count,
    fill = member_casual
  )) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Starting station for Users",
    subtitle = "Comparison between Casuals & Members",
    x = "Station Name",
    y = "Number of rides"
  )
top_starting_stationgraph
# Popular routes: label every ride as "<start station> TO <end station>" and
# list the most frequent routes for casual riders.
X2021_divvy_tripdata$route <- paste(
  X2021_divvy_tripdata$start_station_name,
  X2021_divvy_tripdata$end_station_name,
  sep = " TO "
)
# paste() writes a missing station as the text "NA", so the route itself is
# never NA and na.omit() cannot remove such rows. Rides with a missing start
# or end station are therefore filtered out explicitly, instead of dropping
# the first two rows of the sorted result by position.
# The result is sorted by descending count and kept grouped by route, as
# before.
routes <- X2021_divvy_tripdata %>%
  filter(!is.na(start_station_name), !is.na(end_station_name)) %>%
  count(route, member_casual, name = "count", sort = TRUE) %>%
  group_by(route)
casualroutes <- routes %>%
  filter(member_casual == "casual")
# Thirteen most frequent casual routes (ties are kept).
top_casualroutes <- casualroutes %>%
  ungroup() %>%
  slice_max(count, n = 13)
top_casualroutes
## # A tibble: 13 x 3
## route member_casual count
## <chr> <chr> <int>
## 1 Streeter Dr & Grand Ave TO Streeter Dr & Grand Ave casual 11683
## 2 Millennium Park TO Millennium Park casual 6111
## 3 Michigan Ave & Oak St TO Michigan Ave & Oak St casual 5900
## 4 Lake Shore Dr & Monroe St TO Lake Shore Dr & Monroe St casual 4669
## 5 Buckingham Fountain TO Buckingham Fountain casual 3445
## 6 Streeter Dr & Grand Ave TO Millennium Park casual 3309
## 7 Theater on the Lake TO Theater on the Lake casual 3219
## 8 DuSable Lake Shore Dr & Monroe St TO DuSable Lake Shore ~ casual 3116
## 9 Montrose Harbor TO Montrose Harbor casual 3028
## 10 Shedd Aquarium TO Shedd Aquarium casual 2931
## 11 Millennium Park TO Streeter Dr & Grand Ave casual 2927
## 12 Indiana Ave & Roosevelt Rd TO Indiana Ave & Roosevelt Rd casual 2840
## 13 Michigan Ave & 8th St TO Michigan Ave & 8th St casual 2827
# Bars ordered from most to least frequent route.
top_casualroutesgraph <- top_casualroutes %>%
  ggplot(aes(
    x = reorder(route, -count),
    y = count,
    fill = member_casual
  )) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Routes for Casual users",
    subtitle = "Similar routes for Casuals:",
    x = "Route",
    y = "Number of rides"
  )
top_casualroutesgraph