Analyzing NYC’s CitiBike

Created by: Patrick Ingelmo

www.patrickingelmo.com

Overview

Every month, CitiBike releases a treasure trove of data that includes information on each individual trip (e.g. duration, start and end location, subscription type, gender of rider). The data is available here - http://www.citibikenyc.com/system-data. I pulled data for every month in 2014. It’s a massive dataset that contains over 8 million lines. I took a sample of 500,000 lines in order to process the data more quickly. The code to load and clean the data is below, followed by an exploration of who rides Citi Bikes, what types of rides they take and their favorite routes.

Munge Data

library(ggplot2)
library(ggthemes)
library(dplyr)
library(scales)
library(knitr)
library(lubridate)
library(ggmap)

# Read every file in `dir` as a comma-separated trip table, stack the rows
# into one data frame, and derive the trip date and start time-of-day.
#
# dir          directory that holds the monthly trip files
# date_format  strptime-style format of the date part of `starttime`
#
# Returns the stacked data frame with two extra columns: `tripDate` (Date)
# and `time` (character, "HH:MM").
read_citi_dir <- function(dir, date_format) {
  files_list <- list.files(dir, full.names = TRUE)
  if (length(files_list) == 0) {
    stop("no trip files found in ", dir, call. = FALSE)
  }
  # Read each file once and bind a single time; growing the data frame with
  # rbind() inside a loop re-copies all accumulated rows on every iteration.
  monthly <- lapply(
    files_list,
    read.table,
    header = TRUE, sep = ",", stringsAsFactors = FALSE
  )
  trips <- do.call(rbind, monthly)
  # create tripDate field (the time part of `starttime` is ignored here)
  trips$tripDate <- as.Date(trips$starttime, format = date_format)
  # convert character to POSIXct, then keep only the hour and minute
  start_time <- as.POSIXct(
    strptime(trips$starttime, paste(date_format, "%H:%M"))
  )
  trips$time <- strftime(start_time, format = "%H:%M")
  trips
}

#Sep-Dec (dates written as month/day/year)
citi <- read_citi_dir("/Users/pingelmo/Dropbox/data/CitiBike/data", "%m/%d/%Y")

#Jan-Aug (dates written as year-month-day)
citi1 <- read_citi_dir("/Users/pingelmo/Dropbox/data/CitiBike/data1", "%Y-%m-%d")

# The files are now read by full path, so the working directory only needs to
# be set once, to the project folder the original script ended up in.
setwd("/Users/pingelmo/Dropbox/data/CitiBike")

#combine DFs
citi <- rbind(citi, citi1)
rm(citi1)

#create sample
# Fix the RNG seed so the same 500,000 rows are drawn on every run; the text
# and plot annotations in this report quote figures from one specific sample.
set.seed(2014)
citi <- sample_n(citi, 500000)

#convert birth year to numeric (values that cannot be parsed become NA)
citi$birth.year <- as.numeric(citi$birth.year)

#new variables
# numeric day-of-week code from lubridate
citi$wday <- wday(citi$tripDate)
# hour of day taken from the first two characters of the "HH:MM" string
citi$hour <- as.numeric(substr(citi$time, 1, 2))
# tripduration divided by 60
citi$tripMin <- citi$tripduration / 60
# Age at the time of the trip. Using the year of the trip instead of the year
# the script is run keeps the result stable no matter when it is re-run.
citi$age <- year(citi$tripDate) - citi$birth.year
# right = FALSE makes the bins [0,30), [30,40), ... so that, e.g., a
# 30-year-old is counted in "30s" rather than "under 30"; include.lowest = TRUE
# keeps age 100 inside the last bin. Ages above 100 become NA.
citi$age_group <- cut(
  citi$age,
  breaks = c(0, 30, 40, 50, 60, 70, 80, 100),
  labels = c("under 30", "30s", "40s", "50s", "60s", "70s", "over 80"),
  right = FALSE,
  include.lowest = TRUE
)

# Gender codes to labels: 1 is "Male", 2 is "Female", every other code is
# "Unknown"; a missing code stays NA.
citi$gender <- ifelse(
  citi$gender == 2,
  "Female",
  ifelse(citi$gender == 1, "Male", "Unknown")
)

# Day-of-week codes to names (1 = Sunday ... 7 = Saturday) by indexing into a
# lookup vector, stored as a factor whose levels run Monday through Sunday.
citi$wday <- factor(
  c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )[citi$wday],
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)

Who rides Citi Bikes?

Riders skew heavily male. In the sample, 70% of riders were male, 20% were female and 10% were unknown.

Almost 55% of riders were aged 39 or younger.

Female riders tend to be slightly younger than males. Female median age is 35, compared to 37 for males.

90% of riders have subscriptions (1 year passes)

# gender: bar chart of the number of rides per gender label
ggplot(citi, aes(x = gender, fill = gender)) +
  geom_bar(alpha = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(labels = comma) +
  theme_fivethirtyeight() +
  theme(legend.position = "none") +
  ggtitle(expression(atop("Gender of Citi Bikers", atop("70% are Male"))))

# age: density of rider age with a dashed line at the median
# The median is computed once and reused for both the reference line and its
# text label, so the two can never disagree.
age_median <- median(citi$age, na.rm = TRUE)
citi %>%
  ggplot(aes(x = age)) +
  geom_density() +
  theme_fivethirtyeight() +
  theme(axis.text.y = element_blank()) +
  ggtitle("Age Distribution of Citi Bikers") +
  geom_vline(xintercept = age_median, linetype = "dashed", col = "dark grey") +
  annotate("text", x = 50, y = .035, label = paste("Median =", age_median))

# age_group: bar chart of the number of rides per age bracket
citi %>%
  ggplot(aes(x = age_group, fill = age_group)) +
  geom_bar(alpha = .8) +
  theme_fivethirtyeight() +
  scale_fill_brewer(palette = "Set2") +
  # "none" is the value that hides the legend; "false" is not a valid position
  theme(legend.position = "none") +
  ggtitle(expression(atop("Distribution by Age Group", atop("55% are aged 39 or younger")))) +
  scale_y_continuous(labels = comma)

# Age box plot for riders whose gender is recorded as Male or Female.
citi %>%
  filter(gender %in% c("Male", "Female")) %>%
  ggplot(aes(x = gender, y = age, fill = gender)) +
  geom_boxplot(alpha = .4) +
  # Zoom the view to ages 15-60 without discarding rows: ylim() would drop
  # out-of-range ages before the box statistics are computed.
  coord_cartesian(ylim = c(15, 60)) +
  theme_fivethirtyeight() +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set2") +
  ggtitle("Age Distribution by Gender") +
  annotate("text", x = "Female", y = 36, label = "Median = 35") +
  annotate("text", x = "Male", y = 38, label = "Median = 37")

# usertype: bar chart of the number of rides per user type, with a fixed
# text note placed over the Subscriber bar
ggplot(citi, aes(x = usertype, fill = usertype)) +
  geom_bar(alpha = 0.7) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(labels = comma) +
  annotate(
    "text",
    x = "Subscriber", y = 350000,
    label = "90% are Subscribers"
  ) +
  theme_fivethirtyeight() +
  theme(legend.position = "none") +
  ggtitle(
    expression(atop(
      "UserType Distribution",
      atop("Customer = 24 hour/7 Day pass  | Subscriber = Annual Pass")
    ))
  )

Types of Rides

Weekdays are slightly more popular than weekends for ridership.

Weekday rides follow a peak-and-valley distribution, with peaks during the morning rush (8-9) and the afternoon rush (5-6).

Weekday rides have a median length of 10 min, mean of 13.5 min. Weekend rides have a median length of 12 min, mean of 16.6 min.

# rides by day: bar chart of the number of rides per day of the week
ggplot(citi, aes(x = wday, fill = wday)) +
  geom_bar(alpha = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  theme_fivethirtyeight() +
  theme(legend.position = "none") +
  ggtitle(
    expression(atop(
      "More rides on Weekdays than Weekends",
      atop("Number of Rides by day")
    ))
  )

# weekday distribution, graph: density of start hour, one stacked panel per
# day of the week
ggplot(citi, aes(x = hour, fill = factor(wday))) +
  geom_density(alpha = 0.2) +
  facet_wrap(~wday, ncol = 1) +
  theme_fivethirtyeight() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  ggtitle(
    expression(atop(
      "Hourly Distribution of Rides",
      atop("Weekday Peaks during Morning Rush (8am-9am) and Afternoon Rush (5pm-6pm)")
    ))
  )

# weekday distribution: box plot of trip length in minutes per day of the week
citi %>%
  ggplot(aes(x = wday, y = tripMin, fill = wday)) +
  geom_boxplot(alpha = .5) +
  # Zoom to 0-30 minutes without discarding rows: ylim() would drop every
  # trip longer than 30 minutes before the box statistics are computed.
  coord_cartesian(ylim = c(0, 30)) +
  theme_fivethirtyeight() +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none") +
  ggtitle("Weekend Rides Last Slightly Longer")

How Usertype Affects Ridership

Subscribers clearly use Citi Bikes for commuting to work. Most rides are centered around the morning and afternoon peaks.

Subscribers and customers have similar density curves during the weekend.

Subscribers also have far shorter trips, averaging 13 min compared to 27 min for customers!

# time of day distribution by usertype: overlaid densities of start hour,
# one curve per user type
ggplot(citi, aes(x = hour, group = usertype, fill = usertype)) +
  geom_density(alpha = 0.3) +
  theme_fivethirtyeight() +
  theme(
    legend.title = element_blank(),
    axis.text.y = element_blank()
  ) +
  ggtitle(
    expression(atop(
      "How Usertype Affects Time of Day Distribution",
      atop("Customer = 24 hour/7 Day pass  | Subscriber = Annual Pass")
    ))
  )

# time of day distribution by usertype, weekday: the same overlaid densities,
# split into one stacked panel per day of the week
ggplot(citi, aes(x = hour, group = usertype, fill = usertype)) +
  geom_density(alpha = 0.3) +
  facet_wrap(~wday, ncol = 1) +
  theme_fivethirtyeight() +
  theme(
    legend.title = element_blank(),
    axis.text.y = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  ggtitle(
    expression(atop(
      "How Usertype Affects Hourly Distribution",
      atop("Subscribers are using CitiBikes for commuting")
    ))
  )

# trip duration by usertype: box plot of trip length in minutes per user type
citi %>%
  ggplot(aes(x = usertype, y = tripMin, fill = usertype)) +
  geom_boxplot(alpha = .5) +
  # Zoom to 0-30 minutes without discarding rows: ylim() would drop every
  # trip longer than 30 minutes before the box statistics are computed, so
  # the drawn medians would not be the medians of all trips.
  coord_cartesian(ylim = c(0, 30)) +
  theme_fivethirtyeight() +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none") +
  ggtitle(expression(atop("Subscribers Take Far Shorter Rides", atop("Minutes per Ride")))) +
  annotate("text", x = "Customer", y = 19, label = "Median = 18") +
  annotate("text", x = "Subscriber", y = 11, label = "Median = 9")

Most Popular Routes

Citi Bike also provides data on the start and end locations of every trip. By combining these two columns, we can get a sense for what the most popular start and end combinations are.

Four of the top 10 routes, including the most popular, are rides around Central Park.

Grand Central (E 43rd St & Vanderbilt Ave) is a popular start location, with riders going to Penn Station (W 33rd St & 7th Ave) and Port Authority (W 41st St & 8th Ave).

#create route field: "<start station name> -> <end station name>"
citi$route <- paste(citi$start.station.name, citi$end.station.name, sep = " -> ")

# map of NY, reused as the base layer for the station maps
# NOTE(review): qmap() geocodes the address string over the network; recent
# ggmap releases appear to need a registered API key for this - confirm.
NY <- qmap("23rd street and 5th ave,new york", zoom = 13, color = "bw")

# 10 most popular routes
# head(n = 10) on the sorted counts returns exactly ten rows; top_n() without
# an explicit `wt` guesses the ranking column (with a message) and returns
# extra rows when counts tie at the cut-off.
citi %>%
  group_by(route) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(n = 10) %>%
  kable(format = "markdown")

route	count
Central Park S & 6 Ave -> Central Park S & 6 Ave	749
Broadway & W 60 St -> Broadway & W 60 St	356
Grand Army Plaza & Central Park S -> Grand Army Plaza & Central Park S	345
E 43 St & Vanderbilt Ave -> W 41 St & 8 Ave	236
W 17 St & 8 Ave -> 8 Ave & W 31 St	197
W 21 St & 6 Ave -> W 22 St & 10 Ave	195
Grand Army Plaza & Central Park S -> Broadway & W 60 St	193
W 21 St & 6 Ave -> 9 Ave & W 22 St	189
Centre St & Chambers St -> Centre St & Chambers St	184
Washington Square E -> University Pl & E 14 St	184

# most popular start station
# Count rides per start station (name plus coordinates), sort by descending
# count and keep the first ten rows.
start <- citi %>%
  group_by(
    start.station.name,
    start.station.latitude,
    start.station.longitude
  ) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  head(n = 10)

# Draw those stations on the base map, with point size scaled by ride count.
NY +
  geom_point(
    data = start,
    aes(x = start.station.longitude, y = start.station.latitude, size = count),
    col = "dark green",
    alpha = 0.65
  ) +
  scale_size_continuous(range = c(6, 15)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = rel(1.5))
  ) +
  ggtitle("Most Popular Start Stations")

start.station.name	start.station.latitude	start.station.longitude	count
8 Ave & W 31 St	40.75045	-73.99481	6095
Lafayette St & E 8 St	40.73029	-73.99076	5345
E 17 St & Broadway	40.73705	-73.99009	5006
W 21 St & 6 Ave	40.74174	-73.99416	4568
Broadway & E 14 St	40.73455	-73.99074	3999
West St & Chambers St	40.71755	-74.01322	3893
Cleveland Pl & Spring St	40.72182	-73.99720	3831
Broadway & E 22 St	40.74034	-73.98955	3758
Broadway & W 24 St	40.74235	-73.98915	3645
University Pl & E 14 St	40.73493	-73.99201	3643

# most popular end station
# Count rides per end station (name plus coordinates), sort by descending
# count and keep the first ten rows.
end <- citi %>%
  group_by(
    end.station.name,
    end.station.latitude,
    end.station.longitude
  ) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  head(n = 10)

# Draw those stations on the base map, with point size scaled by ride count.
NY +
  geom_point(
    data = end,
    aes(x = end.station.longitude, y = end.station.latitude, size = count),
    col = "dark green",
    alpha = 0.65
  ) +
  scale_size_continuous(range = c(6, 15)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = rel(1.5))
  ) +
  ggtitle("Most Popular End Stations")

end.station.name	end.station.latitude	end.station.longitude	count
Lafayette St & E 8 St	40.73029	-73.99076	5282
E 17 St & Broadway	40.73705	-73.99009	5138
8 Ave & W 31 St	40.75045	-73.99481	4956
W 21 St & 6 Ave	40.74174	-73.99416	4623
Broadway & E 14 St	40.73455	-73.99074	4004
University Pl & E 14 St	40.73493	-73.99201	3972
West St & Chambers St	40.71755	-74.01322	3842
Broadway & E 22 St	40.74034	-73.98955	3766
Greenwich Ave & 8 Ave	40.73902	-74.00264	3745
Cleveland Pl & Spring St	40.72182	-73.99720	3731