Created by: Patrick Ingelmo
www.patrickingelmo.com

Overview

Every month, CitiBike releases a treasure trove of data that includes information on each individual trip (e.g. duration, start and end location, subscription type, gender of rider). The data is available here - http://www.citibikenyc.com/system-data. I pulled data for every month in 2014. It’s a massive dataset that contains over 8 million lines, so I took a sample of 500,000 lines in order to process the data more quickly. The code to load and clean the data is below, followed by an exploration of who rides Citi Bikes, what types of rides they take and their favorite routes.

Munge Data

library(ggplot2)
library(ggthemes)
library(dplyr)
library(scales)
library(knitr)
library(lubridate)
library(ggmap)

#load 2014 trip data
#The monthly CSVs are kept in two folders because `starttime` is written in
#two layouts: "%m/%d/%Y" for Sep-Dec ("data") and "%Y-%m-%d" for Jan-Aug
#("data1").
data_root <- "/Users/pingelmo/Dropbox/data/CitiBike"

#Read every CSV in `dir` into one data frame and derive the trip date/time.
#  dir         - folder holding the monthly trip CSVs
#  date_format - strptime format of the date part of `starttime`
#Returns the stacked trips with two extra columns: `tripDate` (Date) and
#`time` (character, "HH:MM").
read_trips <- function(dir, date_format) {
  csv_files <- list.files(dir, pattern = "\\.csv$", full.names = TRUE,
                          ignore.case = TRUE)
  if (length(csv_files) == 0) {
    stop("No CSV files found in ", dir, call. = FALSE)
  }
  #read each file once and bind in a single call; growing a data frame with
  #rbind() inside a loop re-copies all earlier rows on every iteration
  monthly <- lapply(csv_files, read.table, header = TRUE, sep = ",",
                    stringsAsFactors = FALSE)
  trips <- do.call(rbind, monthly)
  #create tripDate field; as.Date stops reading once the format is satisfied,
  #so the time-of-day text after the date is ignored
  trips$tripDate <- as.Date(trips$starttime, format = date_format)
  #convert character to POSIXct, then keep only the clock time as "HH:MM"
  start_time <- as.POSIXct(
    strptime(trips$starttime, paste(date_format, "%H:%M"))
  )
  trips$time <- strftime(start_time, format = "%H:%M")
  trips
}

#Sep-Dec
citi_late <- read_trips(file.path(data_root, "data"), "%m/%d/%Y")

#Jan-Aug
citi_early <- read_trips(file.path(data_root, "data1"), "%Y-%m-%d")

#combine DFs (Sep-Dec rows first, then Jan-Aug)
citi <- rbind(citi_late, citi_early)
rm(citi_late, citi_early)

#work from the project folder for the rest of the script
setwd(data_root)

#create sample
#fix the RNG seed so the 500,000-row sample, and every figure quoted from it,
#is the same on each run
set.seed(2014)
citi <- sample_n(citi, 500000)

#convert birth year to numeric (entries that are not numbers become NA)
citi$birth.year <- as.numeric(citi$birth.year)

#new variables
#numeric day of week from lubridate; renamed to day names further down
citi$wday <- wday(citi$tripDate)
#hour of day, taken from the first two characters of the "HH:MM" string
citi$hour <- as.numeric(substr(citi$time, 1, 2))
#trip length in minutes (assumes tripduration is recorded in seconds)
citi$tripMin <- citi$tripduration / 60
#age in the year the trip was taken; using today's year instead would make
#every rider one year older each year the script is re-run
citi$age <- year(citi$tripDate) - citi$birth.year
#left-closed bins so each decade label matches its ages: [0,30) is
#"under 30", [30,40) is "30s", ... and the last bin is [80,100]
citi$age_group <- cut(
  citi$age,
  breaks = c(0, 30, 40, 50, 60, 70, 80, 100),
  labels = c("under 30", "30s", "40s", "50s", "60s", "70s", "over 80"),
  right = FALSE,
  include.lowest = TRUE
)

#convert gender vars to name
#code 1 is "Male" and code 2 is "Female"; any other code is reported as
#"Unknown", and a missing code stays NA
gender_code <- citi$gender
citi$gender <- ifelse(
  gender_code == 2,
  "Female",
  ifelse(gender_code == 1, "Male", "Unknown")
)
rm(gender_code)

#rename weekdays
#the numeric codes run 1 = Sunday ... 7 = Saturday, so the code doubles as an
#index into the names; levels are ordered Monday-first so plots run Monday to
#Sunday
day_names <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday",
               "Friday", "Saturday")
citi$wday <- factor(
  day_names[citi$wday],
  levels = c(day_names[-1], day_names[1])
)
rm(day_names)

Who rides Citi Bikes?

  • Riders skew heavily male. In the sample, 70% of riders were male, 20% were female and 10% were unknown.
  • Almost 55% of riders were aged 39 or younger.
  • Female riders tend to be slightly younger than males. Female median age is 35, compared to 37 for males.
  • 90% of riders have subscriptions (1 year passes)

    # gender
    # bar chart of trip counts per gender, with comma-formatted counts on the y axis
    ggplot(citi, aes(x = gender, fill = gender)) +
      geom_bar(alpha = .8) +
      theme_fivethirtyeight() +
      scale_fill_brewer(palette = "Set2") +
      theme(legend.position = "none") +
      ggtitle(expression(atop("Gender of Citi Bikers", atop("70% are Male")))) +
      scale_y_continuous(labels = comma)

    # age
    # density of rider age with a dashed line at the sample median; the label is
    # built from the same value so the text always agrees with the line
    median_age <- median(citi$age, na.rm = TRUE)
    ggplot(citi, aes(x = age)) +
      geom_density() +
      theme_fivethirtyeight() +
      theme(axis.text.y = element_blank()) +
      ggtitle("Age Distribution of Citi Bikers") +
      geom_vline(xintercept = median_age, linetype = "dashed", col = "dark grey") +
      annotate("text", x = 50, y = .035, label = paste("Median =", median_age))

    # age_group
    # bar chart of trip counts per age group; the legend is hidden because the
    # x axis already names each group ("none" is the value ggplot2 accepts for that)
    ggplot(citi, aes(x = age_group, fill = age_group)) +
      geom_bar(alpha = .8) +
      theme_fivethirtyeight() +
      scale_fill_brewer(palette = "Set2") +
      theme(legend.position = "none") +
      ggtitle(expression(atop("Distribution by Age Group", atop("55% are aged 39 or younger")))) +
      scale_y_continuous(labels = comma)

    # age by gender, riders with a recorded gender only
    # coord_cartesian() zooms the y axis to 15-60 without discarding rows, so the
    # box statistics are computed from every rider; ylim() would drop riders
    # outside the range before the medians and quartiles are calculated
    citi %>%
      filter(gender %in% c("Male", "Female")) %>%
      ggplot(aes(x = gender, y = age, fill = gender)) +
      geom_boxplot(alpha = .4) +
      coord_cartesian(ylim = c(15, 60)) +
      theme_fivethirtyeight() +
      theme(legend.position = "none") +
      scale_fill_brewer(palette = "Set2") +
      ggtitle("Age Distribution by Gender") +
      annotate("text", x = "Female", y = 36, label = "Median = 35") +
      annotate("text", x = "Male", y = 38, label = "Median = 37")

    # usertype
    # trip counts per user type; the subtitle spells out what each type means
    ggplot(citi, aes(x = usertype, fill = usertype)) +
      geom_bar(alpha = .7) +
      theme_fivethirtyeight() +
      theme(legend.position = "none") +
      ggtitle(expression(atop(
        "UserType Distribution",
        atop("Customer = 24 hour/7 Day pass  | Subscriber = Annual Pass")
      ))) +
      scale_fill_brewer(palette = "Set2") +
      scale_y_continuous(labels = comma) +
      annotate("text", x = "Subscriber", y = 350000, label = "90% are Subscribers")

Types of Rides

  • Weekdays are slightly more popular than weekends for ridership.
  • Weekday rides follow a peak-and-valley distribution, with peaks during the morning rush (8-9) and the afternoon rush (5-6).
  • Weekday rides have a median length of 10 min, mean of 13.5 min. Weekend rides have a median length of 12 min, mean of 16.6 min.
    # rides by day
    # one bar per day of the week, in the factor's level order
    ggplot(citi, aes(x = wday, fill = wday)) +
      geom_bar(alpha = .8) +
      theme_fivethirtyeight() +
      scale_fill_brewer(palette = "Set2") +
      theme(legend.position = "none") +
      ggtitle(expression(atop(
        "More rides on Weekdays than Weekends",
        atop("Number of Rides by day")
      )))

    # weekday distribution, graph
    # density of start hour, one panel per day stacked in a single column
    ggplot(citi, aes(x = hour, fill = factor(wday))) +
      geom_density(alpha = .2) +
      facet_wrap(~wday, ncol = 1) +
      theme_fivethirtyeight() +
      theme(
        legend.position = "none",
        axis.text.y = element_blank(),
        plot.title = element_text(hjust = .5)
      ) +
      ggtitle(expression(atop(
        "Hourly Distribution of Rides",
        atop("Weekday Peaks during Morning Rush (8am-9am) and Afternoon Rush (5pm-6pm)")
      )))

    # weekday distribution
    # trip length per day of the week; coord_cartesian() zooms to 0-30 minutes
    # while keeping longer trips in the box statistics (ylim() would drop them
    # before the medians and quartiles are computed)
    ggplot(citi, aes(x = wday, y = tripMin, fill = wday)) +
      geom_boxplot(alpha = .5) +
      coord_cartesian(ylim = c(0, 30)) +
      theme_fivethirtyeight() +
      scale_fill_brewer(palette = "Set2") +
      theme(legend.position = "none") +
      ggtitle("Weekend Rides Last Slightly Longer")

How Usertype Affects Ridership

  • Subscribers clearly use Citi Bikes for commuting to work. Most rides are centered around the morning and afternoon peaks.
  • Subscribers and customers have similar density curves during the weekend.
  • Subscribers also have far shorter trips, averaging 13 min compared to 27 min for customers!
    #time of day distribution by usertype
    # overlaid start-hour densities, one curve per user type
    ggplot(citi, aes(x = hour, group = usertype, fill = usertype)) +
      geom_density(alpha = .3) +
      theme_fivethirtyeight() +
      theme(legend.title = element_blank(), axis.text.y = element_blank()) +
      ggtitle(expression(atop(
        "How Usertype Affects Time of Day Distribution",
        atop("Customer = 24 hour/7 Day pass  | Subscriber = Annual Pass")
      )))

    #time of day distribution by usertype, weekday
    # same overlaid densities, split into one panel per day of the week
    ggplot(citi, aes(x = hour, group = usertype, fill = usertype)) +
      geom_density(alpha = .3) +
      facet_wrap(~wday, ncol = 1) +
      theme_fivethirtyeight() +
      theme(
        legend.title = element_blank(),
        axis.text.y = element_blank(),
        plot.title = element_text(hjust = .5)
      ) +
      ggtitle(expression(atop(
        "How Usertype Affects Hourly Distribution",
        atop("Subscribers are using CitiBikes for commuting")
      )))

    # trip duration by usertype
    # coord_cartesian() zooms to 0-30 minutes without discarding longer trips, so
    # the boxes are computed from all trips; ylim() would remove every trip over
    # 30 minutes before the medians and quartiles are calculated
    ggplot(citi, aes(x = usertype, y = tripMin, fill = usertype)) +
      geom_boxplot(alpha = .5) +
      coord_cartesian(ylim = c(0, 30)) +
      theme_fivethirtyeight() +
      scale_fill_brewer(palette = "Set2") +
      theme(legend.position = "none") +
      ggtitle(expression(atop(
        "Subscribers Take Far Shorter Rides",
        atop("Minutes per Ride")
      ))) +
      annotate("text", x = "Customer", y = 19, label = "Median = 18") +
      annotate("text", x = "Subscriber", y = 11, label = "Median = 9")

Most Popular Routes

Citi Bike also provides data on the start and end locations of every trip. By combining these two columns, we can get a sense of which start and end combinations are the most popular.

  • Four of the top 10 routes, including the most popular, are rides around Central Park.
  • Grand Central (E 43st & Vanderbilt) is a popular start location, going to Penn Station (W 33rd & 7 ave) and Port Authority (W 41st & 8ave).
    #create route field
    # label each trip as "<start station> -> <end station>"
    citi$route <- with(
      citi,
      paste(start.station.name, end.station.name, sep = " -> ")
    )
    
    # map of NY
    # black-and-white base map for the given address; the station plots add
    # their points on top of it
    NY <- qmap(
      location = "23rd street and 5th ave,new york",
      zoom = 13,
      color = "bw"
    )
    # 10 most popular routes
    # count trips per route, sort busiest first and keep exactly the first ten
    # rows; head() is used (as for the station tables) because top_n() picks its
    # weighting column implicitly and returns extra rows when counts tie
    citi %>%
      group_by(route) %>%
      summarize(count = n()) %>%
      arrange(desc(count)) %>%
      head(n = 10) %>%
      kable(format = "markdown")
    route count
    Central Park S & 6 Ave -> Central Park S & 6 Ave 749
    Broadway & W 60 St -> Broadway & W 60 St 356
    Grand Army Plaza & Central Park S -> Grand Army Plaza & Central Park S 345
    E 43 St & Vanderbilt Ave -> W 41 St & 8 Ave 236
    W 17 St & 8 Ave -> 8 Ave & W 31 St 197
    W 21 St & 6 Ave -> W 22 St & 10 Ave 195
    Grand Army Plaza & Central Park S -> Broadway & W 60 St 193
    W 21 St & 6 Ave -> 9 Ave & W 22 St 189
    Centre St & Chambers St -> Centre St & Chambers St 184
    Washington Square E -> University Pl & E 14 St 184
    # most popular start station
    # count trips per start station (name plus coordinates), busiest first,
    # and keep the top ten
    start <- citi %>%
      group_by(
        start.station.name,
        start.station.latitude,
        start.station.longitude
      ) %>%
      summarize(count = n()) %>%
      ungroup() %>%
      arrange(desc(count)) %>%
      head(n = 10)

    # draw the ten stations on the base map, point size scaled by trip count
    NY +
      geom_point(
        data = start,
        aes(x = start.station.longitude, y = start.station.latitude, size = count),
        col = "dark green",
        alpha = .65
      ) +
      scale_size_continuous(range = c(6, 15)) +
      theme(
        legend.position = "none",
        plot.title = element_text(size = rel(1.5))
      ) +
      ggtitle("Most Popular Start Stations")

    start.station.name start.station.latitude start.station.longitude count
    8 Ave & W 31 St 40.75045 -73.99481 6095
    Lafayette St & E 8 St 40.73029 -73.99076 5345
    E 17 St & Broadway 40.73705 -73.99009 5006
    W 21 St & 6 Ave 40.74174 -73.99416 4568
    Broadway & E 14 St 40.73455 -73.99074 3999
    West St & Chambers St 40.71755 -74.01322 3893
    Cleveland Pl & Spring St 40.72182 -73.99720 3831
    Broadway & E 22 St 40.74034 -73.98955 3758
    Broadway & W 24 St 40.74235 -73.98915 3645
    University Pl & E 14 St 40.73493 -73.99201 3643
    #most popular end station
    # count trips per end station (name plus coordinates), busiest first,
    # and keep the top ten
    end <- citi %>%
      group_by(
        end.station.name,
        end.station.latitude,
        end.station.longitude
      ) %>%
      summarize(count = n()) %>%
      ungroup() %>%
      arrange(desc(count)) %>%
      head(n = 10)

    # draw the ten stations on the base map, point size scaled by trip count
    NY +
      geom_point(
        data = end,
        aes(x = end.station.longitude, y = end.station.latitude, size = count),
        col = "dark green",
        alpha = .65
      ) +
      scale_size_continuous(range = c(6, 15)) +
      theme(
        legend.position = "none",
        plot.title = element_text(size = rel(1.5))
      ) +
      ggtitle("Most Popular End Stations")

    end.station.name end.station.latitude end.station.longitude count
    Lafayette St & E 8 St 40.73029 -73.99076 5282
    E 17 St & Broadway 40.73705 -73.99009 5138
    8 Ave & W 31 St 40.75045 -73.99481 4956
    W 21 St & 6 Ave 40.74174 -73.99416 4623
    Broadway & E 14 St 40.73455 -73.99074 4004
    University Pl & E 14 St 40.73493 -73.99201 3972
    West St & Chambers St 40.71755 -74.01322 3842
    Broadway & E 22 St 40.74034 -73.98955 3766
    Greenwich Ave & 8 Ave 40.73902 -74.00264 3745
    Cleveland Pl & Spring St 40.72182 -73.99720 3731