Every month, CitiBike releases a treasure trove of data that includes information on each individual trip (e.g. duration, start and end location, subscription type, gender of rider). The data is available here - http://www.citibikenyc.com/system-data. I pulled data for every month in 2014. It’s a massive dataset that contains over 8 million lines, so I took a sample of 500,000 lines in order to process the data more quickly. The code to load and clean the data is below, followed by an exploration of who rides Citi Bikes, what types of rides they take and their favorite routes.
library(ggplot2)
library(ggthemes)
library(dplyr)
library(scales)
library(knitr)
library(lubridate)
library(ggmap)
# Load the 2014 trip files, combine them, and draw the working sample ----
# The monthly files are split across two directories because `starttime`
# is written in two different layouts:
#   data/   (Sep-Dec)  "%m/%d/%Y %H:%M"
#   data1/  (Jan-Aug)  "%Y-%m-%d %H:%M"

# Read every CSV in `data_dir`, stack the rows, and derive two fields from
# `starttime`: `tripDate` (Date) and `time` ("HH:MM" character string).
#   data_dir     directory holding the monthly CSV files
#   date_format  strptime format for the date part of `starttime`
#   time_format  strptime format for the date + hour:minute part
# Files are addressed by full path, so the working directory is not touched
# while reading. Stops with an error if the directory contains no CSV files.
load_citi_dir <- function(data_dir, date_format, time_format) {
  files_list <- list.files(
    data_dir,
    pattern = "\\.csv$",
    full.names = TRUE,
    ignore.case = TRUE
  )
  if (length(files_list) == 0) {
    stop("No CSV files found in ", data_dir, call. = FALSE)
  }

  # Read all files first and bind once: calling rbind() inside a loop
  # re-copies the rows accumulated so far for every additional file.
  monthly <- lapply(
    files_list,
    read.table,
    header = TRUE,
    sep = ",",
    stringsAsFactors = FALSE
  )
  trips <- do.call(rbind, monthly)

  # create tripDate field
  trips$tripDate <- as.Date(trips$starttime, format = date_format)
  # parse the full timestamp, then keep only the "HH:MM" part as text
  start_ct <- as.POSIXct(strptime(trips$starttime, time_format))
  trips$time <- strftime(start_ct, format = "%H:%M")
  trips
}

#combine DFs
citi <- rbind(
  # Sep-Dec
  load_citi_dir(
    "/Users/pingelmo/Dropbox/data/CitiBike/data",
    date_format = "%m/%d/%Y",
    time_format = "%m/%d/%Y %H:%M"
  ),
  # Jan-Aug
  load_citi_dir(
    "/Users/pingelmo/Dropbox/data/CitiBike/data1",
    date_format = "%Y-%m-%d",
    time_format = "%Y-%m-%d %H:%M"
  )
)

# Leave the session in the project directory, as before.
setwd("/Users/pingelmo/Dropbox/data/CitiBike")
# The helper is only needed for loading; keep the workspace down to `citi`.
rm(load_citi_dir)

#create sample
# NOTE(review): the draw is unseeded, so every run yields a different sample;
# consider set.seed() if the figures quoted in the text must be reproducible.
citi <- sample_n(citi, 500000)
# Derive the rider and trip variables used by the plots ----

#convert birth year to numeric (any text that is not a number becomes NA)
citi$birth.year <- as.numeric(citi$birth.year)

#new variables
# hour of day = first two characters of the "HH:MM" string in `time`
citi$hour <- as.numeric(substr(citi$time, 1, 2))
# presumably `tripduration` is in seconds, making this minutes - confirm
citi$tripMin <- citi$tripduration / 60
# Age in the year of the trip. Using the trip's own year rather than today's
# date keeps ages fixed no matter when the script is re-run.
citi$age <- year(citi$tripDate) - citi$birth.year
# right = FALSE gives bins [0,30), [30,40), ... so a 30-year-old is counted in
# "30s" rather than "under 30"; include.lowest = TRUE keeps the top break
# (100) inside the last bin. Ages outside 0-100 become NA.
citi$age_group <- cut(
  citi$age,
  breaks = c(0, 30, 40, 50, 60, 70, 80, 100),
  labels = c("under 30", "30s", "40s", "50s", "60s", "70s", "over 80"),
  right = FALSE,
  include.lowest = TRUE
)

#convert gender vars to name (1 = Male, 2 = Female, anything else = Unknown)
citi$gender <- ifelse(
  citi$gender == 1,
  "Male",
  ifelse(citi$gender == 2, "Female", "Unknown")
)

#rename weekdays
# wday() numbers days 1 = Sunday ... 7 = Saturday by default, so the number can
# index the name vector directly. Factor levels run Monday -> Sunday so plots
# show the work week first.
citi$wday <- factor(
  c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )[wday(citi$tripDate)],
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)
90% of rides are taken by subscribers (holders of 1-year passes).
# Who rides: gender, age and user type ----
# The percentages and medians written into titles/annotations below are
# hard-coded text, not computed from `citi`.

# gender
ggplot(citi, aes(x = gender, fill = gender)) +
  geom_bar(alpha = .8) +
  theme_fivethirtyeight() +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none") +
  ggtitle(expression(atop("Gender of Citi Bikers", atop("70% are Male")))) +
  scale_y_continuous(labels = comma)

# age
# density of rider age with a dashed line at the sample median
ggplot(citi, aes(x = age)) +
  geom_density() +
  theme_fivethirtyeight() +
  theme(axis.text.y = element_blank()) +
  ggtitle("Age Distribution of Citi Bikers") +
  geom_vline(
    xintercept = median(citi$age, na.rm = TRUE),
    linetype = "dashed",
    col = "dark grey"
  ) +
  annotate("text", x = 50, y = .035, label = "Median = 37")

# age_group
# legend.position must be "none" to hide the legend ("false" is not a valid
# position value).
ggplot(citi, aes(x = age_group, fill = age_group)) +
  geom_bar(alpha = .8) +
  theme_fivethirtyeight() +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none") +
  ggtitle(expression(atop(
    "Distribution by Age Group",
    atop("55% are aged 39 or younger")
  ))) +
  scale_y_continuous(labels = comma)

# age by gender (riders with unknown gender are left out)
# coord_cartesian() zooms the axis to 15-60 without discarding rows; ylim()
# would drop riders outside that range before the box statistics are computed.
citi %>%
  filter(gender %in% c("Male", "Female")) %>%
  ggplot(aes(x = gender, y = age, fill = gender)) +
  geom_boxplot(alpha = .4) +
  coord_cartesian(ylim = c(15, 60)) +
  theme_fivethirtyeight() +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set2") +
  ggtitle("Age Distribution by Gender") +
  annotate("text", x = "Female", y = 36, label = "Median = 35") +
  annotate("text", x = "Male", y = 38, label = "Median = 37")

# usertype
ggplot(citi, aes(x = usertype, fill = usertype)) +
  geom_bar(alpha = .7) +
  theme_fivethirtyeight() +
  theme(legend.position = "none") +
  ggtitle(expression(atop(
    "UserType Distribution",
    atop("Customer = 24 hour/7 Day pass | Subscriber = Annual Pass")
  ))) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(labels = comma) +
  annotate(
    "text",
    x = "Subscriber",
    y = 350000,
    label = "90% are Subscribers"
  )
# When people ride: counts per weekday and hour-of-day profile ----

# rides by day
# one bar per weekday, coloured by weekday, legend hidden
ggplot(citi, aes(x = wday, fill = wday)) +
  geom_bar(alpha = .8) +
  theme_fivethirtyeight() +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none") +
  ggtitle(expression(atop(
    "More rides on Weekdays than Weekends",
    atop("Number of Rides by day")
  )))

# weekday distribution, graph
# density of start hour, one stacked panel per weekday
ggplot(citi, aes(x = hour, fill = factor(wday))) +
  geom_density(alpha = .2) +
  facet_wrap(~wday, ncol = 1) +
  theme_fivethirtyeight() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    plot.title = element_text(hjust = .5)
  ) +
  ggtitle(expression(atop(
    "Hourly Distribution of Rides",
    atop("Weekday Peaks during Morning Rush (8am-9am) and Afternoon Rush (5pm-6pm)")
  )))
# Ride length and time of day, by weekday and by user type ----
# The medians written into the annotations below are hard-coded text.

# weekday distribution
# coord_cartesian() zooms the y axis to 0-30 minutes without discarding rows;
# ylim() would remove every ride longer than 30 minutes before the medians
# and quartiles are computed, biasing the boxes downwards.
ggplot(citi, aes(x = wday, y = tripMin, fill = wday)) +
  geom_boxplot(alpha = .5) +
  coord_cartesian(ylim = c(0, 30)) +
  theme_fivethirtyeight() +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none") +
  ggtitle("Weekend Rides Last Slightly Longer")

#time of day distribution by usertype
ggplot(citi, aes(x = hour, group = usertype, fill = usertype)) +
  geom_density(alpha = .3) +
  theme_fivethirtyeight() +
  theme(legend.title = element_blank(), axis.text.y = element_blank()) +
  ggtitle(expression(atop(
    "How Usertype Affects Time of Day Distribution",
    atop("Customer = 24 hour/7 Day pass | Subscriber = Annual Pass")
  )))

#time of day distribution by usertype, weekday
# same densities, one stacked panel per weekday
ggplot(citi, aes(x = hour, group = usertype, fill = usertype)) +
  geom_density(alpha = .3) +
  facet_wrap(~wday, ncol = 1) +
  theme_fivethirtyeight() +
  theme(
    legend.title = element_blank(),
    axis.text.y = element_blank(),
    plot.title = element_text(hjust = .5)
  ) +
  ggtitle(expression(atop(
    "How Usertype Affects Hourly Distribution",
    atop("Subscribers are using CitiBikes for commuting")
  )))

# trip duration by usertype
# zoomed with coord_cartesian() for the same reason as the weekday boxplot:
# long rides must stay in the data for the box statistics to be correct.
ggplot(citi, aes(x = usertype, y = tripMin, fill = usertype)) +
  geom_boxplot(alpha = .5) +
  coord_cartesian(ylim = c(0, 30)) +
  theme_fivethirtyeight() +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none") +
  ggtitle(expression(atop(
    "Subscribers Take Far Shorter Rides",
    atop("Minutes per Ride")
  ))) +
  annotate("text", x = "Customer", y = 19, label = "Median = 18") +
  annotate("text", x = "Subscriber", y = 11, label = "Median = 9")
Citi Bike also provides data on the start and end locations of every trip. By combining these two columns, we can get a sense for what the most popular start and end combinations are.
# Routes: label each trip with its start/end pair and rank the pairs ----

#create route field ("<start station> -> <end station>")
citi$route <- paste(
  citi$start.station.name,
  citi$end.station.name,
  sep = " -> "
)

# map of NY
# black-and-white ggmap base layer for the query string below; the station
# maps further down add their points on top of `NY`.
NY <- qmap("23rd street and 5th ave,new york", zoom = 13, color = "bw")

# 10 most popular routes
# Rows are already sorted by descending count, so head() keeps exactly the
# first ten. top_n() is superseded in dplyr, picks its ranking column
# implicitly, and returns more than ten rows when counts tie at the cut-off.
citi %>%
  group_by(route) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(n = 10) %>%
  kable(format = "markdown")
| route | count |
|---|---|
| Central Park S & 6 Ave -> Central Park S & 6 Ave | 749 |
| Broadway & W 60 St -> Broadway & W 60 St | 356 |
| Grand Army Plaza & Central Park S -> Grand Army Plaza & Central Park S | 345 |
| E 43 St & Vanderbilt Ave -> W 41 St & 8 Ave | 236 |
| W 17 St & 8 Ave -> 8 Ave & W 31 St | 197 |
| W 21 St & 6 Ave -> W 22 St & 10 Ave | 195 |
| Grand Army Plaza & Central Park S -> Broadway & W 60 St | 193 |
| W 21 St & 6 Ave -> 9 Ave & W 22 St | 189 |
| Centre St & Chambers St -> Centre St & Chambers St | 184 |
| Washington Square E -> University Pl & E 14 St | 184 |
# most popular start station
# Count trips per start station (name + coordinates) and keep the ten
# stations with the highest counts.
start <- citi %>%
  group_by(
    start.station.name,
    start.station.latitude,
    start.station.longitude
  ) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  head(n = 10)

# Draw those stations on the base map, point size scaled by trip count.
NY +
  geom_point(
    data = start,
    aes(
      x = start.station.longitude,
      y = start.station.latitude,
      size = count
    ),
    col = "dark green",
    alpha = .65
  ) +
  scale_size_continuous(range = c(6, 15)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = rel(1.5))
  ) +
  ggtitle("Most Popular Start Stations")
| start.station.name | start.station.latitude | start.station.longitude | count |
|---|---|---|---|
| 8 Ave & W 31 St | 40.75045 | -73.99481 | 6095 |
| Lafayette St & E 8 St | 40.73029 | -73.99076 | 5345 |
| E 17 St & Broadway | 40.73705 | -73.99009 | 5006 |
| W 21 St & 6 Ave | 40.74174 | -73.99416 | 4568 |
| Broadway & E 14 St | 40.73455 | -73.99074 | 3999 |
| West St & Chambers St | 40.71755 | -74.01322 | 3893 |
| Cleveland Pl & Spring St | 40.72182 | -73.99720 | 3831 |
| Broadway & E 22 St | 40.74034 | -73.98955 | 3758 |
| Broadway & W 24 St | 40.74235 | -73.98915 | 3645 |
| University Pl & E 14 St | 40.73493 | -73.99201 | 3643 |
#most popular end station
# Count trips per end station (name + coordinates) and keep the ten
# stations with the highest counts.
end <- citi %>%
  group_by(
    end.station.name,
    end.station.latitude,
    end.station.longitude
  ) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  head(n = 10)

# Draw those stations on the base map, point size scaled by trip count.
NY +
  geom_point(
    data = end,
    aes(
      x = end.station.longitude,
      y = end.station.latitude,
      size = count
    ),
    col = "dark green",
    alpha = .65
  ) +
  scale_size_continuous(range = c(6, 15)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = rel(1.5))
  ) +
  ggtitle("Most Popular End Stations")
| end.station.name | end.station.latitude | end.station.longitude | count |
|---|---|---|---|
| Lafayette St & E 8 St | 40.73029 | -73.99076 | 5282 |
| E 17 St & Broadway | 40.73705 | -73.99009 | 5138 |
| 8 Ave & W 31 St | 40.75045 | -73.99481 | 4956 |
| W 21 St & 6 Ave | 40.74174 | -73.99416 | 4623 |
| Broadway & E 14 St | 40.73455 | -73.99074 | 4004 |
| University Pl & E 14 St | 40.73493 | -73.99201 | 3972 |
| West St & Chambers St | 40.71755 | -74.01322 | 3842 |
| Broadway & E 22 St | 40.74034 | -73.98955 | 3766 |
| Greenwich Ave & 8 Ave | 40.73902 | -74.00264 | 3745 |
| Cleveland Pl & Spring St | 40.72182 | -73.99720 | 3731 |