Check out the Bay Area Bike Share site for more information and find the source data here.
#Widen output to make plots more visible
knitr::opts_chunk$set(out.width='2000px', dpi=200)
Download the data sets on the above page and use them to build two data frames named as follows, with the following fields:
stations_df: station_id, name, lat, long, dockcount, landmark, installation
trips_df: trip_id, duration, start_date, start_station, start_terminal, end_date, end_station, end_terminal, bike_number, subscription_type, zip_code
Show the results of calling str() on your data frames to verify that your data sets have loaded correctly.
stations_df <- read.csv("201508_station_data.csv", header = TRUE)
str(stations_df)
## 'data.frame': 70 obs. of 7 variables:
## $ station_id : int 2 3 4 5 6 7 8 9 10 11 ...
## $ name : Factor w/ 70 levels "2nd at Folsom",..: 53 52 57 5 55 37 56 26 51 32 ...
## $ lat : num 37.3 37.3 37.3 37.3 37.3 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ dockcount : int 27 15 11 19 15 15 15 15 15 19 ...
## $ landmark : Factor w/ 5 levels "Mountain View",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ installation: Factor w/ 17 levels "1/22/2014","12/31/2013",..: 16 15 16 15 17 17 15 15 16 16 ...
library(forcats)
trips_df_201402 <- read.csv("201402_trip_data.csv")
#Station Name Discrepancy
trips_df_201402$Start.Station <- fct_recode(trips_df_201402$Start.Station, "Santa Clara County Civic Center" = 'San Jose Government Center')
trips_df_201402$End.Station <- fct_recode(trips_df_201402$End.Station, "Santa Clara County Civic Center" = 'San Jose Government Center')
#Column Discrepancy
colnames(trips_df_201402)[10] <- "Subscriber.Type"
trips_df_201408 <- read.csv("201408_trip_data.csv")
trips_df_201508 <- read.csv("201508_trip_data.csv")
trips_df <- rbind(trips_df_201402,trips_df_201408,trips_df_201508)
str(trips_df)
## 'data.frame': 669959 obs. of 11 variables:
## $ Trip.ID : int 4576 4607 4130 4251 4299 4927 4500 4563 4760 4258 ...
## $ Duration : int 63 70 71 77 83 103 109 111 113 114 ...
## $ Start.Date : Factor w/ 361559 levels "1/1/2014 0:14",..: 70063 70077 69922 69944 69963 70234 70028 70057 70155 69948 ...
## $ Start.Station : Factor w/ 73 levels "2nd at Folsom",..: 61 51 35 51 61 23 58 57 61 51 ...
## $ Start.Terminal : int 66 10 27 10 66 59 4 8 66 10 ...
## $ End.Date : Factor w/ 357757 levels "1/1/2014 0:21",..: 69278 69294 69162 69178 69190 69451 69249 69272 69374 69179 ...
## $ End.Station : Factor w/ 73 levels "2nd at Folsom",..: 61 51 35 51 28 23 5 57 61 33 ...
## $ End.Terminal : int 66 10 27 10 67 59 5 8 66 11 ...
## $ Bike.. : int 520 661 48 26 319 527 679 687 553 107 ...
## $ Subscriber.Type: Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 2 ...
## $ Zip.Code : Factor w/ 7440 levels "","10000","10001",..: 2362 2603 2756 2563 2337 2345 2580 2580 2337 2563 ...
Create and show a table of the number of trips per station for the top and bottom five (ranked by count) stations. Use knitr::kable to display your table.
new.df <- merge(stations_df[,c("station_id","name")],trips_df[,c("Start.Station","Trip.ID","Duration")],by.x = 'name', by.y = 'Start.Station')
trip.count <- data.frame(table(new.df$name))
names(trip.count) <- c('station name','trip.count')
top5.trip <- trip.count[order(trip.count$trip.count,decreasing = TRUE)[1:5],]
bottom5.trip <- trip.count[order(trip.count$trip.count,decreasing = FALSE)[1:5],]
bind.trip.count <- rbind(top5.trip,bottom5.trip)
library(knitr)
kable(bind.trip.count[order(bind.trip.count$trip.count,decreasing = T),],align = 'c',caption = 'Frequency of Trips per Station',col.names = c('station name','trip count'),row.names = FALSE)
| station name | trip count |
|---|---|
| San Francisco Caltrain (Townsend at 4th) | 49092 |
| San Francisco Caltrain 2 (330 Townsend) | 33742 |
| Harry Bridges Plaza (Ferry Building) | 32934 |
| Embarcadero at Sansome | 27713 |
| Temporary Transbay Terminal (Howard at Beale) | 26089 |
| Mezes Park | 341 |
| Redwood City Medical Center | 311 |
| San Mateo County Center | 287 |
| Franklin at Maple | 224 |
| Redwood City Public Library | 213 |
Create and show a table of the mean trip duration by station for the top and bottom five (ranked by mean duration) stations.
mean.duration <- aggregate(list(trip.duration = new.df$Duration),list(station = new.df$name), FUN = mean, na.rm = TRUE)
top5.duration <- mean.duration[order(mean.duration$trip.duration, decreasing = TRUE)[1:5],]
bottom5.duration <- mean.duration[order(mean.duration$trip.duration, decreasing = FALSE)[1:5],]
bind.mean.duration <- rbind(top5.duration,bottom5.duration)
kable(bind.mean.duration[order(bind.mean.duration$trip.duration, decreasing = TRUE),], align = 'c', caption = 'Average Trip Duration per Station',col.names = c('Station','Trip Duration'),row.names = FALSE)
| Station | Trip Duration |
|---|---|
| University and Emerson | 7090.2394 |
| California Ave Caltrain Station | 4628.0058 |
| Redwood City Public Library | 4579.2347 |
| Park at Olive | 4438.1613 |
| San Jose Civic Center | 4208.0169 |
| San Francisco Caltrain 2 (330 Townsend) | 690.1422 |
| 2nd at South Park | 687.8771 |
| Townsend at 7th | 676.5225 |
| Temporary Transbay Terminal (Howard at Beale) | 655.4351 |
| 2nd at Folsom | 581.7098 |
Generate a line graph of the number of rides per day system wide during the month of August. Highlight weekends using a vertical line.
library(lubridate)
trips_df$Start.Date <-parse_date_time(trips_df$Start.Date,orders = "%m/%d/%y H:M")
trips_df$week.day <- wday(trips_df$Start.Date, label = TRUE)
trips_df$End.Date <-parse_date_time(trips_df$End.Date,orders = "%m/%d/%y H:M")
month.august <- trips_df[month(trips_df$Start.Date)== 8,]
month.august <- data.frame(table(day(month.august$Start.Date),year(month.august$Start.Date),month.august$week.day))
month.august <- month.august[month.august$Freq > 0,]
colnames(month.august) <- c("day", "year","weekday","trip count")
library(ggplot2)
vline.dat <- month.august[month.august$weekday %in% c('Sat','Sun'),]
vline.dat$day <- as.numeric(vline.dat$day)
p <- ggplot(month.august,aes(x = day, y = `trip count`))+
geom_point()+
geom_line(aes(group = year,color = year))+
labs(title = "System Wide Use for the Month of August", x = 'Day', y = 'Trip Count')+
theme_bw()+
facet_grid(facets = year~.)
p + geom_vline(data = vline.dat, aes(xintercept = day, color = weekday), show.legend = TRUE,linetype = 'dashed')
Generate some graphs to explore the distribution of trip duration (remove outliers by only including up to the 90th percentile in your dataset):
#Using lubridate to derive trip durations.
trips_df$trip_interval <- as.interval(trips_df$Start.Date,trips_df$End.Date)
trips_df$trip_duration <- as.numeric(as.duration(trips_df$trip_interval),'minutes')
#Getting rid of > 90th percentile outliers.
trips_df_90th <- trips_df[trips_df$trip_duration < quantile(trips_df$trip_duration, .9),]
A black histogram with “steelblue” fill
attach(trips_df_90th)
hist <- ggplot(trips_df_90th, aes(x = trip_duration)) +
geom_histogram(fill = 'steelblue'
,color = 'snow'
,binwidth = 1
,bins = length(unique(trips_df_90th$trip_duration)))+
scale_fill_discrete(name = 'Subscriber Type')+
theme_dark()+
ggtitle(label = 'Frequency of Trip Duration (excluding outliers)')+
scale_x_continuous(name = 'Trip Duration (minutes)', breaks = unique(trip_duration))+
scale_y_continuous(name = 'Count',breaks = seq(0,60000,10000))
detach(trips_df_90th)
hist
A density plot
attach(trips_df_90th)
dens <- ggplot(trips_df_90th, aes(x = trip_duration, fill = Subscriber.Type))+
geom_density(aes(alpha = .5))+
scale_alpha(guide = 'none')+
scale_x_continuous(breaks = unique(trip_duration))+
scale_fill_discrete(name = 'Subscriber Type')+
labs(title = 'Density Plot of Trip Durations',x = 'Trip Duration (mintues)', y = 'Density')
detach(trips_df_90th)
dens
A Box Plot
#It is not possible to boxplot the stand alone distributions for each trip duration (Using a histogram to look at the frequency for each trip duration as above would be appropriate). You need another variable in order to generate the statistics associated with a boxplot (mean, quantiles, outliers, etc...) In this case, I decided to look at the trip durations by subcriber type.
attach(trips_df_90th)
box.plot <- ggplot(trips_df_90th, aes(x = Subscriber.Type, y = trip_duration, color = Subscriber.Type))+
geom_boxplot()+
theme_bw()+
labs(title = 'Trip Durations by Subcriber Type', x = 'Subscriber Type', y = 'Trip Duration')
detach(trips_df_90th)
box.plot
–The distribution of overall trip durations is somewhat normal. By looking at the histogram, it is apparent that the majority of trips fall within the 4-11 minutes duration window. The tail is slightly longer on the right suggesting that there are more shorter trips overall. In the density and box plots, I grouped the trips by subscriber type to get a closer look at the data. It appers that the majority of trips are taken by subscribers. Again, the subscriber distribution appears to be ‘normalish’ while the customer distribution of trips follows more of a uniform distribution with the least frequent durations being on the shorter end of the spectrum between 1-5 minutes.
Generate a bar plot (single bars, not side-by-side) showing the number of trips per day of week for the top five stations (ranked by number of trips), with fill color indicating subscription type. Facet your plot by station.
#for fun colors
library(wesanderson)
trips.per.day <- data.frame(table(trips_df[,c("Start.Station","Subscriber.Type",'week.day')]))
subdata <- trips.per.day[trips.per.day$Start.Station %in% top5.trip$`station name`,]
attach(subdata)
bar.plot <- ggplot(data = subdata, aes(x = week.day, y = Freq, fill = Subscriber.Type))+
geom_bar(stat = 'identity')+
facet_wrap(facets = ~ Start.Station, scales = 'free_x', labeller = label_wrap_gen(multi_line = TRUE))+
scale_fill_manual(values = wes_palette(name = 'GrandBudapest2'),name = 'Subcriber Type') +
labs(title = 'Trips Per Day for the Top 5 Stations', x = 'Week Day', y = 'Trip Count')+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust = 0))
detach(subdata)
bar.plot
weather_data_201402 <- read.csv("201402_weather_data.csv", header = TRUE)
weather_data_201408 <- read.csv("201408_weather_data.csv", header = TRUE)
weather_data_201508 <- read.csv("201508_weather_data.csv", header = TRUE)
colnames(weather_data_201402)<- colnames(weather_data_201408)
weather_df <- rbind(weather_data_201402,weather_data_201408,weather_data_201508)
weather_df$PDT <- parse_date_time(weather_df$PDT, orders = 'mdy')
weather.averages.per.day <- aggregate(list('mean temp' = weather_df$Mean.TemperatureF), list('date' = weather_df$PDT), FUN = mean, na.rm = TRUE)
total_trips_per_day <- data.frame(table(date(trips_df$Start.Date)))
colnames(total_trips_per_day) <- c('date','trip count')
total_trips_per_day$date <- parse_date_time(total_trips_per_day$date, orders = 'ymd')
weather_trip_data_merged <- merge(total_trips_per_day,weather.averages.per.day)
weather_trip_data_merged$year <- year(weather_trip_data_merged$date)
weather_trip_data_merged$month <-month(weather_trip_data_merged$date,label = TRUE)
weather_trip_data_merged$day <- day(weather_trip_data_merged$date)
attach(weather_trip_data_merged)
weather_trip_data_merged$day_month <- as.character(paste(day,month,sep = '-'))
p <-ggplot(data = weather_trip_data_merged, aes(x = mean.temp, y = `trip count`, color = mean.temp))+
geom_point()+
geom_smooth()+
facet_wrap(~month)+
scale_x_continuous("Mean Temperature")+
scale_color_distiller(type = 'seq',palette = 'RdBu',name = 'Mean Temp')+
theme_dark()+
labs(title = "Does Weather Have An Affect On System Use?", y = 'Trip Count')
I wanted to know if the temperature had any affect on system use. Because there were multiple recordins on a given day (varied by zip), I took the grand mean of the mean temepratures recorded on each day. I also calculated the total amount of trips on a given day across the entire system. I then merged the data together and plotted it below. There does not appear to be any correlation between the temperature and system use.
p