#Load required library
library(dplyr)
library(ggplot2)
library(reshape2)
library(bigrquery)
library(ggmap)
project <- "gentle-realm-87910"
options("httr_oauth_cache"="C:\\Users\\Stone\\Documents\\.httr-oauth")
The New York City Taxi and Limousine Commission (TLC), created in 1971, is the agency responsible for licensing and regulating New York City’s medallion (yellow) taxicabs, for-hire vehicles (community-based liveries, black cars and luxury limousines), commuter vans, and paratransit vehicles. Dataset that records all New York TLC taxi trips are publicly available in Google BigQuery (https://bigquery.cloud.google.com/dataset/bigquery-public-data:new_york). This public dataset is being used in this report as a data source to perform analysis :
1. NYC Yellow Taxi Trips Count analysis by hour in 2016
2. NYC Taxi Trips Trend analysis from 2012 to 2016
3. NYC Yellow Taxi coverage analysis
We will analyze 2016 TLC Yellow Taxicab Trip dataset to identify the peak hour of the Yellow Taxicab Trip. First, we will analyze the trip count pattern accross all months in 2016.
#Query to extract the trip count on yellow taxi trips in 2016
sql <-
"SELECT month(pickup_datetime) month, date(pickup_datetime) date, hour(pickup_datetime) hour, count(*) count
FROM [bigquery-public-data:new_york.tlc_yellow_trips_2016]
GROUP BY month, date, hour
ORDER BY month, date, hour"
sql_result_set <- query_exec(sql, project = project, useLegacySql = FALSE)
## 0 bytes processed
#Transform the dataset to calculate the median of trip count by Month and Hour
ride_cnt_data <- sql_result_set %>%
select(month, hour, count) %>%
group_by(month, hour)
ride_cnt_data_avg <- summarise_all(ride_cnt_data, median)
#Plot a line chart and group by month
ggplot(ride_cnt_data_avg, aes(x=hour, y=count)) +
labs(title = "NYC Yellow Taxi Trips Count Monthly Average in 2016") +
labs(x = "Hour", y = "Trips Count Average") +
geom_line() +
geom_point() +
facet_wrap( ~ month, ncol = 4)
From the chart, we can observe that the trips count has the similar pattern across all months. Because it has similar pattern, we will average the trip count only by hour for 2016 and will identify the list of peak hour of Yellow Taxi trips.
#Transform the dataset and calculate the median of trip count only by Hour
ride_cnt_data_year <- sql_result_set %>%
select(hour, count) %>%
group_by(hour)
ride_cnt_data_year_avg <- summarise_all(ride_cnt_data_year, median)
#Plot a line chart
ggplot(ride_cnt_data_year_avg, aes(x=hour, y=count)) +
labs(title = "NYC Yellow Taxi Trips Count Average in 2016") +
labs(x = "Hour", y = "Trips Count") +
geom_line() +
geom_point() +
geom_vline(xintercept = which.max(ride_cnt_data_year_avg$count) - 1)
#List the trip count data and sort from the busiest hour
ride_cnt_data_year_avg %>% arrange(desc(count))
## # A tibble: 24 x 2
## hour count
## <int> <dbl>
## 1 19 22440.
## 2 18 21966
## 3 20 21267
## 4 21 21144.
## 5 22 20450.
## 6 8 18834.
## 7 17 18646.
## 8 14 18494.
## 9 15 17825
## 10 12 17816
## # ... with 14 more rows
From the chart and table, we can identify the peak hours for NYC Yellow Taxi are from 18.00 to 22.00, which 19.00 is the busiest hour having the average of 22,400 trips per day.
There are three taxi type which are available to analyze :
1. Yellow Taxi : the regular taxi which are able to pick up passengers anywhere in the five boroughs of New York.
2. Green Taxi : street hail livery vehicles, commonly known as “boro taxis”, which began to appear in August 2013, are allowed to pick up passengers in all boroughs of New York except Upper Manhattan area, LaGuardia Airport and John F. Kennedy International Airport.
3. For-Hire Vehicle (FHV) : only available through prearranged service with TLC. FHV is not allowed to pick up passegers from the street.
Now we will try to look at the trend analysis of total trips count from year 2012 to 2016 for all the taxi type.
#Query to extract all the trip count for Yellow Taxi, Green Taxi and For-Hire Vehicle from 2012 to 2016
sql <-
"select taxi_type, year, month, count
FROM
(SELECT 'Yellow Taxi' taxi_type, year(pickup_datetime) year, month(pickup_datetime) month, count(*) count
FROM [bigquery-public-data:new_york.tlc_yellow_trips_2016],
[bigquery-public-data:new_york.tlc_yellow_trips_2015],
[bigquery-public-data:new_york.tlc_yellow_trips_2014],
[bigquery-public-data:new_york.tlc_yellow_trips_2013],
[bigquery-public-data:new_york.tlc_yellow_trips_2012],
group by year, month),
(SELECT 'Green Taxi' taxi_type, year(pickup_datetime) year, month(pickup_datetime) month, count(*) count
FROM [bigquery-public-data:new_york.tlc_green_trips_2016],
[bigquery-public-data:new_york.tlc_green_trips_2015],
[bigquery-public-data:new_york.tlc_green_trips_2014],
[bigquery-public-data:new_york.tlc_green_trips_2013]
group by year, month),
(SELECT 'FHV' taxi_type, year(pickup_datetime) year, month(pickup_datetime) month, count(*) count
FROM [bigquery-public-data:new_york.tlc_fhv_trips_2016],
[bigquery-public-data:new_york.tlc_fhv_trips_2015]
group by year, month)
order by taxi_type, year, month"
sql_result_set <- query_exec(sql, project = project, useLegacySql = FALSE)
## Auto-refreshing stale OAuth token.
## 0 bytes processed
#Transform the dataset to calculate the sum of trip count by Taxi Type and Year
ride_trend_data <- sql_result_set %>%
select(taxi_type, year, count) %>%
group_by(taxi_type, year)
ride_trend_data_sum <- summarise_all(ride_trend_data, sum)
#Plot the bar chart to see the trend of trips count from 2012 to 2016
ggplot(ride_trend_data_sum, aes(fill=taxi_type, y=count/1000000, x=year)) +
labs(title = "NYC Taxi Total Trips Trend (Year 2012 - 2016)") +
labs(x = "Year", y = "Total Trips (in million)") +
geom_bar( stat="identity") +
scale_fill_manual("legend",
values = c("FHV" = "blue", "Green Taxi" = "green", "Yellow Taxi" = "yellow"))
#Pivot data of trip count summary by year and taxi type
dcast(ride_trend_data_sum, taxi_type ~ year)
## Using count as value column: use value.var to override.
## taxi_type 2012 2013 2014 2015 2016
## 1 FHV NA NA NA 63727029 133285141
## 2 Green Taxi NA 1210811 15837001 19233765 16385532
## 3 Yellow Taxi 178544324 173179759 165114361 146112989 131165043
From the above graphics we can see that the total trip count is increase from year 2012 and 2016. But here we can observe a decreasing trend for the total trips of the regular yellow taxi, while the total trip of FHV (For-Hire Vehicle) is significantly increased. It seems that nowadays the passenger of Yellow Taxi cab is shifting to FHV service, or even shifting to other online hailing ride service like Uber and Lyft.
In this analysis, we will use the TLC Yellow Taxi Trip data set on 13 - 15 June 2016 to identify the coverage of TLC Yellow Taxi in New York City. The ggmap library will be utilized to plot the trip geolocation data from the dataset onto New York City map.
#Query to extract longitude and latitude data of Yellow Taxi Trip on 13 - 15 June 2016
sql <-
"SELECT pickup_longitude, pickup_latitude
FROM [bigquery-public-data:new_york.tlc_yellow_trips_2016]
WHERE month(pickup_datetime) = 6
and day(pickup_datetime) in (13,14,15)
and pickup_longitude is not null
and pickup_latitude is not null"
sql_result_set <- query_exec(sql, project = project, useLegacySql = FALSE)
## 0 bytes processed
## Warning: Only first 10 pages of size 10000 retrieved. Use max_pages = Inf
## to retrieve all.
trip_location <- sql_result_set
#Plot the geolocation data onto New York City map
qmap(location = "New York", zoom = 10) +
geom_point(aes(x=pickup_longitude, y=pickup_latitude),
data=trip_location,
col="red", alpha=0.2
)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: Removed 1280 rows containing missing values (geom_point).
From the coverage map, we can observe that NYC Yellow Taxi cab mostly only serve the passengers in Manhattan and Upper Brooklyn Area. While Staten Island, The Bronx, and Queens are barely served by the NYC Yellow Taxi service.
–