New York City Taxis in 2016 and 2017

New York Open Data Portal @ ‘https://data.cityofnewyork.us/Transportation/’

last accessed 4/12/2020

load packages

library(rsocrata)
library(ggplot2)
library(dplyr)
library(soql)
library(geosphere)
library(reshape2)

Set API

#Green taxis 2016 (g_16) and 2017 (g_17)
g_16 <- 'https://data.cityofnewyork.us/resource/hvrh-b6nb.json'
g_17 <- 'https://data.cityofnewyork.us/resource/5gj9-2kzx.json'

Question 1

What fraction of payments under $5 use a credit card?

Question_1 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%  
    soql_select('count(total_amount)') %>%  
    soql_where('total_amount > 0 AND total_amount < 5 AND Payment_type = 1') %>%  
    as.character() -> q  
    read.socrata(q)-> q1  
  soql() %>%  
      soql_add_endpoint(API) %>%  
      soql_select('count(*)') %>%  
      as.character() -> total  
      read.socrata(total)->total_count  
  as.numeric(total_count[1])/as.numeric(q1[1]) -> frac  
  round(frac,0) -> answer_1  
  print(paste0('1 out of ~',answer_1,' payments <$5 uses a credit card'))  
  return(answer_1)}

API Endpoint

https://data.cityofnewyork.us/resource/API.json?$select=count(*)&$wheretotal_amount > 0 AND total_amount < 5 AND Payment_type = 1  
https://data.cityofnewyork.us/resource/API.json?$select=count(*)

Question 1 Answer

#Question_1(g_16)
"1 out of ~367 payments <$5 uses a credit card"
## [1] "1 out of ~367 payments <$5 uses a credit card"

Question 2

Number of Credit Card payments under $5 and a list sorted highest to lowest?

Question_2 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('count(total_amount)') %>%
    soql_where('total_amount > 0 AND total_amount < 5 AND Payment_type = 1') %>%
    as.character() -> x
    read.socrata(q)-> a
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('total_amount') %>%
    soql_where('total_amount > 0 AND total_amount < 5 AND Payment_type = 1') %>%
    soql_order('total_amount DESC') %>%
    as.character() -> q
    read.socrata(q)-> b
    as.data.frame(b)-> b
    answer_2 <- list("count" = a, "charges" = b)
    print(paste0(answer_2$count, ' rides where fares <$5 use a cc, use `df$charges` for a list'))
    return(answer_2)}

API Endpoint

https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$order=total_amount DESC&$where=total_amount >= 0 AND total_amount <= 5 AND payment_type=1&$select=total_amount,payment_type

Question 2 Answer

#Question_2(g_16) -> t
#t$count
#head(t$charges)

"44598 rides where fares <$5 use a cc, use `df$charges` for a list"
## [1] "44598 rides where fares <$5 use a cc, use `df$charges` for a list"

Question 3

What fraction of payments over $50 use a credit card?

Question_3 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('count(total_amount)') %>%
    soql_where('total_amount > 50 AND Payment_type = 1') %>%
    as.character() -> q
    read.socrata(q)-> q3
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('count(*)') %>%
    as.character() -> total
    read.socrata(total)->total_count
  as.numeric(total_count[,1])/as.numeric(q3[,1]) -> answer_3
  round(answer_3, 0) -> answer_3
  print(paste0('1 out of ~',answer_3,' payments >$50 uses a credit card'))
  return(answer_3)}

API Endpoint

#https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=count(total_amount)&$where=total_amount>=50 AND payment_type=1

Question 3 Answer

#Question_3(g_16)
"1 out of ~86 payments >$50 uses a credit card"
## [1] "1 out of ~86 payments >$50 uses a credit card"

Question 4

Number of credit card payments over $50?

Question_4 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('count(total_amount)') %>%
    soql_where('total_amount > 50 AND Payment_type = 1') %>%
    as.character() -> q
    read.socrata(q)-> answer_4
  print(paste0(answer_4,' cc payments for trips >$50'))
  return(answer_4)}

API Endpoint

#https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=count(total_amount)&$where=total_amount>50 AND payment_type=1

Question 4 answer

#Question_4(g_16)
"189750 cc payments for trips >$50"
## [1] "189750 cc payments for trips >$50"

Question 5

What is the mean fare per minute driven?

(trouble shooting and logic included)
#subset top 2000 fares in March
Question_5 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('lpep_dropoff_datetime, lpep_pickup_datetime, Fare_amount') %>%
    soql_where('date_extract_m(lpep_dropoff_datetime) = 3 AND lpep_dropoff_datetime>lpep_pickup_datetime AND Fare_amount > 0') %>%
    soql_limit(2000) %>%
    soql_order('Fare_amount DESC') %>%
    as.character() -> q
  read.socrata(q)-> answer_5
  as.data.frame(answer_5) -> df
  (ymd_hms(df$lpep_dropoff_datetime)-ymd_hms(df$lpep_pickup_datetime)) -> df$seconds
  (as.numeric(df$seconds))/60 -> df$minute
  mean(as.numeric(df$Fare_amount)/(df$minute)) -> answer_5
  round(answer_5, 2) -> answer_5
  print(paste0('$',answer_5, ' is the mean fair/minute'))
  return(answer_5)}

API Endpoint

'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?%24select=lpep_dropoff_datetime%2C%20lpep_pickup_datetime%2C%20fare_amount&%24where=date_extract_m%28lpep_dropoff_datetime%29%20%3D%203%20AND%20lpep_dropoff_datetime%3Elpep_pickup_datetime%20AND%20fare_amount%20%3E%200&$limit=2000'

Question 5 answer

#Question_5(g_16)
"$248.11is the mean fair/minute"
## [1] "$248.11is the mean fair/minute"

trouble shooting

Question_5 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>% #datetime operations and transforms 
    soql_select('avg(fare_amount/(datetime_diff(lpep_dropoff_datetime,lpep_pickup_datetimecount)))') %>%
    #soql_select(datetime_diff(to_fixed_timestamp(lpep_dropoff_datetime),     to_fixed_timestamp(lpep_pickup_datetime), 'ISO8601', 'minute'))'%>%
    soql_where('lpep_dropoff_datetime>lpep_pickup_datetime AND fare_amount > 0') %>%
    as.character() -> q
  read.socrata(q)-> answer_5
  print(paste('mean fair/minute',answer_5))
  return(answer_4)} 

Question 6

What is the median of the taxi’s fare per mile driven?

Question_6 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('count(*)') %>%
    soql_where('trip_distance > 0 AND fare_amount > 0') %>%
    as.character() -> q
    read.socrata(q)-> qa
    (as.numeric(qa[,1])/2)-1 -> qa_r
    round(qa_r,0)-> qa_r
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('fare_amount/trip_distance') %>%
    soql_where('trip_distance > 0 AND fare_amount > 0') %>%
    soql_limit('2') %>%
    soql_order('fare_amount/trip_distance') %>%
    soql_offset(qa_r) %>%
    as.character() -> med
    read.socrata(med)-> m_2
    sum(as.numeric(m_2[,1]))/2 -> answer_6
    round(answer_6, 2)-> answer_6
    print(paste0('The median fare/mile: $',answer_6))
  return(answer_6)}

API Endpoint

'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=(fare_amount/trip_distance)%20&$where=trip_distance%20%3E%200%20AND%20Fare_amount%3E0&$order=(fare_amount/trip_distance)&$offset=8052935'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=(fare_amount/trip_distance)%20&$where=trip_distance%20%3E%200%20AND%20Fare_amount%3E0&$order=(fare_amount/trip_distance)&$offset=8052935"
'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?%3F%24select=fare_amount%2Ftrip_distance&%24where=fare_amount%20%3E%200%20AND%20trip_distance%20%3E%200&%24limit=2&%24order=fare_amount%2Ftrip_distance&%24offset=8052934'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?%3F%24select=fare_amount%2Ftrip_distance&%24where=fare_amount%20%3E%200%20AND%20trip_distance%20%3E%200&%24limit=2&%24order=fare_amount%2Ftrip_distance&%24offset=8052934"

Question 6 answer

#Question_6(g_16)
"The median fare/mile: $4.97"
## [1] "The median fare/mile: $4.97"

Question 7

What is the 95 percentile of the taxi’s average driving speed in miles per hour?

(trouble shooting and logic included)
Question_7 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('lpep_dropoff_datetime, lpep_pickup_datetime, trip_distance') %>%
    soql_where('date_extract_m(lpep_dropoff_datetime) = 3 AND lpep_dropoff_datetime>lpep_pickup_datetime AND trip_distance > 0') %>%
    soql_limit(2000) %>%
    soql_order('trip_distance DESC') %>%
    as.character() -> q
    read.socrata(q) -> df
    (ymd_hms(df$lpep_dropoff_datetime)-ymd_hms(df$lpep_pickup_datetime)) -> df$seconds
    (as.numeric(df$seconds))/3600 -> df$hour
    as.numeric(df$trip_distance)/(df$hour) -> df$mph
    quantile(df$mph[df$mph < 100], 0.95) -> nine_five
    round(nine_five, 1) -> answer_7
    print(paste0('95th percentile taxi trip speed is ', answer_7, ' avg mph'))
    return(answer_7)}

API Endpoint

'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=lpep_dropoff_datetime,%20lpep_pickup_datetime,%20trip_distance&$where=%20date_extract_m(lpep_dropoff_datetime)%20=%203%20AND%20lpep_dropoff_datetime%3Elpep_pickup_datetime%20AND%20trip_distance%20%3E%200&$limit=2000&$order=trip_distance%20DESC'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=lpep_dropoff_datetime,%20lpep_pickup_datetime,%20trip_distance&$where=%20date_extract_m(lpep_dropoff_datetime)%20=%203%20AND%20lpep_dropoff_datetime%3Elpep_pickup_datetime%20AND%20trip_distance%20%3E%200&$limit=2000&$order=trip_distance%20DESC"

Question 7 answer

#Question_7(g_16)
"95th percentile taxi trip speed is 47.1 avg mph"
## [1] "95th percentile taxi trip speed is 47.1 avg mph"

logic for full dataset

Question_7 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('count(*)') %>%
    soql_where('lpep_dropoff_datetime>lpep_pickup_datetime AND trip_distance > 0') %>%
    as.character() -> q
  read.socrata(q)-> qa
  (as.numeric(qa[,1])*0.05) -> qa_r
  round(qa_r,0)-> qa_r
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('datetime_diff(lpep_dropoff_datetime,lpep_pickup_datetime)/trip_distance)') %>%
    soql_where('lpep_dropoff_datetime>lpep_pickup_datetime AND trip_distance > 0') %>%
    soql_limit(qa_r) %>%
    soql_order('datetime_diff(lpep_dropoff_datetime,lpep_pickup_datetime)/(trip_distance) DESC') %>%
    as.character() -> nine_five
  read.socrata(nine_five)-> top
  mean((as.numeric(m_2[,1])/3600)) -> answer_7
  round(answer_7, 2)-> answer_7
  print(paste0('The 95th percentile speed (mph): ',answer_7))
return(answer_7)}

notes

#'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=datetime_diff(to_fixed_timestamp(lpep_dropoff_datetime),to_fixed_timestamp(lpep_pickup_datetime),'UTC', 'seconds')'

#'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=datetime_diff(to_fixed_timestamp(lpep_dropoff_datetime),to_fixed_timestamp(lpep_pickup_datetime),'%+', 'seconds') '%+')'

Question 8

What is the average ratio of the distance between the pickup and drop-off divided by the distance driven?

logic and troubleshooting provided

Question_8 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('Pickup_longitude, Pickup_latitude, Dropoff_longitude, Dropoff_latitude, trip_distance') %>%
    soql_where('date_extract_m(lpep_dropoff_datetime) = 3 AND trip_distance > 0') %>%
    soql_limit('2000') %>%
    soql_order('trip_distance DESC') %>%
    as.character() -> q
    read.socrata(q) -> try
    lapply(try,as.numeric) -> to
    as.data.frame(to) -> t
    t <- t %>% rowwise() %>% mutate(meters = distHaversine(c(Pickup_longitude , Pickup_latitude),c(Dropoff_longitude, Dropoff_latitude))) %>% as.data.frame()
    mean((t$meters/1609.34)/t$trip_distance) -> distance_ratio
    round(distance_ratio, 2) -> d_2
    print(paste(d_2,'miles are covered as the crow flies for every mile driven'))}

API Endpoint

'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=avg(Tip_amount)&$where=Tip_amount%3E=0%20AND%20RateCodeID=2'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=avg(Tip_amount)&$where=Tip_amount%3E=0%20AND%20RateCodeID=2"

Question 8 answer

#Question_8(g_16)
"0.76 miles are covered as the crow flies for every mile driven"
## [1] "0.76 miles are covered as the crow flies for every mile driven"

Question 9

What is the average tip for rides from JFK?

Question_9 <- function(API){
  soql() %>%
    soql_select('avg(Tip_amount)') %>%
    soql_where('Tip_amount >=0 AND RateCodeID = 2') %>%
    as.character() -> qb
    read.socrata(qb)-> qb
    as.numeric(qb[,1])/as.numeric(qa[,1]) -> answer_9
    round(answer_9,2)-> answer_9
    print(paste('The avg tip from JFK: $',answer_9))
    return(answer_9)}

API Endpoint

'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=avg(Tip_amount)&$where=Tip_amount%3E=0%20AND%20RateCodeID=2'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=avg(Tip_amount)&$where=Tip_amount%3E=0%20AND%20RateCodeID=2"

Question 9 answer

#Question_9(g_16)
"The avg tip from JFK: $4.61"
## [1] "The avg tip from JFK: $4.61"

Question 10

What is the median March revenue of a taxi driver?

Question_10 <- function(API){
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('count(*)') %>%
    soql_where('date_extract_m(lpep_dropoff_datetime)=3 AND Total_amount > 0') %>%
    as.character() -> q
  read.socrata(q)-> qa
  (as.numeric(qa[,1])/2)-1 -> qa_r
  round(qa_r,0)-> qa_r
  soql() %>%
    soql_add_endpoint(API) %>%
    soql_select('Total_amount') %>%
    soql_where('date_extract_m(lpep_dropoff_datetime)=3 AND Total_amount > 0') %>%
    soql_limit('2') %>%
    soql_order('Total_amount') %>%
    soql_offset(qa_r) %>%
    as.character() -> med
  read.socrata(med)-> m_2
  sum(as.numeric(m_2[,1]))/2 -> answer_10
  print(paste0('The median revenue in March is $',answer_10))
  return(answer_10)}

API Endpoint

'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$where=date_extract_m(lpep_dropoff_datetime)=3%20AND%20total_amount%3E=0&$select=total_amount&$order=total_amount'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$where=date_extract_m(lpep_dropoff_datetime)=3%20AND%20total_amount%3E=0&$select=total_amount&$order=total_amount"

Question 10 answer

#Question_10(g_16)
"The median revenue in March is $11.30"
## [1] "The median revenue in March is $11.30"

Question 11

Project: How are the tips in the city that never sleeps?

API Endpoints

#max <- https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=max(Tip_amount),max(Fare_amount),%20date_extract_hh(lpep_dropoff_datetime)&$where=Tip_amount%3E=0&$group=date_extract_hh(lpep_dropoff_datetime)
#good <-https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=count(Tip_amount%3E(Total_amount*0.15))&$where=Tip_amount%3E(Total_amount*0.15)&$group=date_extract_hh(lpep_dropoff_datetime)'
#total <-https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=count(*)&$group=date_extract_hh(lpep_dropoff_datetime)
#cbind(max, good, total)-> th
##       Hour           count            tips_15          less_15      
##  Min.   : 1.00   Min.   : 182692   Min.   : 51443   Min.   :130128  
##  1st Qu.: 6.75   1st Qu.: 547346   1st Qu.:179670   1st Qu.:355924  
##  Median :12.50   Median : 674201   Median :216989   Median :453542  
##  Mean   :12.50   Mean   : 682731   Mean   :222484   Mean   :458178  
##  3rd Qu.:18.25   3rd Qu.: 886410   3rd Qu.:300164   3rd Qu.:583442  
##  Max.   :24.00   Max.   :1117937   Max.   :380416   Max.   :742009  
##                                                                     
##  avg_Tip_amount  avg_Total_amount    over_100     max_Tip_amount  
##  Min.   :1.095   Min.   :14.00    Min.   : 4.00   Min.   : 99.67  
##  1st Qu.:1.188   1st Qu.:14.34    1st Qu.: 9.00   1st Qu.:244.38  
##  Median :1.238   Median :14.74    Median :15.00   Median :372.50  
##  Mean   :1.264   Mean   :14.88    Mean   :15.65   Mean   :371.23  
##  3rd Qu.:1.339   3rd Qu.:14.95    3rd Qu.:20.50   3rd Qu.:450.00  
##  Max.   :1.550   Max.   :17.81    Max.   :33.00   Max.   :900.00  
##                                   NA's   :1                       
##  max_Total_amount     at_35     
##  Min.   : 480.0   Min.   : 775  
##  1st Qu.: 654.9   1st Qu.:1692  
##  Median : 871.3   Median :2314  
##  Mean   :1243.3   Mean   :2069  
##  3rd Qu.:1124.2   3rd Qu.:2443  
##  Max.   :5006.3   Max.   :2984  
## 
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Localadmin\AppData\Local\Temp\RtmpMDgOKk\downloaded_packages
## package 'ggrepel' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Localadmin\AppData\Local\Temp\RtmpMDgOKk\downloaded_packages

How does tipping look at different times in NYC?

Average Fare Tip

require(ggplot2)
ggplot(th, aes(x=Hour)) + geom_point(aes(y=avg_Total_amount), color = 'steelblue3', size = 3) +
  geom_point(aes(y = (avg_Total_amount + avg_Tip_amount)), size = 3, color = 'springgreen2') +
  pretty + ylab('Avg Fare and Tip') +
  scale_x_continuous(breaks = c(3,6,9,12,15, 18, 21, 24),labels = c('3' = '3a', '6'= '6a', '9'='9a', '12' = 'Noon', '15'='3p', '18'='6p', '21' = '9p', '24' = 'Midnight')) +
  xlab('Time') + annotate(geom = 'text', label = 'Green Taxi \n2016', size = 3, color = 'grey5', x = 21.5, y = 20)

##Max Tip

ggplot(th, aes(x=Hour)) + geom_point(aes(y=max_Total_amount), color = 'steelblue3') +
  geom_point(aes(y = (max_Total_amount + max_Tip_amount)), color = 'springgreen2') +
  pretty + ylab('Max Fare and Tip') +
  scale_x_continuous(breaks = c(3,6,9,12,15, 18, 21, 24),labels = c('3' = '3a', '6'= '6a', '9'='9a', '12' = 'Noon', '15'='3p', '18'='6p', '21' = '9p', '24' = 'Midnight')) +
  xlab('Time') + annotate(geom = 'text', label = 'Green Taxi \n2016', size = 3, color = 'grey5', x = 21.5, y = 5000, family = 'Palatino Linotype')

Frequency of tips over $100

ggplot(th, aes(Hour, th$over_100/count, label = over_100)) + geom_point(color = 'springgreen3', size = 2)  + pretty+
  scale_x_continuous(breaks = c(3,6,9,12,15, 18, 21, 24),labels = c('3' = '3a', '6'= '6a', '9'='9a', '12' = 'Noon', '15'='3p', '18'='6p', '21' = '9p', '24' = 'Midnight')) + xlab('Time') +
  ylab('Frequency of Tips > $100.00') + annotate(geom = 'text', label = 'Green Taxi \n2016', size = 3, color = 'grey5', x = 21.5, y = .00015, family = 'Palatino Linotype') +
  ggrepel::geom_text_repel(label = th$over_100, size = 3, family = 'Palatino Linotype')

### How do tips compare at the standard 15%?

ggplot(th_melt, aes(Hour, value/count)) + stat_smooth(aes(fill=variable), color = 'grey40', size = 0.1,   geom='area', method = 'loess', span = 1/4, alpha = 0.4) +  pretty +
  scale_x_continuous(breaks = c(3,6,9,12,15, 18, 21, 24),labels = c('3' = '3a', '6'= '6a', '9'='9a', '12' = 'Noon', '15'='3p', '18'='6p', '21' = '9p', '24' = 'Midnight'), expand = c(0,0)) +
  xlab('Time') + ylab('Tipping above or below 15%') +
  scale_fill_manual(name = 'Tip Amount', values = c('tips_15' = "green", "less_15" = "aquamarine2",'at_35' = 'grey20'), labels = c('Above 15%', 'Below 15%', 'At 15%')) +
geom_vline(xintercept = c(6,12,19.5,24), linetype = 3, color = 'grey30', size = 0.2) + annotate(geom = 'text', label = 'Green Taxi \n2016', size = 3, color = 'grey5', x = 21.5, y = .75, family = 'Palatino Linotype')
## `geom_smooth()` using formula 'y ~ x'