library(rsocrata)
library(ggplot2)
library(dplyr)
library(soql)
library(geosphere)
library(reshape2)
#Green taxis 2016 (g_16) and 2017 (g_17)
g_16 <- 'https://data.cityofnewyork.us/resource/hvrh-b6nb.json'
g_17 <- 'https://data.cityofnewyork.us/resource/5gj9-2kzx.json'
Question_1 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(total_amount)') %>%
soql_where('total_amount > 0 AND total_amount < 5 AND Payment_type = 1') %>%
as.character() -> q
read.socrata(q)-> q1
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(*)') %>%
as.character() -> total
read.socrata(total)->total_count
as.numeric(total_count[1])/as.numeric(q1[1]) -> frac
round(frac,0) -> answer_1
print(paste0('1 out of ~',answer_1,' payments <$5 uses a credit card'))
return(answer_1)}
https://data.cityofnewyork.us/resource/API.json?$select=count(*)&$wheretotal_amount > 0 AND total_amount < 5 AND Payment_type = 1
https://data.cityofnewyork.us/resource/API.json?$select=count(*)
#Question_1(g_16)
"1 out of ~367 payments <$5 uses a credit card"
## [1] "1 out of ~367 payments <$5 uses a credit card"
Question_2 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(total_amount)') %>%
soql_where('total_amount > 0 AND total_amount < 5 AND Payment_type = 1') %>%
as.character() -> x
read.socrata(q)-> a
soql() %>%
soql_add_endpoint(API) %>%
soql_select('total_amount') %>%
soql_where('total_amount > 0 AND total_amount < 5 AND Payment_type = 1') %>%
soql_order('total_amount DESC') %>%
as.character() -> q
read.socrata(q)-> b
as.data.frame(b)-> b
answer_2 <- list("count" = a, "charges" = b)
print(paste0(answer_2$count, ' rides where fares <$5 use a cc, use `df$charges` for a list'))
return(answer_2)}
https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$order=total_amount DESC&$where=total_amount >= 0 AND total_amount <= 5 AND payment_type=1&$select=total_amount,payment_type
#Question_2(g_16) -> t
#t$count
#head(t$charges)
"44598 rides where fares <$5 use a cc, use `df$charges` for a list"
## [1] "44598 rides where fares <$5 use a cc, use `df$charges` for a list"
Question_3 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(total_amount)') %>%
soql_where('total_amount > 50 AND Payment_type = 1') %>%
as.character() -> q
read.socrata(q)-> q3
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(*)') %>%
as.character() -> total
read.socrata(total)->total_count
as.numeric(total_count[,1])/as.numeric(q3[,1]) -> answer_3
round(answer_3, 0) -> answer_3
print(paste0('1 out of ~',answer_3,' payments >$50 uses a credit card'))
return(answer_3)}
#https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=count(total_amount)&$where=total_amount>=50 AND payment_type=1
#Question_3(g_16)
"1 out of ~86 payments >$50 uses a credit card"
## [1] "1 out of ~86 payments >$50 uses a credit card"
Question_4 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(total_amount)') %>%
soql_where('total_amount > 50 AND Payment_type = 1') %>%
as.character() -> q
read.socrata(q)-> answer_4
print(paste0(answer_4,' cc payments for trips >$50'))
return(answer_4)}
#https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=count(total_amount)&$where=total_amount>50 AND payment_type=1
#Question_4(g_16)
"189750 cc payments for trips >$50"
## [1] "189750 cc payments for trips >$50"
#subset top 2000 fares in March
Question_5 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('lpep_dropoff_datetime, lpep_pickup_datetime, Fare_amount') %>%
soql_where('date_extract_m(lpep_dropoff_datetime) = 3 AND lpep_dropoff_datetime>lpep_pickup_datetime AND Fare_amount > 0') %>%
soql_limit(2000) %>%
soql_order('Fare_amount DESC') %>%
as.character() -> q
read.socrata(q)-> answer_5
as.data.frame(answer_5) -> df
(ymd_hms(df$lpep_dropoff_datetime)-ymd_hms(df$lpep_pickup_datetime)) -> df$seconds
(as.numeric(df$seconds))/60 -> df$minute
mean(as.numeric(df$Fare_amount)/(df$minute)) -> answer_5
round(answer_5, 2) -> answer_5
print(paste0('$',answer_5, ' is the mean fair/minute'))
return(answer_5)}
'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?%24select=lpep_dropoff_datetime%2C%20lpep_pickup_datetime%2C%20fare_amount&%24where=date_extract_m%28lpep_dropoff_datetime%29%20%3D%203%20AND%20lpep_dropoff_datetime%3Elpep_pickup_datetime%20AND%20fare_amount%20%3E%200&$limit=2000'
#Question_5(g_16)
"$248.11is the mean fair/minute"
## [1] "$248.11is the mean fair/minute"
Question_5 <- function(API){
soql() %>%
soql_add_endpoint(API) %>% #datetime operations and transforms
soql_select('avg(fare_amount/(datetime_diff(lpep_dropoff_datetime,lpep_pickup_datetimecount)))') %>%
#soql_select(datetime_diff(to_fixed_timestamp(lpep_dropoff_datetime), to_fixed_timestamp(lpep_pickup_datetime), 'ISO8601', 'minute'))'%>%
soql_where('lpep_dropoff_datetime>lpep_pickup_datetime AND fare_amount > 0') %>%
as.character() -> q
read.socrata(q)-> answer_5
print(paste('mean fair/minute',answer_5))
return(answer_4)}
Question_6 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(*)') %>%
soql_where('trip_distance > 0 AND fare_amount > 0') %>%
as.character() -> q
read.socrata(q)-> qa
(as.numeric(qa[,1])/2)-1 -> qa_r
round(qa_r,0)-> qa_r
soql() %>%
soql_add_endpoint(API) %>%
soql_select('fare_amount/trip_distance') %>%
soql_where('trip_distance > 0 AND fare_amount > 0') %>%
soql_limit('2') %>%
soql_order('fare_amount/trip_distance') %>%
soql_offset(qa_r) %>%
as.character() -> med
read.socrata(med)-> m_2
sum(as.numeric(m_2[,1]))/2 -> answer_6
round(answer_6, 2)-> answer_6
print(paste0('The median fare/mile: $',answer_6))
return(answer_6)}
'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=(fare_amount/trip_distance)%20&$where=trip_distance%20%3E%200%20AND%20Fare_amount%3E0&$order=(fare_amount/trip_distance)&$offset=8052935'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=(fare_amount/trip_distance)%20&$where=trip_distance%20%3E%200%20AND%20Fare_amount%3E0&$order=(fare_amount/trip_distance)&$offset=8052935"
'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?%3F%24select=fare_amount%2Ftrip_distance&%24where=fare_amount%20%3E%200%20AND%20trip_distance%20%3E%200&%24limit=2&%24order=fare_amount%2Ftrip_distance&%24offset=8052934'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?%3F%24select=fare_amount%2Ftrip_distance&%24where=fare_amount%20%3E%200%20AND%20trip_distance%20%3E%200&%24limit=2&%24order=fare_amount%2Ftrip_distance&%24offset=8052934"
#Question_6(g_16)
"The median fare/mile: $4.97"
## [1] "The median fare/mile: $4.97"
Question_7 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('lpep_dropoff_datetime, lpep_pickup_datetime, trip_distance') %>%
soql_where('date_extract_m(lpep_dropoff_datetime) = 3 AND lpep_dropoff_datetime>lpep_pickup_datetime AND trip_distance > 0') %>%
soql_limit(2000) %>%
soql_order('trip_distance DESC') %>%
as.character() -> q
read.socrata(q) -> df
(ymd_hms(df$lpep_dropoff_datetime)-ymd_hms(df$lpep_pickup_datetime)) -> df$seconds
(as.numeric(df$seconds))/3600 -> df$hour
as.numeric(df$trip_distance)/(df$hour) -> df$mph
quantile(df$mph[df$mph < 100], 0.95) -> nine_five
round(nine_five, 1) -> answer_7
print(paste0('95th percentile taxi trip speed is ', answer_7, ' avg mph'))
return(answer_7)}
'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=lpep_dropoff_datetime,%20lpep_pickup_datetime,%20trip_distance&$where=%20date_extract_m(lpep_dropoff_datetime)%20=%203%20AND%20lpep_dropoff_datetime%3Elpep_pickup_datetime%20AND%20trip_distance%20%3E%200&$limit=2000&$order=trip_distance%20DESC'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=lpep_dropoff_datetime,%20lpep_pickup_datetime,%20trip_distance&$where=%20date_extract_m(lpep_dropoff_datetime)%20=%203%20AND%20lpep_dropoff_datetime%3Elpep_pickup_datetime%20AND%20trip_distance%20%3E%200&$limit=2000&$order=trip_distance%20DESC"
#Question_7(g_16)
"95th percentile taxi trip speed is 47.1 avg mph"
## [1] "95th percentile taxi trip speed is 47.1 avg mph"
Question_7 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(*)') %>%
soql_where('lpep_dropoff_datetime>lpep_pickup_datetime AND trip_distance > 0') %>%
as.character() -> q
read.socrata(q)-> qa
(as.numeric(qa[,1])*0.05) -> qa_r
round(qa_r,0)-> qa_r
soql() %>%
soql_add_endpoint(API) %>%
soql_select('datetime_diff(lpep_dropoff_datetime,lpep_pickup_datetime)/trip_distance)') %>%
soql_where('lpep_dropoff_datetime>lpep_pickup_datetime AND trip_distance > 0') %>%
soql_limit(qa_r) %>%
soql_order('datetime_diff(lpep_dropoff_datetime,lpep_pickup_datetime)/(trip_distance) DESC') %>%
as.character() -> nine_five
read.socrata(nine_five)-> top
mean((as.numeric(m_2[,1])/3600)) -> answer_7
round(answer_7, 2)-> answer_7
print(paste0('The 95th percentile speed (mph): ',answer_7))
return(answer_7)}
#'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=datetime_diff(to_fixed_timestamp(lpep_dropoff_datetime),to_fixed_timestamp(lpep_pickup_datetime),'UTC', 'seconds')'
#'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$select=datetime_diff(to_fixed_timestamp(lpep_dropoff_datetime),to_fixed_timestamp(lpep_pickup_datetime),'%+', 'seconds') '%+')'
Question_8 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('Pickup_longitude, Pickup_latitude, Dropoff_longitude, Dropoff_latitude, trip_distance') %>%
soql_where('date_extract_m(lpep_dropoff_datetime) = 3 AND trip_distance > 0') %>%
soql_limit('2000') %>%
soql_order('trip_distance DESC') %>%
as.character() -> q
read.socrata(q) -> try
lapply(try,as.numeric) -> to
as.data.frame(to) -> t
t <- t %>% rowwise() %>% mutate(meters = distHaversine(c(Pickup_longitude , Pickup_latitude),c(Dropoff_longitude, Dropoff_latitude))) %>% as.data.frame()
mean((t$meters/1609.34)/t$trip_distance) -> distance_ratio
round(distance_ratio, 2) -> d_2
print(paste(d_2,'miles are covered as the crow flies for every mile driven'))}
'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=avg(Tip_amount)&$where=Tip_amount%3E=0%20AND%20RateCodeID=2'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=avg(Tip_amount)&$where=Tip_amount%3E=0%20AND%20RateCodeID=2"
#Question_8(g_16)
"0.76 miles are covered as the crow flies for every mile driven"
## [1] "0.76 miles are covered as the crow flies for every mile driven"
Question_9 <- function(API){
soql() %>%
soql_select('avg(Tip_amount)') %>%
soql_where('Tip_amount >=0 AND RateCodeID = 2') %>%
as.character() -> qb
read.socrata(qb)-> qb
as.numeric(qb[,1])/as.numeric(qa[,1]) -> answer_9
round(answer_9,2)-> answer_9
print(paste('The avg tip from JFK: $',answer_9))
return(answer_9)}
'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=avg(Tip_amount)&$where=Tip_amount%3E=0%20AND%20RateCodeID=2'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=avg(Tip_amount)&$where=Tip_amount%3E=0%20AND%20RateCodeID=2"
#Question_9(g_16)
"The avg tip from JFK: $4.61"
## [1] "The avg tip from JFK: $4.61"
Question_10 <- function(API){
soql() %>%
soql_add_endpoint(API) %>%
soql_select('count(*)') %>%
soql_where('date_extract_m(lpep_dropoff_datetime)=3 AND Total_amount > 0') %>%
as.character() -> q
read.socrata(q)-> qa
(as.numeric(qa[,1])/2)-1 -> qa_r
round(qa_r,0)-> qa_r
soql() %>%
soql_add_endpoint(API) %>%
soql_select('Total_amount') %>%
soql_where('date_extract_m(lpep_dropoff_datetime)=3 AND Total_amount > 0') %>%
soql_limit('2') %>%
soql_order('Total_amount') %>%
soql_offset(qa_r) %>%
as.character() -> med
read.socrata(med)-> m_2
sum(as.numeric(m_2[,1]))/2 -> answer_10
print(paste0('The median revenue in March is $',answer_10))
return(answer_10)}
'https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$where=date_extract_m(lpep_dropoff_datetime)=3%20AND%20total_amount%3E=0&$select=total_amount&$order=total_amount'
## [1] "https://data.cityofnewyork.us/resource/hvrh-b6nb.json?$where=date_extract_m(lpep_dropoff_datetime)=3%20AND%20total_amount%3E=0&$select=total_amount&$order=total_amount"
#Question_10(g_16)
"The median revenue in March is $11.30"
## [1] "The median revenue in March is $11.30"
#max <- https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=max(Tip_amount),max(Fare_amount),%20date_extract_hh(lpep_dropoff_datetime)&$where=Tip_amount%3E=0&$group=date_extract_hh(lpep_dropoff_datetime)
#good <-https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=count(Tip_amount%3E(Total_amount*0.15))&$where=Tip_amount%3E(Total_amount*0.15)&$group=date_extract_hh(lpep_dropoff_datetime)'
#total <-https://data.cityofnewyork.us/resource/hvrh-b6nb.json?&$select=count(*)&$group=date_extract_hh(lpep_dropoff_datetime)
#cbind(max, good, total)-> th
## Hour count tips_15 less_15
## Min. : 1.00 Min. : 182692 Min. : 51443 Min. :130128
## 1st Qu.: 6.75 1st Qu.: 547346 1st Qu.:179670 1st Qu.:355924
## Median :12.50 Median : 674201 Median :216989 Median :453542
## Mean :12.50 Mean : 682731 Mean :222484 Mean :458178
## 3rd Qu.:18.25 3rd Qu.: 886410 3rd Qu.:300164 3rd Qu.:583442
## Max. :24.00 Max. :1117937 Max. :380416 Max. :742009
##
## avg_Tip_amount avg_Total_amount over_100 max_Tip_amount
## Min. :1.095 Min. :14.00 Min. : 4.00 Min. : 99.67
## 1st Qu.:1.188 1st Qu.:14.34 1st Qu.: 9.00 1st Qu.:244.38
## Median :1.238 Median :14.74 Median :15.00 Median :372.50
## Mean :1.264 Mean :14.88 Mean :15.65 Mean :371.23
## 3rd Qu.:1.339 3rd Qu.:14.95 3rd Qu.:20.50 3rd Qu.:450.00
## Max. :1.550 Max. :17.81 Max. :33.00 Max. :900.00
## NA's :1
## max_Total_amount at_35
## Min. : 480.0 Min. : 775
## 1st Qu.: 654.9 1st Qu.:1692
## Median : 871.3 Median :2314
## Mean :1243.3 Mean :2069
## 3rd Qu.:1124.2 3rd Qu.:2443
## Max. :5006.3 Max. :2984
##
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Localadmin\AppData\Local\Temp\RtmpMDgOKk\downloaded_packages
## package 'ggrepel' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Localadmin\AppData\Local\Temp\RtmpMDgOKk\downloaded_packages
require(ggplot2)
ggplot(th, aes(x=Hour)) + geom_point(aes(y=avg_Total_amount), color = 'steelblue3', size = 3) +
geom_point(aes(y = (avg_Total_amount + avg_Tip_amount)), size = 3, color = 'springgreen2') +
pretty + ylab('Avg Fare and Tip') +
scale_x_continuous(breaks = c(3,6,9,12,15, 18, 21, 24),labels = c('3' = '3a', '6'= '6a', '9'='9a', '12' = 'Noon', '15'='3p', '18'='6p', '21' = '9p', '24' = 'Midnight')) +
xlab('Time') + annotate(geom = 'text', label = 'Green Taxi \n2016', size = 3, color = 'grey5', x = 21.5, y = 20)
##Max Tip
ggplot(th, aes(x=Hour)) + geom_point(aes(y=max_Total_amount), color = 'steelblue3') +
geom_point(aes(y = (max_Total_amount + max_Tip_amount)), color = 'springgreen2') +
pretty + ylab('Max Fare and Tip') +
scale_x_continuous(breaks = c(3,6,9,12,15, 18, 21, 24),labels = c('3' = '3a', '6'= '6a', '9'='9a', '12' = 'Noon', '15'='3p', '18'='6p', '21' = '9p', '24' = 'Midnight')) +
xlab('Time') + annotate(geom = 'text', label = 'Green Taxi \n2016', size = 3, color = 'grey5', x = 21.5, y = 5000, family = 'Palatino Linotype')
ggplot(th, aes(Hour, th$over_100/count, label = over_100)) + geom_point(color = 'springgreen3', size = 2) + pretty+
scale_x_continuous(breaks = c(3,6,9,12,15, 18, 21, 24),labels = c('3' = '3a', '6'= '6a', '9'='9a', '12' = 'Noon', '15'='3p', '18'='6p', '21' = '9p', '24' = 'Midnight')) + xlab('Time') +
ylab('Frequency of Tips > $100.00') + annotate(geom = 'text', label = 'Green Taxi \n2016', size = 3, color = 'grey5', x = 21.5, y = .00015, family = 'Palatino Linotype') +
ggrepel::geom_text_repel(label = th$over_100, size = 3, family = 'Palatino Linotype')
### How do tips compare at the standard 15%?
ggplot(th_melt, aes(Hour, value/count)) + stat_smooth(aes(fill=variable), color = 'grey40', size = 0.1, geom='area', method = 'loess', span = 1/4, alpha = 0.4) + pretty +
scale_x_continuous(breaks = c(3,6,9,12,15, 18, 21, 24),labels = c('3' = '3a', '6'= '6a', '9'='9a', '12' = 'Noon', '15'='3p', '18'='6p', '21' = '9p', '24' = 'Midnight'), expand = c(0,0)) +
xlab('Time') + ylab('Tipping above or below 15%') +
scale_fill_manual(name = 'Tip Amount', values = c('tips_15' = "green", "less_15" = "aquamarine2",'at_35' = 'grey20'), labels = c('Above 15%', 'Below 15%', 'At 15%')) +
geom_vline(xintercept = c(6,12,19.5,24), linetype = 3, color = 'grey30', size = 0.2) + annotate(geom = 'text', label = 'Green Taxi \n2016', size = 3, color = 'grey5', x = 21.5, y = .75, family = 'Palatino Linotype')
## `geom_smooth()` using formula 'y ~ x'