Yesterday’s Code

First, let’s load the packages we need and rerun the most essential code from yesterday.

.libPaths("/home/rstudioshared/shared_files/packages")
library(dplyr);library(ggplot2); library(lubridate);library(geosphere)

sample_taxi_train <- read.csv("/home/rstudioshared/shared_files/data/sample_taxi_train.csv")

sample_taxi_train <- sample_taxi_train %>% 
  mutate(pickup_latitude = ifelse(between(pickup_latitude,40.25, 41.25),pickup_latitude,NA),
         dropoff_latitude= ifelse(between(dropoff_latitude,40.25, 41.25),dropoff_latitude,NA),
         pickup_longitude= ifelse(between(pickup_longitude,-74.5,-73.5),pickup_longitude,NA),
         dropoff_longitude= ifelse(between(dropoff_longitude, -74.5,-73.5),dropoff_longitude,NA)
  ) 

sample_taxi_train <- sample_taxi_train %>% mutate(pickup_datetime=ymd_hms(pickup_datetime),
                                                  year = as.factor(year(pickup_datetime)),
                                                  month = as.factor(month(pickup_datetime)),
                                                  day = as.numeric(day(pickup_datetime)),
                                                  dayOfWeek = as.factor(wday(pickup_datetime)),
                                                  hour = as.numeric(hour(pickup_datetime))
) 

sample_taxi_train <- sample_taxi_train %>% mutate(CrowD = distHaversine(cbind(pickup_latitude, pickup_longitude), cbind(dropoff_latitude, dropoff_longitude),r=3959))

Locations

Now let’s add in variables signifying whether the pickup or drop off were at Laguardia Airport (or, more specifically, within 0.5 miles of the location of Laguardia.

#laguardia
lga_lat<-40.779
lga_long<--73.8740
lga <- c(lga_lat, lga_long)

sample_taxi_train <- sample_taxi_train %>% 
  mutate(LGA_pickup = distHaversine(cbind(pickup_latitude, pickup_longitude), lga,r=3959)<0.5,
         LGA_dropoff = distHaversine(cbind(dropoff_latitude, dropoff_longitude), lga,r=3959)<0.5
         )

We could add in variables for other notable locations if we want:

### airport locations
#laguardia
lga_lat<-40.779
lga_long<--73.8740
lga <- c(lga_lat, lga_long
#jfk
jfk_lat<-40.6413
jfk_long<--73.7781
jfk<-c(jfk_long, jfk_lat)
#newark
ewr_lat<-40.6895
ewr_long<--74.1745
ewr<-c(ewr_long, ewr_lat)

### notable locations
#MSG
msg_lat<-40.7505
msg_long<--73.9934
msg<-c(msg_long, msg_lat)
#times square
ts_lat<-40.7589
ts_long<--73.9851
ts<-c(ts_long, ts_lat)
#freedom tower
freedom_lat<-40.7127
freedom_long<--74.0134
freedom<-c(freedom_long, freedom_lat)
#empire state building
esb_lat<-40.7484
esb_long<--73.9857
esb<-c(esb_long, esb_lat)
#grand central
grand_lat<-40.7527
grand_long<--73.9772
grand<-c(grand_long, grand_lat)

Trees and Forests

We can also make decision trees to predict cab fares or even random forests to predict cab fares:

library(rpart); library(rpart.plot); library(randomForest)

tree <- rpart(fare_amount ~ CrowD+dayOfWeek+hour+LGA_pickup+LGA_dropoff+year+pickup_longitude+pickup_latitude, data=sample_taxi_train, cp=0.003)
prp(tree)

forest <- randomForest(fare_amount ~ CrowD+dayOfWeek+hour+LGA_pickup+LGA_dropoff+year+pickup_longitude+pickup_latitude, 
                       data=sample_taxi_train[complete.cases(sample_taxi_train),],sampsize=5000,  ntree=150)
varImpPlot(forest)

The Test Set

In order to make predictions on the test set, we’ll need to create all of the same variables first:

taxi_test <- read.csv("/home/rstudioshared/shared_files/data/taxi_test.csv"))

taxi_test <- taxi_test %>% 
  mutate(pickup_latitude = ifelse(between(pickup_latitude,40.25, 41.25),pickup_latitude,NA),
         dropoff_latitude= ifelse(between(dropoff_latitude,40.25, 41.25),dropoff_latitude,NA),
         pickup_longitude= ifelse(between(pickup_longitude,-74.5,-73.5),pickup_longitude,NA),
         dropoff_longitude= ifelse(between(dropoff_longitude, -74.5,-73.5),dropoff_longitude,NA)
  ) 


taxi_test <- taxi_test %>% mutate(pickup_datetime=ymd_hms(pickup_datetime),
                                                  year = as.factor(year(pickup_datetime)),
                                                  month = as.factor(month(pickup_datetime)),
                                                  day = as.numeric(day(pickup_datetime)),
                                                  dayOfWeek = as.factor(wday(pickup_datetime)),
                                                  hour = as.numeric(hour(pickup_datetime))
) 


taxi_test <- taxi_test %>% mutate(CrowD = distHaversine(cbind(pickup_latitude, pickup_longitude), cbind(dropoff_latitude, dropoff_longitude),r=3959))

taxi_test <- taxi_test %>% 
  mutate(LGA_pickup = distHaversine(cbind(pickup_latitude, pickup_longitude), 
                                    cbind(lga_lat, lga_long),r=3959)<0.5,
         LGA_dropoff = distHaversine(cbind(dropoff_latitude, dropoff_longitude), 
                                     cbind(lga_lat, lga_long),r=3959)<0.5
  )

and then make our predictions:

forest.preds <- predict(forest, taxi_test)
tree.preds <- predict(tree, taxi_test)
taxi_test$fare_amount <- ifelse(is.na(forest.preds), tree.preds, forest.preds)
taxi_test %>% select(key, fare_amount) %>% write.csv("forest_and_tree_fare_preds.csv", row.names = FALSE)

Are there ways to make a better model? We might want to know something about how taxi cab fares are calculated.

Current Rates

Historical Rates

Looking Forward

Using Zillow Shape Files I added neighborhoods for pickup and drop off locations to our test and training sets. Can this information help up predict fares more accurately?

sample_taxi_train_neighbors <- read.csv("/home/rstudioshared/shared_files/data/sample_taxi_train_neighbors.csv")
taxi_test_neighbors <- read.csv("/home/rstudioshared/shared_files/data/taxi_test_neighbors.csv")