First, let’s load the packages we need and rerun the most essential code from yesterday.
.libPaths("/home/rstudioshared/shared_files/packages")
library(dplyr);library(ggplot2); library(lubridate);library(geosphere)
sample_taxi_train <- read.csv("/home/rstudioshared/shared_files/data/sample_taxi_train.csv")
sample_taxi_train <- sample_taxi_train %>%
mutate(pickup_latitude = ifelse(between(pickup_latitude,40.25, 41.25),pickup_latitude,NA),
dropoff_latitude= ifelse(between(dropoff_latitude,40.25, 41.25),dropoff_latitude,NA),
pickup_longitude= ifelse(between(pickup_longitude,-74.5,-73.5),pickup_longitude,NA),
dropoff_longitude= ifelse(between(dropoff_longitude, -74.5,-73.5),dropoff_longitude,NA)
)
sample_taxi_train <- sample_taxi_train %>% mutate(pickup_datetime=ymd_hms(pickup_datetime),
year = as.factor(year(pickup_datetime)),
month = as.factor(month(pickup_datetime)),
day = as.numeric(day(pickup_datetime)),
dayOfWeek = as.factor(wday(pickup_datetime)),
hour = as.numeric(hour(pickup_datetime))
)
sample_taxi_train <- sample_taxi_train %>% mutate(CrowD = distHaversine(cbind(pickup_latitude, pickup_longitude), cbind(dropoff_latitude, dropoff_longitude),r=3959))
Now let’s add in variables signifying whether the pickup or drop off were at Laguardia Airport (or, more specifically, within 0.5 miles of the location of Laguardia.
#laguardia
lga_lat<-40.779
lga_long<--73.8740
lga <- c(lga_lat, lga_long)
sample_taxi_train <- sample_taxi_train %>%
mutate(LGA_pickup = distHaversine(cbind(pickup_latitude, pickup_longitude), lga,r=3959)<0.5,
LGA_dropoff = distHaversine(cbind(dropoff_latitude, dropoff_longitude), lga,r=3959)<0.5
)
We could add in variables for other notable locations if we want:
### airport locations
#laguardia
lga_lat<-40.779
lga_long<--73.8740
lga <- c(lga_lat, lga_long
#jfk
jfk_lat<-40.6413
jfk_long<--73.7781
jfk<-c(jfk_long, jfk_lat)
#newark
ewr_lat<-40.6895
ewr_long<--74.1745
ewr<-c(ewr_long, ewr_lat)
### notable locations
#MSG
msg_lat<-40.7505
msg_long<--73.9934
msg<-c(msg_long, msg_lat)
#times square
ts_lat<-40.7589
ts_long<--73.9851
ts<-c(ts_long, ts_lat)
#freedom tower
freedom_lat<-40.7127
freedom_long<--74.0134
freedom<-c(freedom_long, freedom_lat)
#empire state building
esb_lat<-40.7484
esb_long<--73.9857
esb<-c(esb_long, esb_lat)
#grand central
grand_lat<-40.7527
grand_long<--73.9772
grand<-c(grand_long, grand_lat)
We can also make decision trees to predict cab fares or even random forests to predict cab fares:
library(rpart); library(rpart.plot); library(randomForest)
tree <- rpart(fare_amount ~ CrowD+dayOfWeek+hour+LGA_pickup+LGA_dropoff+year+pickup_longitude+pickup_latitude, data=sample_taxi_train, cp=0.003)
prp(tree)
forest <- randomForest(fare_amount ~ CrowD+dayOfWeek+hour+LGA_pickup+LGA_dropoff+year+pickup_longitude+pickup_latitude,
data=sample_taxi_train[complete.cases(sample_taxi_train),],sampsize=5000, ntree=150)
varImpPlot(forest)
In order to make predictions on the test set, we’ll need to create all of the same variables first:
taxi_test <- read.csv("/home/rstudioshared/shared_files/data/taxi_test.csv"))
taxi_test <- taxi_test %>%
mutate(pickup_latitude = ifelse(between(pickup_latitude,40.25, 41.25),pickup_latitude,NA),
dropoff_latitude= ifelse(between(dropoff_latitude,40.25, 41.25),dropoff_latitude,NA),
pickup_longitude= ifelse(between(pickup_longitude,-74.5,-73.5),pickup_longitude,NA),
dropoff_longitude= ifelse(between(dropoff_longitude, -74.5,-73.5),dropoff_longitude,NA)
)
taxi_test <- taxi_test %>% mutate(pickup_datetime=ymd_hms(pickup_datetime),
year = as.factor(year(pickup_datetime)),
month = as.factor(month(pickup_datetime)),
day = as.numeric(day(pickup_datetime)),
dayOfWeek = as.factor(wday(pickup_datetime)),
hour = as.numeric(hour(pickup_datetime))
)
taxi_test <- taxi_test %>% mutate(CrowD = distHaversine(cbind(pickup_latitude, pickup_longitude), cbind(dropoff_latitude, dropoff_longitude),r=3959))
taxi_test <- taxi_test %>%
mutate(LGA_pickup = distHaversine(cbind(pickup_latitude, pickup_longitude),
cbind(lga_lat, lga_long),r=3959)<0.5,
LGA_dropoff = distHaversine(cbind(dropoff_latitude, dropoff_longitude),
cbind(lga_lat, lga_long),r=3959)<0.5
)
and then make our predictions:
forest.preds <- predict(forest, taxi_test)
tree.preds <- predict(tree, taxi_test)
taxi_test$fare_amount <- ifelse(is.na(forest.preds), tree.preds, forest.preds)
taxi_test %>% select(key, fare_amount) %>% write.csv("forest_and_tree_fare_preds.csv", row.names = FALSE)
Are there ways to make a better model? We might want to know something about how taxi cab fares are calculated.
Using Zillow Shape Files I added neighborhoods for pickup and drop off locations to our test and training sets. Can this information help up predict fares more accurately?
sample_taxi_train_neighbors <- read.csv("/home/rstudioshared/shared_files/data/sample_taxi_train_neighbors.csv")
taxi_test_neighbors <- read.csv("/home/rstudioshared/shared_files/data/taxi_test_neighbors.csv")