Taxi Fares: Day 3... price hikes and neighborhoods

Data Science

Loading the Data and Packages

sample_taxi_train_neighbors <- read.csv("/home/rstudioshared/shared_files/data/sample_taxi_train_neighbors.csv")
sample_taxi_train <- read.csv("/home/rstudioshared/shared_files/data/sample_taxi_train.csv")
taxi_test_neighbors <- read.csv("/home/rstudioshared/shared_files/data/taxi_test_neighbors.csv")
taxi_test_all <- read.csv("/home/rstudioshared/shared_files/data/taxi_test.csv")

.libPaths("/home/rstudioshared/shared_files/packages")
library(dplyr);library(ggplot2); library(lubridate);library(geosphere)
library(rpart); library(rpart.plot); library(randomForest)

A quick look at the data

head(sample_taxi_train_neighbors)
                            key fare_amount         pickup_datetime
1 2012-05-26 12:13:00.000000107        5.30 2012-05-26 12:13:00 UTC
2 2014-01-18 23:59:00.000000231        5.50 2014-01-18 23:59:00 UTC
3   2012-07-01 19:14:07.0000004        9.70 2012-07-01 19:14:07 UTC
4   2012-04-03 09:06:28.0000003       15.30 2012-04-03 09:06:28 UTC
5   2014-06-24 19:18:10.0000003       30.83 2014-06-24 19:18:10 UTC
6 2012-12-11 20:17:00.000000222        9.00 2012-12-11 20:17:00 UTC
  pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
1        -73.96674        40.79396         -73.96232         40.80494
2        -73.96773        40.76021         -73.97894         40.76230
3        -73.98023        40.73949         -74.00747         40.74089
4        -74.00474        40.72287         -73.98168         40.77360
5        -73.86591        40.76951         -73.97717         40.75219
6        -73.98277        40.77222         -73.97671         40.78080
  passenger_count pickup_State pickup_County pickup_City
1               1           NY      New York    New York
2               1           NY      New York    New York
3               1           NY      New York    New York
4               1           NY      New York    New York
5               1           NY        Queens    New York
6               1           NY      New York    New York
         pickup_Name pickup_RegionID dropoff_State dropoff_County
1    Upper West Side          270958            NY       New York
2       Sutton Place          270946            NY       New York
3           Gramercy          273860            NY       New York
4               SoHo          270928            NY       New York
5 La Guardia Airport           39299            NY       New York
6    Upper West Side          270958            NY       New York
  dropoff_City        dropoff_Name dropoff_RegionID
1     New York Morningside Heights           270891
2     New York             Midtown           270885
3     New York        West Village           270964
4     New York     Upper West Side           270958
5     New York             Midtown           270885
6     New York     Upper West Side           270958

Grouping by Neighborhoods

neighborhood_summary <- sample_taxi_train_neighbors %>% 
  group_by(pickup_Name, dropoff_Name) %>% 
  summarize(n = n(), meanFare = mean(fare_amount)) %>% ungroup()

Trip Summaries

neighborhood_summary %>% filter(n>=3) %>% top_n(5, meanFare)  %>% arrange(desc(meanFare))

neighborhood_summary %>% top_n(5, n) %>% arrange(desc(n))

Trip Summaries as Predictions

taxi_test <- left_join(taxi_test_neighbors, 
                       neighborhood_summary %>% filter(n>=3), 
                       by=c("pickup_Name", "dropoff_Name")) 

colnames(taxi_test)
 [1] "key"               "pickup_datetime"   "pickup_longitude" 
 [4] "pickup_latitude"   "dropoff_longitude" "dropoff_latitude" 
 [7] "passenger_count"   "pickup_State"      "pickup_County"    
[10] "pickup_City"       "pickup_Name"       "pickup_RegionID"  
[13] "dropoff_State"     "dropoff_County"    "dropoff_City"     
[16] "dropoff_Name"      "dropoff_RegionID"  "n"                
[19] "meanFare"         

Mapping them to the Full Test Set

taxi_test <- left_join(taxi_test_neighbors, 
                       neighborhood_summary %>% filter(n>=3), 
                       by=c("pickup_Name", "dropoff_Name")) 

taxi_test_all <- left_join(taxi_test_all, 
                       taxi_test %>% select(key, meanFare), 
                       by=c("key")) 

colnames(taxi_test_all)
[1] "key"               "pickup_datetime"   "pickup_longitude" 
[4] "pickup_latitude"   "dropoff_longitude" "dropoff_latitude" 
[7] "passenger_count"   "meanFare"         
summary(taxi_test_all$meanFare)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  3.900   6.537   8.399  10.850  11.700 103.800     565 

Now to combine this with a decision tree model!

We can make predictions using a decision tree or a random forest model and either use these when neighborhood information isn't available or average these with the neighborhood model.

A little background readiing

Fare Hike in Sept. 2012

Surcharge in Jan 2015

We should tell our models about changes (rather than just sticking year and month into a decision tree and hoping that it figures this out)

Coding the Changes (simple version) - training data

sample_taxi_train <- sample_taxi_train %>% mutate(pickup_datetime=ymd_hms(pickup_datetime),
                                                  year = as.factor(year(pickup_datetime)),
                                                  month = as.factor(month(pickup_datetime)),
                                                  day = as.numeric(day(pickup_datetime)),
                                                  dayOfWeek = as.factor(wday(pickup_datetime)),
                                                  hour = as.numeric(hour(pickup_datetime))
) 

sample_taxi_train <- sample_taxi_train %>% 
  mutate(rate_hike = ifelse(pickup_datetime >= "2012-09-04", 1, 0),
         surcharge = ifelse(pickup_datetime >= "2015-01-01", 1, 0)) 

Same thing with the test data

taxi_test_all <- taxi_test_all %>% mutate(pickup_datetime=ymd_hms(pickup_datetime),
                                                  year = as.factor(year(pickup_datetime)),
                                                  month = as.factor(month(pickup_datetime)),
                                                  day = as.numeric(day(pickup_datetime)),
                                                  dayOfWeek = as.factor(wday(pickup_datetime)),
                                                  hour = as.numeric(hour(pickup_datetime))
) 

taxi_test_all <- taxi_test_all %>% 
  mutate(rate_hike = ifelse(pickup_datetime >= "2012-09-04", 1, 0),
         surcharge = ifelse(pickup_datetime >= "2015-01-01", 1, 0)) 

Adding Distances (part 1)

sample_taxi_train <- sample_taxi_train %>% 
  mutate(pickup_latitude = ifelse(between(pickup_latitude,40.25, 41.25),pickup_latitude,NA),
         dropoff_latitude= ifelse(between(dropoff_latitude,40.25, 41.25),dropoff_latitude,NA),
         pickup_longitude= ifelse(between(pickup_longitude,-74.5,-73.5),pickup_longitude,NA),
         dropoff_longitude= ifelse(between(dropoff_longitude, -74.5,-73.5),dropoff_longitude,NA)
  ) 

taxi_test_all <- taxi_test_all %>% 
  mutate(pickup_latitude = ifelse(between(pickup_latitude,40.25, 41.25),pickup_latitude,NA),
         dropoff_latitude= ifelse(between(dropoff_latitude,40.25, 41.25),dropoff_latitude,NA),
         pickup_longitude= ifelse(between(pickup_longitude,-74.5,-73.5),pickup_longitude,NA),
         dropoff_longitude= ifelse(between(dropoff_longitude, -74.5,-73.5),dropoff_longitude,NA)
  ) 

Adding Distances (part II)

sample_taxi_train <- sample_taxi_train %>% 
  mutate(CrowD = distHaversine(cbind(pickup_latitude, pickup_longitude), cbind(dropoff_latitude, dropoff_longitude),r=3959))

taxi_test_all <- taxi_test_all %>% 
  mutate(CrowD = distHaversine(cbind(pickup_latitude, pickup_longitude), cbind(dropoff_latitude, dropoff_longitude),r=3959))

A Tree

tree <- rpart(fare_amount ~ rate_hike + surcharge + CrowD, 
              data=sample_taxi_train, cp=0.003)
prp(tree)

plot of chunk unnamed-chunk-12

Predictions!

taxi_test_all$tree.preds <- predict(tree, taxi_test_all)

taxi_test_all <- taxi_test_all %>% 
  mutate(fare_amount = ifelse(is.na(meanFare), tree.preds, 0.70*meanFare + 0.30*tree.preds))

taxi_test_all %>% 
  select(key, fare_amount) %>% 
  write.csv("combo_predictions.csv", row.names=FALSE)