Data Science
sample_taxi_train_neighbors <- read.csv("/home/rstudioshared/shared_files/data/sample_taxi_train_neighbors.csv")
sample_taxi_train <- read.csv("/home/rstudioshared/shared_files/data/sample_taxi_train.csv")
taxi_test_neighbors <- read.csv("/home/rstudioshared/shared_files/data/taxi_test_neighbors.csv")
taxi_test_all <- read.csv("/home/rstudioshared/shared_files/data/taxi_test.csv")
.libPaths("/home/rstudioshared/shared_files/packages")
library(dplyr);library(ggplot2); library(lubridate);library(geosphere)
library(rpart); library(rpart.plot); library(randomForest)
head(sample_taxi_train_neighbors)
key fare_amount pickup_datetime
1 2012-05-26 12:13:00.000000107 5.30 2012-05-26 12:13:00 UTC
2 2014-01-18 23:59:00.000000231 5.50 2014-01-18 23:59:00 UTC
3 2012-07-01 19:14:07.0000004 9.70 2012-07-01 19:14:07 UTC
4 2012-04-03 09:06:28.0000003 15.30 2012-04-03 09:06:28 UTC
5 2014-06-24 19:18:10.0000003 30.83 2014-06-24 19:18:10 UTC
6 2012-12-11 20:17:00.000000222 9.00 2012-12-11 20:17:00 UTC
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
1 -73.96674 40.79396 -73.96232 40.80494
2 -73.96773 40.76021 -73.97894 40.76230
3 -73.98023 40.73949 -74.00747 40.74089
4 -74.00474 40.72287 -73.98168 40.77360
5 -73.86591 40.76951 -73.97717 40.75219
6 -73.98277 40.77222 -73.97671 40.78080
passenger_count pickup_State pickup_County pickup_City
1 1 NY New York New York
2 1 NY New York New York
3 1 NY New York New York
4 1 NY New York New York
5 1 NY Queens New York
6 1 NY New York New York
pickup_Name pickup_RegionID dropoff_State dropoff_County
1 Upper West Side 270958 NY New York
2 Sutton Place 270946 NY New York
3 Gramercy 273860 NY New York
4 SoHo 270928 NY New York
5 La Guardia Airport 39299 NY New York
6 Upper West Side 270958 NY New York
dropoff_City dropoff_Name dropoff_RegionID
1 New York Morningside Heights 270891
2 New York Midtown 270885
3 New York West Village 270964
4 New York Upper West Side 270958
5 New York Midtown 270885
6 New York Upper West Side 270958
neighborhood_summary <- sample_taxi_train_neighbors %>%
group_by(pickup_Name, dropoff_Name) %>%
summarize(n = n(), meanFare = mean(fare_amount)) %>% ungroup()
neighborhood_summary %>% filter(n>=3) %>% top_n(5, meanFare) %>% arrange(desc(meanFare))
neighborhood_summary %>% top_n(5, n) %>% arrange(desc(n))
taxi_test <- left_join(taxi_test_neighbors,
neighborhood_summary %>% filter(n>=3),
by=c("pickup_Name", "dropoff_Name"))
colnames(taxi_test)
[1] "key" "pickup_datetime" "pickup_longitude"
[4] "pickup_latitude" "dropoff_longitude" "dropoff_latitude"
[7] "passenger_count" "pickup_State" "pickup_County"
[10] "pickup_City" "pickup_Name" "pickup_RegionID"
[13] "dropoff_State" "dropoff_County" "dropoff_City"
[16] "dropoff_Name" "dropoff_RegionID" "n"
[19] "meanFare"
taxi_test <- left_join(taxi_test_neighbors,
neighborhood_summary %>% filter(n>=3),
by=c("pickup_Name", "dropoff_Name"))
taxi_test_all <- left_join(taxi_test_all,
taxi_test %>% select(key, meanFare),
by=c("key"))
colnames(taxi_test_all)
[1] "key" "pickup_datetime" "pickup_longitude"
[4] "pickup_latitude" "dropoff_longitude" "dropoff_latitude"
[7] "passenger_count" "meanFare"
summary(taxi_test_all$meanFare)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
3.900 6.537 8.399 10.850 11.700 103.800 565
We can make predictions using a decision tree or a random forest model and either use these when neighborhood information isn't available or average these with the neighborhood model.
We should tell our models about changes (rather than just sticking year and month into a decision tree and hoping that it figures this out)
sample_taxi_train <- sample_taxi_train %>% mutate(pickup_datetime=ymd_hms(pickup_datetime),
year = as.factor(year(pickup_datetime)),
month = as.factor(month(pickup_datetime)),
day = as.numeric(day(pickup_datetime)),
dayOfWeek = as.factor(wday(pickup_datetime)),
hour = as.numeric(hour(pickup_datetime))
)
sample_taxi_train <- sample_taxi_train %>%
mutate(rate_hike = ifelse(pickup_datetime >= "2012-09-04", 1, 0),
surcharge = ifelse(pickup_datetime >= "2015-01-01", 1, 0))
taxi_test_all <- taxi_test_all %>% mutate(pickup_datetime=ymd_hms(pickup_datetime),
year = as.factor(year(pickup_datetime)),
month = as.factor(month(pickup_datetime)),
day = as.numeric(day(pickup_datetime)),
dayOfWeek = as.factor(wday(pickup_datetime)),
hour = as.numeric(hour(pickup_datetime))
)
taxi_test_all <- taxi_test_all %>%
mutate(rate_hike = ifelse(pickup_datetime >= "2012-09-04", 1, 0),
surcharge = ifelse(pickup_datetime >= "2015-01-01", 1, 0))
sample_taxi_train <- sample_taxi_train %>%
mutate(pickup_latitude = ifelse(between(pickup_latitude,40.25, 41.25),pickup_latitude,NA),
dropoff_latitude= ifelse(between(dropoff_latitude,40.25, 41.25),dropoff_latitude,NA),
pickup_longitude= ifelse(between(pickup_longitude,-74.5,-73.5),pickup_longitude,NA),
dropoff_longitude= ifelse(between(dropoff_longitude, -74.5,-73.5),dropoff_longitude,NA)
)
taxi_test_all <- taxi_test_all %>%
mutate(pickup_latitude = ifelse(between(pickup_latitude,40.25, 41.25),pickup_latitude,NA),
dropoff_latitude= ifelse(between(dropoff_latitude,40.25, 41.25),dropoff_latitude,NA),
pickup_longitude= ifelse(between(pickup_longitude,-74.5,-73.5),pickup_longitude,NA),
dropoff_longitude= ifelse(between(dropoff_longitude, -74.5,-73.5),dropoff_longitude,NA)
)
sample_taxi_train <- sample_taxi_train %>%
mutate(CrowD = distHaversine(cbind(pickup_latitude, pickup_longitude), cbind(dropoff_latitude, dropoff_longitude),r=3959))
taxi_test_all <- taxi_test_all %>%
mutate(CrowD = distHaversine(cbind(pickup_latitude, pickup_longitude), cbind(dropoff_latitude, dropoff_longitude),r=3959))
tree <- rpart(fare_amount ~ rate_hike + surcharge + CrowD,
data=sample_taxi_train, cp=0.003)
prp(tree)
taxi_test_all$tree.preds <- predict(tree, taxi_test_all)
taxi_test_all <- taxi_test_all %>%
mutate(fare_amount = ifelse(is.na(meanFare), tree.preds, 0.70*meanFare + 0.30*tree.preds))
taxi_test_all %>%
select(key, fare_amount) %>%
write.csv("combo_predictions.csv", row.names=FALSE)