NYC taxi data analysis
# loading the necessary libraries
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(lubridate)
library(geosphere)
library(corrplot)
library(caret)
library(ggmap)
library(randomForest)
# setting seed for repeatability
set.seed(101)Taking a random sample of 50k observations after combining the two datasets.
fare_df <- read.csv("data/trip_fare_4.csv", header=TRUE)
data_df <- read.csv("data/trip_data_4.csv", header=TRUE)
combine_df <- fare_df %>%
inner_join(data_df)
combine_df_sample <- combine_df[sample(1:nrow(combine_df), 50000, replace =FALSE),]
write.csv(combine_df_sample, "data/combine_df_sample.csv")combine_df <- read.csv("data/combine_df_sample.csv", header=TRUE)
drop <- c("X")
combine_df <- combine_df %>%
select(-one_of(drop))
rm(drop)str(combine_df)## 'data.frame': 50000 obs. of 21 variables:
## $ medallion : Factor w/ 12715 levels "00005007A9F30E289E760362F69E4EAD",..: 2531 2517 3069 1414 9973 12361 4932 3587 987 12604 ...
## $ hack_license : Factor w/ 23742 levels "0002555BBE359440D6CEB34B699D3932",..: 12783 1100 14803 21257 17452 1425 11576 15964 16740 20776 ...
## $ vendor_id : Factor w/ 2 levels "CMT","VTS": 1 1 2 1 1 1 1 1 1 2 ...
## $ pickup_datetime : Factor w/ 41233 levels "2013-04-01 00:00:24",..: 26495 18870 41187 27920 24245 11270 20426 330 33011 35563 ...
## $ payment_type : Factor w/ 5 levels "CRD","CSH","DIS",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ fare_amount : num 6 52 80 3.5 10 10.5 27 6 28.5 9.5 ...
## $ surcharge : num 0.5 0 0 0.5 0 0 1 0 1 1 ...
## $ mta_tax : num 0.5 0.5 0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ tip_amount : num 1 6 0 0 0 0 0 0 0 0 ...
## $ tolls_amount : num 0 5.33 0 0 0 0 0 0 5.33 0 ...
## $ total_amount : num 8 63.8 80 4.5 10.5 ...
## $ rate_code : int 1 2 5 1 1 1 1 1 1 1 ...
## $ store_and_fwd_flag: Factor w/ 3 levels "","N","Y": 2 2 1 2 2 2 2 2 2 1 ...
## $ dropoff_datetime : Factor w/ 41241 levels "2013-04-01 00:05:12",..: 26434 18825 41215 27854 24177 11254 20402 324 32968 35508 ...
## $ passenger_count : int 2 3 1 3 1 1 1 1 1 1 ...
## $ trip_time_in_secs : int 231 1688 2160 94 847 629 2467 326 1428 720 ...
## $ trip_distance : num 1.1 19.5 13.1 0.5 1.1 ...
## $ pickup_longitude : num -74 -73.8 -74 -74 -74 ...
## $ pickup_latitude : num 40.8 40.6 40.7 40.8 40.7 ...
## $ dropoff_longitude : num -74 -74 -74 -74 -74 ...
## $ dropoff_latitude : num 40.8 40.8 40.9 40.8 40.7 ...
sum(is.na(combine_df))## [1] 4
4 null values in random sample. Now to find which columns have the null values
colnames(combine_df)[colSums(is.na(combine_df)) > 0]## [1] "dropoff_longitude" "dropoff_latitude"
If dropoff_longitude is null or dropoff_latitude is null then must be a system error. Will remove from analysis
combine_df <- combine_df[complete.cases(combine_df), ]combine_df %>%
count(passenger_count)## # A tibble: 6 x 2
## passenger_count n
## <int> <int>
## 1 1 35517
## 2 2 6470
## 3 3 2037
## 4 4 991
## 5 5 2997
## 6 6 1986
ggplot(combine_df, aes(combine_df$passenger_count)) +
geom_histogram(binwidth = 1) +
labs(title = "Histogram of Passenger Count",
x = "Number of Passengers",
y = "Total Count")combine_df %>%
group_by(payment_type) %>%
summarise(count = n() )## # A tibble: 5 x 2
## payment_type count
## <fct> <int>
## 1 CRD 26842
## 2 CSH 22996
## 3 DIS 34
## 4 NOC 102
## 5 UNK 24
Will plot the relative percentage of each payment type to make results easier to interpret.
combine_df %>%
group_by(payment_type) %>%
summarise(count = n() ) %>%
mutate( percentage = count * 100 / sum(count) ) %>%
ggplot( aes(x = payment_type, y = percentage, fill = payment_type)) +
geom_bar(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Payment Type",
x = "Payment Type",
y = "Percentage")With some data dictionary found online, assuming the following payment_types
CRD = credit card
CSH = cash
DIS = dispute
NOC = no charge
UNK = unknown
DIS and NOC will have to be treated for modelling when predicting fare and tips.
Approach will be to remove NOC from the dataset since these are "free rides", perhaps as personal favours
DIS will remain in the dataset. Charged the customers, just disputing maybe because of the longer route taken to the destination.
combine_df %>%
group_by(fare_amount) %>%
summarise(count = n() )## # A tibble: 183 x 2
## fare_amount count
## <dbl> <int>
## 1 2.50 166
## 2 3.00 189
## 3 3.50 651
## 4 4.00 1204
## 5 4.50 1748
## 6 5.00 2151
## 7 5.50 2417
## 8 5.55 1
## 9 6.00 2544
## 10 6.50 2536
## # ... with 173 more rows
From looking at the count, seems like most fares are rounded to the nearest 50 cents. Definitely scattered in between where customers are charged outside of this rule, but for ease of readability, will round to the nearest 0.5
combine_df %>%
mutate(fare_rounded = round(fare_amount/0.5)*0.5) %>%
group_by(fare_rounded) %>%
summarise(count = n() )## # A tibble: 176 x 2
## fare_rounded count
## <dbl> <int>
## 1 2.50 166
## 2 3.00 189
## 3 3.50 651
## 4 4.00 1204
## 5 4.50 1748
## 6 5.00 2151
## 7 5.50 2418
## 8 6.00 2544
## 9 6.50 2536
## 10 7.00 2488
## # ... with 166 more rows
combine_df %>%
mutate(fare_rounded = round(fare_amount/0.5)*0.5) %>%
group_by(fare_rounded) %>%
summarise(count = n() ) %>%
ggplot( aes(x = fare_rounded, y = count, fill = fare_rounded)) +
geom_bar(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Fare Amount",
x = "Fare Amount",
y = "Total Count")From the graph above, seems to have some outliers that affect how we see the distribution. Will remove outliers, defined as being > 3 standard deviations
combine_df %>%
mutate(fare_rounded = round(fare_amount/0.5)*0.5) %>%
group_by(fare_rounded) %>%
summarise(count = n() ) %>%
filter( abs(fare_rounded - mean(fare_rounded)) / sd(fare_rounded) <= 3 ) %>%
ggplot( aes(x = fare_rounded, y = count, fill = fare_rounded)) +
geom_bar(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Fare Amount",
x = "Fare Amount",
y = "Total Count")Still many values in the larger fare spectrum. But from the on closer inspection we can still see the peaks in fares from $5 to $7.50.
Also note the interesting spike at $52.
combine_df %>%
group_by(tip_amount) %>%
summarise(count = n() )## # A tibble: 489 x 2
## tip_amount count
## <dbl> <int>
## 1 0. 23923
## 2 0.0100 17
## 3 0.0200 6
## 4 0.0300 3
## 5 0.0400 1
## 6 0.0500 1
## 7 0.0800 5
## 8 0.100 11
## 9 0.110 2
## 10 0.150 1
## # ... with 479 more rows
combine_df %>%
group_by(tip_amount) %>%
summarise(count = n() ) %>%
ggplot( aes(x = tip_amount, y = count, fill = tip_amount)) +
geom_line(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Tip Amount",
x = "Tip Amount",
y = "Total Count")The above graph does not have much use. Can see a large portion of trips have 0 tips. Will first remove when tips = 0
combine_df %>%
group_by(tip_amount) %>%
filter(tip_amount != 0 ) %>%
summarise(count = n() ) %>%
filter( abs(tip_amount - mean(tip_amount)) / sd(tip_amount) <= 2) %>%
ggplot( aes(x = tip_amount, y = count, fill = tip_amount)) +
geom_line(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Tip Amount",
x = "Tip Amount",
y = "Total Count")Even after removing the 0 tips, the distribution does not give much information. Can see a lot are small tips. However, "small tips" may not be small relative to the fare amount.
Will plot distribution of tip amount, but now as a percentage of the fare amount, this time still removing trips with no tips
combine_df %>%
filter(tip_amount != 0 ) %>%
mutate(tip_percent = round((tip_amount * 100 / fare_amount),0)) %>%
group_by(tip_percent) %>%
summarise(count = n() ) %>%
ggplot( aes(x = tip_percent, y = count, fill = tip_percent)) +
geom_line(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Tip Percentage",
x = "Tip Percentage",
y = "Total Count")Unexpected tip percentages seen in the above graph. When we see a summary of the tip percentage in quartiles, most seem to sit in normal amounts, but some are recording extremely high values
tip_test <- combine_df %>%
select(fare_amount, tip_amount) %>%
filter(tip_amount != 0 ) %>%
mutate(tip_percent = round((tip_amount * 100 / fare_amount),0))
summary(tip_test$tip_percent)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 17.00 21.00 20.53 23.00 1558.00
Investigating tip percentages > 100%
tip_test %>%
filter(tip_percent > 100) %>%
group_by(fare_amount) %>%
summarise(count = n() ) %>%
ggplot( aes(x = fare_amount, y = count, fill = fare_amount)) +
geom_line(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Base Fare Amount for Tip Percentage > 100",
x = "Fare Amount",
y = "Total Count")Very low counts for trips with high tip fee percentages
Will now re-plot the distribution of tips as a percentage of fare amount, but only for tips which are less than 100% of the fare amount to zoom in on the peak observed in the above graph.
combine_df %>%
filter(tip_amount != 0 ) %>%
mutate(tip_percent = round((tip_amount * 100 / fare_amount),0)) %>%
filter(tip_percent < 100) %>%
group_by(tip_percent) %>%
summarise(count = n() ) %>%
ggplot( aes(x = tip_percent, y = count, fill = tip_percent)) +
geom_line(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Tip Percentage for < 100%",
x = "Tip Percentage",
y = "Total Count")For modelling exercise, will remove these huge outliers of greater than 100% tips. Under the assumption that this is incorrect data.
combine_df <- combine_df %>%
mutate(tip_percent = round((tip_amount * 100 / fare_amount),0)) %>%
filter(tip_percent < 100)Since fare amount had a round to nearest 50 cents, will also round to the nearest 50 cents when looking at the distribution of the total amount
combine_df %>%
mutate(total_amount_rounded = round(total_amount/0.5)*0.5) %>%
group_by(total_amount_rounded) %>%
summarise(count = n() ) ## # A tibble: 212 x 2
## total_amount_rounded count
## <dbl> <int>
## 1 3.00 70
## 2 3.50 140
## 3 4.00 328
## 4 4.50 581
## 5 5.00 964
## 6 5.50 1362
## 7 6.00 1784
## 8 6.50 1858
## 9 7.00 2236
## 10 7.50 2013
## # ... with 202 more rows
combine_df %>%
mutate(total_amount_rounded = round(total_amount/0.5)*0.5) %>%
group_by(total_amount_rounded) %>%
summarise(count = n() ) %>%
ggplot( aes(x = total_amount_rounded, y = count, fill = total_amount_rounded)) +
geom_line(stat="identity") +
theme(legend.position = "none") +
labs(title = "Distribution of Total Amount",
x = "Total Amount",
y = "Trip Count")As expected, has a similar distribution as the above $ amounts. Will limit the x axis to $100 to zoom in on the non-flat line part
combine_df %>%
mutate(total_amount_rounded = round(total_amount/0.5)*0.5) %>%
group_by(total_amount_rounded) %>%
summarise(count = n() ) %>%
ggplot( aes(x = total_amount_rounded, y = count, fill = total_amount_rounded)) +
geom_line(stat="identity") +
theme(legend.position = "none") +
scale_x_continuous(limits = c(0, 100)) +
labs(title = "Distribution of Total Amount <$100",
x = "Total Amount",
y = "Trip Count")combine_df %>%
mutate(total_amount_rounded = round(total_amount/0.5)*0.5) %>%
summarise(median = median(total_amount_rounded), mean = mean(total_amount_rounded))## median mean
## 1 11 14.58555
Will need to convert to date-timestamp and extract the hour
combine_df %>%
mutate(pickup_ts = hour(pickup_datetime)) %>%
group_by(pickup_ts) %>%
summarise(count = n() ) %>%
top_n(5, count)## # A tibble: 5 x 2
## pickup_ts count
## <int> <int>
## 1 18 3065
## 2 19 3183
## 3 20 2954
## 4 21 2925
## 5 22 2817
From the result above, we can see the 5 busiest hours of the day ranked in order are: 1. 6pm 2. 7pm 3. 8pm 4. 9pm 5. 10pm
Will be using the latitude and longitude values provided in the data. However, errors can be found, as seen in the section below. Parts where the latitude/longitude values are too far off the central longitude and latitude of New York City.
Using: NYC Longitude = -73.935242, Latitude = 40.730610 https://www.latlong.net/place/new-york-city-ny-usa-1848.html
# when looking at the map, +/- 2 is a large distance that covers a huge area
longitude_upper = -73.935242 + 2
longitude_lower = -73.935242 - 2
latitude_upper = 40.730610 + 2
latitude_lower = 40.730610 - 2
combine_df %>%
mutate(longitude_rounded = round(pickup_longitude,0)) %>%
mutate(latitude_rounded = round(pickup_latitude,0)) %>%
filter(!(between(pickup_longitude, longitude_lower, longitude_upper))| !(between(pickup_latitude, latitude_lower, latitude_upper))) %>%
group_by(longitude_rounded,latitude_rounded) %>%
summarise(count = n() )## # A tibble: 8 x 3
## # Groups: longitude_rounded [?]
## longitude_rounded latitude_rounded count
## <dbl> <dbl> <int>
## 1 -74. 0. 2
## 2 -73. 35. 1
## 3 -71. 41. 1
## 4 -69. 51. 1
## 5 -65. 39. 1
## 6 0. 0. 802
## 7 0. 41. 8
## 8 41. -74. 1
Error assessment: Total of 817 observations with poor data. This makes up approximately 1.6% of the observations. Will exclude in this distribution and modelling process.
Rounding the latitude and longitude values to get "city locations"
# removing the bad coordinate data
combine_df <- combine_df %>%
filter(between(pickup_longitude, longitude_lower, longitude_upper), between(pickup_latitude, latitude_lower, latitude_upper))
pickup_df <- combine_df %>%
mutate(longitude_rounded = round(pickup_longitude,4)) %>%
mutate(latitude_rounded = round(pickup_latitude,4)) %>%
filter(!(longitude_rounded == 0), !(latitude_rounded == 0), !(abs(longitude_rounded) > 180), !(abs(latitude_rounded) > 180) ) %>%
mutate(location = paste(longitude_rounded, latitude_rounded))
pickup_df %>%
select(location) %>%
group_by(location) %>%
summarise(count = n() ) %>%
top_n(10, count)## # A tibble: 10 x 2
## location count
## <chr> <int>
## 1 -73.7767 40.6453 18
## 2 -73.7767 40.6454 22
## 3 -73.8628 40.7688 24
## 4 -73.8709 40.7737 25
## 5 -73.8709 40.7738 19
## 6 -73.8746 40.7741 25
## 7 -73.994 40.7513 24
## 8 -73.9941 40.7511 17
## 9 -73.9941 40.7512 39
## 10 -74.023 40.766 22
4 decimal places seems a little too specific to a particular area. Tweaking to just 2 decimal places aggregates the numbers at a more reasonable scale.
pickup_df2 <- combine_df %>%
mutate(longitude_rounded = round(pickup_longitude,2)) %>%
mutate(latitude_rounded = round(pickup_latitude,2)) %>%
filter(!(longitude_rounded == 0), !(latitude_rounded == 0), !(abs(longitude_rounded) > 180), !(abs(latitude_rounded) > 180) ) %>%
mutate(location = paste(longitude_rounded, latitude_rounded))
pickup_df2 %>%
select(location) %>%
group_by(location) %>%
summarise(count = n() ) %>%
top_n(10, count)## # A tibble: 10 x 2
## location count
## <chr> <int>
## 1 -73.96 40.77 1759
## 2 -73.97 40.76 3069
## 3 -73.98 40.75 2501
## 4 -73.98 40.76 2487
## 5 -73.98 40.77 1573
## 6 -73.99 40.73 1827
## 7 -73.99 40.74 2270
## 8 -73.99 40.75 2753
## 9 -73.99 40.76 2164
## 10 -74 40.74 1560
Going to define a "trip" type based on the pickup and dropoff latitude and longitude, rounded to a similar 2 decimal places as part (g)
# list of columns needed
trip_sd <- combine_df %>%
select(pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude, trip_time_in_secs) %>%
mutate(pickup_longitude_rounded = round(pickup_longitude,2)) %>%
mutate(pickup_latitude_rounded = round(pickup_latitude,2)) %>%
mutate(dropoff_longitude_rounded = round(dropoff_longitude,2)) %>%
mutate(dropoff_latitude_rounded = round(dropoff_latitude,2)) %>%
mutate(trip_time_in_min = trip_time_in_secs/60) %>%
mutate(trip_coord = paste(pickup_longitude_rounded,pickup_latitude_rounded,dropoff_longitude_rounded,dropoff_latitude_rounded))
trip_sd %>%
select(trip_coord,trip_time_in_min)%>%
group_by(trip_coord) %>%
summarise(sd = sd(trip_time_in_min)) %>%
top_n(1,sd)## # A tibble: 1 x 2
## trip_coord sd
## <chr> <dbl>
## 1 -73.86 40.77 -73.95 40.81 42.3
Going to define a "trip" type based on the pickup and dropoff latitude and longitude, rounded to a similar 2 decimal places as part (g)
# list of columns needed
trip_sd <- combine_df %>%
select(pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude, fare_amount) %>%
mutate(pickup_longitude_rounded = round(pickup_longitude,2)) %>%
mutate(pickup_latitude_rounded = round(pickup_latitude,2)) %>%
mutate(dropoff_longitude_rounded = round(dropoff_longitude,2)) %>%
mutate(dropoff_latitude_rounded = round(dropoff_latitude,2)) %>%
mutate(trip_coord = paste(pickup_longitude_rounded,pickup_latitude_rounded,dropoff_longitude_rounded,dropoff_latitude_rounded))
low_number_trips <- trip_sd %>%
select(trip_coord) %>%
group_by(trip_coord) %>%
summarise(count = n()) %>%
filter(count < 100)
remove_trips <- low_number_trips[,1]
trip_sd %>%
select(trip_coord,fare_amount)%>%
filter(!(trip_coord %in% remove_trips)) %>%
group_by(trip_coord) %>%
summarise(sd = sd(fare_amount)) %>%
top_n(-1,sd)## # A tibble: 188 x 2
## trip_coord sd
## <chr> <dbl>
## 1 -73.78 40.64 -73.96 40.76 0.
## 2 -73.78 40.64 -73.97 40.76 0.
## 3 -73.78 40.64 -73.98 40.75 0.
## 4 -73.78 40.64 -73.98 40.78 0.
## 5 -73.78 40.64 -73.99 40.72 0.
## 6 -73.78 40.64 -73.99 40.74 0.
## 7 -73.78 40.64 -73.99 40.75 0.
## 8 -73.78 40.64 -73.99 40.76 0.
## 9 -73.78 40.64 -74 40.72 0.
## 10 -73.78 40.64 -74 40.75 0.
## # ... with 178 more rows
unsure
From the question, going to assume this means building a model based on the information we have from the data, specifically only looking at the following columns:
fare_amount
tip_amount
pickup_datetime
pickup_longitude
pickup_latitude
Since the above will be information that the driver will have prior to starting the trip
Unable to create a month and year feature since all data in the dataset were from the same month and year.
model_combine_df <- combine_df %>%
mutate(pickup_hour = as.factor(hour(pickup_datetime))) %>%
mutate(pickup_wday = as.factor(wday(pickup_datetime, label=TRUE)))note: Year and Month are the same, so will not extract that detail
Important assumption: Will the taxi driver know the destination co-ordinates before accepting a customer? Working off the assumption that they will not know, and the only information that will be available to the driver will be the pickup location of the customer based on where they choose to drive their vechicle
will also be taking the total of fare_amount and tip_amount. working off the assumption that the taxi driver does not care how the amount is distributed. (i.e. if the money comes from tips or the fare)
model_df <- model_combine_df %>%
select(vendor_id, fare_amount, tip_amount, pickup_longitude, pickup_latitude, pickup_hour, pickup_wday) %>%
mutate(total_earned = fare_amount + tip_amount) %>%
select(vendor_id, total_earned, pickup_longitude, pickup_latitude, pickup_hour, pickup_wday)# train and test set split
split = 0.8
model_dataset <- createDataPartition(model_df$total_earned, p=split, list=FALSE)
model_train <- model_df[model_dataset,]
model_test <- model_df[-model_dataset,]Need to have some baseline from which to see if a model adds any value. Will be using the mean of total_earned as the baseline. Since in the absence of a model, the best way to estimate a taxi driver's earnings will be taking an average of the records.
baseline_value <- mean(model_train$total_earned) RMSE_baseline <- sqrt(mean((baseline_value - model_test$total_earned)^2))
RMSE_baseline## [1] 10.94656
MAE_baseline <- mean(abs(baseline_value - model_test$total_earned))
MAE_baseline## [1] 7.042726
From the above results, a model will be deemed to be better than the current baseline if the errors on the test set return better results after training.
linear_model <- lm(total_earned ~. , data = model_train)
saveRDS(linear_model, file = "output/linear_model.rds")Loading the model from a saved file
linear_model <- readRDS("output/linear_model.rds")
summary(linear_model)##
## Call:
## lm(formula = total_earned ~ ., data = model_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.496 -5.426 -2.246 2.769 213.442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.330e+04 1.222e+02 108.777 < 2e-16 ***
## vendor_idVTS 1.731e-01 9.580e-02 1.807 0.070725 .
## pickup_longitude 1.344e+02 1.387e+00 96.910 < 2e-16 ***
## pickup_latitude -8.196e+01 1.772e+00 -46.248 < 2e-16 ***
## pickup_hour1 4.369e-02 3.830e-01 0.114 0.909184
## pickup_hour2 -2.575e-01 4.104e-01 -0.627 0.530361
## pickup_hour3 1.461e-01 4.582e-01 0.319 0.749793
## pickup_hour4 1.865e+00 5.137e-01 3.629 0.000284 ***
## pickup_hour5 1.347e+00 5.895e-01 2.285 0.022347 *
## pickup_hour6 -6.752e-01 4.069e-01 -1.660 0.097006 .
## pickup_hour7 -7.835e-01 3.496e-01 -2.241 0.025021 *
## pickup_hour8 -1.009e+00 3.330e-01 -3.030 0.002446 **
## pickup_hour9 -1.155e+00 3.326e-01 -3.472 0.000517 ***
## pickup_hour10 -4.877e-01 3.352e-01 -1.455 0.145705
## pickup_hour11 -6.432e-01 3.333e-01 -1.930 0.053648 .
## pickup_hour12 -5.820e-01 3.265e-01 -1.783 0.074649 .
## pickup_hour13 3.168e-02 3.291e-01 0.096 0.923305
## pickup_hour14 -1.206e-01 3.251e-01 -0.371 0.710661
## pickup_hour15 -7.099e-02 3.279e-01 -0.216 0.828609
## pickup_hour16 -4.836e-01 3.427e-01 -1.411 0.158123
## pickup_hour17 -5.193e-01 3.283e-01 -1.582 0.113694
## pickup_hour18 -8.334e-01 3.125e-01 -2.667 0.007663 **
## pickup_hour19 -1.376e+00 3.109e-01 -4.425 9.65e-06 ***
## pickup_hour20 -1.353e+00 3.164e-01 -4.276 1.91e-05 ***
## pickup_hour21 -6.979e-01 3.158e-01 -2.210 0.027118 *
## pickup_hour22 -3.519e-01 3.177e-01 -1.108 0.267907
## pickup_hour23 -1.006e-01 3.265e-01 -0.308 0.757926
## pickup_wday.L 1.843e-02 1.285e-01 0.143 0.885947
## pickup_wday.Q -3.529e-01 1.328e-01 -2.658 0.007875 **
## pickup_wday.C -6.368e-01 1.265e-01 -5.033 4.85e-07 ***
## pickup_wday^4 -1.012e-01 1.276e-01 -0.793 0.427767
## pickup_wday^5 1.649e-01 1.244e-01 1.326 0.184971
## pickup_wday^6 -3.814e-02 1.274e-01 -0.299 0.764612
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.492 on 39293 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.2377
## F-statistic: 384.2 on 32 and 39293 DF, p-value: < 2.2e-16
Low R-squared value of 23.83% does not bode well, as it will seem the features do not explain total_earned. Will now conduct the performance tests
linear_predict <- predict(linear_model,model_test)RMSE_linear <- sqrt(mean((linear_predict - model_test$total_earned)^2))
RMSE_linear## [1] 9.566904
# improvement
linear_RMSE_diff <- RMSE_baseline - RMSE_linear
linear_RMSE_diff## [1] 1.379654
MAE_linear <- mean(abs(linear_predict - model_test$total_earned))
MAE_linear## [1] 6.197861
# improvement
linear_MAE_diff <- MAE_baseline - MAE_linear
linear_MAE_diff## [1] 0.8448644
Saw some overall improvements prediction rates when using the linear model, but will see if a random forest algorithm will enable us to predict the total_earned value even better
rf_model <- randomForest(total_earned ~., data = model_train, importance = TRUE, ntree=1000)
saveRDS(rf_model, file = "output/rf_model.rds")Loading the model from a saved file
rf_model <- readRDS("output/rf_model.rds")
importance(rf_model)[,1]## vendor_id pickup_longitude pickup_latitude pickup_hour
## -0.9679805 63.6406683 51.8200573 25.1077622
## pickup_wday
## 12.1879906
From the importance variables, we can see that pickup location greatly influences the total_amount earned compared to the other features.
rf_predict <- predict(rf_model,model_test)RMSE_rf <- sqrt(mean((rf_predict - model_test$total_earned)^2))
RMSE_rf## [1] 8.90182
# improvement
rf_RMSE_diff <- RMSE_baseline - RMSE_rf
rf_RMSE_diff## [1] 2.044738
MAE_rf <- mean(abs(rf_predict - model_test$total_earned))
MAE_rf## [1] 5.859856
# improvement
rf_MAE_diff <- MAE_baseline - MAE_rf
rf_MAE_diff## [1] 1.18287
Random forest still does better than the baseline. Now to test if the linear or random forest model works better
# improvement
model_RMSE_diff <- RMSE_linear - RMSE_rf
model_RMSE_diff## [1] 0.6650843
# improvement
model_MAE_diff <- MAE_linear - MAE_rf
model_MAE_diff## [1] 0.3380053
Across both performance measures, the random forest model outperforms the linear model.
Will now use the random forest model to predict fare amount based on the longitude/latitude around NYC at different times of the day
Overlap that value with the trip duration which can be derived from trip_time_in_secs.
Looking for: Highest outcome of (Predicted fare / trip duration)
maximise_df <- combine_df %>%
mutate(total_earned = tip_amount + fare_amount) %>%
mutate(pickup_hour = as.factor(hour(pickup_datetime))) %>%
mutate(pickup_wday = as.factor(wday(pickup_datetime, label=TRUE))) %>%
mutate(trip_duration = trip_time_in_secs/60) %>%
select(vendor_id, total_earned, pickup_longitude, pickup_latitude, pickup_hour, pickup_wday, trip_duration) %>%
filter(pickup_hour == 18, pickup_wday == "Mon")
rf_earning <- predict(rf_model,maximise_df)
maximise_df <- cbind(rf_earning, maximise_df)
maximise_df <- maximise_df %>%
select(rf_earning, pickup_longitude, pickup_latitude) %>%
group_by(pickup_longitude, pickup_latitude) %>%
summarise(mean = mean(rf_earning))Plotting on a map
nyc_map <- get_map(location = c(lon = -73.935242, lat = 40.730610), maptype = "terrain", zoom = 12)
ggmap(nyc_map) +
geom_point(data= maximise_df, aes(x=pickup_longitude, y=pickup_latitude, alpha=mean), fill = 'red')## Warning: Removed 12 rows containing missing values (geom_point).
As a taxi driver, I will stick to the darker dots in the heart of New York. From the map, we can see the key points of interests highlighted nearer the darker dots as 1. Brooklyn Bridge 2. Times Square 3. South/Middle sides of Central Park
First need to determine the average wages of a typical taxi driver
taxi_df <- combine_df %>%
mutate(total_earned = fare_amount + tip_amount) %>%
select(hack_license, total_earned) %>%
group_by(hack_license) %>%
summarise(mean = mean(total_earned))
mean(taxi_df$mean)## [1] 13.64381
Seems this value does not make sense as the sample size was reduced over the month