suppressPackageStartupMessages({
  library(lubridate)
  library(Hmisc)
  library(tidyverse)
  library(caret)
  library(FNN)
  library(ggrepel)
  library(knitr)
  library(gmodels)
  library(psych)
})

1 Data Understanding

1.1 Import

First, we load the dataset into a tibble directly from a URL. The dataset contains NYC Green Taxi Trip Records for February 2020, provided by the NYC Taxi and Limousine Commission (TLC). We aim to build a kNN regression model to predict the tip amount for a given ride.

# Import
df <- read_csv(file = "https://github.com/cwcourtney/DA5020Practicum3Data/raw/main/P3smaller.csv")
original <- df

1.2 General Exploration

Then, we do a first pass at exploring the data pre-cleaning.

# Inspect structure
str(df, give.attr=FALSE)
## spc_tbl_ [3,986 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ VendorID             : num [1:3986] NA 2 2 NA 2 2 2 2 2 2 ...
##  $ lpep_pickup_datetime : POSIXct[1:3986], format: "2020-02-07 09:13:00" "2020-02-24 15:15:36" ...
##  $ lpep_dropoff_datetime: POSIXct[1:3986], format: "2020-02-07 09:30:00" "2020-02-24 15:23:17" ...
##  $ store_and_fwd_flag   : chr [1:3986] NA "N" "N" NA ...
##  $ RatecodeID           : num [1:3986] NA 1 1 NA 1 1 1 1 1 1 ...
##  $ PULocationID         : num [1:3986] 210 244 7 185 41 95 210 74 255 41 ...
##  $ DOLocationID         : num [1:3986] 55 244 7 78 152 95 29 75 49 142 ...
##  $ passenger_count      : num [1:3986] NA 1 1 NA 1 1 1 1 1 1 ...
##  $ trip_distance        : num [1:3986] 4.28 1.04 0.4 2.54 2.1 0.68 0.98 0.39 2.9 2.74 ...
##  $ fare_amount          : num [1:3986] 13.3 6.5 3.5 23.4 9.5 ...
##  $ extra                : num [1:3986] 2.75 0 0 2.75 0.5 0.5 0 0 0.5 0 ...
##  $ mta_tax              : num [1:3986] 0 0.5 0.5 0 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ tip_amount           : num [1:3986] 0 0 0 0 2.16 1.26 8 0.86 2.66 2.18 ...
##  $ tolls_amount         : num [1:3986] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ehail_fee            : logi [1:3986] NA NA NA NA NA NA ...
##  $ improvement_surcharge: num [1:3986] 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 ...
##  $ total_amount         : num [1:3986] 16.4 7.3 4.3 26.5 13 ...
##  $ payment_type         : num [1:3986] NA 2 2 NA 1 1 1 1 1 1 ...
##  $ trip_type            : num [1:3986] NA 1 1 NA 1 1 1 1 1 1 ...
##  $ congestion_surcharge : num [1:3986] NA 0 0 NA 0 0 0 0 0 2.75 ...
# Inspect distribution
summary(df)
##     VendorID     lpep_pickup_datetime            
##  Min.   :1.000   Min.   :2020-02-01 00:19:44.00  
##  1st Qu.:2.000   1st Qu.:2020-02-08 12:15:13.50  
##  Median :2.000   Median :2020-02-15 15:16:05.50  
##  Mean   :1.825   Mean   :2020-02-15 23:30:14.33  
##  3rd Qu.:2.000   3rd Qu.:2020-02-23 13:19:55.25  
##  Max.   :2.000   Max.   :2020-02-29 23:49:26.00  
##  NA's   :802                                     
##  lpep_dropoff_datetime            store_and_fwd_flag   RatecodeID   
##  Min.   :2020-02-01 00:28:38.00   Length:3986        Min.   :1.000  
##  1st Qu.:2020-02-08 12:27:45.25   Class :character   1st Qu.:1.000  
##  Median :2020-02-15 15:34:16.00   Mode  :character   Median :1.000  
##  Mean   :2020-02-15 23:47:16.51                      Mean   :1.108  
##  3rd Qu.:2020-02-23 13:35:12.75                      3rd Qu.:1.000  
##  Max.   :2020-02-29 23:57:55.00                      Max.   :5.000  
##                                                      NA's   :802    
##   PULocationID    DOLocationID   passenger_count trip_distance   
##  Min.   :  3.0   Min.   :  3.0   Min.   :0.000   Min.   : 0.000  
##  1st Qu.: 51.0   1st Qu.: 61.0   1st Qu.:1.000   1st Qu.: 1.040  
##  Median : 82.0   Median :129.0   Median :1.000   Median : 1.970  
##  Mean   :106.5   Mean   :128.1   Mean   :1.315   Mean   : 3.366  
##  3rd Qu.:166.0   3rd Qu.:189.0   3rd Qu.:1.000   3rd Qu.: 4.010  
##  Max.   :265.0   Max.   :265.0   Max.   :6.000   Max.   :40.270  
##                                  NA's   :802                     
##   fare_amount         extra            mta_tax          tip_amount     
##  Min.   :-27.47   Min.   :-1.0000   Min.   :-0.5000   Min.   : 0.0000  
##  1st Qu.:  7.00   1st Qu.: 0.0000   1st Qu.: 0.5000   1st Qu.: 0.0000  
##  Median : 11.20   Median : 0.5000   Median : 0.5000   Median : 0.0000  
##  Mean   : 15.50   Mean   : 0.8889   Mean   : 0.4009   Mean   : 0.9868  
##  3rd Qu.: 20.32   3rd Qu.: 1.0000   3rd Qu.: 0.5000   3rd Qu.: 1.6600  
##  Max.   :185.00   Max.   : 8.2500   Max.   : 0.5000   Max.   :50.0000  
##                                                                        
##   tolls_amount     ehail_fee      improvement_surcharge  total_amount   
##  Min.   : 0.0000   Mode:logical   Min.   :-0.3000       Min.   :-24.42  
##  1st Qu.: 0.0000   NA's:3986      1st Qu.: 0.3000       1st Qu.:  8.80  
##  Median : 0.0000                  Median : 0.3000       Median : 14.16  
##  Mean   : 0.2695                  Mean   : 0.2928       Mean   : 18.67  
##  3rd Qu.: 0.0000                  3rd Qu.: 0.3000       3rd Qu.: 24.40  
##  Max.   :13.7500                  Max.   : 0.3000       Max.   :199.10  
##                                                                         
##   payment_type     trip_type     congestion_surcharge
##  Min.   :1.000   Min.   :1.000   Min.   :0.000       
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.000       
##  Median :1.000   Median :1.000   Median :0.000       
##  Mean   :1.466   Mean   :1.025   Mean   :0.449       
##  3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:0.000       
##  Max.   :4.000   Max.   :2.000   Max.   :2.750       
##  NA's   :802     NA's   :802     NA's   :802
hist.data.frame(select(df, -c(lpep_pickup_datetime, lpep_dropoff_datetime)),
                n.unique=2)

# Check for missing values
apply(df, MARGIN=2, function(x) sum(is.na(x)))
##              VendorID  lpep_pickup_datetime lpep_dropoff_datetime 
##                   802                     0                     0 
##    store_and_fwd_flag            RatecodeID          PULocationID 
##                   802                   802                     0 
##          DOLocationID       passenger_count         trip_distance 
##                     0                   802                     0 
##           fare_amount                 extra               mta_tax 
##                     0                     0                     0 
##            tip_amount          tolls_amount             ehail_fee 
##                     0                     0                  3986 
## improvement_surcharge          total_amount          payment_type 
##                     0                     0                   802 
##             trip_type  congestion_surcharge 
##                   802                   802
# Investigate distribution of payment types
table(df$payment_type)
## 
##    1    2    3    4 
## 1732 1427   19    6

We observe the following:

  1. VendorID, store_and_fwd_flag, RatecodeID, payment_type, and trip_type are categorical variables that must be converted into dummy variables for correlation analysis and modeling.
  2. A little more than half of the recorded payment types are credit card; the rest are cash or other non-credit types. According to the data dictionary, only credit card tips are recorded, so it would be inaccurate to use non-credit transactions when forecasting tips (their tip values appear as 0 in this dataset when in reality they are missing). A quick check of this is sketched after the list.
  3. The entire ehail_fee column is missing, and 802 rows are missing values across 7 other columns.
  4. PULocationID and DOLocationID are also categorical variables, but we suspect there are too many levels (more than 100) to be useful.
  5. lpep_pickup_datetime and lpep_dropoff_datetime are of type POSIXct, which cannot be used directly as continuous variables.
  6. While the minimum of most monetary features is 0 (logically, as they are dollar amounts), the maximums range from 0.5 to roughly 200. A min/max normalization may therefore make sense.

These will be addressed in the next section.
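As a quick check of observation 2 (a supplementary sketch, not part of the original workflow), we can compare tips across the recorded payment types; if the data dictionary is correct, non-credit payments should show almost exclusively zero tips.

# tips by payment type (1 = credit card according to the data dictionary)
df %>%
  filter(!is.na(payment_type)) %>%
  group_by(payment_type) %>%
  summarise(n = n(),
            mean_tip = mean(tip_amount),
            prop_zero_tip = mean(tip_amount == 0))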

1.3 Distribution and Outliers

We also took a first look at the distribution of each feature and observed the following points that may be useful for our modeling:

  • The pickup and dropoff locations are not uniformly distributed, suggesting there is a difference in popularity among the different regions. Could the popularity of the region be tied to the tip amount?
  • Trip distance, tip amount, fare amount, and total amount are all similarly distributed (right-skew, clustered at lower values).
  • All other variables are either categorical or quite sparse, making their usefulness for our model unlikely.
  • We note some outliers (or categories with extremely low representation):
    • Trips with a “yes” under store_and_fwd_flag (only 7 of 3986)
    • Trips with a RatecodeID other than 1, i.e., 2 through 5 (91 of 3986)
    • Trips with a negative mta_tax, improvement_surcharge, fare_amount, extra, or total_amount (roughly 10 of 3986)
  • Concerning outliers in numerical features, listed below are the row indices flagged by the z-score approach, where an outlier is any observation with a z-score of 3 or greater (a per-feature tally is sketched after the output).
# z-score function: distance from the column mean in standard deviations
z_score <- function(x) {
  z <- abs(mean(x) - x) / sd(x)
  return(z)
}
# column positions of the numeric features to screen
# (note: columns containing NA, such as passenger_count, yield NA z-scores here
# and therefore flag no outliers)
numeric_columns <- c(8:14, 17, 20)

outliers <- df %>%
  mutate(across(numeric_columns, z_score, .names = "{.col}_z"))

# function to return outlier row indices
filter_cond <- function(x) {
  return(which(x >= 3))
}
# apply outlier function to z-score columns
out_table <- lapply(outliers[21:29], filter_cond) 
out_table
## $passenger_count_z
## integer(0)
## 
## $trip_distance_z
##  [1]  105  116  157  259  272  273  274  293  299  419  469  471  488  527  604
## [16]  618  627  661  692  740  747  781  841  857  861  881  945  952 1045 1061
## [31] 1205 1283 1327 1361 1453 1545 1569 1578 1582 1595 1740 1761 1762 1834 1867
## [46] 1957 1978 2110 2133 2147 2162 2177 2184 2190 2200 2317 2377 2465 2491 2502
## [61] 2507 2544 2573 2586 2623 2629 2708 2717 2825 2839 2905 2914 2931 2945 2987
## [76] 3014 3065 3104 3115 3135 3236 3360 3389 3415 3417 3512 3563 3585 3588 3710
## [91] 3784 3819 3850 3851 3867 3938 3960
## 
## $fare_amount_z
##  [1]  234  273  293  327  347  527  604  607  618  690  747  754  781  841  857
## [16]  874  881  952 1045 1061 1165 1327 1361 1453 1569 1578 1582 1761 1785 1834
## [31] 1867 1957 2021 2147 2155 2284 2465 2491 2507 2544 2573 2586 2623 2629 2655
## [46] 2717 2771 2824 2905 2914 2931 2945 2987 2990 3014 3115 3415 3486 3501 3541
## [61] 3585 3588 3603 3651 3710 3777 3780 3850 3938
## 
## $extra_z
##  [1]  173  318  349  382  435  440  469  566  627  720  724  808 1145 1162 1242
## [16] 1275 1295 1322 1599 1697 1876 1946 2061 2090 2198 2287 2393 2502 2603 2708
## [31] 2714 2845 2887 2934 2960 2966 2983 3248 3255 3429 3432 3472 3579 3712 3803
## [46] 3942 3947 3953
## 
## $mta_tax_z
## [1]  312  368 1196 1286 1653 1764 2572 3650
## 
## $tip_amount_z
##  [1]    7  165  214  245  347  617  785  959 1060 1125 1142 1165 1174 1296 1297
## [16] 1401 1436 1582 1723 1762 1857 1902 2082 2117 2177 2284 2336 2511 2544 2554
## [31] 2656 2671 2730 2875 3091 3135 3258 3331 3366 3471 3486 3512 3553 3648 3678
## [46] 3747 3777
## 
## $tolls_amount_z
##   [1]   41  116  157  165  169  205  260  272  273  274  293  299  393  422  433
##  [16]  469  474  488  586  604  618  627  661  692  722  737  762  785  791  797
##  [31]  800  818  841  861  874  940  945  952  957  963  966  970 1045 1059 1061
##  [46] 1067 1083 1106 1183 1229 1242 1283 1290 1292 1295 1296 1328 1344 1399 1431
##  [61] 1453 1508 1543 1584 1595 1610 1676 1683 1694 1740 1760 1761 1762 1771 1785
##  [76] 1810 1813 1857 1882 1902 1910 1914 1945 1978 2070 2077 2079 2081 2090 2110
##  [91] 2117 2122 2141 2171 2177 2190 2262 2282 2317 2323 2327 2329 2336 2424 2432
## [106] 2499 2502 2507 2544 2571 2593 2617 2623 2629 2671 2717 2735 2825 2841 2875
## [121] 2931 2945 2963 2987 3006 3014 3065 3085 3091 3350 3360 3366 3373 3389 3393
## [136] 3432 3472 3486 3512 3513 3529 3543 3563 3572 3579 3581 3617 3661 3707 3784
## [151] 3810 3818 3819 3851 3863 3867 3924 3938 3953 3957 3960
## 
## $total_amount_z
##  [1]  234  273  293  327  347  488  527  604  618  627  754  781  841  861  874
## [16]  881  952 1045 1061 1165 1174 1297 1453 1569 1578 1582 1761 1762 1785 1834
## [31] 1867 2110 2147 2155 2177 2284 2491 2507 2544 2573 2623 2629 2655 2717 2905
## [46] 2931 2945 2987 2990 3014 3065 3115 3360 3415 3486 3501 3512 3585 3603 3651
## [61] 3661 3777 3850 3938
## 
## $congestion_surcharge_z
## integer(0)
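The index lists above are hard to digest on their own; a per-feature tally (an added convenience, not part of the original output) summarizes how many rows each feature flags:

# number of flagged outlier rows per z-score column
lengths(out_table)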

1.4 Collinearity

Lastly, we create a first-pass correlation matrix for continuous variables to identify good predictors of tip amount.

cor(select(df, -c(lpep_pickup_datetime, lpep_dropoff_datetime, # not numeric
                  ehail_fee, # all NA
                  store_and_fwd_flag)), # not numeric
    use="complete.obs")[,"tip_amount"] %>%
  sort()
##          payment_type               mta_tax       passenger_count 
##          -0.481144064          -0.021460238           0.009492441 
## improvement_surcharge             trip_type              VendorID 
##           0.023954288           0.035391021           0.037499564 
##          PULocationID            RatecodeID          DOLocationID 
##           0.046262512           0.061372678           0.098629376 
##                 extra          tolls_amount  congestion_surcharge 
##           0.116532133           0.179537794           0.275778086 
##           fare_amount         trip_distance          total_amount 
##           0.307179675           0.313131330           0.494325022 
##            tip_amount 
##           1.000000000

2 Data Preparation

2.1 Feature Selection

We observed from our collinearity analysis that payment_type, total_amount, trip_distance, fare_amount, and congestion_surcharge may be good candidate predictors, as they have the strongest correlations with tip_amount (exceeding 0.25 in magnitude). We will repeat this analysis after cleaning the data to confirm these choices. However, we may need to remove payment_type, as previously stated, because only credit card tips are recorded according to the data dictionary.

In addition, tolls_amount, mta_tax, improvement_surcharge, and extra may be useful predictors of tip_amount. Our reasoning is that these charges appear on the customer's final bill: customers may be frustrated by extra fees such as tolls, taxes, or surcharges, and may adjust the tip they leave on their credit card according to how large those fees are.

Concerning categorical variables, we decided to include RatecodeID, VendorID, and trip_type in the modeling. For RatecodeID, the rate charged depends on numerous factors; with a negotiated fare, for example, the driver may earn a better tip if the fare seems reasonable to the customer, so the rate type could plausibly affect the tip amount. For VendorID, different vendors may carry different reputations and thus encourage passengers to tip differently. For trip_type, successfully street-hailing a cab depends more on chance than dispatching does, and a poor street-hail experience may lead to smaller tips.

The remaining features (passenger_count, store_and_fwd_flag, PULocationID, DOLocationID, lpep_dropoff_datetime, lpep_pickup_datetime, and ehail_fee) are omitted because we do not expect them to influence the credit card tip. The ehail_fee column is entirely missing and therefore contributes no information. passenger_count was excluded because the number of passengers does not change the amount charged at the end of the ride, which is supported by its low correlation with tip_amount above. PULocationID and DOLocationID were excluded because the pickup and dropoff locations do not typically affect a tip.

The lpep_pickup_datetime and lpep_dropoff_datetime features record the pickup and dropoff times. They may have some relationship with tips due to rush hour or other busy travel periods, so they will be used for feature engineering; the original features are of type POSIXct and cannot be used directly as continuous variables.

remove <- c("passenger_count", "store_and_fwd_flag", "PULocationID", "DOLocationID", "lpep_dropoff_datetime", "lpep_pickup_datetime", "ehail_fee")
df <- df %>% select(-all_of(remove))
pairs.panels(df)

The pairs panel above visualizes the pairwise relationships among the 13 features remaining after elimination. The diagonal shows the distribution of each variable, the panels below the diagonal show pairwise scatterplots, and the values above the diagonal are the linear correlation coefficients. A numeric recomputation of the correlations with tip_amount on this reduced set is sketched below.
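As an optional check (an added sketch, not in the original output), the correlations with tip_amount can be recomputed on the reduced feature set; all 13 retained columns are still numeric at this point, so this mirrors the first-pass matrix restricted to the remaining features.

# correlations with tip_amount among the retained features
sort(cor(select(df, where(is.numeric)), use = "complete.obs")[, "tip_amount"])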

2.2 Derived Attributes

Next, we propose 3 new features that may be useful in predicting the tip amount: trip duration, average trip speed in miles per hour, and whether the trip occurs during nighttime or daytime.

# Create 3 new features
tmp <- original %>%
  mutate(duration = as.numeric(lpep_dropoff_datetime - lpep_pickup_datetime), # elapsed time, intended in minutes
         mph = trip_distance / (duration / 60),  # miles per hour (duration/60 converts minutes to hours)
         nighttime = ifelse(hour(lpep_pickup_datetime) > 17 |
                              hour(lpep_pickup_datetime) < 7, 1, 0))  # 1 if pickup between 18:00 and 06:59
# Re-create correlation matrix
cor(select(tmp, -c(lpep_pickup_datetime, lpep_dropoff_datetime, # not numeric
                  ehail_fee, # all NA
                  store_and_fwd_flag)), # not numeric
    use="complete.obs")[,"tip_amount"] %>%
  sort()
##          payment_type               mta_tax             nighttime 
##          -0.481071900          -0.021433070           0.009497283 
##       passenger_count improvement_surcharge                   mph 
##           0.009977380           0.023973765           0.028512701 
##              duration             trip_type              VendorID 
##           0.033778362           0.035365773           0.037160827 
##          PULocationID            RatecodeID          DOLocationID 
##           0.046210250           0.061347211           0.098978439 
##                 extra          tolls_amount  congestion_surcharge 
##           0.116568170           0.179521298           0.275724326 
##           fare_amount         trip_distance          total_amount 
##           0.307161016           0.313038733           0.494302022 
##            tip_amount 
##           1.000000000

Our proposed features have very weak correlations with the tip amount (all below 0.04) and will likely not be good indicators. This may be because trip duration and speed are already baked into the fare_amount variable, which according to the data dictionary is based on both time and distance. Additionally, the nighttime/daytime information may already be captured (more precisely) by the extra feature, which includes the rush hour and overnight charges. A quick probe of this idea is sketched below.
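One way to probe the explanation above (a supplementary check, not part of the original output) is to correlate the derived features with the variables we suspect already capture them:

# do duration and speed track fare_amount, and does nighttime track extra?
cor(tmp$duration, tmp$fare_amount, use = "complete.obs")
cor(tmp$mph, tmp$fare_amount, use = "complete.obs")
cor(tmp$nighttime, tmp$extra, use = "complete.obs")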

Before moving to the next section, please note that many of these steps must be repeated AFTER cleaning and normalizing the data, and takeaways may change as a result.

2.3 Cleaning

First, we convert categorical variables to factors.

df <- df %>%
  mutate(VendorID = factor(VendorID),
         RatecodeID = factor(RatecodeID),
         payment_type = factor(payment_type),
         trip_type = factor(trip_type),
         congestion_surcharge = factor(congestion_surcharge))
sapply(df, function(x) sum(is.na(x)))
##              VendorID            RatecodeID         trip_distance 
##                   802                   802                     0 
##           fare_amount                 extra               mta_tax 
##                     0                     0                     0 
##            tip_amount          tolls_amount improvement_surcharge 
##                     0                     0                     0 
##          total_amount          payment_type             trip_type 
##                     0                   802                   802 
##  congestion_surcharge 
##                   802
# Show result
str(df, give.attr=F)
## tibble [3,986 × 13] (S3: tbl_df/tbl/data.frame)
##  $ VendorID             : Factor w/ 2 levels "1","2": NA 2 2 NA 2 2 2 2 2 2 ...
##  $ RatecodeID           : Factor w/ 5 levels "1","2","3","4",..: NA 1 1 NA 1 1 1 1 1 1 ...
##  $ trip_distance        : num [1:3986] 4.28 1.04 0.4 2.54 2.1 0.68 0.98 0.39 2.9 2.74 ...
##  $ fare_amount          : num [1:3986] 13.3 6.5 3.5 23.4 9.5 ...
##  $ extra                : num [1:3986] 2.75 0 0 2.75 0.5 0.5 0 0 0.5 0 ...
##  $ mta_tax              : num [1:3986] 0 0.5 0.5 0 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ tip_amount           : num [1:3986] 0 0 0 0 2.16 1.26 8 0.86 2.66 2.18 ...
##  $ tolls_amount         : num [1:3986] 0 0 0 0 0 0 0 0 0 0 ...
##  $ improvement_surcharge: num [1:3986] 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 ...
##  $ total_amount         : num [1:3986] 16.4 7.3 4.3 26.5 13 ...
##  $ payment_type         : Factor w/ 4 levels "1","2","3","4": NA 2 2 NA 1 1 1 1 1 1 ...
##  $ trip_type            : Factor w/ 2 levels "1","2": NA 1 1 NA 1 1 1 1 1 1 ...
##  $ congestion_surcharge : Factor w/ 3 levels "0","2.5","2.75": NA 1 1 NA 1 1 1 1 1 3 ...

There are 802 missing values in each of 5 columns. However, tips are only recorded for credit card payments (payment_type = 1). Conveniently, filtering to credit card payments also removes every row with missing values, as demonstrated below.
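As an optional check (not in the original code), we can confirm that none of the rows with a missing VendorID are credit card payments, which is why the filter below removes them all:

# rows missing VendorID, broken down by payment_type (none should be type 1)
df %>%
  filter(is.na(VendorID)) %>%
  count(payment_type)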

df <- df %>% filter(payment_type == 1) %>% droplevels() %>% select(-payment_type)
# Show result
sapply(df, function(x) sum(is.na(x)))
##              VendorID            RatecodeID         trip_distance 
##                     0                     0                     0 
##           fare_amount                 extra               mta_tax 
##                     0                     0                     0 
##            tip_amount          tolls_amount improvement_surcharge 
##                     0                     0                     0 
##          total_amount             trip_type  congestion_surcharge 
##                     0                     0                     0

Concerning outliers, the z-score method was applied to the newly filtered data to flag any observation with a z-score of 3 or greater.

numeric_columns <- which(sapply(df, is.numeric))

outliers <- df %>%
  mutate(across(numeric_columns, z_score, .names = "{.col}_z"))

# function to return outlier row indices
filter_cond <- function(x) {
  return(which(x >= 3))
}
# apply outlier function to z-score columns
out_table <- lapply(outliers[13:20], filter_cond) 

# collect rows that contain at least one outlying value
outlier_table <- outliers %>% filter(
  trip_distance_z >= 3 | fare_amount_z >= 3 | 
  extra_z >= 3 | mta_tax_z >= 3 | tip_amount_z >= 3 | 
  tolls_amount_z >= 3 | improvement_surcharge_z >= 3)
# percentage of rows flagged as outliers
out_percent <- (nrow(outlier_table)/nrow(outliers)) * 100

# mean-impute the flagged values, then drop the helper z-score columns
df <- outliers %>% mutate(
  trip_distance = replace(trip_distance, which(trip_distance_z >= 3), mean(trip_distance, na.rm = TRUE)),
  fare_amount = replace(fare_amount, which(fare_amount_z >= 3),  mean(fare_amount, na.rm = TRUE)),
  extra = replace(extra, which(extra_z >= 3), mean(extra, na.rm = TRUE)),
  mta_tax = replace(mta_tax, which(mta_tax_z >= 3), mean(mta_tax, na.rm = TRUE)),
  tip_amount = replace(tip_amount, which(tip_amount_z >= 3), mean(tip_amount, na.rm = TRUE)),
  tolls_amount = replace(tolls_amount, which(tolls_amount_z >= 3),  mean(tolls_amount, na.rm = TRUE)),
  improvement_surcharge = replace(improvement_surcharge, which(improvement_surcharge_z >= 3), 
                                  mean(improvement_surcharge, na.rm = TRUE))) %>%
  select(1:12)
# Show result
summary(df)
##  VendorID RatecodeID trip_distance     fare_amount        extra       
##  1: 316   1:1693     Min.   : 0.000   Min.   : 0.00   Min.   :0.0000  
##  2:1416   2:   3     1st Qu.: 0.990   1st Qu.: 6.50   1st Qu.:0.0000  
##           3:   1     Median : 1.745   Median :10.00   Median :0.0000  
##           4:   1     Mean   : 2.460   Mean   :12.18   Mean   :0.3491  
##           5:  34     3rd Qu.: 3.143   3rd Qu.:15.00   3rd Qu.:0.5000  
##                      Max.   :13.280   Max.   :47.00   Max.   :1.0000  
##     mta_tax         tip_amount      tolls_amount     improvement_surcharge
##  Min.   :0.4896   Min.   : 0.000   Min.   :0.00000   Min.   :0.2986       
##  1st Qu.:0.5000   1st Qu.: 1.000   1st Qu.:0.00000   1st Qu.:0.3000       
##  Median :0.5000   Median : 1.950   Median :0.00000   Median :0.3000       
##  Mean   :0.4998   Mean   : 2.091   Mean   :0.01602   Mean   :0.3000       
##  3rd Qu.:0.5000   3rd Qu.: 2.860   3rd Qu.:0.00000   3rd Qu.:0.3000       
##  Max.   :0.5000   Max.   :10.000   Max.   :2.80000   Max.   :0.3000       
##   total_amount    trip_type congestion_surcharge
##  Min.   :  0.00   1:1699    0   :1357           
##  1st Qu.:  9.36   2:  33    2.5 :   1           
##  Median : 13.56             2.75: 374           
##  Mean   : 17.52                                 
##  3rd Qu.: 20.61                                 
##  Max.   :199.10

There were 167 outlier rows, constituting about 9.64% of the data. Since this exceeds 5%, removing these rows would discard too much information, so we decided to mean-impute the outlying values instead.

No data transformations were done as kNN modeling does not require normal distributions prior to model building.

2.4 Normalization

# min-max normalize to the range [0, 1]
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
# save target variable in vector to avoid normalization
tip <- df$tip_amount
# normalize outlier-free numeric data, keep categorical columns
df <- as.data.frame(lapply(df, function(col) {
  if(is.numeric(col)) {
    normalize(col)
  } else {
    col
  }
}))
# add back target variable
df$tip_amount <- tip
# Show result
summary(df)
##  VendorID RatecodeID trip_distance      fare_amount         extra       
##  1: 316   1:1693     Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  2:1416   2:   3     1st Qu.:0.07455   1st Qu.:0.1383   1st Qu.:0.0000  
##           3:   1     Median :0.13140   Median :0.2128   Median :0.0000  
##           4:   1     Mean   :0.18527   Mean   :0.2592   Mean   :0.3491  
##           5:  34     3rd Qu.:0.23663   3rd Qu.:0.3191   3rd Qu.:0.5000  
##                      Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##     mta_tax         tip_amount      tolls_amount      improvement_surcharge
##  Min.   :0.0000   Min.   : 0.000   Min.   :0.000000   Min.   :0.0000       
##  1st Qu.:1.0000   1st Qu.: 1.000   1st Qu.:0.000000   1st Qu.:1.0000       
##  Median :1.0000   Median : 1.950   Median :0.000000   Median :1.0000       
##  Mean   :0.9792   Mean   : 2.091   Mean   :0.005723   Mean   :0.9954       
##  3rd Qu.:1.0000   3rd Qu.: 2.860   3rd Qu.:0.000000   3rd Qu.:1.0000       
##  Max.   :1.0000   Max.   :10.000   Max.   :1.000000   Max.   :1.0000       
##   total_amount     trip_type congestion_surcharge
##  Min.   :0.00000   1:1699    0   :1357           
##  1st Qu.:0.04701   2:  33    2.5 :   1           
##  Median :0.06811             2.75: 374           
##  Mean   :0.08799                                 
##  3rd Qu.:0.10349                                 
##  Max.   :1.00000

Data were normalized using min-max normalization on all numeric predictors in the dataset. As shown in the output, these features now range from 0 to 1; tip_amount is the dependent variable we are trying to predict and therefore did not undergo normalization.

2.5 Encoding Categorical Features

# one-hot encode RatecodeID and add to df
encoded_rate <- model.matrix(~ RatecodeID - 1, data = df)
encoded_rate <- as.data.frame(encoded_rate)
df <- cbind(df, encoded_rate)
# remove original RatecodeID
df$RatecodeID <- NULL
# one-hot encode congestion_surcharge and add to df
encoded_cong <- model.matrix(~ congestion_surcharge - 1, data = df)
encoded_cong <- as.data.frame(encoded_cong)
df <- cbind(df, encoded_cong)
# remove original congestion_surcharge
df$congestion_surcharge <- NULL
# dummy code vendorID, trip_type
df <- df %>%
  mutate(VendorID = ifelse(VendorID == 1, 0, 1),
         trip_type = ifelse(trip_type == 1, 0, 1))
# Show result
str(df)
## 'data.frame':    1732 obs. of  18 variables:
##  $ VendorID                : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ trip_distance           : num  0.1581 0.0512 0.0738 0.0294 0.2184 ...
##  $ fare_amount             : num  0.2021 0.1064 0.1277 0.0745 0.2553 ...
##  $ extra                   : num  0.5 0.5 0 0 0.5 0 0.5 0 1 0 ...
##  $ mta_tax                 : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ tip_amount              : num  2.16 1.26 8 0.86 2.66 2.18 0 2.56 2.3 0 ...
##  $ tolls_amount            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ improvement_surcharge   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ total_amount            : num  0.0651 0.038 0.0743 0.0259 0.0802 ...
##  $ trip_type               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ RatecodeID1             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ RatecodeID2             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ RatecodeID3             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ RatecodeID4             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ RatecodeID5             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ congestion_surcharge0   : num  1 1 1 1 1 0 1 1 1 1 ...
##  $ congestion_surcharge2.5 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ congestion_surcharge2.75: num  0 0 0 0 0 1 0 0 0 0 ...

The RatecodeID and congestion_surcharge features were one-hot encoded since they have more than 2 levels. The VendorID and trip_type features were dummy encoded as 0/1 since they only have 2 levels. A quick sanity check of the encoding is sketched below.
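This check (an added sketch, not in the original output) confirms that exactly one indicator is set per row within each one-hot group:

# each row should have exactly one level flagged in each encoded group
table(rowSums(df[, paste0("RatecodeID", 1:5)]))
table(rowSums(df[, c("congestion_surcharge0", "congestion_surcharge2.5", "congestion_surcharge2.75")]))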

2.6 Random Partitioning into Training and Testing Sets

Lastly, we split the data 80%/20% into training and testing sets. This split is common practice and a good starting point; in the final section we investigate the impact of this choice further.

set.seed(1)
# shuffle data
df <- df[sample(nrow(df)),]

# partition data
validation <- createDataPartition(df$tip_amount, p = .2, list = FALSE)
# create training and test data
data_test <- df[validation,]
data_train <- df[-validation,]
# Show resulting df dimensions
dim(data_test)
## [1] 348  18
dim(data_train)
## [1] 1384   18

3 Data Modeling with kNN

3.1 Build Model

Below, we create a function that trains a kNN regression model on the provided training dataset with a given k value, applies the model to the provided testing dataset, and returns the mean squared error (MSE). We use the FNN package's knn.reg() to achieve this.

knn_predict <- function(data_train, data_test, k) {
  # separate the target variable from the predictors
  train_actual <- data_train$tip_amount
  train <- data_train %>% select(-tip_amount)
  test_actual <- data_test$tip_amount
  test <- data_test %>% select(-tip_amount)
  # kNN regression predictions for the test set
  pred <- knn.reg(train, test, train_actual, k)$pred
  # mean squared error of the predictions
  mse <- mean((test_actual - pred)^2)
  return(mse)
}

We see the function in action below, with a truncated dataset:

data_train_smaller <- head(data_train, n=40)
data_test_smaller <- head(data_test, n=10)

knn_predict(data_train_smaller, data_test_smaller, k=3)
## [1] 1.605968

3.2 Assess Model

# evaluate the test MSE for k = 1 through 60
mse_values <- numeric(60)

for (i in 1:60) {
  mse_values[i] <- knn_predict(data_train, data_test, k = i)
}

k_mse <- data.frame(
  k = 1:60,
  mse = mse_values)
min.mse <- min(k_mse$mse)
# mse vs. k plot
ggplot(k_mse, aes(x = k, y = mse, 
       label=ifelse(mse==min.mse, paste("Min:", round(mse, 4)), NA))) + 
  geom_line() + 
  labs(title = "K vs MSE",
       y = "Mean Squared Error (MSE)") + 
  geom_label_repel(size=3, box.padding=1) +
  theme_bw()

The square root of the number of observations, approximately 41 here (about 37 for the training set alone), is a common starting k-value in kNN modeling, so we explored a wide range of k values that contains it. From the plot above, the MSE drops rapidly until reaching its minimum, then increases gradually as k grows before slowly plateauing. The plot indicates that the optimal k-value is 5, with the lowest MSE of about 1.8096; the sketch below pulls this minimum directly from the results table.
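A one-line lookup (an added convenience; the values match those quoted above) retrieves the optimal row:

# row of k_mse with the lowest MSE
k_mse[which.min(k_mse$mse), ]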

4 Tune Parameter k

Using this optimized value of k, the kNN model was used to predict values for tip_amount on the testing dataset.

# predict on the test set using the optimal k
best_k <- k_mse$k[which.min(k_mse$mse)]
k_pred <- knn.reg(data_train[, !names(data_train) %in% "tip_amount"],
                  data_test[, !names(data_test) %in% "tip_amount"],
                  data_train$tip_amount, best_k)$pred
preds <- data.frame(
  actual = data_test$tip_amount,
  pred = k_pred
)
ggplot(preds, aes(x = actual, y = pred)) + geom_point() +
  labs(x = "Actual tip amounts", y = "Predicted tip amounts",
       title = "Predicted vs. Actual tip amounts for k = 5",
       caption = "Blue dotted line represents line of identity") + 
  geom_abline(slope=1, intercept=0, linetype="dashed", color="blue") +
  theme_bw()

We visualized the predicted vs. actual tip amounts in a scatterplot. With an MSE of 0, every point would fall exactly on the identity line; since the MSE is non-zero, the points scatter around it. Interestingly, the model does not appear to predict $0 tips very well, suggesting a potential limitation. Other avenues worth exploring for predicting tips would be multiple linear regression or other machine learning methods; a rough baseline along those lines is sketched below.
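As a rough point of comparison (a sketch only, not part of the original analysis), a multiple linear regression baseline can be fit on the same training split and scored with the same MSE metric:

# hypothetical multiple linear regression baseline on the same features
# (rank-deficiency warnings are possible because each one-hot group sums to 1)
lm_fit <- lm(tip_amount ~ ., data = data_train)
lm_pred <- predict(lm_fit, newdata = data_test)
mean((data_test$tip_amount - lm_pred)^2)  # test MSE, comparable to the kNN MSE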

5 Assess Impact of Training Split Proportion on MSE

Here we analyze the relationship between the training split proportion and the resulting MSE.

set.seed(1) # for reproducibility

# Training data proportions to evaluate: 10% through 90%
train_pct_vec <- seq(.1, .9, by = 0.1)

pct_vs_k_vs_mse <- data.frame()

for (train_pct in train_pct_vec) {
  # Randomly sample
  ind <- sample(nrow(df), floor(train_pct * nrow(df))) # indices randomly selected
  data_train <- df[ind,]
  data_test <- df[-ind,]
  
  # Recommended starting place: sqrt of # training cases
  floor(sqrt(nrow(data_train)))
  # Ensure range includes that
  k_vals <- seq(from=1, to=60, by=1)
  
  k_opti_results <- data.frame(k_vals = k_vals,
             mse = -1)
  
  for (ii in 1:length(k_vals)) {
    k_opti_results$mse[ii] <- knn_predict(data_train, data_test, k_vals[ii])
  }
  
  pct_vs_k_vs_mse <- k_opti_results %>%
    mutate(train_prop = train_pct) %>%
    rbind(pct_vs_k_vs_mse)
  
}

pct_vs_k_vs_mse %>%
  ggplot(aes(x=k_vals, y=mse, group=train_prop, color=factor(train_prop))) +
  geom_line(size=1) +
  theme_bw() +
  labs(x="k",
       y="Mean Squared Error in $ Tip Amount",
       color="Training Data\nProportion",
       title="k-vs-MSE curves for varying proportion splits of training/testing data")

pct_vs_k_vs_mse %>%
  group_by(train_prop) %>%
  filter(mse == min(mse)) %>%
  arrange(train_prop) %>%
  kable(col.names=c("k","Minimum MSE","Training Data Proportion"))
  k   Minimum MSE   Training Data Proportion
 11      1.863870                        0.1
  5      1.904896                        0.2
 31      1.891400                        0.3
 10      1.703953                        0.4
 11      1.680989                        0.5
 10      1.626330                        0.6
  9      1.613804                        0.7
  4      1.583319                        0.8
 15      2.154987                        0.9

To evaluate the effect of the training/testing split, we tested various partitions and recorded the resulting MSE for k-values of 1 to 60. From the graph, the 0.8 split produces the best predictions, with the lowest minimum MSE of 1.583 at a k-value of 4. Our original model with the 0.8 split from question 4 had an MSE of about 1.810 with a k-value of 5, which is broadly consistent with this result; the remaining differences in MSE are likely due to the random sampling used to form each split.

We also notice a trend in how rapidly the MSE drops off as k becomes larger. With training proportions of 0.7 and 0.8, the MSE drops most rapidly and then stays quite flat, suggesting the model is less sensitive to the chosen value of k. For the lowest proportions, 0.1 through 0.3, the MSE drops more slowly with respect to k, the location of the minimum varies greatly among them, and the remainder of the curve fluctuates, suggesting that fine-tuning k is essential when using smaller training splits. One way to quantify this sensitivity is sketched below.
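The spread of MSE across k for each training proportion (an added summary, not part of the original output) gives a rough measure of that sensitivity:

# range of MSE over k = 1..60 for each training proportion
pct_vs_k_vs_mse %>%
  group_by(train_prop) %>%
  summarise(mse_spread = max(mse) - min(mse)) %>%
  arrange(train_prop)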

The highest-proportion split, 0.9, is a stark outlier with the highest MSE by far for nearly all values of k. This suggests that devoting too much data to training and leaving too little for testing can hurt the evaluation: with so few test rows, the MSE estimate becomes noisy and, here, noticeably worse.