Load Packages

library(tidyverse)
library(psych)
library(stringr)
library(ggmap)
library(readr)

Load and Process Data

Original Post Office Data

Original dataset, post first-round geocoding, contains all 249 post offices in NYC (5 Boroughs).

location_data <- read_csv("data/post_office_coords.csv", show_col_types = FALSE)

Manufactured Routes Dataset

After randomizing the rows of the above address dataset, I created every combination of 2 addresses without repetition. This requires the assumption that the travel duration between any two points in the city is the same in both directions (which is likely not the case). The reason for this decision is due to financial limitations of google maps API (gmaps), which was used to compute the travel distance and duration for each combinatin of addresses. gmaps provides users with a generous amount of free queries after signup, but as it is my personal account I did not want to cross the free-threshold.

In total there are 30,876 route combinations used in the subsequent analysis.

data <- read_csv("data/route_data.csv", show_col_types = FALSE)

Data Cleaning and Additional Feature Engineering

There are instances where different post offices operated in separate units of the same building. In these situations the geodesic distance calculated with geopy and the google_duration provided by gmaps would equal 0, which should not be included in the model.

data <- data %>%
  filter(
    google_duration > 300
  )

In addition to the is_same_borough feature which was created using python, here I create a function that will extract an address’ zip code, later to be used to create is_same_zip feature for a route

extract_zip <- function(address) {
  zip <- str_extract(address, "\\d{5}")
  
  return(zip)
}

Additionally, since we have the data, we should test to see if adding a categorical value designating which two boroughs were involved in a given route will provide value to a model.

get_borough_path <- function(path_string) {
  split <- unlist(str_split(path_string,","))
  b1 <- split[1]
  b2 <- split[2]
  b_vector <- c(b1, b2)
  sorted <- sort(b_vector)
  return(str_c(sorted, collapse=","))
}
borough_paths <- lapply(str_c(data$origin_borough, data$destination_borough, sep=","), get_borough_path)
data <- data %>%
  mutate(
    origin_zip = extract_zip(origin_address),
    destination_zip = extract_zip(destination_address),
    is_same_zip = origin_zip == destination_zip,
    borough_path = unlist(borough_paths),
    log_norm_geodesic = scale(log(geodesic_distance),center=TRUE, scale=TRUE)
  ) %>%
  select(origin_address, 
         origin_zip,
         origin_borough, 
         origin_coordinate,
         destination_address,
         destination_zip,
         destination_borough,
         destination_coordinate,
         is_same_zip,
         is_same_borough,
         borough_path,
         geodesic_distance,
         log_norm_geodesic,
         google_distance,
         google_duration)

EDA and Data Visualization

Address Dataset

The routes dataset is based on the initial address dataset, which includes 249 post office addresses in teh 5 boroughs of NYC.

location_data %>%
  ggplot() +
  geom_bar(aes(x=Borough, fill=Borough)) +
  theme(legend.position = "none")

Generating map plot

Below we create two functions that extract the latitude and longitude from the initial address dataset (Coordinate was saved as a string).

extract_lat <- function(coordinate_string) {
  lat <- unlist(str_split(str_replace_all(coordinate_string,"\\(?\\)?",""),","))[1]
  
  return(lat)
}
extract_long <- function(coordinate_string) {
  long <- unlist(str_split(str_replace_all(coordinate_string,"\\(?\\)?",""),","))[2]
  
  return(long)
}
lat_data = lapply(location_data$Coordinate, extract_lat)
long_data = lapply(location_data$Coordinate, extract_long)

Next we create another dataframe in the correct format for gmaps

coord_data <- location_data %>%
  select(
    Address,
    Coordinate
  ) %>%
  mutate(
    lat = as.numeric(unlist(lat_data)),
    long = as.numeric(unlist(long_data))
  ) %>%
  select(
    Address,
    lat,
    long
  )
api_key <- read_csv("key_file.csv", show_col_types = FALSE)$key[1]
register_google(key = api_key)
nymap <- get_map(location = c(lon = mean(coord_data$long), lat = mean(coord_data$lat)), zoom = 10,
                      maptype = "satellite", scale = 2)
ggmap(nymap) +
  geom_point(data = coord_data, aes(x = long, y = lat, fill = "red", alpha = 0.8), size = 2, shape = 21) +
  guides(fill=FALSE, alpha=FALSE, size=FALSE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Routes Dataset

describe(data$google_duration)

From the below histogram the response variable ‘google_duration’ (how long google estimates a trip between two points will tage) is normally distributed with a mean of 1897.7 and a standard deviation of 727.13

data %>%
  ggplot() +
  geom_histogram((aes(x=google_duration)))

describe(data$geodesic_distance)

Unlike ‘google_duration’, the primary independent variable ‘geodesic_distance’ has a clear right skew and potentially breaches assumption of normality.

The Shapiro test confirms non-normality with a p-value close to zero.

shapiro.test(sample(data$geodesic_distance,100))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(data$geodesic_distance, 100)
## W = 0.94321, p-value = 0.0003042
data %>%
  ggplot() +
  geom_histogram((aes(x=geodesic_distance)))

qqnorm(data$geodesic_distance)
qqline(data$geodesic_distance)

Let’s test if the log is any better

data %>%
  ggplot() +
  geom_histogram((aes(x=log_norm_geodesic)))

It seems that the log is not much better than standard geodesic_distance.

qqnorm(data$log_norm_geodesic)
qqline(data$log_norm_geodesic)

shapiro.test(sample(data$log_norm_geodesic,100))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(data$log_norm_geodesic, 100)
## W = 0.91418, p-value = 7.002e-06
cor(data$geodesic_distance, data$google_duration)
## [1] 0.8704409
data %>%
  ggplot(aes(x=geodesic_distance, y=google_duration)) +
  geom_point() +
  geom_smooth(method="lm")

Let’s see if we can see a significant effect on the response based on different ‘borough_paths’

data %>%
  ggplot() +
  geom_point(aes(x=geodesic_distance, y=google_duration)) +
  facet_wrap(~borough_path)

data %>%
  ggplot() + 
  geom_boxplot(aes(x=google_duration, y=borough_path), fill="purple")

Perform ANOVA test

anova <- aov(google_duration ~ borough_path, data = data)

summary(anova)
##                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## borough_path    14 9.100e+09 649987339    2858 <2e-16 ***
## Residuals    30773 6.999e+09    227452                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Following this F Test, we confirm that the 'borough_path' has a significant impact on the ultimate google_duration and that this variable should be retaiend in our model.

Create Linear Model

Baseline model using just geodesic distance

baseline = lm(google_duration ~ geodesic_distance, data=data)
summary(baseline)
## 
## Call:
## lm(formula = google_duration ~ geodesic_distance, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1218.15  -255.95   -45.47   214.78  1604.54 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       778.6676     4.1503   187.6   <2e-16 ***
## geodesic_distance  72.8380     0.2348   310.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 356 on 30786 degrees of freedom
## Multiple R-squared:  0.7577, Adjusted R-squared:  0.7577 
## F-statistic: 9.625e+04 on 1 and 30786 DF,  p-value: < 2.2e-16

Interpreting the above, we can write out the following linear model:

google_duration = 772.8249 + 73.1228*geodesic_distance

Intercept: assuming geodesic_distance of 0, the google_duration will be roughly 773 seconds

Slope: for each additinal km added to geodesic distance, we can expect that google_duration will increase be roughly 73 seconds.

Baseline log

log_model = lm(google_duration ~ log_norm_geodesic, data=data)
summary(log_model)
## 
## Call:
## lm(formula = google_duration ~ log_norm_geodesic, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -997.32 -300.78  -21.85  266.35 1669.34 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1901.97       2.23   853.0   <2e-16 ***
## log_norm_geodesic   608.15       2.23   272.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 391.3 on 30786 degrees of freedom
## Multiple R-squared:  0.7073, Adjusted R-squared:  0.7073 
## F-statistic: 7.438e+04 on 1 and 30786 DF,  p-value: < 2.2e-16

Adding additional features

feature_model = lm(google_duration ~ poly(geodesic_distance,1) + is_same_zip + borough_path, data=data)
summary(feature_model)
## 
## Call:
## lm(formula = google_duration ~ poly(geodesic_distance, 1) + is_same_zip + 
##     borough_path, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1106.1  -203.8   -16.3   184.8  1422.2 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               1643.330     10.098 162.742  < 2e-16
## poly(geodesic_distance, 1)              102094.215    484.794 210.593  < 2e-16
## is_same_zipTRUE                           -359.860     38.426  -9.365  < 2e-16
## borough_pathBronx,Brooklyn                 325.742     12.677  25.695  < 2e-16
## borough_pathBronx,Manhattan                 60.776     11.407   5.328 1.00e-07
## borough_pathBronx,Queens                   -14.589     11.686  -1.248    0.212
## borough_pathBronx,Staten Island            295.750     17.859  16.560  < 2e-16
## borough_pathBrooklyn,Brooklyn              353.716     11.924  29.665  < 2e-16
## borough_pathBrooklyn,Manhattan             567.055     11.088  51.139  < 2e-16
## borough_pathBrooklyn,Queens                364.738     11.247  32.430  < 2e-16
## borough_pathBrooklyn,Staten Island         433.577     14.199  30.535  < 2e-16
## borough_pathManhattan,Manhattan             80.746     11.824   6.829 8.72e-12
## borough_pathManhattan,Queens               220.075     11.174  19.695  < 2e-16
## borough_pathManhattan,Staten Island        520.086     14.805  35.129  < 2e-16
## borough_pathQueens,Queens                   65.196     11.867   5.494 3.96e-08
## borough_pathQueens,Staten Island           393.513     15.839  24.845  < 2e-16
## borough_pathStaten Island,Staten Island     -7.474     29.746  -0.251    0.802
##                                            
## (Intercept)                             ***
## poly(geodesic_distance, 1)              ***
## is_same_zipTRUE                         ***
## borough_pathBronx,Brooklyn              ***
## borough_pathBronx,Manhattan             ***
## borough_pathBronx,Queens                   
## borough_pathBronx,Staten Island         ***
## borough_pathBrooklyn,Brooklyn           ***
## borough_pathBrooklyn,Manhattan          ***
## borough_pathBrooklyn,Queens             ***
## borough_pathBrooklyn,Staten Island      ***
## borough_pathManhattan,Manhattan         ***
## borough_pathManhattan,Queens            ***
## borough_pathManhattan,Staten Island     ***
## borough_pathQueens,Queens               ***
## borough_pathQueens,Staten Island        ***
## borough_pathStaten Island,Staten Island    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 304.6 on 30771 degrees of freedom
## Multiple R-squared:  0.8226, Adjusted R-squared:  0.8225 
## F-statistic:  8920 on 16 and 30771 DF,  p-value: < 2.2e-16

The Adjusted R Squared went up a lot!

Testing out polynomials

poly_model = lm(google_duration ~ poly(geodesic_distance,2) + is_same_zip  + borough_path, data=data)
summary(poly_model)
## 
## Call:
## lm(formula = google_duration ~ poly(geodesic_distance, 2) + is_same_zip + 
##     borough_path, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -970.8 -202.2  -15.5  183.5 1396.3 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               1725.851      9.913 174.099  < 2e-16
## poly(geodesic_distance, 2)1             100333.215    469.865 213.536  < 2e-16
## poly(geodesic_distance, 2)2             -17659.091    376.655 -46.884  < 2e-16
## is_same_zipTRUE                           -247.442     37.201  -6.651 2.95e-11
## borough_pathBronx,Brooklyn                 223.750     12.439  17.988  < 2e-16
## borough_pathBronx,Manhattan                -50.455     11.273  -4.476 7.64e-06
## borough_pathBronx,Queens                  -138.634     11.595 -11.956  < 2e-16
## borough_pathBronx,Staten Island            496.379     17.777  27.923  < 2e-16
## borough_pathBrooklyn,Brooklyn              322.294     11.539  27.931  < 2e-16
## borough_pathBrooklyn,Manhattan             456.268     10.970  41.592  < 2e-16
## borough_pathBrooklyn,Queens                237.972     11.197  21.253  < 2e-16
## borough_pathBrooklyn,Staten Island         317.540     13.940  22.780  < 2e-16
## borough_pathManhattan,Manhattan             75.335     11.424   6.595 4.34e-11
## borough_pathManhattan,Queens               101.413     11.088   9.146  < 2e-16
## borough_pathManhattan,Staten Island        448.183     14.385  31.156  < 2e-16
## borough_pathQueens,Queens                    2.534     11.542   0.220   0.8262
## borough_pathQueens,Staten Island           430.652     15.322  28.106  < 2e-16
## borough_pathStaten Island,Staten Island    -61.804     28.761  -2.149   0.0317
##                                            
## (Intercept)                             ***
## poly(geodesic_distance, 2)1             ***
## poly(geodesic_distance, 2)2             ***
## is_same_zipTRUE                         ***
## borough_pathBronx,Brooklyn              ***
## borough_pathBronx,Manhattan             ***
## borough_pathBronx,Queens                ***
## borough_pathBronx,Staten Island         ***
## borough_pathBrooklyn,Brooklyn           ***
## borough_pathBrooklyn,Manhattan          ***
## borough_pathBrooklyn,Queens             ***
## borough_pathBrooklyn,Staten Island      ***
## borough_pathManhattan,Manhattan         ***
## borough_pathManhattan,Queens            ***
## borough_pathManhattan,Staten Island     ***
## borough_pathQueens,Queens                  
## borough_pathQueens,Staten Island        ***
## borough_pathStaten Island,Staten Island *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 294.3 on 30770 degrees of freedom
## Multiple R-squared:  0.8345, Adjusted R-squared:  0.8344 
## F-statistic:  9124 on 17 and 30770 DF,  p-value: < 2.2e-16

I am happy with this model. Let’s see what the RSME is!

sqrt(sum(poly_model$residuals^2) / length(poly_model$residuals))
## [1] 294.2121

Not bad at all! RSME is 294 seconds, or just under 5 minutes.

294/60

ggplot(data = poly_model, aes(x = .fitted, y = .resid, color=borough_path, alpha=.1)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

poly_model %>%
  ggplot() +
  geom_boxplot(aes(x=.resid, y=borough_path), fill="purple") +
  xlab("Residuals") +
  ylab("Borough Path")

qqnorm(poly_model$residuals)
qqline(poly_model$residuals)

ggplot() +
  geom_histogram(aes(x=poly_model$residuals))

data <- data %>%
  mutate(
    predictions = poly_model$fitted.values,
    residuals = poly_model$residuals
  )
data %>%
  arrange(desc(residuals))

2012 / 60

get_address_residual <- function(address){
  in_scope_df <- data%>%
                  filter(
                    origin_address == address | destination_address == address
                  )
  avg_residual = mean(abs(in_scope_df$residuals))
  
  return(avg_residual)
  
}
coord_data <- coord_data %>%
  mutate(
    average_residual = unlist(lapply(coord_data$Address, get_address_residual)),
    residual_group = ntile(average_residual, 20))
ggmap(nymap) +
  geom_point(data = coord_data, aes(x = long, y = lat, color = residual_group), size = 2, shape = 20, position="jitter") +
  scale_color_gradient(low="green", high="red")

ggmap(nymap) +
  geom_point(data = coord_data[coord_data$residual_group %in% c(18,19,20),], aes(x = long, y = lat), color = "orange", size = 2, shape = 20, position="jitter")

Over Predictions 14506 243rd St, Rosedale, NY 11422 626 Sheepshead Bay Rd Ste 8, Brooklyn, NY 11224

Under Predictions 45 Bay St Ste 2, Staten Island, NY 10301 1369 Broadway, Brooklyn, NY 11221

under_predictions <- coord_data %>%
  mutate(
    type = "under"
  ) %>%
  filter(
    str_detect(Address, "45 Bay St Ste 2") | str_detect(Address, "1369 Broadway")
  )
over_predictions <- coord_data %>%
  mutate(
    type = "over"
  ) %>%
  filter(
    str_detect(Address, "14506 243rd St") | str_detect(Address, "626 Sheepshead Bay Rd")
  )
bad_predictions <- rbind(over_predictions, under_predictions)
ggmap(nymap) +
  geom_point(data = bad_predictions, aes(x = long, y = lat, color = type), size = 2, shape = 20, position="jitter")