In this section we will first explore the dataset to gain an understanding of our relevant parameters and to determine trends. Then we will aggreggate our data to zones which in this case we selected to be New York City Post Code. From this, we then call the Open Streem Map API to gain data on the built environment factors that may influence Passenger Demand.

library(glmnet)
library(lme4)
library(readr)
library(Hmisc)
library(osmdata)
library(darksky)
library(ggplot2)
library(tidyverse)
library(rgdal)
library(rgeos)
library(tmap)
library(leaflet)
library(RColorBrewer)
library(sp)
library(spatialEco)
library(mapview)
library(sf)
library(dplyr)
library(tidyr)
library(plm)
library(corrplot)

Start of interactive Tutorial::

wd = "~/Uni Temp Work/Big Data"
plsep <- .Platform$file.sep
march <- read.csv(paste(wd, plsep, "march.csv", sep =""))

Exploratory Data Analysis

factors <- c("hour", "day", "RateCodeID", "Payment_type", "summary")
march[, factors] <- lapply(march[factors], as.factor)
days <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
rate_code <- c("")
march <- arrange(transform(march,
                           day=factor(day,levels=days)),day)

Distance

ggplot(march, aes(x = Trip_distance))+
  geom_histogram(binwidth = 0.3, aes(fill = ..count..)) +
  scale_fill_gradient( name = "Frequency",
                       low = "orange",
                       high = "darkred", 
                       na.value = "grey50",
                       guide = "colourbar") +
  coord_cartesian(xlim = c(0,12)) +
  ggtitle("Histogram for Trip Distance") +
  xlab("Trip Distance (miles)") + 
  ylab("Frequency of Trip Distance") +
  geom_vline(aes(xintercept=mean(Trip_distance)),
             color="black", linetype="dashed", size=1) +
  theme_bw()

What distribution do you think this approximates to ?

Time of day

hour <- ggplot(march, aes(x = hour, y = Trip_distance)) + 
  geom_col(fill = "#6CB2EB") + 
  theme_bw() +
  ggtitle("Trip Distance by time of day") +
  xlab("Hour of day") +
  ylab(expression('Miles Travelled by Green Taxi')) +
  theme(axis.title.x=element_text(size = 12, face = "bold")) +
  theme(axis.text.x=element_text(size=10, face = "bold")) + 
  theme(axis.title.y=element_text(size=12, face = "bold")) +
  theme(axis.text.y=element_text(size=10, face = "bold")) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) 
hour

day <- ggplot(march, aes(x = day, y = Trip_distance)) + 
  geom_col(aes(fill = day )) + 
  scale_fill_brewer(palette="Dark2") +
  theme_bw() +
  ggtitle("Trip Distance by Day of the Week") +
  xlab("Day of Week") +
  ylab(expression('Miles Travelled by Green Taxi')) +
  theme(axis.title.x=element_text(size = 12, face = "bold")) +
  theme(axis.text.x=element_text(size=10, face = "bold")) + 
  theme(axis.title.y=element_text(size=12, face = "bold")) +
  theme(axis.text.y=element_text(size=10, face = "bold")) +
  theme(legend.position = "none") +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  scale_x_discrete(limits = days)
day

hour_day <- ggplot(march, aes(x = hour, y = Trip_distance)) + 
  geom_col(fill = "#6CB2EB") + 
  theme_bw() +
  ggtitle("Trip Distance by time of day and day of week") +
  xlab("Hour of day") +
  ylab(expression('Miles Travelled by Green Taxi')) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_blank()) + 
  theme(axis.title.y=element_text(size=12, face = "bold")) +
  theme(axis.text.y=element_text(size=10, face = "bold")) +
  theme(axis.ticks.x = element_blank()) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  facet_wrap(~ day)
hour_day

When is peak taxi demand ? What problems occur from having such variations in demand? For City Planners ? For Taxi Companies ? For Taxi Drivers ? For Air Pollution ?

Taxi Fare:

ggplot(march, aes(x = day, y = Total_amount)) +
  geom_boxplot(fill = "#6CB2EB", color = "#22292F",
               alpha = .8) +
  labs(
    title = "Total taxi fare amount and day of week",
    subtitle = str_wrap(paste("Hmm not very interesting! Quite a lot of outliers (Remember it is lognormally distributed!!)"), 
                        width = 80)) +
  theme(
    panel.background = element_rect(color = "steelblue"),
    axis.text        = element_text(size = rel(1.2))
  )

fare_graph_hour <- march %>% 
  group_by(hour) %>% 
  ggplot(aes(x = hour, y = Fare_amount)) + 
  stat_summary(fun.data = "mean_cl_normal",
               fun.args = list(
                 conf.int = .99
               )) +
  facet_wrap(~ day) +
  geom_hline(aes(yintercept=mean(march$Fare_amount)),
             color="red", linetype="dashed", size=1) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_blank()) + 
  theme(axis.ticks.x = element_blank()) +
  ylab(expression('Fare Amount $'))
fare_graph_hour

tip_graph_hour <- march %>% 
  group_by(hour) %>% 
  ggplot(aes(x = hour, y = Tip_amount)) + 
  stat_summary(fun.data = "mean_cl_normal",
               fun.args = list(
                 conf.int = .99
               )) +
  facet_wrap(~ day) +
  geom_hline(aes(yintercept=mean(march$Tip_amount)),
             color="red", linetype="dashed", size=1) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_blank()) + 
  theme(axis.ticks.x = element_blank()) +
  ylab(expression('Tip Amount $'))
tip_graph_hour

Why does Taxi fare spike in the morning? hen would you prefer to be a taxi driver? ### Capacity:

capacity_graph <- march %>% 
  group_by(hour) %>% 
  ggplot(aes(x = hour, y = Passenger_count)) + 
  stat_summary(fun.data = "mean_cl_normal",
               fun.args = list(
                 conf.int = .99
               )) +
  geom_hline(aes(yintercept=mean(march$Passenger_count)),
             color="red", linetype="dashed", size=1) +
  ylab(expression('Passenger Count')) +
  xlab("Time of Day") +
  labs(
    title = "Passenger Count and Time of day",
    subtitle = str_wrap(paste("Lower amount of passengers per vehicle at times of day when Congestion is typically higher"), 
                        width = 80))
capacity_graph

What problems are presented from this graph? How would you attempt to increase ridership capacity? ### Weather:

ggplot(march, aes(x = apparentTemperature))+
  geom_histogram(binwidth = 1, aes(fill = ..count..)) +
  scale_fill_gradient( name = "Frequency",
                       low = "orange",
                       high = "darkred", 
                       na.value = "grey50",
                       guide = "colourbar") +
  coord_cartesian(xlim = c(-10,28)) +
  ggtitle("Histogram for Apparent Temperature") +
  xlab("Apparent Temperature (C)") + 
  ylab("Frequency of Temperature") +
  geom_vline(aes(xintercept=mean(apparentTemperature)),
             color="black", linetype="dashed", size=1) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme_bw()

What do you notice about the temperature variation? How do you think this would impact taxi demand?

Spatial Analysis

Now that we have analysed our data, we need to assign each data point a zone in order to complete our regression analysis. Here we will do so by dividing our data into New York City Postcodes. We get this data from publically available New York City Data. Additionally, this study was able to find demographic data on a post code level of granularity. This would enable us to perform more interesting regression analyse and analyse the impact that demographics may have on taxi demand.

output_areas <- readOGR(dsn = wd, layer = "ZIP_CODE_040114")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\44792\Documents\Uni Temp Work\Big Data", layer: "ZIP_CODE_040114"
## with 263 features
## It has 12 fields
demographic <- read_csv(paste(wd, plsep, "Demographic_Statistics_By_Zip_Code.csv", sep =""))
summary(output_areas@data)
##    ZIPCODE            BLDGZIP            PO_NAME            POPULATION      
##  Length:263         Length:263         Length:263         Min.   :     0.0  
##  Class :character   Class :character   Class :character   1st Qu.:    49.5  
##  Mode  :character   Mode  :character   Mode  :character   Median : 27985.0  
##                                                           Mean   : 31933.9  
##                                                           3rd Qu.: 54445.0  
##                                                           Max.   :109069.0  
##       AREA              STATE              COUNTY            ST_FIPS         
##  Min.   :     3155   Length:263         Length:263         Length:263        
##  1st Qu.:   964323   Class :character   Class :character   Class :character  
##  Median : 21927545   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 31816554                                                           
##  3rd Qu.: 45935567                                                           
##  Max.   :473985727                                                           
##    CTY_FIPS             URL              SHAPE_AREA   SHAPE_LEN
##  Length:263         Length:263         Min.   :0    Min.   :0  
##  Class :character   Class :character   1st Qu.:0    1st Qu.:0  
##  Mode  :character   Mode  :character   Median :0    Median :0  
##                                        Mean   :0    Mean   :0  
##                                        3rd Qu.:0    3rd Qu.:0  
##                                        Max.   :0    Max.   :0
summary(demographic) ### Clear problems with the data, how do you evaluate bad data: 
##  JURISDICTION NAME COUNT PARTICIPANTS  COUNT FEMALE   PERCENT FEMALE 
##  Min.   :10001     Min.   :  0.00     Min.   :  0.0   Min.   :0.000  
##  1st Qu.:10452     1st Qu.:  0.00     1st Qu.:  0.0   1st Qu.:0.000  
##  Median :11216     Median :  0.00     Median :  0.0   Median :0.000  
##  Mean   :11127     Mean   : 17.66     Mean   : 10.3   Mean   :0.244  
##  3rd Qu.:11422     3rd Qu.: 13.00     3rd Qu.:  6.0   3rd Qu.:0.515  
##  Max.   :20459     Max.   :272.00     Max.   :194.0   Max.   :1.000  
##    COUNT MALE       PERCENT MALE   COUNT GENDER UNKNOWN PERCENT GENDER UNKNOWN
##  Min.   :  0.000   Min.   :0.000   Min.   :0            Min.   :0             
##  1st Qu.:  0.000   1st Qu.:0.000   1st Qu.:0            1st Qu.:0             
##  Median :  0.000   Median :0.000   Median :0            Median :0             
##  Mean   :  7.364   Mean   :0.201   Mean   :0            Mean   :0             
##  3rd Qu.:  4.000   3rd Qu.:0.400   3rd Qu.:0            3rd Qu.:0             
##  Max.   :157.000   Max.   :1.000   Max.   :0            Max.   :0             
##  COUNT GENDER TOTAL PERCENT GENDER TOTAL COUNT PACIFIC ISLANDER
##  Min.   :  0.00     Min.   :  0.00       Min.   :0.00000       
##  1st Qu.:  0.00     1st Qu.:  0.00       1st Qu.:0.00000       
##  Median :  0.00     Median :  0.00       Median :0.00000       
##  Mean   : 17.66     Mean   : 44.49       Mean   :0.02542       
##  3rd Qu.: 13.00     3rd Qu.:100.00       3rd Qu.:0.00000       
##  Max.   :272.00     Max.   :100.00       Max.   :2.00000       
##  PERCENT PACIFIC ISLANDER COUNT HISPANIC LATINO PERCENT HISPANIC LATINO
##  Min.   :0.0000000        Min.   : 0.000        Min.   :0.00000        
##  1st Qu.:0.0000000        1st Qu.: 0.000        1st Qu.:0.00000        
##  Median :0.0000000        Median : 0.000        Median :0.00000        
##  Mean   :0.0002966        Mean   : 1.856        Mean   :0.07983        
##  3rd Qu.:0.0000000        3rd Qu.: 0.000        3rd Qu.:0.00000        
##  Max.   :0.0200000        Max.   :51.000        Max.   :1.00000        
##  COUNT AMERICAN INDIAN PERCENT AMERICAN INDIAN COUNT ASIAN NON HISPANIC
##  Min.   :0.00000       Min.   :0.000000        Min.   : 0.0000         
##  1st Qu.:0.00000       1st Qu.:0.000000        1st Qu.: 0.0000         
##  Median :0.00000       Median :0.000000        Median : 0.0000         
##  Mean   :0.02119       Mean   :0.001017        Mean   : 0.5254         
##  3rd Qu.:0.00000       3rd Qu.:0.000000        3rd Qu.: 0.0000         
##  Max.   :2.00000       Max.   :0.200000        Max.   :28.0000         
##  PERCENT ASIAN NON HISPANIC COUNT WHITE NON HISPANIC PERCENT WHITE NON HISPANIC
##  Min.   :0.00000            Min.   :  0.00           Min.   :0.0000            
##  1st Qu.:0.00000            1st Qu.:  0.00           1st Qu.:0.0000            
##  Median :0.00000            Median :  0.00           Median :0.0000            
##  Mean   :0.05657            Mean   : 12.19           Mean   :0.1778            
##  3rd Qu.:0.00000            3rd Qu.:  1.00           3rd Qu.:0.0225            
##  Max.   :1.00000            Max.   :262.00           Max.   :1.0000            
##  COUNT BLACK NON HISPANIC PERCENT BLACK NON HISPANIC COUNT OTHER ETHNICITY
##  Min.   : 0.000           Min.   :0.0000             Min.   : 0.0000      
##  1st Qu.: 0.000           1st Qu.:0.0000             1st Qu.: 0.0000      
##  Median : 0.000           Median :0.0000             Median : 0.0000      
##  Mean   : 2.233           Mean   :0.1042             Mean   : 0.6864      
##  3rd Qu.: 0.000           3rd Qu.:0.0000             3rd Qu.: 0.0000      
##  Max.   :60.000           Max.   :1.0000             Max.   :17.0000      
##  PERCENT OTHER ETHNICITY COUNT ETHNICITY UNKNOWN PERCENT ETHNICITY UNKNOWN
##  Min.   :0.00000         Min.   :0.0000          Min.   :0.000000         
##  1st Qu.:0.00000         1st Qu.:0.0000          1st Qu.:0.000000         
##  Median :0.00000         Median :0.0000          Median :0.000000         
##  Mean   :0.02131         Mean   :0.1229          Mean   :0.003941         
##  3rd Qu.:0.00000         3rd Qu.:0.0000          3rd Qu.:0.000000         
##  Max.   :0.50000         Max.   :5.0000          Max.   :0.250000         
##  COUNT ETHNICITY TOTAL PERCENT ETHNICITY TOTAL COUNT PERMANENT RESIDENT ALIEN
##  Min.   :  0.00        Min.   :  0.00          Min.   : 0.0000               
##  1st Qu.:  0.00        1st Qu.:  0.00          1st Qu.: 0.0000               
##  Median :  0.00        Median :  0.00          Median : 0.0000               
##  Mean   : 17.66        Mean   : 44.45          Mean   : 0.4449               
##  3rd Qu.: 13.00        3rd Qu.:100.00          3rd Qu.: 0.0000               
##  Max.   :272.00        Max.   :100.00          Max.   :10.0000               
##  PERCENT PERMANENT RESIDENT ALIEN COUNT US CITIZEN PERCENT US CITIZEN
##  Min.   :0.00000                  Min.   :  0.00   Min.   :0.0000    
##  1st Qu.:0.00000                  1st Qu.:  0.00   1st Qu.:0.0000    
##  Median :0.00000                  Median :  0.00   Median :0.0000    
##  Mean   :0.02419                  Mean   : 17.17   Mean   :0.4187    
##  3rd Qu.:0.00000                  3rd Qu.: 12.00   3rd Qu.:0.9900    
##  Max.   :1.00000                  Max.   :271.00   Max.   :1.0000    
##  COUNT OTHER CITIZEN STATUS PERCENT OTHER CITIZEN STATUS
##  Min.   :0.00000            Min.   :0.000000            
##  1st Qu.:0.00000            1st Qu.:0.000000            
##  Median :0.00000            Median :0.000000            
##  Mean   :0.04661            Mean   :0.002076            
##  3rd Qu.:0.00000            3rd Qu.:0.000000            
##  Max.   :2.00000            Max.   :0.330000            
##  COUNT CITIZEN STATUS UNKNOWN PERCENT CITIZEN STATUS UNKNOWN
##  Min.   :0                    Min.   :0                     
##  1st Qu.:0                    1st Qu.:0                     
##  Median :0                    Median :0                     
##  Mean   :0                    Mean   :0                     
##  3rd Qu.:0                    3rd Qu.:0                     
##  Max.   :0                    Max.   :0                     
##  COUNT CITIZEN STATUS TOTAL PERCENT CITIZEN STATUS TOTAL
##  Min.   :  0.00             Min.   :  0.00              
##  1st Qu.:  0.00             1st Qu.:  0.00              
##  Median :  0.00             Median :  0.00              
##  Mean   : 17.66             Mean   : 44.49              
##  3rd Qu.: 13.00             3rd Qu.:100.00              
##  Max.   :272.00             Max.   :100.00              
##  COUNT RECEIVES PUBLIC ASSISTANCE PERCENT RECEIVES PUBLIC ASSISTANCE
##  Min.   :  0.000                  Min.   :0.0000                    
##  1st Qu.:  0.000                  1st Qu.:0.0000                    
##  Median :  0.000                  Median :0.0000                    
##  Mean   :  5.975                  Mean   :0.1392                    
##  3rd Qu.:  3.250                  3rd Qu.:0.2525                    
##  Max.   :155.000                  Max.   :1.0000                    
##  COUNT NRECEIVES PUBLIC ASSISTANCE PERCENT NRECEIVES PUBLIC ASSISTANCE
##  Min.   :  0.00                    Min.   :0.0000                     
##  1st Qu.:  0.00                    1st Qu.:0.0000                     
##  Median :  0.00                    Median :0.0000                     
##  Mean   : 11.69                    Mean   :0.3058                     
##  3rd Qu.:  8.00                    3rd Qu.:0.6625                     
##  Max.   :206.00                    Max.   :1.0000                     
##  COUNT PUBLIC ASSISTANCE UNKNOWN PERCENT PUBLIC ASSISTANCE UNKNOWN
##  Min.   :0                       Min.   :0                        
##  1st Qu.:0                       1st Qu.:0                        
##  Median :0                       Median :0                        
##  Mean   :0                       Mean   :0                        
##  3rd Qu.:0                       3rd Qu.:0                        
##  Max.   :0                       Max.   :0                        
##  COUNT PUBLIC ASSISTANCE TOTAL PERCENT PUBLIC ASSISTANCE TOTAL
##  Min.   :  0.00                Min.   :  0.00                 
##  1st Qu.:  0.00                1st Qu.:  0.00                 
##  Median :  0.00                Median :  0.00                 
##  Mean   : 17.66                Mean   : 44.49                 
##  3rd Qu.: 13.00                3rd Qu.:100.00                 
##  Max.   :272.00                Max.   :100.00
output_areas@data <- output_areas@data[, c(1,3,4,5,7)]
demographic <- demographic[, c(1,4,6,14,18,20,22)]
output_areas@data$ZIPCODE <- as.numeric(output_areas@data$ZIPCODE)

Unfortunately the data on postcode from the government website is inadequate for further investigation. There is a lack of data points for each post-code which yield findings that are inconsistent with reality.

#Join data to the shapefile:

summary(output_areas@data)
##     ZIPCODE        PO_NAME            POPULATION            AREA          
##  Min.   :   83   Length:263         Min.   :     0.0   Min.   :     3155  
##  1st Qu.:10114   Class :character   1st Qu.:    49.5   1st Qu.:   964323  
##  Median :10459   Mode  :character   Median : 27985.0   Median : 21927545  
##  Mean   :10629                      Mean   : 31933.9   Mean   : 31816554  
##  3rd Qu.:11234                      3rd Qu.: 54445.0   3rd Qu.: 45935567  
##  Max.   :11697                      Max.   :109069.0   Max.   :473985727  
##     COUNTY         
##  Length:263        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
row.names(output_areas@data) <- NULL 
output_areas@data$POP_DENSITY <- output_areas@data$POPULATION / output_areas@data$AREA
output_areas@data$PO_NAME <- as.factor(output_areas@data$PO_NAME)
output_areas@data$COUNTY <- as.factor(output_areas@data$COUNTY)
output_areas@data$log_POP_DENSITY <- log(output_areas@data$POPULATION / output_areas@data$AREA)
#output_areas@data <- output_areas@data[!duplicated(output_areas@data$ZIPCODE), ]

Display New York Area

tm_shape(output_areas) + tm_fill("POPULATION", style = "pretty", palette =  "-RdPu",  title = "Population") +
  tm_borders(alpha = .4) + 
  tm_compass() + 
  tm_layout(title = "New York Post Codes by \nPopulation")

tm_shape(output_areas) + tm_fill("log_POP_DENSITY", style = "pretty", palette =  "-RdPu",  title = "Log Pop Density") +
  tm_borders(alpha = .4) + 
  tm_compass() + 
  tm_layout(title = "New York Post Codes by \nPopulation Density")

Join the point data and illustrate pick-up locations spatially

data_50000 <- march[1:50000,]

locations_sf <- data_50000 %>% 
  st_as_sf(coords = c("Pickup_longitude", "Pickup_latitude")) %>% 
          st_set_crs(4269) %>% 
          st_transform(2263)
postcodes_sf <- st_as_sf(output_areas, crs = "NAD83")

march_gis <- march %>% 
  st_as_sf(coords = c("Pickup_longitude", "Pickup_latitude")) %>% 
  st_set_crs(4269) %>% 
  st_transform(2263)
st_crs(march_gis) == st_crs(postcodes_sf)
## [1] TRUE
joined <- st_join(march_gis, postcodes_sf, join = st_within)
head(joined)
## Simple feature collection with 6 features and 36 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 997080.2 ymin: 189213.6 xmax: 1014775 ymax: 257568.8
## projected CRS:  NAD83 / New York Long Island (ftUS)
##            time ï..VendorID Store_and_fwd_flag RateCodeID Dropoff_longitude
## 1 03/07/2016 00           2                  N          1         -73.94080
## 2 03/07/2016 00           2                  N          1         -73.88988
## 3 03/07/2016 00           2                  N          1         -73.94886
## 4 03/07/2016 00           2                  N          1         -73.93330
## 5 03/07/2016 00           2                  N          1         -73.88295
## 6 03/07/2016 00           2                  N          1         -73.95479
##   Dropoff_latitude Passenger_count Trip_distance Fare_amount Extra MTA_tax
## 1         40.80592               6          1.69         7.5   0.5     0.5
## 2         40.74689               1          0.02         2.5   0.5     0.5
## 3         40.82837               5          0.52         4.0   0.5     0.5
## 4         40.68824               1          0.63         4.5   0.5     0.5
## 5         40.86977               1          0.68         4.0   0.5     0.5
## 6         40.73359               1          1.07         5.0   0.5     0.5
##   Tip_amount Tolls_amount improvement_surcharge Total_amount Payment_type
## 1          0            0                   0.3          8.8            2
## 2          0            0                   0.3          3.8            2
## 3          1            0                   0.3          6.3            1
## 4          0            0                   0.3          5.8            2
## 5          0            0                   0.3          5.3            1
## 6          0            0                   0.3          6.3            2
##   Trip_type     Pickup_Datetime    Dropoff_Datetime    day hour month  Duration
## 1         1 2016-03-07 00:37:47 2016-03-07 00:44:17 Monday    0     3 6.5000000
## 2         1 2016-03-07 00:01:22 2016-03-07 00:01:44 Monday    0     3 0.3666667
## 3         1 2016-03-07 00:00:11 2016-03-07 00:02:11 Monday    0     3 2.0000000
## 4         1 2016-03-07 00:00:06 2016-03-07 00:03:02 Monday    0     3 2.9333333
## 5         1 2016-03-07 00:01:25 2016-03-07 00:03:33 Monday    0     3 2.1333333
## 6         1 2016-03-07 00:00:14 2016-03-07 00:03:24 Monday    0     3 3.1666667
##       Speed  summary precipProbability apparentTemperature humidity windSpeed
## 1 15.600000 Overcast              0.01                4.47     0.45      1.25
## 2  3.272727 Overcast              0.01                4.47     0.45      1.25
## 3 15.600000 Overcast              0.01                4.47     0.45      1.25
## 4 12.886364 Overcast              0.01                4.47     0.45      1.25
## 5 19.125000 Overcast              0.01                4.47     0.45      1.25
## 6 20.273684 Overcast              0.01                4.47     0.45      1.25
##   ZIPCODE          PO_NAME POPULATION     AREA   COUNTY  POP_DENSITY
## 1   10029         New York      77503 22963436 New York 0.0033750612
## 2   11373         Elmhurst     101282 42654860   Queens 0.0023744539
## 3   10031         New York      57010 16902152 New York 0.0033729432
## 4   11216         Brooklyn      53862 26478229    Kings 0.0020341995
## 5   10468            Bronx      72877 34447605    Bronx 0.0021155898
## 6   11101 Long Island City      26254 78962089   Queens 0.0003324887
##   log_POP_DENSITY                  geometry
## 1       -5.691342 POINT (998453.4 226735.6)
## 2       -6.042988  POINT (1014508 211403.9)
## 3       -5.691970 POINT (997080.2 238712.4)
## 4       -6.197653 POINT (998858.4 189213.6)
## 5       -6.158422  POINT (1014775 257568.8)
## 6       -8.008905 POINT (999280.8 211017.2)

Getting Open Streetmap Data:

Next we will gather built environment factors to proceeed with our regression analysis. This tutorial selected nightclubs, hospitals, bars, social facilities and tourist attractions as locations that may be of interest. However this tutorial wants you to go out there and analyse other built environment factors that you feel are relevant. See https://wiki.openstreetmap.org/wiki/Map_features

bb <- getbb('New York City, New York, USA', format_out = 'polygon')

nightclub <- opq(bbox = bb) %>%
  add_osm_feature(key = 'amenity', value = 'nightclub') %>%
  osmdata_sf() %>%
  trim_osmdata(bb)

hospital <- opq(bbox = bb) %>%
  add_osm_feature(key = 'amenity', value = 'hospital') %>%
  osmdata_sf() %>%
  trim_osmdata(bb)

bar <- opq(bbox = bb) %>% 
  add_osm_feature(key = 'amenity', value = 'bar') %>%
  osmdata_sf() %>%
  trim_osmdata(bb)

social_facility <- opq(bbox = bb) %>% 
  add_osm_feature(key = 'amenity', value = 'social_facility') %>%
  osmdata_sf() %>%
  trim_osmdata(bb)

attraction <- opq(bbox = bb) %>%  
  add_osm_feature(key = 'tourism', value = 'attraction') %>%
  osmdata_sf() %>%
  trim_osmdata(bb)

Clean Open Street Map Data and put into dataframe.

variables <- c("name","addr.postcode")
features <- c("nightclub", "hospital", "bar", "attraction", "social_facility")

result <- lapply(mget(features), function(x) 
  x$osm_points[!is.na(x$osm_points$addr.postcode), variables])

nightclub1 <- nightclub$osm_points[!is.na(nightclub$osm_points$addr.postcode), variables]
attraction1 <- attraction$osm_points[!is.na(attraction$osm_points$addr.postcode), variables]
bar1 <- bar$osm_points[!is.na(bar$osm_points$addr.postcode), variables]
social_facility1 <- social_facility$osm_points[!is.na(social_facility$osm_points$addr.postcode), variables]
hospital1 <- hospital$osm_points[!is.na(hospital$osm_points$addr.postcode), variables]

feature <- rep("hospital", nrow(hospital1))
hospital1 <- cbind(hospital1, feature)

feature <- rep("attraction", nrow(attraction1))
attraction1 <- cbind(attraction1, feature)

feature <- rep("nightclub", nrow(nightclub1))
nightclub1 <- cbind(nightclub1, feature)

feature <- rep("social_facility", nrow(social_facility1))
social_facility1 <- cbind(social_facility1, feature)

feature <- rep("bar", nrow(bar1))
bar1 <- cbind(bar1, feature)

all_features <- rbind(nightclub1, attraction1, bar1, social_facility1, hospital1)
head(all_features)
## Simple feature collection with 6 features and 3 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -74.00703 ymin: 40.67994 xmax: -73.8798 ymax: 40.76283
## geographic CRS: WGS 84
##                               name addr.postcode   feature
## 2105118497                 The Lab         11216 nightclub
## 2234739141                   Cielo         10014 nightclub
## 2244704716 The Village Underground         10012 nightclub
## 2553635461                     Sif         11369 nightclub
## 2568669177                 444Club         11237 nightclub
## 2710091424                    Lavo         10022 nightclub
##                              geometry
## 2105118497 POINT (-73.94357 40.67994)
## 2234739141  POINT (-74.00703 40.7398)
## 2244704716 POINT (-74.00082 40.73067)
## 2553635461  POINT (-73.8798 40.76002)
## 2568669177   POINT (-73.926 40.70545)
## 2710091424  POINT (-73.9713 40.76283)
all_features <- all_features[, c(2,3)] 

Merge Built environment features with joined dataset for built environment regression:

built_environment <- joined
st_geometry(built_environment) <- NULL 
st_geometry(all_features) <- NULL  

built_environment_agg <- built_environment[,c("time", "Passenger_count", "Trip_distance", "day", "hour", "summary", "apparentTemperature", "windSpeed",
                                              "ZIPCODE", "POPULATION", "AREA")]
characters <- c("day", "hour", "summary", "apparentTemperature", "windSpeed", "ZIPCODE", "POPULATION", "AREA")
built_environment_agg[, characters]  <- lapply(built_environment_agg[characters], as.character)

built_environment_agg1 <- built_environment_agg %>% group_by(ZIPCODE, time, summary, day, hour, apparentTemperature, POPULATION, AREA)  %>% 
  summarise(total_passengers = sum(Passenger_count), total_trip_distance = sum(Trip_distance))  
names(all_features) <- c("ZIPCODE", "feature")

all_features1 <- all_features %>% group_by(ZIPCODE)  %>% 
    summarise(bar = sum(feature %in% "bar"),
              nightclub = sum(feature %in% "nightclub"),
              attraction = sum(feature %in% "attraction"),
              hospital = sum(feature %in% "hospital"),
              social_facility = sum(feature %in% "social_facility"))
regression <- merge(built_environment_agg1, all_features1, by = "ZIPCODE", all.x = TRUE)
head(regression)
##   ZIPCODE          time       summary       day hour apparentTemperature
## 1   10001 03/19/2016 00 Partly Cloudy  Saturday    0               13.65
## 2   10002 03/02/2016 09 Mostly Cloudy Wednesday    9                6.57
## 3   10002 03/13/2016 00         Clear    Sunday    0               12.76
## 4   10007 03/07/2016 10         Clear    Monday   10                3.18
## 5   10016 03/10/2016 18 Partly Cloudy  Thursday   18               16.52
## 6   10018 03/16/2016 21 Mostly Cloudy Wednesday   21               12.63
##   POPULATION             AREA total_passengers total_trip_distance bar
## 1      22413 17794940.7727089                2                0.64  14
## 2      81305 26280128.5438885                1                6.20  27
## 3      81305 26280128.5438885                1                0.10  27
## 4       7323 5327870.97676123                1                0.18   1
## 5      54606 15042403.0377915                1                3.00   1
## 6       5503 10705796.4791701                1                0.30   7
##   nightclub attraction hospital social_facility
## 1         3          0        0               1
## 2         0          0        0               0
## 3         0          0        0               0
## 4         0          0        0               0
## 5         0          0        1               1
## 6         0          3        0               0
rm(list = c("attraction", "attraction1", "bar", "bar1", "hospital", "hospital1", "nightclub", "nightclub1",
            "social_facility", "social_facility1"))