In this section we will first explore the dataset to gain an understanding of our relevant parameters and to determine trends. Then we will aggreggate our data to zones which in this case we selected to be New York City Post Code. From this, we then call the Open Streem Map API to gain data on the built environment factors that may influence Passenger Demand.
library(glmnet)
library(lme4)
library(readr)
library(Hmisc)
library(osmdata)
library(darksky)
library(ggplot2)
library(tidyverse)
library(rgdal)
library(rgeos)
library(tmap)
library(leaflet)
library(RColorBrewer)
library(sp)
library(spatialEco)
library(mapview)
library(sf)
library(dplyr)
library(tidyr)
library(plm)
library(corrplot)
wd = "~/Uni Temp Work/Big Data"
plsep <- .Platform$file.sep
march <- read.csv(paste(wd, plsep, "march.csv", sep =""))
factors <- c("hour", "day", "RateCodeID", "Payment_type", "summary")
march[, factors] <- lapply(march[factors], as.factor)
days <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
rate_code <- c("")
march <- arrange(transform(march,
day=factor(day,levels=days)),day)
ggplot(march, aes(x = Trip_distance))+
geom_histogram(binwidth = 0.3, aes(fill = ..count..)) +
scale_fill_gradient( name = "Frequency",
low = "orange",
high = "darkred",
na.value = "grey50",
guide = "colourbar") +
coord_cartesian(xlim = c(0,12)) +
ggtitle("Histogram for Trip Distance") +
xlab("Trip Distance (miles)") +
ylab("Frequency of Trip Distance") +
geom_vline(aes(xintercept=mean(Trip_distance)),
color="black", linetype="dashed", size=1) +
theme_bw()
What distribution do you think this approximates to ?
hour <- ggplot(march, aes(x = hour, y = Trip_distance)) +
geom_col(fill = "#6CB2EB") +
theme_bw() +
ggtitle("Trip Distance by time of day") +
xlab("Hour of day") +
ylab(expression('Miles Travelled by Green Taxi')) +
theme(axis.title.x=element_text(size = 12, face = "bold")) +
theme(axis.text.x=element_text(size=10, face = "bold")) +
theme(axis.title.y=element_text(size=12, face = "bold")) +
theme(axis.text.y=element_text(size=10, face = "bold")) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
hour
day <- ggplot(march, aes(x = day, y = Trip_distance)) +
geom_col(aes(fill = day )) +
scale_fill_brewer(palette="Dark2") +
theme_bw() +
ggtitle("Trip Distance by Day of the Week") +
xlab("Day of Week") +
ylab(expression('Miles Travelled by Green Taxi')) +
theme(axis.title.x=element_text(size = 12, face = "bold")) +
theme(axis.text.x=element_text(size=10, face = "bold")) +
theme(axis.title.y=element_text(size=12, face = "bold")) +
theme(axis.text.y=element_text(size=10, face = "bold")) +
theme(legend.position = "none") +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
scale_x_discrete(limits = days)
day
hour_day <- ggplot(march, aes(x = hour, y = Trip_distance)) +
geom_col(fill = "#6CB2EB") +
theme_bw() +
ggtitle("Trip Distance by time of day and day of week") +
xlab("Hour of day") +
ylab(expression('Miles Travelled by Green Taxi')) +
theme(axis.title.x=element_blank()) +
theme(axis.text.x=element_blank()) +
theme(axis.title.y=element_text(size=12, face = "bold")) +
theme(axis.text.y=element_text(size=10, face = "bold")) +
theme(axis.ticks.x = element_blank()) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
facet_wrap(~ day)
hour_day
When is peak taxi demand ? What problems occur from having such variations in demand? For City Planners ? For Taxi Companies ? For Taxi Drivers ? For Air Pollution ?
ggplot(march, aes(x = day, y = Total_amount)) +
geom_boxplot(fill = "#6CB2EB", color = "#22292F",
alpha = .8) +
labs(
title = "Total taxi fare amount and day of week",
subtitle = str_wrap(paste("Hmm not very interesting! Quite a lot of outliers (Remember it is lognormally distributed!!)"),
width = 80)) +
theme(
panel.background = element_rect(color = "steelblue"),
axis.text = element_text(size = rel(1.2))
)
fare_graph_hour <- march %>%
group_by(hour) %>%
ggplot(aes(x = hour, y = Fare_amount)) +
stat_summary(fun.data = "mean_cl_normal",
fun.args = list(
conf.int = .99
)) +
facet_wrap(~ day) +
geom_hline(aes(yintercept=mean(march$Fare_amount)),
color="red", linetype="dashed", size=1) +
theme(axis.title.x=element_blank()) +
theme(axis.text.x=element_blank()) +
theme(axis.ticks.x = element_blank()) +
ylab(expression('Fare Amount $'))
fare_graph_hour
tip_graph_hour <- march %>%
group_by(hour) %>%
ggplot(aes(x = hour, y = Tip_amount)) +
stat_summary(fun.data = "mean_cl_normal",
fun.args = list(
conf.int = .99
)) +
facet_wrap(~ day) +
geom_hline(aes(yintercept=mean(march$Tip_amount)),
color="red", linetype="dashed", size=1) +
theme(axis.title.x=element_blank()) +
theme(axis.text.x=element_blank()) +
theme(axis.ticks.x = element_blank()) +
ylab(expression('Tip Amount $'))
tip_graph_hour
Why does Taxi fare spike in the morning? hen would you prefer to be a taxi driver? ### Capacity:
capacity_graph <- march %>%
group_by(hour) %>%
ggplot(aes(x = hour, y = Passenger_count)) +
stat_summary(fun.data = "mean_cl_normal",
fun.args = list(
conf.int = .99
)) +
geom_hline(aes(yintercept=mean(march$Passenger_count)),
color="red", linetype="dashed", size=1) +
ylab(expression('Passenger Count')) +
xlab("Time of Day") +
labs(
title = "Passenger Count and Time of day",
subtitle = str_wrap(paste("Lower amount of passengers per vehicle at times of day when Congestion is typically higher"),
width = 80))
capacity_graph
What problems are presented from this graph? How would you attempt to increase ridership capacity? ### Weather:
ggplot(march, aes(x = apparentTemperature))+
geom_histogram(binwidth = 1, aes(fill = ..count..)) +
scale_fill_gradient( name = "Frequency",
low = "orange",
high = "darkred",
na.value = "grey50",
guide = "colourbar") +
coord_cartesian(xlim = c(-10,28)) +
ggtitle("Histogram for Apparent Temperature") +
xlab("Apparent Temperature (C)") +
ylab("Frequency of Temperature") +
geom_vline(aes(xintercept=mean(apparentTemperature)),
color="black", linetype="dashed", size=1) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
theme_bw()
What do you notice about the temperature variation? How do you think this would impact taxi demand?
Now that we have analysed our data, we need to assign each data point a zone in order to complete our regression analysis. Here we will do so by dividing our data into New York City Postcodes. We get this data from publically available New York City Data. Additionally, this study was able to find demographic data on a post code level of granularity. This would enable us to perform more interesting regression analyse and analyse the impact that demographics may have on taxi demand.
output_areas <- readOGR(dsn = wd, layer = "ZIP_CODE_040114")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\44792\Documents\Uni Temp Work\Big Data", layer: "ZIP_CODE_040114"
## with 263 features
## It has 12 fields
demographic <- read_csv(paste(wd, plsep, "Demographic_Statistics_By_Zip_Code.csv", sep =""))
summary(output_areas@data)
## ZIPCODE BLDGZIP PO_NAME POPULATION
## Length:263 Length:263 Length:263 Min. : 0.0
## Class :character Class :character Class :character 1st Qu.: 49.5
## Mode :character Mode :character Mode :character Median : 27985.0
## Mean : 31933.9
## 3rd Qu.: 54445.0
## Max. :109069.0
## AREA STATE COUNTY ST_FIPS
## Min. : 3155 Length:263 Length:263 Length:263
## 1st Qu.: 964323 Class :character Class :character Class :character
## Median : 21927545 Mode :character Mode :character Mode :character
## Mean : 31816554
## 3rd Qu.: 45935567
## Max. :473985727
## CTY_FIPS URL SHAPE_AREA SHAPE_LEN
## Length:263 Length:263 Min. :0 Min. :0
## Class :character Class :character 1st Qu.:0 1st Qu.:0
## Mode :character Mode :character Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
summary(demographic) ### Clear problems with the data, how do you evaluate bad data:
## JURISDICTION NAME COUNT PARTICIPANTS COUNT FEMALE PERCENT FEMALE
## Min. :10001 Min. : 0.00 Min. : 0.0 Min. :0.000
## 1st Qu.:10452 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:0.000
## Median :11216 Median : 0.00 Median : 0.0 Median :0.000
## Mean :11127 Mean : 17.66 Mean : 10.3 Mean :0.244
## 3rd Qu.:11422 3rd Qu.: 13.00 3rd Qu.: 6.0 3rd Qu.:0.515
## Max. :20459 Max. :272.00 Max. :194.0 Max. :1.000
## COUNT MALE PERCENT MALE COUNT GENDER UNKNOWN PERCENT GENDER UNKNOWN
## Min. : 0.000 Min. :0.000 Min. :0 Min. :0
## 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.:0 1st Qu.:0
## Median : 0.000 Median :0.000 Median :0 Median :0
## Mean : 7.364 Mean :0.201 Mean :0 Mean :0
## 3rd Qu.: 4.000 3rd Qu.:0.400 3rd Qu.:0 3rd Qu.:0
## Max. :157.000 Max. :1.000 Max. :0 Max. :0
## COUNT GENDER TOTAL PERCENT GENDER TOTAL COUNT PACIFIC ISLANDER
## Min. : 0.00 Min. : 0.00 Min. :0.00000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:0.00000
## Median : 0.00 Median : 0.00 Median :0.00000
## Mean : 17.66 Mean : 44.49 Mean :0.02542
## 3rd Qu.: 13.00 3rd Qu.:100.00 3rd Qu.:0.00000
## Max. :272.00 Max. :100.00 Max. :2.00000
## PERCENT PACIFIC ISLANDER COUNT HISPANIC LATINO PERCENT HISPANIC LATINO
## Min. :0.0000000 Min. : 0.000 Min. :0.00000
## 1st Qu.:0.0000000 1st Qu.: 0.000 1st Qu.:0.00000
## Median :0.0000000 Median : 0.000 Median :0.00000
## Mean :0.0002966 Mean : 1.856 Mean :0.07983
## 3rd Qu.:0.0000000 3rd Qu.: 0.000 3rd Qu.:0.00000
## Max. :0.0200000 Max. :51.000 Max. :1.00000
## COUNT AMERICAN INDIAN PERCENT AMERICAN INDIAN COUNT ASIAN NON HISPANIC
## Min. :0.00000 Min. :0.000000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.: 0.0000
## Median :0.00000 Median :0.000000 Median : 0.0000
## Mean :0.02119 Mean :0.001017 Mean : 0.5254
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.: 0.0000
## Max. :2.00000 Max. :0.200000 Max. :28.0000
## PERCENT ASIAN NON HISPANIC COUNT WHITE NON HISPANIC PERCENT WHITE NON HISPANIC
## Min. :0.00000 Min. : 0.00 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 0.00 1st Qu.:0.0000
## Median :0.00000 Median : 0.00 Median :0.0000
## Mean :0.05657 Mean : 12.19 Mean :0.1778
## 3rd Qu.:0.00000 3rd Qu.: 1.00 3rd Qu.:0.0225
## Max. :1.00000 Max. :262.00 Max. :1.0000
## COUNT BLACK NON HISPANIC PERCENT BLACK NON HISPANIC COUNT OTHER ETHNICITY
## Min. : 0.000 Min. :0.0000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 0.0000
## Median : 0.000 Median :0.0000 Median : 0.0000
## Mean : 2.233 Mean :0.1042 Mean : 0.6864
## 3rd Qu.: 0.000 3rd Qu.:0.0000 3rd Qu.: 0.0000
## Max. :60.000 Max. :1.0000 Max. :17.0000
## PERCENT OTHER ETHNICITY COUNT ETHNICITY UNKNOWN PERCENT ETHNICITY UNKNOWN
## Min. :0.00000 Min. :0.0000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000000
## Median :0.00000 Median :0.0000 Median :0.000000
## Mean :0.02131 Mean :0.1229 Mean :0.003941
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :0.50000 Max. :5.0000 Max. :0.250000
## COUNT ETHNICITY TOTAL PERCENT ETHNICITY TOTAL COUNT PERMANENT RESIDENT ALIEN
## Min. : 0.00 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 0.00 Median : 0.00 Median : 0.0000
## Mean : 17.66 Mean : 44.45 Mean : 0.4449
## 3rd Qu.: 13.00 3rd Qu.:100.00 3rd Qu.: 0.0000
## Max. :272.00 Max. :100.00 Max. :10.0000
## PERCENT PERMANENT RESIDENT ALIEN COUNT US CITIZEN PERCENT US CITIZEN
## Min. :0.00000 Min. : 0.00 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 0.00 1st Qu.:0.0000
## Median :0.00000 Median : 0.00 Median :0.0000
## Mean :0.02419 Mean : 17.17 Mean :0.4187
## 3rd Qu.:0.00000 3rd Qu.: 12.00 3rd Qu.:0.9900
## Max. :1.00000 Max. :271.00 Max. :1.0000
## COUNT OTHER CITIZEN STATUS PERCENT OTHER CITIZEN STATUS
## Min. :0.00000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.00000 Median :0.000000
## Mean :0.04661 Mean :0.002076
## 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :2.00000 Max. :0.330000
## COUNT CITIZEN STATUS UNKNOWN PERCENT CITIZEN STATUS UNKNOWN
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
## COUNT CITIZEN STATUS TOTAL PERCENT CITIZEN STATUS TOTAL
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00
## Mean : 17.66 Mean : 44.49
## 3rd Qu.: 13.00 3rd Qu.:100.00
## Max. :272.00 Max. :100.00
## COUNT RECEIVES PUBLIC ASSISTANCE PERCENT RECEIVES PUBLIC ASSISTANCE
## Min. : 0.000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.:0.0000
## Median : 0.000 Median :0.0000
## Mean : 5.975 Mean :0.1392
## 3rd Qu.: 3.250 3rd Qu.:0.2525
## Max. :155.000 Max. :1.0000
## COUNT NRECEIVES PUBLIC ASSISTANCE PERCENT NRECEIVES PUBLIC ASSISTANCE
## Min. : 0.00 Min. :0.0000
## 1st Qu.: 0.00 1st Qu.:0.0000
## Median : 0.00 Median :0.0000
## Mean : 11.69 Mean :0.3058
## 3rd Qu.: 8.00 3rd Qu.:0.6625
## Max. :206.00 Max. :1.0000
## COUNT PUBLIC ASSISTANCE UNKNOWN PERCENT PUBLIC ASSISTANCE UNKNOWN
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
## COUNT PUBLIC ASSISTANCE TOTAL PERCENT PUBLIC ASSISTANCE TOTAL
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00
## Mean : 17.66 Mean : 44.49
## 3rd Qu.: 13.00 3rd Qu.:100.00
## Max. :272.00 Max. :100.00
output_areas@data <- output_areas@data[, c(1,3,4,5,7)]
demographic <- demographic[, c(1,4,6,14,18,20,22)]
output_areas@data$ZIPCODE <- as.numeric(output_areas@data$ZIPCODE)
Unfortunately the data on postcode from the government website is inadequate for further investigation. There is a lack of data points for each post-code which yield findings that are inconsistent with reality.
#Join data to the shapefile:
summary(output_areas@data)
## ZIPCODE PO_NAME POPULATION AREA
## Min. : 83 Length:263 Min. : 0.0 Min. : 3155
## 1st Qu.:10114 Class :character 1st Qu.: 49.5 1st Qu.: 964323
## Median :10459 Mode :character Median : 27985.0 Median : 21927545
## Mean :10629 Mean : 31933.9 Mean : 31816554
## 3rd Qu.:11234 3rd Qu.: 54445.0 3rd Qu.: 45935567
## Max. :11697 Max. :109069.0 Max. :473985727
## COUNTY
## Length:263
## Class :character
## Mode :character
##
##
##
row.names(output_areas@data) <- NULL
output_areas@data$POP_DENSITY <- output_areas@data$POPULATION / output_areas@data$AREA
output_areas@data$PO_NAME <- as.factor(output_areas@data$PO_NAME)
output_areas@data$COUNTY <- as.factor(output_areas@data$COUNTY)
output_areas@data$log_POP_DENSITY <- log(output_areas@data$POPULATION / output_areas@data$AREA)
#output_areas@data <- output_areas@data[!duplicated(output_areas@data$ZIPCODE), ]
tm_shape(output_areas) + tm_fill("POPULATION", style = "pretty", palette = "-RdPu", title = "Population") +
tm_borders(alpha = .4) +
tm_compass() +
tm_layout(title = "New York Post Codes by \nPopulation")
tm_shape(output_areas) + tm_fill("log_POP_DENSITY", style = "pretty", palette = "-RdPu", title = "Log Pop Density") +
tm_borders(alpha = .4) +
tm_compass() +
tm_layout(title = "New York Post Codes by \nPopulation Density")
data_50000 <- march[1:50000,]
locations_sf <- data_50000 %>%
st_as_sf(coords = c("Pickup_longitude", "Pickup_latitude")) %>%
st_set_crs(4269) %>%
st_transform(2263)
postcodes_sf <- st_as_sf(output_areas, crs = "NAD83")
march_gis <- march %>%
st_as_sf(coords = c("Pickup_longitude", "Pickup_latitude")) %>%
st_set_crs(4269) %>%
st_transform(2263)
st_crs(march_gis) == st_crs(postcodes_sf)
## [1] TRUE
joined <- st_join(march_gis, postcodes_sf, join = st_within)
head(joined)
## Simple feature collection with 6 features and 36 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 997080.2 ymin: 189213.6 xmax: 1014775 ymax: 257568.8
## projected CRS: NAD83 / New York Long Island (ftUS)
## time ï..VendorID Store_and_fwd_flag RateCodeID Dropoff_longitude
## 1 03/07/2016 00 2 N 1 -73.94080
## 2 03/07/2016 00 2 N 1 -73.88988
## 3 03/07/2016 00 2 N 1 -73.94886
## 4 03/07/2016 00 2 N 1 -73.93330
## 5 03/07/2016 00 2 N 1 -73.88295
## 6 03/07/2016 00 2 N 1 -73.95479
## Dropoff_latitude Passenger_count Trip_distance Fare_amount Extra MTA_tax
## 1 40.80592 6 1.69 7.5 0.5 0.5
## 2 40.74689 1 0.02 2.5 0.5 0.5
## 3 40.82837 5 0.52 4.0 0.5 0.5
## 4 40.68824 1 0.63 4.5 0.5 0.5
## 5 40.86977 1 0.68 4.0 0.5 0.5
## 6 40.73359 1 1.07 5.0 0.5 0.5
## Tip_amount Tolls_amount improvement_surcharge Total_amount Payment_type
## 1 0 0 0.3 8.8 2
## 2 0 0 0.3 3.8 2
## 3 1 0 0.3 6.3 1
## 4 0 0 0.3 5.8 2
## 5 0 0 0.3 5.3 1
## 6 0 0 0.3 6.3 2
## Trip_type Pickup_Datetime Dropoff_Datetime day hour month Duration
## 1 1 2016-03-07 00:37:47 2016-03-07 00:44:17 Monday 0 3 6.5000000
## 2 1 2016-03-07 00:01:22 2016-03-07 00:01:44 Monday 0 3 0.3666667
## 3 1 2016-03-07 00:00:11 2016-03-07 00:02:11 Monday 0 3 2.0000000
## 4 1 2016-03-07 00:00:06 2016-03-07 00:03:02 Monday 0 3 2.9333333
## 5 1 2016-03-07 00:01:25 2016-03-07 00:03:33 Monday 0 3 2.1333333
## 6 1 2016-03-07 00:00:14 2016-03-07 00:03:24 Monday 0 3 3.1666667
## Speed summary precipProbability apparentTemperature humidity windSpeed
## 1 15.600000 Overcast 0.01 4.47 0.45 1.25
## 2 3.272727 Overcast 0.01 4.47 0.45 1.25
## 3 15.600000 Overcast 0.01 4.47 0.45 1.25
## 4 12.886364 Overcast 0.01 4.47 0.45 1.25
## 5 19.125000 Overcast 0.01 4.47 0.45 1.25
## 6 20.273684 Overcast 0.01 4.47 0.45 1.25
## ZIPCODE PO_NAME POPULATION AREA COUNTY POP_DENSITY
## 1 10029 New York 77503 22963436 New York 0.0033750612
## 2 11373 Elmhurst 101282 42654860 Queens 0.0023744539
## 3 10031 New York 57010 16902152 New York 0.0033729432
## 4 11216 Brooklyn 53862 26478229 Kings 0.0020341995
## 5 10468 Bronx 72877 34447605 Bronx 0.0021155898
## 6 11101 Long Island City 26254 78962089 Queens 0.0003324887
## log_POP_DENSITY geometry
## 1 -5.691342 POINT (998453.4 226735.6)
## 2 -6.042988 POINT (1014508 211403.9)
## 3 -5.691970 POINT (997080.2 238712.4)
## 4 -6.197653 POINT (998858.4 189213.6)
## 5 -6.158422 POINT (1014775 257568.8)
## 6 -8.008905 POINT (999280.8 211017.2)
Next we will gather built environment factors to proceeed with our regression analysis. This tutorial selected nightclubs, hospitals, bars, social facilities and tourist attractions as locations that may be of interest. However this tutorial wants you to go out there and analyse other built environment factors that you feel are relevant. See https://wiki.openstreetmap.org/wiki/Map_features
bb <- getbb('New York City, New York, USA', format_out = 'polygon')
nightclub <- opq(bbox = bb) %>%
add_osm_feature(key = 'amenity', value = 'nightclub') %>%
osmdata_sf() %>%
trim_osmdata(bb)
hospital <- opq(bbox = bb) %>%
add_osm_feature(key = 'amenity', value = 'hospital') %>%
osmdata_sf() %>%
trim_osmdata(bb)
bar <- opq(bbox = bb) %>%
add_osm_feature(key = 'amenity', value = 'bar') %>%
osmdata_sf() %>%
trim_osmdata(bb)
social_facility <- opq(bbox = bb) %>%
add_osm_feature(key = 'amenity', value = 'social_facility') %>%
osmdata_sf() %>%
trim_osmdata(bb)
attraction <- opq(bbox = bb) %>%
add_osm_feature(key = 'tourism', value = 'attraction') %>%
osmdata_sf() %>%
trim_osmdata(bb)
Clean Open Street Map Data and put into dataframe.
variables <- c("name","addr.postcode")
features <- c("nightclub", "hospital", "bar", "attraction", "social_facility")
result <- lapply(mget(features), function(x)
x$osm_points[!is.na(x$osm_points$addr.postcode), variables])
nightclub1 <- nightclub$osm_points[!is.na(nightclub$osm_points$addr.postcode), variables]
attraction1 <- attraction$osm_points[!is.na(attraction$osm_points$addr.postcode), variables]
bar1 <- bar$osm_points[!is.na(bar$osm_points$addr.postcode), variables]
social_facility1 <- social_facility$osm_points[!is.na(social_facility$osm_points$addr.postcode), variables]
hospital1 <- hospital$osm_points[!is.na(hospital$osm_points$addr.postcode), variables]
feature <- rep("hospital", nrow(hospital1))
hospital1 <- cbind(hospital1, feature)
feature <- rep("attraction", nrow(attraction1))
attraction1 <- cbind(attraction1, feature)
feature <- rep("nightclub", nrow(nightclub1))
nightclub1 <- cbind(nightclub1, feature)
feature <- rep("social_facility", nrow(social_facility1))
social_facility1 <- cbind(social_facility1, feature)
feature <- rep("bar", nrow(bar1))
bar1 <- cbind(bar1, feature)
all_features <- rbind(nightclub1, attraction1, bar1, social_facility1, hospital1)
head(all_features)
## Simple feature collection with 6 features and 3 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: -74.00703 ymin: 40.67994 xmax: -73.8798 ymax: 40.76283
## geographic CRS: WGS 84
## name addr.postcode feature
## 2105118497 The Lab 11216 nightclub
## 2234739141 Cielo 10014 nightclub
## 2244704716 The Village Underground 10012 nightclub
## 2553635461 Sif 11369 nightclub
## 2568669177 444Club 11237 nightclub
## 2710091424 Lavo 10022 nightclub
## geometry
## 2105118497 POINT (-73.94357 40.67994)
## 2234739141 POINT (-74.00703 40.7398)
## 2244704716 POINT (-74.00082 40.73067)
## 2553635461 POINT (-73.8798 40.76002)
## 2568669177 POINT (-73.926 40.70545)
## 2710091424 POINT (-73.9713 40.76283)
all_features <- all_features[, c(2,3)]
built_environment <- joined
st_geometry(built_environment) <- NULL
st_geometry(all_features) <- NULL
built_environment_agg <- built_environment[,c("time", "Passenger_count", "Trip_distance", "day", "hour", "summary", "apparentTemperature", "windSpeed",
"ZIPCODE", "POPULATION", "AREA")]
characters <- c("day", "hour", "summary", "apparentTemperature", "windSpeed", "ZIPCODE", "POPULATION", "AREA")
built_environment_agg[, characters] <- lapply(built_environment_agg[characters], as.character)
built_environment_agg1 <- built_environment_agg %>% group_by(ZIPCODE, time, summary, day, hour, apparentTemperature, POPULATION, AREA) %>%
summarise(total_passengers = sum(Passenger_count), total_trip_distance = sum(Trip_distance))
names(all_features) <- c("ZIPCODE", "feature")
all_features1 <- all_features %>% group_by(ZIPCODE) %>%
summarise(bar = sum(feature %in% "bar"),
nightclub = sum(feature %in% "nightclub"),
attraction = sum(feature %in% "attraction"),
hospital = sum(feature %in% "hospital"),
social_facility = sum(feature %in% "social_facility"))
regression <- merge(built_environment_agg1, all_features1, by = "ZIPCODE", all.x = TRUE)
head(regression)
## ZIPCODE time summary day hour apparentTemperature
## 1 10001 03/19/2016 00 Partly Cloudy Saturday 0 13.65
## 2 10002 03/02/2016 09 Mostly Cloudy Wednesday 9 6.57
## 3 10002 03/13/2016 00 Clear Sunday 0 12.76
## 4 10007 03/07/2016 10 Clear Monday 10 3.18
## 5 10016 03/10/2016 18 Partly Cloudy Thursday 18 16.52
## 6 10018 03/16/2016 21 Mostly Cloudy Wednesday 21 12.63
## POPULATION AREA total_passengers total_trip_distance bar
## 1 22413 17794940.7727089 2 0.64 14
## 2 81305 26280128.5438885 1 6.20 27
## 3 81305 26280128.5438885 1 0.10 27
## 4 7323 5327870.97676123 1 0.18 1
## 5 54606 15042403.0377915 1 3.00 1
## 6 5503 10705796.4791701 1 0.30 7
## nightclub attraction hospital social_facility
## 1 3 0 0 1
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 1 1
## 6 0 3 0 0
rm(list = c("attraction", "attraction1", "bar", "bar1", "hospital", "hospital1", "nightclub", "nightclub1",
"social_facility", "social_facility1"))