This dataset contains pedestrian counts for all five boroughs of New York City, as well as for the East River Bridges and the Harlem River Bridges.
For this assignment, we will show the concentration of the pedestrians in only the five boroughs.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(sf)
## Linking to GEOS 3.5.1, GDAL 2.2.2, PROJ 4.9.2
library(mapview)
The dataset shows the count of pedestrian circulation in different areas of the boroughs of New York City during summer (May) and fall (September) from 2007 to 2017.
# Load the raw pedestrian-count table; the relative path is resolved against
# the current working directory.
PedestrianCountNYC_Raw <- readr::read_csv(file = "PedCountNYC.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## Loc = col_double(),
## OBJECTID = col_double(),
## May09_AM = col_double(),
## May09_PM = col_double(),
## May09_MD = col_double(),
## May10_AM = col_double(),
## May10_PM = col_double(),
## May10_MD = col_double(),
## Sept10_AM = col_double(),
## Sept10_PM = col_double(),
## Sept10_MD = col_double(),
## May11_AM = col_double(),
## May11_PM = col_double(),
## May11_MD = col_double(),
## Sept11_AM = col_double(),
## Sept11_PM = col_double(),
## Sept11_MD = col_double(),
## May12_AM = col_double(),
## May12_PM = col_double(),
## May12_MD = col_double()
## # ... with 33 more columns
## )
## See spec(...) for full column specifications.
For this assignment, I want to show which areas in the five boroughs of NYC have the highest concentration of pedestrians during the mornings and afternoons. I also removed the East River Bridges and Harlem River Bridges from the dataset so I can focus on the five boroughs.
Also, several of the count columns in this dataset were read in as character rather than numeric. We must convert them before proceeding to the next step.
# Morning (AM) table: keep the geometry, the borough and every column whose
# name ends in "AM", then drop the rows belonging to the two bridge groups.
# Note: ends_with() ignores case by default, so Street_Nam is selected as well
# (it appears in the class listing printed by sapply()).
NYC_AM <- PedestrianCountNYC_Raw %>%
  select(the_geom, Borough, ends_with("AM")) %>%
  filter(
    Borough != "East River Bridges",
    Borough != "Harlem River Bridges"
  )
# Inspect the column types to see which counts were read as character.
sapply(NYC_AM, class)
## the_geom Borough Street_Nam May07_AM Sept07_AM May08_AM
## "character" "character" "character" "character" "character" "character"
## Sept08_AM May09_AM Sept09_AM May10_AM Sept10_AM May11_AM
## "character" "numeric" "character" "numeric" "numeric" "numeric"
## Sept11_AM May12_AM Sept12_AM May13_AM Sept13_AM May14_AM
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## Sept14_AM May15_AM Sept15_AM May16_AM Sept16_AM May17_AM
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## Sept17_AM
## "numeric"
# Convert the five AM count columns that were read as character into numeric
# columns stored under new `*_AM1` names. Entries that cannot be parsed as
# numbers become NA, which is what produces the coercion warnings.
NYC_AM_1 <- NYC_AM %>%
  mutate(
    May07_AM1 = as.numeric(May07_AM),
    Sept07_AM1 = as.numeric(Sept07_AM),
    May08_AM1 = as.numeric(May08_AM),
    Sept08_AM1 = as.numeric(Sept08_AM),
    Sept09_AM1 = as.numeric(Sept09_AM)
  )
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Drop the original character columns by name instead of by position, so the
# result does not depend on where those columns happen to sit in NYC_AM.
NYC_AM_2 <- NYC_AM_1 %>%
  select(-May07_AM, -Sept07_AM, -May08_AM, -Sept08_AM, -Sept09_AM)
# Confirm that every remaining count column is numeric.
sapply(NYC_AM_2, class)
## the_geom Borough Street_Nam May09_AM May10_AM Sept10_AM
## "character" "character" "character" "numeric" "numeric" "numeric"
## May11_AM Sept11_AM May12_AM Sept12_AM May13_AM Sept13_AM
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## May14_AM Sept14_AM May15_AM Sept15_AM May16_AM Sept16_AM
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## May17_AM Sept17_AM May07_AM1 Sept07_AM1 May08_AM1 Sept08_AM1
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## Sept09_AM1
## "numeric"
# Afternoon (PM) table: keep the geometry, the borough and every column whose
# name ends in "PM", then drop the rows belonging to the two bridge groups.
NYC_PM <- PedestrianCountNYC_Raw %>%
  select(the_geom, Borough, ends_with("PM")) %>%
  filter(
    Borough != "East River Bridges",
    Borough != "Harlem River Bridges"
  )
# Inspect the column types to see which counts were read as character.
sapply(NYC_PM, class)
## the_geom Borough May07_PM Sept07_PM May08_PM Sept08_PM
## "character" "character" "character" "character" "character" "character"
## May09_PM Sept09_PM May10_PM Sept10_PM May11_PM Sept11_PM
## "numeric" "character" "numeric" "numeric" "numeric" "numeric"
## May12_PM Sept12_PM May13_PM Sept13_PM May14_PM Sept14_PM
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## May15_PM Sept15_PM May16_PM Sept16_PM May17_PM Sept17_PM
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# Convert the five PM count columns that were read as character into numeric
# columns stored under new `*_PM1` names. Entries that cannot be parsed as
# numbers become NA, which is what produces the coercion warnings.
NYC_PM_1 <- NYC_PM %>%
  mutate(
    May07_PM1 = as.numeric(May07_PM),
    Sept07_PM1 = as.numeric(Sept07_PM),
    May08_PM1 = as.numeric(May08_PM),
    Sept08_PM1 = as.numeric(Sept08_PM),
    Sept09_PM1 = as.numeric(Sept09_PM)
  )
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Drop the original character columns by name. The PM table has no Street_Nam
# column, so its columns are shifted by one relative to the AM table; dropping
# positions 4, 5, 6, 7 and 9 here would remove the numeric May09_PM and
# May10_PM columns and keep the character May07_PM and Sept09_PM columns.
NYC_PM_2 <- NYC_PM_1 %>%
  select(-May07_PM, -Sept07_PM, -May08_PM, -Sept08_PM, -Sept09_PM)
# Confirm that every remaining count column is numeric.
sapply(NYC_PM_2, class)
## the_geom Borough May07_PM Sept09_PM Sept10_PM May11_PM
## "character" "character" "character" "character" "numeric" "numeric"
## Sept11_PM May12_PM Sept12_PM May13_PM Sept13_PM May14_PM
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## Sept14_PM May15_PM Sept15_PM May16_PM Sept16_PM May17_PM
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## Sept17_PM May07_PM1 Sept07_PM1 May08_PM1 Sept08_PM1 Sept09_PM1
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
For this step, we will find out which Borough gets the most pedestrian circulation during the summers of 2007 and 2017.
# Jittered scatter of the May 2007 AM counts, one point per counted location,
# grouped and coloured by borough.
# labs() only sets label text: a `size` argument there would name the legend
# of the size aesthetic rather than resize any text, so none is passed.
PC_for_May2007 <- ggplot(NYC_AM_2) +
  aes(x = Borough, y = May07_AM1, color = Borough) +
  geom_point(position = "jitter", size = 3, alpha = 3 / 4) +
  labs(
    title = "Pedestrian Count in the 5 Boroughs During Summer Mornings in 2007",
    subtitle = "New York City, May 2007 AM Data",
    y = "Pedestrian Count",
    x = "Borough",
    caption = "Source: NYC Open Data"
  )
PC_for_May2007
## Warning: Removed 1 rows containing missing values (geom_point).
# Interactive version of the same plot.
ggplotly(PC_for_May2007)
# Same jittered scatter for May 2017, using the May17_AM column, which was
# already numeric when read.
# labs() only sets label text: a `size` argument there would name the legend
# of the size aesthetic rather than resize any text, so none is passed.
PC_for_May2017 <- ggplot(NYC_AM_2) +
  aes(x = Borough, y = May17_AM, color = Borough) +
  geom_point(position = "jitter", size = 3, alpha = 3 / 4) +
  labs(
    title = "Pedestrian Count in the 5 Boroughs During Summer Mornings in 2017",
    subtitle = "New York City, May 2017 AM Data",
    y = "Pedestrian Count",
    x = "Borough",
    caption = "Source: NYC Open Data"
  )
PC_for_May2017
# Interactive version of the same plot.
ggplotly(PC_for_May2017)
For this step, we will get the mean of the pedestrian counts of the 5 Boroughs from 2007-2017. I will then plot this out to show the increase or decrease of pedestrian circulation in the Borough through the years.
# Mean May AM pedestrian count per borough, one column per year (2007-2017).
# A single grouped summarise() produces the same Borough-by-year table as
# computing eleven one-column summaries and left-joining them on Borough.
# 2007 and 2008 use the converted `*_AM1` columns because the raw May07_AM and
# May08_AM columns were read as character; missing counts are ignored.
Mean_MAY_AM <- NYC_AM_2 %>%
  group_by(Borough) %>%
  summarise(
    May2007 = mean(May07_AM1, na.rm = TRUE),
    May2008 = mean(May08_AM1, na.rm = TRUE),
    May2009 = mean(May09_AM, na.rm = TRUE),
    May2010 = mean(May10_AM, na.rm = TRUE),
    May2011 = mean(May11_AM, na.rm = TRUE),
    May2012 = mean(May12_AM, na.rm = TRUE),
    May2013 = mean(May13_AM, na.rm = TRUE),
    May2014 = mean(May14_AM, na.rm = TRUE),
    May2015 = mean(May15_AM, na.rm = TRUE),
    May2016 = mean(May16_AM, na.rm = TRUE),
    May2017 = mean(May17_AM, na.rm = TRUE)
  )
# Bar chart of the mean May 2007 AM pedestrian count per borough, read from
# the May2007 column of Mean_MAY_AM.
# The subtitle names only 2007, matching the single year that is plotted.
MayAM_PCMeans <- ggplot(Mean_MAY_AM) +
  aes(x = Borough, y = May2007, fill = Borough) +
  geom_col() +
  labs(
    title = "Average Pedestrian Count in the 5 Boroughs During Summer Mornings in 2007",
    subtitle = "New York City, May 2007 AM Data",
    x = "Boroughs",
    y = "Pedestrian Count",
    caption = "Source: NYC Open Data"
  )
MayAM_PCMeans
# Interactive version of the same chart.
ggplotly(MayAM_PCMeans)
# Bar chart of the mean May 2017 AM pedestrian count per borough, read from
# the May2017 column of Mean_MAY_AM.
# The subtitle names 2017, matching the year that is plotted.
MayAM_PCMeans1 <- ggplot(Mean_MAY_AM) +
  aes(x = Borough, y = May2017, fill = Borough) +
  geom_col() +
  labs(
    title = "Average Pedestrian Count in the 5 Boroughs During Summer Mornings in 2017",
    subtitle = "New York City, May 2017 AM Data",
    x = "Boroughs",
    y = "Pedestrian Count",
    caption = "Source: NYC Open Data"
  )
MayAM_PCMeans1
# Interactive version of the same chart.
ggplotly(MayAM_PCMeans1)
Create a spatial (sf) object from the geometry column and map it to show where the pedestrian count data was gathered.
# Build an sf object from the raw table by parsing the `the_geom` column as
# WKT with CRS 4326. The unfiltered table is used, so the bridge rows are
# included as well.
NYC_PC_sf <- st_as_sf(PedestrianCountNYC_Raw, wkt = "the_geom", crs = 4326)
# Show the count locations on an interactive map.
mapview(NYC_PC_sf)