Pedestrian Counts in NYC

This dataset contains pedestrian counts taken across the five boroughs of New York City, as well as on the East River Bridges and the Harlem River Bridges.

For this assignment, we will show the concentration of the pedestrians in only the five boroughs.

First, import all the libraries we will need to analyze our data and run them.

  1. tidyverse
  2. dplyr
  3. ggplot2
  4. janitor
  5. scales
  6. plotly
  7. sf
  8. mapview
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(sf)
## Linking to GEOS 3.5.1, GDAL 2.2.2, PROJ 4.9.2
library(mapview)

Second, import the dataset

The dataset shows the count of pedestrian circulation in different areas in the boroughs of New York City during summer (May) and fall (September) from 2007 to 2017.

# Load the raw pedestrian-count table from the working directory; readr
# guesses the column types, which are reported in the message that follows.
PedestrianCountNYC_Raw <- read_csv(file = "PedCountNYC.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   Loc = col_double(),
##   OBJECTID = col_double(),
##   May09_AM = col_double(),
##   May09_PM = col_double(),
##   May09_MD = col_double(),
##   May10_AM = col_double(),
##   May10_PM = col_double(),
##   May10_MD = col_double(),
##   Sept10_AM = col_double(),
##   Sept10_PM = col_double(),
##   Sept10_MD = col_double(),
##   May11_AM = col_double(),
##   May11_PM = col_double(),
##   May11_MD = col_double(),
##   Sept11_AM = col_double(),
##   Sept11_PM = col_double(),
##   Sept11_MD = col_double(),
##   May12_AM = col_double(),
##   May12_PM = col_double(),
##   May12_MD = col_double()
##   # ... with 33 more columns
## )
## See spec(...) for full column specifications.

Clean up dataset. Separate datasets into time of day.

For this assignment, I want to show which areas in the 5 Boroughs of NYC have the highest concentration of pedestrians during the mornings and afternoons. I also removed the East River Bridges and Harlem River Bridges from the dataset so I can focus on the 5 Boroughs.

Also, some of the columns that hold numbers in this dataset were not read in as numeric columns. We must address this in order to proceed to our next step.

# Morning (AM) counts: keep the geometry, the borough name and every column
# whose name ends in "AM", then drop the rows for the two bridge groupings.
# NOTE(review): ends_with() ignores case by default, so Street_Nam is selected
# as well (it shows up in the class listing below) -- confirm this is intended.
NYC_AM <- filter(
  select(PedestrianCountNYC_Raw, the_geom, Borough, ends_with("AM")),
  Borough != "East River Bridges",
  Borough != "Harlem River Bridges"
)

# List each column's class to spot count columns that were read as text.
sapply(NYC_AM, class)
##    the_geom     Borough  Street_Nam    May07_AM   Sept07_AM    May08_AM 
## "character" "character" "character" "character" "character" "character" 
##   Sept08_AM    May09_AM   Sept09_AM    May10_AM   Sept10_AM    May11_AM 
## "character"   "numeric" "character"   "numeric"   "numeric"   "numeric" 
##   Sept11_AM    May12_AM   Sept12_AM    May13_AM   Sept13_AM    May14_AM 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##   Sept14_AM    May15_AM   Sept15_AM    May16_AM   Sept16_AM    May17_AM 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##   Sept17_AM 
##   "numeric"
# Add numeric copies (suffix "1") of the AM count columns that were read as
# text. Values that cannot be parsed as numbers become NA, which is what the
# coercion warnings report.
NYC_AM_1 <- mutate(
  NYC_AM,
  May07_AM1 = as.numeric(May07_AM),
  Sept07_AM1 = as.numeric(Sept07_AM),
  May08_AM1 = as.numeric(May08_AM),
  Sept08_AM1 = as.numeric(Sept08_AM),
  Sept09_AM1 = as.numeric(Sept09_AM)
)
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
# Drop the original text versions of the converted AM columns, keeping their
# numeric "...1" copies. The columns are removed by name rather than by
# position so this step does not depend on the column order produced upstream.
NYC_AM_2 <- NYC_AM_1 %>%
  select(-May07_AM, -Sept07_AM, -May08_AM, -Sept08_AM, -Sept09_AM)

# Confirm that every remaining count column is numeric.
sapply(NYC_AM_2, class)
##    the_geom     Borough  Street_Nam    May09_AM    May10_AM   Sept10_AM 
## "character" "character" "character"   "numeric"   "numeric"   "numeric" 
##    May11_AM   Sept11_AM    May12_AM   Sept12_AM    May13_AM   Sept13_AM 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##    May14_AM   Sept14_AM    May15_AM   Sept15_AM    May16_AM   Sept16_AM 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##    May17_AM   Sept17_AM   May07_AM1  Sept07_AM1   May08_AM1  Sept08_AM1 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##  Sept09_AM1 
##   "numeric"
# Afternoon (PM) counts: keep the geometry, the borough name and every column
# whose name ends in "PM", then drop the rows for the two bridge groupings.
NYC_PM <- filter(
  select(PedestrianCountNYC_Raw, the_geom, Borough, ends_with("PM")),
  Borough != "East River Bridges",
  Borough != "Harlem River Bridges"
)

# List each column's class to spot count columns that were read as text.
sapply(NYC_PM, class)
##    the_geom     Borough    May07_PM   Sept07_PM    May08_PM   Sept08_PM 
## "character" "character" "character" "character" "character" "character" 
##    May09_PM   Sept09_PM    May10_PM   Sept10_PM    May11_PM   Sept11_PM 
##   "numeric" "character"   "numeric"   "numeric"   "numeric"   "numeric" 
##    May12_PM   Sept12_PM    May13_PM   Sept13_PM    May14_PM   Sept14_PM 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##    May15_PM   Sept15_PM    May16_PM   Sept16_PM    May17_PM   Sept17_PM 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"
# Add numeric copies (suffix "1") of the PM count columns that were read as
# text. Values that cannot be parsed as numbers become NA, which is what the
# coercion warnings report.
NYC_PM_1 <- mutate(
  NYC_PM,
  May07_PM1 = as.numeric(May07_PM),
  Sept07_PM1 = as.numeric(Sept07_PM),
  May08_PM1 = as.numeric(May08_PM),
  Sept08_PM1 = as.numeric(Sept08_PM),
  Sept09_PM1 = as.numeric(Sept09_PM)
)
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
# Drop the original text versions of the converted PM columns, keeping their
# numeric "...1" copies.
# The columns are removed by name: NYC_PM has no Street_Nam column, so the
# positional indices used for the AM table are shifted by one here and would
# discard the numeric May09_PM and May10_PM while keeping the text columns
# May07_PM and Sept09_PM.
NYC_PM_2 <- NYC_PM_1 %>%
  select(-May07_PM, -Sept07_PM, -May08_PM, -Sept08_PM, -Sept09_PM)

# Confirm that every remaining count column is numeric.
sapply(NYC_PM_2, class)
##    the_geom     Borough    May07_PM   Sept09_PM   Sept10_PM    May11_PM 
## "character" "character" "character" "character"   "numeric"   "numeric" 
##   Sept11_PM    May12_PM   Sept12_PM    May13_PM   Sept13_PM    May14_PM 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##   Sept14_PM    May15_PM   Sept15_PM    May16_PM   Sept16_PM    May17_PM 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##   Sept17_PM   May07_PM1  Sept07_PM1   May08_PM1  Sept08_PM1  Sept09_PM1 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"

Analyze the data and plot the 1st plot!

For this step, we will find out which Borough gets the most pedestrian circulation during the summers of 2007 and 2017.

# Jittered scatter of the May 2007 morning counts, one cluster of points per
# borough. May07_AM1 is the numeric copy of the 2007 column created earlier.
PC_for_May2007 <- ggplot(NYC_AM_2) +
  aes(x = Borough, y = May07_AM1, color = Borough) +
  geom_point(position = "jitter", size = 3, alpha = 3 / 4) +
  # labs() only sets label text. The stray `size = ...` arguments were removed:
  # they would name a size legend rather than resize any text. The subtitle
  # now matches the single month that is actually plotted.
  labs(
    title = "Pedestrian Count in the 5 Boroughs During Summer Mornings in 2007",
    subtitle = "New York City, May 2007 AM Data",
    y = "Pedestrian Count",
    x = "Borough",
    caption = "Source: NYC Open Data"
  )

PC_for_May2007
## Warning: Removed 1 rows containing missing values (geom_point).

#YAY
# Interactive (plotly) version of the same plot.
ggplotly(PC_for_May2007)
# do another for May 2017

# Jittered scatter of the May 2017 morning counts, one cluster of points per
# borough. May17_AM was already read as numeric, so no converted copy is used.
PC_for_May2017 <- ggplot(NYC_AM_2) +
  aes(x = Borough, y = May17_AM, color = Borough) +
  geom_point(position = "jitter", size = 3, alpha = 3 / 4) +
  # labs() only sets label text. The stray `size = ...` arguments were removed:
  # they would name a size legend rather than resize any text.
  labs(
    title = "Pedestrian Count in the 5 Boroughs During Summer Mornings in 2017",
    subtitle = "New York City, May 2017 AM Data",
    y = "Pedestrian Count",
    x = "Borough",
    caption = "Source: NYC Open Data"
  )

PC_for_May2017

# Interactive (plotly) version of the same plot.
ggplotly(PC_for_May2017)

Analyze again and plot the 2nd plot!

For this step, we will get the mean of the pedestrian counts of the 5 Boroughs from 2007-2017. I will then plot this out to show the increase or decrease of pedestrian circulation in the Borough through the years.

# Mean May morning pedestrian count per borough for each year 2007-2017.
# All eleven means are computed in a single grouped summary, giving one row
# per borough and one column per year (May2007 ... May2017). This replaces
# eleven separate summaries that were stitched back together with left joins
# on Borough. Missing counts are ignored (na.rm = TRUE).
# May07_AM1 and May08_AM1 are the numeric copies of columns originally read
# as text; the remaining years were read as numeric directly.
Mean_MAY_AM <- NYC_AM_2 %>%
  group_by(Borough) %>%
  summarise(
    May2007 = mean(May07_AM1, na.rm = TRUE),
    May2008 = mean(May08_AM1, na.rm = TRUE),
    May2009 = mean(May09_AM, na.rm = TRUE),
    May2010 = mean(May10_AM, na.rm = TRUE),
    May2011 = mean(May11_AM, na.rm = TRUE),
    May2012 = mean(May12_AM, na.rm = TRUE),
    May2013 = mean(May13_AM, na.rm = TRUE),
    May2014 = mean(May14_AM, na.rm = TRUE),
    May2015 = mean(May15_AM, na.rm = TRUE),
    May2016 = mean(May16_AM, na.rm = TRUE),
    May2017 = mean(May17_AM, na.rm = TRUE)
  ) %>%
  ungroup()


# PLOT Mean_MAY_AM
# May 2007 AM Mean Pedestrian Count

# Bar chart of the mean May 2007 morning count per borough.
MayAM_PCMeans <- ggplot(Mean_MAY_AM) +
  aes(x = Borough, y = May2007, fill = Borough) +
  geom_col() +
  labs(
    title = "Average Pedestrian Count in the 5 Boroughs During Summer Mornings in 2007",
    # Only the May2007 column is plotted, so the subtitle names that month
    # instead of the 2007-2010 range.
    subtitle = "New York City, May 2007 AM Data",
    x = "Boroughs",
    y = "Pedestrian Count",
    caption = "Source: NYC Open Data"
  )

MayAM_PCMeans

# Interactive (plotly) version of the same plot.
ggplotly(MayAM_PCMeans)
# May 2017 AM Mean Pedestrian Count

# Bar chart of the mean May 2017 morning count per borough.
MayAM_PCMeans1 <- ggplot(Mean_MAY_AM) +
  aes(x = Borough, y = May2017, fill = Borough) +
  geom_col() +
  labs(
    title = "Average Pedestrian Count in the 5 Boroughs During Summer Mornings in 2017",
    # The plotted column is May2017; the subtitle previously still read
    # "May 2007-2010", carried over from the 2007 plot.
    subtitle = "New York City, May 2017 AM Data",
    x = "Boroughs",
    y = "Pedestrian Count",
    caption = "Source: NYC Open Data"
  )

MayAM_PCMeans1

# Interactive (plotly) version of the same plot.
ggplotly(MayAM_PCMeans1)

Find out where the data came from!

Make a spatial (sf) object that will show where the pedestrian circulation count data was gathered.

# Turn the raw table into an sf object by parsing the WKT text stored in
# the_geom, declaring the coordinates as EPSG:4326. The unfiltered table is
# used, so the bridge locations are included as well.
NYC_PC_sf <- st_as_sf(PedestrianCountNYC_Raw, wkt = "the_geom", crs = 4326)

# Display the count locations with mapview.
mapview(NYC_PC_sf)