# --------------------------Importing all the necessary packages
library(ggplot2) # for graphs
library(tidyverse) # to prepare the data
library(dplyr) # to prepare the data
library(leaflet) # for interactive maps
library(leaflet.extras) # for interactive maps
library(kableExtra) # for kable tables
library(formattable) # handling percentages
library(gridExtra) #to organize the graphs
library(lubridate) # for time format
library(kableExtra) # for tables
library(extrafont) # for importing fonts
library(DescTools) # for descriptive stats
library(wordcloud) # for wordcloud
library(RColorBrewer)# used in the wordcloud
library(tm) # used for prearing words for the wordcloud
#----------------Aesthetic variables
purple <- "#36328C"
red <- "#D81E5B"
blue <- "#8DE4FF"
yellow <- "#FFC914"
green <- "#8AD973"
white <- "#FFFFFF"
black <- "#000000"
blue2 <- "#5AA9E6"
grey <- "#808080"
sage_green <- "#8A9A5B"
glaucous <- "#6082B6"
midnight_blue <- c("#191970")
matte_black <- c("#28282B")
# -----------------Importing the data
file_path <- file.path("Data", "NYC_EVENTS.csv") # this way the file can be opened through multiple operating systems
nyc_events <- read_csv(file = file_path, lazy = FALSE)
file_path_location <- file.path("Data", "NYC_EVENTS_LOCATIONS.csv")
nyc_events_loc <- read_csv(file = file_path_location, lazy = FALSE)
file_path_cat <- file.path("Data", "NYC_EVENTS_CATEGORIES.csv")
nyc_cat <- read_csv(file = file_path_cat, lazy = FALSE)
file_path_org <- file.path("Data", "NYC_EVENTS_ORGANIZER.csv")
nyc_org <- read_csv(file = file_path_org, lazy = FALSE)
Introduction
Context
New York City, or NYC for short, is the most populous city in the United States. Hosting almost 9 million people (8 804 190) in a surface of over 778 km squared, the city is a host to many events along the years that institue the magic that the city is know for.
«I get out of the taxi and it’s probably the only city which in reality looks better than on the postcards: New York. »
Milos Forman - Film director
Description of our database
Disclaimer: All rights over the data presented in this document goes to NYC OpenData
The database used in this document is a relational database of events hosted in New York City that belongs to Parks and Recreational Department of NYC.
The diagram of the database can be presented as follows:
# Compute a cumulative sum of records in the database grouped by year
records <-
nyc_events %>% distinct(event_id, date) %>% separate(date, c("month", "day", "year"), "/", remove = F) %>% group_by(year) %>% summarise(number_events = n()) %>% mutate(cumulative_records = cumsum(number_events))
windowsFonts(Times_new_roman = windowsFont("Times New Roman")) # Import the font
# Plot the graph of cumulative sum of records by year
records_graph <-
ggplot(records, aes(x = year, y = cumulative_records)) +
geom_col(alpha = 1,
width = 0.5,
fill = purple,
position = "dodge") +
labs(
title = "Cumulative sum of number of records in the database",
subtitle = "2013 - 2019",
y = "Number of records",
x = NULL
) +
geom_text(
aes(label = cumulative_records),
position = position_nudge(y = 5000),
family = "Times_new_roman",
check_overlap = T
) +
theme(
text = element_text(
family = "Times_new_roman",
face = "bold",
size = 16
),
legend.position = "none",
panel.background = element_blank(),
axis.line = element_line()
)
records_graph

In our database, we have a total number of 74880 records. With an average of 34 new records per day, our database was regularly updated.
Problematic
- How did the number of events evolve over the years?
- What are the main categories of events hosted by the city?
- Who are the major events’ organizers?
- How are the events spread around the city?
# Data preparation
nyc <- inner_join(nyc_events, nyc_events_loc, by = "event_id") # merge the events dataset with the locations dataset
nyc$start_time <- hms(nyc$start_time) # change to time format
nyc$end_time <- hms(nyc$end_time) # change to time format
nyc$duration <- nyc$end_time - nyc$start_time # Compute duration of the events
nyc <- nyc %>% separate(date, c("month","day","year"), "/", remove = F) # Seperate the date into days, months, and years
Major Events Organizers
# Selecting the organizers and their emails
organizers <-
nyc %>% inner_join(nyc_org, by = "event_id") %>% distinct(event_id, email, event_organizer) %>% group_by(event_organizer, email)
# Dropping NAs
organizers <- na.omit(organizers)
# emails to lower case to match similar ones that have been written differently
organizers$email <- tolower(organizers$email)
# Compute the number of events for each organizer
organizers <-
organizers %>% group_by(event_organizer, email) %>% summarize(number_events = n())
# Renaming the cols for the table
colnames(organizers) <-
c("Organizer", "Organizer's Email", "Nbr. events organized")
# Order by nbr of events
organizers <-
organizers %>% arrange(desc(organizers$`Nbr. events organized`))
# Kable table of the top 20
head(organizers, n = 20) %>%
arrange(desc(`Nbr. events organized`)) %>%
kable(escape = F, align = c("l", "l", "c")) %>%
kable_styling(c("striped", "hover", "condensed"), full_width = F) %>%
row_spec(0,
bold = T,
color = "black",
font_size = 20) %>%
column_spec(1,
bold = T,
width = "40%",
color = matte_black) %>%
column_spec(2, bold = T, width = "30%") %>%
column_spec(3, bold = T, color = purple)
Main areas of hosted events
# Selecting the coords and the organizers
location <-
nyc %>% distinct(event_id, long, lat) %>% inner_join(nyc_org, by = "event_id")
# Handling NAs
location <- na.exclude(location)
# Number of events for each organizer and average coords
location <-
location %>% group_by(event_organizer) %>% summarize(number_events = n(),
lat = mean(lat),
long = mean(long))
# Major organizers
location <-
location %>% filter(number_events > 10)
# Color palette
pal <- colorNumeric(palette = "YlGnBu",
domain = location$number_events)
# Building the map
nyc_map <- leaflet() %>%
addTiles() %>%
setView(
lng = mean(location$long),
lat = mean(location$lat),
zoom = 10
) %>%
addCircles(
lng = location$long,
lat = location$lat,
popup = paste(
"<b> Main area of events organized by:<b/>",
location$event_organizer,
"<br/> <b>The number of events organized:<b/>",
location$number_events
),
weight = 3,
radius = location$number_events,
stroke = TRUE,
color = pal(location$number_events),
fillOpacity = 0.8
) %>%
addLegend("bottomright",
pal = pal, values = location$number_events,
title = "Number of events",
opacity = 1
)
nyc_map
Conclusion
# Exporting the file for further analysis
# Make sure to add a lazy read in the read_csv to be able to write back the prepared data
file_path_export <- file.path("Data","NYC_EVENTS_PREPARED.csv")
# Uncomment this next line if you want to export the prepared data as a csv file
# write_csv(file = file_path_export, x = nyc, progress = T)
To sum up, we can say that New York City is indeed the right place to go for a good time. With its multiple events around the year, New York has, and forever will, attracted many visitors around the world. On a personal level, working on this particular project has been very interesting in terms of how to handle a large database with multiple data sets, how to prepare data to answer a specific problematic, and how to organize and pick the right resources to do so. Working on large dataset certainly has its ups and downs. On one hand, you have enough data from which you can extract valuable information.
On the other hand, the larger volume of data requires a higher level of analysis since you cannot spot abnormalities simply by looking at the raw data, and the larger volume of data can limit the use of some visualization tools such as interactive maps.