library(plyr)
library(tidyverse)
library(knitr)
library(cowplot)
library(RColorBrewer)
NYC OpenData provides a massive dataset of 311 service requests from 2010 to the present. After splitting the single large .csv file into more manageable chunks with Free Huge CSV Splitter, we load the data into R.
As we load the data, we reorganize the service requests by the agency they were relayed to. That way, we have a separate .csv file for each agency, since we'll only be working with a subset of the data.
all_col_names <- c("Unique_Key", "Created_Date", "Closed_Date", "Agency", "Agency_Name", "Complaint_Type", "Descriptor", "Location_Type", "Incident_Zip", "Incident_Address", "Street_Name", "Cross_Street_1", "Cross_Street_2", "Intersection_Street_1", "Intersection_Street_2", "Address_Type", "City", "Landmark", "Facility_Type", "Status", "Due_Date", "Resolution_Description", "Resolution_Action_Updated_Date", "Community_Board", "BBL", "Borough", "X_Coordinate_State_Plane", "Y_Coordinate_State_Plane", "Open_Data_Channel_Type", "Park_Facility_Name", "Park_Borough", "Vehicle_Type", "Taxi_Company_Borough", "Taxi_Pick_Up_Location", "Bridge_Highway_Name", "Bridge_Highway_Direction", "Road_Ramp", "Bridge_Highway_Segment", "Latitude", "Longitude", "Location")
agency_names <- c("NYC311", "Administration for Children Services", "Administrative Justice Coordinator", "City Hall", "Department of Consumer Affairs", "Department of City Planning", "Department of Environmental Protection", "Department for the Aging", "Department of Homeless Services", "Department of Buildings", "Department of Education", "Department of Finance", "Department of Health and Mental Hygiene", "Department of Information Technology and Telecommunications", "Department of Probation", "Department of Transportation", "Department of Parks and Recreation", "Department of Sanitation", "Department of Youth and Community Development", "Economic Development Corporation", "Fire Department of New York", "Department of Housing Preservation and Development", "Human Resources Administration", "Mayor's Office of Immigrant Affairs", "NYC Emergency Management", "New York City Housing Authority", "New York City Police Department", "Office of Administrative Trials and Hearings", "Mayor's Office of Operations", "Taxi and Limousine Commission")
agency_codes <- c("XXX", "ACS", "AJC", "CHALL", "DCA", "DCP", "DEP", "DFTA", "DHS", "DOB", "DOE", "DOF", "DOHMH", "DOITT", "DOP", "DOT", "DPR", "DSNY", "DYCD", "EDC", "FDNY", "HPD", "HRA", "MOIA", "NYCEM", "NYCHA", "NYPD", "OATH", "OPS", "TLC") #replaced 311 with XXX
completed <- readLines("completed.txt")
if (length(completed) == 0){
    df_list <- list()
}
if (length(completed) != 30){
    files <- list.files(pattern = "\\.csv$")
    for (j in 1:length(agency_codes)){
        code <- agency_codes[j]
        if (code %in% completed){
            next
        }
        empty_df <- as.data.frame(matrix(NA, nrow = 0, ncol = 41))
        colnames(empty_df) <- all_col_names
        df_list[[code]] <- empty_df
        for (i in 1:length(files)){
            csv <- read.csv(file = files[i], header = FALSE)
            colnames(csv) <- all_col_names
            if (code == "XXX"){
                #311 itself has no agency code, so match on the agency name
                csv <- csv %>%
                    filter(Agency_Name == "NYC311")
            }else{
                csv <- csv %>%
                    filter(Agency == code)
            }
            df_list[[code]] <- rbind(df_list[[code]], csv)
        }
        write.csv(df_list[[code]], paste0(code, ".csv"), row.names = FALSE)
        write(code, file = "completed.txt", append = TRUE)
        #free the memory but keep the named slot as a placeholder
        df_list[[code]] <- NA
    }
}
Below, we load the newly created .csv files for only the agencies of interest.
completed2 <- readLines("completed2.txt")
#"NULL" entries tell read.csv to drop those columns entirely
col_classes <- c(rep("character", 3),
                 rep("NULL", 1),
                 rep("character", 3),
                 rep("NULL", 1),
                 rep("character", 1),
                 rep("NULL", 9),
                 rep("character", 2),
                 rep("NULL", 1),
                 rep("character", 2),
                 rep("NULL", 2),
                 rep("character", 1),
                 rep("NULL", 2),
                 rep("character", 3),
                 rep("NULL", 9),
                 rep("character", 1))
if (length(completed2) == 0){
    df_list <- list()
}
if (length(completed2) != 16){
    files <- sort(list.files(pattern = "\\.csv$"))
    for (i in 1:length(files)){
        fn <- files[i]
        df_list <- append(df_list, NA)
        names(df_list)[i] <- str_replace(fn, "\\.csv$", "")
        if (!(fn %in% completed2)){
            df_list[[i]] <- read.csv(file = fn, header = FALSE,
                                     col.names = all_col_names,
                                     colClasses = col_classes)
            #only log a file as completed after it has actually been read
            write(fn, file = "completed2.txt", append = TRUE)
        }
    }
}
Finally, we create the main data frame for this project.
completed3 <- readLines("completed3.txt")
if (length(completed3) == 0){
    nyc_parks_311_df <- plyr::ldply(df_list, data.frame)
    df_list <- list()
    nyc_parks_311_df <- nyc_parks_311_df %>%
        filter(!Park_Facility_Name %in% c("Unspecified", "N, A", "N/A"))
    nyc_parks_311_df <- nyc_parks_311_df[, -1] #drop the .id column ldply adds
    nyc_parks_311_df <- nyc_parks_311_df[, -8] #drop Facility_Type
    write.csv(nyc_parks_311_df, "nyc_parks_311.csv", row.names = FALSE)
    write("nyc_parks_311.csv", file = "completed3.txt", append = TRUE)
}else{
    base1 <- "https://raw.githubusercontent.com/geedoubledee/"
    base2 <- "data606_final_project/main/nyc_parks_311_"
    my_urls <- paste0(base1, base2, 0:6, ".csv")
    cols <- c("Unique_Key", "Created_Date", "Closed_Date", "Agency_Name",
              "Complaint_Type", "Descriptor", "Incident_Zip", "Status",
              "Resolution_Description", "Resolution_Action_Updated_Date",
              "Borough", "Open_Data_Channel_Type", "Park_Facility_Name",
              "Park_Borough", "Location")
    nyc_parks_311_df <- as.data.frame(matrix(NA, nrow = 0, ncol = 15))
    colnames(nyc_parks_311_df) <- cols
    for (i in 1:length(my_urls)){
        partial_df <- read.csv(file = my_urls[i], header = FALSE)
        colnames(partial_df) <- cols
        nyc_parks_311_df <- rbind(nyc_parks_311_df, partial_df)
    }
    #keep only rows that appear exactly once, i.e. drop every copy of any
    #duplicated row
    nyc_parks_311_df <- nyc_parks_311_df %>%
        group_by(across(everything())) %>%
        filter(n() == 1) %>%
        ungroup()
    nyc_parks_311_df <- nyc_parks_311_df[-103464, ] #remove overlooked duplicate
}
We also need to supplement the data with additional information from other sources, which we load below.
my_url <- "https://raw.githubusercontent.com/geedoubledee/data606_final_project/main/Parks_Inspection_Program_Sites.csv"
parks_inspection_program_sites_df <- read.csv(my_url)
large_parks_sites_df <- parks_inspection_program_sites_df %>%
    filter(Category == "Large Park")
large_parks_names <- unique(large_parks_sites_df$Prop.Name)
large_parks_nums <- unique(large_parks_sites_df$PropNum)
my_url2 <- "https://raw.githubusercontent.com/geedoubledee/data606_final_project/main/Parks_Inspection_Program_Inspections.csv"
parks_inspection_program_ratings_df <- read.csv(my_url2)
parks_inspection_program_ratings_df <- parks_inspection_program_ratings_df %>%
    #strip the site suffix from Prop.ID to get the parent property number
    mutate(PropNum = gsub("-.*$", "", Prop.ID))
large_parks_ratings_df <- parks_inspection_program_ratings_df %>%
    filter(PropNum %in% large_parks_nums)
Inspectors with the Parks Inspection Program (PIP) “currently perform approximately 6,000 PIP inspections each year, giving each inspected property an ‘Acceptable’ (A) or ‘Unacceptable’ (U) rating for overall condition and cleanliness.”1 Some sites also receive these ratings regarding their safety condition and/or structural condition.
PIP identifies 233 parks in NYC as “large parks.” For these large parks, inclusive of any sites that fall under their umbrella, can a park’s 311 service request volume predict a decrease in its PIP rating? And do certain 311 complaint types contribute more to the likelihood of a PIP rating decrease?
There are 187,829 cases in our main data frame. Each case is a 311 service request related to a specific NYC park that was relayed to the Department of Parks and Recreation (DPR), the Department of Homeless Services (DHS), or the New York City Police Department (NYPD).
The NYC OpenData 311 service requests dataset covers 2010 to the present and is updated daily with new service requests submitted to 311 by phone, online, and other means.
This is an observational study.
We are using a few data sources for this project: the NYC OpenData 311 service requests dataset, the Parks Inspection Program sites list, and the Parks Inspection Program inspections data.
The response variable will be change in PIP rating, a binary variable that takes on one of two values: 1 (indicating a success, i.e. a decrease in PIP rating) or 0 (indicating a failure, i.e. no change or an increase in PIP rating). Here’s how we create it:
The variable it is based on, PIP rating, is qualitative. However, converting the overall rating and the cleanliness rating into scores of 0 (Unacceptable, U) and 0.5 (Acceptable, A) and summing them seems like the best approach to working with this variable. (The other categories of PIP ratings, safety and structural, will be ignored, since not all sites have them.)
So each site gets a score from 0 (two U’s) to 1 (two A’s) each time it is inspected. If a park was inspected more than once in a month, the intention is to take the average; alternatively, we could always use the last rating for each month. If a park was not inspected in a given month but was inspected previously, the intention is to roll the previous PIP rating forward to fill the gap.
If the month-over-month change in score is positive or zero, we will record a change in PIP rating of 0 for that month. If the month-over-month change is negative, we will record a change in PIP rating of 1 for that month.
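As a rough sketch of that construction (a sketch only: Overall and Cleanliness stand in for whatever the actual PIP rating columns are called, and we assume Inspection_Year and Inspection_Month columns like the ones we derive later in this report):
#sketch only: Overall and Cleanliness are assumed names for the "A"/"U"
#rating columns; the real PIP column names may differ
monthly_scores <- large_parks_ratings_df %>%
    mutate(Score = 0.5 * (Overall == "A") + 0.5 * (Cleanliness == "A")) %>%
    group_by(PropNum, Inspection_Year, Inspection_Month) %>%
    summarize(Monthly_Score = mean(Score), .groups = "drop") %>%
    #add a row for every park-month so gaps can be rolled over
    complete(PropNum, nesting(Inspection_Year, Inspection_Month)) %>%
    arrange(PropNum, Inspection_Year, Inspection_Month) %>%
    group_by(PropNum) %>%
    fill(Monthly_Score, .direction = "down") %>%
    mutate(Change_in_PIP_Rating =
               as.integer(Monthly_Score < lag(Monthly_Score))) %>%
    ungroup()
The first observed month for each park has no prior rating to compare against, so its change value stays NA under this sketch.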
The independent variable is the count of 311 service requests, which is quantitative. The intention is to separate the counts by month and year for most of the analysis and by complaint type for some of it.
We will use logistic regression to estimate the probability of a park’s PIP rating decreasing in a particular month based on its 311 service request volume for that month.
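A minimal sketch of that model, assuming the monthly request counts have already been joined onto the monthly rating changes in a hypothetical park-month data frame called model_df:
#sketch only: model_df is a hypothetical data frame with one row per
#park-month, the binary response Change_in_PIP_Rating, and the predictor
#Request_Count
fit <- glm(Change_in_PIP_Rating ~ Request_Count, data = model_df,
           family = binomial)
summary(fit)
exp(coef(fit))["Request_Count"] #odds multiplier per additional request
For the complaint-type portion of the analysis, Request_Count would be replaced with, or supplemented by, per-complaint-type counts as separate predictors.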
Without yet isolating the large parks and the sites that fall under their umbrella, we look at summary statistics for 311 service requests across all parks in a few ways:
by_borough <- nyc_parks_311_df %>%
    group_by(Park_Borough) %>%
    summarize(Request_Count = n()) %>%
    arrange(desc(Request_Count))
kable(by_borough, format = "simple")
| Park_Borough | Request_Count |
|---|---|
| MANHATTAN | 56869 |
| BROOKLYN | 53198 |
| QUEENS | 39846 |
| BRONX | 26695 |
| STATEN ISLAND | 8054 |
| Unspecified | 3167 |
summary(by_borough)
## Park_Borough Request_Count
## Length:6 Min. : 3167
## Class :character 1st Qu.:12714
## Mode :character Median :33271
## Mean :31305
## 3rd Qu.:49860
## Max. :56869
The data is left-skewed and unimodal. The median number of 311 service requests per borough is 33,271, and 50% of the data lies between 12,714 and 49,860 requests. Staten Island falls below this IQR, while Manhattan and Brooklyn both fall above it. The mean is pulled down by Staten Island and by the 3,167 service requests where the borough is “Unspecified.” We will likely be able to correct these missing values based on the recorded Incident_Zip and Park_Facility_Name variables; if we can’t, we will replace “Unspecified” with NA and remove those rows from all or some portions of the analysis.
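As a sketch of how that correction might work (zip_borough_df, Borough_Fix, and nyc_parks_311_fixed are names introduced here for illustration; since a few zip codes straddle borough lines, matching on the most common borough per zip is only a heuristic):
#sketch only: build a zip-to-most-common-borough lookup from the rows
#where the borough is recorded
zip_borough_df <- nyc_parks_311_df %>%
    filter(Park_Borough != "Unspecified", Incident_Zip != "") %>%
    count(Incident_Zip, Park_Borough) %>%
    group_by(Incident_Zip) %>%
    slice_max(n, n = 1, with_ties = FALSE) %>%
    ungroup() %>%
    select(Incident_Zip, Borough_Fix = Park_Borough)
#patch only the "Unspecified" rows
nyc_parks_311_fixed <- nyc_parks_311_df %>%
    left_join(zip_borough_df, by = "Incident_Zip") %>%
    mutate(Park_Borough = ifelse(Park_Borough == "Unspecified" &
                                     !is.na(Borough_Fix),
                                 Borough_Fix, Park_Borough)) %>%
    select(-Borough_Fix)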
#horizontal bar chart of request counts per borough
plot_bars <- function(dat){
    ggplot(dat, aes(x = reorder(Park_Borough, Request_Count),
                    y = Request_Count, fill = Park_Borough)) +
        geom_bar(stat = "identity", show.legend = FALSE) +
        labs(x = "Borough", y = "Requests") +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_blank(),
              axis.line = element_line(colour = "black")) +
        coord_flip()
}
p <- plot_bars(by_borough)
p + scale_fill_brewer(palette = "PRGn")
by_complaint_type <- nyc_parks_311_df
#group all noise complaints under one label
by_complaint_type$Complaint_Type <-
    gsub("^Noise - .*", "Noise Issue", by_complaint_type$Complaint_Type)
#group all tree complaints under one label
by_complaint_type$Complaint_Type <-
    gsub(".* Tree$|.*Tree.*", "Tree Issue", by_complaint_type$Complaint_Type)
by_complaint_type <- by_complaint_type %>%
    group_by(Complaint_Type) %>%
    summarize(Request_Count = n()) %>%
    mutate(Complaint_Type = ifelse(Request_Count < 100,
                                   "Other", Complaint_Type)) %>%
    group_by(Complaint_Type) %>%
    summarize(Request_Count = sum(Request_Count)) %>%
    arrange(desc(Request_Count))
kable(by_complaint_type, format = "simple")
| Complaint_Type | Request_Count |
|---|---|
| Maintenance or Facility | 110393 |
| Violation of Park Rules | 22718 |
| Noise Issue | 20483 |
| Animal in a Park | 19078 |
| DPR Internal | 7560 |
| Tree Issue | 5579 |
| Homeless Person Assistance | 1957 |
| Other | 61 |
summary(by_complaint_type)
## Complaint_Type Request_Count
## Length:8 Min. : 61
## Class :character 1st Qu.: 4674
## Mode :character Median : 13319
## Mean : 23479
## 3rd Qu.: 21042
## Max. :110393
We group some complaint types together, i.e. all noise issues and all tree issues, and we classify some minimally represented complaint types as “Other” to simplify the data. The data is right-skewed and unimodal. The median number of 311 service requests per complaint type is 13,319, and 50% of the data lies between 4,674 and 21,042 requests. “Homeless Person Assistance” falls below this IQR, while “Violation of Park Rules” falls just above it and “Maintenance or Facility” falls far above it. The mean is pulled up by all those “Maintenance or Facility” requests, so far, in fact, that it lies above the third quartile.
#horizontal bar chart of request counts per complaint type
plot_bars <- function(dat){
    ggplot(dat, aes(x = reorder(Complaint_Type, Request_Count),
                    y = Request_Count, fill = Complaint_Type)) +
        geom_bar(stat = "identity", show.legend = FALSE) +
        labs(x = "Complaint Type", y = "Requests") +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_blank(),
              axis.line = element_line(colour = "black")) +
        coord_flip()
}
p <- plot_bars(by_complaint_type)
p + scale_fill_brewer(palette = "PRGn")
First, we look at the 10 parks with the largest volume of 311 service requests (across all years).
#keep only the date portion of each timestamp, then parse as Date
nyc_parks_311_df$Created_Date <- gsub(" .*$", "", nyc_parks_311_df$Created_Date)
nyc_parks_311_df$Created_Date <- as.Date(nyc_parks_311_df$Created_Date,
                                         tryFormats = "%m/%d/%Y")
nyc_parks_311_df$Closed_Date <- gsub(" .*$", "", nyc_parks_311_df$Closed_Date)
nyc_parks_311_df$Closed_Date <- as.Date(nyc_parks_311_df$Closed_Date,
                                        tryFormats = "%m/%d/%Y")
nyc_parks_311_df$Resolution_Action_Updated_Date <-
    gsub(" .*$", "", nyc_parks_311_df$Resolution_Action_Updated_Date)
nyc_parks_311_df$Resolution_Action_Updated_Date <-
    as.Date(nyc_parks_311_df$Resolution_Action_Updated_Date,
            tryFormats = "%m/%d/%Y")
nyc_parks_311_df <- nyc_parks_311_df %>%
    mutate(Request_Year = lubridate::year(Created_Date))
nyc_parks_311_df_summary <- nyc_parks_311_df %>%
    group_by(Park_Facility_Name, Request_Year) %>%
    summarize(Request_Count = n())
top_10_parks_by_request_count <- nyc_parks_311_df_summary %>%
    group_by(Park_Facility_Name) %>%
    summarize(Total_Requests_for_Park = sum(Request_Count)) %>%
    arrange(desc(Total_Requests_for_Park)) %>%
    top_n(10)
kable(top_10_parks_by_request_count, format = "simple")
| Park_Facility_Name | Total_Requests_for_Park |
|---|---|
| Central Park | 8567 |
| Prospect Park | 4117 |
| Riverside Park | 3628 |
| Flushing Meadows Corona Park | 2983 |
| Forest Park | 2075 |
| Franz Sigel Park | 1725 |
| Tompkins Square Park | 1688 |
| Washington Square Park | 1548 |
| Van Cortlandt Park | 1523 |
| Sunset Park | 1435 |
Now we can visualize how the 311 service request volume has changed over time for all 10 parks together:
#stacked bar chart of yearly request counts, colored by park
plot_stacked_bars <- function(dat){
    ggplot(dat, aes(x = Request_Year, y = Request_Count,
                    fill = Park_Facility_Name)) +
        geom_bar(stat = "identity") +
        labs(x = "Year", y = "Requests") +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_blank())
}
top_10_parks_names <- unique(top_10_parks_by_request_count$Park_Facility_Name)
top_10_parks_changes_over_time <- nyc_parks_311_df_summary %>%
    filter(Park_Facility_Name %in% top_10_parks_names)
p <- plot_stacked_bars(top_10_parks_changes_over_time)
p + scale_fill_brewer(palette = "PRGn")
And we can also look more closely at how the 311 service request volume has changed over time for each of these 10 parks:
#the hex values of the PRGn palette, darkest purple to darkest green
PRGn <- list("#40004B", "#762A83", "#9970AB", "#C2A5CF", "#E7D4E8",
             "#F7F7F7", "#D9F0D3", "#A6DBA0", "#5AAE61", "#1B7837",
             "#00441B")
#line chart of yearly request counts for a single park
plot_lines <- function(dat){
    ggplot(dat, aes(x = Request_Year, y = Request_Count)) +
        geom_line(color = PRGn[[11]]) +
        geom_point(color = PRGn[[11]]) +
        ylim(0, 1200) +
        labs(x = "Year", y = "Requests") +
        theme(plot.margin = unit(c(1.5, 0.5, 0, 0), "lines"),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_blank(),
              axis.line = element_line(colour = "black"))
}
#one plot per park, arranged in a grid
by_park <- top_10_parks_changes_over_time %>%
    group_nest(Park_Facility_Name)
by_park <- by_park %>%
    mutate(plot = map(data, plot_lines))
plot_grid(plotlist = by_park$plot, labels = by_park$Park_Facility_Name,
          label_colour = PRGn[[1]], nrow = 5, ncol = 2, label_x = 0.15,
          hjust = 0)
We can see that 311 service requests were already on the rise for these parks collectively prior to the onset of the COVID-19 pandemic, and each park individually saw a pandemic spike that continued the upward trend. One could separate the data into two time periods, pre-pandemic and pandemic-onward, for this analysis, but that is not presently part of the plan.
Lastly, we can look at summary statistics for park inspections.
parks_inspection_program_ratings_df <- parks_inspection_program_ratings_df[, -9]
parks_inspection_program_ratings_df$Date <-
    gsub(" .*$", "", parks_inspection_program_ratings_df$Date)
parks_inspection_program_ratings_df$Date <-
    as.Date(parks_inspection_program_ratings_df$Date, tryFormats = "%m/%d/%Y")
parks_inspection_program_ratings_df <- parks_inspection_program_ratings_df %>%
    mutate(Inspection_Year = lubridate::year(Date),
           Inspection_Month = lubridate::month(Date))
pip_ratings_summary_by_site_by_year <- parks_inspection_program_ratings_df %>%
    group_by(PropNum, Inspection_Year) %>%
    summarize(Inspection_Count = n())
cols <- c("Prop.ID", "Boro", "Prop.Name", "Category")
pip_ratings_summary_by_site <- pip_ratings_summary_by_site_by_year %>%
    group_by(PropNum) %>%
    summarize(Total_Inspections = sum(Inspection_Count))
pip_ratings_summary_by_site <- pip_ratings_summary_by_site %>%
    left_join(parks_inspection_program_sites_df %>% select(all_of(cols)) %>%
                  rename(PropNum = Prop.ID), by = "PropNum") %>%
    arrange(desc(Total_Inspections))
summary(pip_ratings_summary_by_site[, 1:2])
## PropNum Total_Inspections
## Length:3071 Min. : 1.0
## Class :character 1st Qu.: 21.0
## Mode :character Median : 28.0
## Mean : 42.2
## 3rd Qu.: 49.0
## Max. :2900.0
The data is right-skewed and unimodal. The median number of times a park has been inspected is 28, and most parks have been inspected between 21 and 49 times. Several parks that have been inspected much more frequently pull the mean up to 42.2. Below are the 10 most inspected parks:
top_10_parks_by_total_inspections <- pip_ratings_summary_by_site %>%
    top_n(10, Total_Inspections)
kable(top_10_parks_by_total_inspections, format = "simple")
| PropNum | Total_Inspections | Boro | Prop.Name | Category |
|---|---|---|---|---|
| M010 | 2900 | M | Central Park | Large Park |
| B073 | 1647 | B | Prospect Park | Large Park |
| M071 | 1640 | M | Riverside Park | Large Park |
| Q099 | 1539 | Q | Flushing Meadows Corona Park | Large Park |
| X039 | 1324 | X | Pelham Bay Park | Large Park |
| X092 | 1263 | X | Van Cortlandt Park | Large Park |
| Q015 | 1143 | Q | Forest Park | Large Park |
| X010 | 930 | X | Crotona Park | Large Park |
| M037 | 778 | M | Highbridge Park | Large Park |
| R065 | 705 | R | Richmond Parkway | Large Park |
It should be beneficial to our analysis that the large parks both generate the most 311 service requests and are the most frequently inspected. We will have more data with which to measure effects for these parks than if we tried to look at all parks (and/or everything else the Parks Inspection Program rates).
Source: PIP’s data dictionary.