library(plyr)
library(tidyverse)
library(knitr)
library(cowplot)
library(RColorBrewer)
NYC OpenData provides a massive dataset of 311 service requests from 2010 to the present. After splitting the single large .csv file into more manageable chunks with Free Huge CSV Splitter, we load the data into R.
As we load the data, we reorganize the service requests by the agency they were relayed to. That way, we have a separate .csv file for each agency, since we'll only be working with a subset of the data.
all_col_names <- c("Unique_Key", "Created_Date", "Closed_Date", "Agency", "Agency_Name", "Complaint_Type", "Descriptor", "Location_Type", "Incident_Zip", "Incident_Address", "Street_Name", "Cross_Street_1", "Cross_Street_2", "Intersection_Street_1", "Intersection_Street_2", "Address_Type", "City", "Landmark", "Facility_Type", "Status", "Due_Date", "Resolution_Description", "Resolution_Action_Updated_Date", "Community_Board", "BBL", "Borough", "X_Coordinate_State_Plane", "Y_Coordinate_State_Plane", "Open_Data_Channel_Type", "Park_Facility_Name", "Park_Borough", "Vehicle_Type", "Taxi_Company_Borough", "Taxi_Pick_Up_Location", "Bridge_Highway_Name", "Bridge_Highway_Direction", "Road_Ramp", "Bridge_Highway_Segment", "Latitude", "Longitude", "Location")
agency_names <- c("NYC311", "Administration for Children Services", "Administrative Justice Coordinator", "City Hall", "Department of Consumer Affairs", "Department of City Planning", "Department of Environmental Protection", "Department for the Aging", "Department of Homeless Services", "Department of Buildings", "Department of Education", "Department of Finance", "Department of Health and Mental Hygiene", "Department of Information Technology and Telecommunications", "Department of Probation", "Department of Transportation", "Department of Parks and Recreation", "Department of Sanitation", "Department of Youth and Community Development", "Economic Development Corporation", "Fire Department of New York", "Department of Housing Preservation and Development", "Human Resources Administration", "Mayor's Office of Immigrant Affairs", "NYC Emergency Management", "New York City Housing Authority", "New York City Police Department", "Office of Administrative Trials and Hearings", "Mayor's Office of Operations", "Taxi and Limousine Commission")
agency_codes <- c("XXX", "ACS", "AJC", "CHALL", "DCA", "DCP", "DEP", "DFTA", "DHS", "DOB", "DOE", "DOF", "DOHMH", "DOITT", "DOP", "DOT", "DPR", "DSNY", "DYCD", "EDC", "FDNY", "HPD", "HRA", "MOIA", "NYCEM", "NYCHA", "NYPD", "OATH", "OPS", "TLC") #replaced 311 with XXX
completed <- readLines("completed.txt")
if (length(completed) == 0){
    df_list <- list()
}
if (length(completed) != 30){
    files <- list.files(pattern = "\\.csv$")
    for (j in 1:length(agency_codes)){
        code <- agency_codes[j]
        if (code %in% completed){
            next
        }
        empty_df <- as.data.frame(matrix(NA, nrow = 0, ncol = 41))
        colnames(empty_df) <- all_col_names
        df_list[[code]] <- empty_df
        for (i in 1:length(files)){
            csv <- read.csv(file = files[i], header = FALSE)
            colnames(csv) <- all_col_names
            if (code == "XXX"){
                #311 itself has no agency code, so match on the agency name
                csv <- csv %>%
                    filter(Agency_Name == "NYC311")
            }else{
                csv <- csv %>%
                    filter(Agency == code)
            }
            df_list[[code]] <- rbind(df_list[[code]], csv)
        }
        write.csv(df_list[[code]], paste0(code, ".csv"), row.names = FALSE)
        write(code, file = "completed.txt", append = TRUE)
        #free the memory but keep the named slot as a placeholder
        df_list[[code]] <- NA
    }
}
Below, we load the newly created .csv files for only the agencies of interest.
completed2 <- readLines("completed2.txt")
#"NULL" entries tell read.csv to drop those columns entirely
col_classes <- c(rep("character", 3),
                 rep("NULL", 1),
                 rep("character", 3),
                 rep("NULL", 1),
                 rep("character", 1),
                 rep("NULL", 9),
                 rep("character", 2),
                 rep("NULL", 1),
                 rep("character", 2),
                 rep("NULL", 2),
                 rep("character", 1),
                 rep("NULL", 2),
                 rep("character", 3),
                 rep("NULL", 9),
                 rep("character", 1))
if (length(completed2) == 0){
    df_list <- list()
}
if (length(completed2) != 16){
    files <- sort(list.files(pattern = "\\.csv$"))
    for (i in 1:length(files)){
        fn <- files[i]
        df_list <- append(df_list, NA)
        names(df_list)[i] <- str_replace(fn, "\\.csv$", "")
        if (!(fn %in% completed2)){
            df_list[[i]] <- read.csv(file = fn, header = FALSE,
                                     col.names = all_col_names,
                                     colClasses = col_classes)
            #only log a file as completed after it has actually been read
            write(fn, file = "completed2.txt", append = TRUE)
        }
    }
}
Finally, we create the main data frame for this project.
completed3 <- readLines("completed3.txt")
if (length(completed3) == 0){
    nyc_parks_311_df <- plyr::ldply(df_list, data.frame)
    df_list <- list()
    nyc_parks_311_df <- nyc_parks_311_df %>%
        filter(!Park_Facility_Name %in% c("Unspecified", "N, A", "N/A"))
    nyc_parks_311_df <- nyc_parks_311_df[, -1] #drop the .id column ldply adds
    nyc_parks_311_df <- nyc_parks_311_df[, -8] #drop Facility_Type
    write.csv(nyc_parks_311_df, "nyc_parks_311.csv", row.names = FALSE)
    write("nyc_parks_311.csv", file = "completed3.txt", append = TRUE)
}else{
    base1 <- "https://raw.githubusercontent.com/geedoubledee/"
    base2 <- "data606_final_project/main/nyc_parks_311_"
    my_urls <- paste0(base1, base2, 0:6, ".csv")
    cols <- c("Unique_Key", "Created_Date", "Closed_Date", "Agency_Name",
              "Complaint_Type", "Descriptor", "Incident_Zip", "Status",
              "Resolution_Description", "Resolution_Action_Updated_Date",
              "Borough", "Open_Data_Channel_Type", "Park_Facility_Name",
              "Park_Borough", "Location")
    nyc_parks_311_df <- as.data.frame(matrix(NA, nrow = 0, ncol = 15))
    colnames(nyc_parks_311_df) <- cols
    for (i in 1:length(my_urls)){
        partial_df <- read.csv(file = my_urls[i], header = FALSE)
        colnames(partial_df) <- cols
        nyc_parks_311_df <- rbind(nyc_parks_311_df, partial_df)
    }
    #keep only rows that appear exactly once, i.e. drop every copy of any
    #duplicated row
    nyc_parks_311_df <- nyc_parks_311_df %>%
        group_by(across(everything())) %>%
        filter(n() == 1) %>%
        ungroup()
    nyc_parks_311_df <- nyc_parks_311_df[-103464, ] #remove overlooked duplicate
}
We also need to supplement the data with additional information from other sources, which we load below.
my_url <- "https://raw.githubusercontent.com/geedoubledee/data606_final_project/main/Parks_Inspection_Program_Sites.csv"
parks_inspection_program_sites_df <- read.csv(my_url)
large_parks_sites_df <- parks_inspection_program_sites_df %>%
    filter(Category == "Large Park")
large_parks_names <- unique(large_parks_sites_df$Prop.Name)
large_parks_nums <- unique(large_parks_sites_df$PropNum)
my_url2 <- "https://raw.githubusercontent.com/geedoubledee/data606_final_project/main/Parks_Inspection_Program_Inspections.csv"
parks_inspection_program_ratings_df <- read.csv(my_url2)
parks_inspection_program_ratings_df <- parks_inspection_program_ratings_df %>%
    #strip the site suffix from Prop.ID to get the parent property number
    mutate(PropNum = gsub("-.*$", "", Prop.ID))
large_parks_ratings_df <- parks_inspection_program_ratings_df %>%
    filter(PropNum %in% large_parks_nums)
Inspectors with the Parks Inspection Program (PIP) “currently perform approximately 6,000 PIP inspections each year, giving each inspected property an ‘Acceptable’ (A) or ‘Unacceptable’ (U) rating for overall condition and cleanliness.”1 Some sites also receive these ratings regarding their safety condition and/or structural condition.
PIP identifies 233 parks in NYC as “large parks.” For these large parks, inclusive of any sites that fall under their umbrella, can a park’s 311 service request volume predict a decrease in its PIP rating? And do certain 311 complaint types contribute more to the likelihood of a PIP rating decrease?
There are 187,829 cases in our main data frame. Each case is a 311 service request related to a specific NYC park that was relayed to the Department of Parks and Recreation (DPR), the Department of Homeless Services (DHS), or the New York City Police Department (NYPD).
The NYC OpenData 311 service requests dataset covers 2010 to the present and is updated daily with new service requests submitted to 311 by phone, online, and other means.
This is an observational study.
We are using a few data sources for this project: the NYC OpenData 311 service requests dataset, the Parks Inspection Program sites list, and the Parks Inspection Program inspections data.
The response variable will be change in PIP rating, a binary variable that takes on one of two values: 1 (indicating a success, i.e. a decrease in PIP rating) or 0 (indicating a failure, i.e. no change or an increase in PIP rating). Here’s how we create it:
The variable it is based on, PIP rating, is qualitative. However, converting the overall rating and the cleanliness rating into scores of 0 (Unacceptable, U) and 0.5 (Acceptable, A) and summing them seems like the best approach to working with this variable. (The other categories of PIP ratings, safety and structural, will be ignored, since not all sites have them.)
So each site gets a score from 0 (two U’s) to 1 (two A’s) each time it is inspected. If a park was inspected more than once in a month, the intention is to take the average; alternatively, we could always use the last rating for each month. If a park was not inspected in a given month but was inspected previously, the intention is to roll the previous PIP rating forward to fill the gap.
If the month-over-month change in score is positive or zero, we will record a change in PIP rating of 0 for that month. If the month-over-month change is negative, we will record a change in PIP rating of 1 for that month.
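As a rough sketch of that construction (a sketch only: Overall and Cleanliness stand in for whatever the actual PIP rating columns are called, and we assume Inspection_Year and Inspection_Month columns like the ones we derive later in this report):
#sketch only: Overall and Cleanliness are assumed names for the "A"/"U"
#rating columns; the real PIP column names may differ
monthly_scores <- large_parks_ratings_df %>%
    mutate(Score = 0.5 * (Overall == "A") + 0.5 * (Cleanliness == "A")) %>%
    group_by(PropNum, Inspection_Year, Inspection_Month) %>%
    summarize(Monthly_Score = mean(Score), .groups = "drop") %>%
    #add a row for every park-month so gaps can be rolled over
    complete(PropNum, nesting(Inspection_Year, Inspection_Month)) %>%
    arrange(PropNum, Inspection_Year, Inspection_Month) %>%
    group_by(PropNum) %>%
    fill(Monthly_Score, .direction = "down") %>%
    mutate(Change_in_PIP_Rating =
               as.integer(Monthly_Score < lag(Monthly_Score))) %>%
    ungroup()
The first observed month for each park has no prior rating to compare against, so its change value stays NA under this sketch.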
The independent variable is the count of 311 service requests, which is quantitative. The intention is to separate the counts by month and year for most of the analysis and by complaint type for some of it.
We will use logistic regression to estimate the probability of a park’s PIP rating decreasing in a particular month based on its 311 service request volume for that month.
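A minimal sketch of that model, assuming the monthly request counts have already been joined onto the monthly rating changes in a hypothetical park-month data frame called model_df:
#sketch only: model_df is a hypothetical data frame with one row per
#park-month, the binary response Change_in_PIP_Rating, and the predictor
#Request_Count
fit <- glm(Change_in_PIP_Rating ~ Request_Count, data = model_df,
           family = binomial)
summary(fit)
exp(coef(fit))["Request_Count"] #odds multiplier per additional request
For the complaint-type portion of the analysis, Request_Count would be replaced with, or supplemented by, per-complaint-type counts as separate predictors.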
Without yet isolating the large parks and the sites that fall under their umbrella, we look at summary statistics for 311 service requests across all parks in a few ways:
by_borough <- nyc_parks_311_df %>%
    group_by(Park_Borough) %>%
    summarize(Request_Count = n()) %>%
    arrange(desc(Request_Count))
kable(by_borough, format = "simple")
| Park_Borough | Request_Count |
|---|---|
| MANHATTAN | 56869 |
| BROOKLYN | 53198 |
| QUEENS | 39846 |
| BRONX | 26695 |
| STATEN ISLAND | 8054 |
| Unspecified | 3167 |
summary(by_borough)
## Park_Borough Request_Count
## Length:6 Min. : 3167
## Class :character 1st Qu.:12714
## Mode :character Median :33271
## Mean :31305
## 3rd Qu.:49860
## Max. :56869
The data is left-skewed and unimodal. The median number of 311 service requests per borough is 33,271, and 50% of the data lies between 12,714 and 49,860 requests. Staten Island falls below this IQR, while Manhattan and Brooklyn both fall above it. The mean is pulled down by Staten Island and by the 3,167 service requests where the borough is “Unspecified.” We will likely be able to correct these missing values based on the recorded Incident_Zip and Park_Facility_Name variables; if we can’t, we will replace “Unspecified” with NA and remove those rows from all or some portions of the analysis.
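As a sketch of how that correction might work (zip_borough_df, Borough_Fix, and nyc_parks_311_fixed are names introduced here for illustration; since a few zip codes straddle borough lines, matching on the most common borough per zip is only a heuristic):
#sketch only: build a zip-to-most-common-borough lookup from the rows
#where the borough is recorded
zip_borough_df <- nyc_parks_311_df %>%
    filter(Park_Borough != "Unspecified", Incident_Zip != "") %>%
    count(Incident_Zip, Park_Borough) %>%
    group_by(Incident_Zip) %>%
    slice_max(n, n = 1, with_ties = FALSE) %>%
    ungroup() %>%
    select(Incident_Zip, Borough_Fix = Park_Borough)
#patch only the "Unspecified" rows
nyc_parks_311_fixed <- nyc_parks_311_df %>%
    left_join(zip_borough_df, by = "Incident_Zip") %>%
    mutate(Park_Borough = ifelse(Park_Borough == "Unspecified" &
                                     !is.na(Borough_Fix),
                                 Borough_Fix, Park_Borough)) %>%
    select(-Borough_Fix)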
#horizontal bar chart of request counts per borough
plot_bars <- function(dat){
    ggplot(dat, aes(x = reorder(Park_Borough, Request_Count),
                    y = Request_Count, fill = Park_Borough)) +
        geom_bar(stat = "identity", show.legend = FALSE) +
        labs(x = "Borough", y = "Requests") +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_blank(),
              axis.line = element_line(colour = "black")) +
        coord_flip()
}
p <- plot_bars(by_borough)
p + scale_fill_brewer(palette = "PRGn")
by_complaint_type <- nyc_parks_311_df
#group all noise complaints under one label
by_complaint_type$Complaint_Type <-
    gsub("^Noise - .*", "Noise Issue", by_complaint_type$Complaint_Type)
#group all tree complaints under one label
by_complaint_type$Complaint_Type <-
    gsub(".* Tree$|.*Tree.*", "Tree Issue", by_complaint_type$Complaint_Type)
by_complaint_type <- by_complaint_type %>%
    group_by(Complaint_Type) %>%
    summarize(Request_Count = n()) %>%
    mutate(Complaint_Type = ifelse(Request_Count < 100,
                                   "Other", Complaint_Type)) %>%
    group_by(Complaint_Type) %>%
    summarize(Request_Count = sum(Request_Count)) %>%
    arrange(desc(Request_Count))
kable(by_complaint_type, format = "simple")
| Complaint_Type | Request_Count |
|---|---|
| Maintenance or Facility | 110393 |
| Violation of Park Rules | 22718 |
| Noise Issue | 20483 |
| Animal in a Park | 19078 |
| DPR Internal | 7560 |
| Tree Issue | 5579 |
| Homeless Person Assistance | 1957 |
| Other | 61 |
summary(by_complaint_type)
## Complaint_Type Request_Count
## Length:8 Min. : 61
## Class :character 1st Qu.: 4674
## Mode :character Median : 13319
## Mean : 23479
## 3rd Qu.: 21042
## Max. :110393
We group some complaint types together, i.e. all noise issues and all tree issues, and we classify some minimally represented complaint types as “Other” to simplify the data. The data is right-skewed and unimodal. The median number of 311 service requests per complaint type is 13,319, and 50% of the data lies between 4,674 and 21,042 requests. “Homeless Person Assistance” falls below this IQR, while “Violation of Park Rules” falls just above it and “Maintenance or Facility” falls far above it. The mean is pulled up by all those “Maintenance or Facility” requests, so far, in fact, that it lies above the third quartile.
#horizontal bar chart of request counts per complaint type
plot_bars <- function(dat){
    ggplot(dat, aes(x = reorder(Complaint_Type, Request_Count),
                    y = Request_Count, fill = Complaint_Type)) +
        geom_bar(stat = "identity", show.legend = FALSE) +
        labs(x = "Complaint Type", y = "Requests") +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_blank(),
              axis.line = element_line(colour = "black")) +
        coord_flip()
}
p <- plot_bars(by_complaint_type)
p + scale_fill_brewer(palette = "PRGn")
First, we look at the 10 parks with the largest volume of 311 service requests (across all years).
#keep only the date portion of each timestamp, then parse as Date
nyc_parks_311_df$Created_Date <- gsub(" .*$", "", nyc_parks_311_df$Created_Date)
nyc_parks_311_df$Created_Date <- as.Date(nyc_parks_311_df$Created_Date,
                                         tryFormats = "%m/%d/%Y")
nyc_parks_311_df$Closed_Date <- gsub(" .*$", "", nyc_parks_311_df$Closed_Date)
nyc_parks_311_df$Closed_Date <- as.Date(nyc_parks_311_df$Closed_Date,
                                        tryFormats = "%m/%d/%Y")
nyc_parks_311_df$Resolution_Action_Updated_Date <-
    gsub(" .*$", "", nyc_parks_311_df$Resolution_Action_Updated_Date)
nyc_parks_311_df$Resolution_Action_Updated_Date <-
    as.Date(nyc_parks_311_df$Resolution_Action_Updated_Date,
            tryFormats = "%m/%d/%Y")
nyc_parks_311_df <- nyc_parks_311_df %>%
    mutate(Request_Year = lubridate::year(Created_Date))
nyc_parks_311_df_summary <- nyc_parks_311_df %>%
    group_by(Park_Facility_Name, Request_Year) %>%
    summarize(Request_Count = n())
top_10_parks_by_request_count <- nyc_parks_311_df_summary %>%
    group_by(Park_Facility_Name) %>%
    summarize(Total_Requests_for_Park = sum(Request_Count)) %>%
    arrange(desc(Total_Requests_for_Park)) %>%
    top_n(10)
kable(top_10_parks_by_request_count, format = "simple")
| Park_Facility_Name | Total_Requests_for_Park |
|---|---|
| Central Park | 8567 |
| Prospect Park | 4117 |
| Riverside Park | 3628 |
| Flushing Meadows Corona Park | 2983 |
| Forest Park | 2075 |
| Franz Sigel Park | 1725 |
| Tompkins Square Park | 1688 |
| Washington Square Park | 1548 |
| Van Cortlandt Park | 1523 |
| Sunset Park | 1435 |
Now we can visualize how the 311 service request volume has changed over time for all 10 parks together:
#stacked bar chart of yearly request counts, colored by park
plot_stacked_bars <- function(dat){
    ggplot(dat, aes(x = Request_Year, y = Request_Count,
                    fill = Park_Facility_Name)) +
        geom_bar(stat = "identity") +
        labs(x = "Year", y = "Requests") +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_blank())
}
top_10_parks_names <- unique(top_10_parks_by_request_count$Park_Facility_Name)
top_10_parks_changes_over_time <- nyc_parks_311_df_summary %>%
    filter(Park_Facility_Name %in% top_10_parks_names)
p <- plot_stacked_bars(top_10_parks_changes_over_time)
p + scale_fill_brewer(palette = "PRGn")
And we can also look more closely at how the 311 service request volume has changed over time for each of these 10 parks:
#the hex values of the PRGn palette, darkest purple to darkest green
PRGn <- list("#40004B", "#762A83", "#9970AB", "#C2A5CF", "#E7D4E8",
             "#F7F7F7", "#D9F0D3", "#A6DBA0", "#5AAE61", "#1B7837",
             "#00441B")
#line chart of yearly request counts for a single park
plot_lines <- function(dat){
    ggplot(dat, aes(x = Request_Year, y = Request_Count)) +
        geom_line(color = PRGn[[11]]) +
        geom_point(color = PRGn[[11]]) +
        ylim(0, 1200) +
        labs(x = "Year", y = "Requests") +
        theme(plot.margin = unit(c(1.5, 0.5, 0, 0), "lines"),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_blank(),
              axis.line = element_line(colour = "black"))
}
#one plot per park, arranged in a grid
by_park <- top_10_parks_changes_over_time %>%
    group_nest(Park_Facility_Name)
by_park <- by_park %>%
    mutate(plot = map(data, plot_lines))
plot_grid(plotlist = by_park$plot, labels = by_park$Park_Facility_Name,
          label_colour = PRGn[[1]], nrow = 5, ncol = 2, label_x = 0.15,
          hjust = 0)
We can see that 311 service requests were already on the rise for these parks collectively prior to the onset of the COVID-19 pandemic, and each park individually saw a pandemic spike that continued the upward trend. One could separate the data into two time periods, pre-pandemic and pandemic-onward, for this analysis, but that is not presently part of the plan.
Lastly, we can look at summary statistics for park inspections.
parks_inspection_program_ratings_df <- parks_inspection_program_ratings_df[, -9]
parks_inspection_program_ratings_df$Date <-
    gsub(" .*$", "", parks_inspection_program_ratings_df$Date)
parks_inspection_program_ratings_df$Date <-
    as.Date(parks_inspection_program_ratings_df$Date, tryFormats = "%m/%d/%Y")
parks_inspection_program_ratings_df <- parks_inspection_program_ratings_df %>%
    mutate(Inspection_Year = lubridate::year(Date),
           Inspection_Month = lubridate::month(Date))
pip_ratings_summary_by_site_by_year <- parks_inspection_program_ratings_df %>%
    group_by(PropNum, Inspection_Year) %>%
    summarize(Inspection_Count = n())
cols <- c("Prop.ID", "Boro", "Prop.Name", "Category")
pip_ratings_summary_by_site <- pip_ratings_summary_by_site_by_year %>%
    group_by(PropNum) %>%
    summarize(Total_Inspections = sum(Inspection_Count))
pip_ratings_summary_by_site <- pip_ratings_summary_by_site %>%
    left_join(parks_inspection_program_sites_df %>% select(all_of(cols)) %>%
                  rename(PropNum = Prop.ID), by = "PropNum") %>%
    arrange(desc(Total_Inspections))
summary(pip_ratings_summary_by_site[, 1:2])
## PropNum Total_Inspections
## Length:3071 Min. : 1.0
## Class :character 1st Qu.: 21.0
## Mode :character Median : 28.0
## Mean : 42.2
## 3rd Qu.: 49.0
## Max. :2900.0
The data is right-skewed and unimodal. The median number of times a park has been inspected is 28, and most parks have been inspected between 21 and 49 times. Several parks that have been inspected much more frequently pull the mean up to 42.2. Below are the 10 most inspected parks:
top_10_parks_by_total_inspections <- pip_ratings_summary_by_site %>%
    top_n(10, Total_Inspections)
kable(top_10_parks_by_total_inspections, format = "simple")
| PropNum | Total_Inspections | Boro | Prop.Name | Category |
|---|---|---|---|---|
| M010 | 2900 | M | Central Park | Large Park |
| B073 | 1647 | B | Prospect Park | Large Park |
| M071 | 1640 | M | Riverside Park | Large Park |
| Q099 | 1539 | Q | Flushing Meadows Corona Park | Large Park |
| X039 | 1324 | X | Pelham Bay Park | Large Park |
| X092 | 1263 | X | Van Cortlandt Park | Large Park |
| Q015 | 1143 | Q | Forest Park | Large Park |
| X010 | 930 | X | Crotona Park | Large Park |
| M037 | 778 | M | Highbridge Park | Large Park |
| R065 | 705 | R | Richmond Parkway | Large Park |
It should be beneficial to our analysis that the large parks both generate the most 311 service requests and are the most frequently inspected. We will have more data with which to measure effects for these parks than if we tried to look at all parks (and/or everything else the Parks Inspection Program rates).
Source: PIP’s data dictionary.