Load Required Packages:

Below, we load the packages required for data analysis and visualization.

library(plyr)
library(tidyverse)
library(knitr)
library(DT)
library(RColorBrewer)

Load Data:

We load the data we collected from NYC OpenData regarding park-related 311 service requests and Parks Inspection Program ratings. We limit our analysis to the years 2010-2022, the period during which the two data sets overlap.

base1 <- "https://raw.githubusercontent.com/geedoubledee/"
base2 <- "data606_final_project/main/nyc_parks_311_"
my_urls <- paste0(base1, base2, 0:6, ".csv")
cols <- c("Unique_Key", "Created_Date", "Closed_Date", "Agency_Name",
          "Complaint_Type", "Descriptor", "Incident_Zip", "Status",
          "Resolution_Description", "Resolution_Action_Updated_Date",
          "Borough", "Open_Data_Channel_Type", "Park_Facility_Name",
          "Park_Borough", "Location")
nyc_parks_311_df <- as.data.frame(matrix(NA, nrow = 0, ncol = length(cols)))
colnames(nyc_parks_311_df) <- cols
for (i in seq_along(my_urls)){
    partial_df <- read.csv(file = my_urls[i], header = FALSE)
    colnames(partial_df) <- cols
    nyc_parks_311_df <- rbind(nyc_parks_311_df, partial_df)
}
nyc_parks_311_df <- nyc_parks_311_df %>%
    group_by(across(everything())) %>%
    filter(n() == 1) #keep only rows that appear exactly once across all columns
nyc_parks_311_df <- nyc_parks_311_df[-103464, ] #remove overlooked duplicate
my_url <- "https://raw.githubusercontent.com/geedoubledee/data606_final_project/main/Parks_Inspection_Program_Sites.csv"
parks_inspection_program_sites_df <- read.csv(my_url)
subset_parks_sites_df <- parks_inspection_program_sites_df %>%
    filter(Category == "Large Park" | Category == "Small Park")
subset_parks_names <- unique(subset_parks_sites_df$Prop.Name)
subset_parks_nums <- unique(subset_parks_sites_df$PropNum)
subset_parks_ids <- unique(subset_parks_sites_df$Prop.ID)
my_url2 <- "https://raw.githubusercontent.com/geedoubledee/data606_final_project/main/Parks_Inspection_Program_Inspections.csv"
parks_inspection_program_ratings_df <- read.csv(my_url2)
parks_inspection_program_ratings_df <- parks_inspection_program_ratings_df %>%
    mutate(PropNum = gsub("-.*$", "", Prop.ID))
subset_parks_ratings_df <- parks_inspection_program_ratings_df %>%
    filter(Prop.ID %in% subset_parks_ids)

Research Question:

The research question we will be attempting to answer is:

Can a park’s monthly 311 service request volume predict a decrease in its overall/cleanliness rating assigned by the Parks Inspection Program?

Data Preparation:

We fix several date columns in our data frames since we’ll be analyzing 311 call volume and rating downgrades by month.

#helper to convert "m/d/Y H:M" timestamps to Date objects
parse_date_col <- function(x){
    lubridate::mdy(format(as.POSIXct(x, format = "%m/%d/%Y %H:%M"),
                          format = "%m/%d/%Y"))
}
nyc_parks_311_df$Created_Date <- parse_date_col(nyc_parks_311_df$Created_Date)
nyc_parks_311_df$Closed_Date <- parse_date_col(nyc_parks_311_df$Closed_Date)
nyc_parks_311_df$Resolution_Action_Updated_Date <- parse_date_col(
    nyc_parks_311_df$Resolution_Action_Updated_Date)
nyc_parks_311_df <- nyc_parks_311_df %>%
    mutate(Created_Month = lubridate::month(Created_Date),
           Created_Year = lubridate::year(Created_Date))
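
With the dates parsed, we can quickly check which years the 311 data covers (later steps restrict counts and plots to years before 2023):

range(nyc_parks_311_df$Created_Year, na.rm = TRUE)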

subset_parks_ratings_df$Date <- lubridate::mdy(format(as.POSIXct(
    subset_parks_ratings_df$Date, format = "%m/%d/%Y %I:%M:%S AM"),
    format = "%m/%d/%Y"))
remove <- c("Inspection.Year")
subset_parks_ratings_df <- subset_parks_ratings_df[, !colnames(
    subset_parks_ratings_df) %in% remove]
subset_parks_ratings_df <- subset_parks_ratings_df %>%
    mutate(Inspection_Month = lubridate::month(Date),
           Inspection_Year = lubridate::year(Date)) %>%
    filter(Inspection_Year >= 2010)

We also fix missing NYC borough values in the 311 calls data by matching incident zip codes to the boroughs those zip codes cover.

my_url3 <- "https://raw.githubusercontent.com/geedoubledee/data606_final_project/main/zip_codes_to_boroughs.csv"
zip_codes_to_boroughs <- read.csv(my_url3)
zip_codes_to_boroughs$Zip_Code <- as.character(zip_codes_to_boroughs$Zip_Code)
nyc_parks_311_df <- nyc_parks_311_df %>%
    left_join(zip_codes_to_boroughs, by = join_by(Incident_Zip == Zip_Code))
nyc_parks_311_df$Park_Borough <- na_if(nyc_parks_311_df$Park_Borough, "Unspecified")
nyc_parks_311_df <- nyc_parks_311_df %>%
    mutate(Park_Borough = coalesce(Park_Borough, Borough.y))
remove <- c("Borough.x", "Borough.y")
nyc_parks_311_df <- nyc_parks_311_df[, !colnames(nyc_parks_311_df) %in% remove]

General Information:

Park-related 311 calls vary by borough and also over time:

by_borough <- nyc_parks_311_df %>%
    filter(!is.na(Park_Borough)) %>%
    filter(Created_Year < 2023) %>%
    group_by(Park_Borough) %>%
    summarize(Request_Count = n()) %>%
    arrange(desc(Request_Count))
plot_bars <- function(dat){
    ggplot(dat, aes(x = reorder(Park_Borough, Request_Count),
                    y = Request_Count, fill = Park_Borough)) +
    geom_bar(stat = "identity", show.legend = FALSE) +
    labs(x = "Borough", y = "Requests") + 
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank(),
          axis.line = element_line(colour = "black")) +
    geom_text(aes(label = Request_Count), size = 4, hjust = 1.5) + #use dat, not the global by_borough
    coord_flip()
}
p <- plot_bars(by_borough)
p + scale_fill_brewer(palette = "PRGn")

plot_stacked_bars <- function(dat){
    ggplot(dat, aes(x = Created_Year, y = Request_Count,
                    fill = Park_Borough)) +
    geom_bar(stat = "identity") +
    labs(x = "Year", y = "Requests") + 
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank())
}
by_year <- nyc_parks_311_df %>%
    filter(Created_Year < 2023) %>%
    filter(!is.na(Park_Borough)) %>%
    group_by(Park_Borough, Created_Year) %>%
    summarize(Request_Count = n()) %>%
    arrange(Created_Year)
p <- plot_stacked_bars(by_year)
p + scale_fill_brewer(palette = "PRGn")

Later we will also see how monthly park-related 311 calls vary by Category (i.e. “Large Park” vs. “Small Park”).

Creating the Numerical Calls Variable:

To perform our analysis, first we count those monthly 311 calls per park.

nyc_parks_311_df_summary <- nyc_parks_311_df %>%
    group_by(Park_Facility_Name, Created_Year, Created_Month) %>%
    summarize(Calls = n())

Gathering Site Info for Parks that Were the Subject of 311 Calls:

For all the months in which a park was the subject of 311 calls, we then pull those parks’ site information by matching their names in the 311 data to the names in the Parks Inspection Program sites data.
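
To make the matching more robust, park names are normalized before joining: punctuation is stripped, doubled spaces are collapsed, and everything is lower-cased. For example (using a hypothetical park name):

example_name <- "St. Mary's Park" #hypothetical name for illustration
example_key <- str_to_lower(str_replace_all(example_name, "[[:punct:]]", ""))
example_key <- str_replace_all(example_key, "  ", " ")
example_key
## [1] "st marys park"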

remove <- c("AMPSDistrict", "Prop.Location", "Site.Location", "Sub.Category", "Rated", "Reason.Not.Rated", "ZipCode")
subset_parks_sites_df <- subset_parks_sites_df[, !colnames(
    subset_parks_sites_df) %in% remove]
subset_parks_sites_df$Name_Match1 <- str_to_lower(
    str_replace_all(subset_parks_sites_df$Prop.Name, "[[:punct:]]", ""))
subset_parks_sites_df$Name_Match1 <- str_replace_all(
    subset_parks_sites_df$Name_Match1, "  ", " ")
subset_parks_sites_df$Name_Match2 <- str_to_lower(
    str_replace_all(subset_parks_sites_df$Site.Name, "[[:punct:]]", ""))
subset_parks_sites_df$Name_Match2 <- str_replace_all(
    subset_parks_sites_df$Name_Match2, "  ", " ")
nyc_parks_311_df_summary$Name_Match <- str_to_lower(
    str_replace_all(nyc_parks_311_df_summary$Park_Facility_Name, "[[:punct:]]", ""))
nyc_parks_311_df_summary$Name_Match <- str_replace_all(
    nyc_parks_311_df_summary$Name_Match, "  ", " ")
nyc_parks_311_df_summary <- nyc_parks_311_df_summary %>%
    left_join(subset_parks_sites_df, by = join_by(Name_Match == Name_Match1),
              multiple = "first")
nyc_parks_311_df_summary$Prop.ID <- NA
remove <- c("Prop.Name", "Site.Name", "Name_Match2")
nyc_parks_311_df_summary <- nyc_parks_311_df_summary[, !colnames(
    nyc_parks_311_df_summary) %in% remove]
nyc_parks_311_df_summary <- nyc_parks_311_df_summary %>%
    left_join(subset_parks_sites_df, by = join_by(Name_Match == Name_Match2),
              multiple = "first")
remove <- c("Prop.Name", "Site.Name", "Name_Match1")
nyc_parks_311_df_summary <- nyc_parks_311_df_summary[, !colnames(
    nyc_parks_311_df_summary) %in% remove]
nyc_parks_311_df_summary <- nyc_parks_311_df_summary %>%
    mutate(Prop_Num = coalesce(PropNum.x, PropNum.y),
           Prop_ID = coalesce(Prop.ID.x, Prop.ID.y),
           Borough_Indicator = coalesce(Boro.x, Boro.y),
           Acres = coalesce(Acres.x, Acres.y),
           Category = coalesce(Category.x, Category.y))
remove <- c("Name_Match", "PropNum.x", "PropNum.y", "Prop.ID.x", "Prop.ID.y",
            "Boro.x", "Boro.y", "Acres.x", "Acres.y", "Category.x", "Category.y")
nyc_parks_311_df_summary <- nyc_parks_311_df_summary[, !colnames(
    nyc_parks_311_df_summary) %in% remove]

Converting the Parks Inspection Program Ratings to Numeric Variables:

Since the Parks Inspection Program ratings are “U” for Unacceptable and “A” for Acceptable, we transform those values into numerical values: 0 for Unacceptable and 1 for Acceptable. Each rating has one value for its overall condition and one value for its cleanliness condition.

remove <- c("AMPSDistrict", "Season", "Round", "BeginInspection", "EndInspection",
            "inspector", "inspector2", "Safety.Condition", "Structural.Condition",
            "Closed.", "Comments", "inspAddedDate")
subset_parks_ratings_df <- subset_parks_ratings_df[, !colnames(
    subset_parks_ratings_df) %in% remove]
subset_parks_ratings_df <- subset_parks_ratings_df %>%
    filter(Overall.Condition != "N" & Cleanliness != "N") %>%
    mutate(Overall.Condition = ifelse(Overall.Condition == "U", 0, 1),
           Cleanliness = ifelse(Cleanliness == "U", 0, 1))

To represent each park’s monthly ratings with a single value, we sum its overall and cleanliness ratings for the month and divide by the number of inspections it received that month. A score of 0 (all Unacceptable ratings) is the minimum, and a score of 2 (all Acceptable ratings) is the maximum.
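
Expressed as a formula, with the sums taken over all of a park’s inspections in a given month:

\[ \text{Cur\_Rating} = \frac{\sum \text{Overall.Condition} + \sum \text{Cleanliness}}{\text{Num\_Ratings}} \]

For example, a park inspected twice in one month with overall ratings of 1 and 0 and cleanliness ratings of 1 and 1 scores (1 + 0 + 1 + 1) / 2 = 1.5.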

subset_parks_ratings_df_summary <- subset_parks_ratings_df %>%
    group_by(PropNum, Inspection_Year, Inspection_Month) %>%
    summarize(Num_Ratings = n(),
              Cur_Rating = sum(Overall.Condition, Cleanliness) / Num_Ratings)

Merging the Calls and Ratings Data Together for Analysis:

Now that we have the monthly 311 calls, sites information, and monthly ratings, we merge them together to create a data frame suitable for analysis.

calls_and_ratings_analysis_df <- merge(x = nyc_parks_311_df_summary,
                                       y = subset_parks_ratings_df_summary,
                                       by.x = c("Created_Year", "Created_Month",
                                                "Prop_Num"),
                                       by.y = c("Inspection_Year",
                                                "Inspection_Month",
                                                "PropNum"))
remove <- c("Num_Ratings")
calls_and_ratings_analysis_df <- calls_and_ratings_analysis_df[, !colnames(
    calls_and_ratings_analysis_df) %in% remove]
cols <- c("Year", "Month", "Prop_Num", "Park_Facility_Name", "Calls", "Prop_ID",
          "Borough_Indicator", "Acres", "Category", "Cur_Rating")
colnames(calls_and_ratings_analysis_df) <- cols
reorder <- c("Prop_Num", "Prop_ID", "Park_Facility_Name", "Borough_Indicator",
             "Acres", "Category", "Year", "Month", "Calls", "Cur_Rating")
calls_and_ratings_analysis_df <- calls_and_ratings_analysis_df[, reorder]

Creating the Binary Categorical Downgrade Variable:

To determine whether a park received a downgrade in a given month in which there were 311 calls related to that park, we compare each month’s current rating to the park’s prior rating and record a 1 (true) when the current rating is less than the prior rating.

calls_and_ratings_analysis_df <- calls_and_ratings_analysis_df %>%
    arrange(Park_Facility_Name, Year, Month) %>%
    group_by(Park_Facility_Name) %>%
    mutate(Prior_Rating = lag(Cur_Rating),
           Downgrade = ifelse(Cur_Rating < Prior_Rating, 1, 0))
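
To see how lag() behaves here, consider a small hypothetical example: a park rated 2.0, then 1.5, then 1.5 in consecutive observed months has no prior rating in its first month, a downgrade in its second, and no downgrade in its third:

#hypothetical three-month example to illustrate the lag() logic
demo <- tibble(Cur_Rating = c(2.0, 1.5, 1.5)) %>%
    mutate(Prior_Rating = lag(Cur_Rating),
           Downgrade = ifelse(Cur_Rating < Prior_Rating, 1, 0))
demo$Downgrade #NA for the first month, then 1, then 0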

downgrade_perc <- 100 * round(sum(calls_and_ratings_analysis_df$Downgrade,
                            na.rm = TRUE) / 
                            nrow(calls_and_ratings_analysis_df), 2)

Calls and Ratings Data Summary:

We now turn several of our variables into factors to facilitate analysis and display the first few rows of the final data frame.

factor_cols <- c("Borough_Indicator", "Category", "Downgrade")
calls_and_ratings_analysis_df[factor_cols] <- lapply(
    calls_and_ratings_analysis_df[factor_cols], as.factor)
datatable(head(calls_and_ratings_analysis_df, n = 5),
          options = list(scrollX = TRUE))

There are a few summary statistics of note:

summary(calls_and_ratings_analysis_df$Calls)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   5.119   5.000 247.000

Each park-month in which a park was the subject of at least one 311 call is an observation here. For 75% of these observations, the number of 311 calls was 5 or fewer. There are many extreme outliers in the upper 25% of the data, with the maximum being 247 calls related to Franz Sigel Park in the Bronx in July 2020.
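
We can confirm the maximum directly; a quick check (note the data frame is still grouped by Park_Facility_Name from the previous step, so we ungroup first):

calls_and_ratings_analysis_df %>%
    ungroup() %>%
    filter(Calls == max(Calls)) %>%
    select(Park_Facility_Name, Borough_Indicator, Year, Month, Calls)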

summary(calls_and_ratings_analysis_df$Downgrade)
##    0    1 NA's 
## 9213 2544 1119

Downgrades are rare, representing only around 20% of the outcomes in our data frame. The NA values correspond to each park’s first observed month, which has no prior rating to compare against. Downgrades are split fairly evenly between Large Parks and Small Parks.
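
The downgrade_perc value we computed earlier expresses this share directly (it uses all rows, including the NA months, in its denominator):

downgrade_perc
## [1] 20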

plot_stacked_bars <- function(dat){
    ggplot(dat, aes(x = Year, y = Downgrade_Count,
                    fill = Category)) +
    geom_bar(stat = "identity") +
    labs(x = "Year", y = "Number of Downgrades") + 
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank())
}
downgrades <- calls_and_ratings_analysis_df %>%
    filter(Downgrade == 1) %>%
    group_by(Category, Year) %>%
    summarize(Downgrade_Count = n()) %>%
    arrange(Year)
p <- plot_stacked_bars(downgrades)
p + scale_fill_brewer(palette = "PRGn")

calls_vs_downgrade <- calls_and_ratings_analysis_df %>%
    filter(Downgrade == 1) %>%
    group_by(Category) %>%
    summarize(Median_Calls_Per_Downgrade = median(Calls))

The median number of calls per month associated with a downgrade was 5 for Large Parks and 2 for Small Parks. The data is strongly right-skewed, driven by outlier months with far higher call counts associated with downgrades.
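
A quick per-Category comparison of the median, mean, and maximum (a sketch using the same filter as above) illustrates this skew, since means far above medians indicate a long right tail:

calls_and_ratings_analysis_df %>%
    filter(Downgrade == 1) %>%
    group_by(Category) %>%
    summarize(Median_Calls = median(Calls),
              Mean_Calls = mean(Calls),
              Max_Calls = max(Calls))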

Calls and Ratings Data Analysis:

Fitting the Logistic Regression Model:

We fit a logistic regression model based solely on Calls as the predictor of Downgrade.

calls_and_ratings_analysis_df_filtered <- calls_and_ratings_analysis_df %>%
    filter(!is.na(Downgrade))
m_logit <- glm(Downgrade ~ Calls,
               data = calls_and_ratings_analysis_df_filtered,
               family = binomial(link = "logit"))
summary(m_logit)
## 
## Call:
## glm(formula = Downgrade ~ Calls, family = binomial(link = "logit"), 
##     data = calls_and_ratings_analysis_df_filtered)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.363608   0.025025  -54.49  < 2e-16 ***
## Calls        0.013207   0.001827    7.23 4.83e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12281  on 11756  degrees of freedom
## Residual deviance: 12229  on 11755  degrees of freedom
## AIC: 12233
## 
## Number of Fisher Scoring iterations: 4

The equation for the regression model, on the log-odds scale, is:

\[ \log\left(\frac{\hat{p}}{1-\hat{p}}\right) = -1.363608 + 0.013207x\, \text{where}\, x = \text{Calls} \]

Calls is a statistically significant predictor of Downgrade according to the very small p-value reported in the model output.
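
To put the slope on an interpretable scale, we can exponentiate the coefficients to obtain odds ratios and predict downgrade probabilities at a few illustrative call volumes (the volumes are our choices, not part of the original output):

exp(coef(m_logit)) #each additional call multiplies the odds of a downgrade by ~1.013
predict(m_logit, newdata = data.frame(Calls = c(1, 5, 50)), type = "response")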

Visualizing the Logistic Regression Model:

We visualize our actual data points against the predicted probabilities traced by the regression curve:

predicted <- data.frame(Calls = seq(
    min(calls_and_ratings_analysis_df_filtered$Calls),
    max(calls_and_ratings_analysis_df_filtered$Calls),
    len = 1000))
predicted$Downgrade <- predict(m_logit, predicted, type = "response")
ggplot(data = calls_and_ratings_analysis_df_filtered,
       aes(x = Calls, y = as.numeric(as.character(Downgrade)))) +
    geom_point() +
    geom_line(data = predicted, linewidth = 1) +
    labs(x = "Calls", y = "Downgrade Probability") + 
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank(),
          axis.line = element_line(colour = "black"))

The visualization suggests the model does not capture the actual relationship very well. We struggled to find an assessment method that would add more context to this disconnect, but we share more thoughts in the Conclusions section.
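
One simple assessment we could apply is McFadden’s pseudo-R-squared, which compares the model’s residual deviance to the null deviance; computed from the deviances reported above, it is tiny, which is consistent with the weak fit seen in the plot:

1 - m_logit$deviance / m_logit$null.deviance #roughly 1 - 12229/12281, about 0.004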

Improving the Logistic Regression Model:

We can improve the first model by adding other relevant variables to it and comparing the new model’s Akaike Information Criterion (AIC) to the previous model’s AIC. The variables we choose to add are Acres, Borough (“B” for Brooklyn, “M” for Manhattan, “Q” for Queens, “R” for Staten Island, and “X” for Bronx), and Category (i.e. “Large Park” vs. “Small Park”).

m_logit_update <- glm(Downgrade ~ Calls + Acres + Borough_Indicator + Category,
                      data = calls_and_ratings_analysis_df_filtered,
                      family = binomial(link = "logit"))
summary(m_logit_update)
## 
## Call:
## glm(formula = Downgrade ~ Calls + Acres + Borough_Indicator + 
##     Category, family = binomial(link = "logit"), data = calls_and_ratings_analysis_df_filtered)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.117439   0.065328 -17.105  < 2e-16 ***
## Calls               0.010116   0.001873   5.401 6.62e-08 ***
## Acres               0.005678   0.001429   3.973 7.11e-05 ***
## Borough_IndicatorM -0.143674   0.064115  -2.241   0.0250 *  
## Borough_IndicatorQ -0.293399   0.063855  -4.595 4.33e-06 ***
## Borough_IndicatorR -0.658232   0.117141  -5.619 1.92e-08 ***
## Borough_IndicatorX -0.134738   0.067318  -2.002   0.0453 *  
## CategorySmall Park -0.241556   0.055522  -4.351 1.36e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12281  on 11756  degrees of freedom
## Residual deviance: 12129  on 11749  degrees of freedom
## AIC: 12145
## 
## Number of Fisher Scoring iterations: 4

These additional variables reduced the AIC from 12233 to 12145, indicating the new model is better than the old one. However, we struggled to visualize the improved model with a single sigmoid regression curve; a single curve is likely not the most appropriate visualization for a logistic regression model with multiple predictors.
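
The AIC comparison can be read directly from the two model objects, and one common workaround for visualizing a multi-predictor logistic model is an effect plot: hold the other predictors at representative values and trace predicted probability against Calls. Below is a sketch; the representative values (median Acres, Brooklyn as the borough) are our illustrative choices:

AIC(m_logit, m_logit_update) #lower AIC indicates the better model

#predicted probability vs. Calls, holding other predictors at representative values
effect_grid <- expand.grid(
    Calls = seq(1, 250, len = 100),
    Acres = median(calls_and_ratings_analysis_df_filtered$Acres, na.rm = TRUE),
    Borough_Indicator = "B", #Brooklyn, chosen as a reference borough
    Category = c("Large Park", "Small Park"))
effect_grid$Prob <- predict(m_logit_update, effect_grid, type = "response")
ggplot(effect_grid, aes(x = Calls, y = Prob, color = Category)) +
    geom_line(linewidth = 1) +
    labs(x = "Calls", y = "Predicted Downgrade Probability") +
    scale_color_brewer(palette = "PRGn") +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank(),
          axis.line = element_line(colour = "black"))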

Conclusions:

Although the model indicates that Calls, Acres, Category, and Borough are all statistically significant predictors of Downgrade, we don’t believe that the logistic regression model we developed was a good fit.

Visualizing the model just based on Calls, it doesn’t appear that the model captures the relationship very well.

If the relationship were stronger, decreases in ratings were a more common outcome, or we had found a reasonable way to deal with the outliers, the logistic regression model might have been a better fit.