Load the Data on NYC Rodent Inspections into a data frame.
library(TeachingDemos)
library(dplyr)
library(readr)
rod.inspection = read_csv("C:/Users/Viviane Weinstabl/Documents/Rodent_Inspection.csv")
myname = "Viviane Weinstabl"
set.seed(char2seed(myname))
rod.inspection = sample_frac(rod.inspection, .8)
rod.inspection = rod.inspection[, sample(1:ncol(rod.inspection))]
a. Line chart illustrating, for each of the five NYC boroughs, how rat sightings have been changing from month to month over the most recent five years.
#cleaning up the date
library(lubridate)
rod.inspection$date <- parse_date_time(rod.inspection$INSPECTION_DATE,
"%m/%d/%y %H:%M%:S %p")
library(zoo)
rod.inspection$Sighting_Date <- as.yearmon(rod.inspection$date)
temp <- rod.inspection[order(rod.inspection$date, decreasing = TRUE), "date"]
lastdate <- temp[1, ]
as.data.frame(lastdate)
## date
## 1 2015-11-30 11:35:00
#last date in data set was found to be 2015-11-30 11:35:00, so I will be using data
# that goes back 5 years from that date for inspections resulting in Active Rat Signs.
five <- rod.inspection[which(rod.inspection$date>="2010-11-30 11:35:00"), ]
activefive <- five[which(five$RESULT=="Active Rat Signs"), ]
#tally up the sightings by date and borough
library(dplyr)
p1a <- activefive %>% group_by(Sighting_Date, BOROUGH) %>% tally()
#plot
library(plotly)
plot_ly() %>%
add_trace(p1a,
x = ~ p1a$Sighting_Date,
y = ~ p1a$n,
type = 'scatter',
color = ~ p1a$BOROUGH,
mode = 'lines',
hoverinfo = "text",
text = paste(p1a$n, 'Sightings in', p1a$Sighting_Date)) %>%
layout(title = 'Rat Sightings in NYC',
xaxis = list(title = "Sighting Date"),
yaxis = list(title = "Count"), autosize = F, width = 800, height = 500)
b. In general, have rat sightings been increasing, decreasing, or remained steady over this five year window? Rat sightings have remained relatively stable as a whole and for all boroughs individually over the past 5 years of this data.
c. Are there seasonal trends in rat sightings? Are there more rat sightings in some months than others? On average, rat sightings are much lower in the winter months as seen in the big slumps around Nov-Jan and higher in spring-summer months, around March-July.
d. Plot the “efficiency” of rat inspections (number of inspections yielding “Active Rat Signs” in that month divided by the total number of inspections that take place in that month)
#determine which inspections resulted in rat signs
five$rat <- five$RESULT=="Active Rat Signs"
#tally up how many were positive and how many negative,
#then find the percentage for each positive/negative result of all inspections
p1d <- five %>% group_by(Sighting_Date, BOROUGH, rat) %>% tally() %>%
group_by(Sighting_Date, BOROUGH) %>%
mutate(Efficiency = round(((100*n)/sum(n)),2))
p1d <- p1d[which(p1d$rat==TRUE), c(1,2,5)]
#plot
plot_ly() %>%
add_trace(p1d,
x = ~ p1d$Sighting_Date,
y = ~ p1d$Efficiency,
type = 'scatter',
color = ~ p1d$BOROUGH,
mode = 'lines',
hoverinfo = "text",
text = paste(p1d$Efficiency, '% of Total Inspections in', p1d$Sighting_Date)) %>%
layout(title = 'Efficiency of Inspections in NYC in Discovering Active Rat Signs',
xaxis = list(title = "Sighting Date"),
yaxis = list(title = "Efficiency (%)"), autosize = F, width = 800, height = 500)
e. List of the top ten zip codes with the largest number of inspections yielding active rat signs.
#assumption: still using the same 5 year interval.
zips <- activefive[which(activefive$ZIP_CODE > 10000), ] %>%
group_by(ZIP_CODE) %>% tally()
zips[order(zips$n, decreasing = TRUE), ] %>% head(10) %>% as.data.frame()
## ZIP_CODE n
## 1 10457 3185
## 2 10458 2936
## 3 10456 2332
## 4 10468 1911
## 5 11221 1806
## 6 10453 1775
## 7 10452 1345
## 8 11237 1263
## 9 10467 1199
## 10 11216 1054
a. Chart of phone calls about rat/vermin sightings in the weeks before and after Hurricane Sandy. The data show a marked increase in daily calls about rodent sightings. Had telephone lines not been down shortly after the Hurricane, the number of phone call may have been even more.
sandy = read_csv("C:/Users/Viviane Weinstabl/Documents/sandyrelated.csv")
library(tidyr)
library(ggplot2)
sandy$pdate <- parse_date_time(sandy$`Created Date`, "%m/%d/%y %H:%M%")
sandy <- separate(sandy, pdate, into = c("date", "time"), sep=" ")
sandy$rodent <- (sandy$`Complaint Type`=="Rodent")
p2a <- sandy %>% group_by(date) %>% summarise(RodentCalls=sum(rodent))
ggplot(p2a) + geom_col(data= p2a, aes(x=date, y=RodentCalls), fill="#E69F00") +
ggtitle("Rodent Complaint Calls to 311 Following Hurricane Sandy") +
xlab("Date") + ylab("Number of Rodent Complaints") +
theme(axis.text.x = element_text(angle = 60, hjust = 1),
plot.title = element_text(face='bold', hjust=.5, vjust=0.5)) +
geom_vline(xintercept = 2, linetype="dashed", color= "#56B4E9") +
geom_text(x=2, y=25, label = "Hurricane \nSandy", color = "#56B4E9")
b. Correlation between other complaint types and Rodent complaints across zip codes.
#identify the top 15 non-rodent complaings
topcomp <- sandy %>% group_by(`Complaint Type`) %>% tally()
topcomp <- topcomp[order(topcomp$n, decreasing = TRUE), -2] %>% head(15)
#complaint counts by zipcode include Rodent Complaints
complaints <- rbind(topcomp, "Rodent")
compzip <- sandy[,c("Complaint Type", "Incident Zip")]
compzip <- compzip[which(compzip$`Incident Zip`>=10000), ] %>%
group_by(`Incident Zip`, `Complaint Type`) %>% tally()
compcount <- inner_join(compzip, complaints,
by = c("Complaint Type"="Complaint Type"))
compcount <- compcount[, c(2,1,3)]
#convert data to zipcode rows and complaint columns and fill in non-values with 0
library(tidyr)
compcorr <- compcount %>% spread(`Complaint Type`, n, fill = 0,
sep = NULL, drop = FALSE)
#remove zipcode and use the counts to compute the correlation
corrs <- compcorr[,-1] %>% cor() %>% round(2)
#I've included a corrplot but for a visual representation.
library(corrplot)
corrplot(corrs, method = "color", type="lower",
tl.srt = 45, tl.col = "black", order = "AOE", tl.cex=0.8)
Top 2 Complaints Based on Correlation with Rodent Complaints? ‘PLUMBING’ (correlation of .619) and ‘NONCONST’ (correlation of 0.681), followed by heating, paint-plaster, and general construction. I’ve included a chart to visually represent these correlations.
Investigating whether the restaurant violation database can be useful for predicting which property inspections are most likely to yield Active Rat Signs. Using a simple predictive model: ActiveRatSigns = log(RestaurantRatViolations + 1) + month + year + ??
Step 1. Restaurant inspection data set which, for each month-year-zip code combination, contains the total number of rodent-related inspection violations (relevant inspection codes for a restaurant rat violation are: 04L, 04K, and 08A).
rest.inspection <- read_csv("C:/Users/Viviane Weinstabl/Documents/dohmh.csv")
rest <- rest.inspection[which(rest.inspection$`VIOLATION CODE`== "04L" |
rest.inspection$`VIOLATION CODE`== "04K" |
rest.inspection$`VIOLATION CODE`== "08A"),
c("INSPECTION DATE", "ZIPCODE")]
rest$`INSPECTION DATE` <- parse_date_time(rest$`INSPECTION DATE`, "%m/%d/%y")
rest$`INSPECTION DATE` <- as.yearmon(rest$`INSPECTION DATE`)
rest <- rest[which(rest$ZIPCODE >= 10000), ] %>%
separate(`INSPECTION DATE`, into = c("month", "year"), sep=" ") %>%
group_by(ZIPCODE, month, year) %>%
tally()
names(rest) <- c("zip", "month", "year", "count")
Step 2. Join onto rodent inspection data so there is a row for each rodent inspection that also contains a measure of the number of restaurant violations for the zip code, month, and year in which the inspection occurred. Adding one to this restaurant violation measure before logging it accounts for zero values, which would otherwise yield the value negative Infinity.
rod <- five[which(five$ZIP_CODE >= 10000 & five$ZIP_CODE < 20000),] %>%
separate(Sighting_Date, into = c("month", "year"), sep=" ")
#Step 3: Converting the column created in 1d, to a binary variable
# for inspections yielding Active Rat Signs
rod$rat <- as.numeric(rod$rat)
#joining rodent & restaurant inspection data,
# creating the RestaurantRatViolations variable
rodrest <- left_join(rod, rest, by = c("ZIP_CODE" = "zip", "month" = "month",
"year" = "year")) %>%
rename(RestaurantRatViolations = count)
library(magrittr)
rodrest$RestaurantRatViolations <- rodrest$RestaurantRatViolations %>%
replace_na(0) %>% add(1) %>% log()
Step 4. Logistic regression to test whether the estimated coefficient on the restaurant sightings variable has a statistically significant relationship with whether or not an inspection yields Active Rat Signs. Month and year are included in the regression as factor (dummy) variables.
model = glm(data = rodrest, rat ~ RestaurantRatViolations +
month + year, family=binomial)
summary(model)
##
## Call:
## glm(formula = rat ~ RestaurantRatViolations + month + year, family = binomial,
## data = rodrest)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5827 -0.5185 -0.4944 -0.4553 2.2557
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.889436 0.049136 -38.453 < 2e-16 ***
## RestaurantRatViolations 0.064680 0.006801 9.511 < 2e-16 ***
## monthAug 0.174645 0.021953 7.955 1.79e-15 ***
## monthDec -0.069880 0.026089 -2.679 0.007394 **
## monthFeb 0.063555 0.024880 2.554 0.010634 *
## monthJan 0.228212 0.025395 8.986 < 2e-16 ***
## monthJul 0.243214 0.021616 11.251 < 2e-16 ***
## monthJun 0.119566 0.021924 5.454 4.93e-08 ***
## monthMar 0.127319 0.020987 6.067 1.31e-09 ***
## monthMay 0.050379 0.021300 2.365 0.018019 *
## monthNov 0.103155 0.024227 4.258 2.06e-05 ***
## monthOct 0.221769 0.021802 10.172 < 2e-16 ***
## monthSep 0.080635 0.022480 3.587 0.000335 ***
## year2011 -0.255332 0.047580 -5.366 8.04e-08 ***
## year2012 -0.502900 0.048440 -10.382 < 2e-16 ***
## year2013 -0.327777 0.050824 -6.449 1.12e-10 ***
## year2014 -0.349920 0.050841 -6.883 5.88e-12 ***
## year2015 -0.300945 0.052124 -5.774 7.76e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 299496 on 416080 degrees of freedom
## Residual deviance: 298551 on 416063 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 298587
##
## Number of Fisher Scoring iterations: 4
Does this evidence indicate that there is a statistically significant relationship between the restaurant inspection based measures and whether a rodent inspection yields Active Rat Signs? Yes. The z-value is relatively high at 9.5 and the p-value is far below 0.05 indicating that the RestaurantRatViolations indicator is very significant in predicting whether an inspection in a given zipcode will result in an active rat sighting. The p-value being much smaller than 0.05 demonstrates that the estimate is significantly different to being 0. With an estimate value of 0.06468, (indicating the partial effect of log(RestaurantRatViolations+1) on the log odds of Y=1, having adjusted for the effect of all other Xs on Y) for a logged value, we can see a positive estimated coefficient in predicting whether an inspection will lead to Active Rat Signs.
In the past few years, a lot of important reseach has been done on neighbourhood gentrification, even analyzing the rise in 311 calls (usually white people (dubbed “Apartment Patty”, “BBQ Becky” and the likes)) in a series of racialized and consequential complaints). Great data analysis of such calls and linking them to gentrification, include ones by: Buzzfeed, NYU and Science vs.. Analysis linking 311 calls about rodents to gentrification does not yet exist, although this linkage is acknowledged (even by Pest Control Agencies like M&M Pest Control and Robert Sullivan, author of “Rats: Observations on the History & Habitat of the City’s Most Unwanted Inhabitants,” stating that maps with rat-infested areas mostly indicate, “where the pressures of gentrification are at the moment most extreme.”)
An example of similar data sets from coredata.nyc being analyzed for gentrification as well as explaining the process of extraction can be found in this student’s project. For each of the above data sections I would calculate the 10-year Precent Change, join each of them together onto a data set that pairs zip-codes to sub-boroughs and census tracts and then left join all of these indicators as columns to (a greater portion of) the 311 Data for Rodent Calls, to determine correlations with number of Rodent Calls (similar to Part 3 Step 4). In addition to correlations, maps visualising the density of complaints as well as the gentrification data could help demonstrate this analysis.
This project was done for Prasanna Tambe’s OIDD 245 Course: Analytics & the Digital Economy.