Rats HW OIDD 2450
Install Packages
install.packages(“TeachingDemos”) install.packages(“readr”) install.packages(“tidyverse”) install.packages(“lubridate”) install.packages(“readr”) install.packages(“ggplot2”) install.packages(“rlang”) install.packages(“htmltools”) install.packages(“htmlwidgets”) install.packages(“shiny”)
library(TeachingDemos)
library(lubridate)
library(readr)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(rlang)
library(htmltools)
library(htmlwidgets)
library(shiny)
rod.inspection = read_csv("rodent_inspection.csv")
Rows: 939924 Columns: 20── Column specification ───────────────────────────────
Delimiter: ","
chr (11): INSPECTION_TYPE, JOB_ID, BLOCK, LOT, HOUS...
dbl (9): JOB_TICKET_OR_WORK_ORDER_ID, JOB_PROGRESS...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
myname = "Mattison Boveri"
# Then run the following lines
set.seed(char2seed(myname))
rod.inspection = sample_frac(rod.inspection, .8)
rod.inspection = rod.inspection[, sample(1:ncol(rod.inspection))]
# Part 1, A
# Fix the date format
rod.inspection$INSPECTION_DATE <- as.POSIXct(rod.inspection$INSPECTION_DATE, format = "%m/%d/%Y %I:%M:%S %p")
# Make RESULTS column numeric
rod.inspection$RESULT <- ifelse(rod.inspection$RESULT == "Active Rat Signs", 1, 0)
# Collect data from 2011 - 2016
rod.inspection <- rod.inspection %>%
mutate(INSPECTION_DATE = if_else(INSPECTION_DATE >= as.Date("2011-01-01"), INSPECTION_DATE, NA))
# Delete data from before 2011
rod.inspection <- rod.inspection[complete.cases(rod.inspection$INSPECTION_DATE), ]
# Format data
rod.inspection <- rod.inspection %>%
mutate(month_year = format(ymd_hms(INSPECTION_DATE), "%Y-%m"),
year = year(ymd_hms(INSPECTION_DATE)),
month = month(ymd_hms(INSPECTION_DATE), label = TRUE))
# Aggregate data by month and year for each borough
rod.agg <- rod.inspection %>%
group_by(BOROUGH, month_year, year, month) %>%
summarise(active_rat_signs = sum(RESULT))
`summarise()` has grouped output by 'BOROUGH', 'month_year', 'year'. You can override using the `.groups` argument.
# Create line chart
graph = ggplot(data = rod.agg, aes(x = month_year, y = active_rat_signs, color = BOROUGH, group = BOROUGH)) +
theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
theme(plot.margin = unit(c(1, 0.5, 1, 1), "cm")) +
geom_line() +
labs(x = "Month and Year",
y = "Number of Active Rate Signs",
title = "Monthly Trend of Rat Sightings by Borough",
color = "Borough")
graph
Rat sightings over the past 5 years have remained the same.
Rat sightings normally decrease during the winter months from December until February and the rat sightings increase again starting in March until June.
# Part 1, D
# Find the Efficiency
efficiency <- rod.inspection %>%
group_by(BOROUGH, month_year) %>%
summarize(rod.inspection = n(),
active_rat_signs = sum(RESULT),
Efficiency = active_rat_signs / rod.inspection)
`summarise()` has grouped output by 'BOROUGH'. You can override using the `.groups` argument.
# Graph the Efficiency of rat inspections
graph2 = ggplot(data = efficiency, aes(x = month_year, y = Efficiency, color = BOROUGH, group = BOROUGH)) +
theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
theme(plot.margin = unit(c(1, 0.5, 1, 1), "cm")) +
geom_line() +
labs(x = "Month and Year",
y = "Efficiency",
title = "Efficiency of Rat Inspections by Borough",
color = "Borough")
graph2
# Part 1, E
# Find the top 10 zipcodes
efficiency_zipcode <- rod.inspection %>%
group_by(BOROUGH, month_year, ZIP_CODE) %>%
summarize(rod.inspection = n(),
active_rat_signs = sum(RESULT),
Efficiency = active_rat_signs / rod.inspection)
`summarise()` has grouped output by 'BOROUGH', 'month_year'. You can override using the `.groups` argument.
top_10_zipcodes <- efficiency_zipcode %>%
arrange(desc(Efficiency)) %>%
head(10) %>%
select(ZIP_CODE)
Adding missing grouping variables: `BOROUGH`, `month_year`
# View the top 10 zip codes in descending order by efficiency
print(top_10_zipcodes)
# Part 2, A
complaints <- read.csv("sandyrelated.csv")
# Filter to find all Rodent complaints
rodent_complaints <- complaints %>%
filter(Complaint.Type == "Rodent")
# Create a new column to differentiate between complaints before and after sandy
rodent_complaints <- rodent_complaints %>%
mutate(before_sandy = if_else(Created.Date < "10/29/12", 1, 0))
# Find a numeric value for before and after sandy
complaints_before <- sum(rodent_complaints$before_sandy)
complaints_after <- nrow(rodent_complaints) - complaints_before
# Create a vector with the two variables
complaints <- c(complaints_before, complaints_after)
# Create a barplot with the vector as input
barplot(complaints,
names.arg = c("Before Sandy", "After Sandy"),
xlab = "Time Period",
ylab = "Number of Complaints",
main = "Rodent Complaints Before and After Hurricane Sandy")
The data suggests that Hurricane Sandy led to an increase in rodent sightings on the island because there were more rodent complaints in the weeks following the hurricane.
# Part 2, B
complaints <- read.csv("sandyrelated.csv")
# Find all non-rodent complaints
non_rodent_complaints <- complaints %>%
filter(Complaint.Type != "Rodent")
# Group by complaint type and count number of complaints
complaint_counts <- non_rodent_complaints %>%
group_by(Complaint.Type) %>%
summarize(Count = n()) %>%
ungroup()
# Sort by count and select top 15 complaint types
top_complaint_types <- complaint_counts %>%
arrange(desc(Count)) %>%
head(15)
# View the top 15 complaint types and their counts
top_complaint_types
# Create a new data frame with the top 15 non-rodent complaints and rodent complaints
top_complaints <- c(top_complaint_types$Complaint.Type, "Rodent")
complaints_subset <- complaints %>%
filter(Complaint.Type %in% top_complaints)
# Group by complaint type and zip code and count number of complaints
complaint_counts <- complaints_subset %>%
group_by(Incident.Zip, Complaint.Type) %>%
summarize(Count = n()) %>%
ungroup()
`summarise()` has grouped output by 'Incident.Zip'. You can override using the `.groups` argument.
# Get rid of rows without zipcodes
complaint_counts <- complaint_counts %>%
filter(as.numeric(Incident.Zip) >= 1000)
Warning: There was 1 warning in `filter()`.
ℹ In argument: `as.numeric(Incident.Zip) >= 1000`.
Caused by warning:
! NAs introduced by coercion
# View the resulting data frame
head(complaint_counts)
complaint_counts_wide <- complaint_counts %>%
pivot_wider(names_from = Complaint.Type, values_from = Count, values_fill = 0)
# Get rid of the Incident Zip column
complaint_counts_wide_nozip <- complaint_counts_wide %>%
select(-Incident.Zip)
# Find the correlations
complaint_correlations <- cor(complaint_counts_wide_nozip)
rodent_correlations <- complaint_correlations["Rodent", ]
rodent_correlations
Consumer Complaint
0.11494767
Damaged Tree
-0.10293178
ELECTRIC
0.40108714
GENERAL CONSTRUCTION
0.59992869
General Construction/Plumbing
0.26319429
HEATING
0.60705494
NONCONST
0.68215526
PAINT - PLASTER
0.56843051
PLUMBING
0.62028448
Rodent
1.00000000
Sewer
0.10801506
Street Condition
0.25272826
Water System
0.04584400
Blocked Driveway
0.20435230
Street Light Condition
0.08343904
Traffic Signal Condition
0.15980346
The two complaint types most highly correlated with rodent sightings were NONCONST and PLUMBING.
# PART 3
inspections <- read.csv("dohmh.csv", stringsAsFactors = FALSE)
# Convert INSPECTION.DATE to a Date object
inspections$INSPECTION.DATE <- as.Date(inspections$INSPECTION.DATE, format = "%m/%d/%Y")
# Extract month and year as a string
inspections$MonthYear <- format(inspections$INSPECTION.DATE, "%m-%Y")
# Find all rodent violations from the violation codes
rodent_violations <- inspections %>%
filter(VIOLATION.CODE %in% c("04L", "04K", "08A")) %>%
select(MonthYear, ZIPCODE)
# Count rodent violations by month, year, and zipcode
rodent_counts <- rodent_violations %>%
group_by(MonthYear, ZIPCODE) %>%
summarize(RestaurantRatViolations = n())
`summarise()` has grouped output by 'MonthYear'. You can override using the `.groups` argument.
rodent_inspections = read_csv("rodent_inspection.csv")
Rows: 939924 Columns: 20── Column specification ───────────────────────────────
Delimiter: ","
chr (11): INSPECTION_TYPE, JOB_ID, BLOCK, LOT, HOUS...
dbl (9): JOB_TICKET_OR_WORK_ORDER_ID, JOB_PROGRESS...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert INSPECTION DATE column to Date format
rodent_inspections$INSPECTION_DATE <- as.Date(rodent_inspections$INSPECTION_DATE, format = "%m/%d/%Y")
# Create MonthYear column from INSPECTION DATE
rodent_inspections$MonthYear <- format(rodent_inspections$INSPECTION_DATE, "%m-%Y")
names(rodent_inspections)[names(rodent_inspections) == "ZIP_CODE"] <- "ZIPCODE"
# Merge with rodent violation data
merged_data <- merge(rodent_inspections, rodent_counts, by = c("ZIPCODE", "MonthYear"))
merged_data$RESULT <- ifelse(merged_data$RESULT == "Active Rat Signs", 1, 0)
# Create separate month and year columns
merged_data$month <- as.numeric(substr(merged_data$MonthYear, 1, 2))
merged_data$year <- as.numeric(substr(merged_data$MonthYear, 4, 7))
# Create train and test cases
train = merged_data[1:(0.8*nrow(merged_data)),]
test = merged_data[-(1:(0.8*nrow(merged_data))),]
# Run the model
model <- glm(RESULT ~ log(RestaurantRatViolations + 1) + month + year, data = merged_data, family = binomial())
summary(model)
Call:
glm(formula = RESULT ~ log(RestaurantRatViolations + 1) + month +
year, family = binomial(), data = merged_data)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.5806 -0.5266 -0.5061 -0.4808 2.1726
Coefficients:
Estimate Std. Error
(Intercept) -1.141e+02 7.809e+00
log(RestaurantRatViolations + 1) 7.206e-02 5.653e-03
month -3.379e-03 1.296e-03
year 5.559e-02 3.879e-03
z value Pr(>|z|)
(Intercept) -14.607 < 2e-16 ***
log(RestaurantRatViolations + 1) 12.747 < 2e-16 ***
month -2.607 0.00914 **
year 14.331 < 2e-16 ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 392274 on 527643 degrees of freedom
Residual deviance: 391624 on 527640 degrees of freedom
(2 observations deleted due to missingness)
AIC: 391632
Number of Fisher Scoring iterations: 4
The coefficient for log(RestaurantRatViolations + 1) is positive and statistically significant, indicating that higher values of Restaurant Rat Violations are associated with higher log adds of the outcome variable (Active Rat Signs). The coefficient for month is negative and statistically significant, indicating that later months (higher values of month) are associated with lower log odds of the outcome variable (Active Rat Signs). The coefficient for year is positive and statistically significant, indicating that later years (higher values of year) are associated with higher log odds of the outcome variable (Active Rat Signs). This evidence indicates that Restaurant Rat Violations, month, and year are all important predictors of the outcome variable (Active Rat Signs).
# PART 4, A
Another area to explore in conjunction with the NYC rat data would be DSNY Litter Basket data. This data set contains information about litter baskets located around NYC. https://data.cityofnewyork.us/dataset/DSNY-Litter-Basket-Map-/d6m8-cwh9 I would like to use this data to see if litter basket location correlates with active rat sightings around the city. I would download the data from the NYC Open Data website and merge the data with the Rodent Inspection data set based on location using GPS coordinates. I would like to do some analysis on correlation between litter basket location and active rat sightings. I would also like to create a chart based on the different boroughs and see how number of litter baskets and active rat sightings vary by borough.