library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.3
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(stringr)
## Load the NYC heat-complaint extract. A field holding a single blank (" ")
## is read as NA, and the first row supplies the column names.
heatdf <- read_csv(
  file = "C:/Users/ambra/Desktop/Data 607/W6/NYCHeatComplaints.csv",
  col_names = TRUE,
  na = " "
)
## Parsed with column specification:
## cols(
## `Unique Key` = col_integer(),
## `Created Date` = col_character(),
## `Closed Date` = col_character(),
## `Complaint Type` = col_character(),
## Descriptor = col_character(),
## `Incident Zip` = col_integer(),
## `Incident Address` = col_character(),
## `Street Name` = col_character(),
## City = col_character(),
## Status = col_character(),
## Borough = col_character(),
## `X Coordinate (State Plane)` = col_integer(),
## `Y Coordinate (State Plane)` = col_integer(),
## Latitude = col_double(),
## Longitude = col_double(),
## Location = col_character()
## )
## The full dataset is quite massive (169,902 observations), so the analysis
## works on a random 0.3% sample of it.
## Fix the RNG seed so the sample -- and every figure derived from it -- is
## the same on every run.
set.seed(607)
## Sample WITHOUT replacement: drawing with replacement can pick the same
## complaint more than once, which would double-count it in the per-borough
## totals and averages.
heatdf <- heatdf %>%
  sample_frac(0.003, replace = FALSE)
## Analyze the average resolution time of closed incidents by borough.
## Keep only complaints whose Status is "Closed", then drop every row that has
## a missing value in any column.
## (The time-of-day part of columns 2 and 3 is stripped further below.)
heatdf <- heatdf %>%
  filter(Status == "Closed") %>%
  na.omit()
## Pull the date portion ("digits/digits/digits") out of timestamp strings.
##
## x: character vector of timestamps.
## Returns a character vector the same length as x holding the first
## slash-separated date found in each element, or NA where there is none.
##
## str_extract() is used rather than str_extract_all(): the latter returns a
## list, which silently turns the date columns into list-columns when its
## result is assigned back into the data frame. Each digit run must be
## non-empty ("+"), so a bare "//" is no longer accepted as a date.
extractdate <- function(x) {
  str_extract(x, "[[:digit:]]+/[[:digit:]]+/[[:digit:]]+")
}

## Strip the time of day from both date columns in one vectorised step.
## A row-by-row `for (i in 1:nrow(heatdf))` loop is avoided on purpose: it is
## slow, and 1:0 counts down (1, 0), so it misbehaves on an empty data frame.
heatdf <- heatdf %>%
  mutate(
    `Created Date` = extractdate(`Created Date`),
    `Closed Date` = extractdate(`Closed Date`)
  )
## Create new variable for length of incident resolution (in days).
##
## NOTE(review): NYC 311 exports write timestamps month-first
## ("MM/DD/YYYY hh:mm:ss AM"), so the dates are parsed as "%m/%d/%Y" --
## confirm against the CSV. Parsing such strings as "%Y/%m/%d" reads the month
## as the year and the day as the month, which gives NA whenever the day is
## above 12 and a wildly wrong date otherwise.
##
## as.character() flattens the column whether it holds plain strings or
## one-element lists. as.numeric() turns the difftime into a plain number of
## days, so ggplot2 can pick a scale for it without a message.
heatdf <- heatdf %>%
  mutate(
    resolutiondays = as.numeric(
      as.Date(as.character(`Closed Date`), format = "%m/%d/%Y") -
        as.Date(as.character(`Created Date`), format = "%m/%d/%Y"),
      units = "days"
    )
  )

## Average resolution time per borough, rounded to two decimals. Complaints
## whose dates could not be parsed (NA duration) are left out of the average.
meanres <- heatdf %>%
  filter(!is.na(resolutiondays)) %>%
  group_by(Borough) %>%
  summarise(avg = round(mean(resolutiondays), digits = 2))
## Bar chart of the mean resolution time for each borough; bars are outlined
## and filled by borough.
ggplot(
  data = meanres,
  mapping = aes(x = Borough, y = avg, colour = Borough, fill = Borough)
) +
  geom_bar(stat = "identity") +
  labs(title = "2016-2017 Average days to resolve a heat complaint across NYC boroughs")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Citywide average resolution time, taken over every sampled complaint.
## Averaging the per-borough means (mean(meanres$avg)) would weight each
## borough equally, so a borough with a single complaint would count as much
## as one with hundreds and the result would not be the NYC-wide average.
mean(heatdf$resolutiondays, na.rm = TRUE)
## Time difference of 81.604 days
## Number of closed complaints in each borough (complete rows only).
totcompl <- heatdf %>%
  na.omit() %>%
  count(Borough) %>%
  rename(totc = n)

## Pair each borough's average resolution time with its complaint count.
df <- inner_join(x = meanres, y = totcompl, by = "Borough")

## Scatter of complaint count against average resolution time, drawn with
## one panel per borough.
t <- ggplot(data = df, mapping = aes(x = avg, y = totc)) +
  geom_point()
t + facet_grid(. ~ Borough)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
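In this random sample, heat complaints in Staten Island took the longest to resolve, averaging 91.7 days (vs. an NYC-wide average of 76.1 days). However, the sample contains only one closed complaint from Staten Island, so that average rests on a single observation and should not be read as representative of the borough. Note that these figures presumably come from a different random draw than the output shown above (which reports an average of 81.6 days), so the exact numbers will vary from sample to sample.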