Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
sandy_data <-read_csv("https://raw.githubusercontent.com/dianaplunkett/607/main/sandy-311-calls-by-day.csv")
## Rows: 1783 Columns: 25
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): date
## dbl (24): NYC-3-1-1, ACS, BPSI, CAU, CHALL, DEP, DOB, DOE, DOF, DOHMH, DPR, ...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
First things first, let’s split the year off in order to synchronize with Diana’s year based approach
print(str_sub("10/22/12", -2))
## [1] "12"
sandy_data$date = paste0("20",str_sub(sandy_data$date, -2))
head(sandy_data)
## # A tibble: 6 x 25
## date `NYC-3-1-1` ACS BPSI CAU CHALL DEP DOB DOE DOF DOHMH DPR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012 0 0 0 0 0 0 0 1 0 0 0
## 2 2012 0 0 0 0 0 0 0 1 0 0 0
## 3 2012 0 0 0 0 0 0 0 1 0 0 0
## 4 2012 0 0 0 0 0 0 0 4 0 0 0
## 5 2012 0 0 0 0 0 0 0 36 0 0 0
## 6 2012 207 0 0 0 0 0 0 312 0 0 0
## # ... with 13 more variables: FEMA <dbl>, HPD <dbl>, HRA <dbl>, MFANYC <dbl>,
## # MOSE <dbl>, NYCEM <dbl>, NYCHA <dbl>, NYCSERVICE <dbl>, NYPD <dbl>,
## # NYSDOL <dbl>, SBS <dbl>, NYSEMERGENCYMG <dbl>, total <dbl>
From there, let’s summarise the data, adding all values by date together.
sandy_data_by_year <- sandy_data %>% group_by(date) %>%
summarise(across(everything(), sum))
Now at this point let’s pause and take a quick peek at the data to ensure that we have properly structured everything.
head(sandy_data_by_year)
## # A tibble: 6 x 25
## date `NYC-3-1-1` ACS BPSI CAU CHALL DEP DOB DOE DOF DOHMH DPR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012 28295 30 152 1371 6925 60 1934 12597 1076 197 572
## 2 2013 0 22 19 38 3519 108 844 53 1821 348 0
## 3 2014 0 0 0 0 196 26 91 0 472 52 0
## 4 2015 1 0 0 0 8 15 57 0 170 10 0
## 5 2016 2 0 0 0 16 0 39 0 46 8 0
## 6 2017 1 0 0 0 7 0 20 0 14 16 0
## # ... with 13 more variables: FEMA <dbl>, HPD <dbl>, HRA <dbl>, MFANYC <dbl>,
## # MOSE <dbl>, NYCEM <dbl>, NYCHA <dbl>, NYCSERVICE <dbl>, NYPD <dbl>,
## # NYSDOL <dbl>, SBS <dbl>, NYSEMERGENCYMG <dbl>, total <dbl>
First things first, lets pivot the data
sandy_call_breakdown <- sandy_data_by_year %>% pivot_longer(!date, names_to = "Department", values_to = "Call_Count", values_drop_na = TRUE)
head(sandy_call_breakdown)
## # A tibble: 6 x 3
## date Department Call_Count
## <chr> <chr> <dbl>
## 1 2012 NYC-3-1-1 28295
## 2 2012 ACS 30
## 3 2012 BPSI 152
## 4 2012 CAU 1371
## 5 2012 CHALL 6925
## 6 2012 DEP 60
Then let’s extract out the totals per year
call_count_year <-sandy_call_breakdown[(sandy_call_breakdown$Department=="total"),]
head(call_count_year)
## # A tibble: 6 x 3
## date Department Call_Count
## <chr> <chr> <dbl>
## 1 2012 total 62850
## 2 2013 total 12819
## 3 2014 total 1527
## 4 2015 total 571
## 5 2016 total 249
## 6 2017 total 142
sandy_call_breakdown<-sandy_call_breakdown[!(sandy_call_breakdown$Department=="total"),]
Now lets calculate the call percentage.
length <- NROW(sandy_call_breakdown)
sandy_call_breakdown$Call_percentage = 0
for (row_number in 1:length){
sandy_call_breakdown$Call_percentage[row_number] = sandy_call_breakdown$Call_Count[row_number]/(call_count_year$Call_Count[call_count_year$date==sandy_call_breakdown$date[row_number]])
}
head(sandy_call_breakdown,20)
## # A tibble: 20 x 4
## date Department Call_Count Call_percentage
## <chr> <chr> <dbl> <dbl>
## 1 2012 NYC-3-1-1 28295 0.450
## 2 2012 ACS 30 0.000477
## 3 2012 BPSI 152 0.00242
## 4 2012 CAU 1371 0.0218
## 5 2012 CHALL 6925 0.110
## 6 2012 DEP 60 0.000955
## 7 2012 DOB 1934 0.0308
## 8 2012 DOE 12597 0.200
## 9 2012 DOF 1076 0.0171
## 10 2012 DOHMH 197 0.00313
## 11 2012 DPR 572 0.00910
## 12 2012 FEMA 0 0
## 13 2012 HPD 85 0.00135
## 14 2012 HRA 1380 0.0220
## 15 2012 MFANYC 1164 0.0185
## 16 2012 MOSE 70 0.00111
## 17 2012 NYCEM 1229 0.0196
## 18 2012 NYCHA 135 0.00215
## 19 2012 NYCSERVICE 473 0.00753
## 20 2012 NYPD 91 0.00145
Now lets grab just the top 5 Departments by year.
get_top5 <- function(x) {
top5 <-sandy_call_breakdown %>%
filter(date == x) %>%
arrange(desc(Call_percentage)) %>%
head(5)
return(top5)
}
year_start = 2012
year_end = 2017
df_test <- get_top5(year_start)
if ((year_end-year_start)> 1){
for (i in (year_start+1):year_end) {
#print()
df_test <- union (df_test, get_top5(i))
}
}
all_top5 <- df_test
head(all_top5)
## # A tibble: 6 x 4
## date Department Call_Count Call_percentage
## <chr> <chr> <dbl> <dbl>
## 1 2012 NYC-3-1-1 28295 0.450
## 2 2012 DOE 12597 0.200
## 3 2012 CHALL 6925 0.110
## 4 2012 NYSDOL 4490 0.0714
## 5 2012 DOB 1934 0.0308
## 6 2013 CHALL 3519 0.275
And finally let’s make a chart
color_map<-tibble(type= c('2012', '2013', '2014','2015', '2016', '2017'),
color = c('#154360', '#2980b9', '#5aa8dc',
'#a9cce3', '#aed6f1', '#ebf5fb' ))
all_top5 %>% ggplot() +
geom_col(mapping = aes(x=Department, y=Call_percentage, fill=date)) +
scale_fill_manual(values=color_map$color) +
coord_flip() +
ggtitle("Top 5 Agencies involved in 311 calls about Huricane Sandy by Year") +
ylab('% total calls made in a year')+
xlab("Agency")
We could possibly extend this by creating a look up table using something akin to: https://nychazardmitigation.com/acronyms/ to populate all of the various department values.
All in all, I liked my adjustments as I thought they made things more efficient.