Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

sandy_data <-read_csv("https://raw.githubusercontent.com/dianaplunkett/607/main/sandy-311-calls-by-day.csv")

## Rows: 1783 Columns: 25

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (1): date
## dbl (24): NYC-3-1-1, ACS, BPSI, CAU, CHALL, DEP, DOB, DOE, DOF, DOHMH, DPR, ...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Managing the Year

First things first, let’s split the year off in order to synchronize with Diana’s year based approach

print(str_sub("10/22/12", -2))

## [1] "12"

sandy_data$date = paste0("20",str_sub(sandy_data$date, -2))
head(sandy_data)

## # A tibble: 6 x 25
##   date  `NYC-3-1-1`   ACS  BPSI   CAU CHALL   DEP   DOB   DOE   DOF DOHMH   DPR
##   <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012            0     0     0     0     0     0     0     1     0     0     0
## 2 2012            0     0     0     0     0     0     0     1     0     0     0
## 3 2012            0     0     0     0     0     0     0     1     0     0     0
## 4 2012            0     0     0     0     0     0     0     4     0     0     0
## 5 2012            0     0     0     0     0     0     0    36     0     0     0
## 6 2012          207     0     0     0     0     0     0   312     0     0     0
## # ... with 13 more variables: FEMA <dbl>, HPD <dbl>, HRA <dbl>, MFANYC <dbl>,
## #   MOSE <dbl>, NYCEM <dbl>, NYCHA <dbl>, NYCSERVICE <dbl>, NYPD <dbl>,
## #   NYSDOL <dbl>, SBS <dbl>, NYSEMERGENCYMG <dbl>, total <dbl>

From there, let’s summarise the data, adding all values by date together.

sandy_data_by_year <- sandy_data %>% group_by(date) %>% 
  summarise(across(everything(), sum))

Now at this point let’s pause and take a quick peek at the data to ensure that we have properly structured everything.

head(sandy_data_by_year)

## # A tibble: 6 x 25
##   date  `NYC-3-1-1`   ACS  BPSI   CAU CHALL   DEP   DOB   DOE   DOF DOHMH   DPR
##   <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012        28295    30   152  1371  6925    60  1934 12597  1076   197   572
## 2 2013            0    22    19    38  3519   108   844    53  1821   348     0
## 3 2014            0     0     0     0   196    26    91     0   472    52     0
## 4 2015            1     0     0     0     8    15    57     0   170    10     0
## 5 2016            2     0     0     0    16     0    39     0    46     8     0
## 6 2017            1     0     0     0     7     0    20     0    14    16     0
## # ... with 13 more variables: FEMA <dbl>, HPD <dbl>, HRA <dbl>, MFANYC <dbl>,
## #   MOSE <dbl>, NYCEM <dbl>, NYCHA <dbl>, NYCSERVICE <dbl>, NYPD <dbl>,
## #   NYSDOL <dbl>, SBS <dbl>, NYSEMERGENCYMG <dbl>, total <dbl>

Data Pivoting

First things first, lets pivot the data

sandy_call_breakdown <- sandy_data_by_year %>% pivot_longer(!date, names_to = "Department", values_to = "Call_Count", values_drop_na = TRUE)
head(sandy_call_breakdown)

## # A tibble: 6 x 3
##   date  Department Call_Count
##   <chr> <chr>           <dbl>
## 1 2012  NYC-3-1-1       28295
## 2 2012  ACS                30
## 3 2012  BPSI              152
## 4 2012  CAU              1371
## 5 2012  CHALL            6925
## 6 2012  DEP                60

Then let’s extract out the totals per year

call_count_year <-sandy_call_breakdown[(sandy_call_breakdown$Department=="total"),]
head(call_count_year)

## # A tibble: 6 x 3
##   date  Department Call_Count
##   <chr> <chr>           <dbl>
## 1 2012  total           62850
## 2 2013  total           12819
## 3 2014  total            1527
## 4 2015  total             571
## 5 2016  total             249
## 6 2017  total             142

sandy_call_breakdown<-sandy_call_breakdown[!(sandy_call_breakdown$Department=="total"),]

Now lets calculate the call percentage.

length <- NROW(sandy_call_breakdown)
sandy_call_breakdown$Call_percentage = 0
for (row_number in 1:length){
  sandy_call_breakdown$Call_percentage[row_number] = sandy_call_breakdown$Call_Count[row_number]/(call_count_year$Call_Count[call_count_year$date==sandy_call_breakdown$date[row_number]])
}
head(sandy_call_breakdown,20)

## # A tibble: 20 x 4
##    date  Department Call_Count Call_percentage
##    <chr> <chr>           <dbl>           <dbl>
##  1 2012  NYC-3-1-1       28295        0.450   
##  2 2012  ACS                30        0.000477
##  3 2012  BPSI              152        0.00242 
##  4 2012  CAU              1371        0.0218  
##  5 2012  CHALL            6925        0.110   
##  6 2012  DEP                60        0.000955
##  7 2012  DOB              1934        0.0308  
##  8 2012  DOE             12597        0.200   
##  9 2012  DOF              1076        0.0171  
## 10 2012  DOHMH             197        0.00313 
## 11 2012  DPR               572        0.00910 
## 12 2012  FEMA                0        0       
## 13 2012  HPD                85        0.00135 
## 14 2012  HRA              1380        0.0220  
## 15 2012  MFANYC           1164        0.0185  
## 16 2012  MOSE               70        0.00111 
## 17 2012  NYCEM            1229        0.0196  
## 18 2012  NYCHA             135        0.00215 
## 19 2012  NYCSERVICE        473        0.00753 
## 20 2012  NYPD               91        0.00145

Data Filtering

Now lets grab just the top 5 Departments by year.

get_top5 <- function(x) {
    top5 <-sandy_call_breakdown %>% 
    filter(date == x) %>%
    arrange(desc(Call_percentage)) %>%
    head(5)
    return(top5)
}


year_start = 2012
year_end = 2017
df_test <- get_top5(year_start)
if ((year_end-year_start)> 1){
  for (i in (year_start+1):year_end) {
     #print()
     df_test <- union (df_test, get_top5(i))
  }
}
all_top5 <- df_test
head(all_top5)

## # A tibble: 6 x 4
##   date  Department Call_Count Call_percentage
##   <chr> <chr>           <dbl>           <dbl>
## 1 2012  NYC-3-1-1       28295          0.450 
## 2 2012  DOE             12597          0.200 
## 3 2012  CHALL            6925          0.110 
## 4 2012  NYSDOL           4490          0.0714
## 5 2012  DOB              1934          0.0308
## 6 2013  CHALL            3519          0.275

And finally let’s make a chart

color_map<-tibble(type= c('2012', '2013', '2014','2015', '2016', '2017'), 
                  color = c('#154360', '#2980b9', '#5aa8dc', 
                            '#a9cce3', '#aed6f1',  '#ebf5fb' ))

all_top5 %>% ggplot() +
    geom_col(mapping = aes(x=Department, y=Call_percentage, fill=date)) + 
    scale_fill_manual(values=color_map$color) +
    coord_flip() +
    ggtitle("Top 5 Agencies involved in 311 calls about Huricane Sandy by Year") + 
    ylab('% total calls made in a year')+
    xlab("Agency")

Conclusion

We could possibly extend this by creating a look up table using something akin to: https://nychazardmitigation.com/acronyms/ to populate all of the various department values.

All in all, I liked my adjustments as I thought they made things more efficient.

Making Good Code Better

Alex Moyse

Managing the Year

Data Pivoting

Data Filtering

Conclusion