The data set used here is from the article “The (Very) Long Tail of Hurricane Recovery”.
https://projects.fivethirtyeight.com/sandy-311/
Try to improve on the code in Diana’s approach for determining the top 5 agencies for each year, and
try to improve on her plot that shows “which agencies were involved with the 311 calls each year (limiting it to only the top 5 agencies per year).”
## This is to pull csv into R dataframe
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Load the Sandy-related 311 call counts (a date column plus one count
## column per agency) directly from the GitHub copy of the CSV.
sandy_data <- readr::read_csv(
  file = "https://raw.githubusercontent.com/dianaplunkett/607/main/sandy-311-calls-by-day.csv"
)
## Rows: 1783 Columns: 25
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): date
## dbl (24): NYC-3-1-1, ACS, BPSI, CAU, CHALL, DEP, DOB, DOE, DOF, DOHMH, DPR, ...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sandy_data)
## # A tibble: 6 x 25
## date `NYC-3-1-1` ACS BPSI CAU CHALL DEP DOB DOE DOF DOHMH DPR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10/22~ 0 0 0 0 0 0 0 1 0 0 0
## 2 10/23~ 0 0 0 0 0 0 0 1 0 0 0
## 3 10/24~ 0 0 0 0 0 0 0 1 0 0 0
## 4 10/25~ 0 0 0 0 0 0 0 4 0 0 0
## 5 10/26~ 0 0 0 0 0 0 0 36 0 0 0
## 6 10/27~ 207 0 0 0 0 0 0 312 0 0 0
## # ... with 13 more variables: FEMA <dbl>, HPD <dbl>, HRA <dbl>, MFANYC <dbl>,
## # MOSE <dbl>, NYCEM <dbl>, NYCHA <dbl>, NYCSERVICE <dbl>, NYPD <dbl>,
## # NYSDOL <dbl>, SBS <dbl>, NYSEMERGENCYMG <dbl>, total <dbl>
##glimpse(sandy_data)
library(tidyr)
## Split the "month/day/year" date string on "/" into three new character
## columns, keeping the original date column next to them.
sandy_data <- sandy_data %>%
  tidyr::separate(
    col = date,
    into = c("Month", "Day", "Year"),
    sep = "/",
    remove = FALSE
  )
head(sandy_data)
## # A tibble: 6 x 28
## date Month Day Year `NYC-3-1-1` ACS BPSI CAU CHALL DEP DOB DOE
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10/22~ 10 22 12 0 0 0 0 0 0 0 1
## 2 10/23~ 10 23 12 0 0 0 0 0 0 0 1
## 3 10/24~ 10 24 12 0 0 0 0 0 0 0 1
## 4 10/25~ 10 25 12 0 0 0 0 0 0 0 4
## 5 10/26~ 10 26 12 0 0 0 0 0 0 0 36
## 6 10/27~ 10 27 12 207 0 0 0 0 0 0 312
## # ... with 16 more variables: DOF <dbl>, DOHMH <dbl>, DPR <dbl>, FEMA <dbl>,
## # HPD <dbl>, HRA <dbl>, MFANYC <dbl>, MOSE <dbl>, NYCEM <dbl>, NYCHA <dbl>,
## # NYCSERVICE <dbl>, NYPD <dbl>, NYSDOL <dbl>, SBS <dbl>,
## # NYSEMERGENCYMG <dbl>, total <dbl>
## This is to pull required columns only (I removed the date, Month, Day and total columns)
library(tidyr)
## Keep Year plus every agency column by dropping the four columns that are
## not needed (date, Month, Day, total) rather than listing all 24 kept
## names by hand. The result is the same 24 columns in the same order, and
## the selection no longer breaks if an agency column is added or renamed.
sandy_data <- sandy_data %>%
  select(-c(date, Month, Day, total))
head(sandy_data)
## # A tibble: 6 x 24
## Year `NYC-3-1-1` ACS BPSI CAU CHALL DEP DOB DOE DOF DOHMH DPR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 0 0 0 0 0 0 0 1 0 0 0
## 2 12 0 0 0 0 0 0 0 1 0 0 0
## 3 12 0 0 0 0 0 0 0 1 0 0 0
## 4 12 0 0 0 0 0 0 0 4 0 0 0
## 5 12 0 0 0 0 0 0 0 36 0 0 0
## 6 12 207 0 0 0 0 0 0 312 0 0 0
## # ... with 12 more variables: FEMA <dbl>, HPD <dbl>, HRA <dbl>, MFANYC <dbl>,
## # MOSE <dbl>, NYCEM <dbl>, NYCHA <dbl>, NYCSERVICE <dbl>, NYPD <dbl>,
## # NYSDOL <dbl>, SBS <dbl>, NYSEMERGENCYMG <dbl>
## Only the 311 column needs a new name: "NYC-3-1-1" is not a syntactic R
## name and has to be back-quoted everywhere. Renaming it by name, instead
## of overwriting all 24 column names positionally, cannot mislabel a
## column if the column order ever changes.
sandy_data <- sandy_data %>%
  rename(NYC311 = `NYC-3-1-1`)
head(sandy_data)
## # A tibble: 6 x 24
## Year NYC311 ACS BPSI CAU CHALL DEP DOB DOE DOF DOHMH DPR FEMA
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 0 0 0 0 0 0 0 1 0 0 0 0
## 2 12 0 0 0 0 0 0 0 1 0 0 0 0
## 3 12 0 0 0 0 0 0 0 1 0 0 0 0
## 4 12 0 0 0 0 0 0 0 4 0 0 0 0
## 5 12 0 0 0 0 0 0 0 36 0 0 0 0
## 6 12 207 0 0 0 0 0 0 312 0 0 0 0
## # ... with 11 more variables: HPD <dbl>, HRA <dbl>, MFANYC <dbl>, MOSE <dbl>,
## # NYCEM <dbl>, NYCHA <dbl>, NYCSERVICE <dbl>, NYPD <dbl>, NYSDOL <dbl>,
## # SBS <dbl>, NYSEMERGENCYMG <dbl>
## Total calls per agency for each year (one row per Year).
## across(where(is.numeric), ...) replaces the superseded summarise_if();
## Year is a character grouping column, so it is never summed itself.
data <- sandy_data %>%
  group_by(Year) %>%
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE)))
data
## # A tibble: 6 x 24
## Year NYC311 ACS BPSI CAU CHALL DEP DOB DOE DOF DOHMH DPR FEMA
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 28295 30 152 1371 6925 60 1934 12597 1076 197 572 0
## 2 13 0 22 19 38 3519 108 844 53 1821 348 0 0
## 3 14 0 0 0 0 196 26 91 0 472 52 0 0
## 4 15 1 0 0 0 8 15 57 0 170 10 0 10
## 5 16 2 0 0 0 16 0 39 0 46 8 0 0
## 6 17 1 0 0 0 7 0 20 0 14 16 0 0
## # ... with 11 more variables: HPD <dbl>, HRA <dbl>, MFANYC <dbl>, MOSE <dbl>,
## # NYCEM <dbl>, NYCHA <dbl>, NYCSERVICE <dbl>, NYPD <dbl>, NYSDOL <dbl>,
## # SBS <dbl>, NYSEMERGENCYMG <dbl>
## Name of the busiest agency in each year (first one wins on ties).
## Only the agency columns are compared: running apply() over the whole
## tibble coerced every row to character (Year is a character column) and
## let the Year value itself compete for the maximum, so the answer was
## only correct while every yearly maximum happened to exceed the year.
agency_totals <- as.matrix(data[, -1])
colnames(agency_totals)[apply(agency_totals, 1, which.max)]
## [1] "NYC311" "CHALL" "NYSEMERGENCYMG" "NYSEMERGENCYMG"
## [5] "NYSEMERGENCYMG" "NYSEMERGENCYMG"
## The code above was originally producing warnings, so I had to wrap it in suppressWarnings(). It now runs cleanly without that, and I am not sure what resolved the issue.
##suppressWarnings(colnames(data)[apply(data,1,which.max)])
##install.packages("matrixStats")
library(matrixStats)
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
## Largest yearly total across all agencies, one value per year.
## Dropping the first column (Year) with a negative index replaces the
## hand-typed positions 2..24, so the call no longer depends on how many
## agency columns the table has.
data1 <- rowMaxs(as.matrix(data[, -1]))
data1
## [1] 28295 3519 519 271 120 69
## Transpose the yearly totals so that agencies become rows and years
## become columns.
## data[[1]] extracts Year as a plain character vector; data[, 1] on a
## tibble is still a one-column tibble, which setNames() cannot use as
## column names (the reason the names previously had to be retyped by
## hand). The two-digit years get their "20" prefix here instead.
## Note: the agency names end up as row names of tdata, not as a column.
tdata <- setNames(data.frame(t(data[, -1])), paste0("20", data[[1]]))
head(tdata)
## 2012 2013 2014 2015 2016 2017
## NYC311 28295 0 0 1 2 1
## ACS 30 22 0 0 0 0
## BPSI 152 19 0 0 0 0
## CAU 1371 38 0 0 0 0
## CHALL 6925 3519 196 8 16 7
## DEP 60 108 26 15 0 0
tdata
## 2012 2013 2014 2015 2016 2017
## NYC311 28295 0 0 1 2 1
## ACS 30 22 0 0 0 0
## BPSI 152 19 0 0 0 0
## CAU 1371 38 0 0 0 0
## CHALL 6925 3519 196 8 16 7
## DEP 60 108 26 15 0 0
## DOB 1934 844 91 57 39 20
## DOE 12597 53 0 0 0 0
## DOF 1076 1821 472 170 46 14
## DOHMH 197 348 52 10 8 16
## DPR 572 0 0 0 0 0
## FEMA 0 0 0 10 0 0
## HPD 85 1382 0 0 0 0
## HRA 1380 0 0 0 0 0
## MFANYC 1164 0 0 0 0 0
## MOSE 70 164 9 0 0 0
## NYCEM 1229 149 24 0 0 0
## NYCHA 135 57 0 0 0 0
## NYCSERVICE 473 932 40 0 0 0
## NYPD 91 115 35 21 18 12
## NYSDOL 4490 638 16 0 0 0
## SBS 524 715 47 8 0 3
## NYSEMERGENCYMG 0 1895 519 271 120 69
I tried to plot the graph but was not able to. The transpose of the data did not go as expected: the agency names ended up as row names rather than as a column, so I was not able to name the first column and hence had issues with the plot.