library(tidyverse) # ggplot2, dplyr, tidyr, readr, tibble, sringr and more
# read_csv() is part of readr, whereas read.csv() is base R. I'm not sure that
# read_csv() is tidier than read.csv().
# Current working directory, with surrounding whitespace trimmed.
CURR_PATH <- str_trim(getwd())
# to do : use the kaggle api
# https://www.kaggle.com/mysarahmadbhat/nyc-traffic-accidents
# nyc_traffic_accidents<-read.csv("NYC Accidents 2020.csv")
# Read the 2020 NYC accident data straight from GitHub; show_col_types = FALSE
# silences the column-type message that read_csv() prints by default.
nyc_traffic_accidents <- read_csv(
  "https://raw.githubusercontent.com/TheReallyBigApple/CunyAssignments/main/DATA607/NYC%20Accidents%202020.csv",
  show_col_types = FALSE
)
# The spec() function displays the schema for the data frame or tibble.
# the readr spec method shows you the schema
spec(nyc_traffic_accidents)
#> cols(
#> `CRASH DATE` = col_date(format = ""),
#> `CRASH TIME` = col_time(format = ""),
#> BOROUGH = col_character(),
#> `ZIP CODE` = col_double(),
#> LATITUDE = col_double(),
#> LONGITUDE = col_double(),
#> LOCATION = col_character(),
#> `ON STREET NAME` = col_character(),
#> `CROSS STREET NAME` = col_character(),
#> `OFF STREET NAME` = col_character(),
#> `NUMBER OF PERSONS INJURED` = col_double(),
#> `NUMBER OF PERSONS KILLED` = col_double(),
#> `NUMBER OF PEDESTRIANS INJURED` = col_double(),
#> `NUMBER OF PEDESTRIANS KILLED` = col_double(),
#> `NUMBER OF CYCLIST INJURED` = col_double(),
#> `NUMBER OF CYCLIST KILLED` = col_double(),
#> `NUMBER OF MOTORIST INJURED` = col_double(),
#> `NUMBER OF MOTORIST KILLED` = col_double(),
#> `CONTRIBUTING FACTOR VEHICLE 1` = col_character(),
#> `CONTRIBUTING FACTOR VEHICLE 2` = col_character(),
#> `CONTRIBUTING FACTOR VEHICLE 3` = col_character(),
#> `CONTRIBUTING FACTOR VEHICLE 4` = col_character(),
#> `CONTRIBUTING FACTOR VEHICLE 5` = col_character(),
#> COLLISION_ID = col_double(),
#> `VEHICLE TYPE CODE 1` = col_character(),
#> `VEHICLE TYPE CODE 2` = col_character(),
#> `VEHICLE TYPE CODE 3` = col_character(),
#> `VEHICLE TYPE CODE 4` = col_character(),
#> `VEHICLE TYPE CODE 5` = col_character()
#> )
# This is base R: make.names() turns the column names into syntactically valid
# R names, replacing the spaces with dots (e.g. `CRASH DATE` -> CRASH.DATE).
colnames(nyc_traffic_accidents) <- make.names(colnames(nyc_traffic_accidents))
# magrittr provides several operators, including the commonly used %>%, which
# pipes results from one function to the next.
# Count the observations and total the persons killed for each borough.
nyc_traffic_accidents %>%
  group_by(BOROUGH) %>%
  summarise(
    observations = n(),
    killed = sum(NUMBER.OF.PERSONS.KILLED, na.rm = TRUE)
  )
#> # A tibble: 6 × 3
#> BOROUGH observations killed
#> <chr> <int> <dbl>
#> 1 BRONX 9417 10
#> 2 BROOKLYN 16907 27
#> 3 MANHATTAN 7353 9
#> 4 QUEENS 14017 20
#> 5 STATEN ISLAND 1446 6
#> 6 <NA> 25741 72
# Here, we can group by borough and rearrange the dataframe to make viewing the
# data by borough simpler.
# Group by borough and sort the rows by borough. The result stays grouped by
# BOROUGH for the steps that follow.
nyc_traffic_accidents <- nyc_traffic_accidents %>%
  group_by(BOROUGH) %>%
  arrange(BOROUGH)
# select() can select certain columns; it can also omit certain columns.
# Below I omit a sequence of columns starting with LATITUDE and ending with
# LOCATION.
nyc_traffic_accidents <- nyc_traffic_accidents %>%
  select(!(LATITUDE:LOCATION))
# We can further this tidying of the dataframe by rearranging the data how we
# see fit.
# Move the borough, the first two vehicle types and the first two contributing
# factors to the front of the tibble, in that order. A single relocate() call
# places the listed columns, in the order given, immediately before CRASH.DATE.
nyc_traffic_accidents <- nyc_traffic_accidents %>%
  relocate(
    BOROUGH,
    VEHICLE.TYPE.CODE.1,
    VEHICLE.TYPE.CODE.2,
    CONTRIBUTING.FACTOR.VEHICLE.1,
    CONTRIBUTING.FACTOR.VEHICLE.2,
    .before = CRASH.DATE
  )
# Peek at the first five rows and columns to confirm the new column order.
nyc_traffic_accidents[1:5, 1:5]
#> # A tibble: 5 × 5
#> # Groups: BOROUGH [1]
#> BOROUGH VEHICLE.TYPE.COD… VEHICLE.TYPE.COD… CONTRIBUTING.FAC… CONTRIBUTING.FA…
#> <chr> <chr> <chr> <chr> <chr>
#> 1 BRONX Sedan Station Wagon/Sp… Passing Too Clos… Unspecified
#> 2 BRONX Station Wagon/Sp… Station Wagon/Sp… Unsafe Speed Unspecified
#> 3 BRONX Station Wagon/Sp… <NA> Unspecified <NA>
#> 4 BRONX Sedan Station Wagon/Sp… Unsafe Speed Unspecified
#> 5 BRONX Station Wagon/Sp… <NA> Unspecified <NA>
# Here, the data was rearranged to show the types of vehicles involved and the
# reasons for the crash at the beginning of the dataframe instead of towards
# the end.
# Another use for summarize() would be to show which zip codes had the most
# injuries. This can be further extended to a plot to visualize the injuries by
# zip code. Because there are nearly 200 different zip codes, it may be
# challenging to plot them all together.
# Total the persons injured for each ZIP code. group_by() here replaces the
# existing BOROUGH grouping, and summarize() returns one row per ZIP code.
zip_injuries <- nyc_traffic_accidents %>%
  group_by(ZIP.CODE) %>%
  summarize(injuries = sum(NUMBER.OF.PERSONS.INJURED, na.rm = TRUE))
zip_injuries
#> # A tibble: 199 × 2
#> ZIP.CODE injuries
#> <dbl> <dbl>
#> 1 10000 7
#> 2 10001 75
#> 3 10002 148
#> 4 10003 76
#> 5 10004 10
#> 6 10005 4
#> 7 10006 3
#> 8 10007 11
#> 9 10009 53
#> 10 10010 53
#> # … with 189 more rows
# mutate() can create or modify a column; replace_na() can replace all
# instances of NA with a specified value.
# Here every missing BOROUGH is relabelled "NYC".
nyc_traffic_accidents <- nyc_traffic_accidents %>%
  dplyr::mutate(BOROUGH = replace_na(BOROUGH, "NYC"))
# While the other tidyverse packages infer data parsing, ggplot exists to
# display data. Having said that, its robust functionality includes data
# shaping. The histogram is essentially a group_by(), as is scale_x_date().
# Histogram of accidents over time. binwidth = 1 makes each bar cover exactly
# one day, matching the "Accidents Per Day" title (the default of 30 bins would
# lump many days into each bar). The x axis is labelled weekly.
ggplot(data = nyc_traffic_accidents, aes(x = CRASH.DATE)) +
  geom_histogram(binwidth = 1) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  ggtitle("Accidents Per Day") +
  scale_x_date(date_breaks = "weeks", date_labels = "%Y-%m-%d") +
  xlab("Date")
# First, I will show a pitfall of too much or improperly grouped data:
ggplot(zip_injuries, aes(x = ZIP.CODE, y = injuries)) +
  geom_col() +
  # geom_density(alpha=.2, fill = 'red')+
  ggtitle("Injuries by ZIP Code") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("ZIP Code") +
  ylab("Injuries")
# There are a few issues here. First, with the x representing ZIP.CODE, ggplot
# will sort the x-axis data based on the x-values, which are five-digit numbers
# that should not have any order in this plot. Second, there are nearly 200
# entries to work with, which leads to a very busy plot.
# To remedy this, we need a smaller subset of the zip code injuries dataframe.
# For simplicity, I will plot the top ten as a barplot. geom_col() and
# geom_bar() are very similar, with some slightly different rules. The main
# difference is that geom_bar() uses stat_count() for y values, while
# geom_col() uses stat_identity() for y values. Another good solution is to
# sort the bars in descending order. This can be accomplished by arranging the
# data in descending order of injuries and then plotting using the index as the
# x-value.
# Sort the ZIP codes from most to fewest injuries.
zip_injuries <- zip_injuries %>% arrange(desc(injuries))
# Take the top ten ZIP codes. Rows with a missing value are dropped BEFORE
# slicing so that an NA ZIP code cannot use up one of the ten slots.
zip10 <- zip_injuries %>%
  drop_na() %>%
  head(n = 10)
# Rank index (1 = most injuries), used as the x position so the bars appear in
# descending order rather than in numeric ZIP-code order.
zip10$idx <- seq_len(nrow(zip10))
# Bar chart of the top ZIP codes; each bar is labelled with its ZIP code just
# above the bar.
ggplot(zip10, aes(x = idx, y = injuries, color = ZIP.CODE)) +
  geom_col(aes(fill = ZIP.CODE)) +
  # geom_density(alpha=.2, fill = 'red')+
  ggtitle("Injuries by ZIP Code") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    text = element_text(size = 15)
  ) +
  xlab("ZIP Code") +
  geom_text(
    aes(label = ZIP.CODE),
    nudge_x = 0, nudge_y = 10,
    check_overlap = TRUE,
    size = 5
  ) +
  ylab("Injuries")