library(tidyverse) # ggplot2, dplyr, tidyr, readr, tibble, stringr and more
# read_csv() is part of readr, whereas read.csv() is base R.
# I'm not sure that read_csv() is tidier than read.csv().
# Current working directory, with any surrounding whitespace trimmed by
# stringr::str_trim().
CURR_PATH <- str_trim(getwd())

# to do: use the kaggle api
# https://www.kaggle.com/mysarahmadbhat/nyc-traffic-accidents
# nyc_traffic_accidents <- read.csv("NYC Accidents 2020.csv")

# Read the 2020 NYC accidents CSV straight from GitHub into a tibble.
# show_col_types = FALSE keeps readr from printing the guessed column types.
nyc_traffic_accidents <- read_csv(
  "https://raw.githubusercontent.com/TheReallyBigApple/CunyAssignments/main/DATA607/NYC%20Accidents%202020.csv",
  show_col_types = FALSE
)

# The spec function displays the schema for the data frame or tibble.
# readr's spec() reports the column specification (the schema) that was used
# when the file was read
readr::spec(nyc_traffic_accidents)
#> cols(
#> `CRASH DATE` = col_date(format = ""),
#> `CRASH TIME` = col_time(format = ""),
#> BOROUGH = col_character(),
#> `ZIP CODE` = col_double(),
#> LATITUDE = col_double(),
#> LONGITUDE = col_double(),
#> LOCATION = col_character(),
#> `ON STREET NAME` = col_character(),
#> `CROSS STREET NAME` = col_character(),
#> `OFF STREET NAME` = col_character(),
#> `NUMBER OF PERSONS INJURED` = col_double(),
#> `NUMBER OF PERSONS KILLED` = col_double(),
#> `NUMBER OF PEDESTRIANS INJURED` = col_double(),
#> `NUMBER OF PEDESTRIANS KILLED` = col_double(),
#> `NUMBER OF CYCLIST INJURED` = col_double(),
#> `NUMBER OF CYCLIST KILLED` = col_double(),
#> `NUMBER OF MOTORIST INJURED` = col_double(),
#> `NUMBER OF MOTORIST KILLED` = col_double(),
#> `CONTRIBUTING FACTOR VEHICLE 1` = col_character(),
#> `CONTRIBUTING FACTOR VEHICLE 2` = col_character(),
#> `CONTRIBUTING FACTOR VEHICLE 3` = col_character(),
#> `CONTRIBUTING FACTOR VEHICLE 4` = col_character(),
#> `CONTRIBUTING FACTOR VEHICLE 5` = col_character(),
#> COLLISION_ID = col_double(),
#> `VEHICLE TYPE CODE 1` = col_character(),
#> `VEHICLE TYPE CODE 2` = col_character(),
#> `VEHICLE TYPE CODE 3` = col_character(),
#> `VEHICLE TYPE CODE 4` = col_character(),
#> `VEHICLE TYPE CODE 5` = col_character()
#> )
# This is base R: make.names() turns the column names into syntactically valid
# R names, so the spaces become dots (e.g. `CRASH DATE` -> CRASH.DATE) and the
# columns can be referenced without backticks.
colnames(nyc_traffic_accidents) <- make.names(colnames(nyc_traffic_accidents))

# magrittr provides several operators, including the commonly used %>%, which
# pipes results from one function to the next.
# Per-borough totals: number of crash records and number of people killed.
# Rows whose BOROUGH is NA form their own group.
nyc_traffic_accidents %>%
  group_by(BOROUGH) %>%
  summarise(
    observations = n(),
    # na.rm = TRUE so a missing count does not turn the whole sum into NA
    killed = sum(NUMBER.OF.PERSONS.KILLED, na.rm = TRUE)
  )
#> # A tibble: 6 x 3
#> BOROUGH observations killed
#> <chr> <int> <dbl>
#> 1 BRONX 9417 10
#> 2 BROOKLYN 16907 27
#> 3 MANHATTAN 7353 9
#> 4 QUEENS 14017 20
#> 5 STATEN ISLAND 1446 6
#> 6 <NA> 25741 72
# select() can select certain columns. select() can also omit certain columns.
# Below I omit a sequence of columns starting with LATITUDE and ending with
# LOCATION.
# Drop the contiguous run of columns from LATITUDE through LOCATION; the
# leading ! negates the range so everything else is kept.
nyc_traffic_accidents <- nyc_traffic_accidents %>%
  select(!(LATITUDE:LOCATION))

# mutate() can create or modify a column.
# replace_na() can replace all instances of NA with a specified value.
# Relabel crashes that have no recorded borough as "NYC" instead of NA.
nyc_traffic_accidents <- nyc_traffic_accidents %>%
  dplyr::mutate(BOROUGH = replace_na(BOROUGH, "NYC"))

# While the other tidyverse packages focus on parsing and wrangling data,
# ggplot2 exists to display it. Having said that, its robust functionality
# includes data shaping: the histogram is essentially a group_by(), as is
# scale_x_date().
# Histogram of crashes by date, labelled "Accidents Per Day".
# binwidth = 1 makes each bar exactly one day on the Date axis; without it
# geom_histogram() falls back to 30 bins, so every bar would lump several days
# together and the title would be misleading.
ggplot(data = nyc_traffic_accidents, aes(x = CRASH.DATE)) +
  geom_histogram(binwidth = 1) +
  # one labelled tick per week, formatted as ISO dates
  scale_x_date(date_breaks = "1 week", date_labels = "%Y-%m-%d") +
  ggtitle("Accidents Per Day") +
  xlab("Date") +
  # rotate the date labels so the weekly ticks do not overlap
  theme(axis.text.x = element_text(angle = 60, hjust = 1))