pacman::p_load(tidyverse, lubridate, plotly, sf, leaflet, tmap, highcharter, ggsci)
The Data: I will be working with a data set provided by the City of Chicago that contains records of all traffic crashes in the City of Chicago under Chicago Police Department (CPD) jurisdiction since 2015. Traffic crashes within the city limits for which CPD is not the responding police agency, typically crashes on interstate highways, freeway ramps, and on local roads along the City boundary, are excluded from this dataset. About half of all crash reports, mostly minor crashes, are self-reported at the police district by the driver(s) involved, and the other half are recorded at the scene by the police officer responding to the crash. (Information taken from the Chicago Open Data Portal)
The City of Chicago is committed to reducing the number of traffic crashes, deaths, and serious injuries in the city. The Chicago Department of Transportation has started an initiative called Vision Zero Chicago, which brings together policies, technology, research, and partnerships to prevent deaths and serious injuries. Vision Zero Chicago acknowledges that traffic deaths are unacceptable and preventable and presents a clear plan for reducing deaths from traffic crashes by 20% citywide by 2020 and reducing serious injuries from traffic crashes by 35% by 2020. (Vision Zero Chicago)
More information can be found here
Want to be part of Montgomery County Vision Zero? Take this survey!
crash <- read_csv("~/Documents/DATA110/Datasets/crash.csv",
                  col_types = cols(RD_NO = col_skip(),
                                   CRASH_DATE_EST_I = col_skip(),
                                   LANE_CNT = col_skip(),
                                   INTERSECTION_RELATED_I = col_skip(),
                                   NOT_RIGHT_OF_WAY_I = col_skip(),
                                   HIT_AND_RUN_I = col_skip(),
                                   PHOTOS_TAKEN_I = col_skip(),
                                   STATEMENTS_TAKEN_I = col_skip(),
                                   DOORING_I = col_skip(),
                                   WORK_ZONE_I = col_skip(),
                                   WORK_ZONE_TYPE = col_skip(),
                                   WORKERS_PRESENT_I = col_skip(),
                                   INJURIES_UNKNOWN = col_skip()))
crash$CRASH_DATE <- mdy_hms(crash$CRASH_DATE)
The hour of the crash is already stored in a separate column (CRASH_HOUR), so I will drop the time component from the datetime column and keep only the date.
crash$CRASH_DATE <- as_date(crash$CRASH_DATE)
min(crash$CRASH_DATE)
## [1] "2013-03-03"
The dataset covers all police districts only from September 2017 onward. Therefore, I will use only data from September 2017 on.
crash <- crash %>%
filter(CRASH_DATE>=(ymd("2017-09-01")))
min(crash$CRASH_DATE)
## [1] "2017-09-01"
I will remove July 2020 because it contains only a couple of days of data.
crash <- crash %>%
filter(CRASH_DATE<(ymd("2020-07-01")))
max(crash$CRASH_DATE)
## [1] "2020-06-30"
count <- crash %>%
group_by(month= month(CRASH_DATE), year= year(CRASH_DATE)) %>%
summarize(n=n())
## `summarise()` regrouping output by 'month' (override with `.groups` argument)
head(count,9)
## # A tibble: 9 x 3
## # Groups: month [3]
## month year n
## <dbl> <dbl> <int>
## 1 1 2018 9526
## 2 1 2019 9081
## 3 1 2020 8628
## 4 2 2018 8705
## 5 2 2019 8586
## 6 2 2020 8968
## 7 3 2018 9277
## 8 3 2019 9710
## 9 3 2020 6635
count <- count %>%
arrange(year)
head(count,9)
## # A tibble: 9 x 3
## # Groups: month [9]
## month year n
## <dbl> <dbl> <int>
## 1 9 2017 9049
## 2 10 2017 10023
## 3 11 2017 9510
## 4 12 2017 10098
## 5 1 2018 9526
## 6 2 2018 8705
## 7 3 2018 9277
## 8 4 2018 9605
## 9 5 2018 10655
count$x <- seq(1,34) #Vector in case I need it for the x axis
count$date <- mdy(paste0(count$month,"-1-",count$year)) #Date Column
count <- count %>%
rename(Date=date, Count=n, Month=month, Year=year)
hc <- highchart(type = "stock") %>%
  hc_add_series(data = count,
                type = "line",
                hcaes(x = Date, y = Count)) %>%
  hc_xAxis(title = list(text = "Date")) %>%
  hc_colors("#007fff") %>%
  hc_plotOptions(series = list(backgroundColor = "#535353")) %>%
  hc_tooltip(borderColor = "black",
             pointFormat = "<b> Crashes</b>: {point.Count}<br>")
hc
The reduction in crashes starting in March 2020 is very clear. It is very likely due to the movement restrictions imposed during the COVID-19 pandemic.
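To put a rough number on that drop, the count tibble built above can be used to compare March through June of 2020 against the same months of 2019 (a quick sketch; output not shown here):

# Total crashes in March-June, 2019 vs 2020 (using the monthly `count` tibble from above)
count %>%
  ungroup() %>%
  filter(Month %in% 3:6, Year %in% c(2019, 2020)) %>%
  group_by(Year) %>%
  summarize(Crashes = sum(Count))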
crash %>%
group_by(speed_limit=POSTED_SPEED_LIMIT) %>%
summarize(n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 37 x 2
## speed_limit n
## <dbl> <int>
## 1 0 3765
## 2 1 16
## 3 2 15
## 4 3 88
## 5 4 1
## 6 5 2393
## 7 6 5
## 8 7 1
## 9 9 26
## 10 10 6480
## # … with 27 more rows
I will use speeds only from 5 to 70 mph in intervals of 5 mph.
slc <- crash %>%
filter(POSTED_SPEED_LIMIT %in% seq(from=5, to=70, by=5)) %>%
group_by(speed_limit=POSTED_SPEED_LIMIT) %>%
summarize(n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
n1 <- slc %>%
ggplot(aes(x=speed_limit,y=n))+
geom_bar(stat = "identity",fill="#68ccff")+
scale_x_continuous(breaks=seq(5,70,5), labels=seq(5,70,5))+
ylab("Crashes")+
xlab("Posted Speed Limit (mph)")+
theme(panel.background = element_rect(fill = "#e8f7ff"), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
n1
By far, most of the crashes happened on streets with a posted speed limit of 30 mph.
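A quick way to quantify that is to compute each speed limit's share of all crashes in slc (a small sketch; output not shown):

# Share of crashes occurring on 30 mph streets
slc %>%
  mutate(share = n / sum(n)) %>%
  filter(speed_limit == 30)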
slc %>%
filter(!speed_limit==30) %>%
ggplot(aes(x=speed_limit,y=n))+
geom_bar(stat = "identity", fill="#68ccff")+
scale_x_continuous(breaks=seq(5,70,5), labels=seq(5,70,5))+
ylab("Crashes")+
xlab("Posted Speed Limit (mph)")+
theme(panel.background = element_rect(fill = "#e8f7ff"),panel.grid.major = element_blank(), panel.grid.minor = element_blank())
slc %>%
filter(speed_limit>=40) %>%
ggplot(aes(x=speed_limit,y=n))+
geom_bar(stat = "identity", fill="#68ccff")+
scale_x_continuous(breaks=seq(40,70,5), labels=seq(40,70,5))+
geom_text(aes(label=n), vjust=-0.2)+
ylab("Crashes")+
xlab("Posted Speed Limit (mph)")+
theme(panel.background = element_rect(fill = "#e8f7ff"),panel.grid.major = element_blank(), panel.grid.minor = element_blank())
It is hard to draw conclusions from this. It is tempting to suggest that, since far fewer crashes happened in 65 mph zones than in 55 mph zones, one strategy to lower the number of crashes on the highways would be to raise the speed limit to 65 mph. However, although I tried, it is very hard to find data on the proportion of streets in Chicago that carry each speed limit, and that is exactly what is needed: if there are only a few miles of streets with a 70 mph speed limit, of course we will see fewer crashes in 70 mph zones, simply because there are only a few miles of road where such a crash could happen. These bar plots are informational only and are not suitable for drawing conclusions.
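If street-mileage data were available, the normalization I have in mind would look roughly like the sketch below. The speed_limit_miles table is hypothetical (I could not find real figures for Chicago), so its values are only placeholders.

# Hypothetical table: miles of street per posted speed limit (placeholder values only)
speed_limit_miles <- tibble(
  speed_limit = seq(5, 70, 5),
  miles = NA_real_  # would come from a street-network data set, which I could not find
)
# Crashes per mile of street would make the speed limits comparable
slc %>%
  left_join(speed_limit_miles, by = "speed_limit") %>%
  mutate(crashes_per_mile = n / miles)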
beatgeo <- st_read("/Users/camilovelezr/Documents/DATA110/Datasets/beat")
## Reading layer `beat' from data source `/Users/camilovelezr/Documents/DATA110/Datasets/beat' using driver `ESRI Shapefile'
## Simple feature collection with 277 features and 4 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -87.94011 ymin: 41.64455 xmax: -87.52414 ymax: 42.02303
## CRS: 4326
crash <- crash %>% rename(beat_num=BEAT_OF_OCCURRENCE)
v1 <- crash$beat_num %>% unique() %>% sort()
v2 <- beatgeo$beat_num %>% as.character() %>% as.numeric() %>% unique() %>% sort()
identical(v1,v2)
## [1] FALSE
v1
## [1] 111 112 113 114 121 122 123 124 131 132 133 211 212 213 214
## [16] 215 221 222 223 224 225 231 232 233 234 235 311 312 313 314
## [31] 321 322 323 324 331 332 333 334 411 412 413 414 421 422 423
## [46] 424 431 432 433 434 511 512 513 522 523 524 531 532 533 611
## [61] 612 613 614 621 622 623 624 631 632 633 634 711 712 713 714
## [76] 715 722 723 724 725 726 731 732 733 734 735 811 812 813 814
## [91] 815 821 822 823 824 825 831 832 833 834 835 911 912 913 914
## [106] 915 921 922 923 924 925 931 932 933 934 935 1011 1012 1013 1014
## [121] 1021 1022 1023 1024 1031 1032 1033 1034 1111 1112 1113 1114 1115 1121 1122
## [136] 1123 1124 1125 1131 1132 1133 1134 1135 1211 1212 1213 1214 1215 1221 1222
## [151] 1223 1224 1225 1231 1232 1233 1234 1235 1411 1412 1413 1414 1421 1422 1423
## [166] 1424 1431 1432 1433 1434 1511 1512 1513 1522 1523 1524 1531 1532 1533 1611
## [181] 1612 1613 1614 1621 1622 1623 1624 1631 1632 1633 1634 1651 1652 1653 1654
## [196] 1655 1711 1712 1713 1722 1723 1724 1731 1732 1733 1811 1812 1813 1814 1821
## [211] 1822 1823 1824 1831 1832 1833 1834 1911 1912 1913 1914 1915 1921 1922 1923
## [226] 1924 1925 1931 1932 1933 1934 1935 2011 2012 2013 2022 2023 2024 2031 2032
## [241] 2033 2211 2212 2213 2221 2222 2223 2232 2233 2234 2411 2412 2413 2422 2423
## [256] 2424 2431 2432 2433 2511 2512 2513 2514 2515 2521 2522 2523 2524 2525 2531
## [271] 2532 2533 2534 2535 6100
v2
## [1] 111 112 113 114 121 122 123 124 131 132 133 211 212 213 214
## [16] 215 221 222 223 224 225 231 232 233 234 235 311 312 313 314
## [31] 321 322 323 324 331 332 333 334 411 412 413 414 421 422 423
## [46] 424 431 432 433 434 511 512 513 522 523 524 531 532 533 611
## [61] 612 613 614 621 622 623 624 631 632 633 634 711 712 713 714
## [76] 715 722 723 724 725 726 731 732 733 734 735 811 812 813 814
## [91] 815 821 822 823 824 825 831 832 833 834 835 911 912 913 914
## [106] 915 921 922 923 924 925 931 932 933 934 935 1011 1012 1013 1014
## [121] 1021 1022 1023 1024 1031 1032 1033 1034 1111 1112 1113 1114 1115 1121 1122
## [136] 1123 1124 1125 1131 1132 1133 1134 1135 1211 1212 1213 1214 1215 1221 1222
## [151] 1223 1224 1225 1231 1232 1233 1234 1235 1411 1412 1413 1414 1421 1422 1423
## [166] 1424 1431 1432 1433 1434 1511 1512 1513 1522 1523 1524 1531 1532 1533 1611
## [181] 1612 1613 1614 1621 1622 1623 1624 1631 1632 1633 1634 1651 1652 1653 1654
## [196] 1655 1711 1712 1713 1722 1723 1724 1731 1732 1733 1811 1812 1813 1814 1821
## [211] 1822 1823 1824 1831 1832 1833 1834 1911 1912 1913 1914 1915 1921 1922 1923
## [226] 1924 1925 1931 1932 1933 1934 1935 2011 2012 2013 2022 2023 2024 2031 2032
## [241] 2033 2211 2212 2213 2221 2222 2223 2232 2233 2234 2411 2412 2413 2422 2423
## [256] 2424 2431 2432 2433 2511 2512 2513 2514 2515 2521 2522 2523 2524 2525 2531
## [271] 2532 2533 2534 2535 3100
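Rather than scanning the two long vectors by eye, a setdiff in each direction shows exactly where they disagree (a quick check using v1 and v2 from above):

setdiff(v1, v2) # in the crash data but not in the shapefile: 6100
setdiff(v2, v1) # in the shapefile but not in the crash data: 3100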
# Hypothesis: 3100 is written as 6100 in the crash data set
# Data from the CPD of Beats as a data frame
bb <- rio::import("/Users/camilovelezr/Documents/DATA110/Datasets/PoliceBeatDec2012.csv")
bb$BEAT_NUM %>% unique() %>% sort() #CPD Data. No beat 6100
## [1] 111 112 113 114 121 122 123 124 131 132 133 211 212 213 214
## [16] 215 221 222 223 224 225 231 232 233 234 235 311 312 313 314
## [31] 321 322 323 324 331 332 333 334 411 412 413 414 421 422 423
## [46] 424 431 432 433 434 511 512 513 522 523 524 531 532 533 611
## [61] 612 613 614 621 622 623 624 631 632 633 634 711 712 713 714
## [76] 715 722 723 724 725 726 731 732 733 734 735 811 812 813 814
## [91] 815 821 822 823 824 825 831 832 833 834 835 911 912 913 914
## [106] 915 921 922 923 924 925 931 932 933 934 935 1011 1012 1013 1014
## [121] 1021 1022 1023 1024 1031 1032 1033 1034 1111 1112 1113 1114 1115 1121 1122
## [136] 1123 1124 1125 1131 1132 1133 1134 1135 1211 1212 1213 1214 1215 1221 1222
## [151] 1223 1224 1225 1231 1232 1233 1234 1235 1411 1412 1413 1414 1421 1422 1423
## [166] 1424 1431 1432 1433 1434 1511 1512 1513 1522 1523 1524 1531 1532 1533 1611
## [181] 1612 1613 1614 1621 1622 1623 1624 1631 1632 1633 1634 1651 1652 1653 1654
## [196] 1655 1711 1712 1713 1722 1723 1724 1731 1732 1733 1811 1812 1813 1814 1821
## [211] 1822 1823 1824 1831 1832 1833 1834 1911 1912 1913 1914 1915 1921 1922 1923
## [226] 1924 1925 1931 1932 1933 1934 1935 2011 2012 2013 2022 2023 2024 2031 2032
## [241] 2033 2211 2212 2213 2221 2222 2223 2232 2233 2234 2411 2412 2413 2422 2423
## [256] 2424 2431 2432 2433 2511 2512 2513 2514 2515 2521 2522 2523 2524 2525 2531
## [271] 2532 2533 2534 2535 3100
crash %>% filter(beat_num==6100)
## # A tibble: 1 x 36
## CRASH_RECORD_ID CRASH_DATE POSTED_SPEED_LI… TRAFFIC_CONTROL… DEVICE_CONDITION
## <chr> <date> <dbl> <chr> <chr>
## 1 f9bc29bde650a2… 2020-01-26 0 UNKNOWN UNKNOWN
## # … with 31 more variables: WEATHER_CONDITION <chr>, LIGHTING_CONDITION <chr>,
## # FIRST_CRASH_TYPE <chr>, TRAFFICWAY_TYPE <chr>, ALIGNMENT <chr>,
## # ROADWAY_SURFACE_COND <chr>, ROAD_DEFECT <chr>, REPORT_TYPE <chr>,
## # CRASH_TYPE <chr>, DAMAGE <chr>, DATE_POLICE_NOTIFIED <chr>,
## # PRIM_CONTRIBUTORY_CAUSE <chr>, SEC_CONTRIBUTORY_CAUSE <chr>,
## # STREET_NO <dbl>, STREET_DIRECTION <chr>, STREET_NAME <chr>, beat_num <dbl>,
## # NUM_UNITS <dbl>, MOST_SEVERE_INJURY <chr>, INJURIES_TOTAL <dbl>,
## # INJURIES_FATAL <dbl>, INJURIES_INCAPACITATING <dbl>,
## # INJURIES_NON_INCAPACITATING <dbl>, INJURIES_REPORTED_NOT_EVIDENT <dbl>,
## # INJURIES_NO_INDICATION <dbl>, CRASH_HOUR <dbl>, CRASH_DAY_OF_WEEK <dbl>,
## # CRASH_MONTH <dbl>, LATITUDE <dbl>, LONGITUDE <dbl>, LOCATION <chr>
That row has too much missing information, so I will delete it from the data set. It is also problematic because there is actually no beat 6100 in the Chicago Police Department.
crash <- crash %>% filter(!beat_num==6100)
v1 <- crash$beat_num %>% unique() %>% sort()
v2 <- beatgeo$beat_num %>% as.character() %>% as.numeric() %>% unique() %>% sort()
identical(v1,v2)
## [1] FALSE
v1
## [1] 111 112 113 114 121 122 123 124 131 132 133 211 212 213 214
## [16] 215 221 222 223 224 225 231 232 233 234 235 311 312 313 314
## [31] 321 322 323 324 331 332 333 334 411 412 413 414 421 422 423
## [46] 424 431 432 433 434 511 512 513 522 523 524 531 532 533 611
## [61] 612 613 614 621 622 623 624 631 632 633 634 711 712 713 714
## [76] 715 722 723 724 725 726 731 732 733 734 735 811 812 813 814
## [91] 815 821 822 823 824 825 831 832 833 834 835 911 912 913 914
## [106] 915 921 922 923 924 925 931 932 933 934 935 1011 1012 1013 1014
## [121] 1021 1022 1023 1024 1031 1032 1033 1034 1111 1112 1113 1114 1115 1121 1122
## [136] 1123 1124 1125 1131 1132 1133 1134 1135 1211 1212 1213 1214 1215 1221 1222
## [151] 1223 1224 1225 1231 1232 1233 1234 1235 1411 1412 1413 1414 1421 1422 1423
## [166] 1424 1431 1432 1433 1434 1511 1512 1513 1522 1523 1524 1531 1532 1533 1611
## [181] 1612 1613 1614 1621 1622 1623 1624 1631 1632 1633 1634 1651 1652 1653 1654
## [196] 1655 1711 1712 1713 1722 1723 1724 1731 1732 1733 1811 1812 1813 1814 1821
## [211] 1822 1823 1824 1831 1832 1833 1834 1911 1912 1913 1914 1915 1921 1922 1923
## [226] 1924 1925 1931 1932 1933 1934 1935 2011 2012 2013 2022 2023 2024 2031 2032
## [241] 2033 2211 2212 2213 2221 2222 2223 2232 2233 2234 2411 2412 2413 2422 2423
## [256] 2424 2431 2432 2433 2511 2512 2513 2514 2515 2521 2522 2523 2524 2525 2531
## [271] 2532 2533 2534 2535
v2
## [1] 111 112 113 114 121 122 123 124 131 132 133 211 212 213 214
## [16] 215 221 222 223 224 225 231 232 233 234 235 311 312 313 314
## [31] 321 322 323 324 331 332 333 334 411 412 413 414 421 422 423
## [46] 424 431 432 433 434 511 512 513 522 523 524 531 532 533 611
## [61] 612 613 614 621 622 623 624 631 632 633 634 711 712 713 714
## [76] 715 722 723 724 725 726 731 732 733 734 735 811 812 813 814
## [91] 815 821 822 823 824 825 831 832 833 834 835 911 912 913 914
## [106] 915 921 922 923 924 925 931 932 933 934 935 1011 1012 1013 1014
## [121] 1021 1022 1023 1024 1031 1032 1033 1034 1111 1112 1113 1114 1115 1121 1122
## [136] 1123 1124 1125 1131 1132 1133 1134 1135 1211 1212 1213 1214 1215 1221 1222
## [151] 1223 1224 1225 1231 1232 1233 1234 1235 1411 1412 1413 1414 1421 1422 1423
## [166] 1424 1431 1432 1433 1434 1511 1512 1513 1522 1523 1524 1531 1532 1533 1611
## [181] 1612 1613 1614 1621 1622 1623 1624 1631 1632 1633 1634 1651 1652 1653 1654
## [196] 1655 1711 1712 1713 1722 1723 1724 1731 1732 1733 1811 1812 1813 1814 1821
## [211] 1822 1823 1824 1831 1832 1833 1834 1911 1912 1913 1914 1915 1921 1922 1923
## [226] 1924 1925 1931 1932 1933 1934 1935 2011 2012 2013 2022 2023 2024 2031 2032
## [241] 2033 2211 2212 2213 2221 2222 2223 2232 2233 2234 2411 2412 2413 2422 2423
## [256] 2424 2431 2432 2433 2511 2512 2513 2514 2515 2521 2522 2523 2524 2525 2531
## [271] 2532 2533 2534 2535 3100
beatgeo$beat_num <- beatgeo$beat_num %>% as.character() %>% as.numeric()
beatgeo$beat_num %>% head()
## [1] 1713 3100 1651 1914 1915 1913
beatgeo <- beatgeo %>%
arrange(beat_num)
length(beatgeo$beat_num)
## [1] 277
beatgeo$beat_num %>% unique() %>% length
## [1] 275
There are 277 rows in the data set but only 275 unique values, so I need to find which values are repeated.
duplicated(beatgeo$beat_num) %>% which()
## [1] 276 277
beatgeo$beat_num[275:277]
## [1] 3100 3100 3100
beatgeo3100 <- beatgeo %>% filter(beat_num==3100) # Keep only the duplicated beat 3100 so it can be highlighted on the map
tm_shape(beatgeo) +
tm_borders()+
tm_shape(beatgeo3100)+
tm_fill(col = "#ff2052",alpha=0.65)
Map of Beats and Areas
beatgeo <- beatgeo %>% filter(!beat_num==3100)
beatcount <- crash %>%
group_by(beat_num) %>%
summarize(count=n()) %>%
arrange(beat_num) # Creating a data set with the count of crashes in each beat
## `summarise()` ungrouping output (override with `.groups` argument)
identical(beatgeo$beat_num, beatcount$beat_num)
## [1] TRUE
beatmap <- merge(beatgeo,beatcount)
beatmap <- beatmap %>%
rename(Crashes=count)
qtm(beatmap)
qtm(beatmap,"Crashes")
beatmap <- st_transform(beatmap, "+proj=longlat +datum=WGS84") %>% st_zm()
cc <- colorNumeric(palette = "Blues", domain=beatmap$Crashes)
leaflet(beatmap) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(stroke=F,
smoothFactor = 0.2,
fillOpacity = .8,
color= ~cc(beatmap$Crashes))
There is no distinct pattern indicating general areas where crashes are more likely to occur. The number of crashes seems to be fairly evenly distributed among the beats, with only one beat having more than 4000 crashes.
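A quick filter on beatcount identifies that outlier beat (a small check; output not shown here):

# Which beat has more than 4000 crashes?
beatcount %>% filter(count > 4000)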
crash_with_injuries <- crash %>%
  mutate(Injury = INJURIES_TOTAL > 0) # Not a filter of crashes with injuries: adds a logical column indicating whether the crash involved any injuries
crash_with_injuries$WEATHER_CONDITION %>% unique()
## [1] "CLEAR" "RAIN"
## [3] "SNOW" "SLEET/HAIL"
## [5] "UNKNOWN" "CLOUDY/OVERCAST"
## [7] "FOG/SMOKE/HAZE" "FREEZING RAIN/DRIZZLE"
## [9] "SEVERE CROSS WIND GATE" "OTHER"
## [11] "BLOWING SNOW" "BLOWING SAND, SOIL, DIRT"
crash_with_injuries <- crash_with_injuries %>% filter(!WEATHER_CONDITION %in% c("UNKNOWN", "OTHER"))
crash_with_injuries$WEATHER_CONDITION <- crash_with_injuries$WEATHER_CONDITION %>% as.factor()
crash_with_injuries$WEATHER_CONDITION %>% levels()
## [1] "BLOWING SAND, SOIL, DIRT" "BLOWING SNOW"
## [3] "CLEAR" "CLOUDY/OVERCAST"
## [5] "FOG/SMOKE/HAZE" "FREEZING RAIN/DRIZZLE"
## [7] "RAIN" "SEVERE CROSS WIND GATE"
## [9] "SLEET/HAIL" "SNOW"
crash_with_injuries$CRASH_DAY_OF_WEEK <- weekdays(crash_with_injuries$CRASH_DATE)
crash_with_injuries$CRASH_DAY_OF_WEEK <- crash_with_injuries$CRASH_DAY_OF_WEEK %>% as.factor()
crash_with_injuries$CRASH_DAY_OF_WEEK %>% levels()
## [1] "Friday" "Monday" "Saturday" "Sunday" "Thursday" "Tuesday"
## [7] "Wednesday"
I want to create 3 levels:
+ Early AM: 12 AM - 5 AM
+ AM: 6 AM - 12 PM
+ PM: 1 PM - 11 PM
crash_with_injuries$Time_Level <- ifelse(crash_with_injuries$CRASH_HOUR %in% 0:5, "EARLY AM",
                                         ifelse(crash_with_injuries$CRASH_HOUR %in% 6:12, "AM", "PM"))
crash_with_injuries$Time_Level <- crash_with_injuries$Time_Level %>% as.factor()
crash_with_injuries$Time_Level %>% levels()
## [1] "AM" "EARLY AM" "PM"
crash_with_injuries$LIGHTING_CONDITION %>% unique()
## [1] "DARKNESS" "DAYLIGHT" "DARKNESS, LIGHTED ROAD"
## [4] "DAWN" "DUSK" "UNKNOWN"
crash_with_injuries <- crash_with_injuries %>%
filter(!LIGHTING_CONDITION=="UNKNOWN")
crash_with_injuries$LIGHTING_CONDITION <- crash_with_injuries$LIGHTING_CONDITION %>% as.factor()
crash_with_injuries$LIGHTING_CONDITION %>% levels()
## [1] "DARKNESS" "DARKNESS, LIGHTED ROAD" "DAWN"
## [4] "DAYLIGHT" "DUSK"
I will start by creating a logistic regression model predicting the probability of a crash involving injuries with these predictors:
log1 <- glm(Injury~ LIGHTING_CONDITION + WEATHER_CONDITION + CRASH_DAY_OF_WEEK + Time_Level, data=crash_with_injuries, family = binomial)
summary(log1)
##
## Call:
## glm(formula = Injury ~ LIGHTING_CONDITION + WEATHER_CONDITION +
## CRASH_DAY_OF_WEEK + Time_Level, family = binomial, data = crash_with_injuries)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7243 -0.5787 -0.5318 -0.5242 2.1125
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.612363 26.664409 -0.285 0.775270
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD 0.323670 0.026239 12.336 < 2e-16
## LIGHTING_CONDITIONDAWN 0.168805 0.044366 3.805 0.000142
## LIGHTING_CONDITIONDAYLIGHT 0.036968 0.025917 1.426 0.153759
## LIGHTING_CONDITIONDUSK 0.146114 0.037799 3.866 0.000111
## WEATHER_CONDITIONBLOWING SNOW 5.783911 26.666639 0.217 0.828289
## WEATHER_CONDITIONCLEAR 5.660026 26.664389 0.212 0.831897
## WEATHER_CONDITIONCLOUDY/OVERCAST 5.857262 26.664402 0.220 0.826131
## WEATHER_CONDITIONFOG/SMOKE/HAZE 5.878696 26.664590 0.220 0.825506
## WEATHER_CONDITIONFREEZING RAIN/DRIZZLE 5.940347 26.664723 0.223 0.823707
## WEATHER_CONDITIONRAIN 5.886063 26.664393 0.221 0.825290
## WEATHER_CONDITIONSEVERE CROSS WIND GATE 5.594165 26.667482 0.210 0.833843
## WEATHER_CONDITIONSLEET/HAIL 5.945672 26.664628 0.223 0.823551
## WEATHER_CONDITIONSNOW 5.501027 26.664402 0.206 0.836552
## CRASH_DAY_OF_WEEKMonday 0.005810 0.019261 0.302 0.762940
## CRASH_DAY_OF_WEEKSaturday -0.006293 0.018980 -0.332 0.740228
## CRASH_DAY_OF_WEEKSunday 0.020690 0.020023 1.033 0.301474
## CRASH_DAY_OF_WEEKThursday 0.031874 0.018986 1.679 0.093187
## CRASH_DAY_OF_WEEKTuesday 0.014323 0.019019 0.753 0.451408
## CRASH_DAY_OF_WEEKWednesday 0.057262 0.018974 3.018 0.002545
## Time_LevelEARLY AM 0.081481 0.022648 3.598 0.000321
## Time_LevelPM 0.016440 0.012608 1.304 0.192277
##
## (Intercept)
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD ***
## LIGHTING_CONDITIONDAWN ***
## LIGHTING_CONDITIONDAYLIGHT
## LIGHTING_CONDITIONDUSK ***
## WEATHER_CONDITIONBLOWING SNOW
## WEATHER_CONDITIONCLEAR
## WEATHER_CONDITIONCLOUDY/OVERCAST
## WEATHER_CONDITIONFOG/SMOKE/HAZE
## WEATHER_CONDITIONFREEZING RAIN/DRIZZLE
## WEATHER_CONDITIONRAIN
## WEATHER_CONDITIONSEVERE CROSS WIND GATE
## WEATHER_CONDITIONSLEET/HAIL
## WEATHER_CONDITIONSNOW
## CRASH_DAY_OF_WEEKMonday
## CRASH_DAY_OF_WEEKSaturday
## CRASH_DAY_OF_WEEKSunday
## CRASH_DAY_OF_WEEKThursday .
## CRASH_DAY_OF_WEEKTuesday
## CRASH_DAY_OF_WEEKWednesday **
## Time_LevelEARLY AM ***
## Time_LevelPM
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 245399 on 297455 degrees of freedom
## Residual deviance: 244409 on 297434 degrees of freedom
## (2330 observations deleted due to missingness)
## AIC: 244453
##
## Number of Fisher Scoring iterations: 6
Surprisingly, weather condition is not at all significant as a predictor of the probability of injuries occurring in a crash: every weather level has a p-value greater than 0.82.
log2 <- glm(Injury~ LIGHTING_CONDITION + CRASH_DAY_OF_WEEK + Time_Level, data=crash_with_injuries, family = binomial)
summary(log2)
##
## Call:
## glm(formula = Injury ~ LIGHTING_CONDITION + CRASH_DAY_OF_WEEK +
## Time_Level, family = binomial, data = crash_with_injuries)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6460 -0.5480 -0.5372 -0.5320 2.0285
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.919130 0.029641 -64.745 < 2e-16
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD 0.324167 0.026222 12.362 < 2e-16
## LIGHTING_CONDITIONDAWN 0.176020 0.044325 3.971 7.15e-05
## LIGHTING_CONDITIONDAYLIGHT 0.029106 0.025838 1.126 0.25996
## LIGHTING_CONDITIONDUSK 0.152535 0.037765 4.039 5.37e-05
## CRASH_DAY_OF_WEEKMonday 0.006336 0.019243 0.329 0.74196
## CRASH_DAY_OF_WEEKSaturday -0.001503 0.018956 -0.079 0.93679
## CRASH_DAY_OF_WEEKSunday 0.025218 0.019997 1.261 0.20728
## CRASH_DAY_OF_WEEKThursday 0.031588 0.018975 1.665 0.09597
## CRASH_DAY_OF_WEEKTuesday 0.017987 0.019007 0.946 0.34397
## CRASH_DAY_OF_WEEKWednesday 0.060433 0.018962 3.187 0.00144
## Time_LevelEARLY AM 0.073741 0.022600 3.263 0.00110
## Time_LevelPM 0.009311 0.012565 0.741 0.45869
##
## (Intercept) ***
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD ***
## LIGHTING_CONDITIONDAWN ***
## LIGHTING_CONDITIONDAYLIGHT
## LIGHTING_CONDITIONDUSK ***
## CRASH_DAY_OF_WEEKMonday
## CRASH_DAY_OF_WEEKSaturday
## CRASH_DAY_OF_WEEKSunday
## CRASH_DAY_OF_WEEKThursday .
## CRASH_DAY_OF_WEEKTuesday
## CRASH_DAY_OF_WEEKWednesday **
## Time_LevelEARLY AM **
## Time_LevelPM
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 245399 on 297455 degrees of freedom
## Residual deviance: 244691 on 297443 degrees of freedom
## (2330 observations deleted due to missingness)
## AIC: 244717
##
## Number of Fisher Scoring iterations: 4
Apparently, Wednesday has a very significant effect on the probability of injuries; the other days appear insignificant.
With regard to time of day, only Early AM has a significant effect as a predictor.
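Before settling on the reduced predictor set, a likelihood-ratio test between the model with weather (log1) and the model without it (log2) can confirm that dropping WEATHER_CONDITION does not meaningfully change the fit (a quick sketch; output not shown):

# Likelihood-ratio (chi-squared) test of the nested models
anova(log2, log1, test = "Chisq")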
crash_with_injuries$Is_Wednesday <- crash_with_injuries$CRASH_DAY_OF_WEEK=="Wednesday"
crash_with_injuries$Is_Early_AM <- crash_with_injuries$Time_Level=="EARLY AM"
log3 <- glm(Injury~ LIGHTING_CONDITION + Is_Early_AM + Is_Wednesday, data=crash_with_injuries, family = binomial)
summary(log3)
##
## Call:
## glm(formula = Injury ~ LIGHTING_CONDITION + Is_Early_AM + Is_Wednesday,
## family = binomial, data = crash_with_injuries)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6462 -0.5468 -0.5349 -0.5349 2.0189
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.89836 0.02459 -77.192 < 2e-16
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD 0.32426 0.02622 12.367 < 2e-16
## LIGHTING_CONDITIONDAWN 0.17330 0.04411 3.929 8.53e-05
## LIGHTING_CONDITIONDAYLIGHT 0.02616 0.02534 1.032 0.301873
## LIGHTING_CONDITIONDUSK 0.15340 0.03776 4.063 4.85e-05
## Is_Early_AMTRUE 0.06639 0.01928 3.443 0.000575
## Is_WednesdayTRUE 0.04757 0.01488 3.197 0.001387
##
## (Intercept) ***
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD ***
## LIGHTING_CONDITIONDAWN ***
## LIGHTING_CONDITIONDAYLIGHT
## LIGHTING_CONDITIONDUSK ***
## Is_Early_AMTRUE ***
## Is_WednesdayTRUE **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 245399 on 297455 degrees of freedom
## Residual deviance: 244697 on 297449 degrees of freedom
## (2330 observations deleted due to missingness)
## AIC: 244711
##
## Number of Fisher Scoring iterations: 4
We are left with only one variable that does not have a significant effect as a predictor of the probability of a crash involving at least one injury: Lighting Condition: Daylight.
crash_with_injuries_nodaylight <- crash_with_injuries %>%
filter(!LIGHTING_CONDITION=="DAYLIGHT")
crash_with_injuries_nodaylight$LIGHTING_CONDITION %>% unique()
## [1] DARKNESS DARKNESS, LIGHTED ROAD DAWN
## [4] DUSK
## Levels: DARKNESS DARKNESS, LIGHTED ROAD DAWN DAYLIGHT DUSK
log4 <- glm(Injury~ LIGHTING_CONDITION + Is_Early_AM + Is_Wednesday, data=crash_with_injuries_nodaylight, family = binomial)
summary(log4)
##
## Call:
## glm(formula = Injury ~ LIGHTING_CONDITION + Is_Early_AM + Is_Wednesday,
## family = binomial, data = crash_with_injuries_nodaylight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6595 -0.6102 -0.6102 -0.5258 2.0234
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.90874 0.02482 -76.906 < 2e-16
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD 0.32219 0.02623 12.285 < 2e-16
## LIGHTING_CONDITIONDAWN 0.17328 0.04411 3.928 8.56e-05
## LIGHTING_CONDITIONDUSK 0.15863 0.03778 4.198 2.69e-05
## Is_Early_AMTRUE 0.09713 0.02004 4.847 1.25e-06
## Is_WednesdayTRUE 0.07436 0.02618 2.840 0.00451
##
## (Intercept) ***
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD ***
## LIGHTING_CONDITIONDAWN ***
## LIGHTING_CONDITIONDUSK ***
## Is_Early_AMTRUE ***
## Is_WednesdayTRUE **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 86565 on 96666 degrees of freedom
## Residual deviance: 86344 on 96661 degrees of freedom
## (960 observations deleted due to missingness)
## AIC: 86356
##
## Number of Fisher Scoring iterations: 4
All of our predictors are now statistically significant: every p-value is smaller than 0.01.
Let \(p_i\) be the probability that the crash involves at least one injury. Then
\[
\log\left(\frac{p_i}{1-p_i}\right) = -1.90874 + 0.32219\,\text{darkness.lighted.road} + 0.17328\,\text{dawn} + 0.15863\,\text{dusk} + 0.09713\,\text{is.early.am} + 0.07436\,\text{is.wednesday}
\]
darkness.lighted.road takes the value 1 if the lighting condition is Darkness, Lighted Road, and 0 otherwise; the same applies to the dawn and dusk variables.
is.early.am takes the value 1 if the crash happened during the period defined as Early AM (12 AM to 5 AM), and 0 otherwise.
is.wednesday takes the value 1 if the crash happened on a Wednesday, and 0 otherwise.
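Equivalently, exponentiating the coefficients turns them into odds ratios, which are easier to read: for example, exp(0.32219) ≈ 1.38, meaning crashes on dark but lighted roads have roughly 38% higher odds of involving an injury than crashes in the reference condition (DARKNESS). A quick sketch using the log4 model above:

# Odds ratios for the final model
exp(coef(log4))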
It is interesting that in these results the weather conditions had such high p-values. It is also interesting that, setting the p-values aside for a moment and focusing on the coefficients, factors such as rain have a positive and quite large coefficient. Studies suggest that rain-related accidents show a consistent (and significant) decrease in severity when compared with accidents in fine weather (Edwards, 1998).
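As a rough sanity check on that observation, the raw injury rate by weather condition can be tabulated directly from crash_with_injuries (an exploratory sketch, not part of the models above; output not shown):

# Injury rate and crash count by weather condition
crash_with_injuries %>%
  group_by(WEATHER_CONDITION) %>%
  summarize(crashes = n(), injury_rate = mean(Injury, na.rm = TRUE)) %>%
  arrange(desc(injury_rate))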
crash_with_injuries <- crash %>%
  mutate(Injury = INJURIES_TOTAL > 0) # Rebuild the logical injury flag (not a filter)
crash_with_injuries$CRASH_DAY_OF_WEEK <- weekdays(crash_with_injuries$CRASH_DATE)
wed.crashes <- crash_with_injuries %>%
filter(Injury==T) %>%
filter(CRASH_DAY_OF_WEEK=="Wednesday") %>%
group_by(Month= month(CRASH_DATE), Year= year(CRASH_DATE)) %>%
summarize(Count_W=n())
## `summarise()` regrouping output by 'Month' (override with `.groups` argument)
no.wed.crashes <- crash_with_injuries %>%
filter(Injury==T) %>%
filter(!CRASH_DAY_OF_WEEK=="Wednesday") %>%
group_by(Month= month(CRASH_DATE), Year= year(CRASH_DATE)) %>%
summarize(Count_NW=ceiling(n()/6))
## `summarise()` regrouping output by 'Month' (override with `.groups` argument)
For crashes with injuries that happened on days other than Wednesday, I computed a comparable per-day figure for each month by taking that month's total number of crashes with injuries (excluding Wednesdays) and dividing it by 6, the number of non-Wednesday days of the week.
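Dividing by 6 assumes each non-Wednesday day of the week occurs equally often within a month, which is only approximately true. An alternative sketch that averages over the actual per-day-of-week counts would look like this (I kept the simpler version above):

no.wed.alt <- crash_with_injuries %>%
  filter(Injury == TRUE, CRASH_DAY_OF_WEEK != "Wednesday") %>%
  group_by(Month = month(CRASH_DATE), Year = year(CRASH_DATE), CRASH_DAY_OF_WEEK) %>%
  summarize(n = n(), .groups = "drop") %>%        # crashes with injuries per weekday per month
  group_by(Month, Year) %>%
  summarize(Count_NW = mean(n), .groups = "drop") # average over the six non-Wednesday weekdays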
wcount <- merge(wed.crashes,no.wed.crashes)
wcount %>% head(4)
## Month Year Count_W Count_NW
## 1 1 2018 254 180
## 2 1 2019 162 150
## 3 1 2020 202 162
## 4 10 2017 217 219
wcount$Date <- mdy(paste0(wcount$Month,"-1-",wcount$Year))
p2 <- wcount %>%
ggplot()+
geom_point(aes(x=Date,y=Count_W) ,color="#33ccff")+
geom_point(aes(x=Date,y=Count_NW),color="#ff4040")+
geom_line(aes(x=Date,y=Count_W),color="#33ccff")+
geom_line(aes(x=Date,y=Count_NW),color="#ff4040")+
geom_smooth(aes(x=Date,y=Count_W),color="#33ccff", linetype="twodash", se=F)+
geom_smooth(aes(x=Date,y=Count_NW),color="#ff4040", linetype="dashed", se=F)+
theme_dark()+
theme(legend.direction = "horizontal",legend.position ="top",panel.background = element_rect(fill = "#535353"),
legend.key = element_rect(fill = "#535353"))+
scale_color_tron()+
ylab("Crashes with Injuries")+
labs(title = "Crashes with Injuries on Wednesday vs Rest of Week",
caption = "Dashed lines represent loess regression lines")
p2
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
wed.crashes$W <- T
no.wed.crashes$W <- F
new_wcount <- rbind(wed.crashes,no.wed.crashes)
new_wcount$Count <- rep(NA,68)
new_wcount$Count[1:34] <- new_wcount$Count_W[1:34]
new_wcount$Count[35:68] <- new_wcount$Count_NW[35:68]
long_wcount <- new_wcount %>%
select(Month, Year, Count, W)
long_wcount$Date <- mdy(paste0(long_wcount$Month,"-1-",long_wcount$Year)) #Date Column
p3 <- long_wcount %>%
ggplot()+
geom_point(aes(x=Date,y=Count,col=W))+
geom_line(aes(x=Date,y=Count,col=W))+
geom_smooth(aes(x=Date,y=Count,col=W), se=F, method = "loess", linetype="twodash", formula = "y~x")+
theme_dark()+
theme(legend.background=element_rect(fill="#9f9f9f"),legend.position = c(0.88, 0.8),panel.background = element_rect(fill = "#535353"),
legend.key = element_rect(fill = "#535353"), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank())+
scale_color_tron()+
ylab("Crashes with Injuries")+
labs(title = "Crashes with Injuries on Wednesday vs Rest of Week",
caption = "Dashed lines represent loess regression lines")+
labs(col="Is Wednesday?")
p3
From the graph it could be interpreted that there is indeed, on average, a higher number of crashes with injuries on Wednesdays than on the rest of the week. It is interesting to see that from late 2018 to mid 2019 the trend of both groups (Wednesday and other days) was very similar. It also becomes clear that the change in behavior from February 2020, due to the movement restrictions during the COVID-19 pandemic, is influencing the pattern seen in the following months. It seems as if Wednesday crash injuries were actually trending lower in 2020.
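To check whether the apparent Wednesday excess is more than noise, one option is a paired comparison of the monthly counts in wcount (a minimal sketch; I did not run this as a formal part of the analysis):

# Paired t-test: Wednesday count vs. average non-Wednesday count, paired by month
t.test(wcount$Count_W, wcount$Count_NW, paired = TRUE)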
Traffic Crashes Data and Chicago Police Department Beats Data: taken from Chicago Open Data Portal.
Edwards, J. B. (1998). The relationship between road accident severity and recorded weather. Journal of Safety Research, 29(4), 249-262.