Project Part 1 - Dataset and Motivation

DSA406_001_SP25_FP1_cisrael

Author

Chloe Israel

Published

February 18, 2025

Loading Libraries

# Load the tidyverse library
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.4.2
Warning: package 'forcats' was built under R version 4.4.2
Warning: package 'lubridate' was built under R version 4.4.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Reading in the Dataset

# Read the raw crash records from the project's data folder into a data frame
accidents_data_raw <- read.csv(file = "data/traffic_accidents.csv")

Dataset Inspection

# Preview the first six rows to see the columns and some typical values
head(accidents_data_raw, n = 6L)
              crash_date traffic_control_device weather_condition
1 07/29/2023 01:00:00 PM         TRAFFIC SIGNAL             CLEAR
2 08/13/2023 12:11:00 AM         TRAFFIC SIGNAL             CLEAR
3 12/09/2021 10:30:00 AM         TRAFFIC SIGNAL             CLEAR
4 08/09/2023 07:55:00 PM         TRAFFIC SIGNAL             CLEAR
5 08/19/2023 02:55:00 PM         TRAFFIC SIGNAL             CLEAR
6 09/06/2023 12:59:00 AM            NO CONTROLS              RAIN
      lighting_condition first_crash_type trafficway_type          alignment
1               DAYLIGHT          TURNING     NOT DIVIDED STRAIGHT AND LEVEL
2 DARKNESS, LIGHTED ROAD          TURNING        FOUR WAY STRAIGHT AND LEVEL
3               DAYLIGHT         REAR END  T-INTERSECTION STRAIGHT AND LEVEL
4               DAYLIGHT            ANGLE        FOUR WAY STRAIGHT AND LEVEL
5               DAYLIGHT         REAR END  T-INTERSECTION STRAIGHT AND LEVEL
6 DARKNESS, LIGHTED ROAD     FIXED OBJECT     NOT DIVIDED STRAIGHT AND LEVEL
  roadway_surface_cond road_defect                       crash_type
1              UNKNOWN     UNKNOWN           NO INJURY / DRIVE AWAY
2                  DRY  NO DEFECTS           NO INJURY / DRIVE AWAY
3                  DRY  NO DEFECTS           NO INJURY / DRIVE AWAY
4                  DRY  NO DEFECTS INJURY AND / OR TOW DUE TO CRASH
5              UNKNOWN     UNKNOWN           NO INJURY / DRIVE AWAY
6                  WET     UNKNOWN INJURY AND / OR TOW DUE TO CRASH
  intersection_related_i        damage             prim_contributory_cause
1                      Y $501 - $1,500                 UNABLE TO DETERMINE
2                      Y   OVER $1,500          IMPROPER TURNING/NO SIGNAL
3                      Y $501 - $1,500               FOLLOWING TOO CLOSELY
4                      Y   OVER $1,500                 UNABLE TO DETERMINE
5                      Y $501 - $1,500 DRIVING SKILLS/KNOWLEDGE/EXPERIENCE
6                      N $501 - $1,500                 UNABLE TO DETERMINE
  num_units       most_severe_injury injuries_total injuries_fatal
1         2  NO INDICATION OF INJURY              0              0
2         2  NO INDICATION OF INJURY              0              0
3         3  NO INDICATION OF INJURY              0              0
4         2 NONINCAPACITATING INJURY              5              0
5         2  NO INDICATION OF INJURY              0              0
6         1 NONINCAPACITATING INJURY              2              0
  injuries_incapacitating injuries_non_incapacitating
1                       0                           0
2                       0                           0
3                       0                           0
4                       0                           5
5                       0                           0
6                       0                           2
  injuries_reported_not_evident injuries_no_indication crash_hour
1                             0                      3         13
2                             0                      2          0
3                             0                      3         10
4                             0                      0         19
5                             0                      3         14
6                             0                      0          0
  crash_day_of_week crash_month
1                 7           7
2                 1           8
3                 5          12
4                 4           8
5                 7           8
6                 4           9
# Get the dataset dimensions (number of rows, number of columns) and column names
dim(accidents_data_raw)
[1] 209306     24
# List every column name in the data frame
names(accidents_data_raw)
 [1] "crash_date"                    "traffic_control_device"       
 [3] "weather_condition"             "lighting_condition"           
 [5] "first_crash_type"              "trafficway_type"              
 [7] "alignment"                     "roadway_surface_cond"         
 [9] "road_defect"                   "crash_type"                   
[11] "intersection_related_i"        "damage"                       
[13] "prim_contributory_cause"       "num_units"                    
[15] "most_severe_injury"            "injuries_total"               
[17] "injuries_fatal"                "injuries_incapacitating"      
[19] "injuries_non_incapacitating"   "injuries_reported_not_evident"
[21] "injuries_no_indication"        "crash_hour"                   
[23] "crash_day_of_week"             "crash_month"                  
# Inspect the dataset's structure: each column's type and its first few values
str(accidents_data_raw)
'data.frame':   209306 obs. of  24 variables:
 $ crash_date                   : chr  "07/29/2023 01:00:00 PM" "08/13/2023 12:11:00 AM" "12/09/2021 10:30:00 AM" "08/09/2023 07:55:00 PM" ...
 $ traffic_control_device       : chr  "TRAFFIC SIGNAL" "TRAFFIC SIGNAL" "TRAFFIC SIGNAL" "TRAFFIC SIGNAL" ...
 $ weather_condition            : chr  "CLEAR" "CLEAR" "CLEAR" "CLEAR" ...
 $ lighting_condition           : chr  "DAYLIGHT" "DARKNESS, LIGHTED ROAD" "DAYLIGHT" "DAYLIGHT" ...
 $ first_crash_type             : chr  "TURNING" "TURNING" "REAR END" "ANGLE" ...
 $ trafficway_type              : chr  "NOT DIVIDED" "FOUR WAY" "T-INTERSECTION" "FOUR WAY" ...
 $ alignment                    : chr  "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
 $ roadway_surface_cond         : chr  "UNKNOWN" "DRY" "DRY" "DRY" ...
 $ road_defect                  : chr  "UNKNOWN" "NO DEFECTS" "NO DEFECTS" "NO DEFECTS" ...
 $ crash_type                   : chr  "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" ...
 $ intersection_related_i       : chr  "Y" "Y" "Y" "Y" ...
 $ damage                       : chr  "$501 - $1,500" "OVER $1,500" "$501 - $1,500" "OVER $1,500" ...
 $ prim_contributory_cause      : chr  "UNABLE TO DETERMINE" "IMPROPER TURNING/NO SIGNAL" "FOLLOWING TOO CLOSELY" "UNABLE TO DETERMINE" ...
 $ num_units                    : int  2 2 3 2 2 1 2 2 2 2 ...
 $ most_severe_injury           : chr  "NO INDICATION OF INJURY" "NO INDICATION OF INJURY" "NO INDICATION OF INJURY" "NONINCAPACITATING INJURY" ...
 $ injuries_total               : num  0 0 0 5 0 2 0 1 0 0 ...
 $ injuries_fatal               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ injuries_incapacitating      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ injuries_non_incapacitating  : num  0 0 0 5 0 2 0 1 0 0 ...
 $ injuries_reported_not_evident: num  0 0 0 0 0 0 0 0 0 0 ...
 $ injuries_no_indication       : num  3 2 3 0 3 0 2 1 3 4 ...
 $ crash_hour                   : int  13 0 10 19 14 0 11 14 18 17 ...
 $ crash_day_of_week            : int  7 1 5 4 7 4 3 4 2 5 ...
 $ crash_month                  : int  7 8 12 8 8 9 12 9 6 9 ...

Dataset Description

The accidents_data_raw dataset has 209,306 observations of 24 variables. Its columns are stored as character, integer, and numeric types. The dataset describes traffic accidents from 2016 to 2023; the variables capture aspects of each accident such as the conditions, the type of crash, and the outcomes. Based on the initial inspection, “crash_date” stands out as a column of interest: it is stored as a character string but would be more useful as a datetime, so I may convert it in a later step. The dataset's author states only that the data was “obtained from the internet”.

Source: Kaggle.com, https://www.kaggle.com/datasets/oktayrdeki/traffic-accidents/data

Author/Owner: Oktay Ördekçi

CSV File: traffic_accidents.csv

Motivation

I want to explore this dataset to gain a better understanding of the potential causes of traffic accidents. Last year, my sister was in a car accident that left her with a broken leg. She wasn’t able to walk for a while, and still has a bit of a limp. I’ve always been a careful driver, but since then I’ve been extra vigilant. Exploring this data will help me be more aware of the conditions behind traffic accidents and how I can best avoid them.

Questions

  • In what weather and lighting conditions do the most traffic accidents occur?

  • What is the most common primary cause of severe traffic accidents?

  • Is there a relationship between crash type, injury severity, and damage?

  • Do significantly more traffic accidents occur during a specific time of year?

Hypothesis

I hypothesize that traffic accidents occurring in adverse weather or poor lighting result in severe injuries (fatal or incapacitating) at a higher rate than accidents in clear, well-lit conditions; that is, there is a significant positive association between poor driving conditions and the severity of accident-related injuries.

Ethical Considerations

Because the dataset's author says only that the data was “obtained from the internet”, there are concerns about how it was collected: the original source could be unreliable or inaccurate. There is also no indication of the area or region the data covers. Based on the types of variables in the data, I am assuming that the information likely came from police or insurance reports.

Data Dictionary

Variable Name Class/Data Type Continuity Description Suggested R Function
crash_date Character Continuous The date and time the accident occurred separate(), mutate()
traffic_control_device Character Discrete The type of traffic control device present (e.g., traffic signal, no controls) group_by()
weather_condition Character Discrete The weather conditions at the time of the accident (e.g., clear, rain) group_by()
lighting_condition Character Discrete The lighting conditions at the time of the accident (e.g., daylight, darkness on a lighted road) group_by()
first_crash_type Character Discrete The initial type of the crash (e.g., turning, rear end, angle) group_by()
trafficway_type Character Discrete The type of roadway involved in the accident (e.g., not divided, four way, T-intersection) group_by()
alignment Character Discrete The alignment of the road where the accident occurred (e.g., straight and level) group_by()
roadway_surface_cond Character Discrete The condition of the roadway surface (e.g., dry, wet, unknown) group_by()
road_defect Character Discrete Any defects present on the road surface (e.g., no defects, unknown) group_by()
crash_type Character Discrete The overall outcome category of the crash (no injury / drive away, or injury and/or tow due to crash) group_by()
intersection_related_i Character Discrete Whether the accident was related to an intersection filter(), group_by()
damage Character Discrete The estimated dollar range of the damage caused by the accident (e.g., $501 - $1,500, over $1,500) group_by()
prim_contributory_cause Character Discrete The primary cause contributing to the crash group_by()
num_units Numerical: Integer Discrete The number of vehicles involved in the accident filter()
most_severe_injury Character Discrete The most severe injury sustained in the crash group_by()
injuries_total Numerical Discrete The total number of injuries reported filter(), summary()
injuries_fatal Numerical Discrete The number of fatal injuries resulting from the accident filter(), summary()
injuries_incapacitating Numerical Discrete The number of incapacitating injuries filter(), summary()
injuries_non_incapacitating Numerical Discrete The number of non-incapacitating injuries filter(), summary()
injuries_reported_not_evident Numerical Discrete The number of injuries reported but not visibly evident filter(), summary()
injuries_no_indication Numerical Discrete The number of people involved with no indication of injury filter(), summary()
crash_hour Numerical: Integer Discrete The hour the accident occurred (0-23) filter()
crash_day_of_week Numerical: Integer Discrete The day of the week the accident occurred (1-7) filter()
crash_month Numerical: Integer Discrete The month the accident occurred (1-12) filter()