#Installing pacman (“package manager”)

if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(datasets, pacman, tidyverse, tsibble, 
               lubridate, XML,readxl,rio,psych,dplyr)

#Exploring the Data

# Read the homework workbook, then check its dimensions and column types.
HW1_Data <- read_excel(path = "C:/hp/AI in CE/HW1_Data.xlsx")

dim(HW1_Data)
## [1] 1295   19
str(HW1_Data)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
##  $ Wthr_Cond_ID        : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
##  $ Light_Cond_ID       : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
##  $ Road_Type_ID        : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
##  $ Road_Algn_ID        : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
##  $ SurfDry             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Traffic_Cntl_ID     : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
##  $ Harm_Evnt_ID        : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
##  $ Intrsct_Relat_ID    : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
##  $ FHE_Collsn_ID       : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
##  $ Road_Part_Adj_ID    : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
##  $ Road_Cls_ID         : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
##  $ Pop_Group_ID        : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
##  $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
##  $ Veh_Body_Styl_ID    : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
##  $ Prsn_Ethnicity_ID   : chr [1:1295] "White" "White" "White" "White" ...
##  $ GenMale             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ TrafVol             : num [1:1295] 7654 13770 11470 16972 413 ...
##  $ Prsn_Age            : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
##  $ Prsn_Injry_Sev_ID   : chr [1:1295] "O" "O" "O" "O" ...
# Compact column-by-column preview of the data.
glimpse(x = HW1_Data)
## Rows: 1,295
## Columns: 19
## $ Wthr_Cond_ID         <chr> "Clear", "Clear", "Clear", "Clear", "Clear", "Cle…
## $ Light_Cond_ID        <chr> "Dark, not lighted", "Dark, not lighted", "Daylig…
## $ Road_Type_ID         <chr> "2 lane, 2 way", "2 lane, 2 way", "2 lane, 2 way"…
## $ Road_Algn_ID         <chr> "Straight, level", "Straight, level", "Straight, …
## $ SurfDry              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Traffic_Cntl_ID      <chr> "Marked lanes", "Center stripe/divider", "Marked …
## $ Harm_Evnt_ID         <chr> "Motor vehicle in transport", "Motor vehicle in t…
## $ Intrsct_Relat_ID     <chr> "Non intersection", "Non intersection", "Intersec…
## $ FHE_Collsn_ID        <chr> "Sd both going straight-rear end", "Sd both going…
## $ Road_Part_Adj_ID     <chr> "Main/proper lane", "Main/proper lane", "Main/pro…
## $ Road_Cls_ID          <chr> "Farm to market", "Us & state highways", "Farm to…
## $ Pop_Group_ID         <chr> "10,000 - 24,999 pop", "Rural", "Other", "Rural",…
## $ Crash_Speed_LimitCat <chr> "30-40 mph", "65-70 mph", "45-60 mph", "65-70 mph…
## $ Veh_Body_Styl_ID     <chr> "Farm equipment", "Farm equipment", "Farm equipme…
## $ Prsn_Ethnicity_ID    <chr> "White", "White", "White", "White", "Other", "Whi…
## $ GenMale              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1…
## $ TrafVol              <dbl> 7654, 13770, 11470, 16972, 413, 2906, 22840, 4882…
## $ Prsn_Age             <chr> "25-54 years", "25-54 years", "Other", "25-54 yea…
## $ Prsn_Injry_Sev_ID    <chr> "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",…
# Open the table in the data viewer, then print its first six rows.
view(x = HW1_Data)

head(x = HW1_Data, n = 6)
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## 5 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 None           
## 6 Clear        Daylight        Unknown      Straight, l…       1 None           
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
# Print the last six rows.
tail(x = HW1_Data, n = 6)
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Daylight        4 or more l… Straight, l…       1 Other          
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Dark, lighted   4 or more l… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        4 or more l… Straight, l…       1 Marked lanes   
## 5 Clear        Other           2 lane, 2 w… Straight, l…       1 Other          
## 6 Cloudy       Daylight        Unknown      Straight, l…       1 Other          
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
# NOTE(review): attach() puts a copy of the columns, as they are right now, on
# the search path; later changes to HW1_Data are not reflected in that copy.
# It is kept so that statements using bare column names keep running; prefer
# HW1_Data$<column> and drop this call once nothing depends on it.
attach(HW1_Data)

# Summary statistics and a base-graphics box plot of traffic volume, read
# straight from the data frame rather than from the attached copy.
summary(HW1_Data$TrafVol)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     215    6765   14170   14417   22010   28970
boxplot(HW1_Data$TrafVol)

#Cleaning Data

# Convert the categorical columns (and the GenMale indicator) to factors,
# then re-inspect the structure to confirm the new column types.
HW1_Data <- mutate(
  HW1_Data,
  across(
    c(
      Wthr_Cond_ID, Light_Cond_ID, Road_Type_ID, Road_Algn_ID,
      Traffic_Cntl_ID, Harm_Evnt_ID, Intrsct_Relat_ID, FHE_Collsn_ID,
      Road_Part_Adj_ID, Road_Cls_ID, Pop_Group_ID, Crash_Speed_LimitCat,
      Veh_Body_Styl_ID, Prsn_Ethnicity_ID, Prsn_Injry_Sev_ID, Prsn_Age,
      GenMale
    ),
    as.factor
  )
)

str(HW1_Data)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
##  $ Wthr_Cond_ID        : Factor w/ 5 levels "Clear","Cloudy",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Light_Cond_ID       : Factor w/ 5 levels "Dark, lighted",..: 2 2 3 3 2 3 3 3 3 3 ...
##  $ Road_Type_ID        : Factor w/ 5 levels "2 lane, 2 way",..: 1 1 1 1 1 5 2 2 5 1 ...
##  $ Road_Algn_ID        : Factor w/ 5 levels "Curve, level",..: 5 5 5 5 3 5 1 1 5 5 ...
##  $ SurfDry             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Traffic_Cntl_ID     : Factor w/ 5 levels "Center stripe/divider",..: 2 1 2 1 4 4 2 2 4 1 ...
##  $ Harm_Evnt_ID        : Factor w/ 5 levels "Fixed object",..: 2 2 2 1 2 2 2 2 2 2 ...
##  $ Intrsct_Relat_ID    : Factor w/ 4 levels "Driveway access",..: 4 4 2 4 4 1 4 4 4 1 ...
##  $ FHE_Collsn_ID       : Factor w/ 5 levels "Omv vehicle going straight",..: 3 3 2 1 3 2 3 3 2 2 ...
##  $ Road_Part_Adj_ID    : Factor w/ 5 levels "Exit/off ramp",..: 2 2 2 2 2 2 5 5 2 2 ...
##  $ Road_Cls_ID         : Factor w/ 5 levels "City street",..: 3 5 3 5 3 2 5 5 2 3 ...
##  $ Pop_Group_ID        : Factor w/ 5 levels "10,000 - 24,999 pop",..: 1 4 3 4 4 4 3 3 4 5 ...
##  $ Crash_Speed_LimitCat: Factor w/ 5 levels "> 70 mph","30-40 mph",..: 2 4 3 4 3 2 4 4 2 3 ...
##  $ Veh_Body_Styl_ID    : Factor w/ 1 level "Farm equipment": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Prsn_Ethnicity_ID   : Factor w/ 5 levels "Black","Hispanic",..: 5 5 5 5 3 5 2 2 5 5 ...
##  $ GenMale             : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ TrafVol             : num [1:1295] 7654 13770 11470 16972 413 ...
##  $ Prsn_Age            : Factor w/ 5 levels "15-24 years",..: 2 2 5 2 2 3 1 2 4 2 ...
##  $ Prsn_Injry_Sev_ID   : Factor w/ 3 levels "BC","KA","O": 3 3 3 3 3 3 3 3 3 3 ...

#Visualizing Data

##Distribution of Weather Conditions

 # Bar chart: number of rows per weather condition.
 HW1_Data %>%
   ggplot(mapping = aes(x = Wthr_Cond_ID)) +
   geom_bar(fill = "skyblue") +
   ggtitle("Distribution of Weather Conditions") +
   xlab("Weather Condition") +
   ylab("Count") +
   theme_minimal()

##Distribution of Light Condition

# Bar chart: number of rows per light condition.
ggplot(data = HW1_Data, mapping = aes(x = Light_Cond_ID)) +
  theme_minimal() +
  geom_bar(fill = "lightgreen") +
  labs(
    title = "Distribution of Light Conditions",
    x = "Light Condition",
    y = "Count"
  )

##Distribution of Road Type

# List the distinct road types. The column is read from HW1_Data itself (as
# text, so the values print quoted) instead of from an attach()ed copy.
unique(as.character(HW1_Data$Road_Type_ID))
## [1] "2 lane, 2 way"              "Unknown"                   
## [3] "4 or more lanes, divided"   "4 or more lanes, undivided"
## [5] "Other"

# Drop the uninformative "Unknown" and "Other" categories before plotting.
# The filtered table gets its own name so it does not shadow the
# Road_Type_ID column.
road_type_known <- HW1_Data %>%
  filter(!Road_Type_ID %in% c("Unknown", "Other"))

# Bar chart: number of rows per remaining road type.
ggplot(road_type_known, aes(x = Road_Type_ID)) +
  geom_bar(fill = "skyblue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Road Types",
       x = "Road Type",
       y = "Count")

##Distribution of Road Alignment

  # List the distinct road alignments. The column is read from HW1_Data itself
  # (as text, so the values print quoted) instead of from an attach()ed copy.
  unique(as.character(HW1_Data$Road_Algn_ID))
## [1] "Straight, level"     "Straight, grade"     "Curve, level"       
## [4] "Straight, hillcrest" "Other"
  # Bar chart: number of rows per road alignment.
  ggplot(HW1_Data, aes(x = Road_Algn_ID)) +
    geom_bar(fill = "blue", color = "white") +
    theme_minimal() +
    labs(title = "Distribution of Road Alignment",
         x = "Road Alignment",
         y = "Count")

##Distribution of FHE_Collsn_ID

# Bar chart: number of rows per FHE_Collsn_ID category.
# The title and axis label now spell the column name as it appears in the
# data (FHE_Collsn_ID); x labels are tilted 45 degrees so the long category
# names do not overlap.
ggplot(HW1_Data, aes(x = FHE_Collsn_ID)) +
  geom_bar(fill = "skyblue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of FHE_Collsn_ID",
       x = "FHE_Collsn_ID",
       y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##Distribution of Person Ethnicity Id

# Bar chart: number of rows per ethnicity category.
HW1_Data %>%
  ggplot(mapping = aes(x = Prsn_Ethnicity_ID)) +
  geom_bar(colour = "black", fill = "lightblue") +
  ggtitle("Distribution of Person Ethnicity") +
  xlab("Ethnicity ID") +
  ylab("Count") +
  theme_minimal()

##Distribution of Population Groups

# Bar chart: number of rows per population group.
# theme() must come after theme_minimal() so the tilted x labels survive.
ggplot(data = HW1_Data, mapping = aes(x = Pop_Group_ID)) +
  geom_bar(colour = "black", fill = "lightgreen") +
  ggtitle("Distribution of Population Groups") +
  xlab("Population Group ID") +
  ylab("Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##Distribution for Genders

# Recode the 0/1 GenMale indicator to readable labels.
# The guard makes this safe to re-run: once the levels are "Female"/"Male",
# matching them against 0/1 again would turn every value into NA.
# NOTE(review): assumes GenMale == 0 always means female — confirm against
# the data dictionary.
if (!identical(levels(HW1_Data$GenMale), c("Female", "Male"))) {
  HW1_Data$GenMale <- factor(HW1_Data$GenMale,
                             levels = c(0, 1),
                             labels = c("Female", "Male"))
}

# Bar chart: number of rows per gender.
ggplot(HW1_Data, aes(x = GenMale)) +
  geom_bar(fill = "skyblue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Gender",
       x = "Gender",
       y = "Count")

##Distribution of SurfDry

# Recode the 0/1 SurfDry indicator to "No"/"Yes".
# The guard makes this safe to re-run: once the levels are "No"/"Yes",
# matching them against 0/1 again would turn every value into NA.
if (!identical(levels(HW1_Data$SurfDry), c("No", "Yes"))) {
  HW1_Data$SurfDry <- factor(HW1_Data$SurfDry,
                             levels = c(0, 1),
                             labels = c("No", "Yes"))
}

# Bar chart: number of rows per surface condition.
ggplot(HW1_Data, aes(x = SurfDry)) +
  geom_bar(fill = "red", color = "white") +
  theme_minimal() +
  labs(title = "Distribution of SurfDry",
       x = "Surface Dry (0 = No, 1 = Yes)",
       y = "Count")

##Distribution of Traffic_Cntl_ID

 # Bar chart: number of rows per traffic control type, x labels tilted 45 degrees.
 HW1_Data %>%
   ggplot(mapping = aes(x = Traffic_Cntl_ID)) +
   geom_bar(colour = "black", fill = "dodgerblue") +
   ggtitle("Distribution of Traffic Control Types") +
   xlab("Traffic Control ID") +
   ylab("Count") +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1))

##Distribution for Harmful Event

# Bar chart: number of rows per harmful event, x labels tilted 45 degrees.
ggplot(data = HW1_Data, mapping = aes(x = Harm_Evnt_ID)) +
  geom_bar(colour = "black", fill = "purple") +
  labs(
    title = "Distribution of Harmful Events",
    x = "Harmful Event",
    y = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##Distribution of Road Class

# Bar chart: number of rows per road class, x labels tilted 45 degrees.
HW1_Data %>%
  ggplot(mapping = aes(x = Road_Cls_ID)) +
  geom_bar(colour = "black", fill = "purple") +
  ggtitle("Distribution of Road Class") +
  xlab("Road Class") +
  ylab("Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##Histogram for numeric variable ‘TrafVol’

# Histogram of traffic volume using bins 1000 units wide.
ggplot(data = HW1_Data, mapping = aes(x = TrafVol)) +
  geom_histogram(colour = "black", fill = "coral", binwidth = 1000) +
  ggtitle("Histogram of Traffic Volume") +
  xlab("Traffic Volume") +
  ylab("Frequency") +
  theme_minimal()

##Box plot for numeric variable ‘TrafVol’

# Single box plot of traffic volume.
HW1_Data %>%
  ggplot(mapping = aes(y = TrafVol)) +
  geom_boxplot(colour = "black", fill = "lightblue") +
  ggtitle("Boxplot of Traffic Volume") +
  ylab("Traffic Volume") +
  theme_minimal()

#Relationship between variables

##Box plot to compare ‘TrafVol’ across different ‘Wthr_Cond_ID’

 # One traffic-volume box plot per weather condition.
 HW1_Data %>%
   ggplot(mapping = aes(x = Wthr_Cond_ID, y = TrafVol)) +
   geom_boxplot(colour = "black", fill = "lightblue") +
   ggtitle("Traffic Volume by Weather Condition") +
   xlab("Weather Condition") +
   ylab("Traffic Volume") +
   theme_minimal()

##Heatmap of Age by Crash Speed Limit Category

 # Heatmap: number of rows for each speed-limit / age-category pair.
 # after_stat(count) replaces the `..count..` notation, which was deprecated
 # in ggplot2 3.4.0, so the deprecation warning is no longer raised.
 ggplot(HW1_Data, aes(x = Crash_Speed_LimitCat, y = Prsn_Age)) +
   geom_tile(aes(fill = after_stat(count)), stat = "bin2d", color = "white") +
   theme_minimal() +
   labs(title = "Heatmap of Age by Crash Speed Limit Category",
        x = "Crash Speed Limit Category",
        y = "Age Category",
        fill = "Count") +
   scale_fill_gradient(low = "lightblue", high = "darkblue")

##Stacked bar plot for ‘Prsn_Age’ by ‘Crash_Speed_LimitCat’

# Bars scaled to 100% per speed-limit category, split by age category;
# the y axis is formatted as percentages.
HW1_Data %>%
  ggplot(mapping = aes(x = Crash_Speed_LimitCat, fill = Prsn_Age)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Proportion of Age Categories by Crash Speed Limit Category",
    x = "Crash Speed Limit Category",
    y = "Proportion",
    fill = "Age Category"
  )

##Faceted box plots to explore the interaction between ‘Wthr_Cond_ID’, ‘Light_Cond_ID’, and ‘TrafVol’

# Traffic-volume box plots per light condition, one panel per weather
# condition; x labels tilted 45 degrees.
HW1_Data %>%
  ggplot(mapping = aes(x = Light_Cond_ID, y = TrafVol)) +
  geom_boxplot(colour = "black", fill = "lightblue") +
  facet_wrap(vars(Wthr_Cond_ID)) +
  ggtitle("Traffic Volume by Light and Weather Conditions") +
  xlab("Light Condition") +
  ylab("Traffic") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))