# Install pacman on first use. requireNamespace() only checks that the package
# is available; p_load() below attaches everything (pacman included).
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  datasets, pacman, tidyverse, tsibble,
  lubridate, XML, readxl, rio, psych, dplyr
)
# Exploring the data
# Workbook location: defaults to the original path, but can be overridden by
# setting the HW1_DATA_PATH environment variable so the script is not tied to
# one machine's directory layout.
data_path <- Sys.getenv("HW1_DATA_PATH", unset = "")
if (!nzchar(data_path)) {
  data_path <- "C:/hp/AI in CE/HW1_Data.xlsx"
}
HW1_Data <- read_excel(data_path)
# Row/column counts, followed by the column types
dim(HW1_Data)
## [1] 1295 19
str(HW1_Data)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
## $ Wthr_Cond_ID : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
## $ Light_Cond_ID : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
## $ Road_Type_ID : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
## $ Road_Algn_ID : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
## $ SurfDry : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ Traffic_Cntl_ID : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
## $ Harm_Evnt_ID : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
## $ Intrsct_Relat_ID : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
## $ FHE_Collsn_ID : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
## $ Road_Part_Adj_ID : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
## $ Road_Cls_ID : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
## $ Pop_Group_ID : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
## $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
## $ Veh_Body_Styl_ID : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
## $ Prsn_Ethnicity_ID : chr [1:1295] "White" "White" "White" "White" ...
## $ GenMale : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ TrafVol : num [1:1295] 7654 13770 11470 16972 413 ...
## $ Prsn_Age : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
## $ Prsn_Injry_Sev_ID : chr [1:1295] "O" "O" "O" "O" ...
# Compact column-wise preview of the table
glimpse(x = HW1_Data)
## Rows: 1,295
## Columns: 19
## $ Wthr_Cond_ID <chr> "Clear", "Clear", "Clear", "Clear", "Clear", "Cle…
## $ Light_Cond_ID <chr> "Dark, not lighted", "Dark, not lighted", "Daylig…
## $ Road_Type_ID <chr> "2 lane, 2 way", "2 lane, 2 way", "2 lane, 2 way"…
## $ Road_Algn_ID <chr> "Straight, level", "Straight, level", "Straight, …
## $ SurfDry <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Traffic_Cntl_ID <chr> "Marked lanes", "Center stripe/divider", "Marked …
## $ Harm_Evnt_ID <chr> "Motor vehicle in transport", "Motor vehicle in t…
## $ Intrsct_Relat_ID <chr> "Non intersection", "Non intersection", "Intersec…
## $ FHE_Collsn_ID <chr> "Sd both going straight-rear end", "Sd both going…
## $ Road_Part_Adj_ID <chr> "Main/proper lane", "Main/proper lane", "Main/pro…
## $ Road_Cls_ID <chr> "Farm to market", "Us & state highways", "Farm to…
## $ Pop_Group_ID <chr> "10,000 - 24,999 pop", "Rural", "Other", "Rural",…
## $ Crash_Speed_LimitCat <chr> "30-40 mph", "65-70 mph", "45-60 mph", "65-70 mph…
## $ Veh_Body_Styl_ID <chr> "Farm equipment", "Farm equipment", "Farm equipme…
## $ Prsn_Ethnicity_ID <chr> "White", "White", "White", "White", "Other", "Whi…
## $ GenMale <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1…
## $ TrafVol <dbl> 7654, 13770, 11470, 16972, 413, 2906, 22840, 4882…
## $ Prsn_Age <chr> "25-54 years", "25-54 years", "Other", "25-54 yea…
## $ Prsn_Injry_Sev_ID <chr> "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",…
# Inspect the table with view(), then print its first six rows
view(HW1_Data)
head(HW1_Data, n = 6L)
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not ligh… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
# Print the last six rows of the table
tail(HW1_Data, n = 6L)
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Daylight 4 or more l… Straight, l… 1 Other
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Dark, lighted 4 or more l… Straight, l… 1 Marked lanes
## 4 Clear Daylight 4 or more l… Straight, l… 1 Marked lanes
## 5 Clear Other 2 lane, 2 w… Straight, l… 1 Other
## 6 Cloudy Daylight Unknown Straight, l… 1 Other
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
# NOTE(review): attach() is retained only because later statements in this
# script still reference columns by bare name (e.g. `unique(Road_Type_ID)`).
# It exposes a copy of the columns as they are at this point, so it does not
# reflect later changes to HW1_Data; prefer explicit `HW1_Data$col` access and
# drop this call once no bare-name references remain.
attach(HW1_Data)
# Summary statistics for traffic volume, read straight from the data frame
summary(HW1_Data$TrafVol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 215 6765 14170 14417 22010 28970
# Base-graphics boxplot of traffic volume
boxplot(HW1_Data$TrafVol)
# Cleaning data
# Convert the categorical columns (and the 0/1 GenMale indicator) to factors;
# SurfDry and TrafVol are left as they are here.
HW1_Data <- mutate(
  HW1_Data,
  across(
    .cols = c(
      Wthr_Cond_ID, Light_Cond_ID, Road_Type_ID, Road_Algn_ID,
      Traffic_Cntl_ID, Harm_Evnt_ID, Intrsct_Relat_ID, FHE_Collsn_ID,
      Road_Part_Adj_ID, Road_Cls_ID, Pop_Group_ID, Crash_Speed_LimitCat,
      Veh_Body_Styl_ID, Prsn_Ethnicity_ID, GenMale, Prsn_Age,
      Prsn_Injry_Sev_ID
    ),
    .fns = as.factor
  )
)
# Re-check the column types after conversion
str(HW1_Data)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
## $ Wthr_Cond_ID : Factor w/ 5 levels "Clear","Cloudy",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Light_Cond_ID : Factor w/ 5 levels "Dark, lighted",..: 2 2 3 3 2 3 3 3 3 3 ...
## $ Road_Type_ID : Factor w/ 5 levels "2 lane, 2 way",..: 1 1 1 1 1 5 2 2 5 1 ...
## $ Road_Algn_ID : Factor w/ 5 levels "Curve, level",..: 5 5 5 5 3 5 1 1 5 5 ...
## $ SurfDry : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ Traffic_Cntl_ID : Factor w/ 5 levels "Center stripe/divider",..: 2 1 2 1 4 4 2 2 4 1 ...
## $ Harm_Evnt_ID : Factor w/ 5 levels "Fixed object",..: 2 2 2 1 2 2 2 2 2 2 ...
## $ Intrsct_Relat_ID : Factor w/ 4 levels "Driveway access",..: 4 4 2 4 4 1 4 4 4 1 ...
## $ FHE_Collsn_ID : Factor w/ 5 levels "Omv vehicle going straight",..: 3 3 2 1 3 2 3 3 2 2 ...
## $ Road_Part_Adj_ID : Factor w/ 5 levels "Exit/off ramp",..: 2 2 2 2 2 2 5 5 2 2 ...
## $ Road_Cls_ID : Factor w/ 5 levels "City street",..: 3 5 3 5 3 2 5 5 2 3 ...
## $ Pop_Group_ID : Factor w/ 5 levels "10,000 - 24,999 pop",..: 1 4 3 4 4 4 3 3 4 5 ...
## $ Crash_Speed_LimitCat: Factor w/ 5 levels "> 70 mph","30-40 mph",..: 2 4 3 4 3 2 4 4 2 3 ...
## $ Veh_Body_Styl_ID : Factor w/ 1 level "Farm equipment": 1 1 1 1 1 1 1 1 1 1 ...
## $ Prsn_Ethnicity_ID : Factor w/ 5 levels "Black","Hispanic",..: 5 5 5 5 3 5 2 2 5 5 ...
## $ GenMale : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ TrafVol : num [1:1295] 7654 13770 11470 16972 413 ...
## $ Prsn_Age : Factor w/ 5 levels "15-24 years",..: 2 2 5 2 2 3 1 2 4 2 ...
## $ Prsn_Injry_Sev_ID : Factor w/ 3 levels "BC","KA","O": 3 3 3 3 3 3 3 3 3 3 ...
## Bar chart: number of records per weather condition
HW1_Data %>%
  ggplot(mapping = aes(x = Wthr_Cond_ID)) +
  geom_bar(fill = "skyblue") +
  ggtitle("Distribution of Weather Conditions") +
  xlab("Weather Condition") +
  ylab("Count") +
  theme_minimal()
## Bar chart: number of records per light condition
ggplot(data = HW1_Data) +
  geom_bar(mapping = aes(x = Light_Cond_ID), fill = "lightgreen") +
  theme_minimal() +
  labs(
    title = "Distribution of Light Conditions",
    x = "Light Condition",
    y = "Count"
  )
## Distribution of Road Type
# Read the column from the data frame itself rather than relying on the
# attach()ed copy (the bare name is also reused for a data frame just below).
# as.character() keeps the result a plain character vector, listed in order of
# first appearance.
unique(as.character(HW1_Data$Road_Type_ID))
## [1] "2 lane, 2 way" "Unknown"
## [3] "4 or more lanes, divided" "4 or more lanes, undivided"
## [5] "Other"
# Drop rows whose road type is "Unknown" or "Other", then chart the rest.
# NOTE(review): the filtered table reuses the column's name, `Road_Type_ID`;
# the name is kept so any later reference keeps working, but a distinct name
# would be clearer.
Road_Type_ID <- filter(HW1_Data, !(Road_Type_ID %in% c("Unknown", "Other")))
Road_Type_ID %>%
  ggplot(aes(x = Road_Type_ID)) +
  geom_bar(fill = "skyblue", colour = "black") +
  theme_minimal() +
  ggtitle("Distribution of Road Types") +
  xlab("Road Type") +
  ylab("Count")
## Distribution of Road Alignment
# Read the column from the data frame itself rather than relying on the
# attach()ed copy; as.character() keeps the result a plain character vector,
# listed in order of first appearance.
unique(as.character(HW1_Data$Road_Algn_ID))
## [1] "Straight, level" "Straight, grade" "Curve, level"
## [4] "Straight, hillcrest" "Other"
# Bar chart: number of records per road alignment category
HW1_Data %>%
  ggplot(aes(x = Road_Algn_ID)) +
  geom_bar(fill = "blue", colour = "white") +
  ggtitle("Distribution of Road Alignment") +
  xlab("Road Alignment") +
  ylab("Count") +
  theme_minimal()
## Distribution of FHE_Collsn_ID
# Title and axis label now spell the column name as it appears in the data
# (FHE_Collsn_ID); they previously read "FHE_Colls_ID".
ggplot(HW1_Data, aes(x = FHE_Collsn_ID)) +
  geom_bar(fill = "skyblue", color = "black") +
  theme_minimal() +
  labs(
    title = "Distribution of FHE_Collsn_ID",
    x = "FHE_Collsn_ID",
    y = "Count"
  ) +
  # Tilt the category labels so the long names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Bar chart: number of records per ethnicity category
ggplot(data = HW1_Data) +
  geom_bar(
    mapping = aes(x = Prsn_Ethnicity_ID),
    fill = "lightblue",
    colour = "black"
  ) +
  ggtitle("Distribution of Person Ethnicity") +
  xlab("Ethnicity ID") +
  ylab("Count") +
  theme_minimal()
## Bar chart: number of records per population group
HW1_Data %>%
  ggplot(aes(x = Pop_Group_ID)) +
  geom_bar(fill = "lightgreen", colour = "black") +
  ggtitle("Distribution of Population Groups") +
  xlab("Population Group ID") +
  ylab("Count") +
  theme_minimal() +
  # Applied after theme_minimal() so the tilted labels are not reset
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Bar chart: number of records per gender
# Relabel the 0/1 GenMale indicator as Female/Male before plotting
HW1_Data <- HW1_Data %>%
  mutate(
    GenMale = factor(GenMale, levels = c(0, 1), labels = c("Female", "Male"))
  )
HW1_Data %>%
  ggplot(aes(x = GenMale)) +
  geom_bar(fill = "skyblue", colour = "black") +
  ggtitle("Distribution of Gender") +
  xlab("Gender") +
  ylab("Count") +
  theme_minimal()
## Bar chart: number of records by SurfDry
# Relabel the 0/1 SurfDry indicator as No/Yes before plotting
HW1_Data <- HW1_Data %>%
  mutate(SurfDry = factor(SurfDry, levels = c(0, 1), labels = c("No", "Yes")))
ggplot(data = HW1_Data) +
  geom_bar(mapping = aes(x = SurfDry), fill = "red", colour = "white") +
  ggtitle("Distribution of SurfDry") +
  xlab("Surface Dry (0 = No, 1 = Yes)") +
  ylab("Count") +
  theme_minimal()
## Bar chart: number of records per traffic control type (Traffic_Cntl_ID)
HW1_Data %>%
  ggplot(aes(x = Traffic_Cntl_ID)) +
  geom_bar(fill = "dodgerblue", colour = "black") +
  ggtitle("Distribution of Traffic Control Types") +
  xlab("Traffic Control ID") +
  ylab("Count") +
  theme_minimal() +
  # Applied after theme_minimal() so the tilted labels are not reset
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Bar chart: number of records per harmful event (Harm_Evnt_ID)
ggplot(data = HW1_Data) +
  geom_bar(
    mapping = aes(x = Harm_Evnt_ID),
    fill = "purple",
    colour = "black"
  ) +
  ggtitle("Distribution of Harmful Events") +
  xlab("Harmful Event") +
  ylab("Count") +
  theme_minimal() +
  # Applied after theme_minimal() so the tilted labels are not reset
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Bar chart: number of records per road class (Road_Cls_ID)
HW1_Data %>%
  ggplot(aes(x = Road_Cls_ID)) +
  geom_bar(fill = "purple", colour = "black") +
  ggtitle("Distribution of Road Class") +
  xlab("Road Class") +
  ylab("Count") +
  theme_minimal() +
  # Applied after theme_minimal() so the tilted labels are not reset
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Histogram of the numeric variable TrafVol (bins 1000 units wide)
HW1_Data %>%
  ggplot(aes(x = TrafVol)) +
  geom_histogram(binwidth = 1000, fill = "coral", colour = "black") +
  ggtitle("Histogram of Traffic Volume") +
  xlab("Traffic Volume") +
  ylab("Frequency") +
  theme_minimal()
## Box plot of the numeric variable TrafVol
ggplot(data = HW1_Data) +
  geom_boxplot(
    mapping = aes(y = TrafVol),
    fill = "lightblue",
    colour = "black"
  ) +
  ggtitle("Boxplot of Traffic Volume") +
  ylab("Traffic Volume") +
  theme_minimal()
#Relationship between variables
## Box plots of TrafVol, one per Wthr_Cond_ID category
HW1_Data %>%
  ggplot(aes(x = Wthr_Cond_ID, y = TrafVol)) +
  geom_boxplot(fill = "lightblue", colour = "black") +
  ggtitle("Traffic Volume by Weather Condition") +
  xlab("Weather Condition") +
  ylab("Traffic Volume") +
  theme_minimal()
## Heatmap of Age by Crash Speed Limit Category
# Each tile is one (speed-limit category, age category) pair, coloured by the
# number of records in it. The fill uses after_stat(count), which replaces the
# `..count..` notation deprecated in ggplot2 3.4.0.
ggplot(HW1_Data, aes(x = Crash_Speed_LimitCat, y = Prsn_Age)) +
  geom_tile(aes(fill = after_stat(count)), stat = "bin2d", color = "white") +
  theme_minimal() +
  labs(
    title = "Heatmap of Age by Crash Speed Limit Category",
    x = "Crash Speed Limit Category",
    y = "Age Category",
    fill = "Count"
  ) +
  scale_fill_gradient(low = "lightblue", high = "darkblue")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Stacked bars: share of each Prsn_Age category within each
## Crash_Speed_LimitCat (every bar is scaled to 100%)
HW1_Data %>%
  ggplot(aes(x = Crash_Speed_LimitCat, fill = Prsn_Age)) +
  geom_bar(position = position_fill()) +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Proportion of Age Categories by Crash Speed Limit Category") +
  xlab("Crash Speed Limit Category") +
  ylab("Proportion") +
  labs(fill = "Age Category")
## Box plots of TrafVol by Light_Cond_ID, drawn in one panel per Wthr_Cond_ID
## (panels laid out with facet_wrap)
HW1_Data %>%
  ggplot(aes(x = Light_Cond_ID, y = TrafVol)) +
  geom_boxplot(fill = "lightblue", colour = "black") +
  facet_wrap(vars(Wthr_Cond_ID)) +
  ggtitle("Traffic Volume by Light and Weather Conditions") +
  xlab("Light Condition") +
  ylab("Traffic") +
  theme_minimal() +
  # Applied after theme_minimal() so the tilted labels are not reset
  theme(axis.text.x = element_text(angle = 45, hjust = 1))