HW_1 by Momin

Submitted by Md Shah Mominul Islam Momin; NetID: mif37

Reading of the HW1_Data

# The HW1_Data Excel file was exported to CSV beforehand.
# Load that CSV into `data`, the data frame the later chunks read from.

data <- read.csv(file = "D:/TXST/CE7393/HW1/HW1_Data.csv")

Data Summarization

# Summarize every column of the data frame.
# Pass the object itself: quoting the name ("data") summarizes a
# one-element character vector instead of the data set.

summary(data)
## (output omitted: re-knit to regenerate the per-column summary)

Check for Missing Values

# Count the missing (NA) cells across the whole data frame.

length(which(is.na(data)))
## [1] 0
#there is no missing data

Exploratory Data Analysis Report

Weather Condition

library(ggplot2)


# Bar chart of row counts for each Wthr_Cond_ID value.
ggplot(data = data, mapping = aes(x = Wthr_Cond_ID)) +
  theme_minimal() +
  geom_bar(width = 0.6, fill = "yellow") +
  ggtitle("Distribution of Weather Conditions") +
  xlab("Weather Condition") +
  ylab("Count")

Light Condition

# Bar chart of row counts for each Light_Cond_ID value.
ggplot(data = data, mapping = aes(x = Light_Cond_ID)) +
  theme_minimal() +
  geom_bar(width = 0.5, fill = "blue") +
  ggtitle("Distribution of Light Condition") +
  xlab("Light Condition") +
  ylab("Count")

Surface Condition

# Bar chart of SurfDry, converted to a factor so each distinct value
# gets its own bar (the axis label documents 1 = dry, 0 = not dry).
ggplot(data = data, mapping = aes(x = factor(SurfDry))) +
  theme_minimal() +
  geom_bar(width = 0.5, alpha = 0.5, fill = "magenta") +
  ggtitle("Distribution of Road Surface Conditions") +
  xlab("Surface Condition (1 = Dry, 0 = Not Dry)") +
  ylab("Count")

Road Alignment

# Bar chart of row counts per Road_Algn_ID value (treated as a factor).
# The plot title previously misspelled "Alignment".
ggplot(data, aes(x = factor(Road_Algn_ID))) +
  geom_bar(fill = "green", width = .5) +
  labs(title = "Distribution of Roadway Alignment",
       x = "Roadway Alignment",
       y = "Count") +
  theme_minimal()

Roadway Types

# Bar chart of row counts per Road_Type_ID value (treated as a factor).
ggplot(data = data, mapping = aes(x = factor(Road_Type_ID))) +
  theme_minimal() +
  geom_bar(width = 0.5, fill = "darkblue") +
  ggtitle("Distribution of Roadway Type") +
  xlab("Road Type") +
  ylab("Count")

Person Ethnicity

# Bar chart of row counts per Prsn_Ethnicity_ID value (as a factor),
# drawn with a black outline around each bar.
ggplot(data = data, mapping = aes(x = factor(Prsn_Ethnicity_ID))) +
  theme_minimal() +
  geom_bar(width = 0.7, color = "black", fill = "skyblue") +
  ggtitle("Bar Chart of Ethnicity IDs") +
  xlab("Person Ethnicity") +
  ylab("Count")

Gender

# Bar chart of GenMale as a factor (the axis label documents
# 1 = male, 0 = others).
ggplot(data = data, mapping = aes(x = factor(GenMale))) +
  theme_minimal() +
  geom_bar(width = 0.5, fill = "orange") +
  ggtitle("Gender Distribution") +
  xlab("GenMale (1 = Male, 0 = Others)") +
  ylab("Count")

Traffic Volume

# Histogram of TrafVol using bins that are 500 units wide.
ggplot(data = data, mapping = aes(x = TrafVol)) +
  theme_minimal() +
  geom_histogram(binwidth = 500, color = "green", fill = "purple") +
  ggtitle("Traffic Volume Distribution") +
  xlab("Traffic Volume") +
  ylab("Count")

Frequency Polygon of Traffic Volume

# Frequency polygon of TrafVol using bins that are 1000 units wide.
# `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line
# thickness; the drawn line is the same.
ggplot(data, aes(x = TrafVol)) +
  geom_freqpoly(binwidth = 1000, color = "black", linewidth = 1.5) +
  labs(
    title = "Frequency Polygon of Traffic Volume",
    x = "Traffic Volume",
    y = "Frequency"
  )

Histogram and Frequency Polygon for Traffic Volume

# Histogram of TrafVol on the density scale (30 bins) with a kernel
# density curve drawn over it.
# - after_stat(density) replaces the `..density..` notation that
#   ggplot2 3.4.0 deprecated.
# - linewidth replaces `size` for the curve thickness (deprecated for
#   lines in the same release).
# - The y axis is labelled "Density" because both layers plot density,
#   not raw frequencies.
ggplot(data, aes(x = TrafVol)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "lightblue", alpha = 0.6, color = "black"
  ) +
  geom_density(aes(y = after_stat(density)), color = "red", linewidth = 1) +
  labs(
    title = "Histogram and Frequency Polygon for Traffic Volume",
    x = "Traffic Volumes",
    y = "Density"
  ) +
  theme_minimal()

Crash Speed Limit vs Injury Severity

# Bar chart of row counts per Crash_Speed_LimitCat value, with the
# bars filled according to Prsn_Injry_Sev_ID.
ggplot(
  data = data,
  mapping = aes(x = Crash_Speed_LimitCat, fill = Prsn_Injry_Sev_ID)
) +
  theme_minimal() +
  geom_bar() +
  ggtitle("Crash Speed Limit and Injury Severity") +
  xlab("Speed Limit Category") +
  ylab("Count")

Crash Speed Limit Distribution

# Bar chart of row counts for each Crash_Speed_LimitCat value.
ggplot(data = data, mapping = aes(x = Crash_Speed_LimitCat)) +
  theme_minimal() +
  geom_bar(width = 0.5, fill = "darkred") +
  ggtitle("Crash Speed Limit") +
  xlab("Speed Limit Category") +
  ylab("Count")

Boxplot of Traffic Volume by Age Group

# Box plots of TrafVol for each Prsn_Age value; every box is filled
# from a fixed five-colour palette and drawn semi-transparent.
ggplot(
  data = data,
  mapping = aes(x = Prsn_Age, y = TrafVol, fill = Prsn_Age)
) +
  theme_minimal() +
  geom_boxplot(alpha = 0.3) +
  scale_fill_manual(values = c("red", "blue", "green", "orange", "purple")) +
  ggtitle("Traffic Volume by Age Group") +
  xlab("Age Group") +
  ylab("Traffic Volume")

Injury Severity by Population Group

# Bar chart of row counts per Pop_Group_ID value, filled by
# Prsn_Injry_Sev_ID using a fixed three-colour palette.
ggplot(
  data = data,
  mapping = aes(x = Pop_Group_ID, fill = Prsn_Injry_Sev_ID)
) +
  theme_minimal() +
  geom_bar() +
  scale_fill_manual(values = c("yellow", "lightblue", "darkgreen")) +
  ggtitle("Injury Severity by Population Group") +
  xlab("Population Group") +
  ylab("Count")

Mean, Variance and Standard Deviation of Traffic Volume

# Pull the traffic-volume column out once so the statistics below
# all work on the same vector.
traffic_volume <- data[["TrafVol"]]

# Mean, variance and standard deviation, each ignoring missing values.
mean_traffic_volume <- mean(x = traffic_volume, na.rm = TRUE)
variance_traffic_volume <- var(x = traffic_volume, na.rm = TRUE)
sd_traffic_volume <- sd(x = traffic_volume, na.rm = TRUE)

# Mode of Traffic Volume
# Return the most frequent value (statistical mode) of a vector.
#
# Args:
#   x:     Vector to examine.
#   na.rm: If TRUE, drop NA values before counting, mirroring the
#          `na.rm` argument of mean(), var() and sd(). Defaults to
#          FALSE, which keeps the earlier behaviour of counting NA as
#          an ordinary value.
#
# Returns:
#   The value of `x` that occurs most often; on ties, the one that
#   appears first in `x`. NA when there are no values to count.
mode_function <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  uniq_vals <- unique(x)
  # match() maps each element to its position in uniq_vals and
  # tabulate() counts how often each position occurs.
  counts <- tabulate(match(x, uniq_vals))
  # which.max() returns the first maximum, which gives the tie rule.
  uniq_vals[which.max(counts)]
}

# Drop missing values first so the mode follows the same NA policy as
# the mean, variance and standard deviation (all computed with
# na.rm = TRUE).
mode_traffic_volume <- mode_function(traffic_volume[!is.na(traffic_volume)])

# 85th and 15th percentiles of traffic volume, ignoring missing values.
percentile_85 <- quantile(x = traffic_volume, probs = 0.85, na.rm = TRUE)
percentile_15 <- quantile(x = traffic_volume, probs = 0.15, na.rm = TRUE)

# Print the results
# Each cat() call writes a label, the stored statistic and a newline,
# joined by cat()'s default single-space separator. The "##" line after
# each call is the output recorded when the report was rendered.
cat("Mean Traffic Volume: ", mean_traffic_volume, "\n")
## Mean Traffic Volume:  14273.95
cat("Variance of Traffic Volume: ", variance_traffic_volume, "\n")
## Variance of Traffic Volume:  70004422
cat("Standard Deviation of Traffic Volume: ", sd_traffic_volume, "\n")
## Standard Deviation of Traffic Volume:  8366.865
cat("Mode of Traffic Volume: ", mode_traffic_volume, "\n")
## Mode of Traffic Volume:  3160
cat("85th Percentile of Traffic Volume: ", percentile_85, "\n")
## 85th Percentile of Traffic Volume:  24275.6
cat("15th Percentile of Traffic Volume: ", percentile_15, "\n")
## 15th Percentile of Traffic Volume:  3985.9

Density of Crash Speed Limits

# Density curves of Crash_Speed_LimitCat, filled by the same column.
# geom_density() plots an estimated density on the y axis, so the axis
# is labelled "Density" rather than "Count".
# NOTE(review): the manual fill scale suggests this column is
# categorical; a bar chart may describe it better -- confirm.
ggplot(data, aes(x = Crash_Speed_LimitCat, fill = Crash_Speed_LimitCat)) +
  geom_density() +
  scale_fill_manual(
    values = c("grey", "lightblue", "darkgreen", "yellow", "pink")
  ) +
  labs(
    title = "Density of Crash Speed Limits",
    x = "Speed Groups",
    y = "Density"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = .5))

Density of Age Group

# Density curves of Prsn_Age, filled by the same column.
# geom_density() plots an estimated density on the y axis, so the axis
# is labelled "Density" rather than "Count".
# NOTE(review): the manual fill scale suggests this column holds age
# groups rather than numeric ages; a bar chart may fit better -- confirm.
ggplot(data, aes(x = Prsn_Age, fill = Prsn_Age)) +
  geom_density() +
  scale_fill_manual(values = c("red", "blue", "green", "orange", "purple")) +
  labs(title = "Density of Person Age", x = "Age Group", y = "Density") +
  theme(axis.text.x = element_text(angle = 90, hjust = .5))

Scatter plot between Traffic Volume and Crash Speed Limit Category

# Scatter plot of TrafVol against Crash_Speed_LimitCat, drawn with
# semi-transparent blue points.
ggplot(
  data = data,
  mapping = aes(x = Crash_Speed_LimitCat, y = TrafVol)
) +
  theme_minimal() +
  geom_point(color = "blue", alpha = 0.5) +
  ggtitle("Scatter Plot of Traffic Volume vs. Crash Speed Limit Category") +
  xlab("Crash Speed Limit Category") +
  ylab("Traffic Volume")

Heat Map for Various Numeric Variables

# Load necessary libraries
library(ggcorrplot)

# Build the correlation matrix from the numeric columns only.
# - vapply() always yields one logical per column, where sapply()
#   could change its return type on unusual input.
# - drop = FALSE keeps a data frame even when just one column is
#   numeric; without it the selection collapses to a plain vector and
#   cor() fails.
numeric_columns <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]
# use = "complete.obs" leaves out every row that has a missing value.
correlation_matrix <- cor(numeric_columns, use = "complete.obs")

# Plot the correlation matrix with the "circle" method and with cell
# labels switched on, using a red / green / blue colour scale.
ggcorrplot(
  corr = correlation_matrix,
  title = "Correlation Heatmap of Numeric Variables",
  method = "circle",
  colors = c("red", "green", "blue"),
  lab_size = 5,
  lab = TRUE
)

Bar Plot of Crash Severity by Ethnicity

# Side-by-side (dodged) bars: row counts per Prsn_Ethnicity_ID value,
# with one bar for each Prsn_Injry_Sev_ID level.
ggplot(
  data = data,
  mapping = aes(
    x = factor(Prsn_Ethnicity_ID),
    fill = factor(Prsn_Injry_Sev_ID)
  )
) +
  theme_minimal() +
  geom_bar(position = "dodge") +
  ggtitle("Bar Plot of Crash Severity by Ethnicity") +
  xlab("Person Ethnicity ID") +
  labs(fill = "Prsn_Injry_Sev_ID")

Density Plot of Traffic Volume by Crash Severity

# Overlapping, semi-transparent density curves of TrafVol, one for
# each Prsn_Injry_Sev_ID level.
ggplot(
  data = data,
  mapping = aes(x = TrafVol, fill = factor(Prsn_Injry_Sev_ID))
) +
  theme_minimal() +
  geom_density(alpha = 0.4) +
  ggtitle("Density Plot of Traffic Volume by Crash Severity") +
  xlab("Traffic Volume") +
  labs(fill = "Prsn_Injry_Sev_ID")

Heat Map for Traffic Control and FHE Collision

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Count the rows for every (Traffic_Cntl_ID, FHE_Collsn_ID) pair.
# .groups = "drop" returns an ungrouped result directly, so the separate
# ungroup() step is unnecessary and summarise() no longer prints its
# "grouped output" message.
heatmap_data <- data %>%
  group_by(Traffic_Cntl_ID, FHE_Collsn_ID) %>%
  summarise(count = n(), .groups = "drop")
# Tile heat map: one tile per control / collision pair, shaded from
# light blue to dark blue by its count. The x-axis labels are rotated
# 45 degrees; that theme() tweak must follow theme_minimal() so it is
# not overwritten.
ggplot(
  data = heatmap_data,
  mapping = aes(x = Traffic_Cntl_ID, y = FHE_Collsn_ID, fill = count)
) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  geom_tile(color = "red") +
  scale_fill_gradient(high = "darkblue", low = "lightblue") +
  ggtitle("Heat Map of Traffic Control vs. FHE Collision") +
  xlab("Traffic Control") +
  ylab("FHE Collision") +
  labs(fill = "Count")