HW1_Anannya
library(stats)
library(dplyr)
library(readxl)
library(ggplot2)
library(reshape2)
library(RColorBrewer)Read the Data
df <- read_excel("C:/Users/gpk30/OneDrive - Texas State University/AI_Civ_Eng/HW1/HW1_Data.xlsx")
str(df) ## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
## $ Wthr_Cond_ID : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
## $ Light_Cond_ID : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
## $ Road_Type_ID : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
## $ Road_Algn_ID : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
## $ SurfDry : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ Traffic_Cntl_ID : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
## $ Harm_Evnt_ID : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
## $ Intrsct_Relat_ID : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
## $ FHE_Collsn_ID : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
## $ Road_Part_Adj_ID : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
## $ Road_Cls_ID : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
## $ Pop_Group_ID : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
## $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
## $ Veh_Body_Styl_ID : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
## $ Prsn_Ethnicity_ID : chr [1:1295] "White" "White" "White" "White" ...
## $ GenMale : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ TrafVol : num [1:1295] 18215 21568 23677 27232 21480 ...
## $ Prsn_Age : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
## $ Prsn_Injry_Sev_ID : chr [1:1295] "O" "O" "O" "O" ...
## [1] "Wthr_Cond_ID" "Light_Cond_ID" "Road_Type_ID"
## [4] "Road_Algn_ID" "SurfDry" "Traffic_Cntl_ID"
## [7] "Harm_Evnt_ID" "Intrsct_Relat_ID" "FHE_Collsn_ID"
## [10] "Road_Part_Adj_ID" "Road_Cls_ID" "Pop_Group_ID"
## [13] "Crash_Speed_LimitCat" "Veh_Body_Styl_ID" "Prsn_Ethnicity_ID"
## [16] "GenMale" "TrafVol" "Prsn_Age"
## [19] "Prsn_Injry_Sev_ID"
Data Summarization
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID
## Length:1295 Length:1295 Length:1295 Length:1295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## SurfDry Traffic_Cntl_ID Harm_Evnt_ID Intrsct_Relat_ID
## Min. :0.0000 Length:1295 Length:1295 Length:1295
## 1st Qu.:1.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.9143
## 3rd Qu.:1.0000
## Max. :1.0000
## FHE_Collsn_ID Road_Part_Adj_ID Road_Cls_ID Pop_Group_ID
## Length:1295 Length:1295 Length:1295 Length:1295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Crash_Speed_LimitCat Veh_Body_Styl_ID Prsn_Ethnicity_ID GenMale
## Length:1295 Length:1295 Length:1295 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:1.0000
## Mode :character Mode :character Mode :character Median :1.0000
## Mean :0.8842
## 3rd Qu.:1.0000
## Max. :1.0000
## TrafVol Prsn_Age Prsn_Injry_Sev_ID
## Min. : 209 Length:1295 Length:1295
## 1st Qu.: 7174 Class :character Class :character
## Median :15155 Mode :character Mode :character
## Mean :14650
## 3rd Qu.:21556
## Max. :28992
Data Visualization
Visualization of Numerical Data
Visualization of Categorical Data
Barplot of Weather Condition
##
## Clear Cloudy Fog Other Rain
## 1084 146 10 10 45
barplot(table(df$Wthr_Cond_ID), main = "Weather Condition", xlab = "Frequency",
ylab = "Weather Condition Categories", horiz = T,
col = c("dodgerblue", "grey", "aliceblue", "skyblue", "darkturquoise"),
xlim = c(0,1200))Barplot of Light_Cond_ID
##
## Dark, lighted Dark, not lighted Daylight Dusk
## 42 154 1040 39
## Other
## 20
barplot(table(df$Light_Cond_ID), main = "Lighting Condition", xlab = "Frequency",
ylab = "Light Condition Categories", horiz = T,
col = c("bisque4", "black", "aliceblue", "cornflowerblue", "darkslategray"),
xlim = c(0,1200))Barplot of Road Type
##
## 2 lane, 2 way 4 or more lanes, divided
## 551 185
## 4 or more lanes, undivided Other
## 96 2
## Unknown
## 461
barplot(table(df$Road_Type_ID), main = "Road Type", xlab = "Frequency",
ylab = "Road Types",
col = "moccasin",
ylim = c(0,600))
### Barplot of Road_Algn_ID
##
## Curve, level Other Straight, grade Straight, hillcrest
## 62 44 135 47
## Straight, level
## 1007
barplot(table(df$Road_Algn_ID), main = "Road Alignment", xlab = "Road Alignment",
ylab = "Fequency",
col = "darkseagreen2",
ylim = c(0,1200))Barplot of Surface Condition
frequency_table <- table(df$SurfDry)
names(frequency_table) <- gsub("0", "Wet", names(frequency_table))
names(frequency_table) <- gsub("1", "Dry", names(frequency_table))
frequency_table## Wet Dry
## 111 1184
barplot(frequency_table, main = "Road Surface", xlab = "Surface Type",
ylab = "Frequency",
col = c("gray10", "gray48"),
ylim = c(0,1200))Density Plot
Density Plot of Surface Condition
##
## > 70 mph 30-40 mph 45-60 mph 65-70 mph Other
## 203 336 471 223 62
ggplot(df, aes(x = Crash_Speed_LimitCat, fill = factor(Crash_Speed_LimitCat))) +
geom_density(alpha = 0.5) + # Adjust alpha for transparency
labs(title = "Density Plot of Crash Speed Limit (mph)",
x = "Crash Speed Limit (mph))", y = "Density",
fill = "Crash Speed Limits") +
theme_minimal()Density Plot of Person Age
ggplot(df, aes(x = Prsn_Age, fill = factor(Prsn_Age))) +
geom_density(alpha = 0.5) + # Adjust alpha for transparency
labs(title = "Density Plot of Person Age by Age Range",
x = "Person Age (years)",
fill = "Age Range") +
scale_fill_manual(values = c("blue", "olivedrab", "orange", "skyblue", "saddlebrown")) + # Custom colors
theme_minimal()Boxplot
Boxplot of Traffic Volume by Ethnicity
ggplot(df, aes(x = factor(Prsn_Ethnicity_ID), y = TrafVol)) +
geom_boxplot(fill = c("lightslateblue", "navajowhite3", "lightsalmon", "indianred4", "peachpuff2"), outlier.colour = "red", alpha = 0.3) +
geom_jitter(width = 0.2, color = "blue", alpha = 0.3) + # Add jittered points
labs(title = "Box Plot of Traffic Volume by Person Ethnicity",
x = "Ethnicity",
y = "Traffic Volume") +
theme_minimal()Boxplot of Traffic Volume by Person Age (years)
ggplot(df, aes(x = factor(Prsn_Age), y = TrafVol)) +
geom_boxplot(fill = c("tomato", "red", "tomato1", "tan1", "orangered"), outlier.colour = "red", alpha = 0.35) +
geom_jitter(width = 0.2, color = "blue", alpha = 0.3) + # Add jittered points
labs(title = "Box Plot of Traffic Volume by Person Age",
x = "Person Age (years)",
y = "Traffic Volume") +
theme_minimal()##Heatmap
HeatMap of Road Alignment and Harm Event
#creating Contingency Table
con_tab_1 <- xtabs(~ Prsn_Age + Prsn_Injry_Sev_ID, data = df)
print(con_tab_1)## Prsn_Injry_Sev_ID
## Prsn_Age BC KA O
## 15-24 years 18 5 165
## 25-54 years 56 19 497
## 55-64 years 23 8 201
## 65-74 years 14 10 94
## Other 9 8 168
#converting to dataframe
con_df1 <- as.data.frame(con_tab_1)
#HeatMap
ggplot(con_df1, aes(x = Prsn_Age, y = Prsn_Injry_Sev_ID, fill = Freq)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "navy") +
labs(title = "Heatmap of Person Injury Severity vs Age",
x = "Person Age (years)",
y = "Injury Severity",
fill = "Frequency") +
theme_minimal()HeatMap of FHE Collision and Road Class
#creating Contingency Table
con_tab_2 <- xtabs(~ FHE_Collsn_ID + Road_Cls_ID, data = df)
print(con_tab_2)## Road_Cls_ID
## FHE_Collsn_ID City street County road Farm to market Other
## Omv vehicle going straight 58 52 59 25
## Other 101 81 91 32
## Sd both going straight-rear end 23 17 110 15
## Sd both going straight-sideswipe 24 16 35 15
## Sd one straight-one left turn 15 25 79 2
## Road_Cls_ID
## FHE_Collsn_ID Us & state highways
## Omv vehicle going straight 65
## Other 92
## Sd both going straight-rear end 147
## Sd both going straight-sideswipe 69
## Sd one straight-one left turn 47
#converting to dataframe
con_df2 <- as.data.frame(con_tab_2)
#HeatMap
ggplot(con_df2, aes(x = FHE_Collsn_ID, y = Road_Cls_ID, fill = Freq, , alpha =0.5)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red") +
labs(title = "Heatmap of Road Class vs FHE Collision Type",
x = "FHE Collision Type",
y = "Road Class",
fill = "Frequency") +
theme_minimal() + theme(axis.text.x = element_text(angle = 20, hjust = 1)) Correlation Matrix
library(vcd) # For computing chi-square statistic
# Sample data: Replace this with your actual dataset
# The character variables from your dataset
categorical_vars <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID",
"Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", "FHE_Collsn_ID",
"Road_Part_Adj_ID", "Road_Cls_ID", "Pop_Group_ID", "Crash_Speed_LimitCat",
"Veh_Body_Styl_ID", "Prsn_Ethnicity_ID", "Prsn_Age")
# Filter only the character columns from your actual dataset
data_categorical <- df[categorical_vars]
# Function to calculate Contingency Coefficient for a pair of categorical variables
contingency_coefficient <- function(x, y) {
# Create a contingency table
tbl <- table(x, y)
# Perform chi-square test
chi2_test <- chisq.test(tbl)
# Calculate Contingency Coefficient
N <- sum(tbl) # Total number of observations
chi2 <- chi2_test$statistic # Chi-square statistic
C <- sqrt(chi2 / (chi2 + N)) # Contingency Coefficient formula
return(C)
}
# Create an empty matrix to store Contingency Coefficients
contingency_matrix_4 <- matrix(NA, ncol = length(categorical_vars), nrow = length(categorical_vars))
colnames(contingency_matrix_4) <- categorical_vars
rownames(contingency_matrix_4) <- categorical_vars
# Loop through each pair of categorical variables and calculate Contingency Coefficients
for (i in 1:length(categorical_vars)) {
for (j in 1:length(categorical_vars)) {
if (i != j) {
contingency_matrix_4[i, j] <- contingency_coefficient(data_categorical[[categorical_vars[i]]],
data_categorical[[categorical_vars[j]]])
}
}
}
# Convert the matrix to a data frame for easier viewing and plotting
contingency_matrix_df <- as.data.frame(contingency_matrix_4)
# Melt the matrix for use in ggplot2, ensure row and column names are set correctly
contingency_matrix_melt <- melt(as.matrix(contingency_matrix_df), varnames = c("Var1", "Var2"))
# Plot the Contingency Coefficient matrix as a heatmap
ggplot(data = contingency_matrix_melt, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightgrey", high = "blue", na.value = "white") +
geom_text(aes(label = round(value, 2)), color = "white", size = 4) +
labs(title = "Contingency Coefficient Matrix for Categorical Variables", x = "Categorical Variables", y = "Categorical Variables") +
theme_minimal() + theme(axis.text.x = element_text(angle = 40, hjust = 1)) Correlation Plot of Numerical Values
numeric_cols <- df[, sapply(df, is.numeric)]
cor_matrix <- cor(numeric_cols, use = "complete.obs")
library(corrplot)## corrplot 0.94 loaded
Violin Plot
set.seed(42)
mean_values <- df %>%
group_by(Prsn_Injry_Sev_ID) %>%
summarize(mean_TrafVol = mean(TrafVol))
ggplot(df, aes(x = factor(Prsn_Injry_Sev_ID), y = TrafVol, fill = factor(Prsn_Injry_Sev_ID)))+
geom_violin(fill = "lightgray", color = "black", alpha = 0.7, trim = FALSE) +
geom_boxplot(width = 0.1, color = "black", outlier.shape = NA, alpha = 0.5) + # Boxplot inside the violin for better clarity
geom_jitter(width = 0.1, size = 1, alpha = 0.3, color = "#34495e") +
geom_point(data = mean_values, aes(x = Prsn_Injry_Sev_ID, y = mean_TrafVol),
color = "black", size = 3, position = position_dodge(width = 0.75)) +
labs(title = "Violin Plot of Traffic Volume by Injury Severity",
x = "Injury Severity",
y = "Traffic Volume") + # Added closing parenthesis here
theme_minimal()Cramer’s V Matrix
library(vcd)
library(dplyr)
library(DT)
cramersV_matrix <- function(data) {
# Create an empty matrix to store the Cramér's V values
vars <- names(data)
n <- length(vars)
v_matrix <- matrix(NA, nrow = n, ncol = n, dimnames = list(vars, vars))
for (i in 1:n) {
for (j in i:n) {
# Create a contingency table
tbl <- table(data[[i]], data[[j]])
if (length(tbl) > 1) {
# Calculate Cramér's V only if there are at least 2 levels in both variables
v_matrix[i, j] <- vcd::assocstats(tbl)$cramer
v_matrix[j, i] <- v_matrix[i, j] # Fill in the symmetric part
} else {
v_matrix[i, j] <- NA
v_matrix[j, i] <- NA
}
}
}
return(v_matrix)
}
# Assuming df is your original data frame
categorical_df <- df %>%
select(where(~ is.factor(.) | is.character(.)))
cramers_v_result <- cramersV_matrix(categorical_df)
# Convert the matrix to a data frame
cramers_v_df <- as.data.frame(cramers_v_result)
# Display as an interactive table
datatable(cramers_v_df, options = list(pageLength = 10, autoWidth = TRUE))