HW1_Anannya

library(stats)
library(dplyr)       
library(readxl)
library(ggplot2)
library(reshape2)
library(RColorBrewer)

Read the Data

df <- read_excel("C:/Users/gpk30/OneDrive - Texas State University/AI_Civ_Eng/HW1/HW1_Data.xlsx")
str(df)   
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
##  $ Wthr_Cond_ID        : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
##  $ Light_Cond_ID       : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
##  $ Road_Type_ID        : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
##  $ Road_Algn_ID        : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
##  $ SurfDry             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Traffic_Cntl_ID     : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
##  $ Harm_Evnt_ID        : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
##  $ Intrsct_Relat_ID    : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
##  $ FHE_Collsn_ID       : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
##  $ Road_Part_Adj_ID    : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
##  $ Road_Cls_ID         : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
##  $ Pop_Group_ID        : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
##  $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
##  $ Veh_Body_Styl_ID    : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
##  $ Prsn_Ethnicity_ID   : chr [1:1295] "White" "White" "White" "White" ...
##  $ GenMale             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ TrafVol             : num [1:1295] 18215 21568 23677 27232 21480 ...
##  $ Prsn_Age            : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
##  $ Prsn_Injry_Sev_ID   : chr [1:1295] "O" "O" "O" "O" ...
names(df)#States about the structure of data frame
##  [1] "Wthr_Cond_ID"         "Light_Cond_ID"        "Road_Type_ID"        
##  [4] "Road_Algn_ID"         "SurfDry"              "Traffic_Cntl_ID"     
##  [7] "Harm_Evnt_ID"         "Intrsct_Relat_ID"     "FHE_Collsn_ID"       
## [10] "Road_Part_Adj_ID"     "Road_Cls_ID"          "Pop_Group_ID"        
## [13] "Crash_Speed_LimitCat" "Veh_Body_Styl_ID"     "Prsn_Ethnicity_ID"   
## [16] "GenMale"              "TrafVol"              "Prsn_Age"            
## [19] "Prsn_Injry_Sev_ID"

Data Summarization

summary(df)
##  Wthr_Cond_ID       Light_Cond_ID      Road_Type_ID       Road_Algn_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     SurfDry       Traffic_Cntl_ID    Harm_Evnt_ID       Intrsct_Relat_ID  
##  Min.   :0.0000   Length:1295        Length:1295        Length:1295       
##  1st Qu.:1.0000   Class :character   Class :character   Class :character  
##  Median :1.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.9143                                                           
##  3rd Qu.:1.0000                                                           
##  Max.   :1.0000                                                           
##  FHE_Collsn_ID      Road_Part_Adj_ID   Road_Cls_ID        Pop_Group_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Crash_Speed_LimitCat Veh_Body_Styl_ID   Prsn_Ethnicity_ID     GenMale      
##  Length:1295          Length:1295        Length:1295        Min.   :0.0000  
##  Class :character     Class :character   Class :character   1st Qu.:1.0000  
##  Mode  :character     Mode  :character   Mode  :character   Median :1.0000  
##                                                             Mean   :0.8842  
##                                                             3rd Qu.:1.0000  
##                                                             Max.   :1.0000  
##     TrafVol        Prsn_Age         Prsn_Injry_Sev_ID 
##  Min.   :  209   Length:1295        Length:1295       
##  1st Qu.: 7174   Class :character   Class :character  
##  Median :15155   Mode  :character   Mode  :character  
##  Mean   :14650                                        
##  3rd Qu.:21556                                        
##  Max.   :28992

Data Visualization

Visualization of Numerical Data

Histogram of Traffic Volume

hist(df$TrafVol, xlab = "Traffic Volume", col = rgb(0.2,0.6,0.5, alpha = 0.3), 
     main = "Histogram of Traffic Volume",
     ylim = c(50,120) )

Visualization of Categorical Data

Barplot of Weather Condition

table(df$Wthr_Cond_ID)
## 
##  Clear Cloudy    Fog  Other   Rain 
##   1084    146     10     10     45
barplot(table(df$Wthr_Cond_ID), main = "Weather Condition", xlab = "Frequency",
        ylab  = "Weather Condition Categories", horiz = T, 
        col = c("dodgerblue", "grey", "aliceblue", "skyblue", "darkturquoise"), 
        xlim = c(0,1200))

Barplot of Light_Cond_ID

table(df$Light_Cond_ID)
## 
##     Dark, lighted Dark, not lighted          Daylight              Dusk 
##                42               154              1040                39 
##             Other 
##                20
barplot(table(df$Light_Cond_ID), main = "Lighting Condition", xlab = "Frequency",
        ylab  = "Light Condition Categories", horiz = T, 
        col = c("bisque4", "black", "aliceblue", "cornflowerblue", "darkslategray"), 
        xlim = c(0,1200))

Barplot of Road Type

table(df$Road_Type_ID)
## 
##              2 lane, 2 way   4 or more lanes, divided 
##                        551                        185 
## 4 or more lanes, undivided                      Other 
##                         96                          2 
##                    Unknown 
##                        461
barplot(table(df$Road_Type_ID),  main = "Road Type", xlab = "Frequency",
        ylab  = "Road Types",  
        col = "moccasin",
        ylim = c(0,600))

### Barplot of Road_Algn_ID

table(df$Road_Algn_ID)
## 
##        Curve, level               Other     Straight, grade Straight, hillcrest 
##                  62                  44                 135                  47 
##     Straight, level 
##                1007
barplot(table(df$Road_Algn_ID),  main = "Road Alignment", xlab = "Road Alignment",
        ylab  = "Fequency",  
        col = "darkseagreen2", 
        ylim = c(0,1200))

Barplot of Surface Condition

frequency_table <- table(df$SurfDry)
names(frequency_table) <- gsub("0", "Wet", names(frequency_table))
names(frequency_table) <- gsub("1", "Dry", names(frequency_table))
frequency_table
##  Wet  Dry 
##  111 1184
barplot(frequency_table,  main = "Road Surface", xlab = "Surface Type",
        ylab  = "Frequency",  
        col = c("gray10", "gray48"), 
        ylim = c(0,1200))

Density Plot

Density Plot of Surface Condition

table(df$Crash_Speed_LimitCat)
## 
##  > 70 mph 30-40 mph 45-60 mph 65-70 mph     Other 
##       203       336       471       223        62
ggplot(df, aes(x = Crash_Speed_LimitCat, fill = factor(Crash_Speed_LimitCat))) +
  geom_density(alpha = 0.5) +  # Adjust alpha for transparency
  labs(title = "Density Plot of Crash Speed Limit (mph)",
       x = "Crash Speed Limit (mph))", y = "Density",
       fill = "Crash Speed Limits") +
  theme_minimal()

Density Plot of Person Age

ggplot(df, aes(x = Prsn_Age, fill = factor(Prsn_Age))) +
  geom_density(alpha = 0.5) +  # Adjust alpha for transparency
  labs(title = "Density Plot of Person Age by Age Range",
       x = "Person Age (years)",
       fill = "Age Range") +
  scale_fill_manual(values = c("blue", "olivedrab", "orange", "skyblue", "saddlebrown")) +  # Custom colors
  theme_minimal()

Density Plot of Prsn_Injry_Sev_ID

ggplot(df, aes(x = Prsn_Injry_Sev_ID)) +
  geom_density(fill = "deepskyblue", alpha = 0.35) +
  labs(title = "Density Plot of Person Injury Severity",
       x = "Injury Severity",
       y = "Density") +
  theme_minimal()

Boxplot

Boxplot of Traffic Volume by Ethnicity

ggplot(df, aes(x = factor(Prsn_Ethnicity_ID), y = TrafVol)) +
  geom_boxplot(fill = c("lightslateblue", "navajowhite3", "lightsalmon", "indianred4", "peachpuff2"), outlier.colour = "red", alpha = 0.3) +
  geom_jitter(width = 0.2, color = "blue", alpha = 0.3) +  # Add jittered points
  labs(title = "Box Plot of Traffic Volume by Person Ethnicity",
       x = "Ethnicity",
       y = "Traffic Volume") +
  theme_minimal()

Boxplot of Traffic Volume by Person Age (years)

ggplot(df, aes(x = factor(Prsn_Age), y = TrafVol)) +
  geom_boxplot(fill = c("tomato", "red", "tomato1", "tan1", "orangered"), outlier.colour = "red", alpha = 0.35) +
  geom_jitter(width = 0.2, color = "blue", alpha = 0.3) +  # Add jittered points
  labs(title = "Box Plot of Traffic Volume by Person Age",
       x = "Person Age (years)",
       y = "Traffic Volume") +
  theme_minimal()

##Heatmap

HeatMap of Road Alignment and Harm Event

#creating Contingency Table
con_tab_1 <- xtabs(~ Prsn_Age + Prsn_Injry_Sev_ID, data = df)
print(con_tab_1)
##              Prsn_Injry_Sev_ID
## Prsn_Age       BC  KA   O
##   15-24 years  18   5 165
##   25-54 years  56  19 497
##   55-64 years  23   8 201
##   65-74 years  14  10  94
##   Other         9   8 168
#converting to dataframe
con_df1 <- as.data.frame(con_tab_1)


#HeatMap
ggplot(con_df1, aes(x = Prsn_Age, y = Prsn_Injry_Sev_ID, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "navy") +
  labs(title = "Heatmap of Person Injury Severity vs Age",
       x = "Person Age (years)",
       y = "Injury Severity",
       fill = "Frequency") +
  theme_minimal()

HeatMap of FHE Collision and Road Class

#creating Contingency Table
con_tab_2 <- xtabs(~ FHE_Collsn_ID + Road_Cls_ID, data = df)
print(con_tab_2)
##                                   Road_Cls_ID
## FHE_Collsn_ID                      City street County road Farm to market Other
##   Omv vehicle going straight                58          52             59    25
##   Other                                    101          81             91    32
##   Sd both going straight-rear end           23          17            110    15
##   Sd both going straight-sideswipe          24          16             35    15
##   Sd one straight-one left turn             15          25             79     2
##                                   Road_Cls_ID
## FHE_Collsn_ID                      Us & state highways
##   Omv vehicle going straight                        65
##   Other                                             92
##   Sd both going straight-rear end                  147
##   Sd both going straight-sideswipe                  69
##   Sd one straight-one left turn                     47
#converting to dataframe
con_df2 <- as.data.frame(con_tab_2)


#HeatMap
ggplot(con_df2, aes(x = FHE_Collsn_ID, y = Road_Cls_ID, fill = Freq, , alpha =0.5)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Heatmap of Road Class vs FHE Collision Type",
       x = "FHE Collision Type",
       y = "Road Class",
       fill = "Frequency") +
  theme_minimal() + theme(axis.text.x = element_text(angle = 20, hjust = 1)) 

Correlation Matrix

library(vcd)         # For computing chi-square statistic

# Sample data: Replace this with your actual dataset
# The character variables from your dataset
categorical_vars <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", 
                      "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", "FHE_Collsn_ID", 
                      "Road_Part_Adj_ID", "Road_Cls_ID", "Pop_Group_ID", "Crash_Speed_LimitCat", 
                      "Veh_Body_Styl_ID", "Prsn_Ethnicity_ID", "Prsn_Age")

# Filter only the character columns from your actual dataset
data_categorical <- df[categorical_vars]

# Function to calculate Contingency Coefficient for a pair of categorical variables
contingency_coefficient <- function(x, y) {
  # Create a contingency table
  tbl <- table(x, y)
  
  # Perform chi-square test
  chi2_test <- chisq.test(tbl)
  
  # Calculate Contingency Coefficient
  N <- sum(tbl)  # Total number of observations
  chi2 <- chi2_test$statistic  # Chi-square statistic
  C <- sqrt(chi2 / (chi2 + N))  # Contingency Coefficient formula
  
  return(C)
}

# Create an empty matrix to store Contingency Coefficients
contingency_matrix_4 <- matrix(NA, ncol = length(categorical_vars), nrow = length(categorical_vars))
colnames(contingency_matrix_4) <- categorical_vars
rownames(contingency_matrix_4) <- categorical_vars

# Loop through each pair of categorical variables and calculate Contingency Coefficients
for (i in 1:length(categorical_vars)) {
  for (j in 1:length(categorical_vars)) {
    if (i != j) {
      contingency_matrix_4[i, j] <- contingency_coefficient(data_categorical[[categorical_vars[i]]], 
                                                          data_categorical[[categorical_vars[j]]])
    }
  }
}

# Convert the matrix to a data frame for easier viewing and plotting
contingency_matrix_df <- as.data.frame(contingency_matrix_4)

# Melt the matrix for use in ggplot2, ensure row and column names are set correctly
contingency_matrix_melt <- melt(as.matrix(contingency_matrix_df), varnames = c("Var1", "Var2"))

# Plot the Contingency Coefficient matrix as a heatmap
ggplot(data = contingency_matrix_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightgrey", high = "blue", na.value = "white") +
  geom_text(aes(label = round(value, 2)), color = "white", size = 4) +
  labs(title = "Contingency Coefficient Matrix for Categorical Variables", x = "Categorical Variables", y = "Categorical Variables") +
  theme_minimal() + theme(axis.text.x = element_text(angle = 40, hjust = 1)) 

Correlation Plot of Numerical Values

numeric_cols <- df[, sapply(df, is.numeric)]
cor_matrix <- cor(numeric_cols, use = "complete.obs")
library(corrplot)
## corrplot 0.94 loaded
corrplot(cor_matrix, method = "circle", type = "lower")

Violin Plot

set.seed(42)

mean_values <- df %>%
  group_by(Prsn_Injry_Sev_ID) %>%
  summarize(mean_TrafVol = mean(TrafVol))


ggplot(df, aes(x = factor(Prsn_Injry_Sev_ID), y = TrafVol, fill = factor(Prsn_Injry_Sev_ID)))+
  geom_violin(fill = "lightgray", color = "black", alpha = 0.7, trim = FALSE) +
 geom_boxplot(width = 0.1, color = "black", outlier.shape = NA, alpha = 0.5) +  # Boxplot inside the violin for better clarity
  geom_jitter(width = 0.1, size = 1, alpha = 0.3, color = "#34495e") + 
  
  geom_point(data = mean_values, aes(x = Prsn_Injry_Sev_ID, y = mean_TrafVol), 
             color = "black", size = 3, position = position_dodge(width = 0.75)) +
  labs(title = "Violin Plot of Traffic Volume by Injury Severity",
       x = "Injury Severity",
       y = "Traffic Volume") +  # Added closing parenthesis here

  theme_minimal()

Cramer’s V Matrix

library(vcd)
library(dplyr)
library(DT)

cramersV_matrix <- function(data) {
  # Create an empty matrix to store the Cramér's V values
  vars <- names(data)
  n <- length(vars)
  v_matrix <- matrix(NA, nrow = n, ncol = n, dimnames = list(vars, vars))
  
  for (i in 1:n) {
    for (j in i:n) {
      # Create a contingency table
      tbl <- table(data[[i]], data[[j]])
      if (length(tbl) > 1) {
        # Calculate Cramér's V only if there are at least 2 levels in both variables
        v_matrix[i, j] <- vcd::assocstats(tbl)$cramer
        v_matrix[j, i] <- v_matrix[i, j]  # Fill in the symmetric part
      } else {
        v_matrix[i, j] <- NA
        v_matrix[j, i] <- NA
      }
    }
  }
  return(v_matrix)
}

# Assuming df is your original data frame
categorical_df <- df %>%
  select(where(~ is.factor(.) | is.character(.)))

cramers_v_result <- cramersV_matrix(categorical_df)



# Convert the matrix to a data frame
cramers_v_df <- as.data.frame(cramers_v_result)

# Display as an interactive table
datatable(cramers_v_df, options = list(pageLength = 10, autoWidth = TRUE))