# Load Libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
# Load Dataset
# NOTE(review): the path below is absolute and machine-specific; point it at
# your local copy of the CSV before running this script elsewhere.
df <- read.csv(
  file = "C:/Users/talwi/Downloads/indian_roads_dataset (1).csv"
)
# View Data: print the first six rows
head(df, n = 6)
##   accident_id       city       state latitude longitude       date  time hour
## 1           0       Pune Maharashtra 18.68083  73.93039 2023-10-22  5:00    5
## 2           1     Mumbai Maharashtra 18.81773  72.79085 2023-05-21  4:00    4
## 3           2     Mumbai Maharashtra 19.09689  72.81942 2024-07-10 13:00   13
## 4           3 Chandigarh      Punjab 30.78780  76.84751 2025-03-30 11:00   11
## 5           4    Chennai  Tamil Nadu 12.96515  80.28331 2024-01-25 16:00   16
## 6           5      Delhi       Delhi 28.79949  77.04967 2024-07-29  8:00    8
##   day_of_week is_weekend road_type lanes traffic_signal weather visibility
## 1      Sunday          1   highway     3              1     fog        low
## 2      Sunday          1     urban     4              0   clear       high
## 3   Wednesday          0     urban     3              0     fog        low
## 4      Sunday          1     urban     1              1     fog        low
## 5    Thursday          0   highway     3              1   clear       high
## 6      Monday          0     urban     5              1   clear       high
##   temperature traffic_density        cause accident_severity vehicles_involved
## 1          32            high      weather             fatal                 2
## 2          34             low      weather             major                 4
## 3          21          medium      weather             minor                 1
## 4          30            high  distraction             minor                 5
## 5          24             low  distraction             minor                 2
## 6          36            high overspeeding             fatal                 4
##   casualties is_peak_hour festival risk_score
## 1          2            0     None       0.85
## 2          3            0     None       0.10
## 3          1            0     None       0.45
## 4          2            0     None       0.65
## 5          1            0     None       0.10
## 6          4            1     None       0.65
# Compact overview of each column's type and first values
utils::str(df)
## 'data.frame':    20000 obs. of  24 variables:
##  $ accident_id      : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ city             : chr  "Pune" "Mumbai" "Mumbai" "Chandigarh" ...
##  $ state            : chr  "Maharashtra" "Maharashtra" "Maharashtra" "Punjab" ...
##  $ latitude         : num  18.7 18.8 19.1 30.8 13 ...
##  $ longitude        : num  73.9 72.8 72.8 76.8 80.3 ...
##  $ date             : chr  "2023-10-22" "2023-05-21" "2024-07-10" "2025-03-30" ...
##  $ time             : chr  "5:00" "4:00" "13:00" "11:00" ...
##  $ hour             : int  5 4 13 11 16 8 6 14 3 4 ...
##  $ day_of_week      : chr  "Sunday" "Sunday" "Wednesday" "Sunday" ...
##  $ is_weekend       : int  1 1 0 1 0 0 0 1 0 1 ...
##  $ road_type        : chr  "highway" "urban" "urban" "urban" ...
##  $ lanes            : int  3 4 3 1 3 5 3 3 1 3 ...
##  $ traffic_signal   : int  1 0 0 1 1 1 0 1 0 1 ...
##  $ weather          : chr  "fog" "clear" "fog" "fog" ...
##  $ visibility       : chr  "low" "high" "low" "low" ...
##  $ temperature      : int  32 34 21 30 24 36 33 25 22 23 ...
##  $ traffic_density  : chr  "high" "low" "medium" "high" ...
##  $ cause            : chr  "weather" "weather" "weather" "distraction" ...
##  $ accident_severity: chr  "fatal" "major" "minor" "minor" ...
##  $ vehicles_involved: int  2 4 1 5 2 4 4 3 1 1 ...
##  $ casualties       : int  2 3 1 2 1 4 4 0 0 0 ...
##  $ is_peak_hour     : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ festival         : chr  "None" "None" "None" "None" ...
##  $ risk_score       : num  0.85 0.1 0.45 0.65 0.1 0.65 0.3 0.1 0.45 0.1 ...
# DATA PREPROCESSING

# Convert the categorical text columns to factors so that plots, ANOVA and
# regression treat them as groups rather than free text.
df[["accident_severity"]] <- as.factor(df[["accident_severity"]])
df[["traffic_density"]] <- as.factor(df[["traffic_density"]])
df[["weather"]] <- as.factor(df[["weather"]])
df[["cause"]] <- as.factor(df[["cause"]])
df[["road_type"]] <- as.factor(df[["road_type"]])

# Rows without a festival value are labelled "None", matching the label
# already used in the data.
df$festival <- replace(df$festival, is.na(df$festival), "None")
# ==========================================
# BASIC STATISTICAL ANALYSIS
# ==========================================

# Q1: What is the average number of casualties occurring in road accidents?
print(mean(df$casualties))
## [1] 1.72645
# Q2: What is the median value of casualties in the dataset?
print(median(df$casualties))
## [1] 1
# Q3: How much variation exists in the number of casualties across accidents?
print(sd(df$casualties))
## [1] 1.489104
# Q4: How are casualties distributed across different accidents?
# casualties is an integer count, so draw one bar per value (binwidth = 1).
# A fixed 30 equal-width bins over a short integer range leaves empty bins
# between the bars and makes the distribution look artificially spiky.
ggplot(df, aes(casualties)) +
  geom_histogram(binwidth = 1)

INTERPRETATION: This graph shows how casualties are spread across accidents. Most accidents have a low number of casualties, and only a few accidents have very high casualties.

# Q5: What is the frequency distribution of different accident severity levels?
# Counts per severity level, first as a table and then as a bar chart.
print(table(df[["accident_severity"]]))
## 
## fatal major minor 
##  2987  5988 11025
ggplot(data = df, mapping = aes(x = accident_severity)) +
  geom_bar(fill = "blue")

INTERPRETATION: The bar chart illustrates the frequency distribution of accident severity levels, showing a clear trend where minor accidents are the most frequent, at about 11,000 cases (11,025). In contrast, fatal accidents occur least often (approximately 3,000 cases), while major accidents fall in between at roughly 6,000 cases. Overall, the data indicates an inverse relationship between the severity of an accident and its frequency of occurrence.

# Q6: How are road accidents distributed under different weather conditions?
# One bar per weather category; bar height is the number of accident records.
ggplot(data = df, mapping = aes(x = weather)) +
  geom_bar(fill = "orange")

INTERPRETATIONS: Based on the bar graph, road accidents are distributed almost equally across clear, foggy, and rainy weather conditions, with each category recording a count of approximately 6,000 cases. The uniform height of the bars suggests that, within this dataset, weather conditions do not significantly impact the total frequency of accidents.

# Q7: How does traffic density influence the occurrence of accidents?
# One bar per traffic-density category; bar height is the number of records.
ggplot(data = df, mapping = aes(x = traffic_density)) +
  geom_bar(fill = "purple")

INTERPRETATION: Based on the bar graph, road accidents occur most frequently at both high and low traffic densities, with each category recording approximately 6,000 cases. Interestingly, medium traffic density shows a lower frequency of accidents, totaling around 5,000 cases. This distribution suggests that accidents are more prevalent during extreme traffic conditions—either very high congestion or very sparse flow—compared to moderate traffic levels.

# Q8: Which types of roads experience the highest number of accidents?
# One bar per road type; bar height is the number of accident records.
ggplot(data = df, mapping = aes(x = road_type)) +
  geom_bar(fill = "brown")

INTERPRETATION: The bar chart visualizes the distribution of accidents across three distinct road types: highway, rural, and urban. Based on the data shown, the frequency of accidents is nearly identical for all three categories, with each bar reaching a count of approximately 6,000. This suggests that, within this specific dataset, the road environment does not appear to be a major differentiating factor for the number of accidents, as the occurrences are balanced across all road types.

# ==========================================
# RELATIONSHIP ANALYSIS
# ==========================================

# Q9: Is there a relationship between the number of vehicles involved and casualties?
# Pearson correlation between the two counts, followed by a scatter plot with
# a fitted straight line.
print(with(df, cor(vehicles_involved, casualties)))
## [1] 0.5509664
ggplot(data = df, mapping = aes(x = vehicles_involved, y = casualties)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

INTERPRETATION: “The correlation value is around 0.55, which indicates a moderate positive relationship between vehicles involved and casualties. This means that as the number of vehicles increases, the number of casualties also tends to increase.”

# Q10: How does temperature affect the number of casualties in accidents?
# Scatter plot of casualties against temperature, one point per record.
ggplot(data = df, mapping = aes(x = temperature, y = casualties)) +
  geom_point()

INTERPRETATION: Visually, there is no clear correlation or trend between the two variables; casualties of all levels occur consistently across the entire temperature range (approximately 15 to 40 units). Based on this specific visualization, temperature does not appear to have a significant or predictable impact on the number of casualties.

# Q11: What is the relationship between the number of lanes and casualties?
# Scatter plot of casualties against lane count, one point per record.
ggplot(data = df, mapping = aes(x = lanes, y = casualties)) +
  geom_point()

INTERPRETATION: Based on this visualization, there is no clear correlation or trend; casualties appear to occur across the entire range of lane counts without a visible increase or decrease as the number of lanes changes. Additionally, the grid layout suggests significant overplotting, meaning many individual data points may be stacked directly on top of each other at those specific coordinates.

# Q12: On which days of the week do most accidents occur?
# day_of_week is a character column, so ggplot would otherwise order the bars
# alphabetically (Friday, Monday, ...). Pin the axis to calendar order.
# Any value not listed in `limits` is dropped from the plot with a warning.
ggplot(df, aes(day_of_week)) +
  geom_bar(fill = "orange") +
  scale_x_discrete(
    limits = c(
      "Monday", "Tuesday", "Wednesday", "Thursday",
      "Friday", "Saturday", "Sunday"
    )
  )

# Q13: At what hours of the day are accidents most frequent?
# Histogram of the hour column split into 24 bins.
ggplot(data = df, mapping = aes(x = hour)) +
  geom_histogram(bins = 24, fill = "cyan")

INTERPRETATION: The histogram illustrates the frequency of accidents distributed across the 24 hours of a day. The data shows a remarkably uniform distribution, suggesting that accidents occur at a relatively consistent rate regardless of the hour. While there are minor fluctuations throughout the day, there is no single significant peak or period where accident frequency drastically increases or decreases, indicating a steady risk level across the entire 24-hour cycle.

# Q14: Do peak hours result in more accidents compared to non-peak hours?
# is_peak_hour is a 0/1 integer flag. Mapping it directly gives a continuous
# x axis with meaningless fractional ticks, so treat it as a category and keep
# the original axis title.
# NOTE(review): raw counts are not adjusted for how many hours of the day count
# as peak, so bar heights alone do not show a higher accident rate.
ggplot(df, aes(factor(is_peak_hour))) +
  geom_bar(fill = "pink") +
  scale_x_discrete(name = "is_peak_hour")

# ==========================================
# LOCATION ANALYSIS
# ==========================================

# Q15: Which cities have the highest number of road accidents?
# Count records per city, sort from most to fewest, keep at most ten rows.
top_cities <- df |>
  group_by(city) |>
  summarise(total_accidents = n(), .groups = "drop") |>
  arrange(desc(total_accidents)) |>
  head(n = 10)
print(top_cities)
## # A tibble: 8 × 2
##   city       total_accidents
##   <chr>                <int>
## 1 Chandigarh            2577
## 2 Chennai               2575
## 3 Kolkata               2559
## 4 Pune                  2517
## 5 Mumbai                2492
## 6 Bangalore             2438
## 7 Delhi                 2433
## 8 Hyderabad             2409
# Horizontal bars, ordered by accident count.
ggplot(
  data = top_cities,
  mapping = aes(x = reorder(city, total_accidents), y = total_accidents)
) +
  geom_col(fill = "red") +
  coord_flip()

# Q16: Which states have the highest number of road accidents?
# Count records per state, sort from most to fewest, keep at most ten rows.
top_states <- df |>
  group_by(state) |>
  summarise(total_accidents = n(), .groups = "drop") |>
  arrange(desc(total_accidents)) |>
  head(n = 10)
print(top_states)
## # A tibble: 7 × 2
##   state       total_accidents
##   <chr>                 <int>
## 1 Maharashtra            5009
## 2 Punjab                 2577
## 3 Tamil Nadu             2575
## 4 West Bengal            2559
## 5 Karnataka              2438
## 6 Delhi                  2433
## 7 Telangana              2409
# Horizontal bars, ordered by accident count.
ggplot(
  data = top_states,
  mapping = aes(x = reorder(state, total_accidents), y = total_accidents)
) +
  geom_col(fill = "blue") +
  coord_flip()

# Q17: Which cities report the highest number of casualties?
# Sum casualties per city, sort from most to fewest, keep at most ten rows.
city_casualties <- df |>
  group_by(city) |>
  summarise(total_casualties = sum(casualties), .groups = "drop") |>
  arrange(desc(total_casualties)) |>
  head(n = 10)
print(city_casualties)
## # A tibble: 8 × 2
##   city       total_casualties
##   <chr>                 <int>
## 1 Chennai                4567
## 2 Kolkata                4508
## 3 Pune                   4419
## 4 Chandigarh             4377
## 5 Mumbai                 4297
## 6 Delhi                  4175
## 7 Bangalore              4128
## 8 Hyderabad              4058
# Horizontal bars, ordered by total casualties.
ggplot(
  data = city_casualties,
  mapping = aes(x = reorder(city, total_casualties), y = total_casualties)
) +
  geom_col(fill = "green") +
  coord_flip()

# Q19: How do weather conditions affect accident severity levels?
# Side-by-side bars: one group per weather category, one bar per severity.
ggplot(data = df, mapping = aes(x = weather, fill = accident_severity)) +
  geom_bar(position = "dodge")

INTERPRETATION: This bar chart illustrates the relationship between weather conditions (clear, fog, and rain) and the frequency of different accident severity levels (fatal, major, and minor). Across all three weather categories, the distribution remains remarkably consistent: minor accidents are the most frequent, occurring roughly 4,000 times, followed by major accidents at approximately 2,000 instances, and fatal accidents as the least common at around 1,000. Because the height and ratio of the bars are nearly identical for clear, fog, and rain conditions, the graph suggests that, for this specific dataset, weather conditions do not have a significant impact on the severity level of accidents.

# ==========================================
# REGRESSION ANALYSIS
# ==========================================

# Q21: Can the number of casualties be predicted based on vehicles involved?
# Simple linear regression of casualties on the number of vehicles.
model1 <- lm(formula = casualties ~ vehicles_involved, data = df)
print(summary(object = model1))
## 
## Call:
## lm(formula = casualties ~ vehicles_involved, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8903 -0.8903  0.2689  0.8485  2.1097 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.007719   0.020548  -0.376    0.707    
## vehicles_involved  0.579602   0.006208  93.364   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.243 on 19998 degrees of freedom
## Multiple R-squared:  0.3036, Adjusted R-squared:  0.3035 
## F-statistic:  8717 on 1 and 19998 DF,  p-value: < 2.2e-16

INTERPRETATION: The result shows a positive relationship: each additional vehicle involved is associated with about 0.58 more casualties on average. The p-value is very small, which means the result is statistically significant. The R-squared value is around 0.30, which means about 30% of the variation in casualties is explained by vehicles involved.

# Q22: How do multiple factors such as vehicles, temperature, and lanes affect casualties?
# Multiple linear regression with three numeric predictors.
model2 <- lm(
  formula = casualties ~ vehicles_involved + temperature + lanes,
  data = df
)
print(summary(object = model2))
## 
## Call:
## lm(formula = casualties ~ vehicles_involved + temperature + lanes, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8988 -0.8819  0.2611  0.8521  2.1182 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.081e-02  4.261e-02   0.254    0.800    
## vehicles_involved  5.796e-01  6.208e-03  93.356   <2e-16 ***
## temperature       -6.740e-04  1.179e-03  -0.572    0.568    
## lanes              3.085e-05  5.153e-03   0.006    0.995    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.243 on 19996 degrees of freedom
## Multiple R-squared:  0.3036, Adjusted R-squared:  0.3035 
## F-statistic:  2905 on 3 and 19996 DF,  p-value: < 2.2e-16

INTERPRETATION: “In this model, I used multiple factors like vehicles involved, temperature, and number of lanes to predict casualties. The result shows that only the number of vehicles has a strong effect on casualties, while temperature and lanes do not have a significant impact. The model is statistically significant overall, and it explains about 30% of the variation in casualties.”

# ANOVA
# Q24: Is there a significant difference in casualties across different accident severity levels?
# One-way ANOVA with accident_severity as the grouping factor.
anova1 <- aov(formula = casualties ~ accident_severity, data = df)
print(summary(object = anova1))
##                      Df Sum Sq Mean Sq F value Pr(>F)    
## accident_severity     2   5711  2855.4    1478 <2e-16 ***
## Residuals         19997  38636     1.9                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

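INTERPRETATION: I used ANOVA to check whether casualties differ across different accident severity levels. The p-value is very small (less than 0.05), which means there is a significant difference in mean casualties between severity levels. Accident severity accounts for about 13% of the total variation in casualties (5711 out of 44347 in sums of squares), so the effect is statistically significant but moderate in size.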

# RISK SCORE MODEL
# Q26: Can the accident risk score be predicted using casualties and other variables?
# Linear regression of risk_score on three numeric predictors.
model4 <- lm(
  formula = risk_score ~ casualties + vehicles_involved + temperature,
  data = df
)
print(summary(object = model4))
## 
## Call:
## lm(formula = risk_score ~ casualties + vehicles_involved + temperature, 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.39133 -0.16918  0.00597  0.16969  0.55231 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.4385368  0.0066938  65.514   <2e-16 ***
## casualties         0.0251551  0.0012285  20.476   <2e-16 ***
## vehicles_involved -0.0144820  0.0012923 -11.206   <2e-16 ***
## temperature       -0.0000381  0.0002048  -0.186    0.852    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2159 on 19996 degrees of freedom
## Multiple R-squared:  0.02054,    Adjusted R-squared:  0.02039 
## F-statistic: 139.8 on 3 and 19996 DF,  p-value: < 2.2e-16

INTERPRETATION: “I built a regression model to predict the accident risk score using casualties, number of vehicles, and temperature. The model is statistically significant, but the R-squared value is very low (around 0.02), which means it explains only a small part of the data. Among the variables, casualties and vehicles have some effect, but temperature does not have a significant impact.”

# ==========================================
# TRAIN-TEST SPLIT
# ==========================================

# Hold out 30% of the rows so the model can be evaluated on data it was not
# fitted to. The fixed seed makes the random split reproducible.
set.seed(123)
sample_size <- floor(nrow(df) * 0.7)
train_index <- sample(x = seq_len(nrow(df)), size = sample_size)

train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Fit the linear model on the training rows only.
model_train <- lm(
  formula = casualties ~ vehicles_involved + temperature + lanes,
  data = train_data
)
print(summary(object = model_train))
## 
## Call:
## lm(formula = casualties ~ vehicles_involved + temperature + lanes, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8986 -0.8820  0.2608  0.8527  2.1195 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.843e-02  5.110e-02  -0.556    0.578    
## vehicles_involved  5.797e-01  7.445e-03  77.860   <2e-16 ***
## temperature        7.197e-04  1.410e-03   0.511    0.610    
## lanes             -1.715e-05  6.145e-03  -0.003    0.998    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.241 on 13996 degrees of freedom
## Multiple R-squared:  0.3023, Adjusted R-squared:  0.3021 
## F-statistic:  2021 on 3 and 13996 DF,  p-value: < 2.2e-16
# Use the model fitted on the training data to predict casualties for the
# held-out test rows.
predictions <- predict(model_train, newdata=test_data)
head(predictions)
##         3         8        12        19        23        24 
## 0.5662817 1.7284612 1.1545342 0.5662646 1.7292152 1.7363951
# RMSE: square root of the mean squared difference between actual and
# predicted casualties on the test rows (same units as casualties).
rmse <- sqrt(mean((test_data$casualties - predictions)^2))
print(rmse)
## [1] 1.246435
# Compare predicted with actual values. Points on the red 45-degree line
# (intercept 0, slope 1) are perfect predictions.
plot(
  test_data$casualties, predictions,
  xlab = "Actual casualties", ylab = "Predicted casualties"
)
abline(0,1,col="red")

# CONCLUSION
# ==========================================
# The message reports only what the models in this script tested: casualties
# regressed on vehicles_involved, temperature and lanes, plus the ANOVA of
# casualties by accident_severity. Traffic density and peak hours were plotted
# but never tested, so they are not claimed as significant here.
cat("Conclusion: The number of vehicles involved is the only significant predictor of casualties (R-squared about 0.30); temperature and lanes show no significant effect, and casualties differ significantly across accident severity levels.\n")
## Conclusion: The number of vehicles involved is the only significant predictor of casualties (R-squared about 0.30); temperature and lanes show no significant effect, and casualties differ significantly across accident severity levels.