# Load Libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
# Load Dataset
# Location of the accident CSV. Defaults to the original hard-coded path; set
# the INDIAN_ROADS_CSV environment variable to run the script elsewhere.
data_path <- Sys.getenv(
  "INDIAN_ROADS_CSV",
  unset = "C:/Users/talwi/Downloads/indian_roads_dataset (1).csv"
)
# Fail early with a clear message rather than read.csv's connection error.
if (!file.exists(data_path)) {
  stop("Dataset not found at: ", data_path, call. = FALSE)
}
df <- read.csv(data_path)
# Preview the first six rows to confirm the file parsed into columns
head(df, n = 6L)
## accident_id city state latitude longitude date time hour
## 1 0 Pune Maharashtra 18.68083 73.93039 2023-10-22 5:00 5
## 2 1 Mumbai Maharashtra 18.81773 72.79085 2023-05-21 4:00 4
## 3 2 Mumbai Maharashtra 19.09689 72.81942 2024-07-10 13:00 13
## 4 3 Chandigarh Punjab 30.78780 76.84751 2025-03-30 11:00 11
## 5 4 Chennai Tamil Nadu 12.96515 80.28331 2024-01-25 16:00 16
## 6 5 Delhi Delhi 28.79949 77.04967 2024-07-29 8:00 8
## day_of_week is_weekend road_type lanes traffic_signal weather visibility
## 1 Sunday 1 highway 3 1 fog low
## 2 Sunday 1 urban 4 0 clear high
## 3 Wednesday 0 urban 3 0 fog low
## 4 Sunday 1 urban 1 1 fog low
## 5 Thursday 0 highway 3 1 clear high
## 6 Monday 0 urban 5 1 clear high
## temperature traffic_density cause accident_severity vehicles_involved
## 1 32 high weather fatal 2
## 2 34 low weather major 4
## 3 21 medium weather minor 1
## 4 30 high distraction minor 5
## 5 24 low distraction minor 2
## 6 36 high overspeeding fatal 4
## casualties is_peak_hour festival risk_score
## 1 2 0 None 0.85
## 2 3 0 None 0.10
## 3 1 0 None 0.45
## 4 2 0 None 0.65
## 5 1 0 None 0.10
## 6 4 1 None 0.65
# Show each column's type and first values before any conversion
str(object = df)
## 'data.frame': 20000 obs. of 24 variables:
## $ accident_id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ city : chr "Pune" "Mumbai" "Mumbai" "Chandigarh" ...
## $ state : chr "Maharashtra" "Maharashtra" "Maharashtra" "Punjab" ...
## $ latitude : num 18.7 18.8 19.1 30.8 13 ...
## $ longitude : num 73.9 72.8 72.8 76.8 80.3 ...
## $ date : chr "2023-10-22" "2023-05-21" "2024-07-10" "2025-03-30" ...
## $ time : chr "5:00" "4:00" "13:00" "11:00" ...
## $ hour : int 5 4 13 11 16 8 6 14 3 4 ...
## $ day_of_week : chr "Sunday" "Sunday" "Wednesday" "Sunday" ...
## $ is_weekend : int 1 1 0 1 0 0 0 1 0 1 ...
## $ road_type : chr "highway" "urban" "urban" "urban" ...
## $ lanes : int 3 4 3 1 3 5 3 3 1 3 ...
## $ traffic_signal : int 1 0 0 1 1 1 0 1 0 1 ...
## $ weather : chr "fog" "clear" "fog" "fog" ...
## $ visibility : chr "low" "high" "low" "low" ...
## $ temperature : int 32 34 21 30 24 36 33 25 22 23 ...
## $ traffic_density : chr "high" "low" "medium" "high" ...
## $ cause : chr "weather" "weather" "weather" "distraction" ...
## $ accident_severity: chr "fatal" "major" "minor" "minor" ...
## $ vehicles_involved: int 2 4 1 5 2 4 4 3 1 1 ...
## $ casualties : int 2 3 1 2 1 4 4 0 0 0 ...
## $ is_peak_hour : int 0 0 0 0 0 1 0 0 0 0 ...
## $ festival : chr "None" "None" "None" "None" ...
## $ risk_score : num 0.85 0.1 0.45 0.65 0.1 0.65 0.3 0.1 0.45 0.1 ...
# DATA PREPROCESSING
# Convert the categorical text columns to factors in a single pass.
categorical_cols <- c(
  "accident_severity", "traffic_density", "weather", "cause", "road_type"
)
df[categorical_cols] <- lapply(df[categorical_cols], as.factor)
# A missing festival entry is recorded as "None".
festival_missing <- is.na(df$festival)
df$festival[festival_missing] <- "None"
# ==========================================
# BASIC STATISTICAL ANALYSIS
# ==========================================
# Q1: What is the average number of casualties occurring in road accidents?
# Pull the column once; the three summaries below all use it.
casualty_counts <- df$casualties
print(mean(casualty_counts))
## [1] 1.72645
# Q2: What is the median value of casualties in the dataset?
print(median(casualty_counts))
## [1] 1
# Q3: How much variation exists in the number of casualties across accidents?
print(sd(casualty_counts))
## [1] 1.489104
# Q4: How are casualties distributed across different accidents?
# casualties is an integer count, so use one bin per value, centred on the
# integers; an arbitrary 30-bin split leaves empty gaps between the counts.
ggplot(df, aes(x = casualties)) +
  geom_histogram(binwidth = 1, center = 0)
INTERPRETATIONS: “This graph shows how casualties are spread across
accidents. Most accidents have a lower number of casualties, and only a
few accidents have very high casualties.”
# Q5: What is the frequency distribution of different accident severity levels?
# Tabulate the severity factor, then show the same counts as a bar chart.
severity_counts <- table(df$accident_severity)
print(severity_counts)
##
## fatal major minor
## 2987 5988 11025
ggplot(df, aes(x = accident_severity)) +
  geom_bar(fill = "blue")
INTERPRETATIONS: The bar chart illustrates the frequency distribution of
accident severity levels, showing a clear trend where minor accidents
are the most frequent, exceeding 11,000 cases. In contrast, fatal
accidents occur least often (approximately 3,000 cases), while major
accidents fall in between (approximately 6,000 cases). Overall, the data
indicates an inverse relationship between the severity of an accident and
its frequency of occurrence.
# Q6: How are road accidents distributed under different weather conditions?
# One bar per weather level; bar height is the number of accidents.
ggplot(df, aes(x = weather)) +
  geom_bar(fill = "orange")
INTERPRETATIONS: Based on the bar graph, road accidents are distributed almost equally across clear, foggy, and rainy weather conditions, with each category recording a count of approximately 6,000 cases. The uniform height of the bars suggests that, within this dataset, weather conditions do not significantly impact the total frequency of accidents.
# Q7: How does traffic density influence the occurrence of accidents?
# One bar per traffic-density level; bar height is the number of accidents.
ggplot(df, aes(x = traffic_density)) +
  geom_bar(fill = "purple")
INTERPRETATION: Based on the bar graph, road accidents occur most frequently at both high and low traffic densities, with each category recording approximately 6,000 cases. Interestingly, medium traffic density shows a lower frequency of accidents, totaling around 5,000 cases. This distribution suggests that accidents are more prevalent during extreme traffic conditions—either very high congestion or very sparse flow—compared to moderate traffic levels.
# Q8: Which types of roads experience the highest number of accidents?
# One bar per road type; bar height is the number of accidents.
ggplot(df, aes(x = road_type)) +
  geom_bar(fill = "brown")
INTERPRETATION: The bar chart visualizes the distribution of accidents across three distinct road types: highway, rural, and urban. Based on the data shown, the frequency of accidents is nearly identical for all three categories, with each bar reaching a count of approximately 6,000. This suggests that, within this specific dataset, the road environment does not appear to be a major differentiating factor for the number of accidents, as the occurrences are balanced across all road types.
# ==========================================
# RELATIONSHIP ANALYSIS
# ==========================================
# Q9: Is there a relationship between the number of vehicles involved and casualties?
# Pearson correlation between the two count variables.
vehicle_casualty_cor <- cor(df$vehicles_involved, df$casualties)
print(vehicle_casualty_cor)
## [1] 0.5509664
# Both variables are small integers, so the raw points collapse onto a few
# grid positions; geom_count() sizes each position by its number of accidents.
# The fitted line is unchanged: geom_smooth() still uses every row.
ggplot(df, aes(x = vehicles_involved, y = casualties)) +
  geom_count(alpha = 0.6) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
INTERPRETATION: “The correlation value is around 0.55, which indicates a moderate positive relationship between vehicles involved and casualties. This means that as the number of vehicles increases, the number of casualties also tends to increase.”
# Q10: How does temperature affect the number of casualties in accidents?
# temperature and casualties are both integer-valued, so points overlap
# heavily; geom_count() encodes how many accidents share each position.
ggplot(df, aes(x = temperature, y = casualties)) +
  geom_count(alpha = 0.6)
INTERPRETATION: Visually, there is no clear correlation or trend between the two variables; casualties of all levels occur consistently across the entire temperature range (approximately 15 to 40 units). Based on this specific visualization, temperature does not appear to have a significant or predictable impact on the number of casualties.
# Q11: What is the relationship between the number of lanes and casualties?
# lanes and casualties are both integer-valued, so many accidents are stacked
# on each grid position; geom_count() makes the size of each stack visible.
ggplot(df, aes(x = lanes, y = casualties)) +
  geom_count(alpha = 0.6)
INTERPRETATION: Based on this visualization, there is no clear correlation or trend; casualties appear to occur across the entire range of lane counts without a visible increase or decrease as the number of lanes changes. Additionally, the grid layout suggests significant overplotting, meaning many individual data points may be stacked directly on top of each other at those specific coordinates.
# Q12: On which days of the week do most accidents occur?
# Order the weekdays chronologically; the raw character column would be
# sorted alphabetically on the axis (Friday, Monday, Saturday, ...).
weekday_order <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)
ggplot(df, aes(x = factor(day_of_week, levels = weekday_order))) +
  geom_bar(fill = "orange") +
  labs(x = "day_of_week")
# Q13: At what hours of the day are accidents most frequent?
# 24 bins across the hour column, one per hour of the day.
ggplot(df, aes(x = hour)) +
  geom_histogram(bins = 24, fill = "cyan")
INTERPRETATION: The histogram illustrates the frequency of accidents distributed across the 24 hours of a day. The data shows a remarkably uniform distribution, suggesting that accidents occur at a relatively consistent rate regardless of the hour. While there are minor fluctuations throughout the day, there is no single significant peak or period where accident frequency drastically increases or decreases, indicating a steady risk level across the entire 24-hour cycle.
# Q14: Do peak hours result in more accidents compared to non-peak hours?
# is_peak_hour is a 0/1 flag; map it as a factor so the axis shows only the
# two categories instead of a continuous scale with fractional breaks.
ggplot(df, aes(x = factor(is_peak_hour))) +
  geom_bar(fill = "pink") +
  labs(x = "is_peak_hour")
# ==========================================
# LOCATION ANALYSIS
# ==========================================
# Q15: Which cities have the highest number of road accidents?
# Count accidents per city, then keep at most the ten largest counts.
accidents_by_city <- df %>%
  group_by(city) %>%
  summarise(total_accidents = n())
top_cities <- accidents_by_city %>%
  arrange(desc(total_accidents)) %>%
  slice_head(n = 10)
print(top_cities)
## # A tibble: 8 × 2
## city total_accidents
## <chr> <int>
## 1 Chandigarh 2577
## 2 Chennai 2575
## 3 Kolkata 2559
## 4 Pune 2517
## 5 Mumbai 2492
## 6 Bangalore 2438
## 7 Delhi 2433
## 8 Hyderabad 2409
# Horizontal bars ordered by count, largest at the top after the flip.
ggplot(
  top_cities,
  aes(x = reorder(city, total_accidents), y = total_accidents)
) +
  geom_col(fill = "red") +
  coord_flip()
# Q16: Which states have the highest number of road accidents?
# Count accidents per state, then keep at most the ten largest counts.
accidents_by_state <- df %>%
  group_by(state) %>%
  summarise(total_accidents = n())
top_states <- accidents_by_state %>%
  arrange(desc(total_accidents)) %>%
  slice_head(n = 10)
print(top_states)
## # A tibble: 7 × 2
## state total_accidents
## <chr> <int>
## 1 Maharashtra 5009
## 2 Punjab 2577
## 3 Tamil Nadu 2575
## 4 West Bengal 2559
## 5 Karnataka 2438
## 6 Delhi 2433
## 7 Telangana 2409
# Horizontal bars ordered by count, largest at the top after the flip.
ggplot(
  top_states,
  aes(x = reorder(state, total_accidents), y = total_accidents)
) +
  geom_col(fill = "blue") +
  coord_flip()
# Q17: Which cities report the highest number of casualties?
# Sum casualties per city, then keep at most the ten largest totals.
casualties_by_city <- df %>%
  group_by(city) %>%
  summarise(total_casualties = sum(casualties))
city_casualties <- casualties_by_city %>%
  arrange(desc(total_casualties)) %>%
  slice_head(n = 10)
print(city_casualties)
## # A tibble: 8 × 2
## city total_casualties
## <chr> <int>
## 1 Chennai 4567
## 2 Kolkata 4508
## 3 Pune 4419
## 4 Chandigarh 4377
## 5 Mumbai 4297
## 6 Delhi 4175
## 7 Bangalore 4128
## 8 Hyderabad 4058
# Horizontal bars ordered by total casualties, largest at the top.
ggplot(
  city_casualties,
  aes(x = reorder(city, total_casualties), y = total_casualties)
) +
  geom_col(fill = "green") +
  coord_flip()
# Q19: How do weather conditions affect accident severity levels?
# Side-by-side bars: one group per weather level, one bar per severity.
ggplot(df, aes(x = weather, fill = accident_severity)) +
  geom_bar(position = position_dodge())
INTERPRETATION: This bar chart illustrates the relationship between weather conditions (clear, fog, and rain) and the frequency of different accident severity levels (fatal, major, and minor). Across all three weather categories, the distribution remains remarkably consistent: minor accidents are the most frequent, occurring roughly 4,000 times, followed by major accidents at approximately 2,000 instances, and fatal accidents as the least common at around 1,000. Because the height and ratio of the bars are nearly identical for clear, fog, and rain conditions, the graph suggests that, for this specific dataset, weather conditions do not have a significant impact on the severity level of accidents.
# ==========================================
# REGRESSION ANALYSIS
# ==========================================
# Q21: Can the number of casualties be predicted based on vehicles involved?
# Simple linear regression of casualties on vehicles involved.
model1 <- lm(formula = casualties ~ vehicles_involved, data = df)
model1_summary <- summary(model1)
print(model1_summary)
##
## Call:
## lm(formula = casualties ~ vehicles_involved, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8903 -0.8903 0.2689 0.8485 2.1097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.007719 0.020548 -0.376 0.707
## vehicles_involved 0.579602 0.006208 93.364 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.243 on 19998 degrees of freedom
## Multiple R-squared: 0.3036, Adjusted R-squared: 0.3035
## F-statistic: 8717 on 1 and 19998 DF, p-value: < 2.2e-16
INTERPRETATION: “The result shows a positive relationship, meaning as the number of vehicles increases, casualties also increase. The p-value is very small, which means the result is statistically significant. The R-squared value is around 0.30, which means about 30% of the variation in casualties is explained by vehicles involved.”
# Q22: How do multiple factors such as vehicles, temperature, and lanes affect casualties?
# Multiple linear regression with three numeric predictors.
model2 <- lm(
  formula = casualties ~ vehicles_involved + temperature + lanes,
  data = df
)
model2_summary <- summary(model2)
print(model2_summary)
##
## Call:
## lm(formula = casualties ~ vehicles_involved + temperature + lanes,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8988 -0.8819 0.2611 0.8521 2.1182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.081e-02 4.261e-02 0.254 0.800
## vehicles_involved 5.796e-01 6.208e-03 93.356 <2e-16 ***
## temperature -6.740e-04 1.179e-03 -0.572 0.568
## lanes 3.085e-05 5.153e-03 0.006 0.995
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.243 on 19996 degrees of freedom
## Multiple R-squared: 0.3036, Adjusted R-squared: 0.3035
## F-statistic: 2905 on 3 and 19996 DF, p-value: < 2.2e-16
INTERPRETATION: “In this model, I used multiple factors like vehicles involved, temperature, and number of lanes to predict casualties. The result shows that only the number of vehicles has a strong effect on casualties, while temperature and lanes do not have a significant impact. The model is statistically significant overall, and it explains about 30% of the variation in casualties.”
# ANOVA
# Q24: Is there a significant difference in casualties across different accident severity levels?
# One-way ANOVA with accident_severity as the grouping factor.
anova1 <- aov(formula = casualties ~ accident_severity, data = df)
anova1_summary <- summary(anova1)
print(anova1_summary)
## Df Sum Sq Mean Sq F value Pr(>F)
## accident_severity 2 5711 2855.4 1478 <2e-16 ***
## Residuals 19997 38636 1.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
INTERPRETATION: “I used ANOVA to check whether casualties differ across different accident severity levels. The p-value is very small (less than 0.05), which means there is a significant difference. So, accident severity has a strong impact on the number of casualties.”
# RISK SCORE MODEL
# Q26: Can the accident risk score be predicted using casualties and other variables?
# Linear regression of risk_score on three numeric predictors.
model4 <- lm(
  formula = risk_score ~ casualties + vehicles_involved + temperature,
  data = df
)
model4_summary <- summary(model4)
print(model4_summary)
##
## Call:
## lm(formula = risk_score ~ casualties + vehicles_involved + temperature,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39133 -0.16918 0.00597 0.16969 0.55231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4385368 0.0066938 65.514 <2e-16 ***
## casualties 0.0251551 0.0012285 20.476 <2e-16 ***
## vehicles_involved -0.0144820 0.0012923 -11.206 <2e-16 ***
## temperature -0.0000381 0.0002048 -0.186 0.852
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2159 on 19996 degrees of freedom
## Multiple R-squared: 0.02054, Adjusted R-squared: 0.02039
## F-statistic: 139.8 on 3 and 19996 DF, p-value: < 2.2e-16
INTERPRETATION: “I built a regression model to predict the accident risk score using casualties, number of vehicles, and temperature. The model is statistically significant, but the R-squared value is very low (around 0.02), which means it explains only a small part of the data. Among the variables, casualties and vehicles have some effect, but temperature does not have a significant impact.”
# ==========================================
# TRAIN-TEST SPLIT
# ==========================================
# Hold out 30% of the rows so the model is scored on data it was not fitted on.
# The seed is fixed so the same rows are drawn on every run.
set.seed(123)
n_rows <- nrow(df)
sample_size <- floor(0.7 * n_rows)
train_index <- sample(seq_len(n_rows), size = sample_size)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
# Fit the three-predictor regression on the training rows only.
model_train <- lm(
  formula = casualties ~ vehicles_involved + temperature + lanes,
  data = train_data
)
model_train_summary <- summary(model_train)
print(model_train_summary)
##
## Call:
## lm(formula = casualties ~ vehicles_involved + temperature + lanes,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8986 -0.8820 0.2608 0.8527 2.1195
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.843e-02 5.110e-02 -0.556 0.578
## vehicles_involved 5.797e-01 7.445e-03 77.860 <2e-16 ***
## temperature 7.197e-04 1.410e-03 0.511 0.610
## lanes -1.715e-05 6.145e-03 -0.003 0.998
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.241 on 13996 degrees of freedom
## Multiple R-squared: 0.3023, Adjusted R-squared: 0.3021
## F-statistic: 2021 on 3 and 13996 DF, p-value: < 2.2e-16
# Use the fitted model to predict casualties for the held-out test rows.
predictions <- predict(object = model_train, newdata = test_data)
head(predictions, n = 6L)
## 3 8 12 19 23 24
## 0.5662817 1.7284612 1.1545342 0.5662646 1.7292152 1.7363951
# RMSE: square root of the mean squared gap between actual and predicted
# casualties on the test rows.
prediction_error <- test_data$casualties - predictions
rmse <- sqrt(mean(prediction_error^2))
print(rmse)
## [1] 1.246435
# Actual casualties (x) against predicted values (y); points on the red
# 45-degree line would be perfect predictions.
plot(x = test_data$casualties, y = predictions)
abline(a = 0, b = 1, col = "red")
# CONCLUSION
# ==========================================
# State only what the regressions and the ANOVA in this script tested. The
# earlier wording claimed effects of traffic density and peak hours on
# accident severity, which no model here examined.
cat("Conclusion: The number of vehicles involved is the only significant predictor of casualties (R-squared about 0.30); temperature and lanes are not significant, and casualties differ significantly across accident severity levels.\n")
## Conclusion: The number of vehicles involved is the only significant predictor of casualties (R-squared about 0.30); temperature and lanes are not significant, and casualties differ significantly across accident severity levels.