# Road Accident Exploratory Data Analysis

# Load the accident dataset from a CSV file expected in the working directory.
data <- read.csv(file = "India_accidents_datagov.csv")

# Print per-column summary statistics as a first look at the loaded table.
summary(object = data)
##      SL_NO    STATE_UT_CITY      LOCATION_TYPE      PARENT_STATE      
##  Min.   : 1   Length:89          Length:89          Length:89         
##  1st Qu.:23   Class :character   Class :character   Class :character  
##  Median :45   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :45                                                           
##  3rd Qu.:67                                                           
##  Max.   :89                                                           
##     REGION          DOMINANT_ROAD_TYPE PRIMARY_CAUSE_OF_ACCIDENT
##  Length:89          Length:89          Length:89                
##  Class :character   Class :character   Class :character         
##  Mode  :character   Mode  :character   Mode  :character         
##                                                                 
##                                                                 
##                                                                 
##  ROAD_SURFACE_CONDITION PEAK_ACCIDENT_TIME MOST_INVOLVED_VEHICLE
##  Length:89              Length:89          Length:89            
##  Class :character       Class :character   Class :character     
##  Mode  :character       Mode  :character   Mode  :character     
##                                                                 
##                                                                 
##                                                                 
##  ROAD_ACC_CASES  ROAD_ACC_INJURED ROAD_ACC_DIED   TOTAL_TRAFFIC_ACC_CASES
##  Min.   :    4   Min.   :    3    Min.   :    2   Min.   :    4          
##  1st Qu.:  451   1st Qu.:  333    1st Qu.:  150   1st Qu.:  473          
##  Median : 1181   Median :  987    Median :  272   Median : 1232          
##  Mean   : 5766   Mean   : 5397    Mean   : 2102   Mean   : 6075          
##  3rd Qu.: 3452   3rd Qu.: 3372    3rd Qu.:  838   3rd Qu.: 3452          
##  Max.   :64105   Max.   :67703    Max.   :24109   Max.   :66117          
##  TOTAL_TRAFFIC_INJURED TOTAL_TRAFFIC_DIED CASE_FATALITY_RATE_PCT
##  Min.   :    3         Min.   :    2      Min.   :  6.79        
##  1st Qu.:  333         1st Qu.:  165      1st Qu.: 21.37        
##  Median :  987         Median :  281      Median : 38.51        
##  Mean   : 5427         Mean   : 2382      Mean   : 41.88        
##  3rd Qu.: 3446         3rd Qu.:  918      3rd Qu.: 51.43        
##  Max.   :67892         Max.   :28615      Max.   :116.90        
##  INJURY_RATE_PCT  FATALITY_RATIO_PCT
##  Min.   :  0.74   Min.   : 6.41     
##  1st Qu.: 68.55   1st Qu.:19.55     
##  Median : 82.33   Median :29.81     
##  Mean   : 83.69   Mean   :33.24     
##  3rd Qu.: 96.57   3rd Qu.:40.92     
##  Max.   :186.43   Max.   :99.26
# Show the column types together with the first few values of each column.
str(object = data)
## 'data.frame':    89 obs. of  19 variables:
##  $ SL_NO                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ STATE_UT_CITY            : chr  "Andhra Pradesh" "Arunachal Pradesh" "Assam" "Bihar" ...
##  $ LOCATION_TYPE            : chr  "State" "State" "State" "State" ...
##  $ PARENT_STATE             : chr  "Andhra Pradesh" "Arunachal Pradesh" "Assam" "Bihar" ...
##  $ REGION                   : chr  "South" "Northeast" "Northeast" "East" ...
##  $ DOMINANT_ROAD_TYPE       : chr  "Other Roads" "Other Roads" "National Highway" "State Highway" ...
##  $ PRIMARY_CAUSE_OF_ACCIDENT: chr  "Over-Speeding" "Bad Weather" "Use of Mobile Phone" "Over-Speeding" ...
##  $ ROAD_SURFACE_CONDITION   : chr  "Dry" "Dry" "Dry" "Wet" ...
##  $ PEAK_ACCIDENT_TIME       : chr  "Afternoon (12PM-6PM)" "Afternoon (12PM-6PM)" "Morning (6AM-12PM)" "Night (10PM-6AM)" ...
##  $ MOST_INVOLVED_VEHICLE    : chr  "Truck/Lorry" "Two-Wheeler" "Bus" "Two-Wheeler" ...
##  $ ROAD_ACC_CASES           : int  21070 215 7028 10801 13091 3012 15777 10654 2484 5175 ...
##  $ ROAD_ACC_INJURED         : int  21340 177 5679 7068 11459 1071 15139 8353 3891 3747 ...
##  $ ROAD_ACC_DIED            : int  8293 152 3060 8898 5890 274 7634 5228 979 3898 ...
##  $ TOTAL_TRAFFIC_ACC_CASES  : int  22099 215 7739 12297 13524 3066 16549 11876 2484 5544 ...
##  $ TOTAL_TRAFFIC_INJURED    : int  21340 177 5679 7074 11480 1071 15139 8379 3891 3766 ...
##  $ TOTAL_TRAFFIC_DIED       : int  9330 152 3775 10397 6323 328 8417 6424 979 4251 ...
##  $ CASE_FATALITY_RATE_PCT   : num  42.2 70.7 48.8 84.5 46.8 ...
##  $ INJURY_RATE_PCT          : num  96.6 82.3 73.4 57.5 84.9 ...
##  $ FATALITY_RATIO_PCT       : num  30.4 46.2 39.9 59.5 35.5 ...
# Histogram of the injury-rate percentage for rows whose REGION is "South".
# which() discards rows where the comparison is NA, exactly as subset() does.
south_data <- data[which(data$REGION == "South"), ]
hist(
  x = south_data[["INJURY_RATE_PCT"]],
  col = "lightblue",
  xlab = "Injury Rate %",
  main = "Injury Rate in South Region"
)

# Histogram of the injury-rate percentage for rows whose REGION is "North".
# which() discards rows where the comparison is NA, exactly as subset() does.
north_data <- data[which(data$REGION == "North"), ]
hist(
  x = north_data[["INJURY_RATE_PCT"]],
  col = "lightgreen",
  xlab = "Injury Rate %",
  main = "Injury Rate in North Region"
)

# Histogram of road-accident deaths for rows whose dominant road type is
# "National Highway". Rows with an NA road type are dropped by which().
nh_data <- data[which(data$DOMINANT_ROAD_TYPE == "National Highway"), ]
hist(
  x = nh_data[["ROAD_ACC_DIED"]],
  col = "red",
  xlab = "Number of Deaths",
  main = "Deaths on National Highways"
)

# Histogram of road-accident case counts for rows whose most involved vehicle
# is "Two-Wheeler". Rows with an NA vehicle type are dropped by which().
bike_data <- data[which(data$MOST_INVOLVED_VEHICLE == "Two-Wheeler"), ]
hist(
  x = bike_data[["ROAD_ACC_CASES"]],
  col = "cyan",
  xlab = "Accident Cases",
  main = "Two-Wheeler Accident Cases"
)

# Bar chart of the number of rows recorded under each LOCATION_TYPE value.
barplot(
  height = table(data[["LOCATION_TYPE"]]),
  col = "lightgreen",
  ylab = "Count",
  xlab = "Location Type",
  main = "Accidents by Location Type"
)

# Bar chart of the number of rows recorded under each DOMINANT_ROAD_TYPE value.
#
# las = 2 draws the category labels perpendicular to the axis, so long names
# run down into the bottom margin and are clipped by the default margin size.
# The bottom margin is therefore enlarged for this plot only and the previous
# graphics settings are restored afterwards. local() keeps the helper
# variables out of the global environment.
local({
  road_type_counts <- table(data$DOMINANT_ROAD_TYPE)

  # Allow roughly half a margin line per character of the longest label,
  # plus two lines for the axis and tick marks. Never shrink the margin.
  label_lines <- 0.5 * max(nchar(names(road_type_counts)), 0)
  plot_mar <- par("mar")
  plot_mar[1] <- max(plot_mar[1], label_lines + 2)

  old_par <- par(mar = plot_mar)
  barplot(road_type_counts,
          main = "Accidents by Road Type",
          ylab = "Count",
          col = "orange",
          las = 2)
  par(old_par)  # restore the margins for subsequent plots
})

# Bar chart of the number of rows recorded under each MOST_INVOLVED_VEHICLE
# value.
#
# las = 2 draws the category labels perpendicular to the axis, so they run
# down into the bottom margin: with default settings long names are clipped
# and the x-axis title is printed on top of them. The bottom margin is
# enlarged for this plot only, the axis title is placed below the labels, and
# the previous graphics settings are restored afterwards. local() keeps the
# helper variables out of the global environment.
local({
  vehicle_counts <- table(data$MOST_INVOLVED_VEHICLE)

  # Allow roughly half a margin line per character of the longest label.
  label_lines <- 0.5 * max(nchar(names(vehicle_counts)), 0)
  # Put the axis title just beyond the labels, never above its default line 3.
  xlab_line <- max(3, label_lines + 2)

  plot_mar <- par("mar")
  plot_mar[1] <- max(plot_mar[1], xlab_line + 1.5)

  old_par <- par(mar = plot_mar)
  barplot(vehicle_counts,
          main = "Most Involved Vehicle in Accidents",
          ylab = "Count",
          col = "purple",
          las = 2)
  title(xlab = "Vehicle Type", line = xlab_line)
  par(old_par)  # restore the margins for subsequent plots
})

# Bar chart of the number of rows recorded under each PEAK_ACCIDENT_TIME value.
#
# las = 2 draws the category labels perpendicular to the axis, so they run
# down into the bottom margin: with default settings long labels such as
# "Afternoon (12PM-6PM)" are clipped and the x-axis title is printed on top of
# them. The bottom margin is enlarged for this plot only, the axis title is
# placed below the labels, and the previous graphics settings are restored
# afterwards. local() keeps the helper variables out of the global
# environment.
local({
  time_counts <- table(data$PEAK_ACCIDENT_TIME)

  # Allow roughly half a margin line per character of the longest label.
  label_lines <- 0.5 * max(nchar(names(time_counts)), 0)
  # Put the axis title just beyond the labels, never above its default line 3.
  xlab_line <- max(3, label_lines + 2)

  plot_mar <- par("mar")
  plot_mar[1] <- max(plot_mar[1], xlab_line + 1.5)

  old_par <- par(mar = plot_mar)
  barplot(time_counts,
          main = "Accidents by Time of Day",
          ylab = "Count",
          col = "yellow",
          las = 2)
  title(xlab = "Time", line = xlab_line)
  par(old_par)  # restore the margins for subsequent plots
})

# Scatter plot of road-accident deaths against road-accident cases, one point
# per row of the dataset.
plot(
  x = data[["ROAD_ACC_CASES"]],
  y = data[["ROAD_ACC_DIED"]],
  pch = 19,
  col = "blue",
  xlab = "Number of Accident Cases",
  ylab = "Number of Deaths",
  main = "Accident Cases vs Deaths"
)

# Scatter plot of road-accident deaths against road-accident injuries, one
# point per row of the dataset.
plot(
  x = data[["ROAD_ACC_INJURED"]],
  y = data[["ROAD_ACC_DIED"]],
  pch = 19,
  col = "red",
  xlab = "Number of Injured",
  ylab = "Number of Deaths",
  main = "Injuries vs Deaths"
)

# Scatter plot of the fatality-ratio percentage against the injury-rate
# percentage, one point per row of the dataset.
plot(
  x = data[["INJURY_RATE_PCT"]],
  y = data[["FATALITY_RATIO_PCT"]],
  pch = 19,
  col = "green",
  xlab = "Injury Rate %",
  ylab = "Fatality Ratio %",
  main = "Injury Rate vs Fatality Ratio"
)

# Draw the cases-versus-deaths scatter plot again so that a fitted line can be
# overlaid on it.
plot(
  x = data[["ROAD_ACC_CASES"]],
  y = data[["ROAD_ACC_DIED"]],
  pch = 19,
  col = "blue",
  xlab = "Accident Cases",
  ylab = "Deaths",
  main = "Accident Cases vs Deaths"
)

# Overlay the least-squares line from regressing deaths on accident cases.
abline(
  reg = lm(formula = ROAD_ACC_DIED ~ ROAD_ACC_CASES, data = data),
  lwd = 2,
  col = "red"
)

# Box plots of road-accident deaths, one box per REGION value.
boxplot(
  ROAD_ACC_DIED ~ REGION,
  data = data,
  col = "orange",
  ylab = "Number of Deaths",
  xlab = "Region",
  main = "Deaths by Region"
)

# Mosaic plot of the cross-tabulation of dominant road type (first dimension)
# against region (second dimension).
mosaicplot(
  table(data[["DOMINANT_ROAD_TYPE"]], data[["REGION"]]),
  col = "orange",
  ylab = "Region",
  xlab = "Road Type",
  main = "Road Type vs Region"
)

# Mosaic plot of the cross-tabulation of most involved vehicle (first
# dimension) against peak accident time (second dimension); las = 2 sets the
# orientation of the axis labels.
mosaicplot(
  table(data[["MOST_INVOLVED_VEHICLE"]], data[["PEAK_ACCIDENT_TIME"]]),
  las = 2,
  col = "green",
  ylab = "Accident Time",
  xlab = "Vehicle Type",
  main = "Vehicle Type vs Accident Time"
)