library(tidyverse)
library(VIM)
library(DT)
library(lubridate)
library(ggthemes)
library(viridis)

1 Data

1.1 Load data

data1 <- read.csv("C:/r data warehouse/vehicle collison/database.csv", stringsAsFactors = F, na.strings = "")

1.2 Check Dimension

dim(data1)
## [1] 65499    29

Data1 set has 65,499 rows and 29 variables

1.3 List of variable names

names(data1)
##  [1] "UNIQUE.KEY"          "DATE"                "TIME"               
##  [4] "BOROUGH"             "ZIP.CODE"            "LATITUDE"           
##  [7] "LONGITUDE"           "LOCATION"            "ON.STREET.NAME"     
## [10] "CROSS.STREET.NAME"   "OFF.STREET.NAME"     "PERSONS.INJURED"    
## [13] "PERSONS.KILLED"      "PEDESTRIANS.INJURED" "PEDESTRIANS.KILLED" 
## [16] "CYCLISTS.INJURED"    "CYCLISTS.KILLED"     "MOTORISTS.INJURED"  
## [19] "MOTORISTS.KILLED"    "VEHICLE.1.TYPE"      "VEHICLE.2.TYPE"     
## [22] "VEHICLE.3.TYPE"      "VEHICLE.4.TYPE"      "VEHICLE.5.TYPE"     
## [25] "VEHICLE.1.FACTOR"    "VEHICLE.2.FACTOR"    "VEHICLE.3.FACTOR"   
## [28] "VEHICLE.4.FACTOR"    "VEHICLE.5.FACTOR"

1.4 Counting missing values

data1 %>%  map_int(~sum(is.na(.)))
##          UNIQUE.KEY                DATE                TIME 
##                   0                   0                   0 
##             BOROUGH            ZIP.CODE            LATITUDE 
##               16632               16639               10954 
##           LONGITUDE            LOCATION      ON.STREET.NAME 
##               10954               10954                9524 
##   CROSS.STREET.NAME     OFF.STREET.NAME     PERSONS.INJURED 
##                9524               63333                   0 
##      PERSONS.KILLED PEDESTRIANS.INJURED  PEDESTRIANS.KILLED 
##                   0                   0                   0 
##    CYCLISTS.INJURED     CYCLISTS.KILLED   MOTORISTS.INJURED 
##                   0                   0                   0 
##    MOTORISTS.KILLED      VEHICLE.1.TYPE      VEHICLE.2.TYPE 
##                   0                  16                7069 
##      VEHICLE.3.TYPE      VEHICLE.4.TYPE      VEHICLE.5.TYPE 
##               61282               64588               65268 
##    VEHICLE.1.FACTOR    VEHICLE.2.FACTOR    VEHICLE.3.FACTOR 
##                 214                8090               61242 
##    VEHICLE.4.FACTOR    VEHICLE.5.FACTOR 
##               64583               65267

There are lots of missing values and we don’t know why. It could be deliberatley deleted or it could be Missing Completely at Random (MCAR) - we just don’t know!

1.5 Quick visualization of missing values

aggr(data1, sortVars = T,cex.axis = .5, cex.numbers = .5, combined = T, sortombs = T)

Above tables and chart shows that important variables such as persons injured or killed don’t have missing data but varibles related to it have missing data.

1.6 Preview of data

datatable(head(data1, 10), options = list(scrollX = T, scrollY = 400))

1.7 Summary of variables

summary(data1)
##    UNIQUE.KEY          DATE               TIME          
##  Min.   :3145850   Length:65499       Length:65499      
##  1st Qu.:3162574   Class :character   Class :character  
##  Median :3178969   Mode  :character   Mode  :character  
##  Mean   :3179157                                        
##  3rd Qu.:3195356                                        
##  Max.   :3403285                                        
##                                                         
##    BOROUGH             ZIP.CODE        LATITUDE       LONGITUDE     
##  Length:65499       Min.   :10001   Min.   :40.50   Min.   :-74.25  
##  Class :character   1st Qu.:10128   1st Qu.:40.67   1st Qu.:-73.98  
##  Mode  :character   Median :11206   Median :40.72   Median :-73.93  
##                     Mean   :10821   Mean   :40.72   Mean   :-73.92  
##                     3rd Qu.:11236   3rd Qu.:40.77   3rd Qu.:-73.87  
##                     Max.   :11697   Max.   :40.91   Max.   :-73.70  
##                     NA's   :16639   NA's   :10954   NA's   :10954   
##    LOCATION         ON.STREET.NAME     CROSS.STREET.NAME 
##  Length:65499       Length:65499       Length:65499      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  OFF.STREET.NAME    PERSONS.INJURED   PERSONS.KILLED     
##  Length:65499       Min.   : 0.0000   Min.   :0.0000000  
##  Class :character   1st Qu.: 0.0000   1st Qu.:0.0000000  
##  Mode  :character   Median : 0.0000   Median :0.0000000  
##                     Mean   : 0.2116   Mean   :0.0009924  
##                     3rd Qu.: 0.0000   3rd Qu.:0.0000000  
##                     Max.   :32.0000   Max.   :2.0000000  
##                                                          
##  PEDESTRIANS.INJURED PEDESTRIANS.KILLED  CYCLISTS.INJURED 
##  Min.   :0.00000     Min.   :0.0000000   Min.   :0.00000  
##  1st Qu.:0.00000     1st Qu.:0.0000000   1st Qu.:0.00000  
##  Median :0.00000     Median :0.0000000   Median :0.00000  
##  Mean   :0.04738     Mean   :0.0005344   Mean   :0.00968  
##  3rd Qu.:0.00000     3rd Qu.:0.0000000   3rd Qu.:0.00000  
##  Max.   :4.00000     Max.   :2.0000000   Max.   :2.00000  
##                                                           
##  CYCLISTS.KILLED    MOTORISTS.INJURED MOTORISTS.KILLED   
##  Min.   :0.00e+00   Min.   : 0.0000   Min.   :0.0000000  
##  1st Qu.:0.00e+00   1st Qu.: 0.0000   1st Qu.:0.0000000  
##  Median :0.00e+00   Median : 0.0000   Median :0.0000000  
##  Mean   :4.58e-05   Mean   : 0.1545   Mean   :0.0004122  
##  3rd Qu.:0.00e+00   3rd Qu.: 0.0000   3rd Qu.:0.0000000  
##  Max.   :1.00e+00   Max.   :30.0000   Max.   :2.0000000  
##                                                          
##  VEHICLE.1.TYPE     VEHICLE.2.TYPE     VEHICLE.3.TYPE    
##  Length:65499       Length:65499       Length:65499      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  VEHICLE.4.TYPE     VEHICLE.5.TYPE     VEHICLE.1.FACTOR  
##  Length:65499       Length:65499       Length:65499      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  VEHICLE.2.FACTOR   VEHICLE.3.FACTOR   VEHICLE.4.FACTOR  
##  Length:65499       Length:65499       Length:65499      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  VEHICLE.5.FACTOR  
##  Length:65499      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

1.8 Change all variables to lower case

var.names <- tolower(names(data1))
colnames(data1)<- var.names

2 Data Exploration

Factors that caused vehicle collisons

unique(data1$vehicle.1.factor)
##  [1] "Traffic Control Disregarded"                          
##  [2] "Animals Action"                                       
##  [3] "Fatigued/Drowsy"                                      
##  [4] "Lost Consciousness"                                   
##  [5] "Unspecified"                                          
##  [6] "Alcohol Involvement"                                  
##  [7] "Driver Inattention/Distraction"                       
##  [8] "Failure to Yield Right-of-Way"                        
##  [9] "Oversized Vehicle"                                    
## [10] NA                                                     
## [11] "Other Vehicular"                                      
## [12] "Other Electronic Device"                              
## [13] "Prescription Medication"                              
## [14] "Physical Disability"                                  
## [15] "Turning Improperly"                                   
## [16] "Outside Car Distraction"                              
## [17] "Obstruction/Debris"                                   
## [18] "Illness"                                              
## [19] "Driver Inexperience"                                  
## [20] "Backing Unsafely"                                     
## [21] "Cell Phone (hand-held)"                               
## [22] "Pavement Slippery"                                    
## [23] "View Obstructed/Limited"                              
## [24] "Aggressive Driving/Road Rage"                         
## [25] "Reaction to Other Uninvolved Vehicle"                 
## [26] "Glare"                                                
## [27] "Unsafe Speed"                                         
## [28] "Failure to Keep Right"                                
## [29] "Passenger Distraction"                                
## [30] "Cell Phone (hands-free)"                              
## [31] "Accelerator Defective"                                
## [32] "Fell Asleep"                                          
## [33] "Unsafe Lane Changing"                                 
## [34] "Tire Failure/Inadequate"                              
## [35] "Following Too Closely"                                
## [36] "Brakes Defective"                                     
## [37] "Lane Marking Improper/Inadequate"                     
## [38] "Steering Failure"                                     
## [39] "Pavement Defective"                                   
## [40] "Traffic Control Device Improper/Non-Working"          
## [41] "Drugs (Illegal)"                                      
## [42] "Shoulders Defective/Improper"                         
## [43] "Other Lighting Defects"                               
## [44] "Tow Hitch Defective"                                  
## [45] "Windshield Inadequate"                                
## [46] "Passing or Lane Usage Improper"                       
## [47] "Pedestrian/Bicyclist/Other Pedestrian Error/Confusion"
## [48] "Headlights Defective"

2.1 Change character date variable to date format

data1$date_edited <- mdy(data1$date)

data1$date_time <- paste(data1$date, data1$time)

data1$date_time <- mdy_hm(data1$date_time)

data1$day <- wday(data1$date_time, label = T)

data1$month <- month(data1$date_edited, label = T)

data1$hour <- hour(data1$date_time)

2.2 Total Death due to vechile collsion by Borough

str(data1)
## 'data.frame':    65499 obs. of  34 variables:
##  $ unique.key         : int  3146911 3146180 3146384 3146013 3146120 3146102 3146344 3145900 3145960 3145881 ...
##  $ date               : chr  "1/1/15" "1/1/15" "1/1/15" "1/1/15" ...
##  $ time               : chr  "0:20" "0:20" "0:21" "0:30" ...
##  $ borough            : chr  "QUEENS" NA "BROOKLYN" "BROOKLYN" ...
##  $ zip.code           : int  11358 NA 11205 11213 NA 11105 11203 10024 11223 10036 ...
##  $ latitude           : num  40.8 40.8 40.7 40.7 NA ...
##  $ longitude          : num  -73.8 -73.9 -74 -73.9 NA ...
##  $ location           : chr  "(40.7518471, -73.787862)" "(40.7712888, -73.9466928)" "(40.6894449, -73.9551212)" "(40.6738445, -73.9250801)" ...
##  $ on.street.name     : chr  "47 AVENUE" NA "BEDFORD AVENUE" "BUFFALO AVENUE" ...
##  $ cross.street.name  : chr  "193 STREET" NA "LAFAYETTE AVENUE" "SAINT MARKS AVENUE" ...
##  $ off.street.name    : chr  NA NA NA NA ...
##  $ persons.injured    : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ persons.killed     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pedestrians.injured: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ pedestrians.killed : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cyclists.injured   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cyclists.killed    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ motorists.injured  : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ motorists.killed   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ vehicle.1.type     : chr  "SPORT UTILITY / STATION WAGON" "PASSENGER VEHICLE" "PASSENGER VEHICLE" "BUS" ...
##  $ vehicle.2.type     : chr  NA NA "UNKNOWN" "PASSENGER VEHICLE" ...
##  $ vehicle.3.type     : chr  NA NA NA NA ...
##  $ vehicle.4.type     : chr  NA NA NA NA ...
##  $ vehicle.5.type     : chr  NA NA NA NA ...
##  $ vehicle.1.factor   : chr  "Traffic Control Disregarded" "Animals Action" "Fatigued/Drowsy" "Lost Consciousness" ...
##  $ vehicle.2.factor   : chr  NA NA "Unspecified" NA ...
##  $ vehicle.3.factor   : chr  NA NA NA NA ...
##  $ vehicle.4.factor   : chr  NA NA NA NA ...
##  $ vehicle.5.factor   : chr  NA NA NA NA ...
##  $ date_edited        : Date, format: "2015-01-01" "2015-01-01" ...
##  $ date_time          : POSIXct, format: "2015-01-01 00:20:00" "2015-01-01 00:20:00" ...
##  $ day                : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ month              : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ hour               : int  0 0 0 0 0 10 10 10 10 10 ...
data1$death <-rowSums(data1[, c(13,15,17,19)])

data1 %>% 
        filter(borough != "") %>% 
        group_by(borough) %>% 
        summarise(ToT_death = sum(death)) %>% 
        arrange(ToT_death) %>% 
        ggplot(aes(x = borough, y = ToT_death)) + geom_bar(stat = "identity")

2.3 Total Injuries due to vechile collsion by Borough

data1$injuries <- rowSums(data1[, c(12, 14, 16, 18)]) 

data1 %>% 
        filter(borough!="") %>% 
        group_by(borough) %>% 
        summarise(Tot_injuries = sum(injuries)) %>% 
        ggplot(aes(x = borough, y = Tot_injuries)) + geom_bar(stat="identity")

According to Wikiperdia population, Manhattan has the highest density per sq. Km followed by The Bronx. However, the no of injuries is in Brooklyn and followed by Queens.

2.4 Injuries vs time

temp <-data1 %>% 
        select(persons.killed, persons.injured, vehicle.1.factor, vehicle.2.factor, vehicle.3.factor, vehicle.4.factor, vehicle.5.factor) %>% 
        gather(type,value, 1:2) %>% 
        gather(vehicle_type,cause,1:5) %>% 
        filter(value != 0, cause != "", cause != "Unspecified")

temp_1 <- temp %>% 
        select(-vehicle_type) %>% 
        group_by(type, cause) %>% 
        summarise(total = sum(value, na.rm=T))

ggplot(data = temp_1, aes(x = cause, y = log(total), fill = type))+
               geom_bar(data = subset(temp_1, type == "persons.injured"), stat = "identity")+
               geom_bar(data = subset(temp_1, type=="persons.killed"), stat = "identity", position = "identity", mapping = aes(y = -log(total)))+
               scale_y_continuous(labels = abs)+
               coord_flip()+
               ggtitle("Cause of Accidents")

data1 %>% 
        filter(borough !="") %>% 
        group_by(date_edited, borough) %>% 
        summarise(n=mean(n())) %>% 
        na.omit() %>% 
        ggplot(aes(x = date_edited, y = n, color = borough, group = borough ))+
        geom_line()+
        theme_hc(bgcolor = "darkunica")+
        geom_point(size = 2, shape = 1)+
        scale_fill_hc("darkunica") +
        ggtitle("Borough Accidents(Mean) by Time")+
        geom_text(aes(label=ifelse(n>150,n,"")), size = 3, hjust = 2)

day_wise<- data1 %>% 
        select(borough, date_edited, motorists.injured, cyclists.injured, pedestrians.injured, day, hour) %>% 
        gather(type, value, 3:5) %>% 
        group_by(borough, day, type, hour) %>% 
        summarise(n=sum(value, na.rm=T)) %>% 
        filter(borough != "")

ggplot(day_wise, aes(x = day, y = hour, fill=(n)))+
        geom_tile(color = "black", size = 0.01)+
        scale_fill_viridis(name = "Mean Injury by Type", option = "C")+
        facet_grid(borough~type)+
        scale_y_continuous(trans = "reverse", breaks = c(0,4,8,12,16,20))+
        theme_minimal(base_size = 8)+
        theme(strip.background = element_rect(colour="gray"))