library(tidyverse)
library(VIM)
library(DT)
library(lubridate)
library(ggthemes)
library(viridis)data1 <- read.csv("C:/r data warehouse/vehicle collison/database.csv", stringsAsFactors = F, na.strings = "")dim(data1)## [1] 65499 29
Data1 set has 65,499 rows and 29 variables
names(data1)## [1] "UNIQUE.KEY" "DATE" "TIME"
## [4] "BOROUGH" "ZIP.CODE" "LATITUDE"
## [7] "LONGITUDE" "LOCATION" "ON.STREET.NAME"
## [10] "CROSS.STREET.NAME" "OFF.STREET.NAME" "PERSONS.INJURED"
## [13] "PERSONS.KILLED" "PEDESTRIANS.INJURED" "PEDESTRIANS.KILLED"
## [16] "CYCLISTS.INJURED" "CYCLISTS.KILLED" "MOTORISTS.INJURED"
## [19] "MOTORISTS.KILLED" "VEHICLE.1.TYPE" "VEHICLE.2.TYPE"
## [22] "VEHICLE.3.TYPE" "VEHICLE.4.TYPE" "VEHICLE.5.TYPE"
## [25] "VEHICLE.1.FACTOR" "VEHICLE.2.FACTOR" "VEHICLE.3.FACTOR"
## [28] "VEHICLE.4.FACTOR" "VEHICLE.5.FACTOR"
data1 %>% map_int(~sum(is.na(.)))## UNIQUE.KEY DATE TIME
## 0 0 0
## BOROUGH ZIP.CODE LATITUDE
## 16632 16639 10954
## LONGITUDE LOCATION ON.STREET.NAME
## 10954 10954 9524
## CROSS.STREET.NAME OFF.STREET.NAME PERSONS.INJURED
## 9524 63333 0
## PERSONS.KILLED PEDESTRIANS.INJURED PEDESTRIANS.KILLED
## 0 0 0
## CYCLISTS.INJURED CYCLISTS.KILLED MOTORISTS.INJURED
## 0 0 0
## MOTORISTS.KILLED VEHICLE.1.TYPE VEHICLE.2.TYPE
## 0 16 7069
## VEHICLE.3.TYPE VEHICLE.4.TYPE VEHICLE.5.TYPE
## 61282 64588 65268
## VEHICLE.1.FACTOR VEHICLE.2.FACTOR VEHICLE.3.FACTOR
## 214 8090 61242
## VEHICLE.4.FACTOR VEHICLE.5.FACTOR
## 64583 65267
There are lots of missing values and we don’t know why. It could be deliberatley deleted or it could be Missing Completely at Random (MCAR) - we just don’t know!
aggr(data1, sortVars = T,cex.axis = .5, cex.numbers = .5, combined = T, sortombs = T)Above tables and chart shows that important variables such as persons injured or killed don’t have missing data but varibles related to it have missing data.
datatable(head(data1, 10), options = list(scrollX = T, scrollY = 400))summary(data1)## UNIQUE.KEY DATE TIME
## Min. :3145850 Length:65499 Length:65499
## 1st Qu.:3162574 Class :character Class :character
## Median :3178969 Mode :character Mode :character
## Mean :3179157
## 3rd Qu.:3195356
## Max. :3403285
##
## BOROUGH ZIP.CODE LATITUDE LONGITUDE
## Length:65499 Min. :10001 Min. :40.50 Min. :-74.25
## Class :character 1st Qu.:10128 1st Qu.:40.67 1st Qu.:-73.98
## Mode :character Median :11206 Median :40.72 Median :-73.93
## Mean :10821 Mean :40.72 Mean :-73.92
## 3rd Qu.:11236 3rd Qu.:40.77 3rd Qu.:-73.87
## Max. :11697 Max. :40.91 Max. :-73.70
## NA's :16639 NA's :10954 NA's :10954
## LOCATION ON.STREET.NAME CROSS.STREET.NAME
## Length:65499 Length:65499 Length:65499
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## OFF.STREET.NAME PERSONS.INJURED PERSONS.KILLED
## Length:65499 Min. : 0.0000 Min. :0.0000000
## Class :character 1st Qu.: 0.0000 1st Qu.:0.0000000
## Mode :character Median : 0.0000 Median :0.0000000
## Mean : 0.2116 Mean :0.0009924
## 3rd Qu.: 0.0000 3rd Qu.:0.0000000
## Max. :32.0000 Max. :2.0000000
##
## PEDESTRIANS.INJURED PEDESTRIANS.KILLED CYCLISTS.INJURED
## Min. :0.00000 Min. :0.0000000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000000 Median :0.00000
## Mean :0.04738 Mean :0.0005344 Mean :0.00968
## 3rd Qu.:0.00000 3rd Qu.:0.0000000 3rd Qu.:0.00000
## Max. :4.00000 Max. :2.0000000 Max. :2.00000
##
## CYCLISTS.KILLED MOTORISTS.INJURED MOTORISTS.KILLED
## Min. :0.00e+00 Min. : 0.0000 Min. :0.0000000
## 1st Qu.:0.00e+00 1st Qu.: 0.0000 1st Qu.:0.0000000
## Median :0.00e+00 Median : 0.0000 Median :0.0000000
## Mean :4.58e-05 Mean : 0.1545 Mean :0.0004122
## 3rd Qu.:0.00e+00 3rd Qu.: 0.0000 3rd Qu.:0.0000000
## Max. :1.00e+00 Max. :30.0000 Max. :2.0000000
##
## VEHICLE.1.TYPE VEHICLE.2.TYPE VEHICLE.3.TYPE
## Length:65499 Length:65499 Length:65499
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## VEHICLE.4.TYPE VEHICLE.5.TYPE VEHICLE.1.FACTOR
## Length:65499 Length:65499 Length:65499
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## VEHICLE.2.FACTOR VEHICLE.3.FACTOR VEHICLE.4.FACTOR
## Length:65499 Length:65499 Length:65499
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## VEHICLE.5.FACTOR
## Length:65499
## Class :character
## Mode :character
##
##
##
##
var.names <- tolower(names(data1))
colnames(data1)<- var.namesFactors that caused vehicle collisons
unique(data1$vehicle.1.factor)## [1] "Traffic Control Disregarded"
## [2] "Animals Action"
## [3] "Fatigued/Drowsy"
## [4] "Lost Consciousness"
## [5] "Unspecified"
## [6] "Alcohol Involvement"
## [7] "Driver Inattention/Distraction"
## [8] "Failure to Yield Right-of-Way"
## [9] "Oversized Vehicle"
## [10] NA
## [11] "Other Vehicular"
## [12] "Other Electronic Device"
## [13] "Prescription Medication"
## [14] "Physical Disability"
## [15] "Turning Improperly"
## [16] "Outside Car Distraction"
## [17] "Obstruction/Debris"
## [18] "Illness"
## [19] "Driver Inexperience"
## [20] "Backing Unsafely"
## [21] "Cell Phone (hand-held)"
## [22] "Pavement Slippery"
## [23] "View Obstructed/Limited"
## [24] "Aggressive Driving/Road Rage"
## [25] "Reaction to Other Uninvolved Vehicle"
## [26] "Glare"
## [27] "Unsafe Speed"
## [28] "Failure to Keep Right"
## [29] "Passenger Distraction"
## [30] "Cell Phone (hands-free)"
## [31] "Accelerator Defective"
## [32] "Fell Asleep"
## [33] "Unsafe Lane Changing"
## [34] "Tire Failure/Inadequate"
## [35] "Following Too Closely"
## [36] "Brakes Defective"
## [37] "Lane Marking Improper/Inadequate"
## [38] "Steering Failure"
## [39] "Pavement Defective"
## [40] "Traffic Control Device Improper/Non-Working"
## [41] "Drugs (Illegal)"
## [42] "Shoulders Defective/Improper"
## [43] "Other Lighting Defects"
## [44] "Tow Hitch Defective"
## [45] "Windshield Inadequate"
## [46] "Passing or Lane Usage Improper"
## [47] "Pedestrian/Bicyclist/Other Pedestrian Error/Confusion"
## [48] "Headlights Defective"
data1$date_edited <- mdy(data1$date)
data1$date_time <- paste(data1$date, data1$time)
data1$date_time <- mdy_hm(data1$date_time)
data1$day <- wday(data1$date_time, label = T)
data1$month <- month(data1$date_edited, label = T)
data1$hour <- hour(data1$date_time)str(data1)## 'data.frame': 65499 obs. of 34 variables:
## $ unique.key : int 3146911 3146180 3146384 3146013 3146120 3146102 3146344 3145900 3145960 3145881 ...
## $ date : chr "1/1/15" "1/1/15" "1/1/15" "1/1/15" ...
## $ time : chr "0:20" "0:20" "0:21" "0:30" ...
## $ borough : chr "QUEENS" NA "BROOKLYN" "BROOKLYN" ...
## $ zip.code : int 11358 NA 11205 11213 NA 11105 11203 10024 11223 10036 ...
## $ latitude : num 40.8 40.8 40.7 40.7 NA ...
## $ longitude : num -73.8 -73.9 -74 -73.9 NA ...
## $ location : chr "(40.7518471, -73.787862)" "(40.7712888, -73.9466928)" "(40.6894449, -73.9551212)" "(40.6738445, -73.9250801)" ...
## $ on.street.name : chr "47 AVENUE" NA "BEDFORD AVENUE" "BUFFALO AVENUE" ...
## $ cross.street.name : chr "193 STREET" NA "LAFAYETTE AVENUE" "SAINT MARKS AVENUE" ...
## $ off.street.name : chr NA NA NA NA ...
## $ persons.injured : int 0 1 0 0 0 0 0 1 0 0 ...
## $ persons.killed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pedestrians.injured: int 0 0 0 0 0 0 0 1 0 0 ...
## $ pedestrians.killed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cyclists.injured : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cyclists.killed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ motorists.injured : int 0 1 0 0 0 0 0 0 0 0 ...
## $ motorists.killed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ vehicle.1.type : chr "SPORT UTILITY / STATION WAGON" "PASSENGER VEHICLE" "PASSENGER VEHICLE" "BUS" ...
## $ vehicle.2.type : chr NA NA "UNKNOWN" "PASSENGER VEHICLE" ...
## $ vehicle.3.type : chr NA NA NA NA ...
## $ vehicle.4.type : chr NA NA NA NA ...
## $ vehicle.5.type : chr NA NA NA NA ...
## $ vehicle.1.factor : chr "Traffic Control Disregarded" "Animals Action" "Fatigued/Drowsy" "Lost Consciousness" ...
## $ vehicle.2.factor : chr NA NA "Unspecified" NA ...
## $ vehicle.3.factor : chr NA NA NA NA ...
## $ vehicle.4.factor : chr NA NA NA NA ...
## $ vehicle.5.factor : chr NA NA NA NA ...
## $ date_edited : Date, format: "2015-01-01" "2015-01-01" ...
## $ date_time : POSIXct, format: "2015-01-01 00:20:00" "2015-01-01 00:20:00" ...
## $ day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ hour : int 0 0 0 0 0 10 10 10 10 10 ...
data1$death <-rowSums(data1[, c(13,15,17,19)])
data1 %>%
filter(borough != "") %>%
group_by(borough) %>%
summarise(ToT_death = sum(death)) %>%
arrange(ToT_death) %>%
ggplot(aes(x = borough, y = ToT_death)) + geom_bar(stat = "identity")data1$injuries <- rowSums(data1[, c(12, 14, 16, 18)])
data1 %>%
filter(borough!="") %>%
group_by(borough) %>%
summarise(Tot_injuries = sum(injuries)) %>%
ggplot(aes(x = borough, y = Tot_injuries)) + geom_bar(stat="identity")According to Wikiperdia population, Manhattan has the highest density per sq. Km followed by The Bronx. However, the no of injuries is in Brooklyn and followed by Queens.
temp <-data1 %>%
select(persons.killed, persons.injured, vehicle.1.factor, vehicle.2.factor, vehicle.3.factor, vehicle.4.factor, vehicle.5.factor) %>%
gather(type,value, 1:2) %>%
gather(vehicle_type,cause,1:5) %>%
filter(value != 0, cause != "", cause != "Unspecified")
temp_1 <- temp %>%
select(-vehicle_type) %>%
group_by(type, cause) %>%
summarise(total = sum(value, na.rm=T))
ggplot(data = temp_1, aes(x = cause, y = log(total), fill = type))+
geom_bar(data = subset(temp_1, type == "persons.injured"), stat = "identity")+
geom_bar(data = subset(temp_1, type=="persons.killed"), stat = "identity", position = "identity", mapping = aes(y = -log(total)))+
scale_y_continuous(labels = abs)+
coord_flip()+
ggtitle("Cause of Accidents")data1 %>%
filter(borough !="") %>%
group_by(date_edited, borough) %>%
summarise(n=mean(n())) %>%
na.omit() %>%
ggplot(aes(x = date_edited, y = n, color = borough, group = borough ))+
geom_line()+
theme_hc(bgcolor = "darkunica")+
geom_point(size = 2, shape = 1)+
scale_fill_hc("darkunica") +
ggtitle("Borough Accidents(Mean) by Time")+
geom_text(aes(label=ifelse(n>150,n,"")), size = 3, hjust = 2)day_wise<- data1 %>%
select(borough, date_edited, motorists.injured, cyclists.injured, pedestrians.injured, day, hour) %>%
gather(type, value, 3:5) %>%
group_by(borough, day, type, hour) %>%
summarise(n=sum(value, na.rm=T)) %>%
filter(borough != "")
ggplot(day_wise, aes(x = day, y = hour, fill=(n)))+
geom_tile(color = "black", size = 0.01)+
scale_fill_viridis(name = "Mean Injury by Type", option = "C")+
facet_grid(borough~type)+
scale_y_continuous(trans = "reverse", breaks = c(0,4,8,12,16,20))+
theme_minimal(base_size = 8)+
theme(strip.background = element_rect(colour="gray"))