#LOAD LIBRARIES GET DATA
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)
collisions <- read.csv('~/Consulting/PBI/Collisions.csv', stringsAsFactors = FALSE)
#Get complete cases
collisions.new <- collisions[which(collisions$STATUS=='Matched'),]
#Clean Data
#RECODE UNDFL: 1= 'Y' and 0= 'N'
collisions.new$UNDERINFL[collisions.new$UNDERINFL==1] <- 'Y'
collisions.new$UNDERINFL[collisions.new$UNDERINFL==0] <- 'N'
#RECODE BLANK WEATHER TO UNKNOWN
collisions.new$WEATHER[collisions.new$WEATHER==""] <- 'Unknown'
#ADD COLLISION COUNT
collisions.new$COLLISIONCOUNT <- 1
#RECODE BLANK COLLISON TYPE TO UNKNOWN
collisions.new$COLLISIONTYPE[collisions.new$COLLISIONTYPE==""] <- 'Unknown'
#RECODE BLANK JUNCTION TYPE TO UNKNOWN
collisions.new$JUNCTIONTYPE[collisions.new$JUNCTIONTYPE==""] <- 'Unknown'
#collisions.new <- collisions %>% select(6:8,10,13:27,29:32,36:40)
#collisions.new <- collisions.new[complete.cases(collisions.new),]
#HISTOGRAM OF FATALITIES
collisions.fatalities <- collisions.new[ which(collisions.new$FATALITIES>0),]
hist(collisions.fatalities$FATALITIES,
breaks = 10,
xlim = c(0,10),
col="red",
border = "black",
xlab = "Fatalities",
ylab = "Count of Fatalities",
main = "Histogram of Number of Fatalities")

#HISTOGRAM OF PERSONCOUNT
collisions.persons <- collisions.new[which(collisions.new$PERSONCOUNT>0),]
hist(collisions.persons$PERSONCOUNT,
breaks = 10,
xlim = c(0,100),
col="red",
border = "black",
xlab = "People",
ylab = "Count of People",
main = "Histogram of Number of People")

#SUMMARY AND PLOT OF PERSONCOUNT
summary(collisions.persons$PERSONCOUNT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.537 3.000 93.000
plot(collisions.persons$PERSONCOUNT)

#HISTOGRAM OF INJURIES
collisions.injuries <- collisions.new[which(collisions.new$INJURIES>0),]
hist(collisions.injuries$INJURIES,
breaks = 10,
xlim = c(0,100),
col="red",
border = "black",
xlab = "Injuries",
ylab = "Count of Injuries",
main = "Histogram of Number of Injuries")

#Calculate Skewness of PERSONCOUNT & FATALITIES
#PERSON COUNT SKEWNESS
personcount.skewness <- (3*(mean(collisions.persons$PERSONCOUNT) - median(collisions.persons$PERSONCOUNT)))/sd(collisions.persons$PERSONCOUNT)
personcount.skewness
## [1] 1.205358
#FATALITIES SKEWNESS
fatality.skewness <- (3*(mean(collisions.fatalities$FATALITIES) - median(collisions.fatalities$FATALITIES)))/sd(collisions.fatalities$FATALITIES)
fatality.skewness
## [1] 0.5697579
#INJURIES BY COLLISION TYPE
p <- ggplot(collisions.new, aes(x=COLLISIONTYPE, y=INJURIES)) +
geom_boxplot() + coord_flip()
p

#INJURIES BY ROAD CONDITION
p <- ggplot(collisions.new, aes(x=ROADCOND, y=INJURIES)) +
geom_boxplot() + coord_flip()
p

#INJURIES BY LIGHT CONDITION
p <- ggplot(collisions.new, aes(x=LIGHTCOND, y=INJURIES)) +
geom_boxplot() + coord_flip()
p

#INJURIES BY WEATHER
p <- ggplot(collisions.new, aes(x=WEATHER, y=INJURIES)) +
geom_boxplot() + coord_flip()
p

#INJURIES BY UNDER THE INFLUENCE
p <- ggplot(collisions.new, aes(x=UNDERINFL, y=INJURIES)) +
geom_boxplot() + coord_flip()
p

#PERSON COUNT BY SEVERITY
p <- ggplot(collisions.new, aes(x=SEVERITYDESC, y=PERSONCOUNT)) +
geom_boxplot() + coord_flip()
p

#PERSONCOUNT ~ INJURIES
ggplot(collisions.new, aes(x=PERSONCOUNT, y=INJURIES)) + geom_point(size=2, shape=23) +geom_smooth(method="auto", se=TRUE, fullrange=FALSE, level=0.95)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#AVERAGE PERSON COUNT BY SEVERITY
persons.by.severity <- aggregate(PERSONCOUNT~SEVERITYDESC, collisions.new, mean)
persons.by.severity <- persons.by.severity[order(persons.by.severity$PERSONCOUNT),]
persons.by.severity
#AVERAGE VEHICLE COUNT BY SEVERITY
vehicles.by.severity <- aggregate(VEHCOUNT~SEVERITYDESC, collisions.new, mean)
vehicles.by.severity <- vehicles.by.severity[order(vehicles.by.severity$VEHCOUNT),]
vehicles.by.severity
#TOTAL FATALITIES BY WEATHER
fatalities.by.weather <- aggregate(FATALITIES~WEATHER, collisions.new, sum)
fatalities.by.weather <- fatalities.by.weather[order(fatalities.by.weather$FATALITIES),]
fatalities.by.weather
#TOTAL COLLISIONS BY COLLISION TYPE
collisions.by.type <- aggregate(COLLISIONCOUNT~COLLISIONTYPE, collisions.new, sum)
collisions.by.type <- collisions.by.type[order(collisions.by.type$COLLISIONCOUNT),]
collisions.by.type
#TOTAL COLLISIONS BY JUNCTION TYPE
collisions.by.junction <- aggregate(COLLISIONCOUNT~JUNCTIONTYPE, collisions.new, sum)
collisions.by.junction <- collisions.by.junction[order(collisions.by.junction$COLLISIONCOUNT),]
collisions.by.junction
#TOTAL COLLISIONS BY ACCIDENT DESCRIPTION
collisions.by.desc <- aggregate(COLLISIONCOUNT~ST_COLDESC, collisions.new, sum)
collisions.by.desc <- collisions.by.desc[order(collisions.by.desc$COLLISIONCOUNT),]
collisions.by.desc
#write.csv(collisions.new,'Collisions_Revised.csv')