#LOAD LIBRARIES GET DATA

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)

collisions <- read.csv('~/Consulting/PBI/Collisions.csv', stringsAsFactors = FALSE) 
#Get complete cases
collisions.new <- collisions[which(collisions$STATUS=='Matched'),]


#Clean Data


#RECODE UNDFL: 1= 'Y' and 0= 'N'

collisions.new$UNDERINFL[collisions.new$UNDERINFL==1] <- 'Y'
collisions.new$UNDERINFL[collisions.new$UNDERINFL==0] <- 'N' 


#RECODE BLANK WEATHER TO UNKNOWN
collisions.new$WEATHER[collisions.new$WEATHER==""] <- 'Unknown'

#ADD COLLISION COUNT

collisions.new$COLLISIONCOUNT <- 1

#RECODE BLANK COLLISON TYPE TO UNKNOWN
collisions.new$COLLISIONTYPE[collisions.new$COLLISIONTYPE==""] <- 'Unknown'

#RECODE BLANK JUNCTION TYPE TO UNKNOWN

collisions.new$JUNCTIONTYPE[collisions.new$JUNCTIONTYPE==""] <- 'Unknown'


#collisions.new <- collisions %>% select(6:8,10,13:27,29:32,36:40)
#collisions.new <- collisions.new[complete.cases(collisions.new),]
#HISTOGRAM OF FATALITIES

collisions.fatalities <- collisions.new[ which(collisions.new$FATALITIES>0),]

hist(collisions.fatalities$FATALITIES,
     breaks = 10,
     xlim = c(0,10),
     col="red",
     border = "black",
     xlab = "Fatalities",
     ylab = "Count of Fatalities",
     main = "Histogram of Number of Fatalities")

#HISTOGRAM OF PERSONCOUNT
collisions.persons <- collisions.new[which(collisions.new$PERSONCOUNT>0),]

hist(collisions.persons$PERSONCOUNT,
     breaks = 10,
     xlim = c(0,100),
     col="red",
     border = "black",
     xlab = "People",
     ylab = "Count of People",
     main = "Histogram of Number of People")

#SUMMARY AND PLOT OF PERSONCOUNT
summary(collisions.persons$PERSONCOUNT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   2.537   3.000  93.000
plot(collisions.persons$PERSONCOUNT)

#HISTOGRAM OF INJURIES
collisions.injuries <- collisions.new[which(collisions.new$INJURIES>0),]

hist(collisions.injuries$INJURIES,
     breaks = 10,
     xlim = c(0,100),
     col="red",
     border = "black",
     xlab = "Injuries",
     ylab = "Count of Injuries",
     main = "Histogram of Number of Injuries")

#Calculate Skewness of PERSONCOUNT & FATALITIES

#PERSON COUNT SKEWNESS
personcount.skewness <- (3*(mean(collisions.persons$PERSONCOUNT) - median(collisions.persons$PERSONCOUNT)))/sd(collisions.persons$PERSONCOUNT)

personcount.skewness
## [1] 1.205358
#FATALITIES SKEWNESS
fatality.skewness <- (3*(mean(collisions.fatalities$FATALITIES) - median(collisions.fatalities$FATALITIES)))/sd(collisions.fatalities$FATALITIES)

fatality.skewness
## [1] 0.5697579
#INJURIES BY COLLISION TYPE

p <- ggplot(collisions.new, aes(x=COLLISIONTYPE, y=INJURIES)) + 
  geom_boxplot() + coord_flip()

p

#INJURIES BY ROAD CONDITION

p <- ggplot(collisions.new, aes(x=ROADCOND, y=INJURIES)) + 
  geom_boxplot() + coord_flip()

p

#INJURIES BY LIGHT CONDITION

p <- ggplot(collisions.new, aes(x=LIGHTCOND, y=INJURIES)) + 
  geom_boxplot() + coord_flip()

p

#INJURIES BY WEATHER

p <- ggplot(collisions.new, aes(x=WEATHER, y=INJURIES)) + 
  geom_boxplot() + coord_flip()

p

#INJURIES BY UNDER THE INFLUENCE


  
p <- ggplot(collisions.new, aes(x=UNDERINFL, y=INJURIES)) + 
  geom_boxplot() + coord_flip()

p

#PERSON COUNT BY SEVERITY


p <- ggplot(collisions.new, aes(x=SEVERITYDESC, y=PERSONCOUNT)) + 
  geom_boxplot() + coord_flip()

p

#PERSONCOUNT ~ INJURIES

ggplot(collisions.new, aes(x=PERSONCOUNT, y=INJURIES)) + geom_point(size=2, shape=23) +geom_smooth(method="auto", se=TRUE, fullrange=FALSE, level=0.95)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#AVERAGE PERSON COUNT BY SEVERITY

persons.by.severity <- aggregate(PERSONCOUNT~SEVERITYDESC, collisions.new, mean)
persons.by.severity <- persons.by.severity[order(persons.by.severity$PERSONCOUNT),]
persons.by.severity 
#AVERAGE VEHICLE COUNT BY SEVERITY

vehicles.by.severity <- aggregate(VEHCOUNT~SEVERITYDESC, collisions.new, mean)
vehicles.by.severity <- vehicles.by.severity[order(vehicles.by.severity$VEHCOUNT),]
vehicles.by.severity
#TOTAL FATALITIES BY WEATHER

fatalities.by.weather <- aggregate(FATALITIES~WEATHER, collisions.new, sum)
fatalities.by.weather <- fatalities.by.weather[order(fatalities.by.weather$FATALITIES),]
fatalities.by.weather
#TOTAL COLLISIONS BY COLLISION TYPE

collisions.by.type <- aggregate(COLLISIONCOUNT~COLLISIONTYPE, collisions.new, sum)
collisions.by.type <- collisions.by.type[order(collisions.by.type$COLLISIONCOUNT),]
collisions.by.type
#TOTAL COLLISIONS BY JUNCTION TYPE


collisions.by.junction <- aggregate(COLLISIONCOUNT~JUNCTIONTYPE, collisions.new, sum) 
collisions.by.junction <- collisions.by.junction[order(collisions.by.junction$COLLISIONCOUNT),]
collisions.by.junction
#TOTAL COLLISIONS BY ACCIDENT DESCRIPTION

collisions.by.desc <- aggregate(COLLISIONCOUNT~ST_COLDESC, collisions.new, sum)
collisions.by.desc <- collisions.by.desc[order(collisions.by.desc$COLLISIONCOUNT),]
collisions.by.desc
#write.csv(collisions.new,'Collisions_Revised.csv')