#install.packages("kableExtra")
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.2.2
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'
Variable <- c("Number of drivers involved in fatal collisions per billion miles", "Percentage Of Drivers Involved In Fatal Collisions Who Were Speeding", "Percentage Of Drivers Involved In Fatal Collisions Who Were Alcohol-Impaired", "Percentage Of Drivers Involved In Fatal Collisions Who Were Not Distracted","Percentage Of Drivers Involved In Fatal Collisions Who Had Not Been Involved In Any Previous Accidents", "Car Insurance Premiums", "Losses incurred by insurance companies for collisions per insured driver")
Source <- c("National Highway Traffic Safety Administration, 2012", "National Highway Traffic Safety Administration, 2009", "National Highway Traffic Safety Administration, 2012", "National Highway Traffic Safety Administration, 2012", "National Highway Traffic Safety Administration, 2012", "National Association of Insurance Commissioners, 2011", "National Association of Insurance Commissioners, 2010")
data_source <- data.frame(Variable, Source)
kable(data_source)
Variable | Source |
---|---|
Number of drivers involved in fatal collisions per billion miles | National Highway Traffic Safety Administration, 2012 |
Percentage Of Drivers Involved In Fatal Collisions Who Were Speeding | National Highway Traffic Safety Administration, 2009 |
Percentage Of Drivers Involved In Fatal Collisions Who Were Alcohol-Impaired | National Highway Traffic Safety Administration, 2012 |
Percentage Of Drivers Involved In Fatal Collisions Who Were Not Distracted | National Highway Traffic Safety Administration, 2012 |
Percentage Of Drivers Involved In Fatal Collisions Who Had Not Been Involved In Any Previous Accidents | National Highway Traffic Safety Administration, 2012 |
Car Insurance Premiums | National Association of Insurance Commissioners, 2011 |
Losses incurred by insurance companies for collisions per insured driver | National Association of Insurance Commissioners, 2010 |
# Install the libraries.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
#install.packages("plotly")
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library(infer)
library(forcats)
library(DT)
## Warning: package 'DT' was built under R version 4.2.2
# Upload the dataset and take a quick look at the dataset.
theUrl <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/bad-drivers/bad-drivers.csv"
worst_drivers <- read.table(file=theUrl, header=TRUE, sep=",")
head(worst_drivers)
## State Number.of.drivers.involved.in.fatal.collisions.per.billion.miles
## 1 Alabama 18.8
## 2 Alaska 18.1
## 3 Arizona 18.6
## 4 Arkansas 22.4
## 5 California 12.0
## 6 Colorado 13.6
## Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding
## 1 39
## 2 41
## 3 35
## 4 18
## 5 35
## 6 37
## Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired
## 1 30
## 2 25
## 3 28
## 4 26
## 5 28
## 6 28
## Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted
## 1 96
## 2 90
## 3 84
## 4 94
## 5 91
## 6 79
## Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents
## 1 80
## 2 94
## 3 96
## 4 95
## 5 89
## 6 95
## Car.Insurance.Premiums....
## 1 784.55
## 2 1053.48
## 3 899.47
## 4 827.34
## 5 878.41
## 6 835.50
## Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver....
## 1 145.08
## 2 133.93
## 3 110.35
## 4 142.39
## 5 165.63
## 6 139.91
# The glimpse() function of the dplyr package can be used to see the columns of the dataset and display some portion of the data with respect to each attribute that can fit on a single line.
glimpse(worst_drivers)
## Rows: 51
## Columns: 8
## $ State <chr> …
## $ Number.of.drivers.involved.in.fatal.collisions.per.billion.miles <dbl> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents <int> …
## $ Car.Insurance.Premiums.... <dbl> …
## $ Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver.... <dbl> …
# Rename each column to make it shorter and easier to understand.
colnames(worst_drivers) <- c("STATE",
"DRIVERS_INVOLVED",
"PERC_DRIVERS_SPEED",
"PERC_DRIVERS_ALCHO",
"PERC_DRIVERS_NOT_DIST",
"PERC_DRIVERS_NO_ACC",
"INS_PREM",
"LOSS_INS_COMP")
glimpse(worst_drivers)
## Rows: 51
## Columns: 8
## $ STATE <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cal…
## $ DRIVERS_INVOLVED <dbl> 18.8, 18.1, 18.6, 22.4, 12.0, 13.6, 10.8, 16.2, …
## $ PERC_DRIVERS_SPEED <int> 39, 41, 35, 18, 35, 37, 46, 38, 34, 21, 19, 54, …
## $ PERC_DRIVERS_ALCHO <int> 30, 25, 28, 26, 28, 28, 36, 30, 27, 29, 25, 41, …
## $ PERC_DRIVERS_NOT_DIST <int> 96, 90, 84, 94, 91, 79, 87, 87, 100, 92, 95, 82,…
## $ PERC_DRIVERS_NO_ACC <int> 80, 94, 96, 95, 89, 95, 82, 99, 100, 94, 93, 87,…
## $ INS_PREM <dbl> 784.55, 1053.48, 899.47, 827.34, 878.41, 835.50,…
## $ LOSS_INS_COMP <dbl> 145.08, 133.93, 110.35, 142.39, 165.63, 139.91, …
# Columns PERC_DRIVERS_SPEED, PERC_DRIVERS_ALCHO, PERC_DRIVERS_NOT_DIST, PERC_DRIVERS_NO_ACC are percentages of DRIVERS_INVOLVED. In the next step I mutate new columns DRIVERS_SPEED, DRIVERS_ALCHO, DRIVERS_NOT_DIST, DRIVERS_NO_ACC by taking the given percentage of DRIVERS_INVOLVED column.
change_percent_worst_drivers <- worst_drivers %>%
mutate(DRIVERS_SPEED=(DRIVERS_INVOLVED*PERC_DRIVERS_SPEED)/100) %>%
mutate(DRIVERS_ALCHO=(DRIVERS_INVOLVED*PERC_DRIVERS_ALCHO)/100) %>%
mutate(DRIVERS_NOT_DIST=(DRIVERS_INVOLVED*PERC_DRIVERS_NOT_DIST)/100) %>%
mutate(DRIVERS_NO_ACC=(DRIVERS_INVOLVED*PERC_DRIVERS_NO_ACC)/100)
glimpse(change_percent_worst_drivers)
## Rows: 51
## Columns: 12
## $ STATE <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cal…
## $ DRIVERS_INVOLVED <dbl> 18.8, 18.1, 18.6, 22.4, 12.0, 13.6, 10.8, 16.2, …
## $ PERC_DRIVERS_SPEED <int> 39, 41, 35, 18, 35, 37, 46, 38, 34, 21, 19, 54, …
## $ PERC_DRIVERS_ALCHO <int> 30, 25, 28, 26, 28, 28, 36, 30, 27, 29, 25, 41, …
## $ PERC_DRIVERS_NOT_DIST <int> 96, 90, 84, 94, 91, 79, 87, 87, 100, 92, 95, 82,…
## $ PERC_DRIVERS_NO_ACC <int> 80, 94, 96, 95, 89, 95, 82, 99, 100, 94, 93, 87,…
## $ INS_PREM <dbl> 784.55, 1053.48, 899.47, 827.34, 878.41, 835.50,…
## $ LOSS_INS_COMP <dbl> 145.08, 133.93, 110.35, 142.39, 165.63, 139.91, …
## $ DRIVERS_SPEED <dbl> 7.332, 7.421, 6.510, 4.032, 4.200, 5.032, 4.968,…
## $ DRIVERS_ALCHO <dbl> 5.640, 4.525, 5.208, 5.824, 3.360, 3.808, 3.888,…
## $ DRIVERS_NOT_DIST <dbl> 18.048, 16.290, 15.624, 21.056, 10.920, 10.744, …
## $ DRIVERS_NO_ACC <dbl> 15.040, 17.014, 17.856, 21.280, 10.680, 12.920, …
# Select only State, and percentage of Number of drivers involved in fatal collisions per billion miles, percentage of drivers involved in fatal collisions who were speeding, percentage of drivers involved in fatal collisions who were alcohol impaired, percentage of drivers involved in fatal collisions who were not distracted, percentage of drivers involved in fatal collisions who had not been involved in any previous accidents.
percent_worst_drivers <- select(change_percent_worst_drivers, STATE, DRIVERS_INVOLVED,DRIVERS_SPEED, DRIVERS_ALCHO, DRIVERS_NOT_DIST, DRIVERS_NO_ACC)
datatable(percent_worst_drivers)
# Created barplot comparing all the number of drivers involved in fatal collisions per billion miles with the drivers speed.
percent_worst_drivers %>%
select(STATE, DRIVERS_INVOLVED, DRIVERS_SPEED) %>%
gather(type, value, DRIVERS_INVOLVED:DRIVERS_SPEED) %>%
mutate(STATE = fct_reorder(STATE, value)) %>%
ggplot(., aes(x = STATE,y = value, fill = type)) +
geom_bar(position = "stack", stat="identity") +
scale_fill_manual(values = c("red", "darkred")) +
ylab("Drivers involved in Fatal collision while Speeding") +
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
coord_flip()
# Created barplot comparing all the number of drivers involved in fatal collisions per billion miles with the drivers alcohol.
percent_worst_drivers %>%
select(STATE, DRIVERS_INVOLVED, DRIVERS_ALCHO) %>%
gather(type, value, DRIVERS_INVOLVED:DRIVERS_ALCHO) %>%
mutate(STATE = fct_reorder(STATE, value)) %>%
ggplot(., aes(x = STATE,y = value, fill = type)) +
geom_bar(position = "stack", stat="identity") +
scale_fill_manual(values = c("green", "darkgreen")) +
ylab("Drivers involved in Fatal collision while Alcho-Impaired") +
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
coord_flip()
# Created barplot comparing all the number of drivers involved in fatal collisions per billion miles with the drivers distracted.
percent_worst_drivers %>%
select(STATE, DRIVERS_INVOLVED, DRIVERS_NOT_DIST) %>%
gather(type, value, DRIVERS_INVOLVED:DRIVERS_NOT_DIST) %>%
mutate(STATE = fct_reorder(STATE, value)) %>%
ggplot(., aes(x = STATE,y = value, fill = type)) +
geom_bar(position = "stack", stat="identity") +
scale_fill_manual(values = c("lightyellow", "yellow")) +
ylab("Drivers involved in Fatal collision not distracted") +
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
coord_flip()
# Created barplot comparing all the number of drivers involved in fatal collisions per billion miles with drivers involved in fatal collisions who had not been involved in any previous accidents.
percent_worst_drivers %>%
select(STATE, DRIVERS_INVOLVED, DRIVERS_NO_ACC) %>%
gather(type, value, DRIVERS_INVOLVED:DRIVERS_NO_ACC) %>%
mutate(STATE = fct_reorder(STATE, value)) %>%
ggplot(., aes(x = STATE,y = value, fill = type)) +
geom_bar(position = "stack", stat="identity") +
scale_fill_manual(values = c("blue", "darkblue")) +
ylab("Drivers involved in Fatal collision no pre accident") +
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
coord_flip()
# Created barplot comparing all the State with the highest losses inccured by insurance companies for collisions per insured driver.
Prem %>%
mutate(STATE = fct_reorder(STATE, LOSS_INS_COMP)) %>%
ggplot(., aes(x = STATE,y = LOSS_INS_COMP, fill = STATE)) +
geom_bar(position = "stack", stat="identity") +
ylab("Losses Incurred by Insurance Companies") +
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
coord_flip()
# Created barplot comparing all the State insurance premium and losses inccured by insurance companies for collisions per insured driver with all the drivers involved in fatal collision.
Prem_plot <- ggplot(data = (gather(Prem,"variable", "value", 2:4)), aes(x = STATE, y = value, fill = variable))+ geom_bar(position = "stack", stat="identity") + ggtitle("DRIVERS INVOLVED, LOSSES and INS PREMIUMS") + ylab("Number of Drivers Involved in Fatal collision Per Billion Mile/
Losses Iincurred by Insurance Companies for Collisions Per Insured Driver($)/
Insurance Premiums ($)") + facet_wrap(~ variable) + coord_flip()
ggplotly(Prem_plot)
# Quantify the strength of the relationship with the correlation coefficient.
cor(Prem$INS_PREM, Prem$DRIVERS_INVOLVED)
## [1] -0.1997019
Prem %>%
filter(INS_PREM==max(INS_PREM))
## STATE DRIVERS_INVOLVED INS_PREM LOSS_INS_COMP
## 1 New Jersey 11.2 1301.52 159.85
Prem %>%
filter(INS_PREM==median(INS_PREM))
## STATE DRIVERS_INVOLVED INS_PREM LOSS_INS_COMP
## 1 South Carolina 23.9 858.97 116.29
Prem %>%
filter(INS_PREM==min(INS_PREM))
## STATE DRIVERS_INVOLVED INS_PREM LOSS_INS_COMP
## 1 Idaho 15.3 641.96 82.75
DRIVERS_INVOLVED <- 11.2
#Prem_Pred = Intercept + drivers_involved * DRIVERS_INVOLVED
Prem_Pred <- 1023.354 + (-8.638) * DRIVERS_INVOLVED
Prem_Pred
## [1] 926.6084
DRIVERS_INVOLVED <- 23.9
#Prem_Pred = Intercept + Drivers_Involved * DRIVERS_INVOLVED
Prem_Pred <- 1023.354 + (-8.638) * DRIVERS_INVOLVED
Prem_Pred
## [1] 816.9058
DRIVERS_INVOLVED <- 15.3
#Prem_Pred = Intercept + Drivers_Involved * DRIVERS_INVOLVED
Prem_Pred <- 1023.354 + (-8.638) * DRIVERS_INVOLVED
Prem_Pred
## [1] 891.1926
LOSS_INS_COMP <- 159.85
# Prem_Pred = Intercept + Loss_Ins_Comp * LOSS_INS_COMP
Prem_Pred <- 285.3251 + 4.4733 * LOSS_INS_COMP
Prem_Pred
## [1] 1000.382
LOSS_INS_COMP <- 116.29
# Prem_Pred = Intercept + Loss_Ins_Comp * LOSS_IN_COMP
Prem_Pred <- 285.3251 + 4.4733 * LOSS_INS_COMP
Prem_Pred
## [1] 805.5252
LOSS_INS_COMP <- 82.75
# Prem_Pred = Intercept + Loss_Ins_Comp * LOSS_IN_COMP
Prem_Pred <- 285.3251 + 4.4733 * LOSS_INS_COMP
Prem_Pred
## [1] 655.4907
par(mfrow=c(2,2))
plot(Prem$DRIVERS_INVOLVED, Prem$INS_PREM)
hist(m1$residuals)
qqnorm(m1$residuals)
qqline(m1$residuals)
plot(Prem$DRIVERS_INVOLVED, m1$residuals)
abline(h = 0, lty = 3)