library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
library(infer)
# Location of the FiveThirtyEight "bad drivers" CSV (one row per state).
theUrl <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/bad-drivers/bad-drivers.csv"
# read.csv() is used instead of read.table(sep = ","): its defaults are meant
# for CSV (only `"` is a quote character and `#` is not a comment marker), so
# apostrophes or hashes inside a field cannot silently break the parse.
worst_drivers <- read.csv(file = theUrl, header = TRUE)
# Preview the first rows to check that the download and parse succeeded.
head(worst_drivers)
## State Number.of.drivers.involved.in.fatal.collisions.per.billion.miles
## 1 Alabama 18.8
## 2 Alaska 18.1
## 3 Arizona 18.6
## 4 Arkansas 22.4
## 5 California 12.0
## 6 Colorado 13.6
## Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding
## 1 39
## 2 41
## 3 35
## 4 18
## 5 35
## 6 37
## Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired
## 1 30
## 2 25
## 3 28
## 4 26
## 5 28
## 6 28
## Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted
## 1 96
## 2 90
## 3 84
## 4 94
## 5 91
## 6 79
## Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents
## 1 80
## 2 94
## 3 96
## 4 95
## 5 89
## 6 95
## Car.Insurance.Premiums....
## 1 784.55
## 2 1053.48
## 3 899.47
## 4 827.34
## 5 878.41
## 6 835.50
## Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver....
## 1 145.08
## 2 133.93
## 3 110.35
## 4 142.39
## 5 165.63
## 6 139.91
# List the raw column names as they were parsed from the CSV header.
names(worst_drivers)
## [1] "State"
## [2] "Number.of.drivers.involved.in.fatal.collisions.per.billion.miles"
## [3] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding"
## [4] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired"
## [5] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted"
## [6] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents"
## [7] "Car.Insurance.Premiums...."
## [8] "Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver...."
# Replace the long CSV-derived headers with short upper-case names.
# The mapping is positional: the i-th name below replaces the i-th column.
worst_drivers <- setNames(
  worst_drivers,
  c(
    "STATE",
    "DRIVERS_INVOLVED",
    "PERC_DRIVERS_SPEED",
    "PERC_DRIVERS_ALCHO",
    "PERC_DRIVERS_NOT_DIST",
    "PERC_DRIVERS_NO_ACC",
    "INS_PREM",
    "LOSS_INSCOMP"
  )
)
# Show the renamed columns together with their types.
glimpse(worst_drivers)
## Rows: 51
## Columns: 8
## $ STATE <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cal…
## $ DRIVERS_INVOLVED <dbl> 18.8, 18.1, 18.6, 22.4, 12.0, 13.6, 10.8, 16.2, …
## $ PERC_DRIVERS_SPEED <int> 39, 41, 35, 18, 35, 37, 46, 38, 34, 21, 19, 54, …
## $ PERC_DRIVERS_ALCHO <int> 30, 25, 28, 26, 28, 28, 36, 30, 27, 29, 25, 41, …
## $ PERC_DRIVERS_NOT_DIST <int> 96, 90, 84, 94, 91, 79, 87, 87, 100, 92, 95, 82,…
## $ PERC_DRIVERS_NO_ACC <int> 80, 94, 96, 95, 89, 95, 82, 99, 100, 94, 93, 87,…
## $ INS_PREM <dbl> 784.55, 1053.48, 899.47, 827.34, 878.41, 835.50,…
## $ LOSS_INSCOMP <dbl> 145.08, 133.93, 110.35, 142.39, 165.63, 139.91, …
# Turn every percentage column into a rate on the same scale as
# DRIVERS_INVOLVED: DRIVERS_<X> = DRIVERS_INVOLVED * PERC_DRIVERS_<X> / 100.
percent_worst_drivers <- mutate(
  worst_drivers,
  DRIVERS_SPEED = DRIVERS_INVOLVED * PERC_DRIVERS_SPEED / 100,
  DRIVERS_ALCHO = DRIVERS_INVOLVED * PERC_DRIVERS_ALCHO / 100,
  DRIVERS_NOT_DIST = DRIVERS_INVOLVED * PERC_DRIVERS_NOT_DIST / 100,
  DRIVERS_NO_ACC = DRIVERS_INVOLVED * PERC_DRIVERS_NO_ACC / 100
)
# Inspect the result: the original 8 columns plus the 4 derived ones.
glimpse(percent_worst_drivers)
## Rows: 51
## Columns: 12
## $ STATE <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cal…
## $ DRIVERS_INVOLVED <dbl> 18.8, 18.1, 18.6, 22.4, 12.0, 13.6, 10.8, 16.2, …
## $ PERC_DRIVERS_SPEED <int> 39, 41, 35, 18, 35, 37, 46, 38, 34, 21, 19, 54, …
## $ PERC_DRIVERS_ALCHO <int> 30, 25, 28, 26, 28, 28, 36, 30, 27, 29, 25, 41, …
## $ PERC_DRIVERS_NOT_DIST <int> 96, 90, 84, 94, 91, 79, 87, 87, 100, 92, 95, 82,…
## $ PERC_DRIVERS_NO_ACC <int> 80, 94, 96, 95, 89, 95, 82, 99, 100, 94, 93, 87,…
## $ INS_PREM <dbl> 784.55, 1053.48, 899.47, 827.34, 878.41, 835.50,…
## $ LOSS_INSCOMP <dbl> 145.08, 133.93, 110.35, 142.39, 165.63, 139.91, …
## $ DRIVERS_SPEED <dbl> 7.332, 7.421, 6.510, 4.032, 4.200, 5.032, 4.968,…
## $ DRIVERS_ALCHO <dbl> 5.640, 4.525, 5.208, 5.824, 3.360, 3.808, 3.888,…
## $ DRIVERS_NOT_DIST <dbl> 18.048, 16.290, 15.624, 21.056, 10.920, 10.744, …
## $ DRIVERS_NO_ACC <dbl> 15.040, 17.014, 17.856, 21.280, 10.680, 12.920, …
# Bar plot per state: drivers who were speeding as a part of all drivers
# involved in fatal collisions.
# DRIVERS_SPEED is a subset of DRIVERS_INVOLVED, so stacking the two directly
# would double-count it (bar height = total + subset). Instead the subset is
# stacked with the remainder, which makes each bar's height equal to
# DRIVERS_INVOLVED.
percent_worst_drivers %>%
  # pmax() guards against a tiny negative remainder from floating-point
  # rounding when the subset equals the total.
  mutate(DRIVERS_OTHER = pmax(DRIVERS_INVOLVED - DRIVERS_SPEED, 0)) %>%
  select(STATE, DRIVERS_OTHER, DRIVERS_SPEED) %>%
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -STATE, names_to = "type", values_to = "value") %>%
  ggplot(aes(x = STATE, y = value, fill = type)) +
  geom_col(position = "stack") +
  # Named values tie each colour to its series regardless of level order.
  scale_fill_manual(
    values = c(DRIVERS_OTHER = "red", DRIVERS_SPEED = "darkred")
  ) +
  ylab("Drivers involved in Fatal collision while Speeding") +
  # Rotate the state names so all 51 labels stay readable.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Bar plot per state: alcohol-impaired drivers as a part of all drivers
# involved in fatal collisions.
# DRIVERS_ALCHO is a subset of DRIVERS_INVOLVED, so stacking the two directly
# would double-count it (bar height = total + subset). Instead the subset is
# stacked with the remainder, which makes each bar's height equal to
# DRIVERS_INVOLVED.
percent_worst_drivers %>%
  # pmax() guards against a tiny negative remainder from floating-point
  # rounding when the subset equals the total.
  mutate(DRIVERS_OTHER = pmax(DRIVERS_INVOLVED - DRIVERS_ALCHO, 0)) %>%
  select(STATE, DRIVERS_ALCHO, DRIVERS_OTHER) %>%
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -STATE, names_to = "type", values_to = "value") %>%
  ggplot(aes(x = STATE, y = value, fill = type)) +
  geom_col(position = "stack") +
  # Named values tie each colour to its series regardless of level order.
  scale_fill_manual(
    values = c(DRIVERS_ALCHO = "green", DRIVERS_OTHER = "darkgreen")
  ) +
  ylab("Drivers involved in Fatal collision while Alcho-Impaired") +
  # Rotate the state names so all 51 labels stay readable.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Bar plot per state: drivers who were NOT distracted as a part of all drivers
# involved in fatal collisions.
# DRIVERS_NOT_DIST is a subset of DRIVERS_INVOLVED, so stacking the two
# directly would double-count it (bar height = total + subset). Instead the
# subset is stacked with the remainder, which makes each bar's height equal to
# DRIVERS_INVOLVED.
percent_worst_drivers %>%
  # pmax() guards against a tiny negative remainder from floating-point
  # rounding when the subset equals the total (e.g. a 100% share).
  mutate(DRIVERS_OTHER = pmax(DRIVERS_INVOLVED - DRIVERS_NOT_DIST, 0)) %>%
  select(STATE, DRIVERS_OTHER, DRIVERS_NOT_DIST) %>%
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -STATE, names_to = "type", values_to = "value") %>%
  ggplot(aes(x = STATE, y = value, fill = type)) +
  geom_col(position = "stack") +
  # Named values tie each colour to its series regardless of level order.
  scale_fill_manual(
    values = c(DRIVERS_OTHER = "lightyellow", DRIVERS_NOT_DIST = "yellow")
  ) +
  ylab("Drivers involved in Fatal collision not distracted") +
  # Rotate the state names so all 51 labels stay readable.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Bar plot per state: drivers with no previous accidents as a part of all
# drivers involved in fatal collisions.
# DRIVERS_NO_ACC is a subset of DRIVERS_INVOLVED, so stacking the two directly
# would double-count it (bar height = total + subset). Instead the subset is
# stacked with the remainder, which makes each bar's height equal to
# DRIVERS_INVOLVED.
percent_worst_drivers %>%
  # pmax() guards against a tiny negative remainder from floating-point
  # rounding when the subset equals the total (e.g. a 100% share).
  mutate(DRIVERS_OTHER = pmax(DRIVERS_INVOLVED - DRIVERS_NO_ACC, 0)) %>%
  select(STATE, DRIVERS_OTHER, DRIVERS_NO_ACC) %>%
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = -STATE, names_to = "type", values_to = "value") %>%
  ggplot(aes(x = STATE, y = value, fill = type)) +
  geom_col(position = "stack") +
  # Named values tie each colour to its series regardless of level order.
  scale_fill_manual(
    values = c(DRIVERS_OTHER = "blue", DRIVERS_NO_ACC = "darkblue")
  ) +
  ylab("Drivers involved in Fatal collision no pre accident") +
  # Rotate the state names so all 51 labels stay readable.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Bar plot of the car insurance premium (INS_PREM) for every state.
ggplot(percent_worst_drivers, aes(x = STATE, y = INS_PREM)) +
  # geom_col() draws the values as given; stacking is its default position.
  geom_col() +
  labs(y = "Car Insurance Premium") +
  # Rotate the state names so the x-axis labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))