library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(dplyr)
library(infer)
# Location of the raw bad-drivers CSV in the fivethirtyeight/data repository.
theUrl <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/bad-drivers/bad-drivers.csv"

# Parse the comma-separated file straight from the URL; the first row holds
# the column names.
worst_drivers <- read.table(theUrl, sep = ",", header = TRUE)

# Preview the first rows of the data frame.
worst_drivers %>% head()
##        State Number.of.drivers.involved.in.fatal.collisions.per.billion.miles
## 1    Alabama                                                             18.8
## 2     Alaska                                                             18.1
## 3    Arizona                                                             18.6
## 4   Arkansas                                                             22.4
## 5 California                                                             12.0
## 6   Colorado                                                             13.6
##   Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding
## 1                                                                   39
## 2                                                                   41
## 3                                                                   35
## 4                                                                   18
## 5                                                                   35
## 6                                                                   37
##   Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired
## 1                                                                           30
## 2                                                                           25
## 3                                                                           28
## 4                                                                           26
## 5                                                                           28
## 6                                                                           28
##   Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted
## 1                                                                         96
## 2                                                                         90
## 3                                                                         84
## 4                                                                         94
## 5                                                                         91
## 6                                                                         79
##   Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents
## 1                                                                                                     80
## 2                                                                                                     94
## 3                                                                                                     96
## 4                                                                                                     95
## 5                                                                                                     89
## 6                                                                                                     95
##   Car.Insurance.Premiums....
## 1                     784.55
## 2                    1053.48
## 3                     899.47
## 4                     827.34
## 5                     878.41
## 6                     835.50
##   Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver....
## 1                                                                       145.08
## 2                                                                       133.93
## 3                                                                       110.35
## 4                                                                       142.39
## 5                                                                       165.63
## 6                                                                       139.91

RESEARCH QUESTION

Which state has the worst drivers?

CASES

Each case represents one of the 50 U.S. states or the District of Columbia. The data set contains 51 rows (observations) and 8 variables (state, num_drivers, perc_speeding, perc_alcohol_impaired, perc_not_distracted, perc_no_previous, insurance_premiums, losses).

# Compact overview of the data frame: one line per column with its type.
worst_drivers %>% glimpse()
## Rows: 51
## Columns: 8
## $ State                                                                                                  <chr> …
## $ Number.of.drivers.involved.in.fatal.collisions.per.billion.miles                                       <dbl> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding                                   <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired                           <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted                             <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents <int> …
## $ Car.Insurance.Premiums....                                                                             <dbl> …
## $ Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver....                           <dbl> …

DATA COLLECTION

The data were collected by the National Highway Traffic Safety Administration (NHTSA) in 2009 and 2012 (https://www-fars.nhtsa.dot.gov/Main/index.aspx) and by the National Association of Insurance Commissioners in 2010 and 2011 (no website was provided).

TYPE OF STUDY

This is an observational study.

DATA SOURCE

The data set was compiled by Mona Chalabi of FiveThirtyEight and is available online here: https://github.com/fivethirtyeight/data/tree/master/bad-drivers. For this project, the data was loaded directly from the raw file URL using the read.table() function.

RESPONSE

The response variables are Car.Insurance.Premiums and Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver. Both are numerical.

EXPLANATORY

The explanatory variables are State, Number.of.drivers.involved.in.fatal.collisions.per.billion.miles, Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding, Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired, Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted, and Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents. All are numerical except State, which is a character variable.

RELEVANT SUMMARY STATISTICS

# List the column names as read from the file, before they are shortened.
names(worst_drivers)
## [1] "State"                                                                                                 
## [2] "Number.of.drivers.involved.in.fatal.collisions.per.billion.miles"                                      
## [3] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding"                                  
## [4] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired"                          
## [5] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted"                            
## [6] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents"
## [7] "Car.Insurance.Premiums...."                                                                            
## [8] "Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver...."
# Replace the long headers with short upper-case labels. The labels are
# assigned by position, so their order must match the column order of
# worst_drivers.
worst_drivers <- setNames(
  worst_drivers,
  c(
    "STATE",
    "DRIVERS_INVOLVED",
    "PERC_DRIVERS_SPEED",
    "PERC_DRIVERS_ALCHO",
    "PERC_DRIVERS_NOT_DIST",
    "PERC_DRIVERS_NO_ACC",
    "INS_PREM",
    "LOSS_INSCOMP"
  )
)

# Confirm the new column names and types.
worst_drivers %>% glimpse()
## Rows: 51
## Columns: 8
## $ STATE                 <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cal…
## $ DRIVERS_INVOLVED      <dbl> 18.8, 18.1, 18.6, 22.4, 12.0, 13.6, 10.8, 16.2, …
## $ PERC_DRIVERS_SPEED    <int> 39, 41, 35, 18, 35, 37, 46, 38, 34, 21, 19, 54, …
## $ PERC_DRIVERS_ALCHO    <int> 30, 25, 28, 26, 28, 28, 36, 30, 27, 29, 25, 41, …
## $ PERC_DRIVERS_NOT_DIST <int> 96, 90, 84, 94, 91, 79, 87, 87, 100, 92, 95, 82,…
## $ PERC_DRIVERS_NO_ACC   <int> 80, 94, 96, 95, 89, 95, 82, 99, 100, 94, 93, 87,…
## $ INS_PREM              <dbl> 784.55, 1053.48, 899.47, 827.34, 878.41, 835.50,…
## $ LOSS_INSCOMP          <dbl> 145.08, 133.93, 110.35, 142.39, 165.63, 139.91, …
# Turn each percentage column into a value on the same scale as
# DRIVERS_INVOLVED: DRIVERS_INVOLVED * percentage / 100. Four columns are
# added: DRIVERS_SPEED, DRIVERS_ALCHO, DRIVERS_NOT_DIST and DRIVERS_NO_ACC.
percent_worst_drivers <- mutate(
  worst_drivers,
  DRIVERS_SPEED = DRIVERS_INVOLVED * PERC_DRIVERS_SPEED / 100,
  DRIVERS_ALCHO = DRIVERS_INVOLVED * PERC_DRIVERS_ALCHO / 100,
  DRIVERS_NOT_DIST = DRIVERS_INVOLVED * PERC_DRIVERS_NOT_DIST / 100,
  DRIVERS_NO_ACC = DRIVERS_INVOLVED * PERC_DRIVERS_NO_ACC / 100
)

# Check that the four derived columns were appended.
percent_worst_drivers %>% glimpse()
## Rows: 51
## Columns: 12
## $ STATE                 <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cal…
## $ DRIVERS_INVOLVED      <dbl> 18.8, 18.1, 18.6, 22.4, 12.0, 13.6, 10.8, 16.2, …
## $ PERC_DRIVERS_SPEED    <int> 39, 41, 35, 18, 35, 37, 46, 38, 34, 21, 19, 54, …
## $ PERC_DRIVERS_ALCHO    <int> 30, 25, 28, 26, 28, 28, 36, 30, 27, 29, 25, 41, …
## $ PERC_DRIVERS_NOT_DIST <int> 96, 90, 84, 94, 91, 79, 87, 87, 100, 92, 95, 82,…
## $ PERC_DRIVERS_NO_ACC   <int> 80, 94, 96, 95, 89, 95, 82, 99, 100, 94, 93, 87,…
## $ INS_PREM              <dbl> 784.55, 1053.48, 899.47, 827.34, 878.41, 835.50,…
## $ LOSS_INSCOMP          <dbl> 145.08, 133.93, 110.35, 142.39, 165.63, 139.91, …
## $ DRIVERS_SPEED         <dbl> 7.332, 7.421, 6.510, 4.032, 4.200, 5.032, 4.968,…
## $ DRIVERS_ALCHO         <dbl> 5.640, 4.525, 5.208, 5.824, 3.360, 3.808, 3.888,…
## $ DRIVERS_NOT_DIST      <dbl> 18.048, 16.290, 15.624, 21.056, 10.920, 10.744, …
## $ DRIVERS_NO_ACC        <dbl> 15.040, 17.014, 17.856, 21.280, 10.680, 12.920, …
# Bar chart per state: all drivers involved in fatal collisions, with the
# portion who were speeding drawn on top.
#
# DRIVERS_SPEED is a part of DRIVERS_INVOLVED (it is computed as
# DRIVERS_INVOLVED * PERC_DRIVERS_SPEED / 100), so the two series must not be
# stacked: stacking makes every bar as tall as their sum and counts the
# speeding drivers twice. They are drawn as two overlaid layers instead, total
# first and subset second, so the full bar height equals DRIVERS_INVOLVED.

percent_worst_drivers %>%
  ggplot(aes(x = STATE)) +
  geom_col(aes(y = DRIVERS_INVOLVED, fill = "DRIVERS_INVOLVED")) +
  geom_col(aes(y = DRIVERS_SPEED, fill = "DRIVERS_SPEED")) +
  # Named values tie each color to its series rather than to the alphabetical
  # order of the series names.
  scale_fill_manual(
    values = c(DRIVERS_INVOLVED = "red", DRIVERS_SPEED = "darkred")
  ) +
  labs(
    y = "Drivers involved in Fatal collision while Speeding",
    fill = "type"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Bar chart per state: all drivers involved in fatal collisions, with the
# portion who were alcohol-impaired drawn on top.
#
# DRIVERS_ALCHO is a part of DRIVERS_INVOLVED (it is computed as
# DRIVERS_INVOLVED * PERC_DRIVERS_ALCHO / 100), so the two series must not be
# stacked: stacking makes every bar as tall as their sum and counts the
# alcohol-impaired drivers twice. They are drawn as two overlaid layers
# instead, total first and subset second, so the full bar height equals
# DRIVERS_INVOLVED.

percent_worst_drivers %>%
  ggplot(aes(x = STATE)) +
  geom_col(aes(y = DRIVERS_INVOLVED, fill = "DRIVERS_INVOLVED")) +
  geom_col(aes(y = DRIVERS_ALCHO, fill = "DRIVERS_ALCHO")) +
  # Named values keep the total in the lighter shade and the subset in the
  # darker one; unnamed values would be matched alphabetically, which puts
  # DRIVERS_ALCHO ahead of DRIVERS_INVOLVED and swaps the two shades.
  scale_fill_manual(
    values = c(DRIVERS_INVOLVED = "green", DRIVERS_ALCHO = "darkgreen")
  ) +
  labs(
    y = "Drivers involved in Fatal collision while Alcho-Impaired",
    fill = "type"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Bar chart per state: all drivers involved in fatal collisions, with the
# portion who were NOT distracted drawn on top.
#
# DRIVERS_NOT_DIST is a part of DRIVERS_INVOLVED (it is computed as
# DRIVERS_INVOLVED * PERC_DRIVERS_NOT_DIST / 100), so the two series must not
# be stacked: stacking makes every bar as tall as their sum and counts the
# non-distracted drivers twice. They are drawn as two overlaid layers instead,
# total first and subset second, so the full bar height equals
# DRIVERS_INVOLVED.

percent_worst_drivers %>%
  ggplot(aes(x = STATE)) +
  geom_col(aes(y = DRIVERS_INVOLVED, fill = "DRIVERS_INVOLVED")) +
  geom_col(aes(y = DRIVERS_NOT_DIST, fill = "DRIVERS_NOT_DIST")) +
  # Named values tie each color to its series rather than to the alphabetical
  # order of the series names.
  scale_fill_manual(
    values = c(DRIVERS_INVOLVED = "lightyellow", DRIVERS_NOT_DIST = "yellow")
  ) +
  labs(
    y = "Drivers involved in Fatal collision not distracted",
    fill = "type"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Bar chart per state: all drivers involved in fatal collisions, with the
# portion who had no previous accident drawn on top.
#
# DRIVERS_NO_ACC is a part of DRIVERS_INVOLVED (it is computed as
# DRIVERS_INVOLVED * PERC_DRIVERS_NO_ACC / 100), so the two series must not be
# stacked: stacking makes every bar as tall as their sum and counts those
# drivers twice. They are drawn as two overlaid layers instead, total first
# and subset second, so the full bar height equals DRIVERS_INVOLVED.

percent_worst_drivers %>%
  ggplot(aes(x = STATE)) +
  geom_col(aes(y = DRIVERS_INVOLVED, fill = "DRIVERS_INVOLVED")) +
  geom_col(aes(y = DRIVERS_NO_ACC, fill = "DRIVERS_NO_ACC")) +
  # Named values tie each color to its series rather than to the alphabetical
  # order of the series names.
  scale_fill_manual(
    values = c(DRIVERS_INVOLVED = "blue", DRIVERS_NO_ACC = "darkblue")
  ) +
  labs(
    y = "Drivers involved in Fatal collision no pre accident",
    fill = "type"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Bar chart of the car insurance premium (INS_PREM) for every state, one bar
# per state, with the state names rotated to stay readable.

ggplot(percent_worst_drivers, aes(x = STATE, y = INS_PREM)) +
  geom_col() +
  labs(y = "Car Insurance Premium") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))