library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(dplyr)
library(infer)
library(forcats)
# Location of the raw bad-drivers CSV in the fivethirtyeight/data repository.
theUrl <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/bad-drivers/bad-drivers.csv"

# Read the comma-separated file, using its first row as column names.
# read.table() defaults to quote = "\"'" and comment.char = "#", which do not
# suit CSV: an apostrophe or a '#' inside a field would be misparsed. Quoting
# is restricted to double quotes and comment handling is switched off.
worst_drivers <- read.table(
  file = theUrl,
  header = TRUE,
  sep = ",",
  quote = "\"",
  comment.char = ""
)

# Preview the first rows of the loaded data.
head(worst_drivers)
##        State Number.of.drivers.involved.in.fatal.collisions.per.billion.miles
## 1    Alabama                                                             18.8
## 2     Alaska                                                             18.1
## 3    Arizona                                                             18.6
## 4   Arkansas                                                             22.4
## 5 California                                                             12.0
## 6   Colorado                                                             13.6
##   Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding
## 1                                                                   39
## 2                                                                   41
## 3                                                                   35
## 4                                                                   18
## 5                                                                   35
## 6                                                                   37
##   Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired
## 1                                                                           30
## 2                                                                           25
## 3                                                                           28
## 4                                                                           26
## 5                                                                           28
## 6                                                                           28
##   Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted
## 1                                                                         96
## 2                                                                         90
## 3                                                                         84
## 4                                                                         94
## 5                                                                         91
## 6                                                                         79
##   Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents
## 1                                                                                                     80
## 2                                                                                                     94
## 3                                                                                                     96
## 4                                                                                                     95
## 5                                                                                                     89
## 6                                                                                                     95
##   Car.Insurance.Premiums....
## 1                     784.55
## 2                    1053.48
## 3                     899.47
## 4                     827.34
## 5                     878.41
## 6                     835.50
##   Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver....
## 1                                                                       145.08
## 2                                                                       133.93
## 3                                                                       110.35
## 4                                                                       142.39
## 5                                                                       165.63
## 6                                                                       139.91

RESEARCH QUESTION

Which state has the worst drivers? Can we predict the state’s average insurance premiums by looking at the driving record of that particular state?

CASES

Each case represents one of the 50 U.S. states or the District of Columbia. The data set contains 51 observations (rows) and 8 variables (state, num_drivers, perc_speeding, perc_alcohol_impaired, perc_not_distracted, perc_no_previous, insurance_premiums, losses).

# Show the number of rows and columns and the type of every column.
glimpse(worst_drivers)
## Rows: 51
## Columns: 8
## $ State                                                                                                  <chr> …
## $ Number.of.drivers.involved.in.fatal.collisions.per.billion.miles                                       <dbl> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding                                   <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired                           <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted                             <int> …
## $ Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents <int> …
## $ Car.Insurance.Premiums....                                                                             <dbl> …
## $ Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver....                           <dbl> …

DATA COLLECTION

The data were collected by the National Highway Traffic Safety Administration (NHTSA) in 2009 and 2012 (https://www-fars.nhtsa.dot.gov/Main/index.aspx) and by the National Association of Insurance Commissioners in 2010 and 2011 (website was not provided).

TYPE OF STUDY

This is an observational study.

DATA SOURCE

The data were compiled by Mona Chalabi of FiveThirtyEight and are available online here: https://github.com/fivethirtyeight/data/tree/master/bad-drivers. For this project, the data were loaded directly from the raw file URL using the read.table() function.

RESPONSE

The response variables are Car.Insurance.Premiums and Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver. Both are numerical.

EXPLANATORY

The explanatory variables are State, Number.of.drivers.involved.in.fatal.collisions.per.billion.miles, Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding, Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired, Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted, and Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents. All are numerical except State, which is a character variable.

RELEVANT SUMMARY STATISTICS

# List the current column names of the data frame.
colnames(worst_drivers)
## [1] "State"                                                                                                 
## [2] "Number.of.drivers.involved.in.fatal.collisions.per.billion.miles"                                      
## [3] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding"                                  
## [4] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired"                          
## [5] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted"                            
## [6] "Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents"
## [7] "Car.Insurance.Premiums...."                                                                            
## [8] "Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver...."
# Replace the long column names produced on import with short upper-case
# names. Columns are matched by their current name rather than by position, so
# a change in the column order of the source file cannot silently mislabel a
# column; rename() stops with an error if an expected column is missing.
worst_drivers <- worst_drivers %>%
  rename(
    STATE = State,
    DRIVERS_INVOLVED =
      Number.of.drivers.involved.in.fatal.collisions.per.billion.miles,
    PERC_DRIVERS_SPEED =
      Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Speeding,
    PERC_DRIVERS_ALCHO =
      Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Alcohol.Impaired,
    PERC_DRIVERS_NOT_DIST =
      Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Were.Not.Distracted,
    PERC_DRIVERS_NO_ACC =
      Percentage.Of.Drivers.Involved.In.Fatal.Collisions.Who.Had.Not.Been.Involved.In.Any.Previous.Accidents,
    INS_PREM = Car.Insurance.Premiums....,
    LOSS_INSCOMP =
      Losses.incurred.by.insurance.companies.for.collisions.per.insured.driver....
  )

# Confirm the new column names and their types.
glimpse(worst_drivers)
## Rows: 51
## Columns: 8
## $ STATE                 <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cal…
## $ DRIVERS_INVOLVED      <dbl> 18.8, 18.1, 18.6, 22.4, 12.0, 13.6, 10.8, 16.2, …
## $ PERC_DRIVERS_SPEED    <int> 39, 41, 35, 18, 35, 37, 46, 38, 34, 21, 19, 54, …
## $ PERC_DRIVERS_ALCHO    <int> 30, 25, 28, 26, 28, 28, 36, 30, 27, 29, 25, 41, …
## $ PERC_DRIVERS_NOT_DIST <int> 96, 90, 84, 94, 91, 79, 87, 87, 100, 92, 95, 82,…
## $ PERC_DRIVERS_NO_ACC   <int> 80, 94, 96, 95, 89, 95, 82, 99, 100, 94, 93, 87,…
## $ INS_PREM              <dbl> 784.55, 1053.48, 899.47, 827.34, 878.41, 835.50,…
## $ LOSS_INSCOMP          <dbl> 145.08, 133.93, 110.35, 142.39, 165.63, 139.91, …
# Turn each percentage column into a rate on the same scale as
# DRIVERS_INVOLVED (rate = DRIVERS_INVOLVED * percentage / 100). Four columns
# are added: DRIVERS_SPEED, DRIVERS_ALCHO, DRIVERS_NOT_DIST and DRIVERS_NO_ACC.
percent_worst_drivers <- mutate(
  worst_drivers,
  DRIVERS_SPEED = DRIVERS_INVOLVED * PERC_DRIVERS_SPEED / 100,
  DRIVERS_ALCHO = DRIVERS_INVOLVED * PERC_DRIVERS_ALCHO / 100,
  DRIVERS_NOT_DIST = DRIVERS_INVOLVED * PERC_DRIVERS_NOT_DIST / 100,
  DRIVERS_NO_ACC = DRIVERS_INVOLVED * PERC_DRIVERS_NO_ACC / 100
)

# Check that the four derived columns were appended.
glimpse(percent_worst_drivers)
## Rows: 51
## Columns: 12
## $ STATE                 <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cal…
## $ DRIVERS_INVOLVED      <dbl> 18.8, 18.1, 18.6, 22.4, 12.0, 13.6, 10.8, 16.2, …
## $ PERC_DRIVERS_SPEED    <int> 39, 41, 35, 18, 35, 37, 46, 38, 34, 21, 19, 54, …
## $ PERC_DRIVERS_ALCHO    <int> 30, 25, 28, 26, 28, 28, 36, 30, 27, 29, 25, 41, …
## $ PERC_DRIVERS_NOT_DIST <int> 96, 90, 84, 94, 91, 79, 87, 87, 100, 92, 95, 82,…
## $ PERC_DRIVERS_NO_ACC   <int> 80, 94, 96, 95, 89, 95, 82, 99, 100, 94, 93, 87,…
## $ INS_PREM              <dbl> 784.55, 1053.48, 899.47, 827.34, 878.41, 835.50,…
## $ LOSS_INSCOMP          <dbl> 145.08, 133.93, 110.35, 142.39, 165.63, 139.91, …
## $ DRIVERS_SPEED         <dbl> 7.332, 7.421, 6.510, 4.032, 4.200, 5.032, 4.968,…
## $ DRIVERS_ALCHO         <dbl> 5.640, 4.525, 5.208, 5.824, 3.360, 3.808, 3.888,…
## $ DRIVERS_NOT_DIST      <dbl> 18.048, 16.290, 15.624, 21.056, 10.920, 10.744, …
## $ DRIVERS_NO_ACC        <dbl> 15.040, 17.014, 17.856, 21.280, 10.680, 12.920, …
# Horizontal bar chart, one bar per state: all drivers involved in fatal
# collisions (full bar) with the speeding drivers drawn over it from the same
# baseline.
#
# DRIVERS_SPEED is computed as a share of DRIVERS_INVOLVED, so stacking the two
# would count the speeding drivers twice and inflate every bar. The two series
# are therefore drawn as separate layers that overlap instead of stacking.
# States are sorted by DRIVERS_INVOLVED, i.e. by total bar length.
percent_worst_drivers %>%
  mutate(STATE = fct_reorder(STATE, DRIVERS_INVOLVED)) %>%
  ggplot(aes(x = STATE)) +
  # Layer order matters: the total is drawn first, the subset on top of it.
  geom_col(aes(y = DRIVERS_INVOLVED, fill = "DRIVERS_INVOLVED")) +
  geom_col(aes(y = DRIVERS_SPEED, fill = "DRIVERS_SPEED")) +
  # Colours are assigned by name so they do not depend on alphabetical order.
  scale_fill_manual(
    name = "type",
    values = c(DRIVERS_INVOLVED = "red", DRIVERS_SPEED = "darkred")
  ) +
  ylab("Drivers involved in Fatal collision while Speeding") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  coord_flip()

# Horizontal bar chart, one bar per state: all drivers involved in fatal
# collisions (full bar) with the alcohol-impaired drivers drawn over it from
# the same baseline.
#
# DRIVERS_ALCHO is computed as a share of DRIVERS_INVOLVED, so stacking the two
# would count the alcohol-impaired drivers twice and inflate every bar. The two
# series are therefore drawn as separate layers that overlap instead of
# stacking. States are sorted by DRIVERS_INVOLVED, i.e. by total bar length.
percent_worst_drivers %>%
  mutate(STATE = fct_reorder(STATE, DRIVERS_INVOLVED)) %>%
  ggplot(aes(x = STATE)) +
  # Layer order matters: the total is drawn first, the subset on top of it.
  geom_col(aes(y = DRIVERS_INVOLVED, fill = "DRIVERS_INVOLVED")) +
  geom_col(aes(y = DRIVERS_ALCHO, fill = "DRIVERS_ALCHO")) +
  # Colours are assigned by name. Positional assignment follows alphabetical
  # order ("DRIVERS_ALCHO" sorts before "DRIVERS_INVOLVED") and would give the
  # darker shade to the total; here the subset gets the darker shade.
  scale_fill_manual(
    name = "type",
    values = c(DRIVERS_INVOLVED = "green", DRIVERS_ALCHO = "darkgreen")
  ) +
  ylab("Drivers involved in Fatal collision while Alcho-Impaired") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  coord_flip()

# Horizontal bar chart, one bar per state: all drivers involved in fatal
# collisions (full bar) with the drivers who were not distracted drawn over it
# from the same baseline.
#
# DRIVERS_NOT_DIST is computed as a share of DRIVERS_INVOLVED, so stacking the
# two would count the non-distracted drivers twice and inflate every bar. The
# two series are therefore drawn as separate layers that overlap instead of
# stacking. States are sorted by DRIVERS_INVOLVED, i.e. by total bar length.
percent_worst_drivers %>%
  mutate(STATE = fct_reorder(STATE, DRIVERS_INVOLVED)) %>%
  ggplot(aes(x = STATE)) +
  # Layer order matters: the total is drawn first, the subset on top of it.
  geom_col(aes(y = DRIVERS_INVOLVED, fill = "DRIVERS_INVOLVED")) +
  geom_col(aes(y = DRIVERS_NOT_DIST, fill = "DRIVERS_NOT_DIST")) +
  # Colours are assigned by name so they do not depend on alphabetical order.
  scale_fill_manual(
    name = "type",
    values = c(DRIVERS_INVOLVED = "lightyellow", DRIVERS_NOT_DIST = "yellow")
  ) +
  ylab("Drivers involved in Fatal collision not distracted") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  coord_flip()

# Horizontal bar chart, one bar per state: all drivers involved in fatal
# collisions (full bar) with the drivers who had no previous accidents drawn
# over it from the same baseline.
#
# DRIVERS_NO_ACC is computed as a share of DRIVERS_INVOLVED, so stacking the
# two would count those drivers twice and inflate every bar. The two series are
# therefore drawn as separate layers that overlap instead of stacking.
# States are sorted by DRIVERS_INVOLVED, i.e. by total bar length.
percent_worst_drivers %>%
  mutate(STATE = fct_reorder(STATE, DRIVERS_INVOLVED)) %>%
  ggplot(aes(x = STATE)) +
  # Layer order matters: the total is drawn first, the subset on top of it.
  geom_col(aes(y = DRIVERS_INVOLVED, fill = "DRIVERS_INVOLVED")) +
  geom_col(aes(y = DRIVERS_NO_ACC, fill = "DRIVERS_NO_ACC")) +
  # Colours are assigned by name so they do not depend on alphabetical order.
  scale_fill_manual(
    name = "type",
    values = c(DRIVERS_INVOLVED = "blue", DRIVERS_NO_ACC = "darkblue")
  ) +
  ylab("Drivers involved in Fatal collision no pre accident") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  coord_flip()

# Horizontal bar chart of the car insurance premium for every state, with the
# states ordered by premium and each bar filled by state.
ggplot(
  data = mutate(percent_worst_drivers, STATE = fct_reorder(STATE, INS_PREM)),
  mapping = aes(x = STATE, y = INS_PREM, fill = STATE)
) +
  geom_col(position = "stack") +
  labs(y = "Car Insurance Premium") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  coord_flip()