







#################################################################
# Import dataset ####
#################################################################
data_input<- Groupproject
#################################################################
# Analysis Data ####
#################################################################
summary(data_input)
## state st_case ve_total ve_forms
## Min. : 1.00 Min. : 10001 Min. : 1.000 Min. : 1.000
## 1st Qu.:12.00 1st Qu.:122120 1st Qu.: 1.000 1st Qu.: 1.000
## Median :28.00 Median :280279 Median : 1.000 Median : 1.000
## Mean :27.95 Mean :280092 Mean : 1.522 Mean : 1.486
## 3rd Qu.:42.00 3rd Qu.:420723 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :56.00 Max. :560139 Max. :90.000 Max. :90.000
## NA's :1833
## county city day month
## Min. : 0.00 Min. : 0 Min. : 1.00 Min. : 1.000
## 1st Qu.: 31.00 1st Qu.: 0 1st Qu.: 8.00 1st Qu.: 4.000
## Median : 71.00 Median : 0 Median :16.00 Median : 7.000
## Mean : 91.54 Mean :1194 Mean :15.67 Mean : 6.754
## 3rd Qu.:115.00 3rd Qu.:1940 3rd Qu.:23.00 3rd Qu.:10.000
## Max. :997.00 Max. :9997 Max. :31.00 Max. :12.000
## NA's :1838 NA's :1891 NA's :1833 NA's :1833
## year date_SIF day_week hour minute
## Min. :2010 Min. :18263 Min. :1.000 Min. : 0.00 Min. : 0.0
## 1st Qu.:2011 1st Qu.:18735 1st Qu.:2.000 1st Qu.: 7.00 1st Qu.:14.0
## Median :2012 Median :19182 Median :4.000 Median :14.00 Median :29.0
## Mean :2012 Mean :19182 Mean :4.144 Mean :12.63 Mean :28.4
## 3rd Qu.:2013 3rd Qu.:19632 3rd Qu.:6.000 3rd Qu.:18.00 3rd Qu.:44.0
## Max. :2014 Max. :20088 Max. :7.000 Max. :23.00 Max. :59.0
## NA's :1833 NA's :1833 NA's :1833 NA's :2940 NA's :2961
## nhs latitude longitud lgt_cond
## Min. :0.0000 Min. :18.96 Min. :-165.41 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:33.26 1st Qu.: -97.72 1st Qu.:1.000
## Median :0.0000 Median :36.58 Median : -87.74 Median :2.000
## Mean :0.3267 Mean :36.72 Mean : -91.90 Mean :1.827
## 3rd Qu.:1.0000 3rd Qu.:40.66 3rd Qu.: -81.33 3rd Qu.:2.000
## Max. :1.0000 Max. :68.43 Max. : -66.99 Max. :7.000
## NA's :2321 NA's :3481 NA's :3481 NA's :2485
## weather fatals drunk_dr total_numoccs
## Min. : 1.000 Min. : 1.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.: 1.000
## Median : 1.000 Median : 1.000 Median :0.0000 Median : 2.000
## Mean : 2.588 Mean : 1.089 Mean :0.3156 Mean : 2.269
## 3rd Qu.: 2.000 3rd Qu.: 1.000 3rd Qu.:1.0000 3rd Qu.: 3.000
## Max. :12.000 Max. :15.000 Max. :4.0000 Max. :127.000
## NA's :3040 NA's :1833 NA's :1833 NA's :921
## total_hit_run total_registered_owner total_not_registered total_other_owner
## Min. :0.0000 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median :0.0000 Median : 1.00 Median : 0.00 Median : 0.000
## Mean :0.0474 Mean : 0.84 Mean : 0.43 Mean : 0.215
## 3rd Qu.:0.0000 3rd Qu.: 1.00 3rd Qu.: 1.00 3rd Qu.: 0.000
## Max. :1.0000 Max. :49.00 Max. :17.00 Max. :23.000
## NA's :806 NA's :3837 NA's :3837 NA's :3837
## total_fire_exp total_valid_license total_invalid_license no_prev_acc
## Min. :0.0000 Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.: 1.000
## Median :0.0000 Median : 1.000 Median :0.0000 Median : 1.000
## Mean :0.0405 Mean : 1.299 Mean :0.1734 Mean : 1.284
## 3rd Qu.:0.0000 3rd Qu.: 2.000 3rd Qu.:0.0000 3rd Qu.: 2.000
## Max. :1.0000 Max. :83.000 Max. :5.0000 Max. :74.000
## NA's :694 NA's :4163 NA's :694 NA's :15103
## one_prev_acc two_prev_acc no_prev_sus one_prev_sus
## Min. : 0.000 Min. :0.000 Min. : 0.000 Min. :0.000
## 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.: 1.000 1st Qu.:0.000
## Median : 0.000 Median :0.000 Median : 1.000 Median :0.000
## Mean : 0.153 Mean :0.069 Mean : 1.266 Mean :0.102
## 3rd Qu.: 0.000 3rd Qu.:0.000 3rd Qu.: 2.000 3rd Qu.:0.000
## Max. :12.000 Max. :7.000 Max. :80.000 Max. :6.000
## NA's :15103 NA's :15103 NA's :4997 NA's :4997
## two_prev_sus no_prev_dwi one_prev_dwi no_prev_spd
## Min. :0.000 Min. : 0.000 Min. :0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.: 1.000 1st Qu.:0.000 1st Qu.: 1.000
## Median :0.000 Median : 1.000 Median :0.000 Median : 1.000
## Mean :0.131 Mean : 1.436 Mean :0.033 Mean : 1.227
## 3rd Qu.:0.000 3rd Qu.: 2.000 3rd Qu.:0.000 3rd Qu.: 2.000
## Max. :6.000 Max. :88.000 Max. :2.000 Max. :76.000
## NA's :4997 NA's :4997 NA's :4997 NA's :4997
## one_prev_spd no_prev_oth one_prev_oth speed_related
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :0.000
## 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.: 0.000 1st Qu.:0.000
## Median : 0.000 Median : 1.000 Median : 0.000 Median :0.000
## Mean : 0.183 Mean : 1.222 Mean : 0.174 Mean :0.235
## 3rd Qu.: 0.000 3rd Qu.: 2.000 3rd Qu.: 0.000 3rd Qu.:0.000
## Max. :11.000 Max. :71.000 Max. :12.000 Max. :1.000
## NA's :4997 NA's :4997 NA's :4997 NA's :6503
## dr_age_lower16 dr_age_lower18 dr_age_lower21 dr_age_lower30
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000 Median :0.000
## Mean :0.005 Mean :0.035 Mean :0.105 Mean :0.316
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.000
## Max. :2.000 Max. :3.000 Max. :5.000 Max. :9.000
## NA's :4844 NA's :4844 NA's :4844 NA's :4844
## dr_age_lower65 dr_age_65over dr_male dr_female
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.: 0.000
## Median : 1.000 Median : 0.000 Median : 1.000 Median : 0.000
## Mean : 0.826 Mean : 0.194 Mean : 1.094 Mean : 0.387
## 3rd Qu.: 1.000 3rd Qu.: 0.000 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :64.000 Max. :11.000 Max. :67.000 Max. :23.000
## NA's :4844 NA's :4844 NA's :4666 NA's :4666
## drugs_inv dr_alcohol_drug_med dr_other_impair total_moving_violations
## Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.0000
## Median :0.00 Median :0.00 Median :0.00 Median :0.0000
## Mean :0.25 Mean :0.27 Mean :0.07 Mean :0.1512
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:0.00 3rd Qu.:0.0000
## Max. :1.00 Max. :1.00 Max. :1.00 Max. :1.0000
## NA's :101756 NA's :40450 NA's :40450 NA's :3152
## nm_alcohol_drug_med nm_other_impair nm_involved pernotmvit
## Min. : 0.0 Min. : 0.00 Min. :0.0000 Min. : 0.00
## 1st Qu.:99.0 1st Qu.:99.00 1st Qu.:0.0000 1st Qu.: 0.00
## Median :99.0 Median :99.00 Median :0.0000 Median : 0.00
## Mean :87.3 Mean :87.28 Mean :0.1793 Mean : 0.21
## 3rd Qu.:99.0 3rd Qu.:99.00 3rd Qu.:0.0000 3rd Qu.: 0.00
## Max. :99.0 Max. :99.00 Max. :1.0000 Max. :21.00
## NA's :10572 NA's :10572 NA's :32129
## permvit
## Min. : 0.00
## 1st Qu.: 1.00
## Median : 2.00
## Mean : 2.25
## 3rd Qu.: 3.00
## Max. :127.00
## NA's :32129
str(data_input)
## 'data.frame': 153260 obs. of 58 variables:
## $ state : num 1 1 1 1 1 1 1 1 1 1 ...
## $ st_case : num 10001 10002 10003 10004 10005 ...
## $ ve_total : num 1 1 3 2 2 1 1 1 1 2 ...
## $ ve_forms : num 1 1 3 2 2 1 1 1 1 2 ...
## $ county : num 81 35 97 115 117 21 97 9 51 113 ...
## $ city : num 2340 1080 2100 160 0 0 0 0 0 0 ...
## $ day : num 15 11 14 21 4 1 1 2 2 3 ...
## $ month : num 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ date_SIF : num 18277 18273 18276 18283 18266 ...
## $ day_week : num 6 2 5 5 2 6 6 7 7 1 ...
## $ hour : num 4 6 15 1 6 5 2 15 9 13 ...
## $ minute : num 10 0 10 16 38 34 40 37 55 50 ...
## $ nhs : num 1 0 0 0 1 1 0 0 0 0 ...
## $ latitude : num 32.6 31.4 30.7 33.9 33.3 ...
## $ longitud : num -85.4 -87 -88.1 -86.3 -86.8 ...
## $ lgt_cond : num 3 3 1 3 4 4 3 3 1 1 ...
## $ weather : num 1 1 1 2 1 2 2 1 1 1 ...
## $ fatals : num 1 1 1 1 1 1 2 1 1 1 ...
## $ drunk_dr : num 1 0 0 0 0 0 0 0 0 1 ...
## $ total_numoccs : num 2 1 3 2 2 1 2 4 2 4 ...
## $ total_hit_run : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_registered_owner : num 0 1 3 0 2 1 0 0 1 1 ...
## $ total_not_registered : num 1 0 0 0 0 0 1 1 0 0 ...
## $ total_other_owner : num 0 0 0 2 0 0 0 0 0 1 ...
## $ total_fire_exp : num 1 0 0 0 1 0 0 0 0 0 ...
## $ total_valid_license : num 1 1 3 2 2 1 1 1 1 1 ...
## $ total_invalid_license : num 0 0 0 0 0 0 0 0 0 1 ...
## $ no_prev_acc : num NA 1 1 1 2 1 0 1 1 2 ...
## $ one_prev_acc : num NA 0 2 1 0 0 0 0 0 0 ...
## $ two_prev_acc : num NA 0 0 0 0 0 1 0 0 0 ...
## $ no_prev_sus : num 0 1 3 2 2 1 1 1 1 1 ...
## $ one_prev_sus : num 0 0 0 0 0 0 0 0 0 1 ...
## $ two_prev_sus : num 1 0 0 0 0 0 0 0 0 0 ...
## $ no_prev_dwi : num 1 1 3 2 2 1 1 1 1 1 ...
## $ one_prev_dwi : num 0 0 0 0 0 0 0 0 0 1 ...
## $ no_prev_spd : num 0 1 3 2 1 1 0 1 1 1 ...
## $ one_prev_spd : num 0 0 0 0 0 0 1 0 0 1 ...
## $ no_prev_oth : num 1 1 2 2 2 1 1 1 1 2 ...
## $ one_prev_oth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ speed_related : num 0 0 0 0 0 0 0 0 1 0 ...
## $ dr_age_lower16 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dr_age_lower18 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dr_age_lower21 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ dr_age_lower30 : num 0 0 2 1 0 0 0 0 0 1 ...
## $ dr_age_lower65 : num 1 1 1 1 2 1 0 0 0 1 ...
## $ dr_age_65over : num 0 0 0 0 0 0 0 1 1 0 ...
## $ dr_male : num 0 0 1 2 2 1 0 1 0 2 ...
## $ dr_female : num 1 1 2 0 0 0 1 0 1 0 ...
## $ drugs_inv : num NA 0 NA NA NA 0 NA NA NA NA ...
## $ dr_alcohol_drug_med : num 0 0 0 0 NA 0 0 0 0 1 ...
## $ dr_other_impair : num 1 0 0 0 NA 1 1 0 1 0 ...
## $ total_moving_violations: num 0 0 0 0 0 0 0 0 0 1 ...
## $ nm_alcohol_drug_med : num 99 99 99 99 99 99 99 99 99 99 ...
## $ nm_other_impair : num 99 99 99 99 99 99 99 99 99 99 ...
## $ nm_involved : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pernotmvit : num NA NA NA NA NA NA NA NA NA NA ...
## $ permvit : num NA NA NA NA NA NA NA NA NA NA ...
#Find the proportion of missing data that occurs in each category of data
miss<-function(x){sum(is.na(x))/length(x)*100} #Create a function of calculate proportion
apply(data_input,2,miss) #Run the function with our data
## state st_case ve_total
## 0.0000000 0.0000000 1.1960068
## ve_forms county city
## 0.0000000 1.1992692 1.2338510
## day month year
## 1.1960068 1.1960068 1.1960068
## date_SIF day_week hour
## 1.1960068 1.1960068 1.9183088
## minute nhs latitude
## 1.9320110 1.5144199 2.2713037
## longitud lgt_cond weather
## 2.2713037 1.6214276 1.9835574
## fatals drunk_dr total_numoccs
## 1.1960068 1.1960068 0.6009396
## total_hit_run total_registered_owner total_not_registered
## 0.5259037 2.5035887 2.5035887
## total_other_owner total_fire_exp total_valid_license
## 2.5035887 0.4528253 2.7162991
## total_invalid_license no_prev_acc one_prev_acc
## 0.4528253 9.8544956 9.8544956
## two_prev_acc no_prev_sus one_prev_sus
## 9.8544956 3.2604724 3.2604724
## two_prev_sus no_prev_dwi one_prev_dwi
## 3.2604724 3.2604724 3.2604724
## no_prev_spd one_prev_spd no_prev_oth
## 3.2604724 3.2604724 3.2604724
## one_prev_oth speed_related dr_age_lower16
## 3.2604724 4.2431163 3.1606420
## dr_age_lower18 dr_age_lower21 dr_age_lower30
## 3.1606420 3.1606420 3.1606420
## dr_age_lower65 dr_age_65over dr_male
## 3.1606420 3.1606420 3.0444995
## dr_female drugs_inv dr_alcohol_drug_med
## 3.0444995 66.3943625 26.3930575
## dr_other_impair total_moving_violations nm_alcohol_drug_med
## 26.3930575 2.0566358 6.8980817
## nm_other_impair nm_involved pernotmvit
## 6.8980817 0.0000000 20.9637218
## permvit
## 20.9637218
#Remove the data with a missing ratio greater than 15%.
data <- dplyr::select(data_input,-permvit,-dr_other_impair,-drugs_inv,-pernotmvit )
#Clean all NA value
data <- data[complete.cases(data),]
#################################################################
# Create subset for regration analysis ####
#################################################################
#Create subset for weather and light
data_r<- dummy_cols(data,select_columns = "weather") # 1 = clean, 5 = Snow
data_r <- dummy_cols(data_r,select_columns = "lgt_cond") # 1 = Daylight; 2 = Dark - Not lighted; 3 = Dark - Lighted;
data_r$Valid_license_ratio <- data_r$total_valid_license/data_r$ve_total*100
data_r$no_prev_spd_ratio <- data_r$no_prev_spd/data_r$ve_total*100
str(data_r)
## 'data.frame': 92034 obs. of 74 variables:
## $ state : num 1 1 1 1 1 1 1 1 1 1 ...
## $ st_case : num 10002 10003 10004 10006 10007 ...
## $ ve_total : num 1 3 2 1 1 1 1 2 1 1 ...
## $ ve_forms : num 1 3 2 1 1 1 1 2 1 1 ...
## $ county : num 35 97 115 21 97 9 51 113 95 91 ...
## $ city : num 1080 2100 160 0 0 0 0 0 0 0 ...
## $ day : num 11 14 21 1 1 2 2 3 4 4 ...
## $ month : num 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ date_SIF : num 18273 18276 18283 18263 18263 ...
## $ day_week : num 2 5 5 6 6 7 7 1 2 2 ...
## $ hour : num 6 15 1 5 2 15 9 13 5 16 ...
## $ minute : num 0 10 16 34 40 37 55 50 50 20 ...
## $ nhs : num 0 0 0 1 0 0 0 0 0 0 ...
## $ latitude : num 31.4 30.7 33.9 32.8 30.5 ...
## $ longitud : num -87 -88.1 -86.3 -86.6 -88.3 ...
## $ lgt_cond : num 3 1 3 4 3 3 1 1 4 1 ...
## $ weather : num 1 1 2 2 2 1 1 1 1 1 ...
## $ fatals : num 1 1 1 1 2 1 1 1 1 1 ...
## $ drunk_dr : num 0 0 0 0 0 0 0 1 0 1 ...
## $ total_numoccs : num 1 3 2 1 2 4 2 4 1 3 ...
## $ total_hit_run : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_registered_owner : num 1 3 0 1 0 0 1 1 0 1 ...
## $ total_not_registered : num 0 0 0 0 1 1 0 0 0 0 ...
## $ total_other_owner : num 0 0 2 0 0 0 0 1 1 0 ...
## $ total_fire_exp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_valid_license : num 1 3 2 1 1 1 1 1 1 1 ...
## $ total_invalid_license : num 0 0 0 0 0 0 0 1 0 0 ...
## $ no_prev_acc : num 1 1 1 1 0 1 1 2 1 1 ...
## $ one_prev_acc : num 0 2 1 0 0 0 0 0 0 0 ...
## $ two_prev_acc : num 0 0 0 0 1 0 0 0 0 0 ...
## $ no_prev_sus : num 1 3 2 1 1 1 1 1 1 1 ...
## $ one_prev_sus : num 0 0 0 0 0 0 0 1 0 0 ...
## $ two_prev_sus : num 0 0 0 0 0 0 0 0 0 0 ...
## $ no_prev_dwi : num 1 3 2 1 1 1 1 1 1 1 ...
## $ one_prev_dwi : num 0 0 0 0 0 0 0 1 0 0 ...
## $ no_prev_spd : num 1 3 2 1 0 1 1 1 0 0 ...
## $ one_prev_spd : num 0 0 0 0 1 0 0 1 1 1 ...
## $ no_prev_oth : num 1 2 2 1 1 1 1 2 1 1 ...
## $ one_prev_oth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ speed_related : num 0 0 0 0 0 0 1 0 1 1 ...
## $ dr_age_lower16 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dr_age_lower18 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dr_age_lower21 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ dr_age_lower30 : num 0 2 1 0 0 0 0 1 1 1 ...
## $ dr_age_lower65 : num 1 1 1 1 0 0 0 1 0 0 ...
## $ dr_age_65over : num 0 0 0 0 0 1 1 0 0 0 ...
## $ dr_male : num 0 1 2 1 0 1 0 2 1 1 ...
## $ dr_female : num 1 2 0 0 1 0 1 0 0 0 ...
## $ dr_alcohol_drug_med : num 0 0 0 0 0 0 0 1 0 1 ...
## $ total_moving_violations: num 0 0 0 0 0 0 0 1 0 1 ...
## $ nm_alcohol_drug_med : num 99 99 99 99 99 99 99 99 99 99 ...
## $ nm_other_impair : num 99 99 99 99 99 99 99 99 99 99 ...
## $ nm_involved : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_1 : int 1 1 0 0 0 1 1 1 1 1 ...
## $ weather_2 : int 0 0 1 1 1 0 0 0 0 0 ...
## $ weather_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_10 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_12 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lgt_cond_1 : int 0 1 0 0 0 0 1 1 0 1 ...
## $ lgt_cond_2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lgt_cond_3 : int 1 0 1 0 1 1 0 0 0 0 ...
## $ lgt_cond_4 : int 0 0 0 1 0 0 0 0 1 0 ...
## $ lgt_cond_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lgt_cond_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lgt_cond_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Valid_license_ratio : num 100 100 100 100 100 100 100 50 100 100 ...
## $ no_prev_spd_ratio : num 100 100 100 100 0 100 100 50 0 0 ...
#################################################################
# correlation coefficient matrix ####
#################################################################
str(data_r)
## 'data.frame': 92034 obs. of 74 variables:
## $ state : num 1 1 1 1 1 1 1 1 1 1 ...
## $ st_case : num 10002 10003 10004 10006 10007 ...
## $ ve_total : num 1 3 2 1 1 1 1 2 1 1 ...
## $ ve_forms : num 1 3 2 1 1 1 1 2 1 1 ...
## $ county : num 35 97 115 21 97 9 51 113 95 91 ...
## $ city : num 1080 2100 160 0 0 0 0 0 0 0 ...
## $ day : num 11 14 21 1 1 2 2 3 4 4 ...
## $ month : num 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ date_SIF : num 18273 18276 18283 18263 18263 ...
## $ day_week : num 2 5 5 6 6 7 7 1 2 2 ...
## $ hour : num 6 15 1 5 2 15 9 13 5 16 ...
## $ minute : num 0 10 16 34 40 37 55 50 50 20 ...
## $ nhs : num 0 0 0 1 0 0 0 0 0 0 ...
## $ latitude : num 31.4 30.7 33.9 32.8 30.5 ...
## $ longitud : num -87 -88.1 -86.3 -86.6 -88.3 ...
## $ lgt_cond : num 3 1 3 4 3 3 1 1 4 1 ...
## $ weather : num 1 1 2 2 2 1 1 1 1 1 ...
## $ fatals : num 1 1 1 1 2 1 1 1 1 1 ...
## $ drunk_dr : num 0 0 0 0 0 0 0 1 0 1 ...
## $ total_numoccs : num 1 3 2 1 2 4 2 4 1 3 ...
## $ total_hit_run : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_registered_owner : num 1 3 0 1 0 0 1 1 0 1 ...
## $ total_not_registered : num 0 0 0 0 1 1 0 0 0 0 ...
## $ total_other_owner : num 0 0 2 0 0 0 0 1 1 0 ...
## $ total_fire_exp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_valid_license : num 1 3 2 1 1 1 1 1 1 1 ...
## $ total_invalid_license : num 0 0 0 0 0 0 0 1 0 0 ...
## $ no_prev_acc : num 1 1 1 1 0 1 1 2 1 1 ...
## $ one_prev_acc : num 0 2 1 0 0 0 0 0 0 0 ...
## $ two_prev_acc : num 0 0 0 0 1 0 0 0 0 0 ...
## $ no_prev_sus : num 1 3 2 1 1 1 1 1 1 1 ...
## $ one_prev_sus : num 0 0 0 0 0 0 0 1 0 0 ...
## $ two_prev_sus : num 0 0 0 0 0 0 0 0 0 0 ...
## $ no_prev_dwi : num 1 3 2 1 1 1 1 1 1 1 ...
## $ one_prev_dwi : num 0 0 0 0 0 0 0 1 0 0 ...
## $ no_prev_spd : num 1 3 2 1 0 1 1 1 0 0 ...
## $ one_prev_spd : num 0 0 0 0 1 0 0 1 1 1 ...
## $ no_prev_oth : num 1 2 2 1 1 1 1 2 1 1 ...
## $ one_prev_oth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ speed_related : num 0 0 0 0 0 0 1 0 1 1 ...
## $ dr_age_lower16 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dr_age_lower18 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dr_age_lower21 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ dr_age_lower30 : num 0 2 1 0 0 0 0 1 1 1 ...
## $ dr_age_lower65 : num 1 1 1 1 0 0 0 1 0 0 ...
## $ dr_age_65over : num 0 0 0 0 0 1 1 0 0 0 ...
## $ dr_male : num 0 1 2 1 0 1 0 2 1 1 ...
## $ dr_female : num 1 2 0 0 1 0 1 0 0 0 ...
## $ dr_alcohol_drug_med : num 0 0 0 0 0 0 0 1 0 1 ...
## $ total_moving_violations: num 0 0 0 0 0 0 0 1 0 1 ...
## $ nm_alcohol_drug_med : num 99 99 99 99 99 99 99 99 99 99 ...
## $ nm_other_impair : num 99 99 99 99 99 99 99 99 99 99 ...
## $ nm_involved : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_1 : int 1 1 0 0 0 1 1 1 1 1 ...
## $ weather_2 : int 0 0 1 1 1 0 0 0 0 0 ...
## $ weather_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_10 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather_12 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lgt_cond_1 : int 0 1 0 0 0 0 1 1 0 1 ...
## $ lgt_cond_2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lgt_cond_3 : int 1 0 1 0 1 1 0 0 0 0 ...
## $ lgt_cond_4 : int 0 0 0 1 0 0 0 0 1 0 ...
## $ lgt_cond_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lgt_cond_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lgt_cond_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Valid_license_ratio : num 100 100 100 100 100 100 100 50 100 100 ...
## $ no_prev_spd_ratio : num 100 100 100 100 0 100 100 50 0 0 ...
data_c <- dplyr::select(data_r,speed_related,drunk_dr,Valid_license_ratio,no_prev_sus,no_prev_dwi,no_prev_spd,dr_age_65over,lgt_cond_1,lgt_cond_2)
str(data_c)
## 'data.frame': 92034 obs. of 9 variables:
## $ speed_related : num 0 0 0 0 0 0 1 0 1 1 ...
## $ drunk_dr : num 0 0 0 0 0 0 0 1 0 1 ...
## $ Valid_license_ratio: num 100 100 100 100 100 100 100 50 100 100 ...
## $ no_prev_sus : num 1 3 2 1 1 1 1 1 1 1 ...
## $ no_prev_dwi : num 1 3 2 1 1 1 1 1 1 1 ...
## $ no_prev_spd : num 1 3 2 1 0 1 1 1 0 0 ...
## $ dr_age_65over : num 0 0 0 0 0 1 1 0 0 0 ...
## $ lgt_cond_1 : int 0 1 0 0 0 0 1 1 0 1 ...
## $ lgt_cond_2 : int 0 0 0 0 0 0 0 0 0 0 ...
cor_data<-cor(data_c)
cor(cor_data)
## speed_related drunk_dr Valid_license_ratio no_prev_sus
## speed_related 1.0000000 0.4228224 -0.50368378 -0.7560828
## drunk_dr 0.4228224 1.0000000 -0.37224131 -0.5203786
## Valid_license_ratio -0.5036838 -0.3722413 1.00000000 0.5206754
## no_prev_sus -0.7560828 -0.5203786 0.52067538 1.0000000
## no_prev_dwi -0.7451149 -0.4795151 0.47211829 0.9900590
## no_prev_spd -0.7296817 -0.4670526 0.40812329 0.9614250
## dr_age_65over -0.4763418 -0.6207393 0.04821349 0.3905728
## lgt_cond_1 -0.3441922 -0.6906966 0.09495841 0.3403067
## lgt_cond_2 0.2867490 0.5544491 -0.15259737 -0.4207686
## no_prev_dwi no_prev_spd dr_age_65over lgt_cond_1
## speed_related -0.7451149 -0.7296817 -0.47634178 -0.34419218
## drunk_dr -0.4795151 -0.4670526 -0.62073927 -0.69069663
## Valid_license_ratio 0.4721183 0.4081233 0.04821349 0.09495841
## no_prev_sus 0.9900590 0.9614250 0.39057278 0.34030673
## no_prev_dwi 1.0000000 0.9750606 0.36647030 0.31510617
## no_prev_spd 0.9750606 1.0000000 0.37941447 0.31788429
## dr_age_65over 0.3664703 0.3794145 1.00000000 0.52762396
## lgt_cond_1 0.3151062 0.3178843 0.52762396 1.00000000
## lgt_cond_2 -0.4025139 -0.4057150 -0.52337502 -0.93105054
## lgt_cond_2
## speed_related 0.2867490
## drunk_dr 0.5544491
## Valid_license_ratio -0.1525974
## no_prev_sus -0.4207686
## no_prev_dwi -0.4025139
## no_prev_spd -0.4057150
## dr_age_65over -0.5233750
## lgt_cond_1 -0.9310505
## lgt_cond_2 1.0000000
corrplot(cor_data,) #Plot correlation coefficient matrix

#################################################################
# Split the data into a train and test set ####
#################################################################
set.seed(123)
trainIndex <- createDataPartition(data_r$speed_related,p = 0.70, list = FALSE)
train <- data_c[trainIndex,] #Split for train set
test <- data_c[-trainIndex,] #Split for test set
#################################################################
# Fit a logistic regression model ####
#################################################################
Fit1 <- glm(speed_related ~ drunk_dr+Valid_license_ratio+no_prev_dwi+no_prev_spd+no_prev_sus+
dr_age_65over,data= train, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(Fit1)
##
## Call:
## glm(formula = speed_related ~ drunk_dr + Valid_license_ratio +
## no_prev_dwi + no_prev_spd + no_prev_sus + dr_age_65over,
## family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1061 -0.7640 -0.4374 -0.1261 6.1910
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.7922831 0.0341623 23.192 < 2e-16 ***
## drunk_dr 0.6551503 0.0197466 33.178 < 2e-16 ***
## Valid_license_ratio -0.0022767 0.0002785 -8.175 2.96e-16 ***
## no_prev_dwi -0.9971590 0.0322813 -30.890 < 2e-16 ***
## no_prev_spd -0.4345266 0.0220119 -19.741 < 2e-16 ***
## no_prev_sus -0.2149254 0.0270941 -7.933 2.15e-15 ***
## dr_age_65over -0.3858241 0.0309390 -12.470 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 69975 on 64423 degrees of freedom
## Residual deviance: 59660 on 64417 degrees of freedom
## AIC: 59674
##
## Number of Fisher Scoring iterations: 6
#################################################################
# Train set predictions ####
#################################################################
probabilities.train <- predict(Fit1,newdata = train, type="response")
predicted.classes.min <- as.factor(ifelse(probabilities.train >= 0.5, "1","0"))
train$speed_related= as.factor(train$speed_related)
confusionMatrix(predicted.classes.min,train$speed_related, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47082 12370
## 1 2320 2652
##
## Accuracy : 0.772
## 95% CI : (0.7687, 0.7752)
## No Information Rate : 0.7668
## P-Value [Acc > NIR] : 0.0009813
##
## Kappa : 0.1689
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.17654
## Specificity : 0.95304
## Pos Pred Value : 0.53339
## Neg Pred Value : 0.79193
## Prevalence : 0.23317
## Detection Rate : 0.04116
## Detection Prevalence : 0.07718
## Balanced Accuracy : 0.56479
##
## 'Positive' Class : 1
##
#################################################################
# test set predictions ####
#################################################################
probabilities.test <- predict(Fit1,newdata = test, type="response")
predicted.classes.min <- as.factor(ifelse(probabilities.test >= 0.5, "1","0"))
test$speed_related= as.factor(test$speed_related)
confusionMatrix(predicted.classes.min,test$speed_related, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 20113 5384
## 1 965 1148
##
## Accuracy : 0.77
## 95% CI : (0.765, 0.775)
## No Information Rate : 0.7634
## P-Value [Acc > NIR] : 0.004777
##
## Kappa : 0.1695
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.17575
## Specificity : 0.95422
## Pos Pred Value : 0.54330
## Neg Pred Value : 0.78884
## Prevalence : 0.23658
## Detection Rate : 0.04158
## Detection Prevalence : 0.07653
## Balanced Accuracy : 0.56498
##
## 'Positive' Class : 1
##
#view(probabilities.test)
#################################################################
# Creat ROC ####
#################################################################
ROC1 <- roc(test$speed_related,probabilities.test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC1, col= "Blue", ylab = "sensitivity ~ TP Rate", xlab = "Specificity ~ FP Rate")

auc1 = auc(test$speed_related,probabilities.test)
auc1
## [1] 0.7704526
#################################################################
# LASSO Analysis ####
#################################################################
#Spliting the data into 2. 80/20
set.seed(1234)
trainIndex <- sample(x=nrow(data), size=nrow(data)*0.8)
train <- data[trainIndex,]
test <- data[-trainIndex,]
x <- model.matrix(total_valid_license~.,train)[,-1]
y <- train$total_valid_license
lasso.mod <- glmnet(x, y, alpha=1)
l_cv.out <- cv.glmnet(x, y, alpha=1)
plot(l_cv.out)

best.lambda <- l_cv.out$lambda.min
best.lambda
## [1] 0.0006016217
max.lambda <-l_cv.out$lambda.1se
max.lambda
## [1] 0.001389824
group1 = as.data.frame(as.matrix(coef(l_cv.out)))
kable(group1,
caption = "<center>Min Model Coefficient</center>",
align = "c",
col.names = "Coefficient") %>%
kable_styling(bootstrap_options = c("bordered",
"responsive",
"hover"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "Total valid License Model",
general_title = "LASSO: ")
Min Model Coefficient
|
|
Coefficient
|
|
(Intercept)
|
0.0028736
|
|
state
|
0.0000000
|
|
st_case
|
0.0000000
|
|
ve_total
|
0.0000000
|
|
ve_forms
|
0.0000000
|
|
county
|
-0.0000041
|
|
city
|
0.0000000
|
|
day
|
0.0000000
|
|
month
|
0.0000000
|
|
year
|
0.0000000
|
|
date_SIF
|
0.0000000
|
|
day_week
|
0.0000000
|
|
hour
|
0.0000000
|
|
minute
|
0.0000000
|
|
nhs
|
0.0000000
|
|
latitude
|
0.0003538
|
|
longitud
|
0.0000511
|
|
lgt_cond
|
0.0000000
|
|
weather
|
0.0000000
|
|
fatals
|
0.0000000
|
|
drunk_dr
|
0.0000000
|
|
total_numoccs
|
0.0005239
|
|
total_hit_run
|
-0.0309499
|
|
total_registered_owner
|
0.1399413
|
|
total_not_registered
|
0.1382886
|
|
total_other_owner
|
0.1316912
|
|
total_fire_exp
|
0.0000000
|
|
total_invalid_license
|
-0.9710006
|
|
no_prev_acc
|
0.3102732
|
|
one_prev_acc
|
0.3124176
|
|
two_prev_acc
|
0.2700281
|
|
no_prev_sus
|
0.0087693
|
|
one_prev_sus
|
0.0088239
|
|
two_prev_sus
|
0.0000000
|
|
no_prev_dwi
|
0.4783067
|
|
one_prev_dwi
|
0.4640121
|
|
no_prev_spd
|
0.0293923
|
|
one_prev_spd
|
0.0309741
|
|
no_prev_oth
|
0.0211508
|
|
one_prev_oth
|
0.0216682
|
|
speed_related
|
-0.0027545
|
|
dr_age_lower16
|
0.0000000
|
|
dr_age_lower18
|
0.0000000
|
|
dr_age_lower21
|
0.0000000
|
|
dr_age_lower30
|
0.0017386
|
|
dr_age_lower65
|
0.0009423
|
|
dr_age_65over
|
0.0000000
|
|
dr_male
|
0.0073201
|
|
dr_female
|
0.0051949
|
|
dr_alcohol_drug_med
|
0.0000000
|
|
total_moving_violations
|
0.0000000
|
|
nm_alcohol_drug_med
|
0.0000097
|
|
nm_other_impair
|
0.0000026
|
|
nm_involved
|
0.0000000
|
|
LASSO:
|
|
Total valid License Model
|
model=lm(speed_related ~ ., data = data)
summary(model)
##
## Call:
## lm(formula = speed_related ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7375 -0.2703 -0.1404 0.1145 12.2685
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.746e+03 1.414e+03 1.942 0.052117 .
## state 7.630e-02 2.776e-02 2.749 0.005978 **
## st_case -7.479e-06 2.775e-06 -2.695 0.007030 **
## ve_total 3.764e-02 5.351e-03 7.034 2.02e-12 ***
## ve_forms 3.409e-02 2.372e-02 1.437 0.150748
## county -5.110e-05 1.394e-05 -3.667 0.000246 ***
## city 3.029e-06 6.828e-07 4.437 9.15e-06 ***
## day -3.697e-03 1.976e-03 -1.871 0.061311 .
## month -1.169e-01 6.012e-02 -1.944 0.051935 .
## year -1.399e+00 7.214e-01 -1.939 0.052512 .
## date_SIF 3.822e-03 1.975e-03 1.935 0.052954 .
## day_week -2.938e-05 6.136e-04 -0.048 0.961808
## hour -1.197e-03 1.928e-04 -6.207 5.41e-10 ***
## minute 1.338e-05 7.395e-05 0.181 0.856380
## nhs -3.844e-02 2.836e-03 -13.555 < 2e-16 ***
## latitude 7.849e-04 3.111e-04 2.523 0.011638 *
## longitud -5.891e-04 9.925e-05 -5.936 2.93e-09 ***
## lgt_cond 1.056e-02 1.399e-03 7.547 4.50e-14 ***
## weather -9.013e-04 4.012e-04 -2.246 0.024683 *
## fatals 3.495e-04 3.692e-03 0.095 0.924565
## drunk_dr 5.855e-02 3.831e-03 15.283 < 2e-16 ***
## total_numoccs 2.016e-03 8.631e-04 2.336 0.019501 *
## total_hit_run -2.242e-02 9.103e-03 -2.463 0.013779 *
## total_registered_owner 7.757e-02 1.846e-02 4.201 2.66e-05 ***
## total_not_registered 8.992e-02 1.852e-02 4.856 1.20e-06 ***
## total_other_owner 8.297e-02 1.845e-02 4.497 6.90e-06 ***
## total_fire_exp 4.354e-02 6.813e-03 6.391 1.66e-10 ***
## total_valid_license -2.160e-01 1.892e-02 -11.414 < 2e-16 ***
## total_invalid_license -1.956e-01 1.888e-02 -10.360 < 2e-16 ***
## no_prev_acc -1.509e-02 1.993e-02 -0.757 0.448909
## one_prev_acc -1.856e-02 2.014e-02 -0.921 0.356837
## two_prev_acc -3.269e-02 1.905e-02 -1.716 0.086169 .
## no_prev_sus -2.387e-02 4.761e-03 -5.015 5.32e-07 ***
## one_prev_sus 2.239e-03 5.731e-03 0.391 0.696034
## two_prev_sus NA NA NA NA
## no_prev_dwi 5.250e-02 1.348e-02 3.896 9.80e-05 ***
## one_prev_dwi 5.214e-02 1.499e-02 3.479 0.000503 ***
## no_prev_spd -1.289e-02 5.166e-03 -2.495 0.012604 *
## one_prev_spd 9.250e-03 5.753e-03 1.608 0.107895
## no_prev_oth -2.064e-02 5.038e-03 -4.097 4.19e-05 ***
## one_prev_oth -6.935e-04 5.669e-03 -0.122 0.902636
## dr_age_lower16 1.678e-01 5.122e-02 3.276 0.001054 **
## dr_age_lower18 1.410e-01 4.798e-02 2.939 0.003289 **
## dr_age_lower21 1.142e-01 4.765e-02 2.397 0.016510 *
## dr_age_lower30 8.688e-02 4.756e-02 1.827 0.067711 .
## dr_age_lower65 6.331e-02 4.752e-02 1.332 0.182766
## dr_age_65over 2.930e-02 4.759e-02 0.616 0.538091
## dr_male -1.335e-01 5.044e-02 -2.646 0.008142 **
## dr_female -1.499e-01 5.049e-02 -2.968 0.002995 **
## dr_alcohol_drug_med 4.957e-02 4.226e-03 11.728 < 2e-16 ***
## total_moving_violations -3.120e-02 3.643e-03 -8.564 < 2e-16 ***
## nm_alcohol_drug_med -2.337e-02 8.423e-03 -2.775 0.005524 **
## nm_other_impair -1.837e-02 1.572e-02 -1.168 0.242615
## nm_involved -4.307e+00 1.802e+00 -2.390 0.016838 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3921 on 91981 degrees of freedom
## Multiple R-squared: 0.1433, Adjusted R-squared: 0.1428
## F-statistic: 296 on 52 and 91981 DF, p-value: < 2.2e-16
#bothmodel = ols_step_both_aic(model)
#ols_step_forward_p(model, penter = 0.3)
#ols_step_backward_p(model)
#Hypothesis
#H0: Fatalities by year were independent of drunk driver and speeding.
#H1: Fatalities by year were dependent of drunk driver and speeding (claim).
CLchi = 0.98
alphch = 1 - CLchi
drunkdriver = Groupproject %>%
group_by(year) %>%
dplyr::select(drunk_dr, speed_related, fatals)
drunkdriver = na.omit(drunkdriver)
chisq = chisq.test(drunkdriver)
hypochires = ifelse(chisq$p.value > alphch, "Do not reject the null hypothesis","Reject the null hypothesis")
Tskhypores_1 = rbind(CLchi*100, alphch, round(chisq$statistic,3), round(chisq$p.value,0), hypochires)
col_nhypochi = c("Hypothesis Result")
row_nhypochi = c("Confidence Level(%):", "Alpha:", "X^2 Value:","p value:","Hypothesis Result:")
dimnames(Tskhypores_1) = list(row_nhypochi,col_nhypochi)
kable(Tskhypores_1,
caption = "<center>Chi-Square Independence Test</center>",
align = "c",
booktabs = TRUE)%>%
kable_styling(bootstrap_options = c("hover",
"bordered"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "H0: Fatalities by year were independent of drunk driver and speeding.\n
H1: Fatalities by year were dependent of drunk driver and speeding (claim).")
Chi-Square Independence Test
|
|
Hypothesis Result
|
|
Confidence Level(%):
|
98
|
|
Alpha:
|
0.02
|
|
X^2 Value:
|
236704.315
|
|
p value:
|
1
|
|
Hypothesis Result:
|
Do not reject the null hypothesis
|
|
Note:
|
H0: Fatalities by year were independent of drunk driver and
speeding. H1: Fatalities by year were dependent of drunk driver
and speeding (claim).
|
#Hypothesis
#H0: There is no interaction effect between year and type of
#Nation Highway System on Fatalities.
#H1: There is an interaction effect between year and type of
#Nation Highway System on Fatalities.
#The hypotheses for the National Highway System types are
#H0: There is no difference between the means of Fatalities for
#two types of National Highway System.
#H1: There is a difference between the means of Fatalities for
#two types of National Highway System.
MAfatal = MH_acc %>%
group_by(year, nhs) %>%
dplyr::select(st_case, fatals) %>%
mutate(Total_cases = length(st_case),
Total_fatal = sum(fatals))
MAfatal = na.omit(MAfatal)
#MAfatal$year = as.factor(MAfatal$year)
#MAfatal$nhs = as.factor(MAfatal$nhs)
NHfatal = NH_acc %>%
group_by(year, nhs) %>%
dplyr::select(st_case, fatals) %>%
mutate(Total_cases = length(st_case),
Total_fatal = sum(fatals))
NHfatal = na.omit(NHfatal)
#NHfatal$year = as.factor(NHfatal$year)
#NHfatal$nhs = as.factor(NHfatal$nhs)
NHfatal = NHfatal %>% group_by(year) %>%
distinct(year, nhs, Total_cases, Total_fatal) %>%
mutate(State = rep(c("NH")))
MAfatal = MAfatal %>% group_by(year) %>%
distinct(year, nhs, Total_cases, Total_fatal) %>%
mutate(State = rep(c("MA")))
MAandNH = rbind(NHfatal, MAfatal)
MAandNH$year = as.factor(MAandNH$year)
MAandNH$nhs = as.factor(MAandNH$nhs)
MAandNH %>%
group_by(State, nhs) %>%
summarise(mean = mean(Total_fatal),
sd = sd(Total_fatal))
MANH = qplot(interaction(State,nhs),Total_fatal , data=MAandNH, geom="boxplot")
MANH + aes(fill = nhs) +
scale_fill_manual(values = c("turquoise", "salmon"),
labels = c('Non_NHS', 'NHS'),
name = "Accident type")+
labs(title = "Fatality in MA and NH",
x = "Group by State and NHS",
y = "Total Fatalities")+
theme(plot.title = element_text(hjust = 0.5))+
scale_x_discrete(labels=c("MA.Non_NHS", "NH.Non_NHS", "MA.NHS", "NH.NHS"))

MAandNHANOVA = aov(Total_fatal ~ year * nhs, data = MAandNH)
MAandNHANOVA_summary = summary(MAandNHANOVA)
#plot(TukeyHSD(MAandNHANOVA), las = 2)
##Hypothesis Testing#####
CL = 0.95
alph = 1 - CL
Fvalinteraction = MAandNHANOVA_summary[[1]][3, "F value"] #Fvalue of interaction
Fvalnhs = MAandNHANOVA_summary[[1]][2, "F value"] #Fvalue of interaction
DofNnhs = MAandNHANOVA_summary[[1]][2, "Df"] #(a-1)(b-1) a is levels in light, b is levels in food
DofDnhs = MAandNHANOVA_summary[[1]][4, "Df"] # (ab)(n-1) n is the number of data values in each group
ANDofNinteraction = MAandNHANOVA_summary[[1]][3, "Df"] #(a-1)(b-1)
ANDofDinteraction = MAandNHANOVA_summary[[1]][4, "Df"] # (ab)(n-1) n is the number of data values in each group
ANCVinteraction = qf(alph, ANDofNinteraction, ANDofDinteraction, lower.tail = FALSE)
ANCVnhs = qf(alph, DofNnhs, DofDnhs, lower.tail = FALSE)
hypoANOVAinteraction = ifelse(Fvalinteraction < ANCVinteraction, "Do not reject the null hypothesis (Fvalue < CV)", "Reject the null hypothesis (Fvalue > CV)")
hypoANOVAnhs = ifelse(Fvalnhs < ANCVnhs, "Do not reject the null hypothesis (Fvalue < CV)", "Reject the null hypothesis (Fvalue > CV)")
##Information Representation#####
nhsfatal <-
MAandNH %>%
group_by(nhs, year) %>% # group by the two factors
summarise(Means = mean(Total_fatal), SEs = sd(Total_fatal)/sqrt(n())) # Mean and Std Er
ggplot(nhsfatal,
aes(x = nhs, y = Means, fill = year,
ymin = Means - SEs, ymax = Means + SEs)) +
# this adds the mean
geom_col(position = position_dodge()) +
# this adds the error bars
geom_errorbar(position = position_dodge(0.9), width=.2) +
# controlling the appearance
xlab("NHS") + ylab("Fatalities") +
scale_x_discrete(labels=c("Non-NHS", "NHS"))

hyporesANOVA = rbind(CL, alph, round(ANCVinteraction,3), round(Fvalinteraction,3), hypoANOVAinteraction)
col_twoAN = c("Hypothesis Result")
row_twoAN = c("Confidence Level(%):", "Alpha:", "CV:","F value:","Hypothesis Result:")
dimnames(hyporesANOVA) = list(row_twoAN, col_twoAN)
kable(hyporesANOVA,
caption = "<center>Two way ANOVA interaction Test</center>",
align = "c",
booktabs = TRUE) %>%
kable_styling(bootstrap_options = c("hover",
"bordered"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "𝐻0: There is no interaction effect between year and type of Nation Highway System on Fatalities and \n 𝐻1: There is an interaction effect between year and type of Nation Highway System on Fatalities.")
Two way ANOVA interaction Test
|
|
Hypothesis Result
|
|
Confidence Level(%):
|
0.95
|
|
Alpha:
|
0.05
|
|
CV:
|
3.478
|
|
F value:
|
0.014
|
|
Hypothesis Result:
|
Do not reject the null hypothesis (Fvalue < CV)
|
|
Note:
|
𝐻0: There is no interaction effect between year and type of
Nation Highway System on Fatalities and 𝐻1: There is an interaction
effect between year and type of Nation Highway System on Fatalities.
|
hyporestwoANOVAnhs = rbind(CL, alph, round(ANCVnhs,3), round(Fvalnhs,3), hypoANOVAnhs)
col_twoANOVAnhs = c("Hypothesis Result")
row_twoANOVAnhs = c("Confidence Level(%):", "Alpha:", "CV:","F value:","Hypothesis Result:")
dimnames(hyporestwoANOVAnhs) = list(row_twoANOVAnhs, col_twoANOVAnhs)
kable(hyporestwoANOVAnhs,
caption = "<center>Two way ANOVA mean of Fatalities to NHS test</center>",
align = "c",
booktabs = TRUE) %>%
kable_styling(bootstrap_options = c("hover",
"bordered"),
font_size = 11) %>%
scroll_box(width = "100%", height = "100%") %>%
footnote(general = "H0: There is no difference between the means of Fatalities for two types of National Highway System and \n H1: There is a difference between the means of Fatalities for two types of National Highway System.")
Two way ANOVA mean of Fatalities to NHS test
|
|
Hypothesis Result
|
|
Confidence Level(%):
|
0.95
|
|
Alpha:
|
0.05
|
|
CV:
|
4.965
|
|
F value:
|
4.169
|
|
Hypothesis Result:
|
Do not reject the null hypothesis (Fvalue < CV)
|
|
Note:
|
H0: There is no difference between the means of Fatalities
for two types of National Highway System and H1: There is a
difference between the means of Fatalities for two types of National
Highway System.
|
data_t<- dplyr::select(data_input,fatals,permvit,pernotmvit,speed_related) #clean data for t test
data_t<- data_t[complete.cases(data_t),] #Clean all NA value
data_t
#Fatalities ratio
data_t$Fatals_ratio <- data_t$fatals/(data_t$pernotmvit+data_t$permvit)*100
speed= subset(data_t, subset=(data_t$speed_related=="1")) #Selet speed dataset
no_speed = subset(data_t, subset=(data_t$speed_related=="0")) #Selet no speed dataset
data_t
data_t$speed <- ifelse(data_t$speed_related==0,"No","Yes") #Creat subset to show "no"and"yes" for speed
ggboxplot(data_t, x = "speed", y = "Fatals_ratio",
color = "speed", palette = c("#00AFBB", "#E7B800"),
ylab = "Fatalities ratio", xlab = "speed conditon")+ #creat boxplot
coord_cartesian(ylim = c(0, 120))

t.test(no_speed$Fatals_ratio,speed$Fatals_ratio, alternative = "two.sided", var.equal = FALSE) #t test
##
## Welch Two Sample t-test
##
## data: no_speed$Fatals_ratio and speed$Fatals_ratio
## t = -74.401, df = 41321, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -16.00511 -15.18348
## sample estimates:
## mean of x mean of y
## 55.00323 70.59752
df <- Groupproject
df1 <- df %>% group_by(state,year) %>% mutate(fatal = sum(fatals))
df1 <- df1 %>% filter(row_number()==1)
pop <- read.csv("Datasets/nst-est2019-alldata.csv") %>% dplyr::select(POPESTIMATE2014,STATE,NAME )
df2 <- merge(pop, df1,by.x="STATE", by.y="state", all.y=T)
df2$fatality = df2$fatal/df2$POPESTIMATE2014 * 100000
# question: what is the rate of fatality in each state in 2014
df3 = dplyr::select(df2, NAME, fatality)
df3 %<>% na.omit()
df3 %>%
kable()%>%
kable_styling(bootstrap_options = "bordered")
|
|
NAME
|
fatality
|
|
1
|
Alabama
|
17.803300
|
|
2
|
Alabama
|
17.617419
|
|
3
|
Alabama
|
17.865260
|
|
4
|
Alabama
|
18.484865
|
|
6
|
Alabama
|
16.935854
|
|
7
|
Alaska
|
7.605771
|
|
8
|
Alaska
|
8.013223
|
|
9
|
Alaska
|
9.914666
|
|
10
|
Alaska
|
6.926684
|
|
11
|
Alaska
|
9.778849
|
|
12
|
Arizona
|
12.198360
|
|
13
|
Arizona
|
11.277168
|
|
15
|
Arizona
|
12.614382
|
|
16
|
Arizona
|
12.272649
|
|
17
|
Arizona
|
11.485179
|
|
18
|
Arkansas
|
19.242486
|
|
19
|
Arkansas
|
18.871790
|
|
20
|
Arkansas
|
16.782414
|
|
21
|
Arkansas
|
15.838824
|
|
23
|
Arkansas
|
18.568494
|
|
25
|
California
|
7.684541
|
|
26
|
California
|
7.295909
|
|
27
|
California
|
8.049854
|
|
28
|
California
|
7.047185
|
|
29
|
California
|
8.036900
|
|
31
|
Colorado
|
8.859646
|
|
32
|
Colorado
|
8.354983
|
|
33
|
Colorado
|
9.009176
|
|
34
|
Colorado
|
8.411056
|
|
35
|
Colorado
|
9.121323
|
|
36
|
Connecticut
|
6.148241
|
|
37
|
Connecticut
|
7.344505
|
|
38
|
Connecticut
|
8.902430
|
|
39
|
Connecticut
|
7.956547
|
|
40
|
Connecticut
|
6.899384
|
|
42
|
Delaware
|
12.225371
|
|
43
|
Delaware
|
10.831250
|
|
45
|
Delaware
|
10.616770
|
|
46
|
Delaware
|
13.297773
|
|
47
|
Delaware
|
10.616770
|
|
48
|
District of Columbia
|
3.019652
|
|
49
|
District of Columbia
|
2.264739
|
|
50
|
District of Columbia
|
3.623582
|
|
51
|
District of Columbia
|
4.076530
|
|
52
|
District of Columbia
|
3.472600
|
|
53
|
Florida
|
12.249375
|
|
54
|
Florida
|
12.314879
|
|
55
|
Florida
|
12.093171
|
|
56
|
Florida
|
12.566820
|
|
57
|
Florida
|
12.108288
|
|
59
|
Georgia
|
11.721142
|
|
60
|
Georgia
|
12.386665
|
|
61
|
Georgia
|
11.840341
|
|
62
|
Georgia
|
12.178068
|
|
64
|
Georgia
|
11.562212
|
|
65
|
Hawaii
|
7.069446
|
|
66
|
Hawaii
|
6.715974
|
|
67
|
Hawaii
|
8.836808
|
|
68
|
Hawaii
|
7.210835
|
|
69
|
Hawaii
|
7.988474
|
|
71
|
Idaho
|
11.280648
|
|
72
|
Idaho
|
11.403263
|
|
73
|
Idaho
|
12.813344
|
|
74
|
Idaho
|
10.238414
|
|
76
|
Idaho
|
13.119884
|
|
77
|
Illinois
|
7.171411
|
|
78
|
Illinois
|
7.419772
|
|
79
|
Illinois
|
7.124844
|
|
81
|
Illinois
|
7.691416
|
|
82
|
Illinois
|
7.194695
|
|
84
|
Indiana
|
11.844740
|
|
85
|
Indiana
|
11.890239
|
|
86
|
Indiana
|
11.298760
|
|
87
|
Indiana
|
11.389757
|
|
88
|
Indiana
|
11.435255
|
|
89
|
Iowa
|
11.738788
|
|
90
|
Iowa
|
10.195057
|
|
91
|
Iowa
|
10.355862
|
|
93
|
Iowa
|
11.577982
|
|
94
|
Iowa
|
12.542814
|
|
95
|
Kansas
|
13.273688
|
|
97
|
Kansas
|
13.963230
|
|
98
|
Kansas
|
13.308165
|
|
99
|
Kansas
|
12.066989
|
|
100
|
Kansas
|
14.859635
|
|
101
|
Kentucky
|
15.223083
|
|
102
|
Kentucky
|
16.899434
|
|
104
|
Kentucky
|
14.452867
|
|
105
|
Kentucky
|
17.216582
|
|
106
|
Kentucky
|
16.310446
|
|
107
|
Louisiana
|
15.137770
|
|
108
|
Louisiana
|
15.568432
|
|
110
|
Louisiana
|
14.642508
|
|
111
|
Louisiana
|
15.934495
|
|
112
|
Louisiana
|
15.525366
|
|
114
|
Maine
|
12.326073
|
|
115
|
Maine
|
10.221621
|
|
116
|
Maine
|
10.822893
|
|
117
|
Maine
|
12.100596
|
|
118
|
Maine
|
9.845826
|
|
119
|
Maryland
|
8.141295
|
|
120
|
Maryland
|
8.577736
|
|
122
|
Maryland
|
8.325943
|
|
123
|
Maryland
|
7.805572
|
|
124
|
Maryland
|
7.419490
|
|
125
|
Massachusetts
|
5.663506
|
|
127
|
Massachusetts
|
5.131166
|
|
128
|
Massachusetts
|
5.190315
|
|
129
|
Massachusetts
|
5.530421
|
|
130
|
Massachusetts
|
5.234676
|
|
131
|
Michigan
|
9.466409
|
|
132
|
Michigan
|
9.486550
|
|
133
|
Michigan
|
9.536903
|
|
134
|
Michigan
|
9.073654
|
|
135
|
Michigan
|
8.952806
|
|
137
|
Minnesota
|
7.099512
|
|
138
|
Minnesota
|
6.750957
|
|
139
|
Minnesota
|
7.246272
|
|
140
|
Minnesota
|
6.622542
|
|
141
|
Minnesota
|
7.539792
|
|
143
|
Mississippi
|
21.434772
|
|
144
|
Mississippi
|
21.066937
|
|
145
|
Mississippi
|
20.297826
|
|
146
|
Mississippi
|
20.498464
|
|
148
|
Mississippi
|
19.461837
|
|
149
|
Missouri
|
12.648191
|
|
150
|
Missouri
|
13.556351
|
|
151
|
Missouri
|
12.978431
|
|
153
|
Missouri
|
13.638911
|
|
154
|
Missouri
|
12.499583
|
|
156
|
Montana
|
18.789101
|
|
157
|
Montana
|
20.452720
|
|
158
|
Montana
|
18.495521
|
|
159
|
Montana
|
22.409917
|
|
160
|
Montana
|
20.061280
|
|
161
|
Nebraska
|
10.110034
|
|
162
|
Nebraska
|
11.972409
|
|
163
|
Nebraska
|
9.631138
|
|
164
|
Nebraska
|
11.280670
|
|
165
|
Nebraska
|
11.227459
|
|
167
|
Nevada
|
9.440565
|
|
168
|
Nevada
|
10.327836
|
|
169
|
Nevada
|
9.263111
|
|
170
|
Nevada
|
8.730748
|
|
171
|
Nevada
|
9.121147
|
|
173
|
New Hampshire
|
9.599945
|
|
174
|
New Hampshire
|
10.124942
|
|
175
|
New Hampshire
|
8.099953
|
|
176
|
New Hampshire
|
6.749961
|
|
178
|
New Hampshire
|
7.124959
|
|
179
|
New Jersey
|
7.073137
|
|
180
|
New Jersey
|
6.272192
|
|
181
|
New Jersey
|
6.272192
|
|
182
|
New Jersey
|
6.644462
|
|
183
|
New Jersey
|
6.114259
|
|
185
|
New Mexico
|
18.472718
|
|
186
|
New Mexico
|
17.515582
|
|
187
|
New Mexico
|
14.883459
|
|
188
|
New Mexico
|
16.749874
|
|
189
|
New Mexico
|
16.702017
|
|
191
|
New York
|
5.297427
|
|
192
|
New York
|
6.004768
|
|
193
|
New York
|
6.116722
|
|
195
|
New York
|
5.958969
|
|
196
|
New York
|
6.111633
|
|
198
|
North Carolina
|
12.383107
|
|
199
|
North Carolina
|
13.077769
|
|
200
|
North Carolina
|
12.926755
|
|
201
|
North Carolina
|
13.289188
|
|
202
|
North Carolina
|
12.987161
|
|
204
|
North Dakota
|
20.070491
|
|
205
|
North Dakota
|
23.053942
|
|
206
|
North Dakota
|
14.239200
|
|
207
|
North Dakota
|
20.070491
|
|
208
|
North Dakota
|
18.307542
|
|
209
|
Ohio
|
9.661544
|
|
211
|
Ohio
|
9.308178
|
|
212
|
Ohio
|
8.523878
|
|
213
|
Ohio
|
8.670396
|
|
214
|
Ohio
|
8.765201
|
|
215
|
Oklahoma
|
17.946530
|
|
216
|
Oklahoma
|
17.250329
|
|
217
|
Oklahoma
|
18.281738
|
|
218
|
Oklahoma
|
17.224543
|
|
219
|
Oklahoma
|
17.482396
|
|
221
|
Oregon
|
9.007772
|
|
222
|
Oregon
|
8.503135
|
|
223
|
Oregon
|
7.998498
|
|
224
|
Oregon
|
7.897571
|
|
225
|
Oregon
|
8.351744
|
|
227
|
Pennsylvania
|
9.344469
|
|
228
|
Pennsylvania
|
10.056057
|
|
229
|
Pennsylvania
|
10.353203
|
|
230
|
Pennsylvania
|
9.461764
|
|
231
|
Pennsylvania
|
10.243728
|
|
233
|
Rhode Island
|
6.060973
|
|
234
|
Rhode Island
|
4.829838
|
|
235
|
Rhode Island
|
6.155676
|
|
236
|
Rhode Island
|
6.345082
|
|
237
|
Rhode Island
|
6.250379
|
|
238
|
South Carolina
|
17.061885
|
|
239
|
South Carolina
|
17.165542
|
|
241
|
South Carolina
|
16.771647
|
|
242
|
South Carolina
|
15.900931
|
|
243
|
South Carolina
|
17.891139
|
|
244
|
South Dakota
|
15.663109
|
|
245
|
South Dakota
|
16.016412
|
|
246
|
South Dakota
|
16.487483
|
|
247
|
South Dakota
|
13.072219
|
|
249
|
South Dakota
|
15.898644
|
|
250
|
Tennessee
|
15.516976
|
|
251
|
Tennessee
|
14.324538
|
|
252
|
Tennessee
|
15.776866
|
|
253
|
Tennessee
|
15.211223
|
|
255
|
Tennessee
|
14.722018
|
|
256
|
Texas
|
12.638918
|
|
257
|
Texas
|
11.211106
|
|
258
|
Texas
|
12.568455
|
|
259
|
Texas
|
11.326073
|
|
260
|
Texas
|
13.113619
|
|
262
|
Utah
|
7.388796
|
|
263
|
Utah
|
7.490945
|
|
264
|
Utah
|
8.614587
|
|
265
|
Utah
|
8.274090
|
|
266
|
Utah
|
8.716736
|
|
267
|
Vermont
|
12.315783
|
|
268
|
Vermont
|
11.036221
|
|
270
|
Vermont
|
8.796988
|
|
271
|
Vermont
|
11.356112
|
|
272
|
Vermont
|
7.037590
|
|
273
|
Virginia
|
9.337031
|
|
274
|
Virginia
|
8.903870
|
|
276
|
Virginia
|
9.192644
|
|
277
|
Virginia
|
8.903870
|
|
278
|
Virginia
|
8.458676
|
|
280
|
Washington
|
6.435467
|
|
281
|
Washington
|
6.208666
|
|
282
|
Washington
|
6.520517
|
|
283
|
Washington
|
6.180316
|
|
284
|
Washington
|
6.548867
|
|
285
|
West Virginia
|
18.275318
|
|
287
|
West Virginia
|
17.031731
|
|
288
|
West Virginia
|
17.950904
|
|
289
|
West Virginia
|
18.329387
|
|
290
|
West Virginia
|
14.706765
|
|
291
|
Wisconsin
|
10.692816
|
|
292
|
Wisconsin
|
9.945189
|
|
293
|
Wisconsin
|
9.440974
|
|
294
|
Wisconsin
|
10.119055
|
|
295
|
Wisconsin
|
8.797667
|
|
297
|
Wyoming
|
14.934828
|
|
298
|
Wyoming
|
23.174732
|
|
299
|
Wyoming
|
26.608026
|
|
300
|
Wyoming
|
21.114756
|
|
301
|
Wyoming
|
25.749703
|