#################################################################
#  Import dataset ####
#################################################################
data_input<- Groupproject

#################################################################
# Analysis Data ####
#################################################################
summary(data_input)
##      state          st_case          ve_total         ve_forms     
##  Min.   : 1.00   Min.   : 10001   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.:12.00   1st Qu.:122120   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median :28.00   Median :280279   Median : 1.000   Median : 1.000  
##  Mean   :27.95   Mean   :280092   Mean   : 1.522   Mean   : 1.486  
##  3rd Qu.:42.00   3rd Qu.:420723   3rd Qu.: 2.000   3rd Qu.: 2.000  
##  Max.   :56.00   Max.   :560139   Max.   :90.000   Max.   :90.000  
##                                   NA's   :1833                     
##      county            city           day            month       
##  Min.   :  0.00   Min.   :   0   Min.   : 1.00   Min.   : 1.000  
##  1st Qu.: 31.00   1st Qu.:   0   1st Qu.: 8.00   1st Qu.: 4.000  
##  Median : 71.00   Median :   0   Median :16.00   Median : 7.000  
##  Mean   : 91.54   Mean   :1194   Mean   :15.67   Mean   : 6.754  
##  3rd Qu.:115.00   3rd Qu.:1940   3rd Qu.:23.00   3rd Qu.:10.000  
##  Max.   :997.00   Max.   :9997   Max.   :31.00   Max.   :12.000  
##  NA's   :1838     NA's   :1891   NA's   :1833    NA's   :1833    
##       year         date_SIF        day_week          hour           minute    
##  Min.   :2010   Min.   :18263   Min.   :1.000   Min.   : 0.00   Min.   : 0.0  
##  1st Qu.:2011   1st Qu.:18735   1st Qu.:2.000   1st Qu.: 7.00   1st Qu.:14.0  
##  Median :2012   Median :19182   Median :4.000   Median :14.00   Median :29.0  
##  Mean   :2012   Mean   :19182   Mean   :4.144   Mean   :12.63   Mean   :28.4  
##  3rd Qu.:2013   3rd Qu.:19632   3rd Qu.:6.000   3rd Qu.:18.00   3rd Qu.:44.0  
##  Max.   :2014   Max.   :20088   Max.   :7.000   Max.   :23.00   Max.   :59.0  
##  NA's   :1833   NA's   :1833    NA's   :1833    NA's   :2940    NA's   :2961  
##       nhs            latitude        longitud          lgt_cond    
##  Min.   :0.0000   Min.   :18.96   Min.   :-165.41   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:33.26   1st Qu.: -97.72   1st Qu.:1.000  
##  Median :0.0000   Median :36.58   Median : -87.74   Median :2.000  
##  Mean   :0.3267   Mean   :36.72   Mean   : -91.90   Mean   :1.827  
##  3rd Qu.:1.0000   3rd Qu.:40.66   3rd Qu.: -81.33   3rd Qu.:2.000  
##  Max.   :1.0000   Max.   :68.43   Max.   : -66.99   Max.   :7.000  
##  NA's   :2321     NA's   :3481    NA's   :3481      NA's   :2485   
##     weather           fatals          drunk_dr      total_numoccs    
##  Min.   : 1.000   Min.   : 1.000   Min.   :0.0000   Min.   :  0.000  
##  1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.:0.0000   1st Qu.:  1.000  
##  Median : 1.000   Median : 1.000   Median :0.0000   Median :  2.000  
##  Mean   : 2.588   Mean   : 1.089   Mean   :0.3156   Mean   :  2.269  
##  3rd Qu.: 2.000   3rd Qu.: 1.000   3rd Qu.:1.0000   3rd Qu.:  3.000  
##  Max.   :12.000   Max.   :15.000   Max.   :4.0000   Max.   :127.000  
##  NA's   :3040     NA's   :1833     NA's   :1833     NA's   :921      
##  total_hit_run    total_registered_owner total_not_registered total_other_owner
##  Min.   :0.0000   Min.   : 0.00          Min.   : 0.00        Min.   : 0.000   
##  1st Qu.:0.0000   1st Qu.: 0.00          1st Qu.: 0.00        1st Qu.: 0.000   
##  Median :0.0000   Median : 1.00          Median : 0.00        Median : 0.000   
##  Mean   :0.0474   Mean   : 0.84          Mean   : 0.43        Mean   : 0.215   
##  3rd Qu.:0.0000   3rd Qu.: 1.00          3rd Qu.: 1.00        3rd Qu.: 0.000   
##  Max.   :1.0000   Max.   :49.00          Max.   :17.00        Max.   :23.000   
##  NA's   :806      NA's   :3837           NA's   :3837         NA's   :3837     
##  total_fire_exp   total_valid_license total_invalid_license  no_prev_acc    
##  Min.   :0.0000   Min.   : 0.000      Min.   :0.0000        Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.: 1.000      1st Qu.:0.0000        1st Qu.: 1.000  
##  Median :0.0000   Median : 1.000      Median :0.0000        Median : 1.000  
##  Mean   :0.0405   Mean   : 1.299      Mean   :0.1734        Mean   : 1.284  
##  3rd Qu.:0.0000   3rd Qu.: 2.000      3rd Qu.:0.0000        3rd Qu.: 2.000  
##  Max.   :1.0000   Max.   :83.000      Max.   :5.0000        Max.   :74.000  
##  NA's   :694      NA's   :4163        NA's   :694           NA's   :15103   
##   one_prev_acc     two_prev_acc    no_prev_sus      one_prev_sus  
##  Min.   : 0.000   Min.   :0.000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 0.000   1st Qu.:0.000   1st Qu.: 1.000   1st Qu.:0.000  
##  Median : 0.000   Median :0.000   Median : 1.000   Median :0.000  
##  Mean   : 0.153   Mean   :0.069   Mean   : 1.266   Mean   :0.102  
##  3rd Qu.: 0.000   3rd Qu.:0.000   3rd Qu.: 2.000   3rd Qu.:0.000  
##  Max.   :12.000   Max.   :7.000   Max.   :80.000   Max.   :6.000  
##  NA's   :15103    NA's   :15103   NA's   :4997     NA's   :4997   
##   two_prev_sus    no_prev_dwi      one_prev_dwi    no_prev_spd    
##  Min.   :0.000   Min.   : 0.000   Min.   :0.000   Min.   : 0.000  
##  1st Qu.:0.000   1st Qu.: 1.000   1st Qu.:0.000   1st Qu.: 1.000  
##  Median :0.000   Median : 1.000   Median :0.000   Median : 1.000  
##  Mean   :0.131   Mean   : 1.436   Mean   :0.033   Mean   : 1.227  
##  3rd Qu.:0.000   3rd Qu.: 2.000   3rd Qu.:0.000   3rd Qu.: 2.000  
##  Max.   :6.000   Max.   :88.000   Max.   :2.000   Max.   :76.000  
##  NA's   :4997    NA's   :4997     NA's   :4997    NA's   :4997    
##   one_prev_spd     no_prev_oth      one_prev_oth    speed_related  
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.: 0.000   1st Qu.:0.000  
##  Median : 0.000   Median : 1.000   Median : 0.000   Median :0.000  
##  Mean   : 0.183   Mean   : 1.222   Mean   : 0.174   Mean   :0.235  
##  3rd Qu.: 0.000   3rd Qu.: 2.000   3rd Qu.: 0.000   3rd Qu.:0.000  
##  Max.   :11.000   Max.   :71.000   Max.   :12.000   Max.   :1.000  
##  NA's   :4997     NA's   :4997     NA's   :4997     NA's   :6503   
##  dr_age_lower16  dr_age_lower18  dr_age_lower21  dr_age_lower30 
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :0.000   Median :0.000   Median :0.000  
##  Mean   :0.005   Mean   :0.035   Mean   :0.105   Mean   :0.316  
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:1.000  
##  Max.   :2.000   Max.   :3.000   Max.   :5.000   Max.   :9.000  
##  NA's   :4844    NA's   :4844    NA's   :4844    NA's   :4844   
##  dr_age_lower65   dr_age_65over       dr_male         dr_female     
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.: 0.000  
##  Median : 1.000   Median : 0.000   Median : 1.000   Median : 0.000  
##  Mean   : 0.826   Mean   : 0.194   Mean   : 1.094   Mean   : 0.387  
##  3rd Qu.: 1.000   3rd Qu.: 0.000   3rd Qu.: 1.000   3rd Qu.: 1.000  
##  Max.   :64.000   Max.   :11.000   Max.   :67.000   Max.   :23.000  
##  NA's   :4844     NA's   :4844     NA's   :4666     NA's   :4666    
##    drugs_inv      dr_alcohol_drug_med dr_other_impair total_moving_violations
##  Min.   :0.00     Min.   :0.00        Min.   :0.00    Min.   :0.0000         
##  1st Qu.:0.00     1st Qu.:0.00        1st Qu.:0.00    1st Qu.:0.0000         
##  Median :0.00     Median :0.00        Median :0.00    Median :0.0000         
##  Mean   :0.25     Mean   :0.27        Mean   :0.07    Mean   :0.1512         
##  3rd Qu.:1.00     3rd Qu.:1.00        3rd Qu.:0.00    3rd Qu.:0.0000         
##  Max.   :1.00     Max.   :1.00        Max.   :1.00    Max.   :1.0000         
##  NA's   :101756   NA's   :40450       NA's   :40450   NA's   :3152           
##  nm_alcohol_drug_med nm_other_impair  nm_involved       pernotmvit   
##  Min.   : 0.0        Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:99.0        1st Qu.:99.00   1st Qu.:0.0000   1st Qu.: 0.00  
##  Median :99.0        Median :99.00   Median :0.0000   Median : 0.00  
##  Mean   :87.3        Mean   :87.28   Mean   :0.1793   Mean   : 0.21  
##  3rd Qu.:99.0        3rd Qu.:99.00   3rd Qu.:0.0000   3rd Qu.: 0.00  
##  Max.   :99.0        Max.   :99.00   Max.   :1.0000   Max.   :21.00  
##  NA's   :10572       NA's   :10572                    NA's   :32129  
##     permvit      
##  Min.   :  0.00  
##  1st Qu.:  1.00  
##  Median :  2.00  
##  Mean   :  2.25  
##  3rd Qu.:  3.00  
##  Max.   :127.00  
##  NA's   :32129
str(data_input)
## 'data.frame':    153260 obs. of  58 variables:
##  $ state                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ st_case                : num  10001 10002 10003 10004 10005 ...
##  $ ve_total               : num  1 1 3 2 2 1 1 1 1 2 ...
##  $ ve_forms               : num  1 1 3 2 2 1 1 1 1 2 ...
##  $ county                 : num  81 35 97 115 117 21 97 9 51 113 ...
##  $ city                   : num  2340 1080 2100 160 0 0 0 0 0 0 ...
##  $ day                    : num  15 11 14 21 4 1 1 2 2 3 ...
##  $ month                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ year                   : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ date_SIF               : num  18277 18273 18276 18283 18266 ...
##  $ day_week               : num  6 2 5 5 2 6 6 7 7 1 ...
##  $ hour                   : num  4 6 15 1 6 5 2 15 9 13 ...
##  $ minute                 : num  10 0 10 16 38 34 40 37 55 50 ...
##  $ nhs                    : num  1 0 0 0 1 1 0 0 0 0 ...
##  $ latitude               : num  32.6 31.4 30.7 33.9 33.3 ...
##  $ longitud               : num  -85.4 -87 -88.1 -86.3 -86.8 ...
##  $ lgt_cond               : num  3 3 1 3 4 4 3 3 1 1 ...
##  $ weather                : num  1 1 1 2 1 2 2 1 1 1 ...
##  $ fatals                 : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ drunk_dr               : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ total_numoccs          : num  2 1 3 2 2 1 2 4 2 4 ...
##  $ total_hit_run          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_registered_owner : num  0 1 3 0 2 1 0 0 1 1 ...
##  $ total_not_registered   : num  1 0 0 0 0 0 1 1 0 0 ...
##  $ total_other_owner      : num  0 0 0 2 0 0 0 0 0 1 ...
##  $ total_fire_exp         : num  1 0 0 0 1 0 0 0 0 0 ...
##  $ total_valid_license    : num  1 1 3 2 2 1 1 1 1 1 ...
##  $ total_invalid_license  : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ no_prev_acc            : num  NA 1 1 1 2 1 0 1 1 2 ...
##  $ one_prev_acc           : num  NA 0 2 1 0 0 0 0 0 0 ...
##  $ two_prev_acc           : num  NA 0 0 0 0 0 1 0 0 0 ...
##  $ no_prev_sus            : num  0 1 3 2 2 1 1 1 1 1 ...
##  $ one_prev_sus           : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ two_prev_sus           : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ no_prev_dwi            : num  1 1 3 2 2 1 1 1 1 1 ...
##  $ one_prev_dwi           : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ no_prev_spd            : num  0 1 3 2 1 1 0 1 1 1 ...
##  $ one_prev_spd           : num  0 0 0 0 0 0 1 0 0 1 ...
##  $ no_prev_oth            : num  1 1 2 2 2 1 1 1 1 2 ...
##  $ one_prev_oth           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ speed_related          : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ dr_age_lower16         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower18         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower21         : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ dr_age_lower30         : num  0 0 2 1 0 0 0 0 0 1 ...
##  $ dr_age_lower65         : num  1 1 1 1 2 1 0 0 0 1 ...
##  $ dr_age_65over          : num  0 0 0 0 0 0 0 1 1 0 ...
##  $ dr_male                : num  0 0 1 2 2 1 0 1 0 2 ...
##  $ dr_female              : num  1 1 2 0 0 0 1 0 1 0 ...
##  $ drugs_inv              : num  NA 0 NA NA NA 0 NA NA NA NA ...
##  $ dr_alcohol_drug_med    : num  0 0 0 0 NA 0 0 0 0 1 ...
##  $ dr_other_impair        : num  1 0 0 0 NA 1 1 0 1 0 ...
##  $ total_moving_violations: num  0 0 0 0 0 0 0 0 0 1 ...
##  $ nm_alcohol_drug_med    : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_other_impair        : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_involved            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pernotmvit             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ permvit                : num  NA NA NA NA NA NA NA NA NA NA ...
#Find the proportion of missing data that occurs in each category of data
miss<-function(x){sum(is.na(x))/length(x)*100}                 #Create a function of calculate proportion
apply(data_input,2,miss)                                       #Run the function with our data
##                   state                 st_case                ve_total 
##               0.0000000               0.0000000               1.1960068 
##                ve_forms                  county                    city 
##               0.0000000               1.1992692               1.2338510 
##                     day                   month                    year 
##               1.1960068               1.1960068               1.1960068 
##                date_SIF                day_week                    hour 
##               1.1960068               1.1960068               1.9183088 
##                  minute                     nhs                latitude 
##               1.9320110               1.5144199               2.2713037 
##                longitud                lgt_cond                 weather 
##               2.2713037               1.6214276               1.9835574 
##                  fatals                drunk_dr           total_numoccs 
##               1.1960068               1.1960068               0.6009396 
##           total_hit_run  total_registered_owner    total_not_registered 
##               0.5259037               2.5035887               2.5035887 
##       total_other_owner          total_fire_exp     total_valid_license 
##               2.5035887               0.4528253               2.7162991 
##   total_invalid_license             no_prev_acc            one_prev_acc 
##               0.4528253               9.8544956               9.8544956 
##            two_prev_acc             no_prev_sus            one_prev_sus 
##               9.8544956               3.2604724               3.2604724 
##            two_prev_sus             no_prev_dwi            one_prev_dwi 
##               3.2604724               3.2604724               3.2604724 
##             no_prev_spd            one_prev_spd             no_prev_oth 
##               3.2604724               3.2604724               3.2604724 
##            one_prev_oth           speed_related          dr_age_lower16 
##               3.2604724               4.2431163               3.1606420 
##          dr_age_lower18          dr_age_lower21          dr_age_lower30 
##               3.1606420               3.1606420               3.1606420 
##          dr_age_lower65           dr_age_65over                 dr_male 
##               3.1606420               3.1606420               3.0444995 
##               dr_female               drugs_inv     dr_alcohol_drug_med 
##               3.0444995              66.3943625              26.3930575 
##         dr_other_impair total_moving_violations     nm_alcohol_drug_med 
##              26.3930575               2.0566358               6.8980817 
##         nm_other_impair             nm_involved              pernotmvit 
##               6.8980817               0.0000000              20.9637218 
##                 permvit 
##              20.9637218
#Remove the data with a missing ratio greater than 15%.
data <- dplyr::select(data_input,-permvit,-dr_other_impair,-drugs_inv,-pernotmvit )

#Clean all NA value
data <- data[complete.cases(data),]


#################################################################
# Create subset for regration analysis ####
#################################################################

#Create subset for weather and light
data_r<- dummy_cols(data,select_columns = "weather")        # 1 = clean, 5 = Snow
data_r <- dummy_cols(data_r,select_columns = "lgt_cond")    # 1 = Daylight; 2 = Dark - Not lighted; 3 = Dark - Lighted;
data_r$Valid_license_ratio  <- data_r$total_valid_license/data_r$ve_total*100
data_r$no_prev_spd_ratio  <- data_r$no_prev_spd/data_r$ve_total*100
str(data_r)
## 'data.frame':    92034 obs. of  74 variables:
##  $ state                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ st_case                : num  10002 10003 10004 10006 10007 ...
##  $ ve_total               : num  1 3 2 1 1 1 1 2 1 1 ...
##  $ ve_forms               : num  1 3 2 1 1 1 1 2 1 1 ...
##  $ county                 : num  35 97 115 21 97 9 51 113 95 91 ...
##  $ city                   : num  1080 2100 160 0 0 0 0 0 0 0 ...
##  $ day                    : num  11 14 21 1 1 2 2 3 4 4 ...
##  $ month                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ year                   : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ date_SIF               : num  18273 18276 18283 18263 18263 ...
##  $ day_week               : num  2 5 5 6 6 7 7 1 2 2 ...
##  $ hour                   : num  6 15 1 5 2 15 9 13 5 16 ...
##  $ minute                 : num  0 10 16 34 40 37 55 50 50 20 ...
##  $ nhs                    : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ latitude               : num  31.4 30.7 33.9 32.8 30.5 ...
##  $ longitud               : num  -87 -88.1 -86.3 -86.6 -88.3 ...
##  $ lgt_cond               : num  3 1 3 4 3 3 1 1 4 1 ...
##  $ weather                : num  1 1 2 2 2 1 1 1 1 1 ...
##  $ fatals                 : num  1 1 1 1 2 1 1 1 1 1 ...
##  $ drunk_dr               : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ total_numoccs          : num  1 3 2 1 2 4 2 4 1 3 ...
##  $ total_hit_run          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_registered_owner : num  1 3 0 1 0 0 1 1 0 1 ...
##  $ total_not_registered   : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ total_other_owner      : num  0 0 2 0 0 0 0 1 1 0 ...
##  $ total_fire_exp         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_valid_license    : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ total_invalid_license  : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ no_prev_acc            : num  1 1 1 1 0 1 1 2 1 1 ...
##  $ one_prev_acc           : num  0 2 1 0 0 0 0 0 0 0 ...
##  $ two_prev_acc           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ no_prev_sus            : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ one_prev_sus           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ two_prev_sus           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_prev_dwi            : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ one_prev_dwi           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ no_prev_spd            : num  1 3 2 1 0 1 1 1 0 0 ...
##  $ one_prev_spd           : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ no_prev_oth            : num  1 2 2 1 1 1 1 2 1 1 ...
##  $ one_prev_oth           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ speed_related          : num  0 0 0 0 0 0 1 0 1 1 ...
##  $ dr_age_lower16         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower18         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower21         : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ dr_age_lower30         : num  0 2 1 0 0 0 0 1 1 1 ...
##  $ dr_age_lower65         : num  1 1 1 1 0 0 0 1 0 0 ...
##  $ dr_age_65over          : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ dr_male                : num  0 1 2 1 0 1 0 2 1 1 ...
##  $ dr_female              : num  1 2 0 0 1 0 1 0 0 0 ...
##  $ dr_alcohol_drug_med    : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ total_moving_violations: num  0 0 0 0 0 0 0 1 0 1 ...
##  $ nm_alcohol_drug_med    : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_other_impair        : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_involved            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_1              : int  1 1 0 0 0 1 1 1 1 1 ...
##  $ weather_2              : int  0 0 1 1 1 0 0 0 0 0 ...
##  $ weather_3              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_4              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_5              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_6              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_7              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_8              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_10             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_11             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_12             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_1             : int  0 1 0 0 0 0 1 1 0 1 ...
##  $ lgt_cond_2             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_3             : int  1 0 1 0 1 1 0 0 0 0 ...
##  $ lgt_cond_4             : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ lgt_cond_5             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_6             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_7             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Valid_license_ratio    : num  100 100 100 100 100 100 100 50 100 100 ...
##  $ no_prev_spd_ratio      : num  100 100 100 100 0 100 100 50 0 0 ...
#################################################################
# correlation coefficient matrix ####
#################################################################
str(data_r)
## 'data.frame':    92034 obs. of  74 variables:
##  $ state                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ st_case                : num  10002 10003 10004 10006 10007 ...
##  $ ve_total               : num  1 3 2 1 1 1 1 2 1 1 ...
##  $ ve_forms               : num  1 3 2 1 1 1 1 2 1 1 ...
##  $ county                 : num  35 97 115 21 97 9 51 113 95 91 ...
##  $ city                   : num  1080 2100 160 0 0 0 0 0 0 0 ...
##  $ day                    : num  11 14 21 1 1 2 2 3 4 4 ...
##  $ month                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ year                   : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ date_SIF               : num  18273 18276 18283 18263 18263 ...
##  $ day_week               : num  2 5 5 6 6 7 7 1 2 2 ...
##  $ hour                   : num  6 15 1 5 2 15 9 13 5 16 ...
##  $ minute                 : num  0 10 16 34 40 37 55 50 50 20 ...
##  $ nhs                    : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ latitude               : num  31.4 30.7 33.9 32.8 30.5 ...
##  $ longitud               : num  -87 -88.1 -86.3 -86.6 -88.3 ...
##  $ lgt_cond               : num  3 1 3 4 3 3 1 1 4 1 ...
##  $ weather                : num  1 1 2 2 2 1 1 1 1 1 ...
##  $ fatals                 : num  1 1 1 1 2 1 1 1 1 1 ...
##  $ drunk_dr               : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ total_numoccs          : num  1 3 2 1 2 4 2 4 1 3 ...
##  $ total_hit_run          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_registered_owner : num  1 3 0 1 0 0 1 1 0 1 ...
##  $ total_not_registered   : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ total_other_owner      : num  0 0 2 0 0 0 0 1 1 0 ...
##  $ total_fire_exp         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_valid_license    : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ total_invalid_license  : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ no_prev_acc            : num  1 1 1 1 0 1 1 2 1 1 ...
##  $ one_prev_acc           : num  0 2 1 0 0 0 0 0 0 0 ...
##  $ two_prev_acc           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ no_prev_sus            : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ one_prev_sus           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ two_prev_sus           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_prev_dwi            : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ one_prev_dwi           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ no_prev_spd            : num  1 3 2 1 0 1 1 1 0 0 ...
##  $ one_prev_spd           : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ no_prev_oth            : num  1 2 2 1 1 1 1 2 1 1 ...
##  $ one_prev_oth           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ speed_related          : num  0 0 0 0 0 0 1 0 1 1 ...
##  $ dr_age_lower16         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower18         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower21         : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ dr_age_lower30         : num  0 2 1 0 0 0 0 1 1 1 ...
##  $ dr_age_lower65         : num  1 1 1 1 0 0 0 1 0 0 ...
##  $ dr_age_65over          : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ dr_male                : num  0 1 2 1 0 1 0 2 1 1 ...
##  $ dr_female              : num  1 2 0 0 1 0 1 0 0 0 ...
##  $ dr_alcohol_drug_med    : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ total_moving_violations: num  0 0 0 0 0 0 0 1 0 1 ...
##  $ nm_alcohol_drug_med    : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_other_impair        : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_involved            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_1              : int  1 1 0 0 0 1 1 1 1 1 ...
##  $ weather_2              : int  0 0 1 1 1 0 0 0 0 0 ...
##  $ weather_3              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_4              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_5              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_6              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_7              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_8              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_10             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_11             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_12             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_1             : int  0 1 0 0 0 0 1 1 0 1 ...
##  $ lgt_cond_2             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_3             : int  1 0 1 0 1 1 0 0 0 0 ...
##  $ lgt_cond_4             : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ lgt_cond_5             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_6             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_7             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Valid_license_ratio    : num  100 100 100 100 100 100 100 50 100 100 ...
##  $ no_prev_spd_ratio      : num  100 100 100 100 0 100 100 50 0 0 ...
data_c <- dplyr::select(data_r,speed_related,drunk_dr,Valid_license_ratio,no_prev_sus,no_prev_dwi,no_prev_spd,dr_age_65over,lgt_cond_1,lgt_cond_2)
str(data_c)
## 'data.frame':    92034 obs. of  9 variables:
##  $ speed_related      : num  0 0 0 0 0 0 1 0 1 1 ...
##  $ drunk_dr           : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ Valid_license_ratio: num  100 100 100 100 100 100 100 50 100 100 ...
##  $ no_prev_sus        : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ no_prev_dwi        : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ no_prev_spd        : num  1 3 2 1 0 1 1 1 0 0 ...
##  $ dr_age_65over      : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ lgt_cond_1         : int  0 1 0 0 0 0 1 1 0 1 ...
##  $ lgt_cond_2         : int  0 0 0 0 0 0 0 0 0 0 ...
cor_data<-cor(data_c)
cor(cor_data)
##                     speed_related   drunk_dr Valid_license_ratio no_prev_sus
## speed_related           1.0000000  0.4228224         -0.50368378  -0.7560828
## drunk_dr                0.4228224  1.0000000         -0.37224131  -0.5203786
## Valid_license_ratio    -0.5036838 -0.3722413          1.00000000   0.5206754
## no_prev_sus            -0.7560828 -0.5203786          0.52067538   1.0000000
## no_prev_dwi            -0.7451149 -0.4795151          0.47211829   0.9900590
## no_prev_spd            -0.7296817 -0.4670526          0.40812329   0.9614250
## dr_age_65over          -0.4763418 -0.6207393          0.04821349   0.3905728
## lgt_cond_1             -0.3441922 -0.6906966          0.09495841   0.3403067
## lgt_cond_2              0.2867490  0.5544491         -0.15259737  -0.4207686
##                     no_prev_dwi no_prev_spd dr_age_65over  lgt_cond_1
## speed_related        -0.7451149  -0.7296817   -0.47634178 -0.34419218
## drunk_dr             -0.4795151  -0.4670526   -0.62073927 -0.69069663
## Valid_license_ratio   0.4721183   0.4081233    0.04821349  0.09495841
## no_prev_sus           0.9900590   0.9614250    0.39057278  0.34030673
## no_prev_dwi           1.0000000   0.9750606    0.36647030  0.31510617
## no_prev_spd           0.9750606   1.0000000    0.37941447  0.31788429
## dr_age_65over         0.3664703   0.3794145    1.00000000  0.52762396
## lgt_cond_1            0.3151062   0.3178843    0.52762396  1.00000000
## lgt_cond_2           -0.4025139  -0.4057150   -0.52337502 -0.93105054
##                     lgt_cond_2
## speed_related        0.2867490
## drunk_dr             0.5544491
## Valid_license_ratio -0.1525974
## no_prev_sus         -0.4207686
## no_prev_dwi         -0.4025139
## no_prev_spd         -0.4057150
## dr_age_65over       -0.5233750
## lgt_cond_1          -0.9310505
## lgt_cond_2           1.0000000
corrplot(cor_data,)                                            #Plot correlation coefficient matrix

#################################################################
# Split the data into a train and test set ####
#################################################################
set.seed(123) 
trainIndex <- createDataPartition(data_r$speed_related,p = 0.70, list = FALSE)
train <- data_c[trainIndex,]                                  #Split for train set
test <- data_c[-trainIndex,]                                  #Split for test set

#################################################################
# Fit a logistic regression model ####
#################################################################
Fit1 <- glm(speed_related ~ drunk_dr+Valid_license_ratio+no_prev_dwi+no_prev_spd+no_prev_sus+
              dr_age_65over,data= train, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(Fit1)
## 
## Call:
## glm(formula = speed_related ~ drunk_dr + Valid_license_ratio + 
##     no_prev_dwi + no_prev_spd + no_prev_sus + dr_age_65over, 
##     family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1061  -0.7640  -0.4374  -0.1261   6.1910  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.7922831  0.0341623  23.192  < 2e-16 ***
## drunk_dr             0.6551503  0.0197466  33.178  < 2e-16 ***
## Valid_license_ratio -0.0022767  0.0002785  -8.175 2.96e-16 ***
## no_prev_dwi         -0.9971590  0.0322813 -30.890  < 2e-16 ***
## no_prev_spd         -0.4345266  0.0220119 -19.741  < 2e-16 ***
## no_prev_sus         -0.2149254  0.0270941  -7.933 2.15e-15 ***
## dr_age_65over       -0.3858241  0.0309390 -12.470  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 69975  on 64423  degrees of freedom
## Residual deviance: 59660  on 64417  degrees of freedom
## AIC: 59674
## 
## Number of Fisher Scoring iterations: 6
#################################################################
# Train set predictions ####
#################################################################
probabilities.train <- predict(Fit1,newdata = train, type="response")
predicted.classes.min <- as.factor(ifelse(probabilities.train >= 0.5, "1","0"))
train$speed_related= as.factor(train$speed_related)

confusionMatrix(predicted.classes.min,train$speed_related, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 47082 12370
##          1  2320  2652
##                                           
##                Accuracy : 0.772           
##                  95% CI : (0.7687, 0.7752)
##     No Information Rate : 0.7668          
##     P-Value [Acc > NIR] : 0.0009813       
##                                           
##                   Kappa : 0.1689          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.17654         
##             Specificity : 0.95304         
##          Pos Pred Value : 0.53339         
##          Neg Pred Value : 0.79193         
##              Prevalence : 0.23317         
##          Detection Rate : 0.04116         
##    Detection Prevalence : 0.07718         
##       Balanced Accuracy : 0.56479         
##                                           
##        'Positive' Class : 1               
## 
#################################################################
# test set predictions ####
#################################################################
probabilities.test <- predict(Fit1,newdata = test, type="response")
predicted.classes.min <- as.factor(ifelse(probabilities.test >= 0.5, "1","0"))
test$speed_related= as.factor(test$speed_related)

confusionMatrix(predicted.classes.min,test$speed_related, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 20113  5384
##          1   965  1148
##                                         
##                Accuracy : 0.77          
##                  95% CI : (0.765, 0.775)
##     No Information Rate : 0.7634        
##     P-Value [Acc > NIR] : 0.004777      
##                                         
##                   Kappa : 0.1695        
##                                         
##  Mcnemar's Test P-Value : < 2.2e-16     
##                                         
##             Sensitivity : 0.17575       
##             Specificity : 0.95422       
##          Pos Pred Value : 0.54330       
##          Neg Pred Value : 0.78884       
##              Prevalence : 0.23658       
##          Detection Rate : 0.04158       
##    Detection Prevalence : 0.07653       
##       Balanced Accuracy : 0.56498       
##                                         
##        'Positive' Class : 1             
## 
#view(probabilities.test)

#################################################################
# Creat ROC ####
#################################################################
ROC1 <- roc(test$speed_related,probabilities.test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC1, col= "Blue", ylab = "sensitivity ~ TP Rate", xlab = "Specificity ~ FP Rate")

auc1 = auc(test$speed_related,probabilities.test)
auc1
## [1] 0.7704526
#################################################################
# LASSO Analysis ####
#################################################################




#Spliting the data into 2. 80/20
set.seed(1234)
trainIndex <- sample(x=nrow(data), size=nrow(data)*0.8)
train <- data[trainIndex,]
test <- data[-trainIndex,]

x <- model.matrix(total_valid_license~.,train)[,-1]
y <- train$total_valid_license


lasso.mod <- glmnet(x, y, alpha=1)
l_cv.out <- cv.glmnet(x, y, alpha=1)
plot(l_cv.out)

best.lambda <- l_cv.out$lambda.min
best.lambda
## [1] 0.0006016217
max.lambda <-l_cv.out$lambda.1se
max.lambda
## [1] 0.001389824
group1 = as.data.frame(as.matrix(coef(l_cv.out)))

kable(group1,
      caption = "<center>Min Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Total valid License Model",
           general_title = "LASSO: ")
Min Model Coefficient
Coefficient
(Intercept) 0.0028736
state 0.0000000
st_case 0.0000000
ve_total 0.0000000
ve_forms 0.0000000
county -0.0000041
city 0.0000000
day 0.0000000
month 0.0000000
year 0.0000000
date_SIF 0.0000000
day_week 0.0000000
hour 0.0000000
minute 0.0000000
nhs 0.0000000
latitude 0.0003538
longitud 0.0000511
lgt_cond 0.0000000
weather 0.0000000
fatals 0.0000000
drunk_dr 0.0000000
total_numoccs 0.0005239
total_hit_run -0.0309499
total_registered_owner 0.1399413
total_not_registered 0.1382886
total_other_owner 0.1316912
total_fire_exp 0.0000000
total_invalid_license -0.9710006
no_prev_acc 0.3102732
one_prev_acc 0.3124176
two_prev_acc 0.2700281
no_prev_sus 0.0087693
one_prev_sus 0.0088239
two_prev_sus 0.0000000
no_prev_dwi 0.4783067
one_prev_dwi 0.4640121
no_prev_spd 0.0293923
one_prev_spd 0.0309741
no_prev_oth 0.0211508
one_prev_oth 0.0216682
speed_related -0.0027545
dr_age_lower16 0.0000000
dr_age_lower18 0.0000000
dr_age_lower21 0.0000000
dr_age_lower30 0.0017386
dr_age_lower65 0.0009423
dr_age_65over 0.0000000
dr_male 0.0073201
dr_female 0.0051949
dr_alcohol_drug_med 0.0000000
total_moving_violations 0.0000000
nm_alcohol_drug_med 0.0000097
nm_other_impair 0.0000026
nm_involved 0.0000000
LASSO: Total valid License Model
model=lm(speed_related ~ ., data = data)
summary(model)
## 
## Call:
## lm(formula = speed_related ~ ., data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7375 -0.2703 -0.1404  0.1145 12.2685 
## 
## Coefficients: (1 not defined because of singularities)
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.746e+03  1.414e+03   1.942 0.052117 .  
## state                    7.630e-02  2.776e-02   2.749 0.005978 ** 
## st_case                 -7.479e-06  2.775e-06  -2.695 0.007030 ** 
## ve_total                 3.764e-02  5.351e-03   7.034 2.02e-12 ***
## ve_forms                 3.409e-02  2.372e-02   1.437 0.150748    
## county                  -5.110e-05  1.394e-05  -3.667 0.000246 ***
## city                     3.029e-06  6.828e-07   4.437 9.15e-06 ***
## day                     -3.697e-03  1.976e-03  -1.871 0.061311 .  
## month                   -1.169e-01  6.012e-02  -1.944 0.051935 .  
## year                    -1.399e+00  7.214e-01  -1.939 0.052512 .  
## date_SIF                 3.822e-03  1.975e-03   1.935 0.052954 .  
## day_week                -2.938e-05  6.136e-04  -0.048 0.961808    
## hour                    -1.197e-03  1.928e-04  -6.207 5.41e-10 ***
## minute                   1.338e-05  7.395e-05   0.181 0.856380    
## nhs                     -3.844e-02  2.836e-03 -13.555  < 2e-16 ***
## latitude                 7.849e-04  3.111e-04   2.523 0.011638 *  
## longitud                -5.891e-04  9.925e-05  -5.936 2.93e-09 ***
## lgt_cond                 1.056e-02  1.399e-03   7.547 4.50e-14 ***
## weather                 -9.013e-04  4.012e-04  -2.246 0.024683 *  
## fatals                   3.495e-04  3.692e-03   0.095 0.924565    
## drunk_dr                 5.855e-02  3.831e-03  15.283  < 2e-16 ***
## total_numoccs            2.016e-03  8.631e-04   2.336 0.019501 *  
## total_hit_run           -2.242e-02  9.103e-03  -2.463 0.013779 *  
## total_registered_owner   7.757e-02  1.846e-02   4.201 2.66e-05 ***
## total_not_registered     8.992e-02  1.852e-02   4.856 1.20e-06 ***
## total_other_owner        8.297e-02  1.845e-02   4.497 6.90e-06 ***
## total_fire_exp           4.354e-02  6.813e-03   6.391 1.66e-10 ***
## total_valid_license     -2.160e-01  1.892e-02 -11.414  < 2e-16 ***
## total_invalid_license   -1.956e-01  1.888e-02 -10.360  < 2e-16 ***
## no_prev_acc             -1.509e-02  1.993e-02  -0.757 0.448909    
## one_prev_acc            -1.856e-02  2.014e-02  -0.921 0.356837    
## two_prev_acc            -3.269e-02  1.905e-02  -1.716 0.086169 .  
## no_prev_sus             -2.387e-02  4.761e-03  -5.015 5.32e-07 ***
## one_prev_sus             2.239e-03  5.731e-03   0.391 0.696034    
## two_prev_sus                    NA         NA      NA       NA    
## no_prev_dwi              5.250e-02  1.348e-02   3.896 9.80e-05 ***
## one_prev_dwi             5.214e-02  1.499e-02   3.479 0.000503 ***
## no_prev_spd             -1.289e-02  5.166e-03  -2.495 0.012604 *  
## one_prev_spd             9.250e-03  5.753e-03   1.608 0.107895    
## no_prev_oth             -2.064e-02  5.038e-03  -4.097 4.19e-05 ***
## one_prev_oth            -6.935e-04  5.669e-03  -0.122 0.902636    
## dr_age_lower16           1.678e-01  5.122e-02   3.276 0.001054 ** 
## dr_age_lower18           1.410e-01  4.798e-02   2.939 0.003289 ** 
## dr_age_lower21           1.142e-01  4.765e-02   2.397 0.016510 *  
## dr_age_lower30           8.688e-02  4.756e-02   1.827 0.067711 .  
## dr_age_lower65           6.331e-02  4.752e-02   1.332 0.182766    
## dr_age_65over            2.930e-02  4.759e-02   0.616 0.538091    
## dr_male                 -1.335e-01  5.044e-02  -2.646 0.008142 ** 
## dr_female               -1.499e-01  5.049e-02  -2.968 0.002995 ** 
## dr_alcohol_drug_med      4.957e-02  4.226e-03  11.728  < 2e-16 ***
## total_moving_violations -3.120e-02  3.643e-03  -8.564  < 2e-16 ***
## nm_alcohol_drug_med     -2.337e-02  8.423e-03  -2.775 0.005524 ** 
## nm_other_impair         -1.837e-02  1.572e-02  -1.168 0.242615    
## nm_involved             -4.307e+00  1.802e+00  -2.390 0.016838 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3921 on 91981 degrees of freedom
## Multiple R-squared:  0.1433, Adjusted R-squared:  0.1428 
## F-statistic:   296 on 52 and 91981 DF,  p-value: < 2.2e-16
#bothmodel = ols_step_both_aic(model)
#ols_step_forward_p(model, penter = 0.3)
#ols_step_backward_p(model)
#Hypothesis

#H0: Fatalities by year were independent of drunk driver and speeding. 
#H1: Fatalities by year were dependent of drunk driver and speeding (claim).

CLchi = 0.98
alphch = 1 - CLchi

drunkdriver = Groupproject %>%
  group_by(year) %>%
  dplyr::select(drunk_dr, speed_related, fatals)

drunkdriver = na.omit(drunkdriver)

chisq = chisq.test(drunkdriver)

hypochires = ifelse(chisq$p.value > alphch, "Do not reject the null hypothesis","Reject the null hypothesis")


Tskhypores_1 = rbind(CLchi*100, alphch, round(chisq$statistic,3), round(chisq$p.value,0), hypochires)
col_nhypochi = c("Hypothesis Result")
row_nhypochi = c("Confidence Level(%):", "Alpha:", "X^2 Value:","p value:","Hypothesis Result:")
dimnames(Tskhypores_1) = list(row_nhypochi,col_nhypochi)

kable(Tskhypores_1,
        caption = "<center>Chi-Square Independence Test</center>",
        align = "c",
       booktabs = TRUE)%>%
    kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "H0: Fatalities by year were independent of drunk driver and speeding.\n 
H1: Fatalities by year were dependent of drunk driver and speeding (claim).")
Chi-Square Independence Test
Hypothesis Result
Confidence Level(%): 98
Alpha: 0.02
X^2 Value: 236704.315
p value: 1
Hypothesis Result: Do not reject the null hypothesis
Note: H0: Fatalities by year were independent of drunk driver and speeding.

H1: Fatalities by year were dependent of drunk driver and speeding (claim).
#Hypothesis 

#H0: There is no interaction effect between year and type of
#Nation Highway System on Fatalities.
#H1: There is an interaction effect between year and type of
#Nation Highway System on Fatalities.

#The hypotheses for the National Highway System types are
#H0: There is no difference between the means of Fatalities for
#two types of National Highway System.
#H1: There is a difference between the means of Fatalities for
#two types of National Highway System.

MAfatal = MH_acc %>%
  group_by(year, nhs) %>%
  dplyr::select(st_case, fatals) %>%
  mutate(Total_cases = length(st_case),
         Total_fatal = sum(fatals))
MAfatal = na.omit(MAfatal)

#MAfatal$year = as.factor(MAfatal$year)
#MAfatal$nhs = as.factor(MAfatal$nhs)

NHfatal = NH_acc %>%
  group_by(year, nhs) %>%
  dplyr::select(st_case, fatals) %>%
  mutate(Total_cases = length(st_case),
         Total_fatal = sum(fatals))
NHfatal = na.omit(NHfatal)

#NHfatal$year = as.factor(NHfatal$year)
#NHfatal$nhs = as.factor(NHfatal$nhs)

NHfatal = NHfatal %>% group_by(year) %>% 
  distinct(year, nhs, Total_cases, Total_fatal) %>%
  mutate(State  = rep(c("NH")))

MAfatal = MAfatal %>% group_by(year) %>% 
  distinct(year, nhs, Total_cases, Total_fatal) %>%
  mutate(State  = rep(c("MA")))


MAandNH = rbind(NHfatal, MAfatal)
MAandNH$year = as.factor(MAandNH$year)
MAandNH$nhs = as.factor(MAandNH$nhs)

MAandNH %>%
  group_by(State, nhs) %>%
  summarise(mean = mean(Total_fatal),
            sd = sd(Total_fatal))
MANH = qplot(interaction(State,nhs),Total_fatal , data=MAandNH, geom="boxplot")
MANH + aes(fill = nhs) + 
  scale_fill_manual(values = c("turquoise", "salmon"),
    labels = c('Non_NHS', 'NHS'),
    name = "Accident type")+
  labs(title = "Fatality in MA and NH",
       x = "Group by State and NHS",
       y = "Total Fatalities")+
  theme(plot.title = element_text(hjust = 0.5))+
  scale_x_discrete(labels=c("MA.Non_NHS", "NH.Non_NHS", "MA.NHS", "NH.NHS"))

MAandNHANOVA = aov(Total_fatal ~ year * nhs, data = MAandNH)
MAandNHANOVA_summary = summary(MAandNHANOVA)

#plot(TukeyHSD(MAandNHANOVA), las = 2)

##Hypothesis Testing#####

CL = 0.95
alph = 1 - CL

Fvalinteraction = MAandNHANOVA_summary[[1]][3, "F value"] #Fvalue of interaction
Fvalnhs = MAandNHANOVA_summary[[1]][2, "F value"] #Fvalue of interaction

DofNnhs = MAandNHANOVA_summary[[1]][2, "Df"] #(a-1)(b-1) a is levels in light, b is levels in food
DofDnhs = MAandNHANOVA_summary[[1]][4, "Df"] # (ab)(n-1) n is the number of data values in each group
ANDofNinteraction = MAandNHANOVA_summary[[1]][3, "Df"] #(a-1)(b-1) 
ANDofDinteraction = MAandNHANOVA_summary[[1]][4, "Df"] # (ab)(n-1) n is the number of data values in each group

ANCVinteraction = qf(alph, ANDofNinteraction, ANDofDinteraction, lower.tail = FALSE)
ANCVnhs = qf(alph, DofNnhs, DofDnhs, lower.tail = FALSE)

hypoANOVAinteraction = ifelse(Fvalinteraction < ANCVinteraction, "Do not reject the null hypothesis (Fvalue < CV)", "Reject the null hypothesis (Fvalue > CV)")

hypoANOVAnhs = ifelse(Fvalnhs < ANCVnhs, "Do not reject the null hypothesis (Fvalue < CV)", "Reject the null hypothesis (Fvalue > CV)")


##Information Representation#####
nhsfatal <- 
  MAandNH %>% 
  group_by(nhs, year) %>% # group by the two factors
  summarise(Means = mean(Total_fatal), SEs = sd(Total_fatal)/sqrt(n())) # Mean and Std Er
ggplot(nhsfatal, 
       aes(x = nhs, y = Means, fill = year,
           ymin = Means - SEs, ymax = Means + SEs)) +
  # this adds the mean
  geom_col(position = position_dodge()) +
  # this adds the error bars
  geom_errorbar(position = position_dodge(0.9), width=.2) +
  # controlling the appearance
  xlab("NHS") + ylab("Fatalities") +
  scale_x_discrete(labels=c("Non-NHS", "NHS"))

hyporesANOVA = rbind(CL, alph, round(ANCVinteraction,3), round(Fvalinteraction,3), hypoANOVAinteraction)
col_twoAN = c("Hypothesis Result")
row_twoAN = c("Confidence Level(%):", "Alpha:", "CV:","F value:","Hypothesis Result:")
dimnames(hyporesANOVA) = list(row_twoAN, col_twoAN)

kable(hyporesANOVA,
        caption = "<center>Two way ANOVA interaction Test</center>",
        align = "c",
       booktabs = TRUE) %>%
    kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "𝐻0: There is no interaction effect between year and type of Nation Highway System on Fatalities and \n 𝐻1: There is an interaction effect between year and type of Nation Highway System on Fatalities.")
Two way ANOVA interaction Test
Hypothesis Result
Confidence Level(%): 0.95
Alpha: 0.05
CV: 3.478
F value: 0.014
Hypothesis Result: Do not reject the null hypothesis (Fvalue < CV)
Note: 𝐻0: There is no interaction effect between year and type of Nation Highway System on Fatalities and
𝐻1: There is an interaction effect between year and type of Nation Highway System on Fatalities.
hyporestwoANOVAnhs = rbind(CL, alph, round(ANCVnhs,3), round(Fvalnhs,3), hypoANOVAnhs)
col_twoANOVAnhs = c("Hypothesis Result")
row_twoANOVAnhs = c("Confidence Level(%):", "Alpha:", "CV:","F value:","Hypothesis Result:")
dimnames(hyporestwoANOVAnhs) = list(row_twoANOVAnhs, col_twoANOVAnhs)

kable(hyporestwoANOVAnhs,
        caption = "<center>Two way ANOVA mean of Fatalities to NHS test</center>",
        align = "c",
       booktabs = TRUE) %>%
    kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "H0: There is no difference between the means of Fatalities for two types of National Highway System and \n H1: There is a difference between the means of Fatalities for two types of National Highway System.")
Two way ANOVA mean of Fatalities to NHS test
Hypothesis Result
Confidence Level(%): 0.95
Alpha: 0.05
CV: 4.965
F value: 4.169
Hypothesis Result: Do not reject the null hypothesis (Fvalue < CV)
Note: H0: There is no difference between the means of Fatalities for two types of National Highway System and
H1: There is a difference between the means of Fatalities for two types of National Highway System.
data_t<- dplyr::select(data_input,fatals,permvit,pernotmvit,speed_related)  #clean data for t test
data_t<- data_t[complete.cases(data_t),]   #Clean all NA value
data_t
#Fatalities ratio
data_t$Fatals_ratio  <- data_t$fatals/(data_t$pernotmvit+data_t$permvit)*100

speed= subset(data_t, subset=(data_t$speed_related=="1"))    #Selet speed dataset
no_speed = subset(data_t, subset=(data_t$speed_related=="0"))  #Selet no speed dataset

data_t
data_t$speed <- ifelse(data_t$speed_related==0,"No","Yes") #Creat subset to show "no"and"yes" for speed



ggboxplot(data_t, x = "speed", y = "Fatals_ratio", 
          color = "speed", palette = c("#00AFBB", "#E7B800"),
          ylab = "Fatalities ratio", xlab = "speed conditon")+ #creat boxplot
  coord_cartesian(ylim = c(0, 120))

t.test(no_speed$Fatals_ratio,speed$Fatals_ratio, alternative = "two.sided", var.equal = FALSE)  #t test
## 
##  Welch Two Sample t-test
## 
## data:  no_speed$Fatals_ratio and speed$Fatals_ratio
## t = -74.401, df = 41321, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -16.00511 -15.18348
## sample estimates:
## mean of x mean of y 
##  55.00323  70.59752
df <- Groupproject 
df1 <- df %>%  group_by(state,year) %>% mutate(fatal = sum(fatals))
df1 <- df1 %>% filter(row_number()==1)


pop <- read.csv("Datasets/nst-est2019-alldata.csv") %>% dplyr::select(POPESTIMATE2014,STATE,NAME )



df2 <- merge(pop, df1,by.x="STATE", by.y="state", all.y=T)

df2$fatality = df2$fatal/df2$POPESTIMATE2014 * 100000
# question: what is the rate of fatality in each state in 2014

df3 = dplyr::select(df2, NAME, fatality)
df3 %<>% na.omit()


df3 %>%
  kable()%>%
  kable_styling(bootstrap_options = "bordered")
NAME fatality
1 Alabama 17.803300
2 Alabama 17.617419
3 Alabama 17.865260
4 Alabama 18.484865
6 Alabama 16.935854
7 Alaska 7.605771
8 Alaska 8.013223
9 Alaska 9.914666
10 Alaska 6.926684
11 Alaska 9.778849
12 Arizona 12.198360
13 Arizona 11.277168
15 Arizona 12.614382
16 Arizona 12.272649
17 Arizona 11.485179
18 Arkansas 19.242486
19 Arkansas 18.871790
20 Arkansas 16.782414
21 Arkansas 15.838824
23 Arkansas 18.568494
25 California 7.684541
26 California 7.295909
27 California 8.049854
28 California 7.047185
29 California 8.036900
31 Colorado 8.859646
32 Colorado 8.354983
33 Colorado 9.009176
34 Colorado 8.411056
35 Colorado 9.121323
36 Connecticut 6.148241
37 Connecticut 7.344505
38 Connecticut 8.902430
39 Connecticut 7.956547
40 Connecticut 6.899384
42 Delaware 12.225371
43 Delaware 10.831250
45 Delaware 10.616770
46 Delaware 13.297773
47 Delaware 10.616770
48 District of Columbia 3.019652
49 District of Columbia 2.264739
50 District of Columbia 3.623582
51 District of Columbia 4.076530
52 District of Columbia 3.472600
53 Florida 12.249375
54 Florida 12.314879
55 Florida 12.093171
56 Florida 12.566820
57 Florida 12.108288
59 Georgia 11.721142
60 Georgia 12.386665
61 Georgia 11.840341
62 Georgia 12.178068
64 Georgia 11.562212
65 Hawaii 7.069446
66 Hawaii 6.715974
67 Hawaii 8.836808
68 Hawaii 7.210835
69 Hawaii 7.988474
71 Idaho 11.280648
72 Idaho 11.403263
73 Idaho 12.813344
74 Idaho 10.238414
76 Idaho 13.119884
77 Illinois 7.171411
78 Illinois 7.419772
79 Illinois 7.124844
81 Illinois 7.691416
82 Illinois 7.194695
84 Indiana 11.844740
85 Indiana 11.890239
86 Indiana 11.298760
87 Indiana 11.389757
88 Indiana 11.435255
89 Iowa 11.738788
90 Iowa 10.195057
91 Iowa 10.355862
93 Iowa 11.577982
94 Iowa 12.542814
95 Kansas 13.273688
97 Kansas 13.963230
98 Kansas 13.308165
99 Kansas 12.066989
100 Kansas 14.859635
101 Kentucky 15.223083
102 Kentucky 16.899434
104 Kentucky 14.452867
105 Kentucky 17.216582
106 Kentucky 16.310446
107 Louisiana 15.137770
108 Louisiana 15.568432
110 Louisiana 14.642508
111 Louisiana 15.934495
112 Louisiana 15.525366
114 Maine 12.326073
115 Maine 10.221621
116 Maine 10.822893
117 Maine 12.100596
118 Maine 9.845826
119 Maryland 8.141295
120 Maryland 8.577736
122 Maryland 8.325943
123 Maryland 7.805572
124 Maryland 7.419490
125 Massachusetts 5.663506
127 Massachusetts 5.131166
128 Massachusetts 5.190315
129 Massachusetts 5.530421
130 Massachusetts 5.234676
131 Michigan 9.466409
132 Michigan 9.486550
133 Michigan 9.536903
134 Michigan 9.073654
135 Michigan 8.952806
137 Minnesota 7.099512
138 Minnesota 6.750957
139 Minnesota 7.246272
140 Minnesota 6.622542
141 Minnesota 7.539792
143 Mississippi 21.434772
144 Mississippi 21.066937
145 Mississippi 20.297826
146 Mississippi 20.498464
148 Mississippi 19.461837
149 Missouri 12.648191
150 Missouri 13.556351
151 Missouri 12.978431
153 Missouri 13.638911
154 Missouri 12.499583
156 Montana 18.789101
157 Montana 20.452720
158 Montana 18.495521
159 Montana 22.409917
160 Montana 20.061280
161 Nebraska 10.110034
162 Nebraska 11.972409
163 Nebraska 9.631138
164 Nebraska 11.280670
165 Nebraska 11.227459
167 Nevada 9.440565
168 Nevada 10.327836
169 Nevada 9.263111
170 Nevada 8.730748
171 Nevada 9.121147
173 New Hampshire 9.599945
174 New Hampshire 10.124942
175 New Hampshire 8.099953
176 New Hampshire 6.749961
178 New Hampshire 7.124959
179 New Jersey 7.073137
180 New Jersey 6.272192
181 New Jersey 6.272192
182 New Jersey 6.644462
183 New Jersey 6.114259
185 New Mexico 18.472718
186 New Mexico 17.515582
187 New Mexico 14.883459
188 New Mexico 16.749874
189 New Mexico 16.702017
191 New York 5.297427
192 New York 6.004768
193 New York 6.116722
195 New York 5.958969
196 New York 6.111633
198 North Carolina 12.383107
199 North Carolina 13.077769
200 North Carolina 12.926755
201 North Carolina 13.289188
202 North Carolina 12.987161
204 North Dakota 20.070491
205 North Dakota 23.053942
206 North Dakota 14.239200
207 North Dakota 20.070491
208 North Dakota 18.307542
209 Ohio 9.661544
211 Ohio 9.308178
212 Ohio 8.523878
213 Ohio 8.670396
214 Ohio 8.765201
215 Oklahoma 17.946530
216 Oklahoma 17.250329
217 Oklahoma 18.281738
218 Oklahoma 17.224543
219 Oklahoma 17.482396
221 Oregon 9.007772
222 Oregon 8.503135
223 Oregon 7.998498
224 Oregon 7.897571
225 Oregon 8.351744
227 Pennsylvania 9.344469
228 Pennsylvania 10.056057
229 Pennsylvania 10.353203
230 Pennsylvania 9.461764
231 Pennsylvania 10.243728
233 Rhode Island 6.060973
234 Rhode Island 4.829838
235 Rhode Island 6.155676
236 Rhode Island 6.345082
237 Rhode Island 6.250379
238 South Carolina 17.061885
239 South Carolina 17.165542
241 South Carolina 16.771647
242 South Carolina 15.900931
243 South Carolina 17.891139
244 South Dakota 15.663109
245 South Dakota 16.016412
246 South Dakota 16.487483
247 South Dakota 13.072219
249 South Dakota 15.898644
250 Tennessee 15.516976
251 Tennessee 14.324538
252 Tennessee 15.776866
253 Tennessee 15.211223
255 Tennessee 14.722018
256 Texas 12.638918
257 Texas 11.211106
258 Texas 12.568455
259 Texas 11.326073
260 Texas 13.113619
262 Utah 7.388796
263 Utah 7.490945
264 Utah 8.614587
265 Utah 8.274090
266 Utah 8.716736
267 Vermont 12.315783
268 Vermont 11.036221
270 Vermont 8.796988
271 Vermont 11.356112
272 Vermont 7.037590
273 Virginia 9.337031
274 Virginia 8.903870
276 Virginia 9.192644
277 Virginia 8.903870
278 Virginia 8.458676
280 Washington 6.435467
281 Washington 6.208666
282 Washington 6.520517
283 Washington 6.180316
284 Washington 6.548867
285 West Virginia 18.275318
287 West Virginia 17.031731
288 West Virginia 17.950904
289 West Virginia 18.329387
290 West Virginia 14.706765
291 Wisconsin 10.692816
292 Wisconsin 9.945189
293 Wisconsin 9.440974
294 Wisconsin 10.119055
295 Wisconsin 8.797667
297 Wyoming 14.934828
298 Wyoming 23.174732
299 Wyoming 26.608026
300 Wyoming 21.114756
301 Wyoming 25.749703