FARS Analysis

#################################################################
#  Import dataset ####
#################################################################
data_input<- Groupproject

#################################################################
# Analysis Data ####
#################################################################
summary(data_input)

##      state          st_case          ve_total         ve_forms     
##  Min.   : 1.00   Min.   : 10001   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.:12.00   1st Qu.:122120   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median :28.00   Median :280279   Median : 1.000   Median : 1.000  
##  Mean   :27.95   Mean   :280092   Mean   : 1.522   Mean   : 1.486  
##  3rd Qu.:42.00   3rd Qu.:420723   3rd Qu.: 2.000   3rd Qu.: 2.000  
##  Max.   :56.00   Max.   :560139   Max.   :90.000   Max.   :90.000  
##                                   NA's   :1833                     
##      county            city           day            month       
##  Min.   :  0.00   Min.   :   0   Min.   : 1.00   Min.   : 1.000  
##  1st Qu.: 31.00   1st Qu.:   0   1st Qu.: 8.00   1st Qu.: 4.000  
##  Median : 71.00   Median :   0   Median :16.00   Median : 7.000  
##  Mean   : 91.54   Mean   :1194   Mean   :15.67   Mean   : 6.754  
##  3rd Qu.:115.00   3rd Qu.:1940   3rd Qu.:23.00   3rd Qu.:10.000  
##  Max.   :997.00   Max.   :9997   Max.   :31.00   Max.   :12.000  
##  NA's   :1838     NA's   :1891   NA's   :1833    NA's   :1833    
##       year         date_SIF        day_week          hour           minute    
##  Min.   :2010   Min.   :18263   Min.   :1.000   Min.   : 0.00   Min.   : 0.0  
##  1st Qu.:2011   1st Qu.:18735   1st Qu.:2.000   1st Qu.: 7.00   1st Qu.:14.0  
##  Median :2012   Median :19182   Median :4.000   Median :14.00   Median :29.0  
##  Mean   :2012   Mean   :19182   Mean   :4.144   Mean   :12.63   Mean   :28.4  
##  3rd Qu.:2013   3rd Qu.:19632   3rd Qu.:6.000   3rd Qu.:18.00   3rd Qu.:44.0  
##  Max.   :2014   Max.   :20088   Max.   :7.000   Max.   :23.00   Max.   :59.0  
##  NA's   :1833   NA's   :1833    NA's   :1833    NA's   :2940    NA's   :2961  
##       nhs            latitude        longitud          lgt_cond    
##  Min.   :0.0000   Min.   :18.96   Min.   :-165.41   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:33.26   1st Qu.: -97.72   1st Qu.:1.000  
##  Median :0.0000   Median :36.58   Median : -87.74   Median :2.000  
##  Mean   :0.3267   Mean   :36.72   Mean   : -91.90   Mean   :1.827  
##  3rd Qu.:1.0000   3rd Qu.:40.66   3rd Qu.: -81.33   3rd Qu.:2.000  
##  Max.   :1.0000   Max.   :68.43   Max.   : -66.99   Max.   :7.000  
##  NA's   :2321     NA's   :3481    NA's   :3481      NA's   :2485   
##     weather           fatals          drunk_dr      total_numoccs    
##  Min.   : 1.000   Min.   : 1.000   Min.   :0.0000   Min.   :  0.000  
##  1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.:0.0000   1st Qu.:  1.000  
##  Median : 1.000   Median : 1.000   Median :0.0000   Median :  2.000  
##  Mean   : 2.588   Mean   : 1.089   Mean   :0.3156   Mean   :  2.269  
##  3rd Qu.: 2.000   3rd Qu.: 1.000   3rd Qu.:1.0000   3rd Qu.:  3.000  
##  Max.   :12.000   Max.   :15.000   Max.   :4.0000   Max.   :127.000  
##  NA's   :3040     NA's   :1833     NA's   :1833     NA's   :921      
##  total_hit_run    total_registered_owner total_not_registered total_other_owner
##  Min.   :0.0000   Min.   : 0.00          Min.   : 0.00        Min.   : 0.000   
##  1st Qu.:0.0000   1st Qu.: 0.00          1st Qu.: 0.00        1st Qu.: 0.000   
##  Median :0.0000   Median : 1.00          Median : 0.00        Median : 0.000   
##  Mean   :0.0474   Mean   : 0.84          Mean   : 0.43        Mean   : 0.215   
##  3rd Qu.:0.0000   3rd Qu.: 1.00          3rd Qu.: 1.00        3rd Qu.: 0.000   
##  Max.   :1.0000   Max.   :49.00          Max.   :17.00        Max.   :23.000   
##  NA's   :806      NA's   :3837           NA's   :3837         NA's   :3837     
##  total_fire_exp   total_valid_license total_invalid_license  no_prev_acc    
##  Min.   :0.0000   Min.   : 0.000      Min.   :0.0000        Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.: 1.000      1st Qu.:0.0000        1st Qu.: 1.000  
##  Median :0.0000   Median : 1.000      Median :0.0000        Median : 1.000  
##  Mean   :0.0405   Mean   : 1.299      Mean   :0.1734        Mean   : 1.284  
##  3rd Qu.:0.0000   3rd Qu.: 2.000      3rd Qu.:0.0000        3rd Qu.: 2.000  
##  Max.   :1.0000   Max.   :83.000      Max.   :5.0000        Max.   :74.000  
##  NA's   :694      NA's   :4163        NA's   :694           NA's   :15103   
##   one_prev_acc     two_prev_acc    no_prev_sus      one_prev_sus  
##  Min.   : 0.000   Min.   :0.000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 0.000   1st Qu.:0.000   1st Qu.: 1.000   1st Qu.:0.000  
##  Median : 0.000   Median :0.000   Median : 1.000   Median :0.000  
##  Mean   : 0.153   Mean   :0.069   Mean   : 1.266   Mean   :0.102  
##  3rd Qu.: 0.000   3rd Qu.:0.000   3rd Qu.: 2.000   3rd Qu.:0.000  
##  Max.   :12.000   Max.   :7.000   Max.   :80.000   Max.   :6.000  
##  NA's   :15103    NA's   :15103   NA's   :4997     NA's   :4997   
##   two_prev_sus    no_prev_dwi      one_prev_dwi    no_prev_spd    
##  Min.   :0.000   Min.   : 0.000   Min.   :0.000   Min.   : 0.000  
##  1st Qu.:0.000   1st Qu.: 1.000   1st Qu.:0.000   1st Qu.: 1.000  
##  Median :0.000   Median : 1.000   Median :0.000   Median : 1.000  
##  Mean   :0.131   Mean   : 1.436   Mean   :0.033   Mean   : 1.227  
##  3rd Qu.:0.000   3rd Qu.: 2.000   3rd Qu.:0.000   3rd Qu.: 2.000  
##  Max.   :6.000   Max.   :88.000   Max.   :2.000   Max.   :76.000  
##  NA's   :4997    NA's   :4997     NA's   :4997    NA's   :4997    
##   one_prev_spd     no_prev_oth      one_prev_oth    speed_related  
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.: 0.000   1st Qu.:0.000  
##  Median : 0.000   Median : 1.000   Median : 0.000   Median :0.000  
##  Mean   : 0.183   Mean   : 1.222   Mean   : 0.174   Mean   :0.235  
##  3rd Qu.: 0.000   3rd Qu.: 2.000   3rd Qu.: 0.000   3rd Qu.:0.000  
##  Max.   :11.000   Max.   :71.000   Max.   :12.000   Max.   :1.000  
##  NA's   :4997     NA's   :4997     NA's   :4997     NA's   :6503   
##  dr_age_lower16  dr_age_lower18  dr_age_lower21  dr_age_lower30 
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :0.000   Median :0.000   Median :0.000  
##  Mean   :0.005   Mean   :0.035   Mean   :0.105   Mean   :0.316  
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:1.000  
##  Max.   :2.000   Max.   :3.000   Max.   :5.000   Max.   :9.000  
##  NA's   :4844    NA's   :4844    NA's   :4844    NA's   :4844   
##  dr_age_lower65   dr_age_65over       dr_male         dr_female     
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.: 0.000  
##  Median : 1.000   Median : 0.000   Median : 1.000   Median : 0.000  
##  Mean   : 0.826   Mean   : 0.194   Mean   : 1.094   Mean   : 0.387  
##  3rd Qu.: 1.000   3rd Qu.: 0.000   3rd Qu.: 1.000   3rd Qu.: 1.000  
##  Max.   :64.000   Max.   :11.000   Max.   :67.000   Max.   :23.000  
##  NA's   :4844     NA's   :4844     NA's   :4666     NA's   :4666    
##    drugs_inv      dr_alcohol_drug_med dr_other_impair total_moving_violations
##  Min.   :0.00     Min.   :0.00        Min.   :0.00    Min.   :0.0000         
##  1st Qu.:0.00     1st Qu.:0.00        1st Qu.:0.00    1st Qu.:0.0000         
##  Median :0.00     Median :0.00        Median :0.00    Median :0.0000         
##  Mean   :0.25     Mean   :0.27        Mean   :0.07    Mean   :0.1512         
##  3rd Qu.:1.00     3rd Qu.:1.00        3rd Qu.:0.00    3rd Qu.:0.0000         
##  Max.   :1.00     Max.   :1.00        Max.   :1.00    Max.   :1.0000         
##  NA's   :101756   NA's   :40450       NA's   :40450   NA's   :3152           
##  nm_alcohol_drug_med nm_other_impair  nm_involved       pernotmvit   
##  Min.   : 0.0        Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:99.0        1st Qu.:99.00   1st Qu.:0.0000   1st Qu.: 0.00  
##  Median :99.0        Median :99.00   Median :0.0000   Median : 0.00  
##  Mean   :87.3        Mean   :87.28   Mean   :0.1793   Mean   : 0.21  
##  3rd Qu.:99.0        3rd Qu.:99.00   3rd Qu.:0.0000   3rd Qu.: 0.00  
##  Max.   :99.0        Max.   :99.00   Max.   :1.0000   Max.   :21.00  
##  NA's   :10572       NA's   :10572                    NA's   :32129  
##     permvit      
##  Min.   :  0.00  
##  1st Qu.:  1.00  
##  Median :  2.00  
##  Mean   :  2.25  
##  3rd Qu.:  3.00  
##  Max.   :127.00  
##  NA's   :32129

str(data_input)

## 'data.frame':    153260 obs. of  58 variables:
##  $ state                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ st_case                : num  10001 10002 10003 10004 10005 ...
##  $ ve_total               : num  1 1 3 2 2 1 1 1 1 2 ...
##  $ ve_forms               : num  1 1 3 2 2 1 1 1 1 2 ...
##  $ county                 : num  81 35 97 115 117 21 97 9 51 113 ...
##  $ city                   : num  2340 1080 2100 160 0 0 0 0 0 0 ...
##  $ day                    : num  15 11 14 21 4 1 1 2 2 3 ...
##  $ month                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ year                   : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ date_SIF               : num  18277 18273 18276 18283 18266 ...
##  $ day_week               : num  6 2 5 5 2 6 6 7 7 1 ...
##  $ hour                   : num  4 6 15 1 6 5 2 15 9 13 ...
##  $ minute                 : num  10 0 10 16 38 34 40 37 55 50 ...
##  $ nhs                    : num  1 0 0 0 1 1 0 0 0 0 ...
##  $ latitude               : num  32.6 31.4 30.7 33.9 33.3 ...
##  $ longitud               : num  -85.4 -87 -88.1 -86.3 -86.8 ...
##  $ lgt_cond               : num  3 3 1 3 4 4 3 3 1 1 ...
##  $ weather                : num  1 1 1 2 1 2 2 1 1 1 ...
##  $ fatals                 : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ drunk_dr               : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ total_numoccs          : num  2 1 3 2 2 1 2 4 2 4 ...
##  $ total_hit_run          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_registered_owner : num  0 1 3 0 2 1 0 0 1 1 ...
##  $ total_not_registered   : num  1 0 0 0 0 0 1 1 0 0 ...
##  $ total_other_owner      : num  0 0 0 2 0 0 0 0 0 1 ...
##  $ total_fire_exp         : num  1 0 0 0 1 0 0 0 0 0 ...
##  $ total_valid_license    : num  1 1 3 2 2 1 1 1 1 1 ...
##  $ total_invalid_license  : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ no_prev_acc            : num  NA 1 1 1 2 1 0 1 1 2 ...
##  $ one_prev_acc           : num  NA 0 2 1 0 0 0 0 0 0 ...
##  $ two_prev_acc           : num  NA 0 0 0 0 0 1 0 0 0 ...
##  $ no_prev_sus            : num  0 1 3 2 2 1 1 1 1 1 ...
##  $ one_prev_sus           : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ two_prev_sus           : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ no_prev_dwi            : num  1 1 3 2 2 1 1 1 1 1 ...
##  $ one_prev_dwi           : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ no_prev_spd            : num  0 1 3 2 1 1 0 1 1 1 ...
##  $ one_prev_spd           : num  0 0 0 0 0 0 1 0 0 1 ...
##  $ no_prev_oth            : num  1 1 2 2 2 1 1 1 1 2 ...
##  $ one_prev_oth           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ speed_related          : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ dr_age_lower16         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower18         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower21         : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ dr_age_lower30         : num  0 0 2 1 0 0 0 0 0 1 ...
##  $ dr_age_lower65         : num  1 1 1 1 2 1 0 0 0 1 ...
##  $ dr_age_65over          : num  0 0 0 0 0 0 0 1 1 0 ...
##  $ dr_male                : num  0 0 1 2 2 1 0 1 0 2 ...
##  $ dr_female              : num  1 1 2 0 0 0 1 0 1 0 ...
##  $ drugs_inv              : num  NA 0 NA NA NA 0 NA NA NA NA ...
##  $ dr_alcohol_drug_med    : num  0 0 0 0 NA 0 0 0 0 1 ...
##  $ dr_other_impair        : num  1 0 0 0 NA 1 1 0 1 0 ...
##  $ total_moving_violations: num  0 0 0 0 0 0 0 0 0 1 ...
##  $ nm_alcohol_drug_med    : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_other_impair        : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_involved            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pernotmvit             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ permvit                : num  NA NA NA NA NA NA NA NA NA NA ...

#Find the proportion of missing data that occurs in each category of data
miss<-function(x){sum(is.na(x))/length(x)*100}                 #Create a function of calculate proportion
apply(data_input,2,miss)                                       #Run the function with our data

##                   state                 st_case                ve_total 
##               0.0000000               0.0000000               1.1960068 
##                ve_forms                  county                    city 
##               0.0000000               1.1992692               1.2338510 
##                     day                   month                    year 
##               1.1960068               1.1960068               1.1960068 
##                date_SIF                day_week                    hour 
##               1.1960068               1.1960068               1.9183088 
##                  minute                     nhs                latitude 
##               1.9320110               1.5144199               2.2713037 
##                longitud                lgt_cond                 weather 
##               2.2713037               1.6214276               1.9835574 
##                  fatals                drunk_dr           total_numoccs 
##               1.1960068               1.1960068               0.6009396 
##           total_hit_run  total_registered_owner    total_not_registered 
##               0.5259037               2.5035887               2.5035887 
##       total_other_owner          total_fire_exp     total_valid_license 
##               2.5035887               0.4528253               2.7162991 
##   total_invalid_license             no_prev_acc            one_prev_acc 
##               0.4528253               9.8544956               9.8544956 
##            two_prev_acc             no_prev_sus            one_prev_sus 
##               9.8544956               3.2604724               3.2604724 
##            two_prev_sus             no_prev_dwi            one_prev_dwi 
##               3.2604724               3.2604724               3.2604724 
##             no_prev_spd            one_prev_spd             no_prev_oth 
##               3.2604724               3.2604724               3.2604724 
##            one_prev_oth           speed_related          dr_age_lower16 
##               3.2604724               4.2431163               3.1606420 
##          dr_age_lower18          dr_age_lower21          dr_age_lower30 
##               3.1606420               3.1606420               3.1606420 
##          dr_age_lower65           dr_age_65over                 dr_male 
##               3.1606420               3.1606420               3.0444995 
##               dr_female               drugs_inv     dr_alcohol_drug_med 
##               3.0444995              66.3943625              26.3930575 
##         dr_other_impair total_moving_violations     nm_alcohol_drug_med 
##              26.3930575               2.0566358               6.8980817 
##         nm_other_impair             nm_involved              pernotmvit 
##               6.8980817               0.0000000              20.9637218 
##                 permvit 
##              20.9637218

#Remove the data with a missing ratio greater than 15%.
data <- dplyr::select(data_input,-permvit,-dr_other_impair,-drugs_inv,-pernotmvit )

#Clean all NA value
data <- data[complete.cases(data),]


#################################################################
# Create subset for regration analysis ####
#################################################################

#Create subset for weather and light
data_r<- dummy_cols(data,select_columns = "weather")        # 1 = clean, 5 = Snow
data_r <- dummy_cols(data_r,select_columns = "lgt_cond")    # 1 = Daylight; 2 = Dark - Not lighted; 3 = Dark - Lighted;
data_r$Valid_license_ratio  <- data_r$total_valid_license/data_r$ve_total*100
data_r$no_prev_spd_ratio  <- data_r$no_prev_spd/data_r$ve_total*100
str(data_r)

## 'data.frame':    92034 obs. of  74 variables:
##  $ state                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ st_case                : num  10002 10003 10004 10006 10007 ...
##  $ ve_total               : num  1 3 2 1 1 1 1 2 1 1 ...
##  $ ve_forms               : num  1 3 2 1 1 1 1 2 1 1 ...
##  $ county                 : num  35 97 115 21 97 9 51 113 95 91 ...
##  $ city                   : num  1080 2100 160 0 0 0 0 0 0 0 ...
##  $ day                    : num  11 14 21 1 1 2 2 3 4 4 ...
##  $ month                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ year                   : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ date_SIF               : num  18273 18276 18283 18263 18263 ...
##  $ day_week               : num  2 5 5 6 6 7 7 1 2 2 ...
##  $ hour                   : num  6 15 1 5 2 15 9 13 5 16 ...
##  $ minute                 : num  0 10 16 34 40 37 55 50 50 20 ...
##  $ nhs                    : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ latitude               : num  31.4 30.7 33.9 32.8 30.5 ...
##  $ longitud               : num  -87 -88.1 -86.3 -86.6 -88.3 ...
##  $ lgt_cond               : num  3 1 3 4 3 3 1 1 4 1 ...
##  $ weather                : num  1 1 2 2 2 1 1 1 1 1 ...
##  $ fatals                 : num  1 1 1 1 2 1 1 1 1 1 ...
##  $ drunk_dr               : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ total_numoccs          : num  1 3 2 1 2 4 2 4 1 3 ...
##  $ total_hit_run          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_registered_owner : num  1 3 0 1 0 0 1 1 0 1 ...
##  $ total_not_registered   : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ total_other_owner      : num  0 0 2 0 0 0 0 1 1 0 ...
##  $ total_fire_exp         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_valid_license    : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ total_invalid_license  : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ no_prev_acc            : num  1 1 1 1 0 1 1 2 1 1 ...
##  $ one_prev_acc           : num  0 2 1 0 0 0 0 0 0 0 ...
##  $ two_prev_acc           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ no_prev_sus            : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ one_prev_sus           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ two_prev_sus           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_prev_dwi            : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ one_prev_dwi           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ no_prev_spd            : num  1 3 2 1 0 1 1 1 0 0 ...
##  $ one_prev_spd           : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ no_prev_oth            : num  1 2 2 1 1 1 1 2 1 1 ...
##  $ one_prev_oth           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ speed_related          : num  0 0 0 0 0 0 1 0 1 1 ...
##  $ dr_age_lower16         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower18         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower21         : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ dr_age_lower30         : num  0 2 1 0 0 0 0 1 1 1 ...
##  $ dr_age_lower65         : num  1 1 1 1 0 0 0 1 0 0 ...
##  $ dr_age_65over          : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ dr_male                : num  0 1 2 1 0 1 0 2 1 1 ...
##  $ dr_female              : num  1 2 0 0 1 0 1 0 0 0 ...
##  $ dr_alcohol_drug_med    : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ total_moving_violations: num  0 0 0 0 0 0 0 1 0 1 ...
##  $ nm_alcohol_drug_med    : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_other_impair        : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_involved            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_1              : int  1 1 0 0 0 1 1 1 1 1 ...
##  $ weather_2              : int  0 0 1 1 1 0 0 0 0 0 ...
##  $ weather_3              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_4              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_5              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_6              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_7              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_8              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_10             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_11             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_12             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_1             : int  0 1 0 0 0 0 1 1 0 1 ...
##  $ lgt_cond_2             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_3             : int  1 0 1 0 1 1 0 0 0 0 ...
##  $ lgt_cond_4             : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ lgt_cond_5             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_6             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_7             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Valid_license_ratio    : num  100 100 100 100 100 100 100 50 100 100 ...
##  $ no_prev_spd_ratio      : num  100 100 100 100 0 100 100 50 0 0 ...

#################################################################
# correlation coefficient matrix ####
#################################################################
str(data_r)

## 'data.frame':    92034 obs. of  74 variables:
##  $ state                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ st_case                : num  10002 10003 10004 10006 10007 ...
##  $ ve_total               : num  1 3 2 1 1 1 1 2 1 1 ...
##  $ ve_forms               : num  1 3 2 1 1 1 1 2 1 1 ...
##  $ county                 : num  35 97 115 21 97 9 51 113 95 91 ...
##  $ city                   : num  1080 2100 160 0 0 0 0 0 0 0 ...
##  $ day                    : num  11 14 21 1 1 2 2 3 4 4 ...
##  $ month                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ year                   : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ date_SIF               : num  18273 18276 18283 18263 18263 ...
##  $ day_week               : num  2 5 5 6 6 7 7 1 2 2 ...
##  $ hour                   : num  6 15 1 5 2 15 9 13 5 16 ...
##  $ minute                 : num  0 10 16 34 40 37 55 50 50 20 ...
##  $ nhs                    : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ latitude               : num  31.4 30.7 33.9 32.8 30.5 ...
##  $ longitud               : num  -87 -88.1 -86.3 -86.6 -88.3 ...
##  $ lgt_cond               : num  3 1 3 4 3 3 1 1 4 1 ...
##  $ weather                : num  1 1 2 2 2 1 1 1 1 1 ...
##  $ fatals                 : num  1 1 1 1 2 1 1 1 1 1 ...
##  $ drunk_dr               : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ total_numoccs          : num  1 3 2 1 2 4 2 4 1 3 ...
##  $ total_hit_run          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_registered_owner : num  1 3 0 1 0 0 1 1 0 1 ...
##  $ total_not_registered   : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ total_other_owner      : num  0 0 2 0 0 0 0 1 1 0 ...
##  $ total_fire_exp         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_valid_license    : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ total_invalid_license  : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ no_prev_acc            : num  1 1 1 1 0 1 1 2 1 1 ...
##  $ one_prev_acc           : num  0 2 1 0 0 0 0 0 0 0 ...
##  $ two_prev_acc           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ no_prev_sus            : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ one_prev_sus           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ two_prev_sus           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_prev_dwi            : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ one_prev_dwi           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ no_prev_spd            : num  1 3 2 1 0 1 1 1 0 0 ...
##  $ one_prev_spd           : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ no_prev_oth            : num  1 2 2 1 1 1 1 2 1 1 ...
##  $ one_prev_oth           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ speed_related          : num  0 0 0 0 0 0 1 0 1 1 ...
##  $ dr_age_lower16         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower18         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dr_age_lower21         : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ dr_age_lower30         : num  0 2 1 0 0 0 0 1 1 1 ...
##  $ dr_age_lower65         : num  1 1 1 1 0 0 0 1 0 0 ...
##  $ dr_age_65over          : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ dr_male                : num  0 1 2 1 0 1 0 2 1 1 ...
##  $ dr_female              : num  1 2 0 0 1 0 1 0 0 0 ...
##  $ dr_alcohol_drug_med    : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ total_moving_violations: num  0 0 0 0 0 0 0 1 0 1 ...
##  $ nm_alcohol_drug_med    : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_other_impair        : num  99 99 99 99 99 99 99 99 99 99 ...
##  $ nm_involved            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_1              : int  1 1 0 0 0 1 1 1 1 1 ...
##  $ weather_2              : int  0 0 1 1 1 0 0 0 0 0 ...
##  $ weather_3              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_4              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_5              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_6              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_7              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_8              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_10             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_11             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather_12             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_1             : int  0 1 0 0 0 0 1 1 0 1 ...
##  $ lgt_cond_2             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_3             : int  1 0 1 0 1 1 0 0 0 0 ...
##  $ lgt_cond_4             : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ lgt_cond_5             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_6             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lgt_cond_7             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Valid_license_ratio    : num  100 100 100 100 100 100 100 50 100 100 ...
##  $ no_prev_spd_ratio      : num  100 100 100 100 0 100 100 50 0 0 ...

data_c <- dplyr::select(data_r,speed_related,drunk_dr,Valid_license_ratio,no_prev_sus,no_prev_dwi,no_prev_spd,dr_age_65over,lgt_cond_1,lgt_cond_2)
str(data_c)

## 'data.frame':    92034 obs. of  9 variables:
##  $ speed_related      : num  0 0 0 0 0 0 1 0 1 1 ...
##  $ drunk_dr           : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ Valid_license_ratio: num  100 100 100 100 100 100 100 50 100 100 ...
##  $ no_prev_sus        : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ no_prev_dwi        : num  1 3 2 1 1 1 1 1 1 1 ...
##  $ no_prev_spd        : num  1 3 2 1 0 1 1 1 0 0 ...
##  $ dr_age_65over      : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ lgt_cond_1         : int  0 1 0 0 0 0 1 1 0 1 ...
##  $ lgt_cond_2         : int  0 0 0 0 0 0 0 0 0 0 ...

cor_data<-cor(data_c)
cor(cor_data)

##                     speed_related   drunk_dr Valid_license_ratio no_prev_sus
## speed_related           1.0000000  0.4228224         -0.50368378  -0.7560828
## drunk_dr                0.4228224  1.0000000         -0.37224131  -0.5203786
## Valid_license_ratio    -0.5036838 -0.3722413          1.00000000   0.5206754
## no_prev_sus            -0.7560828 -0.5203786          0.52067538   1.0000000
## no_prev_dwi            -0.7451149 -0.4795151          0.47211829   0.9900590
## no_prev_spd            -0.7296817 -0.4670526          0.40812329   0.9614250
## dr_age_65over          -0.4763418 -0.6207393          0.04821349   0.3905728
## lgt_cond_1             -0.3441922 -0.6906966          0.09495841   0.3403067
## lgt_cond_2              0.2867490  0.5544491         -0.15259737  -0.4207686
##                     no_prev_dwi no_prev_spd dr_age_65over  lgt_cond_1
## speed_related        -0.7451149  -0.7296817   -0.47634178 -0.34419218
## drunk_dr             -0.4795151  -0.4670526   -0.62073927 -0.69069663
## Valid_license_ratio   0.4721183   0.4081233    0.04821349  0.09495841
## no_prev_sus           0.9900590   0.9614250    0.39057278  0.34030673
## no_prev_dwi           1.0000000   0.9750606    0.36647030  0.31510617
## no_prev_spd           0.9750606   1.0000000    0.37941447  0.31788429
## dr_age_65over         0.3664703   0.3794145    1.00000000  0.52762396
## lgt_cond_1            0.3151062   0.3178843    0.52762396  1.00000000
## lgt_cond_2           -0.4025139  -0.4057150   -0.52337502 -0.93105054
##                     lgt_cond_2
## speed_related        0.2867490
## drunk_dr             0.5544491
## Valid_license_ratio -0.1525974
## no_prev_sus         -0.4207686
## no_prev_dwi         -0.4025139
## no_prev_spd         -0.4057150
## dr_age_65over       -0.5233750
## lgt_cond_1          -0.9310505
## lgt_cond_2           1.0000000

corrplot(cor_data,)                                            #Plot correlation coefficient matrix

#################################################################
# Split the data into a train and test set ####
#################################################################
set.seed(123) 
trainIndex <- createDataPartition(data_r$speed_related,p = 0.70, list = FALSE)
train <- data_c[trainIndex,]                                  #Split for train set
test <- data_c[-trainIndex,]                                  #Split for test set

#################################################################
# Fit a logistic regression model ####
#################################################################
Fit1 <- glm(speed_related ~ drunk_dr+Valid_license_ratio+no_prev_dwi+no_prev_spd+no_prev_sus+
              dr_age_65over,data= train, family = binomial(link = "logit"))

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(Fit1)

## 
## Call:
## glm(formula = speed_related ~ drunk_dr + Valid_license_ratio + 
##     no_prev_dwi + no_prev_spd + no_prev_sus + dr_age_65over, 
##     family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1061  -0.7640  -0.4374  -0.1261   6.1910  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.7922831  0.0341623  23.192  < 2e-16 ***
## drunk_dr             0.6551503  0.0197466  33.178  < 2e-16 ***
## Valid_license_ratio -0.0022767  0.0002785  -8.175 2.96e-16 ***
## no_prev_dwi         -0.9971590  0.0322813 -30.890  < 2e-16 ***
## no_prev_spd         -0.4345266  0.0220119 -19.741  < 2e-16 ***
## no_prev_sus         -0.2149254  0.0270941  -7.933 2.15e-15 ***
## dr_age_65over       -0.3858241  0.0309390 -12.470  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 69975  on 64423  degrees of freedom
## Residual deviance: 59660  on 64417  degrees of freedom
## AIC: 59674
## 
## Number of Fisher Scoring iterations: 6

#################################################################
# Train set predictions ####
#################################################################
probabilities.train <- predict(Fit1,newdata = train, type="response")
predicted.classes.min <- as.factor(ifelse(probabilities.train >= 0.5, "1","0"))
train$speed_related= as.factor(train$speed_related)

confusionMatrix(predicted.classes.min,train$speed_related, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 47082 12370
##          1  2320  2652
##                                           
##                Accuracy : 0.772           
##                  95% CI : (0.7687, 0.7752)
##     No Information Rate : 0.7668          
##     P-Value [Acc > NIR] : 0.0009813       
##                                           
##                   Kappa : 0.1689          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.17654         
##             Specificity : 0.95304         
##          Pos Pred Value : 0.53339         
##          Neg Pred Value : 0.79193         
##              Prevalence : 0.23317         
##          Detection Rate : 0.04116         
##    Detection Prevalence : 0.07718         
##       Balanced Accuracy : 0.56479         
##                                           
##        'Positive' Class : 1               
##

#################################################################
# test set predictions ####
#################################################################
probabilities.test <- predict(Fit1,newdata = test, type="response")
predicted.classes.min <- as.factor(ifelse(probabilities.test >= 0.5, "1","0"))
test$speed_related= as.factor(test$speed_related)

confusionMatrix(predicted.classes.min,test$speed_related, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 20113  5384
##          1   965  1148
##                                         
##                Accuracy : 0.77          
##                  95% CI : (0.765, 0.775)
##     No Information Rate : 0.7634        
##     P-Value [Acc > NIR] : 0.004777      
##                                         
##                   Kappa : 0.1695        
##                                         
##  Mcnemar's Test P-Value : < 2.2e-16     
##                                         
##             Sensitivity : 0.17575       
##             Specificity : 0.95422       
##          Pos Pred Value : 0.54330       
##          Neg Pred Value : 0.78884       
##              Prevalence : 0.23658       
##          Detection Rate : 0.04158       
##    Detection Prevalence : 0.07653       
##       Balanced Accuracy : 0.56498       
##                                         
##        'Positive' Class : 1             
##

#view(probabilities.test)

#################################################################
# Creat ROC ####
#################################################################
ROC1 <- roc(test$speed_related,probabilities.test)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(ROC1, col= "Blue", ylab = "sensitivity ~ TP Rate", xlab = "Specificity ~ FP Rate")

auc1 = auc(test$speed_related,probabilities.test)
auc1

## [1] 0.7704526

#################################################################
# LASSO Analysis ####
#################################################################




#Spliting the data into 2. 80/20
set.seed(1234)
trainIndex <- sample(x=nrow(data), size=nrow(data)*0.8)
train <- data[trainIndex,]
test <- data[-trainIndex,]

x <- model.matrix(total_valid_license~.,train)[,-1]
y <- train$total_valid_license


lasso.mod <- glmnet(x, y, alpha=1)
l_cv.out <- cv.glmnet(x, y, alpha=1)
plot(l_cv.out)

best.lambda <- l_cv.out$lambda.min
best.lambda

## [1] 0.0006016217

max.lambda <-l_cv.out$lambda.1se
max.lambda

## [1] 0.001389824

group1 = as.data.frame(as.matrix(coef(l_cv.out)))

kable(group1,
      caption = "<center>Min Model Coefficient</center>",
      align = "c",
      col.names = "Coefficient") %>%
  kable_styling(bootstrap_options = c("bordered",
                                      "responsive",
                                      "hover"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
   footnote(general = "Total valid License Model",
           general_title = "LASSO: ")

Min Model Coefficient
	Coefficient
(Intercept)	0.0028736
state	0.0000000
st_case	0.0000000
ve_total	0.0000000
ve_forms	0.0000000
county	-0.0000041
city	0.0000000
day	0.0000000
month	0.0000000
year	0.0000000
date_SIF	0.0000000
day_week	0.0000000
hour	0.0000000
minute	0.0000000
nhs	0.0000000
latitude	0.0003538
longitud	0.0000511
lgt_cond	0.0000000
weather	0.0000000
fatals	0.0000000
drunk_dr	0.0000000
total_numoccs	0.0005239
total_hit_run	-0.0309499
total_registered_owner	0.1399413
total_not_registered	0.1382886
total_other_owner	0.1316912
total_fire_exp	0.0000000
total_invalid_license	-0.9710006
no_prev_acc	0.3102732
one_prev_acc	0.3124176
two_prev_acc	0.2700281
no_prev_sus	0.0087693
one_prev_sus	0.0088239
two_prev_sus	0.0000000
no_prev_dwi	0.4783067
one_prev_dwi	0.4640121
no_prev_spd	0.0293923
one_prev_spd	0.0309741
no_prev_oth	0.0211508
one_prev_oth	0.0216682
speed_related	-0.0027545
dr_age_lower16	0.0000000
dr_age_lower18	0.0000000
dr_age_lower21	0.0000000
dr_age_lower30	0.0017386
dr_age_lower65	0.0009423
dr_age_65over	0.0000000
dr_male	0.0073201
dr_female	0.0051949
dr_alcohol_drug_med	0.0000000
total_moving_violations	0.0000000
nm_alcohol_drug_med	0.0000097
nm_other_impair	0.0000026
nm_involved	0.0000000

LASSO: Total valid License Model

model=lm(speed_related ~ ., data = data)
summary(model)

## 
## Call:
## lm(formula = speed_related ~ ., data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7375 -0.2703 -0.1404  0.1145 12.2685 
## 
## Coefficients: (1 not defined because of singularities)
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.746e+03  1.414e+03   1.942 0.052117 .  
## state                    7.630e-02  2.776e-02   2.749 0.005978 ** 
## st_case                 -7.479e-06  2.775e-06  -2.695 0.007030 ** 
## ve_total                 3.764e-02  5.351e-03   7.034 2.02e-12 ***
## ve_forms                 3.409e-02  2.372e-02   1.437 0.150748    
## county                  -5.110e-05  1.394e-05  -3.667 0.000246 ***
## city                     3.029e-06  6.828e-07   4.437 9.15e-06 ***
## day                     -3.697e-03  1.976e-03  -1.871 0.061311 .  
## month                   -1.169e-01  6.012e-02  -1.944 0.051935 .  
## year                    -1.399e+00  7.214e-01  -1.939 0.052512 .  
## date_SIF                 3.822e-03  1.975e-03   1.935 0.052954 .  
## day_week                -2.938e-05  6.136e-04  -0.048 0.961808    
## hour                    -1.197e-03  1.928e-04  -6.207 5.41e-10 ***
## minute                   1.338e-05  7.395e-05   0.181 0.856380    
## nhs                     -3.844e-02  2.836e-03 -13.555  < 2e-16 ***
## latitude                 7.849e-04  3.111e-04   2.523 0.011638 *  
## longitud                -5.891e-04  9.925e-05  -5.936 2.93e-09 ***
## lgt_cond                 1.056e-02  1.399e-03   7.547 4.50e-14 ***
## weather                 -9.013e-04  4.012e-04  -2.246 0.024683 *  
## fatals                   3.495e-04  3.692e-03   0.095 0.924565    
## drunk_dr                 5.855e-02  3.831e-03  15.283  < 2e-16 ***
## total_numoccs            2.016e-03  8.631e-04   2.336 0.019501 *  
## total_hit_run           -2.242e-02  9.103e-03  -2.463 0.013779 *  
## total_registered_owner   7.757e-02  1.846e-02   4.201 2.66e-05 ***
## total_not_registered     8.992e-02  1.852e-02   4.856 1.20e-06 ***
## total_other_owner        8.297e-02  1.845e-02   4.497 6.90e-06 ***
## total_fire_exp           4.354e-02  6.813e-03   6.391 1.66e-10 ***
## total_valid_license     -2.160e-01  1.892e-02 -11.414  < 2e-16 ***
## total_invalid_license   -1.956e-01  1.888e-02 -10.360  < 2e-16 ***
## no_prev_acc             -1.509e-02  1.993e-02  -0.757 0.448909    
## one_prev_acc            -1.856e-02  2.014e-02  -0.921 0.356837    
## two_prev_acc            -3.269e-02  1.905e-02  -1.716 0.086169 .  
## no_prev_sus             -2.387e-02  4.761e-03  -5.015 5.32e-07 ***
## one_prev_sus             2.239e-03  5.731e-03   0.391 0.696034    
## two_prev_sus                    NA         NA      NA       NA    
## no_prev_dwi              5.250e-02  1.348e-02   3.896 9.80e-05 ***
## one_prev_dwi             5.214e-02  1.499e-02   3.479 0.000503 ***
## no_prev_spd             -1.289e-02  5.166e-03  -2.495 0.012604 *  
## one_prev_spd             9.250e-03  5.753e-03   1.608 0.107895    
## no_prev_oth             -2.064e-02  5.038e-03  -4.097 4.19e-05 ***
## one_prev_oth            -6.935e-04  5.669e-03  -0.122 0.902636    
## dr_age_lower16           1.678e-01  5.122e-02   3.276 0.001054 ** 
## dr_age_lower18           1.410e-01  4.798e-02   2.939 0.003289 ** 
## dr_age_lower21           1.142e-01  4.765e-02   2.397 0.016510 *  
## dr_age_lower30           8.688e-02  4.756e-02   1.827 0.067711 .  
## dr_age_lower65           6.331e-02  4.752e-02   1.332 0.182766    
## dr_age_65over            2.930e-02  4.759e-02   0.616 0.538091    
## dr_male                 -1.335e-01  5.044e-02  -2.646 0.008142 ** 
## dr_female               -1.499e-01  5.049e-02  -2.968 0.002995 ** 
## dr_alcohol_drug_med      4.957e-02  4.226e-03  11.728  < 2e-16 ***
## total_moving_violations -3.120e-02  3.643e-03  -8.564  < 2e-16 ***
## nm_alcohol_drug_med     -2.337e-02  8.423e-03  -2.775 0.005524 ** 
## nm_other_impair         -1.837e-02  1.572e-02  -1.168 0.242615    
## nm_involved             -4.307e+00  1.802e+00  -2.390 0.016838 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3921 on 91981 degrees of freedom
## Multiple R-squared:  0.1433, Adjusted R-squared:  0.1428 
## F-statistic:   296 on 52 and 91981 DF,  p-value: < 2.2e-16

#bothmodel = ols_step_both_aic(model)
#ols_step_forward_p(model, penter = 0.3)
#ols_step_backward_p(model)

#Hypothesis

#H0: Fatalities by year were independent of drunk driver and speeding. 
#H1: Fatalities by year were dependent of drunk driver and speeding (claim).

CLchi = 0.98
alphch = 1 - CLchi

drunkdriver = Groupproject %>%
  group_by(year) %>%
  dplyr::select(drunk_dr, speed_related, fatals)

drunkdriver = na.omit(drunkdriver)

chisq = chisq.test(drunkdriver)

hypochires = ifelse(chisq$p.value > alphch, "Do not reject the null hypothesis","Reject the null hypothesis")


Tskhypores_1 = rbind(CLchi*100, alphch, round(chisq$statistic,3), round(chisq$p.value,0), hypochires)
col_nhypochi = c("Hypothesis Result")
row_nhypochi = c("Confidence Level(%):", "Alpha:", "X^2 Value:","p value:","Hypothesis Result:")
dimnames(Tskhypores_1) = list(row_nhypochi,col_nhypochi)

kable(Tskhypores_1,
        caption = "<center>Chi-Square Independence Test</center>",
        align = "c",
       booktabs = TRUE)%>%
    kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "H0: Fatalities by year were independent of drunk driver and speeding.\n 
H1: Fatalities by year were dependent of drunk driver and speeding (claim).")

Chi-Square Independence Test
	Hypothesis Result
Confidence Level(%):	98
Alpha:	0.02
X^2 Value:	236704.315
p value:	1
Hypothesis Result:	Do not reject the null hypothesis

Note: H0: Fatalities by year were independent of drunk driver and speeding.

H1: Fatalities by year were dependent of drunk driver and speeding (claim).

#Hypothesis 

#H0: There is no interaction effect between year and type of
#Nation Highway System on Fatalities.
#H1: There is an interaction effect between year and type of
#Nation Highway System on Fatalities.

#The hypotheses for the National Highway System types are
#H0: There is no difference between the means of Fatalities for
#two types of National Highway System.
#H1: There is a difference between the means of Fatalities for
#two types of National Highway System.

MAfatal = MH_acc %>%
  group_by(year, nhs) %>%
  dplyr::select(st_case, fatals) %>%
  mutate(Total_cases = length(st_case),
         Total_fatal = sum(fatals))
MAfatal = na.omit(MAfatal)

#MAfatal$year = as.factor(MAfatal$year)
#MAfatal$nhs = as.factor(MAfatal$nhs)

NHfatal = NH_acc %>%
  group_by(year, nhs) %>%
  dplyr::select(st_case, fatals) %>%
  mutate(Total_cases = length(st_case),
         Total_fatal = sum(fatals))
NHfatal = na.omit(NHfatal)

#NHfatal$year = as.factor(NHfatal$year)
#NHfatal$nhs = as.factor(NHfatal$nhs)

NHfatal = NHfatal %>% group_by(year) %>% 
  distinct(year, nhs, Total_cases, Total_fatal) %>%
  mutate(State  = rep(c("NH")))

MAfatal = MAfatal %>% group_by(year) %>% 
  distinct(year, nhs, Total_cases, Total_fatal) %>%
  mutate(State  = rep(c("MA")))


MAandNH = rbind(NHfatal, MAfatal)
MAandNH$year = as.factor(MAandNH$year)
MAandNH$nhs = as.factor(MAandNH$nhs)

MAandNH %>%
  group_by(State, nhs) %>%
  summarise(mean = mean(Total_fatal),
            sd = sd(Total_fatal))

MANH = qplot(interaction(State,nhs),Total_fatal , data=MAandNH, geom="boxplot")
MANH + aes(fill = nhs) + 
  scale_fill_manual(values = c("turquoise", "salmon"),
    labels = c('Non_NHS', 'NHS'),
    name = "Accident type")+
  labs(title = "Fatality in MA and NH",
       x = "Group by State and NHS",
       y = "Total Fatalities")+
  theme(plot.title = element_text(hjust = 0.5))+
  scale_x_discrete(labels=c("MA.Non_NHS", "NH.Non_NHS", "MA.NHS", "NH.NHS"))

MAandNHANOVA = aov(Total_fatal ~ year * nhs, data = MAandNH)
MAandNHANOVA_summary = summary(MAandNHANOVA)

#plot(TukeyHSD(MAandNHANOVA), las = 2)

##Hypothesis Testing#####

CL = 0.95
alph = 1 - CL

Fvalinteraction = MAandNHANOVA_summary[[1]][3, "F value"] #Fvalue of interaction
Fvalnhs = MAandNHANOVA_summary[[1]][2, "F value"] #Fvalue of interaction

DofNnhs = MAandNHANOVA_summary[[1]][2, "Df"] #(a-1)(b-1) a is levels in light, b is levels in food
DofDnhs = MAandNHANOVA_summary[[1]][4, "Df"] # (ab)(n-1) n is the number of data values in each group
ANDofNinteraction = MAandNHANOVA_summary[[1]][3, "Df"] #(a-1)(b-1) 
ANDofDinteraction = MAandNHANOVA_summary[[1]][4, "Df"] # (ab)(n-1) n is the number of data values in each group

ANCVinteraction = qf(alph, ANDofNinteraction, ANDofDinteraction, lower.tail = FALSE)
ANCVnhs = qf(alph, DofNnhs, DofDnhs, lower.tail = FALSE)

hypoANOVAinteraction = ifelse(Fvalinteraction < ANCVinteraction, "Do not reject the null hypothesis (Fvalue < CV)", "Reject the null hypothesis (Fvalue > CV)")

hypoANOVAnhs = ifelse(Fvalnhs < ANCVnhs, "Do not reject the null hypothesis (Fvalue < CV)", "Reject the null hypothesis (Fvalue > CV)")


##Information Representation#####
nhsfatal <- 
  MAandNH %>% 
  group_by(nhs, year) %>% # group by the two factors
  summarise(Means = mean(Total_fatal), SEs = sd(Total_fatal)/sqrt(n())) # Mean and Std Er
ggplot(nhsfatal, 
       aes(x = nhs, y = Means, fill = year,
           ymin = Means - SEs, ymax = Means + SEs)) +
  # this adds the mean
  geom_col(position = position_dodge()) +
  # this adds the error bars
  geom_errorbar(position = position_dodge(0.9), width=.2) +
  # controlling the appearance
  xlab("NHS") + ylab("Fatalities") +
  scale_x_discrete(labels=c("Non-NHS", "NHS"))

hyporesANOVA = rbind(CL, alph, round(ANCVinteraction,3), round(Fvalinteraction,3), hypoANOVAinteraction)
col_twoAN = c("Hypothesis Result")
row_twoAN = c("Confidence Level(%):", "Alpha:", "CV:","F value:","Hypothesis Result:")
dimnames(hyporesANOVA) = list(row_twoAN, col_twoAN)

kable(hyporesANOVA,
        caption = "<center>Two way ANOVA interaction Test</center>",
        align = "c",
       booktabs = TRUE) %>%
    kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "𝐻0: There is no interaction effect between year and type of Nation Highway System on Fatalities and \n 𝐻1: There is an interaction effect between year and type of Nation Highway System on Fatalities.")

Two way ANOVA interaction Test
	Hypothesis Result
Confidence Level(%):	0.95
Alpha:	0.05
CV:	3.478
F value:	0.014
Hypothesis Result:	Do not reject the null hypothesis (Fvalue < CV)

Note: 𝐻0: There is no interaction effect between year and type of Nation Highway System on Fatalities and
𝐻1: There is an interaction effect between year and type of Nation Highway System on Fatalities.

hyporestwoANOVAnhs = rbind(CL, alph, round(ANCVnhs,3), round(Fvalnhs,3), hypoANOVAnhs)
col_twoANOVAnhs = c("Hypothesis Result")
row_twoANOVAnhs = c("Confidence Level(%):", "Alpha:", "CV:","F value:","Hypothesis Result:")
dimnames(hyporestwoANOVAnhs) = list(row_twoANOVAnhs, col_twoANOVAnhs)

kable(hyporestwoANOVAnhs,
        caption = "<center>Two way ANOVA mean of Fatalities to NHS test</center>",
        align = "c",
       booktabs = TRUE) %>%
    kable_styling(bootstrap_options = c("hover",
                                        "bordered"),
                font_size = 11) %>%
  scroll_box(width = "100%", height = "100%") %>%
  footnote(general = "H0: There is no difference between the means of Fatalities for two types of National Highway System and \n H1: There is a difference between the means of Fatalities for two types of National Highway System.")

Two way ANOVA mean of Fatalities to NHS test
	Hypothesis Result
Confidence Level(%):	0.95
Alpha:	0.05
CV:	4.965
F value:	4.169
Hypothesis Result:	Do not reject the null hypothesis (Fvalue < CV)

Note: H0: There is no difference between the means of Fatalities for two types of National Highway System and
H1: There is a difference between the means of Fatalities for two types of National Highway System.

data_t<- dplyr::select(data_input,fatals,permvit,pernotmvit,speed_related)  #clean data for t test
data_t<- data_t[complete.cases(data_t),]   #Clean all NA value
data_t

#Fatalities ratio
data_t$Fatals_ratio  <- data_t$fatals/(data_t$pernotmvit+data_t$permvit)*100

speed= subset(data_t, subset=(data_t$speed_related=="1"))    #Selet speed dataset
no_speed = subset(data_t, subset=(data_t$speed_related=="0"))  #Selet no speed dataset

data_t

data_t$speed <- ifelse(data_t$speed_related==0,"No","Yes") #Creat subset to show "no"and"yes" for speed



ggboxplot(data_t, x = "speed", y = "Fatals_ratio", 
          color = "speed", palette = c("#00AFBB", "#E7B800"),
          ylab = "Fatalities ratio", xlab = "speed conditon")+ #creat boxplot
  coord_cartesian(ylim = c(0, 120))

t.test(no_speed$Fatals_ratio,speed$Fatals_ratio, alternative = "two.sided", var.equal = FALSE)  #t test

## 
##  Welch Two Sample t-test
## 
## data:  no_speed$Fatals_ratio and speed$Fatals_ratio
## t = -74.401, df = 41321, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -16.00511 -15.18348
## sample estimates:
## mean of x mean of y 
##  55.00323  70.59752

df <- Groupproject 
df1 <- df %>%  group_by(state,year) %>% mutate(fatal = sum(fatals))
df1 <- df1 %>% filter(row_number()==1)


pop <- read.csv("Datasets/nst-est2019-alldata.csv") %>% dplyr::select(POPESTIMATE2014,STATE,NAME )



df2 <- merge(pop, df1,by.x="STATE", by.y="state", all.y=T)

df2$fatality = df2$fatal/df2$POPESTIMATE2014 * 100000
# question: what is the rate of fatality in each state in 2014

df3 = dplyr::select(df2, NAME, fatality)
df3 %<>% na.omit()


df3 %>%
  kable()%>%
  kable_styling(bootstrap_options = "bordered")

	NAME	fatality
1	Alabama	17.803300
2	Alabama	17.617419
3	Alabama	17.865260
4	Alabama	18.484865
6	Alabama	16.935854
7	Alaska	7.605771
8	Alaska	8.013223
9	Alaska	9.914666
10	Alaska	6.926684
11	Alaska	9.778849
12	Arizona	12.198360
13	Arizona	11.277168
15	Arizona	12.614382
16	Arizona	12.272649
17	Arizona	11.485179
18	Arkansas	19.242486
19	Arkansas	18.871790
20	Arkansas	16.782414
21	Arkansas	15.838824
23	Arkansas	18.568494
25	California	7.684541
26	California	7.295909
27	California	8.049854
28	California	7.047185
29	California	8.036900
31	Colorado	8.859646
32	Colorado	8.354983
33	Colorado	9.009176
34	Colorado	8.411056
35	Colorado	9.121323
36	Connecticut	6.148241
37	Connecticut	7.344505
38	Connecticut	8.902430
39	Connecticut	7.956547
40	Connecticut	6.899384
42	Delaware	12.225371
43	Delaware	10.831250
45	Delaware	10.616770
46	Delaware	13.297773
47	Delaware	10.616770
48	District of Columbia	3.019652
49	District of Columbia	2.264739
50	District of Columbia	3.623582
51	District of Columbia	4.076530
52	District of Columbia	3.472600
53	Florida	12.249375
54	Florida	12.314879
55	Florida	12.093171
56	Florida	12.566820
57	Florida	12.108288
59	Georgia	11.721142
60	Georgia	12.386665
61	Georgia	11.840341
62	Georgia	12.178068
64	Georgia	11.562212
65	Hawaii	7.069446
66	Hawaii	6.715974
67	Hawaii	8.836808
68	Hawaii	7.210835
69	Hawaii	7.988474
71	Idaho	11.280648
72	Idaho	11.403263
73	Idaho	12.813344
74	Idaho	10.238414
76	Idaho	13.119884
77	Illinois	7.171411
78	Illinois	7.419772
79	Illinois	7.124844
81	Illinois	7.691416
82	Illinois	7.194695
84	Indiana	11.844740
85	Indiana	11.890239
86	Indiana	11.298760
87	Indiana	11.389757
88	Indiana	11.435255
89	Iowa	11.738788
90	Iowa	10.195057
91	Iowa	10.355862
93	Iowa	11.577982
94	Iowa	12.542814
95	Kansas	13.273688
97	Kansas	13.963230
98	Kansas	13.308165
99	Kansas	12.066989
100	Kansas	14.859635
101	Kentucky	15.223083
102	Kentucky	16.899434
104	Kentucky	14.452867
105	Kentucky	17.216582
106	Kentucky	16.310446
107	Louisiana	15.137770
108	Louisiana	15.568432
110	Louisiana	14.642508
111	Louisiana	15.934495
112	Louisiana	15.525366
114	Maine	12.326073
115	Maine	10.221621
116	Maine	10.822893
117	Maine	12.100596
118	Maine	9.845826
119	Maryland	8.141295
120	Maryland	8.577736
122	Maryland	8.325943
123	Maryland	7.805572
124	Maryland	7.419490
125	Massachusetts	5.663506
127	Massachusetts	5.131166
128	Massachusetts	5.190315
129	Massachusetts	5.530421
130	Massachusetts	5.234676
131	Michigan	9.466409
132	Michigan	9.486550
133	Michigan	9.536903
134	Michigan	9.073654
135	Michigan	8.952806
137	Minnesota	7.099512
138	Minnesota	6.750957
139	Minnesota	7.246272
140	Minnesota	6.622542
141	Minnesota	7.539792
143	Mississippi	21.434772
144	Mississippi	21.066937
145	Mississippi	20.297826
146	Mississippi	20.498464
148	Mississippi	19.461837
149	Missouri	12.648191
150	Missouri	13.556351
151	Missouri	12.978431
153	Missouri	13.638911
154	Missouri	12.499583
156	Montana	18.789101
157	Montana	20.452720
158	Montana	18.495521
159	Montana	22.409917
160	Montana	20.061280
161	Nebraska	10.110034
162	Nebraska	11.972409
163	Nebraska	9.631138
164	Nebraska	11.280670
165	Nebraska	11.227459
167	Nevada	9.440565
168	Nevada	10.327836
169	Nevada	9.263111
170	Nevada	8.730748
171	Nevada	9.121147
173	New Hampshire	9.599945
174	New Hampshire	10.124942
175	New Hampshire	8.099953
176	New Hampshire	6.749961
178	New Hampshire	7.124959
179	New Jersey	7.073137
180	New Jersey	6.272192
181	New Jersey	6.272192
182	New Jersey	6.644462
183	New Jersey	6.114259
185	New Mexico	18.472718
186	New Mexico	17.515582
187	New Mexico	14.883459
188	New Mexico	16.749874
189	New Mexico	16.702017
191	New York	5.297427
192	New York	6.004768
193	New York	6.116722
195	New York	5.958969
196	New York	6.111633
198	North Carolina	12.383107
199	North Carolina	13.077769
200	North Carolina	12.926755
201	North Carolina	13.289188
202	North Carolina	12.987161
204	North Dakota	20.070491
205	North Dakota	23.053942
206	North Dakota	14.239200
207	North Dakota	20.070491
208	North Dakota	18.307542
209	Ohio	9.661544
211	Ohio	9.308178
212	Ohio	8.523878
213	Ohio	8.670396
214	Ohio	8.765201
215	Oklahoma	17.946530
216	Oklahoma	17.250329
217	Oklahoma	18.281738
218	Oklahoma	17.224543
219	Oklahoma	17.482396
221	Oregon	9.007772
222	Oregon	8.503135
223	Oregon	7.998498
224	Oregon	7.897571
225	Oregon	8.351744
227	Pennsylvania	9.344469
228	Pennsylvania	10.056057
229	Pennsylvania	10.353203
230	Pennsylvania	9.461764
231	Pennsylvania	10.243728
233	Rhode Island	6.060973
234	Rhode Island	4.829838
235	Rhode Island	6.155676
236	Rhode Island	6.345082
237	Rhode Island	6.250379
238	South Carolina	17.061885
239	South Carolina	17.165542
241	South Carolina	16.771647
242	South Carolina	15.900931
243	South Carolina	17.891139
244	South Dakota	15.663109
245	South Dakota	16.016412
246	South Dakota	16.487483
247	South Dakota	13.072219
249	South Dakota	15.898644
250	Tennessee	15.516976
251	Tennessee	14.324538
252	Tennessee	15.776866
253	Tennessee	15.211223
255	Tennessee	14.722018
256	Texas	12.638918
257	Texas	11.211106
258	Texas	12.568455
259	Texas	11.326073
260	Texas	13.113619
262	Utah	7.388796
263	Utah	7.490945
264	Utah	8.614587
265	Utah	8.274090
266	Utah	8.716736
267	Vermont	12.315783
268	Vermont	11.036221
270	Vermont	8.796988
271	Vermont	11.356112
272	Vermont	7.037590
273	Virginia	9.337031
274	Virginia	8.903870
276	Virginia	9.192644
277	Virginia	8.903870
278	Virginia	8.458676
280	Washington	6.435467
281	Washington	6.208666
282	Washington	6.520517
283	Washington	6.180316
284	Washington	6.548867
285	West Virginia	18.275318
287	West Virginia	17.031731
288	West Virginia	17.950904
289	West Virginia	18.329387
290	West Virginia	14.706765
291	Wisconsin	10.692816
292	Wisconsin	9.945189
293	Wisconsin	9.440974
294	Wisconsin	10.119055
295	Wisconsin	8.797667
297	Wyoming	14.934828
298	Wyoming	23.174732
299	Wyoming	26.608026
300	Wyoming	21.114756
301	Wyoming	25.749703