Link to the project in RPubs: http://rpubs.com/ofomicheva86/382640

#required packages
library(corrplot)
library(PerformanceAnalytics)
library(GGally)
library(RColorBrewer)
library(VIM)
library(dplyr)
library(mice)
library(pROC)
library(caret)
library(pscl)
library(ResourceSelection)
library(stringr)
library(vcd)
  1. DATA EXPLORATION
#read training data set
data <- read.csv(file=
"https://raw.githubusercontent.com/olga0503/DATA-621/master/insurance_training_data.csv",
stringsAsFactors=T, header=T)

#read testing data set
data_testing <- read.csv(file=
"https://raw.githubusercontent.com/olga0503/DATA-621/master/insurance-evaluation-data.csv",
stringsAsFactors=T, header=T)

#display first six entries
head(data)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ   INCOME PARENT1
## 1     1           0          0        0  60        0  11  $67,349      No
## 2     2           0          0        0  43        0  11  $91,449      No
## 3     4           0          0        0  35        1  10  $16,039      No
## 4     5           0          0        0  51        0  14               No
## 5     6           0          0        0  50        0  NA $114,986      No
## 6     7           1       2946        0  34        1  12 $125,301     Yes
##   HOME_VAL MSTATUS SEX     EDUCATION           JOB TRAVTIME    CAR_USE
## 1       $0    z_No   M           PhD  Professional       14    Private
## 2 $257,252    z_No   M z_High School z_Blue Collar       22 Commercial
## 3 $124,191     Yes z_F z_High School      Clerical        5    Private
## 4 $306,251     Yes   M  <High School z_Blue Collar       32    Private
## 5 $243,925     Yes z_F           PhD        Doctor       36    Private
## 6       $0    z_No z_F     Bachelors z_Blue Collar       46 Commercial
##   BLUEBOOK TIF   CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1  $14,230  11    Minivan     yes   $4,461        2      No       3
## 2  $14,940   1    Minivan     yes       $0        0      No       0
## 3   $4,010   4      z_SUV      no  $38,690        2      No       3
## 4  $15,440   7    Minivan     yes       $0        0      No       0
## 5  $18,000   1      z_SUV      no  $19,217        2     Yes       3
## 6  $17,430   1 Sports Car      no       $0        0      No       0
##   CAR_AGE          URBANICITY
## 1      18 Highly Urban/ Urban
## 2       1 Highly Urban/ Urban
## 3      10 Highly Urban/ Urban
## 4       6 Highly Urban/ Urban
## 5      17 Highly Urban/ Urban
## 6       7 Highly Urban/ Urban
head(data_testing)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ  INCOME PARENT1
## 1     3          NA         NA        0  48        0  11 $52,881      No
## 2     9          NA         NA        1  40        1  11 $50,815     Yes
## 3    10          NA         NA        0  44        2  12 $43,486     Yes
## 4    18          NA         NA        0  35        2  NA $21,204     Yes
## 5    21          NA         NA        0  59        0  12 $87,460      No
## 6    30          NA         NA        0  46        0  14              No
##   HOME_VAL MSTATUS SEX     EDUCATION           JOB TRAVTIME    CAR_USE
## 1       $0    z_No   M     Bachelors       Manager       26    Private
## 2       $0    z_No   M z_High School       Manager       21    Private
## 3       $0    z_No z_F z_High School z_Blue Collar       30 Commercial
## 4       $0    z_No   M z_High School      Clerical       74    Private
## 5       $0    z_No   M z_High School       Manager       45    Private
## 6 $207,519     Yes   M     Bachelors  Professional        7 Commercial
##   BLUEBOOK TIF    CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1  $21,970   1         Van     yes       $0        0      No       2
## 2  $18,930   6     Minivan      no   $3,295        1      No       2
## 3   $5,900  10       z_SUV      no       $0        0      No       0
## 4   $9,230   6      Pickup      no       $0        0     Yes       0
## 5  $15,420   1     Minivan     yes  $44,857        2      No       4
## 6  $25,660   1 Panel Truck      no   $2,119        1      No       2
##   CAR_AGE            URBANICITY
## 1      10   Highly Urban/ Urban
## 2       1   Highly Urban/ Urban
## 3      10 z_Highly Rural/ Rural
## 4       4 z_Highly Rural/ Rural
## 5       1   Highly Urban/ Urban
## 6      12   Highly Urban/ Urban
#find dimentions
dim(data)
## [1] 8161   26
#build function that counts missing values
count_nas <- function(data){
  
variable_name_column <- c()
number_missing_column <- c()

for (i in 2:ncol(data)){
  variable_name <- colnames(data[i])
  number_missing <- sum(is.na(data[i]))
  variable_name_column <- c(variable_name_column,variable_name)
  number_missing_column <- c(number_missing_column,number_missing)
}

missing_table <- data.frame(variable_name_column,number_missing_column)
missing_table <- missing_table %>% mutate(percentage=round(number_missing_column*100/nrow(data),2)) %>% arrange(desc(percentage))
missing_table
}


#chart for missing values
aggr(data[-1], prop = T, numbers = T, cex.axis=.5, cex.numbers = 0.8,
     ylab=c("Proportion of missingness","Missingness Pattern"),
     labels=names(data[-1]))

#missing values
count_nas(data)
##    variable_name_column number_missing_column percentage
## 1               CAR_AGE                   510       6.25
## 2                   YOJ                   454       5.56
## 3                   AGE                     6       0.07
## 4           TARGET_FLAG                     0       0.00
## 5            TARGET_AMT                     0       0.00
## 6              KIDSDRIV                     0       0.00
## 7              HOMEKIDS                     0       0.00
## 8                INCOME                     0       0.00
## 9               PARENT1                     0       0.00
## 10             HOME_VAL                     0       0.00
## 11              MSTATUS                     0       0.00
## 12                  SEX                     0       0.00
## 13            EDUCATION                     0       0.00
## 14                  JOB                     0       0.00
## 15             TRAVTIME                     0       0.00
## 16              CAR_USE                     0       0.00
## 17             BLUEBOOK                     0       0.00
## 18                  TIF                     0       0.00
## 19             CAR_TYPE                     0       0.00
## 20              RED_CAR                     0       0.00
## 21             OLDCLAIM                     0       0.00
## 22             CLM_FREQ                     0       0.00
## 23              REVOKED                     0       0.00
## 24              MVR_PTS                     0       0.00
## 25           URBANICITY                     0       0.00
count_nas(data_testing[4:length(data_testing)])
##    variable_name_column number_missing_column percentage
## 1               CAR_AGE                   129       6.03
## 2                   YOJ                    94       4.39
## 3                   AGE                     1       0.05
## 4              HOMEKIDS                     0       0.00
## 5                INCOME                     0       0.00
## 6               PARENT1                     0       0.00
## 7              HOME_VAL                     0       0.00
## 8               MSTATUS                     0       0.00
## 9                   SEX                     0       0.00
## 10            EDUCATION                     0       0.00
## 11                  JOB                     0       0.00
## 12             TRAVTIME                     0       0.00
## 13              CAR_USE                     0       0.00
## 14             BLUEBOOK                     0       0.00
## 15                  TIF                     0       0.00
## 16             CAR_TYPE                     0       0.00
## 17              RED_CAR                     0       0.00
## 18             OLDCLAIM                     0       0.00
## 19             CLM_FREQ                     0       0.00
## 20              REVOKED                     0       0.00
## 21              MVR_PTS                     0       0.00
## 22           URBANICITY                     0       0.00
#omit NAs
data <- na.omit(data)
data_testing_no_na <- (data_testing[4:length(data_testing)])
data_testing <- data.frame(data_testing[1:3],data_testing_no_na)

#confirm no NAs
count_nas(data)
##    variable_name_column number_missing_column percentage
## 1           TARGET_FLAG                     0          0
## 2            TARGET_AMT                     0          0
## 3              KIDSDRIV                     0          0
## 4                   AGE                     0          0
## 5              HOMEKIDS                     0          0
## 6                   YOJ                     0          0
## 7                INCOME                     0          0
## 8               PARENT1                     0          0
## 9              HOME_VAL                     0          0
## 10              MSTATUS                     0          0
## 11                  SEX                     0          0
## 12            EDUCATION                     0          0
## 13                  JOB                     0          0
## 14             TRAVTIME                     0          0
## 15              CAR_USE                     0          0
## 16             BLUEBOOK                     0          0
## 17                  TIF                     0          0
## 18             CAR_TYPE                     0          0
## 19              RED_CAR                     0          0
## 20             OLDCLAIM                     0          0
## 21             CLM_FREQ                     0          0
## 22              REVOKED                     0          0
## 23              MVR_PTS                     0          0
## 24              CAR_AGE                     0          0
## 25           URBANICITY                     0          0
count_nas(data_testing[4:length(data_testing)])
##    variable_name_column number_missing_column percentage
## 1               CAR_AGE                   129       6.03
## 2                   YOJ                    94       4.39
## 3                   AGE                     1       0.05
## 4              HOMEKIDS                     0       0.00
## 5                INCOME                     0       0.00
## 6               PARENT1                     0       0.00
## 7              HOME_VAL                     0       0.00
## 8               MSTATUS                     0       0.00
## 9                   SEX                     0       0.00
## 10            EDUCATION                     0       0.00
## 11                  JOB                     0       0.00
## 12             TRAVTIME                     0       0.00
## 13              CAR_USE                     0       0.00
## 14             BLUEBOOK                     0       0.00
## 15                  TIF                     0       0.00
## 16             CAR_TYPE                     0       0.00
## 17              RED_CAR                     0       0.00
## 18             OLDCLAIM                     0       0.00
## 19             CLM_FREQ                     0       0.00
## 20              REVOKED                     0       0.00
## 21              MVR_PTS                     0       0.00
## 22           URBANICITY                     0       0.00
  1. DATA PREPARATION

Clean data and convert data to appropriate formats.

#remove "$" and "z_"
data <- data %>% mutate(INCOME = str_replace(INCOME, "[^[:alnum:]]", ""), HOME_VAL = str_replace(HOME_VAL, "[^[:alnum:]]", ""),SEX = as.factor(str_replace(SEX, "z_", "")), OLDCLAIM = as.factor(str_replace(OLDCLAIM, "[^[:alnum:]]", "")), MSTATUS = as.factor(str_replace(MSTATUS, "z_", "")),EDUCATION = as.factor(str_replace(EDUCATION, "z_", "")), BLUEBOOK = as.factor(str_replace(BLUEBOOK, "[^[:alnum:]]", "")),EDUCATION = as.factor(str_replace(EDUCATION, "<", "")),JOB = as.factor(str_replace(JOB, "z_", "")),CAR_TYPE = as.factor(str_replace(CAR_TYPE, "z_", "")),URBANICITY = as.factor(str_replace(URBANICITY, "z_", "")))
data_testing <- data_testing %>% mutate(INCOME = str_replace(INCOME, "[^[:alnum:]]", ""), HOME_VAL = str_replace(HOME_VAL, "[^[:alnum:]]", ""),SEX = as.factor(str_replace(SEX, "z_", "")), OLDCLAIM = as.factor(str_replace(OLDCLAIM, "[^[:alnum:]]", "")), MSTATUS = as.factor(str_replace(MSTATUS, "z_", "")),EDUCATION = as.factor(str_replace(EDUCATION, "z_", "")), BLUEBOOK = as.factor(str_replace(BLUEBOOK, "[^[:alnum:]]", "")),EDUCATION = as.factor(str_replace(EDUCATION, "<", "")),JOB = as.factor(str_replace(JOB, "z_", "")),CAR_TYPE = as.factor(str_replace(CAR_TYPE, "z_", "")),URBANICITY = as.factor(str_replace(URBANICITY, "z_", "")))

#list all variables
colnames(data)
##  [1] "INDEX"       "TARGET_FLAG" "TARGET_AMT"  "KIDSDRIV"    "AGE"        
##  [6] "HOMEKIDS"    "YOJ"         "INCOME"      "PARENT1"     "HOME_VAL"   
## [11] "MSTATUS"     "SEX"         "EDUCATION"   "JOB"         "TRAVTIME"   
## [16] "CAR_USE"     "BLUEBOOK"    "TIF"         "CAR_TYPE"    "RED_CAR"    
## [21] "OLDCLAIM"    "CLM_FREQ"    "REVOKED"     "MVR_PTS"     "CAR_AGE"    
## [26] "URBANICITY"
#convert INCOME, HOME_VAL and OLDCLAIM to numeric
data <- data %>% mutate(INCOME = as.numeric(as.factor(INCOME)), HOME_VAL = as.numeric(as.factor(HOME_VAL)), OLDCLAIM = as.numeric(as.factor(OLDCLAIM)),BLUEBOOK = as.numeric(as.factor(BLUEBOOK)))
data_testing <- data_testing %>% mutate(INCOME = as.numeric(as.factor(INCOME)), HOME_VAL = as.numeric(as.factor(HOME_VAL)), OLDCLAIM = as.numeric(as.factor(OLDCLAIM)),BLUEBOOK = as.numeric(as.factor(BLUEBOOK))) 

head(data)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1     1           0          0        0  60        0  11   4506      No
## 2     2           0          0        0  43        0  11   5630      No
## 3     4           0          0        0  35        1  10   1114      No
## 4     5           0          0        0  51        0  14      1      No
## 5     7           1       2946        0  34        1  12    662     Yes
## 6    12           1       2501        0  34        0  10   4278      No
##   HOME_VAL MSTATUS SEX   EDUCATION          JOB TRAVTIME    CAR_USE
## 1        2      No   M         PhD Professional       14    Private
## 2     2886      No   M High School  Blue Collar       22 Commercial
## 3      310     Yes   F High School     Clerical        5    Private
## 4     3484     Yes   M High School  Blue Collar       32    Private
## 5        2      No   F   Bachelors  Blue Collar       46 Commercial
## 6        2      No   F   Bachelors     Clerical       34    Private
##   BLUEBOOK TIF   CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1      429  11    Minivan     yes     1285        2      No       3
## 2      498   1    Minivan     yes        1        0      No       0
## 3     2120   4        SUV      no     1164        2      No       3
## 4      548   7    Minivan     yes        1        0      No       0
## 5      739   1 Sports Car      no        1        0      No       0
## 6      132   1        SUV      no        1        0      No       0
##   CAR_AGE          URBANICITY
## 1      18 Highly Urban/ Urban
## 2       1 Highly Urban/ Urban
## 3      10 Highly Urban/ Urban
## 4       6 Highly Urban/ Urban
## 5       7 Highly Urban/ Urban
## 6       1 Highly Urban/ Urban
head(data_testing)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1     3          NA         NA        0  48        0  11   1154      No
## 2     9          NA         NA        1  40        1  11   1119     Yes
## 3    10          NA         NA        0  44        2  12    974     Yes
## 4    18          NA         NA        0  35        2  NA    513     Yes
## 5    21          NA         NA        0  59        0  12   1686      No
## 6    30          NA         NA        0  46        0  14      1      No
##   HOME_VAL MSTATUS SEX   EDUCATION          JOB TRAVTIME    CAR_USE
## 1        2      No   M   Bachelors      Manager       26    Private
## 2        2      No   M High School      Manager       21    Private
## 3        2      No   F High School  Blue Collar       30 Commercial
## 4        2      No   M High School     Clerical       74    Private
## 5        2      No   M High School      Manager       45    Private
## 6      636     Yes   M   Bachelors Professional        7 Commercial
##   BLUEBOOK TIF    CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1      703   1         Van     yes        1        0      No       2
## 2      540   6     Minivan      no      272        1      No       2
## 3     1189  10         SUV      no        1        0      No       0
## 4     1373   6      Pickup      no        1        0     Yes       0
## 5      345   1     Minivan     yes      494        2      No       4
## 6      864   1 Panel Truck      no      137        1      No       2
##   CAR_AGE          URBANICITY
## 1      10 Highly Urban/ Urban
## 2       1 Highly Urban/ Urban
## 3      10 Highly Rural/ Rural
## 4       4 Highly Rural/ Rural
## 5       1 Highly Urban/ Urban
## 6      12 Highly Urban/ Urban

LOGISTIC REGRESSION

The following assumption of logistic regression must be verified:

  1. Verify multicollinearity assumption.
#correlation between variables
corrplot(cor(data[4:length(data_testing)] %>% select_if(is.numeric)), type = "upper", method = "number", tl.cex = 0.5, tl.col="black",number.cex = .5)

  1. Verify linearity assumption.

Analyze scatter plots and mosaic plots.

#create separate boxplots for each numeric variable
par(mfrow=c(1,5))
for(i in 4:ncol(data)) {
  
   if (is.numeric(data[,i])=="TRUE") {
       boxplot(data[,i], main=names(data)[i])
   }
}

#create mosaic plots for each nominal/ordinal variable
par(mfrow=c(1,2))

for(i in 4:ncol(data)) {
  
   if (is.numeric(data[,i])=="FALSE") {
     count <- table(data$TARGET_FLAG, data[,i])
     
     mosaicplot(count, main = names(data)[i],
           xlab = "TARGET_FLAG",
           ylab = names(data)[i],
           las = 1,
           border = "black",
           shade = TRUE
           )
   }
  
}

data_linearity_test <- data %>% select(-TARGET_AMT,-INDEX)

#replacing each numeric variable with variable*log(variable)

for (i in 4:(length(data_linearity_test))){
  
  for (j in 1:nrow(data_linearity_test)){
    
    if (is.double(data_linearity_test[j,i]) == "TRUE" && (data_linearity_test[j,i] < 0 | data_linearity_test[j,i] > 0)){
    
      data_linearity_test[j,i] <- data_linearity_test[j,i]*log(data_linearity_test[j,i])
      
    }
    
  }
}

head(data_linearity_test)
##   TARGET_FLAG KIDSDRIV AGE HOMEKIDS YOJ    INCOME PARENT1     HOME_VAL
## 1           0        0  60        0  11 37909.722      No     1.386294
## 2           0        0  43        0  11 48619.918      No 22994.570770
## 3           0        0  35        1  10  7815.504      No  1778.337412
## 4           0        0  51        0  14     0.000      No 28415.282201
## 5           1        0  34        1  12  4299.866     Yes     1.386294
## 6           1        0  34        0  10 35769.389      No     1.386294
##   MSTATUS SEX   EDUCATION          JOB TRAVTIME    CAR_USE   BLUEBOOK TIF
## 1      No   M         PhD Professional       14    Private  2600.3650  11
## 2      No   M High School  Blue Collar       22 Commercial  3092.8788   1
## 3     Yes   F High School     Clerical        5    Private 16237.4433   4
## 4     Yes   M High School  Blue Collar       32    Private  3455.8389   7
## 5      No   F   Bachelors  Blue Collar       46 Commercial  4881.3152   1
## 6      No   F   Bachelors     Clerical       34    Private   644.5299   1
##     CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1    Minivan     yes 9198.690        2      No       3      18
## 2    Minivan     yes    0.000        0      No       0       1
## 3        SUV      no 8217.395        2      No       3      10
## 4    Minivan     yes    0.000        0      No       0       6
## 5 Sports Car      no    0.000        0      No       0       7
## 6        SUV      no    0.000        0      No       0       1
##            URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Urban/ Urban
## 3 Highly Urban/ Urban
## 4 Highly Urban/ Urban
## 5 Highly Urban/ Urban
## 6 Highly Urban/ Urban
#run regression model that includes all independent variables
model <- glm(formula = TARGET_FLAG ~ . , family = binomial(link = "logit"),
             data = data_linearity_test)
summary(model)
## 
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"), 
##     data = data_linearity_test)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5938  -0.7130  -0.4048   0.6235   3.1627  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -3.960e+00  3.439e-01 -11.517  < 2e-16 ***
## KIDSDRIV                       3.569e-01  6.587e-02   5.419 6.00e-08 ***
## AGE                           -3.512e-03  4.235e-03  -0.829 0.406915    
## HOMEKIDS                       6.132e-02  3.941e-02   1.556 0.119729    
## YOJ                           -1.909e-02  9.141e-03  -2.088 0.036815 *  
## INCOME                        -3.011e-08  2.194e-06  -0.014 0.989048    
## PARENT1Yes                     3.826e-01  1.159e-01   3.301 0.000963 ***
## HOME_VAL                      -1.161e-05  2.933e-06  -3.958 7.56e-05 ***
## MSTATUSYes                    -4.905e-01  8.207e-02  -5.976 2.29e-09 ***
## SEXM                           2.912e-01  1.099e-01   2.650 0.008058 ** 
## EDUCATIONHigh School           4.533e-01  9.307e-02   4.871 1.11e-06 ***
## EDUCATIONMasters               7.600e-02  1.505e-01   0.505 0.613603    
## EDUCATIONPhD                   1.199e-01  1.901e-01   0.631 0.528153    
## JOBBlue Collar                 3.774e-01  1.970e-01   1.916 0.055415 .  
## JOBClerical                    5.933e-01  2.066e-01   2.872 0.004079 ** 
## JOBDoctor                     -2.707e-01  2.731e-01  -0.991 0.321481    
## JOBHome Maker                  5.968e-01  2.147e-01   2.779 0.005445 ** 
## JOBLawyer                      2.467e-01  1.795e-01   1.374 0.169315    
## JOBManager                    -5.858e-01  1.835e-01  -3.193 0.001410 ** 
## JOBProfessional                2.259e-01  1.894e-01   1.193 0.232955    
## JOBStudent                     4.648e-01  2.227e-01   2.088 0.036824 *  
## TRAVTIME                       1.508e-02  2.007e-03   7.513 5.78e-14 ***
## CAR_USEPrivate                -7.844e-01  9.288e-02  -8.445  < 2e-16 ***
## BLUEBOOK                       5.076e-06  4.702e-06   1.080 0.280334    
## TIF                           -5.410e-02  7.804e-03  -6.933 4.12e-12 ***
## CAR_TYPEPanel Truck            2.302e-01  1.512e-01   1.522 0.127954    
## CAR_TYPEPickup                 5.775e-01  1.082e-01   5.338 9.42e-08 ***
## CAR_TYPESports Car             1.266e+00  1.308e-01   9.679  < 2e-16 ***
## CAR_TYPESUV                    9.639e-01  1.101e-01   8.756  < 2e-16 ***
## CAR_TYPEVan                    4.600e-01  1.284e-01   3.584 0.000338 ***
## RED_CARyes                    -1.900e-02  9.183e-02  -0.207 0.836094    
## OLDCLAIM                       1.686e-05  6.495e-06   2.596 0.009427 ** 
## CLM_FREQ                       1.137e-01  3.354e-02   3.389 0.000701 ***
## REVOKEDYes                     7.300e-01  8.584e-02   8.504  < 2e-16 ***
## MVR_PTS                        1.080e-01  1.452e-02   7.438 1.02e-13 ***
## CAR_AGE                       -7.041e-03  7.960e-03  -0.885 0.376367    
## URBANICITYHighly Urban/ Urban  2.342e+00  1.189e-01  19.696  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8303.6  on 7212  degrees of freedom
## Residual deviance: 6465.0  on 7176  degrees of freedom
## AIC: 6539
## 
## Number of Fisher Scoring iterations: 5
#store TARGET_AMT separately
TARGET_AMT <- data$TARGET_AMT
INDEX <- data$INDEX
amount_index <- data.frame(INDEX,TARGET_AMT)

#replace variables that have non-linear relationships with logit function by variables logs
data_logistic_regression <- data %>% select(-INDEX,-TARGET_AMT) %>% mutate(HOME_VAL = ifelse(HOME_VAL > 0,log(HOME_VAL),""), TRAVTIME = ifelse(TRAVTIME > 0,log(TRAVTIME),""), TIF = ifelse(TIF > 0,log(TIF),""),OLDCLAIM = ifelse(OLDCLAIM > 0,log(OLDCLAIM),""))

#count NAs
count_nas(data_logistic_regression)
##    variable_name_column number_missing_column percentage
## 1              KIDSDRIV                     0          0
## 2                   AGE                     0          0
## 3              HOMEKIDS                     0          0
## 4                   YOJ                     0          0
## 5                INCOME                     0          0
## 6               PARENT1                     0          0
## 7              HOME_VAL                     0          0
## 8               MSTATUS                     0          0
## 9                   SEX                     0          0
## 10            EDUCATION                     0          0
## 11                  JOB                     0          0
## 12             TRAVTIME                     0          0
## 13              CAR_USE                     0          0
## 14             BLUEBOOK                     0          0
## 15                  TIF                     0          0
## 16             CAR_TYPE                     0          0
## 17              RED_CAR                     0          0
## 18             OLDCLAIM                     0          0
## 19             CLM_FREQ                     0          0
## 20              REVOKED                     0          0
## 21              MVR_PTS                     0          0
## 22              CAR_AGE                     0          0
## 23           URBANICITY                     0          0
head(data_logistic_regression)
##   TARGET_FLAG KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1  HOME_VAL MSTATUS
## 1           0        0  60        0  11   4506      No 0.6931472      No
## 2           0        0  43        0  11   5630      No 7.9676267      No
## 3           0        0  35        1  10   1114      No 5.7365723     Yes
## 4           0        0  51        0  14      1      No 8.1559363     Yes
## 5           1        0  34        1  12    662     Yes 0.6931472      No
## 6           1        0  34        0  10   4278      No 0.6931472      No
##   SEX   EDUCATION          JOB TRAVTIME    CAR_USE BLUEBOOK      TIF
## 1   M         PhD Professional 2.639057    Private      429 2.397895
## 2   M High School  Blue Collar 3.091042 Commercial      498 0.000000
## 3   F High School     Clerical 1.609438    Private     2120 1.386294
## 4   M High School  Blue Collar 3.465736    Private      548 1.945910
## 5   F   Bachelors  Blue Collar 3.828641 Commercial      739 0.000000
## 6   F   Bachelors     Clerical 3.526361    Private      132 0.000000
##     CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1    Minivan     yes 7.158514        2      No       3      18
## 2    Minivan     yes 0.000000        0      No       0       1
## 3        SUV      no 7.059618        2      No       3      10
## 4    Minivan     yes 0.000000        0      No       0       6
## 5 Sports Car      no 0.000000        0      No       0       7
## 6        SUV      no 0.000000        0      No       0       1
##            URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Urban/ Urban
## 3 Highly Urban/ Urban
## 4 Highly Urban/ Urban
## 5 Highly Urban/ Urban
## 6 Highly Urban/ Urban
  1. BUILD MODELS
#build glm model using stepwise approach
regression_model.null = glm(TARGET_FLAG ~ 1, 
                 data = data_logistic_regression,
                 family = binomial(link="logit")
                 )

regression_model.full = glm(TARGET_FLAG ~ .,
                 data = data_logistic_regression,
                 family = binomial(link="logit")
                 )
     
step(regression_model.null,
     scope = list(upper=regression_model.full),
             direction = "both",
             test = "Chisq",
             data = data_logistic_regression)
## Start:  AIC=8305.56
## TARGET_FLAG ~ 1
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + URBANICITY  1   7864.2 7868.2 439.40 < 2.2e-16 ***
## + OLDCLAIM    1   7871.9 7875.9 431.69 < 2.2e-16 ***
## + MVR_PTS     1   7960.4 7964.4 343.20 < 2.2e-16 ***
## + CLM_FREQ    1   7964.9 7968.9 338.66 < 2.2e-16 ***
## + JOB         8   8054.6 8072.6 248.97 < 2.2e-16 ***
## + PARENT1     1   8143.7 8147.7 159.86 < 2.2e-16 ***
## + CAR_USE     1   8149.6 8153.6 153.93 < 2.2e-16 ***
## + HOME_VAL    1   8151.2 8155.2 152.33 < 2.2e-16 ***
## + CAR_TYPE    5   8145.5 8157.5 158.05 < 2.2e-16 ***
## + EDUCATION   3   8155.2 8163.2 148.40 < 2.2e-16 ***
## + REVOKED     1   8168.7 8172.7 134.90 < 2.2e-16 ***
## + MSTATUS     1   8181.7 8185.7 121.83 < 2.2e-16 ***
## + HOMEKIDS    1   8211.6 8215.6  92.00 < 2.2e-16 ***
## + CAR_AGE     1   8223.8 8227.8  79.80 < 2.2e-16 ***
## + AGE         1   8225.7 8229.7  77.83 < 2.2e-16 ***
## + KIDSDRIV    1   8240.9 8244.9  62.61 2.518e-15 ***
## + TIF         1   8260.9 8264.9  42.67 6.481e-11 ***
## + YOJ         1   8272.4 8276.4  31.19 2.340e-08 ***
## + TRAVTIME    1   8277.5 8281.5  26.05 3.330e-07 ***
## + BLUEBOOK    1   8278.6 8282.6  24.92 5.981e-07 ***
## + INCOME      1   8298.3 8302.3   5.27   0.02174 *  
## + SEX         1   8300.4 8304.4   3.20   0.07356 .  
## <none>            8303.6 8305.6                     
## + RED_CAR     1   8302.9 8306.9   0.70   0.40163    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=7868.15
## TARGET_FLAG ~ URBANICITY
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + JOB         8   7389.8 7409.8 474.32 < 2.2e-16 ***
## + EDUCATION   3   7562.8 7572.8 301.40 < 2.2e-16 ***
## + OLDCLAIM    1   7602.0 7608.0 262.19 < 2.2e-16 ***
## + MVR_PTS     1   7611.5 7617.5 252.68 < 2.2e-16 ***
## + CLM_FREQ    1   7660.1 7666.1 204.03 < 2.2e-16 ***
## + HOME_VAL    1   7676.8 7682.8 187.37 < 2.2e-16 ***
## + PARENT1     1   7681.6 7687.6 182.52 < 2.2e-16 ***
## + CAR_TYPE    5   7674.1 7688.1 190.04 < 2.2e-16 ***
## + CAR_USE     1   7694.3 7700.3 169.90 < 2.2e-16 ***
## + CAR_AGE     1   7706.0 7712.0 158.14 < 2.2e-16 ***
## + MSTATUS     1   7736.4 7742.4 127.76 < 2.2e-16 ***
## + HOMEKIDS    1   7740.2 7746.2 123.94 < 2.2e-16 ***
## + REVOKED     1   7760.0 7766.0 104.19 < 2.2e-16 ***
## + AGE         1   7763.3 7769.3 100.83 < 2.2e-16 ***
## + KIDSDRIV    1   7784.6 7790.6  79.55 < 2.2e-16 ***
## + TRAVTIME    1   7801.5 7807.5  62.65 2.474e-15 ***
## + YOJ         1   7806.2 7812.2  57.95 2.684e-14 ***
## + TIF         1   7816.9 7822.9  47.30 6.102e-12 ***
## + BLUEBOOK    1   7838.0 7844.0  26.11 3.227e-07 ***
## + INCOME      1   7851.6 7857.6  12.59 0.0003887 ***
## + SEX         1   7855.4 7861.4   8.73 0.0031222 ** 
## + RED_CAR     1   7860.7 7866.7   3.46 0.0629175 .  
## <none>            7864.2 7868.2                     
## - URBANICITY  1   8303.6 8305.6 439.40 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=7409.83
## TARGET_FLAG ~ URBANICITY + JOB
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + MVR_PTS     1   7193.8 7215.8 196.00 < 2.2e-16 ***
## + OLDCLAIM    1   7196.7 7218.7 193.13 < 2.2e-16 ***
## + CLM_FREQ    1   7234.6 7256.6 155.26 < 2.2e-16 ***
## + MSTATUS     1   7236.5 7258.5 153.34 < 2.2e-16 ***
## + CAR_TYPE    5   7233.5 7263.5 156.29 < 2.2e-16 ***
## + PARENT1     1   7242.7 7264.7 147.13 < 2.2e-16 ***
## + HOME_VAL    1   7268.8 7290.8 121.00 < 2.2e-16 ***
## + REVOKED     1   7295.2 7317.2  94.59 < 2.2e-16 ***
## + CAR_USE     1   7310.7 7332.7  79.18 < 2.2e-16 ***
## + KIDSDRIV    1   7327.9 7349.9  61.98 3.478e-15 ***
## + TRAVTIME    1   7335.3 7357.3  54.54 1.519e-13 ***
## + TIF         1   7336.9 7358.9  52.96 3.412e-13 ***
## + HOMEKIDS    1   7341.6 7363.6  48.26 3.727e-12 ***
## + AGE         1   7365.7 7387.7  24.12 9.065e-07 ***
## + EDUCATION   3   7363.6 7389.6  26.20 8.666e-06 ***
## + YOJ         1   7372.0 7394.0  17.88 2.355e-05 ***
## + BLUEBOOK    1   7377.5 7399.5  12.32 0.0004491 ***
## + CAR_AGE     1   7381.6 7403.6   8.23 0.0041287 ** 
## + SEX         1   7385.2 7407.2   4.66 0.0308302 *  
## <none>            7389.8 7409.8                     
## + RED_CAR     1   7388.1 7410.1   1.73 0.1878538    
## + INCOME      1   7388.4 7410.4   1.39 0.2387805    
## - JOB         8   7864.2 7868.2 474.32 < 2.2e-16 ***
## - URBANICITY  1   8054.6 8072.6 664.75 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=7215.83
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + MSTATUS     1   7056.1 7080.1 137.72 < 2.2e-16 ***
## + CAR_TYPE    5   7055.8 7087.8 138.04 < 2.2e-16 ***
## + PARENT1     1   7064.1 7088.1 129.68 < 2.2e-16 ***
## + HOME_VAL    1   7090.3 7114.3 103.50 < 2.2e-16 ***
## + REVOKED     1   7103.5 7127.5  90.31 < 2.2e-16 ***
## + OLDCLAIM    1   7119.6 7143.6  74.27 < 2.2e-16 ***
## + CAR_USE     1   7123.7 7147.7  70.10 < 2.2e-16 ***
## + CLM_FREQ    1   7131.3 7155.3  62.49 2.681e-15 ***
## + KIDSDRIV    1   7141.6 7165.6  52.28 4.815e-13 ***
## + TRAVTIME    1   7143.6 7167.6  50.27 1.338e-12 ***
## + TIF         1   7146.6 7170.6  47.20 6.399e-12 ***
## + HOMEKIDS    1   7154.0 7178.0  39.82 2.784e-10 ***
## + EDUCATION   3   7168.9 7196.9  24.95 1.579e-05 ***
## + AGE         1   7176.6 7200.6  17.22 3.326e-05 ***
## + YOJ         1   7180.9 7204.9  12.89 0.0003311 ***
## + BLUEBOOK    1   7181.1 7205.1  12.70 0.0003652 ***
## + CAR_AGE     1   7185.8 7209.8   8.00 0.0046663 ** 
## + SEX         1   7190.4 7214.4   3.38 0.0659479 .  
## + INCOME      1   7191.8 7215.8   2.03 0.1544778    
## <none>            7193.8 7215.8                     
## + RED_CAR     1   7192.1 7216.1   1.69 0.1930480    
## - MVR_PTS     1   7389.8 7409.8 196.00 < 2.2e-16 ***
## - JOB         8   7611.5 7617.5 417.64 < 2.2e-16 ***
## - URBANICITY  1   7735.6 7755.6 541.74 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=7080.11
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + CAR_TYPE    5   6913.8 6947.8 142.30 < 2.2e-16 ***
## + REVOKED     1   6972.8 6998.8  83.30 < 2.2e-16 ***
## + CAR_USE     1   6989.7 7015.7  66.42 3.639e-16 ***
## + OLDCLAIM    1   6991.6 7017.6  64.55 9.403e-16 ***
## + KIDSDRIV    1   6993.1 7019.1  62.97 2.100e-15 ***
## + CLM_FREQ    1   7002.8 7028.8  53.36 2.780e-13 ***
## + TRAVTIME    1   7003.7 7029.7  52.39 4.552e-13 ***
## + HOMEKIDS    1   7006.6 7032.6  49.54 1.941e-12 ***
## + TIF         1   7006.9 7032.9  49.23 2.281e-12 ***
## + PARENT1     1   7014.4 7040.4  41.75 1.035e-10 ***
## + EDUCATION   3   7025.6 7055.6  30.49 1.090e-06 ***
## + HOME_VAL    1   7035.8 7061.8  20.33 6.516e-06 ***
## + BLUEBOOK    1   7045.0 7071.0  11.13 0.0008497 ***
## + CAR_AGE     1   7045.9 7071.9  10.23 0.0013797 ** 
## + AGE         1   7047.7 7073.7   8.42 0.0037186 ** 
## + SEX         1   7052.6 7078.6   3.48 0.0622212 .  
## + YOJ         1   7053.1 7079.1   3.03 0.0815317 .  
## + RED_CAR     1   7054.0 7080.0   2.16 0.1420570    
## <none>            7056.1 7080.1                     
## + INCOME      1   7054.2 7080.2   1.93 0.1644498    
## - MSTATUS     1   7193.8 7215.8 137.72 < 2.2e-16 ***
## - MVR_PTS     1   7236.5 7258.5 180.39 < 2.2e-16 ***
## - JOB         8   7498.9 7506.9 442.74 < 2.2e-16 ***
## - URBANICITY  1   7617.7 7639.7 561.59 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6947.81
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + REVOKED     1   6832.6 6868.6  81.23 < 2.2e-16 ***
## + KIDSDRIV    1   6848.2 6884.2  65.62 5.466e-16 ***
## + CAR_USE     1   6855.6 6891.6  58.16 2.412e-14 ***
## + OLDCLAIM    1   6858.1 6894.1  55.68 8.524e-14 ***
## + TIF         1   6859.8 6895.8  54.01 1.999e-13 ***
## + TRAVTIME    1   6860.6 6896.6  53.17 3.064e-13 ***
## + HOMEKIDS    1   6864.8 6900.8  49.01 2.546e-12 ***
## + CLM_FREQ    1   6867.2 6903.2  46.62 8.614e-12 ***
## + PARENT1     1   6871.9 6907.9  41.93 9.470e-11 ***
## + EDUCATION   3   6885.2 6925.2  28.65 2.647e-06 ***
## + HOME_VAL    1   6894.1 6930.1  19.67 9.186e-06 ***
## + AGE         1   6903.9 6939.9   9.87  0.001681 ** 
## + CAR_AGE     1   6904.5 6940.5   9.30  0.002292 ** 
## + SEX         1   6906.5 6942.5   7.30  0.006886 ** 
## + YOJ         1   6911.6 6947.6   2.22  0.136168    
## <none>            6913.8 6947.8                     
## + RED_CAR     1   6912.4 6948.4   1.38  0.240029    
## + BLUEBOOK    1   6912.7 6948.7   1.12  0.290811    
## + INCOME      1   6912.7 6948.7   1.07  0.301813    
## - CAR_TYPE    5   7056.1 7080.1 142.30 < 2.2e-16 ***
## - MSTATUS     1   7055.8 7087.8 141.99 < 2.2e-16 ***
## - MVR_PTS     1   7075.7 7107.7 161.94 < 2.2e-16 ***
## - JOB         8   7327.3 7345.3 413.48 < 2.2e-16 ***
## - URBANICITY  1   7495.0 7527.0 581.17 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6868.58
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + KIDSDRIV    1   6772.8 6810.8  59.77 1.064e-14 ***
## + CAR_USE     1   6776.8 6814.8  55.78 8.096e-14 ***
## + TRAVTIME    1   6778.1 6816.1  54.50 1.553e-13 ***
## + OLDCLAIM    1   6779.2 6817.2  53.36 2.783e-13 ***
## + TIF         1   6780.7 6818.7  51.88 5.911e-13 ***
## + CLM_FREQ    1   6787.1 6825.1  45.43 1.582e-11 ***
## + HOMEKIDS    1   6788.7 6826.7  43.89 3.481e-11 ***
## + PARENT1     1   6792.5 6830.5  40.11 2.406e-10 ***
## + EDUCATION   3   6804.5 6846.5  28.04 3.557e-06 ***
## + HOME_VAL    1   6814.4 6852.4  18.17 2.018e-05 ***
## + CAR_AGE     1   6823.1 6861.1   9.49  0.002061 ** 
## + AGE         1   6823.9 6861.9   8.67  0.003231 ** 
## + SEX         1   6825.7 6863.7   6.89  0.008668 ** 
## + YOJ         1   6830.0 6868.0   2.56  0.109288    
## <none>            6832.6 6868.6                     
## + BLUEBOOK    1   6831.4 6869.4   1.22  0.268757    
## + RED_CAR     1   6831.4 6869.4   1.14  0.285530    
## + INCOME      1   6831.6 6869.6   0.93  0.334478    
## - REVOKED     1   6913.8 6947.8  81.23 < 2.2e-16 ***
## - CAR_TYPE    5   6972.8 6998.8 140.23 < 2.2e-16 ***
## - MSTATUS     1   6967.4 7001.4 134.80 < 2.2e-16 ***
## - MVR_PTS     1   6991.6 7025.6 159.07 < 2.2e-16 ***
## - JOB         8   7237.9 7257.9 405.34 < 2.2e-16 ***
## - URBANICITY  1   7384.5 7418.5 551.89 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6810.8
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + CAR_USE     1   6713.5 6753.5  59.31 1.346e-14 ***
## + TRAVTIME    1   6716.1 6756.1  56.75 4.946e-14 ***
## + OLDCLAIM    1   6720.5 6760.5  52.34 4.672e-13 ***
## + TIF         1   6722.1 6762.1  50.67 1.093e-12 ***
## + CLM_FREQ    1   6728.4 6768.4  44.42 2.648e-11 ***
## + EDUCATION   3   6745.4 6789.4  27.41 4.830e-06 ***
## + PARENT1     1   6751.5 6791.5  21.30 3.931e-06 ***
## + HOME_VAL    1   6755.5 6795.5  17.28 3.222e-05 ***
## + HOMEKIDS    1   6760.6 6800.6  12.21 0.0004753 ***
## + CAR_AGE     1   6763.7 6803.7   9.08 0.0025847 ** 
## + SEX         1   6763.9 6803.9   8.89 0.0028640 ** 
## + AGE         1   6766.3 6806.3   6.47 0.0109717 *  
## + YOJ         1   6768.6 6808.6   4.16 0.0413811 *  
## <none>            6772.8 6810.8                     
## + RED_CAR     1   6771.1 6811.1   1.70 0.1918995    
## + BLUEBOOK    1   6771.7 6811.7   1.14 0.2850905    
## + INCOME      1   6772.2 6812.2   0.63 0.4287105    
## - KIDSDRIV    1   6832.6 6868.6  59.77 1.064e-14 ***
## - REVOKED     1   6848.2 6884.2  75.38 < 2.2e-16 ***
## - CAR_TYPE    5   6915.8 6943.8 142.95 < 2.2e-16 ***
## - MSTATUS     1   6918.2 6954.2 145.43 < 2.2e-16 ***
## - MVR_PTS     1   6921.7 6957.7 148.94 < 2.2e-16 ***
## - JOB         8   7167.1 7189.1 394.33 < 2.2e-16 ***
## - URBANICITY  1   7341.2 7377.2 568.44 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6753.49
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + TRAVTIME    1   6657.1 6699.1  56.37 6.003e-14 ***
## + TIF         1   6662.3 6704.3  51.20 8.341e-13 ***
## + OLDCLAIM    1   6664.7 6706.7  48.79 2.852e-12 ***
## + CLM_FREQ    1   6672.2 6714.2  41.27 1.325e-10 ***
## + EDUCATION   3   6674.6 6720.6  38.87 1.849e-08 ***
## + PARENT1     1   6692.2 6734.2  21.25 4.035e-06 ***
## + HOME_VAL    1   6694.5 6736.5  18.99 1.313e-05 ***
## + CAR_AGE     1   6700.2 6742.2  13.31 0.0002633 ***
## + HOMEKIDS    1   6700.6 6742.6  12.92 0.0003255 ***
## + SEX         1   6703.4 6745.4  10.13 0.0014561 ** 
## + AGE         1   6706.0 6748.0   7.50 0.0061788 ** 
## + YOJ         1   6710.1 6752.1   3.39 0.0654142 .  
## <none>            6713.5 6753.5                     
## + RED_CAR     1   6711.6 6753.6   1.92 0.1662546    
## + BLUEBOOK    1   6711.9 6753.9   1.63 0.2020085    
## + INCOME      1   6712.8 6754.8   0.74 0.3890476    
## - CAR_USE     1   6772.8 6810.8  59.31 1.346e-14 ***
## - KIDSDRIV    1   6776.8 6814.8  63.30 1.773e-15 ***
## - REVOKED     1   6786.2 6824.2  72.75 < 2.2e-16 ***
## - CAR_TYPE    5   6846.1 6876.1 132.57 < 2.2e-16 ***
## - MVR_PTS     1   6853.0 6891.0 139.52 < 2.2e-16 ***
## - MSTATUS     1   6855.6 6893.6 142.14 < 2.2e-16 ***
## - JOB         8   6990.6 7014.6 277.14 < 2.2e-16 ***
## - URBANICITY  1   7292.7 7330.7 579.20 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6699.12
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + TIF         1   6608.1 6652.1  48.98 2.581e-12 ***
## + OLDCLAIM    1   6610.0 6654.0  47.08 6.800e-12 ***
## + CLM_FREQ    1   6618.4 6662.4  38.73 4.860e-10 ***
## + EDUCATION   3   6614.6 6662.6  42.51 3.125e-09 ***
## + PARENT1     1   6633.5 6677.5  23.64 1.162e-06 ***
## + HOME_VAL    1   6639.8 6683.8  17.33 3.139e-05 ***
## + HOMEKIDS    1   6641.7 6685.7  15.45 8.472e-05 ***
## + CAR_AGE     1   6642.8 6686.8  14.31 0.0001553 ***
## + SEX         1   6647.4 6691.4   9.68 0.0018668 ** 
## + AGE         1   6648.2 6692.2   8.92 0.0028243 ** 
## + YOJ         1   6653.7 6697.7   3.39 0.0655056 .  
## <none>            6657.1 6699.1                     
## + RED_CAR     1   6655.4 6699.4   1.77 0.1834767    
## + BLUEBOOK    1   6655.5 6699.5   1.57 0.2097381    
## + INCOME      1   6656.4 6700.4   0.73 0.3935436    
## - TRAVTIME    1   6713.5 6753.5  56.37 6.003e-14 ***
## - CAR_USE     1   6716.1 6756.1  58.93 1.633e-14 ***
## - KIDSDRIV    1   6722.6 6762.6  65.53 5.732e-16 ***
## - REVOKED     1   6731.2 6771.2  74.10 < 2.2e-16 ***
## - CAR_TYPE    5   6790.3 6822.3 133.14 < 2.2e-16 ***
## - MVR_PTS     1   6793.4 6833.4 136.29 < 2.2e-16 ***
## - MSTATUS     1   6802.3 6842.3 145.19 < 2.2e-16 ***
## - JOB         8   6931.8 6957.8 274.67 < 2.2e-16 ***
## - URBANICITY  1   7272.9 7312.9 615.81 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6652.14
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + OLDCLAIM    1   6561.5 6607.5  46.61 8.642e-12 ***
## + EDUCATION   3   6565.2 6615.2  42.91 2.573e-09 ***
## + CLM_FREQ    1   6570.1 6616.1  38.07 6.841e-10 ***
## + PARENT1     1   6584.6 6630.6  23.56 1.211e-06 ***
## + HOME_VAL    1   6591.0 6637.0  17.12 3.504e-05 ***
## + HOMEKIDS    1   6592.6 6638.6  15.59 7.879e-05 ***
## + CAR_AGE     1   6594.2 6640.2  13.90 0.0001925 ***
## + SEX         1   6598.1 6644.1  10.00 0.0015629 ** 
## + AGE         1   6599.3 6645.3   8.86 0.0029128 ** 
## + YOJ         1   6605.4 6651.4   2.70 0.1003124    
## <none>            6608.1 6652.1                     
## + RED_CAR     1   6606.2 6652.2   1.95 0.1625458    
## + BLUEBOOK    1   6606.3 6652.3   1.87 0.1720231    
## + INCOME      1   6607.7 6653.7   0.47 0.4908447    
## - TIF         1   6657.1 6699.1  48.98 2.581e-12 ***
## - TRAVTIME    1   6662.3 6704.3  54.15 1.854e-13 ***
## - CAR_USE     1   6667.5 6709.5  59.36 1.315e-14 ***
## - KIDSDRIV    1   6672.3 6714.3  64.12 1.172e-15 ***
## - REVOKED     1   6680.4 6722.4  72.27 < 2.2e-16 ***
## - CAR_TYPE    5   6745.8 6779.8 137.71 < 2.2e-16 ***
## - MVR_PTS     1   6739.3 6781.3 131.17 < 2.2e-16 ***
## - MSTATUS     1   6755.2 6797.2 147.06 < 2.2e-16 ***
## - JOB         8   6886.4 6914.4 278.25 < 2.2e-16 ***
## - URBANICITY  1   7232.1 7274.1 624.01 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6607.52
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + EDUCATION   3   6518.8 6570.8  42.69 2.865e-09 ***
## + PARENT1     1   6537.7 6585.7  23.79 1.073e-06 ***
## + HOME_VAL    1   6545.5 6593.5  16.00 6.345e-05 ***
## + HOMEKIDS    1   6546.5 6594.5  15.02 0.0001063 ***
## + CAR_AGE     1   6547.7 6595.7  13.79 0.0002046 ***
## + AGE         1   6552.3 6600.3   9.20 0.0024243 ** 
## + SEX         1   6552.6 6600.6   8.92 0.0028162 ** 
## + YOJ         1   6558.4 6606.4   3.13 0.0767678 .  
## <none>            6561.5 6607.5                     
## + BLUEBOOK    1   6559.7 6607.7   1.87 0.1713670    
## + RED_CAR     1   6559.7 6607.7   1.78 0.1819681    
## + CLM_FREQ    1   6560.1 6608.1   1.41 0.2345579    
## + INCOME      1   6560.9 6608.9   0.59 0.4441139    
## - OLDCLAIM    1   6608.1 6652.1  46.61 8.642e-12 ***
## - TIF         1   6610.0 6654.0  48.51 3.279e-12 ***
## - TRAVTIME    1   6614.0 6658.0  52.45 4.423e-13 ***
## - MVR_PTS     1   6616.0 6660.0  54.47 1.581e-13 ***
## - CAR_USE     1   6617.3 6661.3  55.83 7.919e-14 ***
## - KIDSDRIV    1   6624.7 6668.7  63.22 1.848e-15 ***
## - REVOKED     1   6632.0 6676.0  70.49 < 2.2e-16 ***
## - CAR_TYPE    5   6692.7 6728.7 131.21 < 2.2e-16 ***
## - MSTATUS     1   6701.6 6745.6 140.03 < 2.2e-16 ***
## - JOB         8   6829.3 6859.3 267.83 < 2.2e-16 ***
## - URBANICITY  1   7080.8 7124.8 519.32 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6570.83
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + PARENT1     1   6495.6 6549.6  23.23 1.437e-06 ***
## + HOME_VAL    1   6504.9 6558.9  13.95 0.0001881 ***
## + HOMEKIDS    1   6505.4 6559.4  13.41 0.0002504 ***
## + AGE         1   6510.8 6564.8   8.03 0.0046033 ** 
## + SEX         1   6510.9 6564.9   7.89 0.0049758 ** 
## + YOJ         1   6515.6 6569.6   3.20 0.0737714 .  
## <none>            6518.8 6570.8                     
## + RED_CAR     1   6517.1 6571.1   1.71 0.1912727    
## + CLM_FREQ    1   6517.4 6571.4   1.47 0.2249206    
## + BLUEBOOK    1   6517.6 6571.6   1.23 0.2666877    
## + CAR_AGE     1   6517.9 6571.9   0.95 0.3299285    
## + INCOME      1   6518.7 6572.7   0.10 0.7560501    
## - EDUCATION   3   6561.5 6607.5  42.69 2.865e-09 ***
## - OLDCLAIM    1   6565.2 6615.2  46.39 9.669e-12 ***
## - TIF         1   6567.8 6617.8  49.01 2.549e-12 ***
## - MVR_PTS     1   6572.3 6622.3  53.49 2.600e-13 ***
## - TRAVTIME    1   6574.8 6624.8  55.99 7.274e-14 ***
## - KIDSDRIV    1   6581.3 6631.3  62.51 2.647e-15 ***
## - CAR_USE     1   6586.6 6636.6  67.78 < 2.2e-16 ***
## - REVOKED     1   6588.1 6638.1  69.29 < 2.2e-16 ***
## - JOB         8   6625.0 6661.0 106.20 < 2.2e-16 ***
## - CAR_TYPE    5   6649.1 6691.1 130.26 < 2.2e-16 ***
## - MSTATUS     1   6665.9 6715.9 147.08 < 2.2e-16 ***
## - URBANICITY  1   7051.0 7101.0 532.15 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6549.6
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + HOME_VAL    1   6480.8 6536.8  14.77 0.0001215 ***
## + SEX         1   6486.5 6542.5   9.06 0.0026149 ** 
## + YOJ         1   6491.6 6547.6   4.04 0.0445329 *  
## + AGE         1   6493.4 6549.4   2.20 0.1376563    
## + HOMEKIDS    1   6493.5 6549.5   2.15 0.1428139    
## + RED_CAR     1   6493.5 6549.5   2.10 0.1468804    
## <none>            6495.6 6549.6                     
## + CLM_FREQ    1   6493.9 6549.9   1.74 0.1877295    
## + BLUEBOOK    1   6494.4 6550.4   1.23 0.2677490    
## + CAR_AGE     1   6494.8 6550.8   0.84 0.3585227    
## + INCOME      1   6495.5 6551.5   0.08 0.7764266    
## - PARENT1     1   6518.8 6570.8  23.23 1.437e-06 ***
## - EDUCATION   3   6537.7 6585.7  42.13 3.770e-09 ***
## - KIDSDRIV    1   6537.8 6589.8  42.23 8.115e-11 ***
## - OLDCLAIM    1   6542.2 6594.2  46.59 8.738e-12 ***
## - TIF         1   6544.6 6596.6  48.97 2.605e-12 ***
## - MVR_PTS     1   6547.1 6599.1  51.54 7.006e-13 ***
## - TRAVTIME    1   6554.1 6606.1  58.51 2.019e-14 ***
## - MSTATUS     1   6558.9 6610.9  63.32 1.757e-15 ***
## - CAR_USE     1   6563.2 6615.2  67.63 < 2.2e-16 ***
## - REVOKED     1   6564.5 6616.5  68.90 < 2.2e-16 ***
## - JOB         8   6598.1 6636.1 102.51 < 2.2e-16 ***
## - CAR_TYPE    5   6623.5 6667.5 127.94 < 2.2e-16 ***
## - URBANICITY  1   7030.6 7082.6 534.96 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6536.83
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1 + HOME_VAL
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + SEX         1   6471.9 6529.9   8.94 0.0027828 ** 
## + YOJ         1   6477.2 6535.2   3.64 0.0563974 .  
## + HOMEKIDS    1   6478.8 6536.8   2.02 0.1550890    
## <none>            6480.8 6536.8                     
## + RED_CAR     1   6478.8 6536.8   1.99 0.1587449    
## + AGE         1   6478.9 6536.9   1.90 0.1685293    
## + CLM_FREQ    1   6479.2 6537.2   1.68 0.1952595    
## + BLUEBOOK    1   6479.6 6537.6   1.20 0.2726206    
## + CAR_AGE     1   6479.9 6537.9   0.93 0.3341152    
## + INCOME      1   6480.8 6538.8   0.05 0.8238025    
## - HOME_VAL    1   6495.6 6549.6  14.77 0.0001215 ***
## - PARENT1     1   6504.9 6558.9  24.05 9.368e-07 ***
## - MSTATUS     1   6506.3 6560.3  25.43 4.589e-07 ***
## - EDUCATION   3   6520.8 6570.8  40.00 1.063e-08 ***
## - KIDSDRIV    1   6522.3 6576.3  41.42 1.228e-10 ***
## - OLDCLAIM    1   6526.4 6580.4  45.59 1.457e-11 ***
## - TIF         1   6529.5 6583.5  48.64 3.077e-12 ***
## - MVR_PTS     1   6530.8 6584.8  49.99 1.545e-12 ***
## - TRAVTIME    1   6537.7 6591.7  56.86 4.676e-14 ***
## - REVOKED     1   6548.5 6602.5  67.70 < 2.2e-16 ***
## - CAR_USE     1   6549.7 6603.7  68.88 < 2.2e-16 ***
## - JOB         8   6575.2 6615.2  94.33 < 2.2e-16 ***
## - CAR_TYPE    5   6607.7 6653.7 126.91 < 2.2e-16 ***
## - URBANICITY  1   7016.8 7070.8 536.01 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6529.89
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1 + HOME_VAL + SEX
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + YOJ         1   6468.4 6528.4   3.45  0.063346 .  
## + AGE         1   6469.3 6529.3   2.63  0.105150    
## + HOMEKIDS    1   6469.4 6529.4   2.49  0.114846    
## <none>            6471.9 6529.9                     
## + CLM_FREQ    1   6470.4 6530.4   1.49  0.222077    
## + BLUEBOOK    1   6470.6 6530.6   1.34  0.247682    
## + CAR_AGE     1   6471.0 6531.0   0.88  0.348468    
## + INCOME      1   6471.9 6531.9   0.03  0.858636    
## + RED_CAR     1   6471.9 6531.9   0.02  0.895218    
## - SEX         1   6480.8 6536.8   8.94  0.002783 ** 
## - HOME_VAL    1   6486.5 6542.5  14.66  0.000129 ***
## - PARENT1     1   6497.1 6553.1  25.25 5.046e-07 ***
## - MSTATUS     1   6497.2 6553.2  25.34 4.811e-07 ***
## - EDUCATION   3   6510.9 6562.9  38.97 1.762e-08 ***
## - KIDSDRIV    1   6514.5 6570.5  42.60 6.709e-11 ***
## - OLDCLAIM    1   6516.4 6572.4  44.51 2.528e-11 ***
## - TIF         1   6520.9 6576.9  49.00 2.566e-12 ***
## - MVR_PTS     1   6522.2 6578.2  50.32 1.303e-12 ***
## - TRAVTIME    1   6528.5 6584.5  56.63 5.253e-14 ***
## - REVOKED     1   6539.3 6595.3  67.39 2.228e-16 ***
## - CAR_USE     1   6541.8 6597.8  69.96 < 2.2e-16 ***
## - JOB         8   6568.5 6610.5  96.63 < 2.2e-16 ***
## - CAR_TYPE    5   6595.7 6643.7 123.82 < 2.2e-16 ***
## - URBANICITY  1   7009.2 7065.2 537.28 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6528.44
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## + HOMEKIDS    1   6464.8 6526.8   3.62 0.0572029 .  
## + AGE         1   6466.4 6528.4   2.00 0.1570267    
## <none>            6468.4 6528.4                     
## + CLM_FREQ    1   6467.1 6529.1   1.36 0.2434744    
## + BLUEBOOK    1   6467.1 6529.1   1.31 0.2527242    
## + CAR_AGE     1   6467.5 6529.5   0.91 0.3400224    
## - YOJ         1   6471.9 6529.9   3.45 0.0633461 .  
## + INCOME      1   6468.4 6530.4   0.04 0.8474347    
## + RED_CAR     1   6468.4 6530.4   0.02 0.8985257    
## - SEX         1   6477.2 6535.2   8.75 0.0030928 ** 
## - HOME_VAL    1   6482.7 6540.7  14.26 0.0001588 ***
## - MSTATUS     1   6491.1 6549.1  22.63 1.961e-06 ***
## - PARENT1     1   6494.5 6552.5  26.01 3.388e-07 ***
## - EDUCATION   3   6507.5 6561.5  39.03 1.708e-08 ***
## - KIDSDRIV    1   6512.1 6570.1  43.64 3.948e-11 ***
## - OLDCLAIM    1   6513.4 6571.4  44.95 2.022e-11 ***
## - TIF         1   6516.7 6574.7  48.22 3.802e-12 ***
## - MVR_PTS     1   6517.4 6575.4  48.99 2.569e-12 ***
## - TRAVTIME    1   6525.1 6583.1  56.67 5.149e-14 ***
## - REVOKED     1   6536.1 6594.1  67.69 < 2.2e-16 ***
## - CAR_USE     1   6537.4 6595.4  68.97 < 2.2e-16 ***
## - JOB         8   6558.4 6602.4  89.92 4.832e-16 ***
## - CAR_TYPE    5   6591.1 6641.1 122.68 < 2.2e-16 ***
## - URBANICITY  1   7006.4 7064.4 537.96 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=6526.83
## TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ + HOMEKIDS
## 
##              Df Deviance    AIC    LRT  Pr(>Chi)    
## <none>            6464.8 6526.8                     
## + CLM_FREQ    1   6463.5 6527.5   1.37 0.2423159    
## + BLUEBOOK    1   6463.7 6527.7   1.14 0.2855922    
## + CAR_AGE     1   6463.9 6527.9   0.89 0.3459988    
## + AGE         1   6464.2 6528.2   0.58 0.4457354    
## - HOMEKIDS    1   6468.4 6528.4   3.62 0.0572029 .  
## + INCOME      1   6464.8 6528.8   0.05 0.8163774    
## + RED_CAR     1   6464.8 6528.8   0.03 0.8660822    
## - YOJ         1   6469.4 6529.4   4.58 0.0323854 *  
## - SEX         1   6474.1 6534.1   9.29 0.0023082 ** 
## - PARENT1     1   6477.1 6537.1  12.23 0.0004705 ***
## - HOME_VAL    1   6478.9 6538.9  14.03 0.0001796 ***
## - MSTATUS     1   6490.7 6550.7  25.85 3.685e-07 ***
## - KIDSDRIV    1   6493.2 6553.2  28.33 1.023e-07 ***
## - EDUCATION   3   6503.0 6559.0  38.18 2.588e-08 ***
## - OLDCLAIM    1   6509.5 6569.5  44.62 2.387e-11 ***
## - TIF         1   6513.0 6573.0  48.21 3.822e-12 ***
## - MVR_PTS     1   6513.3 6573.3  48.49 3.323e-12 ***
## - TRAVTIME    1   6522.3 6582.3  57.49 3.393e-14 ***
## - REVOKED     1   6531.5 6591.5  66.64 3.265e-16 ***
## - CAR_USE     1   6534.0 6594.0  69.19 < 2.2e-16 ***
## - JOB         8   6551.7 6597.7  86.90 1.975e-15 ***
## - CAR_TYPE    5   6587.7 6639.7 122.87 < 2.2e-16 ***
## - URBANICITY  1   7002.6 7062.6 537.73 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call:  glm(formula = TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + 
##     CAR_TYPE + REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + 
##     OLDCLAIM + EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ + HOMEKIDS, 
##     family = binomial(link = "logit"), data = data_logistic_regression)
## 
## Coefficients:
##                   (Intercept)  URBANICITYHighly Urban/ Urban  
##                      -4.97393                        2.31267  
##                JOBBlue Collar                    JOBClerical  
##                       0.40049                        0.62287  
##                     JOBDoctor                  JOBHome Maker  
##                      -0.28342                        0.56190  
##                     JOBLawyer                     JOBManager  
##                       0.22651                       -0.60161  
##               JOBProfessional                     JOBStudent  
##                       0.21678                        0.41912  
##                       MVR_PTS                     MSTATUSYes  
##                       0.10286                       -0.44613  
##           CAR_TYPEPanel Truck                 CAR_TYPEPickup  
##                       0.24276                        0.61511  
##            CAR_TYPESports Car                    CAR_TYPESUV  
##                       1.29462                        0.96939  
##                   CAR_TYPEVan                     REVOKEDYes  
##                       0.44868                        0.70437  
##                      KIDSDRIV                 CAR_USEPrivate  
##                       0.34490                       -0.76485  
##                      TRAVTIME                            TIF  
##                       0.40872                       -0.22491  
##                      OLDCLAIM           EDUCATIONHigh School  
##                       0.06630                        0.51158  
##              EDUCATIONMasters                   EDUCATIONPhD  
##                       0.02798                        0.04218  
##                    PARENT1Yes                       HOME_VAL  
##                       0.40297                       -0.04277  
##                          SEXM                            YOJ  
##                       0.28733                       -0.01888  
##                      HOMEKIDS  
##                       0.06966  
## 
## Degrees of Freedom: 7212 Total (i.e. Null);  7182 Residual
## Null Deviance:       8304 
## Residual Deviance: 6465  AIC: 6527
  1. SELECT MODELS

Test Goodness of Fit.

#build the final model for logistic regression
final_logistic_model <- glm(formula = TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ + HOMEKIDS, family = binomial(link = "logit"), data = data_logistic_regression)

summary(final_logistic_model)
## 
## Call:
## glm(formula = TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + 
##     CAR_TYPE + REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + 
##     OLDCLAIM + EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ + HOMEKIDS, 
##     family = binomial(link = "logit"), data = data_logistic_regression)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4136  -0.7171  -0.4019   0.6191   3.1037  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -4.973933   0.339749 -14.640  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.312670   0.118587  19.502  < 2e-16 ***
## JOBBlue Collar                 0.400491   0.196687   2.036 0.041732 *  
## JOBClerical                    0.622865   0.205623   3.029 0.002452 ** 
## JOBDoctor                     -0.283421   0.271886  -1.042 0.297215    
## JOBHome Maker                  0.561895   0.213592   2.631 0.008521 ** 
## JOBLawyer                      0.226513   0.179133   1.264 0.206052    
## JOBManager                    -0.601606   0.183188  -3.284 0.001023 ** 
## JOBProfessional                0.216776   0.189090   1.146 0.251625    
## JOBStudent                     0.419121   0.223734   1.873 0.061027 .  
## MVR_PTS                        0.102862   0.014837   6.933 4.12e-12 ***
## MSTATUSYes                    -0.446125   0.087484  -5.099 3.41e-07 ***
## CAR_TYPEPanel Truck            0.242756   0.148254   1.637 0.101540    
## CAR_TYPEPickup                 0.615111   0.105469   5.832 5.47e-09 ***
## CAR_TYPESports Car             1.294620   0.129511   9.996  < 2e-16 ***
## CAR_TYPESUV                    0.969394   0.109561   8.848  < 2e-16 ***
## CAR_TYPEVan                    0.448676   0.128326   3.496 0.000472 ***
## REVOKEDYes                     0.704373   0.085524   8.236  < 2e-16 ***
## KIDSDRIV                       0.344905   0.064709   5.330 9.82e-08 ***
## CAR_USEPrivate                -0.764853   0.092848  -8.238  < 2e-16 ***
## TRAVTIME                       0.408717   0.055212   7.403 1.33e-13 ***
## TIF                           -0.224914   0.032452  -6.931 4.19e-12 ***
## OLDCLAIM                       0.066303   0.009890   6.704 2.03e-11 ***
## EDUCATIONHigh School           0.511577   0.084712   6.039 1.55e-09 ***
## EDUCATIONMasters               0.027985   0.144192   0.194 0.846113    
## EDUCATIONPhD                   0.042183   0.183744   0.230 0.818421    
## PARENT1Yes                     0.402971   0.115354   3.493 0.000477 ***
## HOME_VAL                      -0.042768   0.011411  -3.748 0.000178 ***
## SEXM                           0.287326   0.094891   3.028 0.002462 ** 
## YOJ                           -0.018883   0.008826  -2.139 0.032401 *  
## HOMEKIDS                       0.069662   0.036508   1.908 0.056372 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8303.6  on 7212  degrees of freedom
## Residual deviance: 6464.8  on 7182  degrees of freedom
## AIC: 6526.8
## 
## Number of Fisher Scoring iterations: 5
#reduced models with fewer parameters
logistic_model2 <- glm(formula =TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ, family = binomial(link = "logit"), data = data_logistic_regression)

logistic_model3 <- glm(formula = TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + EDUCATION + PARENT1 + HOME_VAL + SEX, family = binomial(link = "logit"), data = data_logistic_regression)

#residual deviance test
p_value = 1 - pchisq(final_logistic_model$deviance,final_logistic_model$df.residual)
p_value
## [1] 1
#Likelihood Ratio Test
anova(final_logistic_model, logistic_model2, test ="Chisq")
## Analysis of Deviance Table
## 
## Model 1: TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ + HOMEKIDS
## Model 2: TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1      7182     6464.8                       
## 2      7183     6468.4 -1  -3.6167   0.0572 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(final_logistic_model, logistic_model3, test ="Chisq")
## Analysis of Deviance Table
## 
## Model 1: TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1 + HOME_VAL + SEX + YOJ + HOMEKIDS
## Model 2: TARGET_FLAG ~ URBANICITY + JOB + MVR_PTS + MSTATUS + CAR_TYPE + 
##     REVOKED + KIDSDRIV + CAR_USE + TRAVTIME + TIF + OLDCLAIM + 
##     EDUCATION + PARENT1 + HOME_VAL + SEX
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1      7182     6464.8                       
## 2      7184     6471.9 -2  -7.0642  0.02924 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pseudo R^2 Test
pR2(final_logistic_model)
##           llh       llhNull            G2      McFadden          r2ML 
## -3232.4129169 -4151.7778788  1838.7299237     0.2214389     0.2250206 
##          r2CU 
##     0.3291023
#Hosmer-Lemeshow Test
hoslem.test(data_logistic_regression$TARGET_FLAG, fitted(final_logistic_model), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  data_logistic_regression$TARGET_FLAG, fitted(final_logistic_model)
## X-squared = 13.434, df = 8, p-value = 0.09775

Create dummy variables for training data set.

#check variable factor levels
data_logistic_regression$EDUCATION_HighSchool <- ifelse(data_logistic_regression$EDUCATION == "High School",1,0)
data_logistic_regression$EDUCATION_Masters <- ifelse(data_logistic_regression$EDUCATION == "Masters",1,0)
data_logistic_regression$EDUCATION_PhD <- ifelse(data_logistic_regression$EDUCATION == "PhD",1,0)

data_logistic_regression$URBANICITY_HighlyUrban <- ifelse(data_logistic_regression$URBANICITY == "Highly Urban/ Urban",1,0)
data_logistic_regression$JOB_BlueCollar <- ifelse(data_logistic_regression$JOB == "Blue Collar",1,0)
data_logistic_regression$JOB_Clerical <- ifelse(data_logistic_regression$JOB == "Clerical",1,0)
data_logistic_regression$JOB_HomeMaker <- ifelse(data_logistic_regression$JOB == "Home Maker",1,0)
data_logistic_regression$JOB_Manager <- ifelse(data_logistic_regression$JOB == "Manager",1,0)
data_logistic_regression$MSTATUS_Yes <- ifelse(data_logistic_regression$MSTATUS == "Yes",1,0)
data_logistic_regression$CAR_TYPE_Pickup <- ifelse(data_logistic_regression$CAR_TYPE == "Pickup",1,0)
data_logistic_regression$CAR_TYPE_Sports_Car <- ifelse(data_logistic_regression$CAR_TYPE == "Sports Car",1,0)
data_logistic_regression$CAR_TYPE_SUV <- ifelse(data_logistic_regression$CAR_TYPE == "SUV",1,0)
data_logistic_regression$CAR_TYPE_Van <- ifelse(data_logistic_regression$CAR_TYPE == "Van",1,0)
data_logistic_regression$REVOKED_Yes <- ifelse(data_logistic_regression$REVOKED == "Yes",1,0)
data_logistic_regression$CAR_USE_Private <- ifelse(data_logistic_regression$CAR_USE == "Private",1,0)
data_logistic_regression$EDUCATION_HighSchool <- ifelse(data_logistic_regression$EDUCATION == "High School",1,0)
data_logistic_regression$PARENT1_Yes <- ifelse(data_logistic_regression$PARENT1 == "Yes",1,0)
data_logistic_regression$CAR_TYPE_Sports_Car <- ifelse(data_logistic_regression$CAR_TYPE == "Sports Car",1,0)
data_logistic_regression$SEX_M <- ifelse(data_logistic_regression$SEX == "M",1,0)

#create dummy varibles for testing dataset
data_testing$URBANICITY_HighlyUrban <- ifelse(data_testing$URBANICITY == "Highly Urban/ Urban",1,0)
data_testing$JOB_BlueCollar <- ifelse(data_testing$JOB == "Blue Collar",1,0)
data_testing$JOB_Clerical <- ifelse(data_testing$JOB == "Clerical",1,0)
data_testing$JOB_HomeMaker <- ifelse(data_testing$JOB == "Home Maker",1,0)
data_testing$JOB_Manager <- ifelse(data_testing$JOB == "Manager",1,0)
data_testing$MSTATUS_Yes <- ifelse(data_testing$MSTATUS == "Yes",1,0)
data_testing$CAR_TYPE_Pickup <- ifelse(data_testing$CAR_TYPE == "Pickup",1,0)
data_testing$CAR_TYPE_Sports_Car <- ifelse(data_testing$CAR_TYPE == "Sports Car",1,0)
data_testing$CAR_TYPE_SUV <- ifelse(data_testing$CAR_TYPE == "SUV",1,0)
data_testing$CAR_TYPE_Van <- ifelse(data_testing$CAR_TYPE == "Van",1,0)
data_testing$REVOKED_Yes <- ifelse(data_testing$REVOKED == "Yes",1,0)
data_testing$CAR_USE_Private <- ifelse(data_testing$CAR_USE == "Private",1,0)
data_testing$EDUCATION_HighSchool <- ifelse(data_testing$EDUCATION == "High School",1,0)
data_testing$PARENT1_Yes <- ifelse(data_testing$PARENT1 == "Yes",1,0)
data_testing$CAR_TYPE_Sports_Car <- ifelse(data_testing$CAR_TYPE == "Sports Car",1,0)
data_testing$SEX_M <- ifelse(data_testing$SEX == "M",1,0)

Calculate log odds.

#create a new variable 'probability'
data_logistic_regression$probability <- c()
data_testing$probability <- c()

#calculate logit function using optimal model equasion
logit_p <- -4.93738 + 2.30350*data_logistic_regression$URBANICITY_HighlyUrban + 0.400491*data_logistic_regression$JOB_BlueCollar + 0.62651*data_logistic_regression$JOB_Clerical + 0.561895*data_logistic_regression$JOB_HomeMaker - 0.601606*data_logistic_regression$JOB_Manager + 0.102862*data_logistic_regression$MVR_PTS - 0.446125*data_logistic_regression$MSTATUS_Yes + 0.61657*data_logistic_regression$CAR_TYPE_Pickup + 1.29135*data_logistic_regression$CAR_TYPE_Sports_Car + 0.96726*data_logistic_regression$CAR_TYPE_SUV + 0.44968*data_logistic_regression$CAR_TYPE_Van + 0.70753*data_logistic_regression$REVOKED_Yes + 0.344905*data_logistic_regression$KIDSDRIV - 0.76242*data_logistic_regression$CAR_USE_Private + 0.40831*data_logistic_regression$TRAVTIME - 0.22526*data_logistic_regression$TIF + 0.06899*data_logistic_regression$OLDCLAIM +  0.50684*data_logistic_regression$EDUCATION_HighSchool + 0.43558*data_logistic_regression$PARENT1_Yes - 0.04297*data_logistic_regression$HOME_VAL + 0.28046*data_logistic_regression$SEX_M - 0.018883*data$YOJ

#calculate logit function using alternative model equasion
logit_p_alternative <- -4.973933 + 2.312670*data$URBANICITY_HighlyUrban  + 0.400491*data$JOB_BlueCollar + 0.622865*data$JOB_Clerical + 0.561895*data$JOB_HomeMaker - 0.601606*data$JOB_Manager  + 0.102862*data$MVR_PTS - 0.446125*data$MSTATUS_Yes + 0.61657*data$CAR_TYPE_Pickup + 1.29135*data$CAR_TYPE_Sports_Car + 0.96726*data$CAR_TYPE_SUV + 0.44968*data$CAR_TYPE_Van + 0.70753*data$REVOKED_Yes + 0.344905*data$KIDSDRIV - 0.76242*data$CAR_USE_Private + 0.40831*data$TRAVTIME - 0.22526*data$TIF + 0.06899*data$OLDCLAIM +  0.50684*data$EDUCATION_HighSchool + 0.43558*data$PARENT1_Yes - 0.04297*data$HOME_VAL + 0.28046*data$SEX_M - 0.018883*data$YOJ

logit_p_testing <- -4.973933 + 2.312670*data_testing$URBANICITY_HighlyUrban  + 0.400491*data_testing$JOB_BlueCollar + 0.622865*data_testing$JOB_Clerical + 0.561895*data_testing$JOB_HomeMaker - 0.601606*data_testing$JOB_Manager  + 0.102862*data_testing$MVR_PTS - 0.446125*data_testing$MSTATUS_Yes + 0.61657*data_testing$CAR_TYPE_Pickup + 1.29135*data_testing$CAR_TYPE_Sports_Car + 0.96726*data_testing$CAR_TYPE_SUV + 0.44968*data_testing$CAR_TYPE_Van + 0.70753*data_testing$REVOKED_Yes + 0.344905*data_testing$KIDSDRIV - 0.76242*data_testing$CAR_USE_Private + 0.40831*data_testing$TRAVTIME - 0.22526*data_testing$TIF + 0.06899*data_testing$OLDCLAIM +  0.50684*data_testing$EDUCATION_HighSchool + 0.43558*data_testing$PARENT1_Yes - 0.04297*data_testing$HOME_VAL + 0.28046*data_testing$SEX_M - 0.018883*data_testing$YOJ

Calculate probability of getting into car accident. Predict ‘TARGET_FLAG’ class for testing and training data sets.

#calculate probability
data_logistic_regression$probability <- (exp(1)^logit_p)/(1+exp(1)^logit_p)
data_testing$probability <- exp(1)^logit_p_testing/(1+exp(1)^logit_p_testing)
data_testing$probability 
##    [1] 9.991562e-01 1.000000e+00 9.991668e-01           NA 1.000000e+00
##    [6] 1.337410e-08 3.488676e-07 1.474999e-02 1.833339e-23 9.977398e-01
##   [11]           NA 5.199257e-01 1.000000e+00 9.172867e-01 6.637367e-01
##   [16] 9.999998e-01 9.999963e-01 4.989484e-06 1.000000e+00 1.000000e+00
##   [21] 1.000000e+00 4.006423e-20 4.223537e-20 9.999833e-01 7.541605e-22
##   [26] 9.950475e-01 1.000000e+00 2.618009e-06 1.297758e-16 3.817256e-20
##   [31] 5.446553e-20 1.000000e+00 2.541876e-15 1.924232e-19 1.737243e-01
##   [36]           NA 9.789095e-04 2.185046e-01           NA 9.999797e-01
##   [41] 9.999701e-01 9.997209e-01 4.743812e-12 1.000000e+00 2.274956e-03
##   [46] 4.100613e-15 1.000000e+00 1.491647e-05 9.999832e-01 6.075518e-01
##   [51] 8.238502e-16 1.000000e+00 1.000000e+00 9.986468e-01 1.000000e+00
##   [56] 4.772734e-14 1.000000e+00 1.000000e+00 3.671643e-12 9.983249e-01
##   [61] 3.025951e-07 1.084019e-16 8.529005e-01 9.981782e-01 5.375234e-12
##   [66]           NA 1.000000e+00 1.000000e+00 9.982177e-01 9.644926e-01
##   [71] 7.901832e-01 1.000000e+00 1.000000e+00 9.966134e-01 9.718959e-01
##   [76] 1.000000e+00 4.562253e-03 1.000000e+00 9.847121e-04 1.358265e-01
##   [81] 1.000000e+00 1.000000e+00 5.423676e-03 6.827702e-01 9.992215e-01
##   [86] 9.995504e-01 2.808516e-02 1.000000e+00 1.014909e-04 1.000000e+00
##   [91] 9.999947e-01 1.000000e+00 9.032302e-01 2.536132e-03 2.349733e-08
##   [96] 5.409637e-10 1.218352e-03 6.760640e-01 5.824237e-07 8.579031e-10
##  [101] 9.999704e-01 9.992760e-01 9.999999e-01 3.364843e-14 5.646662e-17
##  [106] 8.088321e-02 7.047803e-07 9.967364e-01 1.000000e+00 9.999979e-01
##  [111] 1.000000e+00 9.997268e-01 1.201364e-01 1.595884e-09 9.999940e-01
##  [116] 3.120073e-03 6.572122e-19 1.000000e+00 1.000000e+00 1.000000e+00
##  [121] 1.000000e+00 1.000000e+00 9.978326e-01 1.000000e+00 8.261186e-13
##  [126] 1.000000e+00 6.652920e-10 9.697054e-01 1.000000e+00 6.378436e-20
##  [131] 9.304442e-21           NA 3.923775e-14 3.033100e-01 3.786816e-05
##  [136] 1.000000e+00 1.000000e+00 4.839932e-01 9.998999e-01 9.999999e-01
##  [141] 6.807017e-07 1.000000e+00 8.966885e-01 9.989857e-01 1.874463e-13
##  [146] 1.000000e+00 9.999761e-01 9.983475e-01 1.000000e+00 2.909540e-22
##  [151] 1.000000e+00 1.070080e-07 1.000000e+00 4.337199e-20 9.999962e-01
##  [156] 1.000000e+00 1.000000e+00 4.310062e-13 1.000000e+00 1.000000e+00
##  [161] 1.000000e+00 1.323689e-09 9.983556e-01 3.587998e-04 1.000000e+00
##  [166] 9.752234e-01 4.054002e-02 1.000000e+00 2.786234e-08 1.000000e+00
##  [171] 1.290432e-03 1.000000e+00 9.924147e-01 1.000000e+00 8.077345e-13
##  [176] 3.095271e-17 9.999825e-01 1.000000e+00 1.000000e+00 1.163981e-06
##  [181] 1.000000e+00 7.863900e-06 9.741054e-14 1.593427e-20 1.000000e+00
##  [186] 1.000000e+00 6.298621e-18 5.339705e-16           NA 1.000000e+00
##  [191] 9.688631e-01 1.000000e+00 6.633404e-01 9.999620e-01 4.717433e-06
##  [196] 9.999984e-01 1.000000e+00 1.227725e-12 5.669425e-03 8.717258e-01
##  [201] 9.997371e-01 9.727366e-01 2.210473e-18 9.999700e-01 9.999975e-01
##  [206] 4.622415e-13 9.999998e-01 8.965515e-01 9.428593e-01 8.628850e-13
##  [211] 4.967659e-19 3.330193e-05 3.519627e-01 4.657784e-02 8.460254e-01
##  [216] 1.000000e+00 3.478653e-06 1.000000e+00 5.952505e-13 2.553183e-21
##  [221] 8.519176e-03 1.056105e-15 9.992095e-01 1.000000e+00 8.168794e-14
##  [226] 1.000000e+00 9.785202e-01 9.999998e-01 7.486246e-01 5.610636e-04
##  [231] 3.827775e-08           NA 1.867072e-03 1.880903e-11 1.616136e-21
##  [236] 4.001906e-01 2.859045e-17           NA 3.765984e-24 9.122125e-01
##  [241] 9.999054e-01 9.999977e-01 9.985073e-01 3.701373e-10 1.000000e+00
##  [246] 5.138667e-01 1.000000e+00 9.876840e-01 3.182194e-01 1.000000e+00
##  [251] 1.852952e-10 1.000000e+00 3.972133e-07           NA 9.999931e-01
##  [256] 2.376649e-05 4.055179e-09 1.000000e+00 1.000000e+00 9.996932e-01
##  [261] 9.938856e-01 9.655233e-01 2.962275e-06 1.090752e-03 3.260202e-13
##  [266] 5.982201e-05 3.598859e-10 1.585833e-06 1.000000e+00 9.982667e-01
##  [271] 9.999959e-01 8.223011e-16 2.553424e-07 1.000000e+00 1.247845e-08
##  [276] 4.488421e-22 1.000000e+00 1.000000e+00 2.429337e-04 1.507925e-01
##  [281] 6.286569e-15 1.000000e+00 1.000000e+00 1.000000e+00           NA
##  [286] 1.000000e+00 1.127553e-09 1.000000e+00 9.999997e-01 9.999054e-01
##  [291] 2.253533e-11 8.155802e-01 1.000000e+00 1.000000e+00 3.423680e-11
##  [296] 9.999889e-01 3.496316e-08 1.000000e+00 5.137769e-10 1.000000e+00
##  [301] 1.009203e-12 9.858302e-01 6.988856e-09 9.999997e-01 1.000000e+00
##  [306] 7.928896e-01 9.999973e-01 1.000000e+00 4.693949e-14 9.981822e-01
##  [311] 1.000000e+00 1.000000e+00 6.341284e-14 1.000000e+00 1.029988e-18
##  [316] 9.842820e-01 6.470847e-06 3.371238e-01 1.000000e+00 5.793457e-13
##  [321] 9.961125e-01 9.966951e-01 4.109367e-11 1.570584e-16 2.821716e-02
##  [326] 9.992577e-01 1.000000e+00 9.002133e-01 9.999963e-01 9.999984e-01
##  [331] 1.671764e-16           NA 2.194223e-04 8.113208e-01 1.000000e+00
##  [336] 9.999999e-01 9.374320e-01 9.999997e-01 1.856337e-01 2.715044e-22
##  [341] 9.999999e-01 2.684901e-06 1.000000e+00 1.000000e+00 4.371759e-15
##  [346] 2.123889e-04 3.754087e-11 3.072307e-20           NA 2.599594e-13
##  [351]           NA 1.566188e-10 1.000000e+00 1.000000e+00 6.222192e-02
##  [356] 5.332995e-04 1.000000e+00 9.534229e-01 2.320905e-23 6.442119e-16
##  [361] 9.999992e-01 1.520427e-11 1.943526e-20 5.838187e-04 4.708507e-19
##  [366]           NA 2.380701e-22 4.463268e-07 9.999996e-01 9.999015e-01
##  [371] 2.256447e-18 1.000000e+00 1.000000e+00 9.999987e-01 4.191013e-02
##  [376] 1.000000e+00 5.260667e-20 9.999143e-01 1.164027e-13 3.980915e-06
##  [381] 8.892652e-14 4.620424e-02 1.000000e+00 1.000000e+00 3.931986e-06
##  [386] 9.993981e-01 9.999994e-01 1.954353e-02 5.686822e-05 9.999998e-01
##  [391]           NA 2.899067e-09 6.745375e-11 1.000000e+00 2.746774e-05
##  [396] 3.363339e-03 5.500587e-19           NA 1.000000e+00 1.000000e+00
##  [401] 9.999126e-01 9.998413e-01 4.154888e-15 9.999025e-01 4.763192e-22
##  [406] 9.992728e-01 2.704512e-02 1.000000e+00 2.289895e-02 6.874294e-10
##  [411] 1.838119e-01 9.999998e-01 9.998079e-01 6.207225e-21 1.034743e-01
##  [416] 6.715091e-02 1.195037e-10 1.000000e+00 9.901904e-11 1.239737e-10
##  [421] 9.999899e-01 1.000000e+00 1.000000e+00 5.320582e-01 1.000000e+00
##  [426] 1.000000e+00 7.263333e-04 9.944578e-01 9.695398e-01 2.566057e-01
##  [431] 9.999999e-01 1.627710e-22 2.941131e-05 9.304606e-01 1.000000e+00
##  [436] 1.000000e+00 7.675514e-24 9.996293e-01 1.000000e+00 1.194774e-06
##  [441] 5.133882e-23 4.216702e-06 4.111896e-14 9.999592e-01 1.000000e+00
##  [446] 2.710324e-04 3.505717e-23 9.999895e-01 9.999998e-01 9.999645e-01
##  [451] 5.274005e-12 1.813399e-01 1.000000e+00 2.885222e-21 1.000000e+00
##  [456] 1.000000e+00 2.419765e-07 1.000000e+00 3.852036e-19 4.435915e-20
##  [461] 1.373779e-07 2.143976e-05 9.999990e-01 9.318472e-01           NA
##  [466] 6.917334e-16 1.000000e+00 9.999999e-01 7.006116e-03 1.000000e+00
##  [471] 5.984604e-01 1.000000e+00 9.422565e-01 9.986503e-01 1.000000e+00
##  [476] 1.679516e-06 1.000000e+00 1.000000e+00 2.021736e-02 9.999999e-01
##  [481] 1.363687e-01 1.000000e+00 2.166450e-20 6.113691e-01 1.000000e+00
##  [486] 1.000000e+00 9.997887e-01 1.000000e+00 9.999975e-01 1.000000e+00
##  [491]           NA 1.439298e-04 1.000000e+00           NA 1.172290e-15
##  [496] 1.000000e+00 9.999999e-01 9.988784e-01 1.000000e+00 1.000000e+00
##  [501] 1.000000e+00 5.812707e-01 1.000000e+00 1.727108e-01 1.000000e+00
##  [506] 9.999350e-01 1.000000e+00 5.181745e-01 9.989977e-14           NA
##  [511] 8.641808e-20 1.000000e+00 9.999797e-01           NA 9.999993e-01
##  [516] 8.490683e-01 1.000000e+00 4.242594e-10 9.999983e-01 1.000000e+00
##  [521] 9.551413e-01 9.953319e-01 7.385869e-05 1.000000e+00 1.000000e+00
##  [526]           NA           NA 3.561306e-14 1.143423e-04           NA
##  [531] 9.999999e-01 9.998128e-01 6.273518e-06 1.094297e-11 1.009467e-15
##  [536] 1.503701e-16 9.948465e-01 2.790049e-02 4.393087e-01 1.602631e-16
##  [541] 5.328524e-24 9.999918e-01 1.000000e+00 1.000000e+00 3.779447e-19
##  [546] 1.000000e+00 9.962876e-01 1.000000e+00 9.995475e-01 1.000000e+00
##  [551] 9.999993e-01 1.170845e-11 6.452779e-20 9.999908e-01 4.030420e-01
##  [556]           NA 1.000000e+00           NA 3.868957e-11 9.984978e-01
##  [561] 1.430105e-20 2.022659e-13 9.998395e-01 1.000000e+00 1.000000e+00
##  [566] 1.869859e-05 9.999999e-01 9.854630e-01 2.133315e-04 1.000000e+00
##  [571] 7.595447e-22 9.681855e-01           NA 9.487488e-10 2.561293e-12
##  [576] 6.879257e-06 9.999716e-01 5.884100e-03 2.541639e-04 1.000000e+00
##  [581] 9.942086e-01 1.000000e+00 4.337442e-08 1.000000e+00 6.061236e-19
##  [586] 7.984275e-11 3.921935e-23 1.534987e-01 9.999998e-01 1.000000e+00
##  [591] 1.169033e-02 1.000000e+00 4.997557e-07 1.000000e+00 1.000000e+00
##  [596] 4.502401e-03 1.000000e+00 9.958039e-01 4.224639e-08 9.999812e-01
##  [601] 9.999893e-01 9.991262e-01 1.000000e+00 9.999993e-01 7.946445e-01
##  [606] 1.000000e+00 1.000000e+00 9.999996e-01 9.732310e-01 1.763554e-13
##  [611] 2.258484e-18 1.000000e+00 2.784143e-03 6.394152e-20 9.999955e-01
##  [616] 9.997819e-01 2.376770e-17 1.000000e+00 9.999987e-01 1.000000e+00
##  [621] 9.091605e-01 3.846451e-11 7.784843e-05 1.000000e+00 9.351599e-14
##  [626] 9.999999e-01           NA 6.566844e-02 3.921619e-06 1.000000e+00
##  [631] 9.789665e-01 9.999888e-01 5.741904e-20 1.208058e-08 4.887582e-18
##  [636] 6.777441e-17 9.989519e-01 9.989070e-01 4.153671e-06 9.771372e-01
##  [641] 1.000000e+00 1.000000e+00 9.671374e-01 7.011528e-17 1.361036e-03
##  [646] 6.936542e-01 8.177915e-13 1.000000e+00 1.000000e+00 9.990083e-01
##  [651] 9.823776e-01 1.608801e-12 1.000000e+00           NA 9.240185e-04
##  [656] 1.107481e-14 9.998654e-01 6.824858e-16 6.899854e-01 9.995862e-01
##  [661] 9.999992e-01 9.966609e-01 1.000000e+00 2.124163e-09 9.795351e-01
##  [666] 9.731287e-05 2.374863e-03 7.960313e-14 1.000000e+00 9.842930e-01
##  [671] 1.090538e-02 1.000000e+00 1.000000e+00 9.979324e-01 9.993281e-01
##  [676] 5.255804e-10 1.000000e+00 1.099112e-03 6.619673e-22 5.313369e-23
##  [681] 8.867394e-01 1.263421e-15 9.666521e-01 1.330903e-18 1.000000e+00
##  [686] 9.914735e-03 5.209268e-04 2.958848e-03 1.000000e+00 1.102475e-16
##  [691] 1.000000e+00 9.999988e-01 9.979183e-01 9.698801e-01 1.143315e-21
##  [696] 9.999322e-01 3.868443e-22 3.370639e-07 9.680802e-01 6.417461e-01
##  [701] 1.515519e-24 8.990871e-19 9.973084e-01 7.719807e-01 1.000000e+00
##  [706] 1.733026e-08 9.999998e-01 1.000000e+00 2.850571e-07 2.859341e-14
##  [711] 1.000000e+00 9.996219e-01 9.999866e-01 9.929055e-01 1.264909e-04
##  [716] 1.424522e-17 1.691492e-12 9.440242e-01 7.764469e-11 1.305822e-01
##  [721] 1.000000e+00 1.287217e-01 5.848466e-04 6.828672e-04 1.157504e-18
##  [726] 1.741004e-09 9.999777e-01 2.219224e-07 9.999948e-01 4.644697e-07
##  [731] 4.964628e-01 9.999996e-01 9.999536e-01 1.085689e-08 9.999999e-01
##  [736] 1.000000e+00 9.997503e-01 5.528143e-10 1.000000e+00 1.223813e-14
##  [741] 1.000000e+00 3.654590e-22 9.999128e-01 2.734777e-08 1.000000e+00
##  [746] 2.584609e-02 9.999999e-01 9.935983e-01 3.864813e-15 9.999989e-01
##  [751] 9.903460e-01 1.000000e+00 4.839469e-10 1.000000e+00 5.087945e-21
##  [756] 1.052400e-15 9.999772e-01 1.000000e+00           NA 9.995450e-01
##  [761] 9.997774e-01 7.892228e-05 1.000000e+00 1.000000e+00 1.000000e+00
##  [766] 2.621857e-11 9.999980e-01 1.000000e+00 3.250405e-03 1.000000e+00
##  [771] 1.000000e+00 1.000000e+00 9.992492e-01 2.584695e-18           NA
##  [776] 6.027582e-01 9.999928e-01 7.776258e-01 5.679972e-14 1.038367e-04
##  [781] 4.871833e-01 9.999752e-01           NA 1.257057e-01 5.557837e-11
##  [786] 1.000000e+00 1.712751e-06 1.000000e+00 1.000000e+00 1.711158e-01
##  [791] 4.373955e-05 9.999779e-01 6.130128e-17 7.047327e-15 1.980042e-12
##  [796] 5.505864e-02 2.392870e-15 1.000000e+00 1.000000e+00 8.504784e-01
##  [801] 9.999998e-01 9.990567e-01 5.132889e-17 1.280476e-01 9.833804e-01
##  [806] 5.889707e-13 2.729982e-09 2.344183e-01 8.023373e-20 1.000000e+00
##  [811] 1.019556e-01 7.971523e-17 1.595936e-12 2.235276e-04 1.000000e+00
##  [816] 3.626835e-01 9.999998e-01 1.000000e+00 1.000000e+00 1.000000e+00
##  [821] 1.000000e+00 9.927058e-01           NA 6.090454e-05 1.000000e+00
##  [826] 9.995786e-01 9.999970e-01 2.201530e-16 6.399683e-15 9.999968e-01
##  [831] 9.999934e-01 1.368263e-12           NA 3.241159e-05 1.430014e-11
##  [836] 4.275187e-19 1.000000e+00 9.987315e-01 1.296422e-03 5.171303e-19
##  [841] 9.999986e-01 9.998943e-01 9.999998e-01 9.993799e-01 1.357250e-19
##  [846] 6.708374e-01 1.000000e+00 8.157909e-10 1.000000e+00 3.996953e-01
##  [851] 4.375978e-11 9.135445e-03 9.999953e-01 1.722608e-18 1.000000e+00
##  [856] 9.487880e-01 5.597974e-08 1.241390e-11 1.000000e+00 7.568908e-01
##  [861] 5.558242e-20 1.000000e+00 1.000000e+00 1.698807e-11 1.000000e+00
##  [866] 1.477880e-15 1.000000e+00 2.790839e-15 1.618941e-16 1.000000e+00
##  [871] 1.082136e-16 1.000000e+00 2.712766e-06 9.999999e-01 2.254508e-04
##  [876] 1.000000e+00           NA 4.915092e-15 1.817723e-04 8.766960e-01
##  [881] 1.000000e+00 9.950477e-01 6.546283e-15 4.371365e-15 1.000000e+00
##  [886] 2.381328e-03 1.000000e+00 4.379473e-14 4.477340e-13           NA
##  [891] 1.061228e-16 9.914338e-01 6.502037e-17 1.174163e-17 9.999813e-01
##  [896] 1.000000e+00 9.999912e-01 1.000000e+00 1.000000e+00 1.000000e+00
##  [901] 9.999744e-01 2.634492e-15 1.000000e+00 4.520297e-18 1.087090e-01
##  [906] 1.000000e+00 9.997258e-01 9.965478e-21 1.000000e+00 1.000000e+00
##  [911] 1.000000e+00 2.176042e-02 1.624678e-16 2.198004e-05 5.981268e-01
##  [916] 9.999997e-01 1.000000e+00 9.994737e-01 7.483488e-01 1.000000e+00
##  [921] 9.997740e-01 2.399382e-22 6.581138e-08 9.952393e-01 9.999110e-01
##  [926] 1.366586e-15 9.999992e-01 1.120099e-01 1.000000e+00 4.918310e-21
##  [931] 4.802223e-01 4.959105e-13           NA 2.851642e-23 1.000000e+00
##  [936] 1.000000e+00 2.557063e-16 1.000000e+00 1.000000e+00 6.250288e-02
##  [941] 1.000000e+00 8.685233e-13 1.000000e+00 1.000000e+00 1.000000e+00
##  [946] 9.953357e-01 1.000000e+00 8.602130e-01 9.994929e-01 9.593866e-21
##  [951] 9.999990e-01 6.176959e-12 1.000000e+00 5.076749e-01 9.825757e-01
##  [956] 1.000000e+00 1.000000e+00 2.314578e-13 9.999336e-01 9.905300e-01
##  [961] 9.531057e-02 1.000000e+00 1.073139e-02 9.999939e-01 1.835101e-07
##  [966] 1.000000e+00 9.997392e-01 1.000000e+00 1.000000e+00 1.000000e+00
##  [971] 3.915258e-23 1.000000e+00 1.554460e-07 1.000000e+00 2.471853e-10
##  [976] 1.000000e+00 9.997258e-01 9.999978e-01 3.302584e-12 1.000000e+00
##  [981] 9.999967e-01 1.856028e-02 9.999993e-01 1.000000e+00 1.000000e+00
##  [986] 1.000000e+00 3.112384e-11 3.573540e-02 3.623012e-01           NA
##  [991] 1.306272e-10 2.798720e-11 1.000000e+00 1.000000e+00 9.893673e-01
##  [996] 5.508458e-18 1.000000e+00 9.999112e-01 1.000000e+00 9.999910e-01
## [1001] 9.999997e-01 1.000000e+00 1.000000e+00 5.182325e-09 9.999373e-01
## [1006] 1.179682e-17 3.437426e-09 1.000000e+00 1.371268e-06 1.058572e-03
## [1011] 9.999059e-01 2.708175e-11 9.999740e-01 8.037672e-01 3.230547e-01
## [1016] 1.000000e+00 2.103904e-16 9.483419e-01 2.001485e-02 2.681285e-11
## [1021] 6.217246e-02 6.881544e-01 1.000000e+00 9.999993e-01 1.000000e+00
## [1026] 9.997865e-01 1.000000e+00 1.299356e-11 6.786563e-08 6.757177e-01
## [1031]           NA 1.021648e-12 3.425692e-01 5.431101e-20           NA
## [1036] 1.304633e-09 9.999808e-01 9.999967e-01 1.674953e-13 1.215341e-14
## [1041] 2.961933e-21 2.490302e-09 9.979617e-01 9.988500e-01 7.451310e-09
## [1046]           NA 9.921649e-01 7.766658e-01 9.999591e-01 9.931522e-01
## [1051] 1.000000e+00 1.000000e+00 1.000000e+00 9.998786e-01 1.790505e-01
## [1056] 9.999998e-01 1.000000e+00 3.548504e-01 9.999884e-01 1.000000e+00
## [1061] 4.676205e-09 1.000000e+00 1.000000e+00 1.000000e+00 2.202863e-01
## [1066] 9.002924e-26           NA 1.000000e+00 6.073911e-19 9.954402e-01
## [1071] 9.999998e-01 9.817188e-01 7.952640e-01 1.000000e+00 3.099743e-24
## [1076] 2.028395e-08 9.999475e-01 3.058349e-02 1.000000e+00 5.925240e-12
## [1081] 1.000000e+00 1.000000e+00 2.471801e-11 1.000000e+00 1.000000e+00
## [1086] 1.000000e+00 9.998894e-01 3.441855e-17 1.000000e+00 5.092955e-03
## [1091] 9.981710e-01           NA 3.042461e-17 9.998837e-01 9.952106e-01
## [1096] 1.000000e+00 6.608072e-06 9.990452e-01 9.999866e-01 1.000000e+00
## [1101] 4.723014e-12 9.227700e-09 4.424978e-05 2.707613e-04 2.254040e-14
## [1106] 9.999164e-01 8.602852e-19 1.000000e+00 1.000000e+00 1.000000e+00
## [1111] 9.996936e-01           NA 9.990565e-01 1.000000e+00 4.818888e-13
## [1116] 2.363184e-21 2.777195e-15 1.000000e+00 1.000000e+00 2.442968e-19
## [1121] 9.999091e-01 9.999854e-01 4.768563e-01 1.000000e+00 9.999752e-01
## [1126] 1.165371e-17 9.999964e-01 4.985419e-16 7.252622e-12 9.999993e-01
## [1131] 9.802263e-01 1.000000e+00 1.000000e+00 8.911459e-02 1.356028e-11
## [1136] 9.939776e-01 8.920898e-01 9.999955e-01 9.999514e-01 1.259366e-14
## [1141] 9.021923e-01 9.972352e-01 9.670878e-01 1.000000e+00 1.000000e+00
## [1146]           NA 4.930404e-20 9.979674e-01 4.148612e-08 9.997322e-01
## [1151] 5.614592e-10 1.000000e+00 2.241389e-17 1.000000e+00 1.000000e+00
## [1156] 9.999959e-01 1.000000e+00           NA 9.965107e-01 4.877763e-18
## [1161] 9.661147e-01 1.000000e+00 1.000000e+00 4.329351e-01 7.699403e-19
## [1166] 1.000000e+00 2.080086e-11 8.069246e-11 1.000000e+00 1.162223e-04
## [1171] 1.000000e+00 1.000000e+00 1.000000e+00 9.999997e-01 5.012227e-15
## [1176] 5.661326e-02 1.555183e-13 3.592881e-16 3.516519e-01           NA
## [1181] 9.670120e-01 1.000000e+00 1.000000e+00 9.999153e-01 9.999994e-01
## [1186] 9.999764e-01 9.995202e-01 9.655273e-01 4.571760e-13           NA
## [1191] 1.000000e+00 5.556681e-10 4.480342e-16 1.000000e+00 1.000000e+00
## [1196] 5.523727e-04 6.246453e-19 9.795423e-01 1.000000e+00 9.970333e-01
## [1201]           NA 1.244121e-16 1.364381e-09 1.823221e-17 9.133839e-01
## [1206] 4.189312e-26 1.000000e+00 9.999719e-01 6.426500e-07 1.000000e+00
## [1211] 1.451078e-15 4.711863e-03 1.000000e+00 1.308943e-06 4.319972e-20
## [1216] 4.508766e-16 1.000000e+00 3.103248e-05 9.999985e-01 2.968358e-09
## [1221] 5.223546e-16 6.029151e-04 1.000000e+00 2.809995e-17 9.999814e-01
## [1226] 9.999996e-01 1.963664e-12 2.825371e-08 9.999999e-01 1.000000e+00
## [1231] 7.276623e-09 9.834271e-01 5.954677e-21 1.000000e+00 3.000072e-02
## [1236] 5.289417e-12 1.031292e-24 9.999986e-01 6.889152e-11 1.171281e-14
## [1241] 1.000000e+00 1.692908e-07 9.992236e-01 1.151565e-14 2.001186e-04
## [1246] 1.000000e+00 9.999735e-01 5.036751e-14 7.306185e-24 4.517518e-02
## [1251]           NA 3.872749e-16 1.315503e-15 1.241996e-03 3.086345e-04
## [1256] 9.939606e-01 1.000000e+00 3.854969e-21 1.000000e+00 9.979576e-01
## [1261] 9.999933e-01 1.000000e+00 1.572675e-19 1.000000e+00 7.494387e-01
## [1266] 9.446882e-07 1.000000e+00 1.277306e-11 2.842196e-11 3.845647e-19
## [1271] 9.997151e-01 9.992855e-01 1.831973e-18 6.226591e-01 1.000000e+00
## [1276] 3.238974e-01 1.000000e+00 3.800703e-22 2.386724e-03 1.000000e+00
## [1281] 1.000000e+00 1.587935e-17 3.891884e-07 1.000000e+00 9.999755e-01
## [1286] 9.999131e-01 1.000000e+00 2.249213e-16 7.479305e-01 9.986506e-01
## [1291] 1.000000e+00 9.926651e-01 3.191209e-09 9.999958e-01 1.000000e+00
## [1296] 5.725862e-07 9.997196e-01 1.000000e+00 1.000000e+00 9.999978e-01
## [1301] 9.245224e-02 1.000000e+00 1.157620e-09 1.618905e-21 2.333481e-22
## [1306] 5.536233e-15 1.000000e+00 1.000000e+00 1.680729e-15 1.000000e+00
## [1311] 1.330077e-02 1.000000e+00 1.000000e+00 9.999987e-01 6.355913e-04
## [1316] 2.143935e-03 2.817740e-14 9.999985e-01 9.623464e-14 9.999991e-01
## [1321] 8.232392e-08 1.000000e+00 9.999999e-01 8.498645e-19 9.341426e-21
## [1326] 1.000000e+00 2.404067e-20 9.992990e-01 6.717277e-20 2.560837e-01
## [1331] 2.019726e-18 4.489160e-05 4.520205e-15 5.048549e-03 1.000000e+00
## [1336] 9.999897e-01 1.871312e-06 5.854208e-19 1.415206e-10 9.559760e-01
## [1341] 1.000000e+00 9.997679e-01 4.267144e-18 9.999980e-01           NA
## [1346] 2.779540e-17 7.133935e-21 5.522515e-14 1.350608e-19 9.999976e-01
## [1351] 1.045057e-05 1.000000e+00 1.000000e+00 6.431451e-13 6.748216e-24
## [1356] 2.540680e-09 9.997580e-01 1.000000e+00           NA 1.166962e-17
## [1361] 7.007211e-09 2.808165e-12 9.185688e-01 4.425820e-18 2.416013e-17
## [1366] 3.028005e-10 1.000000e+00 1.000000e+00 1.045653e-05 2.459046e-04
## [1371] 1.898334e-07 3.101114e-07 1.843059e-18 5.409869e-10 1.000000e+00
## [1376] 9.168471e-01           NA 1.000000e+00 6.941115e-15 4.456437e-01
## [1381] 1.000000e+00 1.000000e+00 1.000000e+00 3.735825e-04 5.759823e-09
## [1386] 1.117151e-17 8.093202e-15 2.807417e-18 4.242832e-20 1.000000e+00
## [1391] 1.000000e+00 1.000000e+00 5.805870e-05 9.056923e-01 9.997757e-01
## [1396] 1.345313e-18 9.534968e-10 1.000000e+00           NA 2.310507e-01
## [1401] 1.000000e+00 1.985236e-17 9.994807e-01 1.000000e+00 1.000000e+00
## [1406] 9.999998e-01 7.833879e-19 9.999595e-01 9.931799e-01 9.997381e-01
## [1411] 6.786410e-22 4.657853e-01 9.995077e-01 3.699795e-07 1.181058e-17
## [1416] 2.847569e-04 1.003431e-13 1.445642e-08 1.000000e+00 5.123137e-10
## [1421] 3.771852e-04 1.000000e+00 1.000000e+00 1.000000e+00           NA
## [1426] 1.000000e+00 9.960369e-01 9.978161e-01 1.000000e+00 4.885912e-10
## [1431] 9.999658e-01 5.788648e-09 2.009775e-11 9.859200e-01 2.508557e-08
## [1436] 1.712255e-02 9.909302e-01 1.000000e+00 1.000000e+00 9.999770e-01
## [1441] 1.474555e-06 1.000000e+00 9.980573e-01 6.414261e-20 9.998287e-01
## [1446] 9.524424e-01 1.000000e+00 1.000000e+00 1.610079e-16 2.404218e-20
## [1451] 4.259593e-17 1.000000e+00 3.238851e-19 1.371294e-20 3.303293e-01
## [1456] 5.010851e-02 3.544995e-02 7.852028e-07 1.061335e-21 1.000000e+00
## [1461] 6.550675e-04 2.580609e-17           NA 1.000000e+00 9.950283e-01
## [1466] 9.999871e-01 2.773797e-13 2.809152e-18 3.690653e-02 1.000000e+00
## [1471] 9.999987e-01 4.206957e-17 4.346365e-19 1.000000e+00 9.999447e-01
## [1476] 5.176303e-12 1.000000e+00 1.000000e+00 9.999492e-01 9.999899e-01
## [1481] 9.999673e-01 9.999948e-01 1.000000e+00 4.711385e-11 9.820804e-01
## [1486] 9.999028e-01 9.999901e-01           NA 1.000000e+00 1.783796e-08
## [1491] 7.188745e-03 1.000000e+00 6.636707e-10 9.999993e-01 8.661777e-01
## [1496] 9.999092e-01 2.168586e-13 3.185760e-09 9.975177e-01 4.361918e-21
## [1501] 5.000793e-05 1.000000e+00 9.396631e-13 4.770491e-01           NA
## [1506] 4.854031e-01 1.167496e-07 9.998833e-01 1.000000e+00 1.000000e+00
## [1511] 9.966145e-01 9.899424e-01 9.998829e-01           NA 1.000000e+00
## [1516] 1.000000e+00 9.993433e-01 9.999412e-01           NA 2.218221e-25
## [1521] 1.000000e+00 3.855551e-11           NA 9.999880e-01           NA
## [1526] 1.765835e-01 9.999990e-01 1.000000e+00 7.596878e-03 1.000000e+00
## [1531] 9.847051e-01 1.000000e+00 7.015518e-06 6.237769e-17 1.000000e+00
## [1536] 9.989228e-01 4.557428e-04 9.999807e-01 1.000000e+00 9.999999e-01
## [1541] 5.488882e-18 1.136772e-02 6.137457e-02 1.762514e-04 9.999855e-01
## [1546] 9.999996e-01 2.441639e-15 9.999999e-01 3.399689e-20           NA
## [1551] 1.000000e+00 4.662930e-04 1.000000e+00 1.000000e+00 9.999863e-01
## [1556] 8.556184e-14 1.532708e-12 1.324501e-23 2.458028e-04 9.439324e-01
## [1561] 1.000000e+00 1.000000e+00 1.000000e+00 9.911859e-01 9.999979e-01
## [1566] 1.106031e-16 1.260305e-10 9.999155e-01 9.999903e-01 1.000000e+00
## [1571] 9.999903e-01 1.000000e+00 2.950800e-14 1.000000e+00 9.942329e-01
## [1576] 1.000000e+00 1.696246e-03 5.401610e-01 1.619864e-12 9.999982e-01
## [1581] 1.000000e+00 1.050002e-06 3.652540e-01 2.558637e-18 5.442017e-18
## [1586] 1.591156e-10 2.390662e-23 9.501074e-09 5.454283e-12 1.000000e+00
## [1591] 1.000000e+00           NA 1.000000e+00 9.999972e-01 9.999954e-01
## [1596] 9.999804e-01 9.973166e-01 2.615421e-11 3.366511e-19 9.648945e-01
## [1601] 1.000000e+00 9.999849e-01 1.000000e+00 1.120072e-19 1.000000e+00
## [1606] 9.999989e-01 9.648950e-01 3.071713e-15 6.994893e-01 9.998738e-01
## [1611] 1.000000e+00 9.982714e-01 4.906874e-12 9.960172e-01 2.797203e-12
## [1616] 1.000000e+00 1.162500e-20 1.098531e-06 9.153923e-04 9.999976e-01
## [1621] 9.999999e-01           NA 9.995538e-01 1.000000e+00           NA
## [1626] 9.999991e-01 4.754428e-25 3.084480e-10 1.220238e-11 1.000000e+00
## [1631] 3.618018e-05 1.000000e+00 1.000000e+00 1.000000e+00 9.832858e-04
## [1636] 1.000000e+00 1.000000e+00 1.994722e-05 2.801275e-17 9.635314e-01
## [1641] 2.049975e-17 6.329549e-10 9.980222e-01 1.728319e-14 9.999482e-01
## [1646] 1.240010e-08 9.999863e-01 7.550291e-01           NA 2.766915e-10
## [1651] 4.564187e-15           NA 1.981331e-20 5.938487e-15 1.000000e+00
## [1656] 7.832949e-12 2.948249e-17 1.243778e-15 9.997486e-01 1.579138e-24
## [1661] 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## [1666] 7.507040e-14 8.081452e-01 9.999998e-01 1.000000e+00 9.999676e-01
## [1671] 1.000000e+00 2.335614e-08 1.000000e+00 9.977180e-01 2.603252e-17
## [1676] 9.995223e-01 7.810823e-15 1.000000e+00 5.757485e-09 1.000000e+00
## [1681] 5.877206e-16 1.000000e+00 1.000000e+00 1.000000e+00 3.807763e-20
## [1686] 1.000000e+00 3.495961e-01           NA 1.256335e-20 8.599116e-15
## [1691] 3.810576e-16 9.999914e-01 1.000000e+00           NA 9.954277e-01
## [1696] 1.000000e+00 1.732515e-21 1.000000e+00 1.000000e+00 7.636102e-25
## [1701] 1.000000e+00 9.997089e-01 3.689321e-04 1.000000e+00 2.604604e-16
## [1706] 3.686330e-15 1.000000e+00 8.970585e-01 1.042647e-12 5.100409e-19
## [1711] 9.993807e-01 4.801911e-20 9.999999e-01 1.063157e-10 9.994956e-01
## [1716] 1.993970e-16 9.524541e-19 3.242297e-22 9.969374e-01 7.453020e-01
## [1721] 9.999992e-01 3.770647e-03 1.000000e+00 6.076001e-08 1.000000e+00
## [1726] 5.285071e-23 4.640100e-21 1.000673e-01 1.000000e+00 1.000000e+00
## [1731] 1.000000e+00 4.305449e-16 3.686926e-11 8.614835e-19 9.353600e-25
## [1736] 4.720133e-11 9.999996e-01 1.000000e+00 9.992811e-01 3.641705e-11
## [1741] 1.000000e+00 5.030373e-02 1.474255e-20 3.390660e-08 8.067138e-01
## [1746] 5.482023e-12 1.000000e+00 1.304255e-02           NA 9.991521e-01
## [1751] 9.997294e-01 9.999952e-01 5.683008e-05 5.041837e-17 9.519724e-20
## [1756]           NA 4.610006e-23 9.999997e-01 5.824138e-11 1.000000e+00
## [1761] 9.999919e-01 3.068236e-18 3.627128e-07 2.273130e-19 9.998608e-01
## [1766] 1.000000e+00           NA 2.866193e-18           NA 2.022590e-20
## [1771] 7.500281e-16 2.925723e-08 1.388515e-01 1.000000e+00 7.638453e-01
## [1776] 1.183382e-04 1.000000e+00 1.000000e+00 4.066495e-06 1.000000e+00
## [1781] 4.975315e-04 5.138701e-15 1.000000e+00 4.095130e-05 9.782900e-01
## [1786] 1.000000e+00 3.281824e-07 9.997098e-01 9.999963e-01 3.444783e-09
## [1791] 1.000000e+00 5.584148e-20 1.000000e+00 9.999987e-01 9.833045e-01
## [1796] 1.842127e-11 1.267011e-07           NA 9.984848e-01 2.167942e-02
## [1801] 1.000000e+00 9.999909e-01 1.057802e-06 7.761040e-01 9.999998e-01
## [1806] 2.064967e-11 1.000000e+00           NA 3.159053e-15 6.012364e-22
## [1811] 2.456595e-06 3.312061e-19 1.000000e+00 1.000000e+00 9.999989e-01
## [1816] 5.164403e-16 9.999973e-01 9.998467e-01 5.402356e-01 1.000000e+00
## [1821] 2.754723e-03 9.999328e-01 8.471193e-14 9.913068e-01 9.999995e-01
## [1826]           NA 1.000000e+00 9.999742e-01 8.374018e-07 1.000000e+00
## [1831] 2.847335e-04 9.948628e-01 1.000000e+00 3.212858e-09 4.654129e-08
## [1836] 7.309421e-11 3.568705e-09 1.000000e+00 1.379941e-15 9.836625e-01
## [1841] 3.201926e-05 9.999570e-01           NA 9.999998e-01 4.484342e-18
## [1846] 1.232034e-18 2.417478e-09 1.212987e-17 9.993310e-01 9.999615e-01
## [1851] 1.001033e-04 2.189489e-04 1.147786e-12 6.219471e-01 1.197074e-22
## [1856] 1.467355e-01 8.284841e-01 1.000000e+00 3.390894e-17 4.426540e-06
## [1861] 1.000000e+00 9.999740e-01 1.000000e+00 1.864440e-02 7.336774e-01
## [1866] 9.997399e-01 3.791306e-05 2.850529e-01 9.992548e-01           NA
## [1871]           NA 1.000000e+00 2.447754e-20 1.719540e-10 9.964931e-01
## [1876] 9.992076e-01 9.944044e-01 1.000000e+00 2.297939e-02 9.701568e-01
## [1881] 4.458753e-12 9.999848e-01 1.223601e-15 9.991543e-01 4.945316e-09
## [1886]           NA 2.091477e-06 1.000000e+00 9.889874e-01 9.999999e-01
## [1891] 3.895588e-06 1.004722e-01 1.700279e-05 9.982812e-01 9.997028e-01
## [1896] 9.961612e-01 1.000000e+00 1.041824e-03 3.498314e-11 1.000000e+00
## [1901] 9.486769e-12 1.524189e-16 3.065436e-08 1.000000e+00 1.000000e+00
## [1906] 1.054156e-19 9.215594e-01 9.964497e-01 8.030895e-03 9.999985e-01
## [1911] 1.000000e+00 4.828004e-23 9.386876e-17 6.788389e-08 9.801247e-08
## [1916] 7.659383e-03 9.999610e-01 1.176995e-01 2.683081e-02 1.000000e+00
## [1921] 9.998154e-01           NA 1.109582e-10 8.043427e-01 5.933439e-07
## [1926] 5.756320e-15 1.000000e+00 2.746531e-06 4.329773e-12 2.152519e-01
## [1931] 9.999964e-01 9.967299e-01 1.719664e-03 1.000000e+00 1.800453e-14
## [1936] 1.000000e+00 9.999985e-01 1.740434e-18 1.320803e-12 1.000000e+00
## [1941] 1.288316e-18 4.776766e-19 9.999995e-01 1.380480e-22 7.033695e-01
## [1946] 1.000000e+00 1.000000e+00 7.869808e-18 4.908215e-15 9.995969e-01
## [1951] 1.000000e+00 3.015658e-03 1.000000e+00 9.992042e-01 5.315615e-18
## [1956] 1.000000e+00 1.222353e-19 1.499681e-15 1.000000e+00 9.999992e-01
## [1961] 9.999485e-01 1.000000e+00 2.884319e-09 3.095813e-05 9.996121e-01
## [1966] 5.026826e-08 9.993382e-01 1.902457e-11 1.193061e-03 1.072451e-11
## [1971] 2.954703e-11 8.629093e-01 9.644940e-01 9.999954e-01 2.036249e-17
## [1976] 9.175817e-06 1.843370e-24 1.000000e+00 1.000000e+00 1.793904e-16
## [1981] 2.613376e-08 1.000000e+00 4.117395e-01           NA 1.000000e+00
## [1986] 8.376566e-11 1.766734e-04 3.087063e-02 5.024277e-01 1.000000e+00
## [1991] 1.632363e-19 1.000000e+00 1.000000e+00 9.995091e-01           NA
## [1996] 1.471904e-08 9.999448e-01 1.000000e+00 8.933317e-19 4.256961e-01
## [2001] 1.000000e+00 5.976574e-16 1.000000e+00           NA 1.000000e+00
## [2006] 1.000000e+00 9.999895e-01 7.668453e-20 8.724745e-03 1.000000e+00
## [2011] 1.000000e+00 9.998089e-01 1.000000e+00 1.936573e-20 9.999976e-01
## [2016] 1.000000e+00 9.868468e-17 9.999996e-01 1.000000e+00 8.891802e-02
## [2021] 4.183076e-04 9.998245e-01 9.999972e-01 2.790500e-03 2.639794e-05
## [2026] 6.026683e-19 9.758677e-01           NA 9.999997e-01 1.000000e+00
## [2031] 1.000000e+00 9.999992e-01 7.481084e-08 9.944458e-01 1.000000e+00
## [2036] 1.000000e+00 1.000000e+00 4.363948e-07 1.000000e+00 9.956262e-01
## [2041] 5.499191e-11 4.947837e-16 5.969351e-04 1.489023e-02 1.007883e-06
## [2046] 1.738778e-06 1.818919e-13 1.000000e+00 5.540074e-03 3.149157e-08
## [2051] 6.167468e-06 7.873677e-01 9.927192e-01 9.994217e-01 1.313571e-02
## [2056] 7.621567e-21 1.000000e+00           NA 1.484438e-06 1.000000e+00
## [2061] 7.272900e-12           NA 9.991653e-01 9.797690e-07 9.995833e-01
## [2066]           NA 7.710150e-08 9.824848e-01 9.998230e-01 9.617866e-01
## [2071] 8.047236e-18 1.091597e-06 9.999989e-01 5.442220e-06 1.590952e-01
## [2076] 7.934462e-16 9.999790e-01 9.909290e-01 1.000000e+00 1.000000e+00
## [2081] 3.846653e-25 9.997986e-01 9.999967e-01           NA 4.846822e-07
## [2086] 1.141826e-18 4.014492e-19 1.000000e+00 2.529155e-12 1.000000e+00
## [2091] 1.000000e+00 6.340297e-17 1.000000e+00 1.122172e-20 1.000000e+00
## [2096] 1.000000e+00 3.812552e-06 1.000000e+00 1.000000e+00 1.000000e+00
## [2101] 1.000000e+00           NA           NA 2.926968e-11 9.990667e-01
## [2106] 2.731640e-22 9.999924e-01 9.998809e-01 7.967273e-04           NA
## [2111] 1.000000e+00 2.907499e-05 1.000000e+00 1.639738e-19 7.686795e-20
## [2116] 9.219948e-15 9.999983e-01 1.000000e+00 1.000000e+00 1.579293e-11
## [2121] 9.494199e-01 2.214006e-02 1.000000e+00 9.999921e-01 1.000000e+00
## [2126] 3.056612e-23 2.550070e-08 2.867807e-08 6.676972e-17 5.200752e-04
## [2131] 9.997936e-01 2.991824e-12 9.966760e-01 1.000000e+00 1.271203e-15
## [2136] 1.431374e-22 1.000000e+00 9.999847e-01 2.708294e-19 1.000000e+00
## [2141] 3.058037e-07
#create a new variable that specifies predicted class
data_testing$TARGET_FLAG_pred <-c()

#calculate probability
data_logistic_regression = within(data_logistic_regression, {
    TARGET_FLAG_pred = ifelse(data_logistic_regression$probability < 0.5, 0, 1)
 })

data_testing = within(data_testing, {
    TARGET_FLAG_pred = ifelse(data_testing$probability < 0.5, 0, 1)
 })


head(data_testing)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1     3          NA         NA        0  48        0  11   1154      No
## 2     9          NA         NA        1  40        1  11   1119     Yes
## 3    10          NA         NA        0  44        2  12    974     Yes
## 4    18          NA         NA        0  35        2  NA    513     Yes
## 5    21          NA         NA        0  59        0  12   1686      No
## 6    30          NA         NA        0  46        0  14      1      No
##   HOME_VAL MSTATUS SEX   EDUCATION          JOB TRAVTIME    CAR_USE
## 1        2      No   M   Bachelors      Manager       26    Private
## 2        2      No   M High School      Manager       21    Private
## 3        2      No   F High School  Blue Collar       30 Commercial
## 4        2      No   M High School     Clerical       74    Private
## 5        2      No   M High School      Manager       45    Private
## 6      636     Yes   M   Bachelors Professional        7 Commercial
##   BLUEBOOK TIF    CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1      703   1         Van     yes        1        0      No       2
## 2      540   6     Minivan      no      272        1      No       2
## 3     1189  10         SUV      no        1        0      No       0
## 4     1373   6      Pickup      no        1        0     Yes       0
## 5      345   1     Minivan     yes      494        2      No       4
## 6      864   1 Panel Truck      no      137        1      No       2
##   CAR_AGE          URBANICITY URBANICITY_HighlyUrban JOB_BlueCollar
## 1      10 Highly Urban/ Urban                      1              0
## 2       1 Highly Urban/ Urban                      1              0
## 3      10 Highly Rural/ Rural                      0              1
## 4       4 Highly Rural/ Rural                      0              0
## 5       1 Highly Urban/ Urban                      1              0
## 6      12 Highly Urban/ Urban                      1              0
##   JOB_Clerical JOB_HomeMaker JOB_Manager MSTATUS_Yes CAR_TYPE_Pickup
## 1            0             0           1           0               0
## 2            0             0           1           0               0
## 3            0             0           0           0               0
## 4            1             0           0           0               1
## 5            0             0           1           0               0
## 6            0             0           0           1               0
##   CAR_TYPE_Sports_Car CAR_TYPE_SUV CAR_TYPE_Van REVOKED_Yes
## 1                   0            0            1           0
## 2                   0            0            0           0
## 3                   0            1            0           0
## 4                   0            0            0           1
## 5                   0            0            0           0
## 6                   0            0            0           0
##   CAR_USE_Private EDUCATION_HighSchool PARENT1_Yes SEX_M  probability
## 1               1                    0           0     1 9.991562e-01
## 2               1                    1           1     1 1.000000e+00
## 3               0                    1           1     0 9.991668e-01
## 4               1                    1           1     1           NA
## 5               1                    1           0     1 1.000000e+00
## 6               0                    0           0     1 1.337410e-08
##   TARGET_FLAG_pred
## 1                1
## 2                1
## 3                1
## 4               NA
## 5                1
## 6                0

Calculate Classification Metrics.

#create confusion matrix
confusion_matrix <- table("Predicted" = data_logistic_regression$TARGET_FLAG_pred, "Actual" = data_logistic_regression$TARGET_FLAG)
confusion_matrix
##          Actual
## Predicted    0    1
##         0 4983 1215
##         1  337  678
#calculate true positive
TP <- confusion_matrix[4]

#calculate true negative
TN <- confusion_matrix[1]

#calculate false negative
FN <- confusion_matrix[2]

#calculate false positive
FP <- confusion_matrix[3]

#calculate accuracy
accuracy <- (confusion_matrix[1,1] + confusion_matrix[2,2])/nrow(data_logistic_regression)
accuracy
## [1] 0.7848329
#calculate accuracy classification error rate
classification_error_rate = (FP + FN)/(TP + FP + TN + FN)
classification_error_rate
## [1] 0.2151671
#calculate precision
precision = TP/(TP + FP)
precision
## [1] 0.3581616
#calculate sensitivity
sensitivity = TP/(TP + FN)
sensitivity
## [1] 0.6679803
#calculate specificity
specificity <- TN/(TN + FP)
specificity
## [1] 0.803969
#calculate F1 score
F1_score <- (2*precision*sensitivity)/(precision + sensitivity)
F1_score
## [1] 0.4662999
roc.val <- roc(TARGET_FLAG ~ probability, data_logistic_regression)
plot(roc.val, main="ROC plot")

roc.val$auc
## Area under the curve: 0.8097

LINEAR REGRESSION.

#create subset that includes only records with TARGET_AMT=1 (people that got into car crash)
data_crash <- subset(data, data$TARGET_FLAG==1)
head(data_crash)
##    INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 5      7           1   2946.000        0  34        1  12    662     Yes
## 6     12           1   2501.000        0  34        0  10   4278      No
## 8     14           1   6077.000        0  53        0  14   5021      No
## 11    17           1   1267.000        0  53        0  11    767      No
## 12    19           1   2920.167        0  45        0   0      2      No
## 17    25           1   6857.000        0  28        1  13   3076      No
##    HOME_VAL MSTATUS SEX   EDUCATION         JOB TRAVTIME    CAR_USE
## 5         2      No   F   Bachelors Blue Collar       46 Commercial
## 6         2      No   F   Bachelors    Clerical       34    Private
## 8         2      No   F     Masters      Lawyer       15    Private
## 11        2      No   M         PhD                   64 Commercial
## 12       72     Yes   F High School  Home Maker       48    Private
## 17     1207     Yes   F High School Blue Collar       29 Commercial
##    BLUEBOOK TIF    CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 5       739   1  Sports Car      no        1        0      No       0
## 6       132   1         SUV      no        1        0      No       0
## 8       822   1  Sports Car      no        1        0      No       0
## 11     1677   6 Panel Truck     yes        1        0      No       3
## 12     2295   1         SUV      no        1        0      No       3
## 17     2548   6         SUV      no     2402        2      No       0
##    CAR_AGE          URBANICITY
## 5        7 Highly Urban/ Urban
## 6        1 Highly Urban/ Urban
## 8       11 Highly Urban/ Urban
## 11      10 Highly Urban/ Urban
## 12       5 Highly Urban/ Urban
## 17       1 Highly Urban/ Urban
dim(data_crash)
## [1] 1893   26
  1. DATA PREPARATION

The following assumptions must be verified for linear regression.

  1. Linearity assumption.
# histograms and density lines
par(mfrow=c(2,2))
hist(data_crash$KIDSDRIV,breaks=seq(0, 10, 1),probability=TRUE, col="gray", border="white")
d <- density(data_crash$KIDSDRIV)
    lines(d, col="red")
    
hist(data_crash$AGE,breaks=seq(0, 100, 1),probability=TRUE, col="gray", border="white")
d <- density(data_crash$AGE)
    lines(d, col="red")
    
hist(data_crash$HOMEKIDS,breaks=seq(0, 10, 1),probability=TRUE, col="gray", border="white")
d <- density(data_crash$HOMEKIDS)
    lines(d, col="red")
    
hist(data_crash$YOJ,breaks=seq(0, 25, 1),probability=TRUE, col="gray", border="white")
d <- density(data_crash$YOJ)
    lines(d, col="red")

par(mfrow=c(2,2))
hist(data_crash$INCOME,breaks=seq(0, 10000, 500),probability=TRUE, col="gray", border="white")
d <- density(data_crash$INCOME)
    lines(d, col="red")
    
hist(data_crash$HOME_VAL,probability=TRUE, col="gray", border="white")
d <- density(data_crash$HOME_VAL)
    lines(d, col="red")
    
hist(data_crash$TRAVTIME,probability=TRUE, col="gray", border="white")
d <- density(data_crash$TRAVTIME)
    lines(d, col="red")
    
hist(data_crash$BLUEBOOK,breaks=seq(0, 5000, 100),probability=TRUE, col="gray", border="white")
d <- density(data_crash$BLUEBOOK)
    lines(d, col="red")

par(mfrow=c(2,2))
hist(data_crash$TIF,probability=TRUE, col="gray", border="white")
d <- density(data_crash$TIF)
    lines(d, col="red")
    
hist(data_crash$OLDCLAIM,probability=TRUE, col="gray", border="white")
d <- density(data_crash$OLDCLAIM)
    lines(d, col="red")
    
hist(data_crash$CLM_FREQ,breaks=seq(0, 6, 1),probability=TRUE, col="gray", border="white")
d <- density(data_crash$CLM_FREQ)
    lines(d, col="red")
    
hist(data_crash$MVR_PTS,probability=TRUE, col="gray", border="white")
d <- density(data_crash$MVR_PTS)
    lines(d, col="red")

hist(data_crash$CAR_AGE,probability=TRUE, col="gray", border="white")
d <- density(data_crash$CAR_AGE)
    lines(d, col="red")

  1. Normal distribution of dependent variables.
#verify lineriarity
par(mfrow=c(2,2))
#colnames <- dimnames(data)[[2]]
for (i in 4:ncol(data_crash)) {
  if (is.numeric(data_crash[,i]) == "TRUE"){
    plot(data_crash$TARGET_AMT ~ data_crash[,i],main=names(data_crash)[i], xlab=names(data_crash)[i])
    reg_line <- lm(data_crash$TARGET_AMT ~ data_crash[,i])
    abline(reg_line,col="red")
}
}    

#replace variables that have non-linear relationships with logit function or which distribution is not normal(or near normal) by variables logs
data_linear_regression <- data_crash %>% select(-INDEX,-TARGET_FLAG) %>% mutate(HOME_VAL = ifelse(HOME_VAL > 0,log(HOME_VAL),""), BLUEBOOK = ifelse(BLUEBOOK > 0,log(BLUEBOOK),""),INCOME = ifelse(INCOME > 0,log(INCOME),""),TIF = ifelse(TIF > 0,log(TIF),""), OLDCLAIM = ifelse(OLDCLAIM > 0,log(OLDCLAIM),""))

count_nas(data_linear_regression)
##    variable_name_column number_missing_column percentage
## 1              KIDSDRIV                     0          0
## 2                   AGE                     0          0
## 3              HOMEKIDS                     0          0
## 4                   YOJ                     0          0
## 5                INCOME                     0          0
## 6               PARENT1                     0          0
## 7              HOME_VAL                     0          0
## 8               MSTATUS                     0          0
## 9                   SEX                     0          0
## 10            EDUCATION                     0          0
## 11                  JOB                     0          0
## 12             TRAVTIME                     0          0
## 13              CAR_USE                     0          0
## 14             BLUEBOOK                     0          0
## 15                  TIF                     0          0
## 16             CAR_TYPE                     0          0
## 17              RED_CAR                     0          0
## 18             OLDCLAIM                     0          0
## 19             CLM_FREQ                     0          0
## 20              REVOKED                     0          0
## 21              MVR_PTS                     0          0
## 22              CAR_AGE                     0          0
## 23           URBANICITY                     0          0
  1. Multicollinearity assumption.
#correlation between variables
corrplot(cor(data_linear_regression %>% select_if(is.numeric)), type = "upper", method = "number", tl.cex = 0.5, tl.col="black",number.cex = .5)

#build lm model using stepwise approach
linear_model.null = lm(TARGET_AMT ~ 1, data = data_linear_regression)

linear_model.full = lm(TARGET_AMT ~ ., data = data_linear_regression)
     
step(linear_model.null,
     scope = list(upper=linear_model.full),
             direction = "both",
             data = data_linear_regression)
## Start:  AIC=33841.23
## TARGET_AMT ~ 1
## 
##              Df Sum of Sq        RSS   AIC
## + CAR_TYPE    5 881040861 1.0892e+11 33836
## + MSTATUS     1 277922853 1.0952e+11 33838
## + CAR_USE     1 220700204 1.0958e+11 33839
## + SEX         1 192617922 1.0960e+11 33840
## + INCOME      1 128447239 1.0967e+11 33841
## + REVOKED     1 118967437 1.0968e+11 33841
## <none>                    1.0980e+11 33841
## + PARENT1     1 109443748 1.0969e+11 33841
## + MVR_PTS     1 105520144 1.0969e+11 33841
## + YOJ         1 103068817 1.0969e+11 33841
## + BLUEBOOK    1  98244081 1.0970e+11 33842
## + AGE         1  40333734 1.0976e+11 33843
## + CAR_AGE     1  26556088 1.0977e+11 33843
## + HOMEKIDS    1  21854988 1.0978e+11 33843
## + TRAVTIME    1  15490316 1.0978e+11 33843
## + CLM_FREQ    1  15181150 1.0978e+11 33843
## + RED_CAR     1  12635915 1.0978e+11 33843
## + KIDSDRIV    1   7522012 1.0979e+11 33843
## + URBANICITY  1   4219827 1.0979e+11 33843
## + OLDCLAIM    1   3306186 1.0979e+11 33843
## + TIF         1    202114 1.0980e+11 33843
## + HOME_VAL    1      6774 1.0980e+11 33843
## + EDUCATION   3 172507522 1.0962e+11 33844
## + JOB         8 703216490 1.0909e+11 33845
## 
## Step:  AIC=33835.98
## TARGET_AMT ~ CAR_TYPE
## 
##              Df Sum of Sq        RSS   AIC
## + MSTATUS     1 251703068 1.0866e+11 33834
## + PARENT1     1 142021530 1.0877e+11 33836
## + MVR_PTS     1 127222706 1.0879e+11 33836
## <none>                    1.0892e+11 33836
## + REVOKED     1  93890746 1.0882e+11 33836
## + INCOME      1  83566596 1.0883e+11 33837
## + YOJ         1  73539414 1.0884e+11 33837
## + CAR_AGE     1  66685043 1.0885e+11 33837
## + HOMEKIDS    1  45918691 1.0887e+11 33837
## + RED_CAR     1  42747717 1.0887e+11 33837
## + CAR_USE     1  32396980 1.0888e+11 33837
## + BLUEBOOK    1  31661512 1.0888e+11 33837
## + AGE         1  22473139 1.0889e+11 33838
## + TRAVTIME    1  14533771 1.0890e+11 33838
## + CLM_FREQ    1  13619726 1.0890e+11 33838
## + KIDSDRIV    1   7645685 1.0891e+11 33838
## + OLDCLAIM    1   6504566 1.0891e+11 33838
## + SEX         1   5274732 1.0891e+11 33838
## + HOME_VAL    1   1938459 1.0891e+11 33838
## + URBANICITY  1    138093 1.0892e+11 33838
## + TIF         1     12406 1.0892e+11 33838
## + EDUCATION   3  88562536 1.0883e+11 33840
## - CAR_TYPE    5 881040861 1.0980e+11 33841
## + JOB         8 416607293 1.0850e+11 33845
## 
## Step:  AIC=33833.6
## TARGET_AMT ~ CAR_TYPE + MSTATUS
## 
##              Df Sum of Sq        RSS   AIC
## + MVR_PTS     1 118525976 1.0855e+11 33834
## + YOJ         1 117276981 1.0855e+11 33834
## <none>                    1.0866e+11 33834
## + REVOKED     1  99751169 1.0856e+11 33834
## + CAR_AGE     1  83560584 1.0858e+11 33834
## + INCOME      1  79478095 1.0858e+11 33834
## + HOME_VAL    1  67680007 1.0860e+11 33834
## + HOMEKIDS    1  64077606 1.0860e+11 33834
## + AGE         1  41863959 1.0862e+11 33835
## + CAR_USE     1  38465090 1.0863e+11 33835
## + RED_CAR     1  36858180 1.0863e+11 33835
## + BLUEBOOK    1  28181920 1.0864e+11 33835
## + PARENT1     1  18336946 1.0865e+11 33835
## + TRAVTIME    1  17957391 1.0865e+11 33835
## + CLM_FREQ    1  14920962 1.0865e+11 33835
## + SEX         1  12175700 1.0865e+11 33835
## + KIDSDRIV    1  12032899 1.0865e+11 33835
## + OLDCLAIM    1   7384154 1.0866e+11 33835
## + URBANICITY  1   1007899 1.0866e+11 33836
## + TIF         1    499702 1.0866e+11 33836
## - MSTATUS     1 251703068 1.0892e+11 33836
## + EDUCATION   3  72614014 1.0859e+11 33838
## - CAR_TYPE    5 854821076 1.0952e+11 33838
## + JOB         8 411288798 1.0825e+11 33842
## 
## Step:  AIC=33833.53
## TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS
## 
##              Df Sum of Sq        RSS   AIC
## + YOJ         1 125081593 1.0842e+11 33833
## <none>                    1.0855e+11 33834
## - MVR_PTS     1 118525976 1.0866e+11 33834
## + REVOKED     1  94467874 1.0845e+11 33834
## + HOME_VAL    1  81523631 1.0846e+11 33834
## + INCOME      1  80168955 1.0847e+11 33834
## + CAR_AGE     1  73876351 1.0847e+11 33834
## + HOMEKIDS    1  57153156 1.0849e+11 33835
## + CLM_FREQ    1  56020772 1.0849e+11 33835
## + AGE         1  45800105 1.0850e+11 33835
## + RED_CAR     1  40678583 1.0851e+11 33835
## + CAR_USE     1  31202382 1.0851e+11 33835
## + BLUEBOOK    1  29940033 1.0852e+11 33835
## + TRAVTIME    1  17361415 1.0853e+11 33835
## + PARENT1     1  14181990 1.0853e+11 33835
## + SEX         1  12232883 1.0853e+11 33835
## + KIDSDRIV    1  10929573 1.0853e+11 33835
## + OLDCLAIM    1   1842795 1.0854e+11 33836
## + URBANICITY  1    257193 1.0855e+11 33836
## + TIF         1    169741 1.0855e+11 33836
## - MSTATUS     1 243006338 1.0879e+11 33836
## + EDUCATION   3  74606002 1.0847e+11 33838
## - CAR_TYPE    5 874882524 1.0942e+11 33839
## + JOB         8 394337535 1.0815e+11 33843
## 
## Step:  AIC=33833.35
## TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS + YOJ
## 
##              Df Sum of Sq        RSS   AIC
## <none>                    1.0842e+11 33833
## + REVOKED     1 105936347 1.0831e+11 33834
## - YOJ         1 125081593 1.0855e+11 33834
## - MVR_PTS     1 126330588 1.0855e+11 33834
## + CAR_AGE     1  84989277 1.0834e+11 33834
## + CLM_FREQ    1  53926426 1.0837e+11 33834
## + HOME_VAL    1  47665441 1.0837e+11 33835
## + HOMEKIDS    1  45961587 1.0837e+11 33835
## + RED_CAR     1  36371947 1.0838e+11 33835
## + CAR_USE     1  34715713 1.0839e+11 33835
## + AGE         1  29453290 1.0839e+11 33835
## + BLUEBOOK    1  27862260 1.0839e+11 33835
## + TRAVTIME    1  14913152 1.0841e+11 33835
## + PARENT1     1  12241647 1.0841e+11 33835
## + SEX         1  11976117 1.0841e+11 33835
## + KIDSDRIV    1   6088028 1.0841e+11 33835
## + INCOME      1   5266242 1.0842e+11 33835
## + OLDCLAIM    1   2478421 1.0842e+11 33835
## + TIF         1    440341 1.0842e+11 33835
## + URBANICITY  1       495 1.0842e+11 33835
## - MSTATUS     1 287498666 1.0871e+11 33836
## - CAR_TYPE    5 835158704 1.0926e+11 33838
## + EDUCATION   3  73898025 1.0835e+11 33838
## + JOB         8 325899692 1.0809e+11 33844
## 
## Call:
## lm(formula = TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS + YOJ, 
##     data = data_linear_regression)
## 
## Coefficients:
##         (Intercept)  CAR_TYPEPanel Truck       CAR_TYPEPickup  
##             5059.38              2164.76              -168.90  
##  CAR_TYPESports Car          CAR_TYPESUV          CAR_TYPEVan  
##               13.55              -158.22               876.20  
##          MSTATUSYes              MVR_PTS                  YOJ  
##             -788.88               100.24                58.25
  1. BUILD MODELS
#optimal model
final_linear_model <- lm(formula = TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS + YOJ, data = data_linear_regression)
summary(final_linear_model)
## 
## Call:
## lm(formula = TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS + YOJ, 
##     data = data_linear_regression)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7251  -3085  -1595    276  79768 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5059.38     624.28   8.104 9.44e-16 ***
## CAR_TYPEPanel Truck  2164.76     742.96   2.914  0.00361 ** 
## CAR_TYPEPickup       -168.90     579.26  -0.292  0.77064    
## CAR_TYPESports Car     13.55     635.51   0.021  0.98299    
## CAR_TYPESUV          -158.22     535.58  -0.295  0.76771    
## CAR_TYPEVan           876.20     721.42   1.215  0.22469    
## MSTATUSYes           -788.88     352.94  -2.235  0.02553 *  
## MVR_PTS               100.24      67.65   1.482  0.13861    
## YOJ                    58.25      39.51   1.474  0.14057    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7586 on 1884 degrees of freedom
## Multiple R-squared:  0.01254,    Adjusted R-squared:  0.008342 
## F-statistic:  2.99 on 8 and 1884 DF,  p-value: 0.002472
#create new dummy variables
data_linear_regression$CAR_TYPE_PanelTruck <- ifelse(data_linear_regression$CAR_TYPE== "Panel Truck",1,0)
data_testing$CAR_TYPE_PanelTruck <- ifelse(data_testing$CAR_TYPE== "Panel Truck",1,0)
data_linear_regression$MSTATUS_Yes <- ifelse(data_linear_regression$MSTATUS== "Yes",1,0)
data_testing$MSTATUS_Yes <- ifelse(data_testing$MSTATUS== "Yes",1,0)
data_linear_regression$TARGET_AMT_pred <- c()
data_testing$TARGET_AMT_pred <- c()
data_testing$TARGET_AMT_pred <- ""

#optimal model equasion
data_linear_regression$TARGET_AMT_pred <- 5059.38 + 2164.76*data_linear_regression$CAR_TYPE_PanelTruck - 788.88*data_linear_regression$MSTATUS_Yes

head(data_testing)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1     3          NA         NA        0  48        0  11   1154      No
## 2     9          NA         NA        1  40        1  11   1119     Yes
## 3    10          NA         NA        0  44        2  12    974     Yes
## 4    18          NA         NA        0  35        2  NA    513     Yes
## 5    21          NA         NA        0  59        0  12   1686      No
## 6    30          NA         NA        0  46        0  14      1      No
##   HOME_VAL MSTATUS SEX   EDUCATION          JOB TRAVTIME    CAR_USE
## 1        2      No   M   Bachelors      Manager       26    Private
## 2        2      No   M High School      Manager       21    Private
## 3        2      No   F High School  Blue Collar       30 Commercial
## 4        2      No   M High School     Clerical       74    Private
## 5        2      No   M High School      Manager       45    Private
## 6      636     Yes   M   Bachelors Professional        7 Commercial
##   BLUEBOOK TIF    CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1      703   1         Van     yes        1        0      No       2
## 2      540   6     Minivan      no      272        1      No       2
## 3     1189  10         SUV      no        1        0      No       0
## 4     1373   6      Pickup      no        1        0     Yes       0
## 5      345   1     Minivan     yes      494        2      No       4
## 6      864   1 Panel Truck      no      137        1      No       2
##   CAR_AGE          URBANICITY URBANICITY_HighlyUrban JOB_BlueCollar
## 1      10 Highly Urban/ Urban                      1              0
## 2       1 Highly Urban/ Urban                      1              0
## 3      10 Highly Rural/ Rural                      0              1
## 4       4 Highly Rural/ Rural                      0              0
## 5       1 Highly Urban/ Urban                      1              0
## 6      12 Highly Urban/ Urban                      1              0
##   JOB_Clerical JOB_HomeMaker JOB_Manager MSTATUS_Yes CAR_TYPE_Pickup
## 1            0             0           1           0               0
## 2            0             0           1           0               0
## 3            0             0           0           0               0
## 4            1             0           0           0               1
## 5            0             0           1           0               0
## 6            0             0           0           1               0
##   CAR_TYPE_Sports_Car CAR_TYPE_SUV CAR_TYPE_Van REVOKED_Yes
## 1                   0            0            1           0
## 2                   0            0            0           0
## 3                   0            1            0           0
## 4                   0            0            0           1
## 5                   0            0            0           0
## 6                   0            0            0           0
##   CAR_USE_Private EDUCATION_HighSchool PARENT1_Yes SEX_M  probability
## 1               1                    0           0     1 9.991562e-01
## 2               1                    1           1     1 1.000000e+00
## 3               0                    1           1     0 9.991668e-01
## 4               1                    1           1     1           NA
## 5               1                    1           0     1 1.000000e+00
## 6               0                    0           0     1 1.337410e-08
##   TARGET_FLAG_pred CAR_TYPE_PanelTruck TARGET_AMT_pred
## 1                1                   0                
## 2                1                   0                
## 3                1                   0                
## 4               NA                   0                
## 5                1                   0                
## 6                0                   1
  1. SELECT MODELS

Test Goodness of Fit.

  1. Residuals analysis
#linearity
plot(final_linear_model$residuals ~ data_linear_regression$TARGET_AMT)
abline(h = 0, lty = 3)  # adds a horizontal dashed line at y = 0

#normal residuals
par(mfrow=c(1,2))
hist(final_linear_model$residuals, probability=TRUE,col="gray", border="white", main="Distribution of residuals")
d <- density(final_linear_model$residuals)
    lines(d, col="red")
#normal probability plot 
qqnorm(final_linear_model$residuals)
qqline(final_linear_model$residuals) 

#constant variability
plot(final_linear_model)

  1. Likelihood Ratio Test.
#alternative model
linear_model2 <- lm(formula = TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS, data = data_linear_regression)
linear_model3 <- lm(formula = TARGET_AMT ~ CAR_TYPE + MSTATUS, data = data_linear_regression)
#Likelihood Ratio Test
anova(final_linear_model, linear_model2, test ="Chisq")
## Analysis of Variance Table
## 
## Model 1: TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS + YOJ
## Model 2: TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS
##   Res.Df        RSS Df  Sum of Sq Pr(>Chi)
## 1   1884 1.0842e+11                       
## 2   1885 1.0855e+11 -1 -125081593   0.1404
anova(final_linear_model, linear_model3, test ="Chisq")
## Analysis of Variance Table
## 
## Model 1: TARGET_AMT ~ CAR_TYPE + MSTATUS + MVR_PTS + YOJ
## Model 2: TARGET_AMT ~ CAR_TYPE + MSTATUS
##   Res.Df        RSS Df  Sum of Sq Pr(>Chi)
## 1   1884 1.0842e+11                       
## 2   1886 1.0866e+11 -2 -243607569   0.1204
#calculate accuracy
data_linear_regression$TARGET_AMT_accuracy <- c()
data_linear_regression$TARGET_AMT_accuracy <- (data_linear_regression$TARGET_AMT_pred-data_linear_regression$TARGET_AMT)/data_linear_regression$TARGET_AMT
mean(data_linear_regression$TARGET_AMT_accuracy)
## [1] 0.770665
#draw plot predicted vs actual
plot(data_linear_regression$TARGET_AMT_pred,data_linear_regression$TARGET_AMT,
      xlab="predicted",ylab="actual",col="blue")
 abline(a=0,b=1,col="red")

#calculate car crash payout for testing dataset
data_testing <- data_testing %>% mutate(TARGET_AMT_pred=as.numeric(ifelse(TARGET_FLAG_pred==1,5059.38 + 2164.76*data_testing$CAR_TYPE_PanelTruck - 788.88*data_testing$MSTATUS_Yes,'')))

#export testing data file with predicted class
write.table(data_testing, file = "/Users/olga/downloads/HW4_data_evaluation.csv",append = FALSE)