1 Data Exploration

# Directory holding the raw CSV files. Change this single value to run the
# analysis on another machine.
data_dir <- "/Users/bellajean/Downloads"

# Training set (with targets) and evaluation set (targets to be predicted).
df_train <- read.csv(file.path(data_dir, "insurance_training_data.csv"))
df_eval  <- read.csv(file.path(data_dir, "insurance-evaluation-data.csv"))

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
?stargazer
summary(df_train)
##      INDEX        TARGET_FLAG       TARGET_AMT        KIDSDRIV     
##  Min.   :    1   Min.   :0.0000   Min.   :     0   Min.   :0.0000  
##  1st Qu.: 2559   1st Qu.:0.0000   1st Qu.:     0   1st Qu.:0.0000  
##  Median : 5133   Median :0.0000   Median :     0   Median :0.0000  
##  Mean   : 5152   Mean   :0.2638   Mean   :  1504   Mean   :0.1711  
##  3rd Qu.: 7745   3rd Qu.:1.0000   3rd Qu.:  1036   3rd Qu.:0.0000  
##  Max.   :10302   Max.   :1.0000   Max.   :107586   Max.   :4.0000  
##                                                                    
##       AGE           HOMEKIDS           YOJ          INCOME         
##  Min.   :16.00   Min.   :0.0000   Min.   : 0.0   Length:8161       
##  1st Qu.:39.00   1st Qu.:0.0000   1st Qu.: 9.0   Class :character  
##  Median :45.00   Median :0.0000   Median :11.0   Mode  :character  
##  Mean   :44.79   Mean   :0.7212   Mean   :10.5                     
##  3rd Qu.:51.00   3rd Qu.:1.0000   3rd Qu.:13.0                     
##  Max.   :81.00   Max.   :5.0000   Max.   :23.0                     
##  NA's   :6                        NA's   :454                      
##    PARENT1            HOME_VAL           MSTATUS              SEX           
##  Length:8161        Length:8161        Length:8161        Length:8161       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   EDUCATION             JOB               TRAVTIME        CAR_USE         
##  Length:8161        Length:8161        Min.   :  5.00   Length:8161       
##  Class :character   Class :character   1st Qu.: 22.00   Class :character  
##  Mode  :character   Mode  :character   Median : 33.00   Mode  :character  
##                                        Mean   : 33.49                     
##                                        3rd Qu.: 44.00                     
##                                        Max.   :142.00                     
##                                                                           
##    BLUEBOOK              TIF           CAR_TYPE           RED_CAR         
##  Length:8161        Min.   : 1.000   Length:8161        Length:8161       
##  Class :character   1st Qu.: 1.000   Class :character   Class :character  
##  Mode  :character   Median : 4.000   Mode  :character   Mode  :character  
##                     Mean   : 5.351                                        
##                     3rd Qu.: 7.000                                        
##                     Max.   :25.000                                        
##                                                                           
##    OLDCLAIM            CLM_FREQ        REVOKED             MVR_PTS      
##  Length:8161        Min.   :0.0000   Length:8161        Min.   : 0.000  
##  Class :character   1st Qu.:0.0000   Class :character   1st Qu.: 0.000  
##  Mode  :character   Median :0.0000   Mode  :character   Median : 1.000  
##                     Mean   :0.7986                      Mean   : 1.696  
##                     3rd Qu.:2.0000                      3rd Qu.: 3.000  
##                     Max.   :5.0000                      Max.   :13.000  
##                                                                         
##     CAR_AGE        URBANICITY       
##  Min.   :-3.000   Length:8161       
##  1st Qu.: 1.000   Class :character  
##  Median : 8.000   Mode  :character  
##  Mean   : 8.328                     
##  3rd Qu.:12.000                     
##  Max.   :28.000                     
##  NA's   :510
summary(df_eval)
##      INDEX       TARGET_FLAG    TARGET_AMT        KIDSDRIV           AGE       
##  Min.   :    3   Mode:logical   Mode:logical   Min.   :0.0000   Min.   :17.00  
##  1st Qu.: 2632   NA's:2141      NA's:2141      1st Qu.:0.0000   1st Qu.:39.00  
##  Median : 5224                                 Median :0.0000   Median :45.00  
##  Mean   : 5150                                 Mean   :0.1625   Mean   :45.02  
##  3rd Qu.: 7669                                 3rd Qu.:0.0000   3rd Qu.:51.00  
##  Max.   :10300                                 Max.   :3.0000   Max.   :73.00  
##                                                                 NA's   :1      
##     HOMEKIDS           YOJ           INCOME            PARENT1         
##  Min.   :0.0000   Min.   : 0.00   Length:2141        Length:2141       
##  1st Qu.:0.0000   1st Qu.: 9.00   Class :character   Class :character  
##  Median :0.0000   Median :11.00   Mode  :character   Mode  :character  
##  Mean   :0.7174   Mean   :10.38                                        
##  3rd Qu.:1.0000   3rd Qu.:13.00                                        
##  Max.   :5.0000   Max.   :19.00                                        
##                   NA's   :94                                           
##    HOME_VAL           MSTATUS              SEX             EDUCATION        
##  Length:2141        Length:2141        Length:2141        Length:2141       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      JOB               TRAVTIME        CAR_USE            BLUEBOOK        
##  Length:2141        Min.   :  5.00   Length:2141        Length:2141       
##  Class :character   1st Qu.: 22.00   Class :character   Class :character  
##  Mode  :character   Median : 33.00   Mode  :character   Mode  :character  
##                     Mean   : 33.15                                        
##                     3rd Qu.: 43.00                                        
##                     Max.   :105.00                                        
##                                                                           
##       TIF           CAR_TYPE           RED_CAR            OLDCLAIM        
##  Min.   : 1.000   Length:2141        Length:2141        Length:2141       
##  1st Qu.: 1.000   Class :character   Class :character   Class :character  
##  Median : 4.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 5.245                                                           
##  3rd Qu.: 7.000                                                           
##  Max.   :25.000                                                           
##                                                                           
##     CLM_FREQ       REVOKED             MVR_PTS          CAR_AGE      
##  Min.   :0.000   Length:2141        Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:0.000   Class :character   1st Qu.: 0.000   1st Qu.: 1.000  
##  Median :0.000   Mode  :character   Median : 1.000   Median : 8.000  
##  Mean   :0.809                      Mean   : 1.766   Mean   : 8.183  
##  3rd Qu.:2.000                      3rd Qu.: 3.000   3rd Qu.:12.000  
##  Max.   :5.000                      Max.   :12.000   Max.   :26.000  
##                                                      NA's   :129     
##   URBANICITY       
##  Length:2141       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

There are 8161 observations of 26 variables in the training data set, and 2141 observations of 26 variables in the evaluation data set.

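To strip the `$` and `,` characters from the currency columns (so they can be parsed as numbers) and to replace spaces with underscores in the categorical labels, we define two helper functions. Note that the `z_` prefixes on some category labels are left in place.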

# Convert currency-formatted strings such as "$1,234,567" to numeric values.
#
# input: character vector of amounts that may contain "$" and "," characters.
# Returns a numeric vector of the same length; empty or missing entries
# become NA.
currencyconv <- function(input) {
  # gsub() removes every "$" and "," -- sub() would stop after the first
  # comma and turn amounts of a million or more into NA.
  out <- gsub("[$,]", "", input)
  as.numeric(out)
}
# Replace every space in a label with an underscore.
#
# input: character vector (or factor) of labels.
# Returns a character vector of the same length.
underscore <- function(input) {
  # gsub() handles labels with several spaces (e.g. "Highly Urban/ Urban");
  # sub() would only replace the first one.
  gsub(" ", "_", input, fixed = TRUE)
}

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
?as.tbl

# Clean the raw training data:
#   * parse the currency-formatted columns into numbers,
#   * tidy the labels of four categorical columns and store them as factors,
#   * store the binary target as a factor.
# as_tibble() replaces the deprecated as.tbl(); across() replaces the
# superseded mutate_at().
df_train <- as_tibble(df_train) %>%
  mutate(
    across(
      all_of(c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM")),
      currencyconv
    ),
    across(
      all_of(c("EDUCATION", "JOB", "CAR_TYPE", "URBANICITY")),
      function(x) as.factor(underscore(x))
    ),
    TARGET_FLAG = as.factor(TARGET_FLAG)
  )
## Warning: `as.tbl()` was deprecated in dplyr 1.0.0.
## ℹ Please use `tibble::as_tibble()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
df_train
## # A tibble: 8,161 × 26
##    INDEX TARGET_FLAG TARGET…¹ KIDSD…²   AGE HOMEK…³   YOJ INCOME PARENT1 HOME_…⁴
##    <int> <fct>          <dbl>   <int> <int>   <int> <int>  <dbl> <chr>     <dbl>
##  1     1 0                  0       0    60       0    11  67349 No            0
##  2     2 0                  0       0    43       0    11  91449 No       257252
##  3     4 0                  0       0    35       1    10  16039 No       124191
##  4     5 0                  0       0    51       0    14     NA No       306251
##  5     6 0                  0       0    50       0    NA 114986 No       243925
##  6     7 1               2946       0    34       1    12 125301 Yes           0
##  7     8 0                  0       0    54       0    NA  18755 No           NA
##  8    11 1               4021       1    37       2    NA 107961 No       333680
##  9    12 1               2501       0    34       0    10  62978 No            0
## 10    13 0                  0       0    50       0     7 106952 No            0
## # … with 8,151 more rows, 16 more variables: MSTATUS <chr>, SEX <chr>,
## #   EDUCATION <fct>, JOB <fct>, TRAVTIME <int>, CAR_USE <chr>, BLUEBOOK <dbl>,
## #   TIF <int>, CAR_TYPE <fct>, RED_CAR <chr>, OLDCLAIM <dbl>, CLM_FREQ <int>,
## #   REVOKED <chr>, MVR_PTS <int>, CAR_AGE <int>, URBANICITY <fct>, and
## #   abbreviated variable names ¹​TARGET_AMT, ²​KIDSDRIV, ³​HOMEKIDS, ⁴​HOME_VAL
library(knitr)
knitr::kable
## function (x, format, digits = getOption("digits"), row.names = NA, 
##     col.names = NA, align, caption = NULL, label = NULL, format.args = list(), 
##     escape = TRUE, ...) 
## {
##     format = kable_format(format)
##     if (!missing(align) && length(align) == 1L && !grepl("[^lcr]", 
##         align)) 
##         align = strsplit(align, "")[[1]]
##     if (inherits(x, "list")) {
##         format = kable_format_latex(format)
##         res = lapply(x, kable, format = format, digits = digits, 
##             row.names = row.names, col.names = col.names, align = align, 
##             caption = NA, format.args = format.args, escape = escape, 
##             ...)
##         return(kables(res, format, caption, label))
##     }
##     caption = kable_caption(label, caption, format)
##     if (!is.matrix(x)) 
##         x = as.data.frame(x)
##     if (identical(col.names, NA)) 
##         col.names = colnames(x)
##     m = ncol(x)
##     isn = if (is.matrix(x)) 
##         rep(is.numeric(x), m)
##     else sapply(x, is.numeric)
##     if (missing(align) || (format == "latex" && is.null(align))) 
##         align = ifelse(isn, "r", "l")
##     digits = rep(digits, length.out = m)
##     for (j in seq_len(m)) {
##         if (is_numeric(x[, j])) 
##             x[, j] = round(x[, j], digits[j])
##     }
##     if (any(isn)) {
##         if (is.matrix(x)) {
##             if (is.table(x) && length(dim(x)) == 2) 
##                 class(x) = "matrix"
##             x = format_matrix(x, format.args)
##         }
##         else x[, isn] = format_args(x[, isn], format.args)
##     }
##     if (is.na(row.names)) 
##         row.names = has_rownames(x)
##     if (!is.null(align)) 
##         align = rep(align, length.out = m)
##     if (row.names) {
##         x = cbind(` ` = rownames(x), x)
##         if (!is.null(col.names)) 
##             col.names = c(" ", col.names)
##         if (!is.null(align)) 
##             align = c("l", align)
##     }
##     n = nrow(x)
##     x = replace_na(to_character(x), is.na(x))
##     if (!is.matrix(x)) 
##         x = matrix(x, nrow = n)
##     x = trimws(x)
##     colnames(x) = col.names
##     if (format != "latex" && length(align) && !all(align %in% 
##         c("l", "r", "c"))) 
##         stop("'align' must be a character vector of possible values 'l', 'r', and 'c'")
##     attr(x, "align") = align
##     if (format == "simple" && nrow(x) == 0) 
##         format = "pipe"
##     res = do.call(paste("kable", format, sep = "_"), list(x = x, 
##         caption = caption, escape = escape, ...))
##     structure(res, format = format, class = "knitr_kable")
## }
## <bytecode: 0x7f9a68f62320>
## <environment: namespace:knitr>
# Install kableExtra from GitHub only when it is not already available, so
# that re-running the analysis does not contact the network every time.
if (!requireNamespace("kableExtra", quietly = TRUE)) {
  remotes::install_github("haozhu233/kableExtra")
}
## Skipping install of 'kableExtra' from a github remote, the SHA1 (292f6071) has not changed since last install.
##   Use `force = TRUE` to force installation
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
summary(df_train) %>% kable() %>% kable_styling()
INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
Min. : 1 0:6008 Min. : 0 Min. :0.0000 Min. :16.00 Min. :0.0000 Min. : 0.0 Min. : 0 Length:8161 Min. : 0 Length:8161 Length:8161 <High_School :1203 z_Blue_Collar:1825 Min. : 5.00 Length:8161 Min. : 1500 Min. : 1.000 Minivan :2145 Length:8161 Min. : 0 Min. :0.0000 Length:8161 Min. : 0.000 Min. :-3.000 Highly_Urban/ Urban :6492
1st Qu.: 2559 1:2153 1st Qu.: 0 1st Qu.:0.0000 1st Qu.:39.00 1st Qu.:0.0000 1st Qu.: 9.0 1st Qu.: 28097 Class :character 1st Qu.: 0 Class :character Class :character Bachelors :2242 Clerical :1271 1st Qu.: 22.00 Class :character 1st Qu.: 9280 1st Qu.: 1.000 Panel_Truck: 676 Class :character 1st Qu.: 0 1st Qu.:0.0000 Class :character 1st Qu.: 0.000 1st Qu.: 1.000 z_Highly_Rural/ Rural:1669
Median : 5133 NA Median : 0 Median :0.0000 Median :45.00 Median :0.0000 Median :11.0 Median : 54028 Mode :character Median :161160 Mode :character Mode :character Masters :1658 Professional :1117 Median : 33.00 Mode :character Median :14440 Median : 4.000 Pickup :1389 Mode :character Median : 0 Median :0.0000 Mode :character Median : 1.000 Median : 8.000 NA
Mean : 5152 NA Mean : 1504 Mean :0.1711 Mean :44.79 Mean :0.7212 Mean :10.5 Mean : 61898 NA Mean :154867 NA NA PhD : 728 Manager : 988 Mean : 33.49 NA Mean :15710 Mean : 5.351 Sports_Car : 907 NA Mean : 4037 Mean :0.7986 NA Mean : 1.696 Mean : 8.328 NA
3rd Qu.: 7745 NA 3rd Qu.: 1036 3rd Qu.:0.0000 3rd Qu.:51.00 3rd Qu.:1.0000 3rd Qu.:13.0 3rd Qu.: 85986 NA 3rd Qu.:238724 NA NA z_High_School:2330 Lawyer : 835 3rd Qu.: 44.00 NA 3rd Qu.:20850 3rd Qu.: 7.000 Van : 750 NA 3rd Qu.: 4636 3rd Qu.:2.0000 NA 3rd Qu.: 3.000 3rd Qu.:12.000 NA
Max. :10302 NA Max. :107586 Max. :4.0000 Max. :81.00 Max. :5.0000 Max. :23.0 Max. :367030 NA Max. :885282 NA NA NA Student : 712 Max. :142.00 NA Max. :69740 Max. :25.000 z_SUV :2294 NA Max. :57037 Max. :5.0000 NA Max. :13.000 Max. :28.000 NA
NA NA NA NA NA’s :6 NA NA’s :454 NA’s :445 NA NA’s :464 NA NA NA (Other) :1413 NA NA NA NA NA NA NA NA NA NA NA’s :510 NA
# Number of missing values in each column. vapply() guarantees one integer
# per column, whereas sapply()'s return type depends on its input.
vapply(df_train, function(x) sum(is.na(x)), integer(1)) %>%
  kable() %>%
  kable_styling()
x
INDEX 0
TARGET_FLAG 0
TARGET_AMT 0
KIDSDRIV 0
AGE 6
HOMEKIDS 0
YOJ 454
INCOME 445
PARENT1 0
HOME_VAL 464
MSTATUS 0
SEX 0
EDUCATION 0
JOB 0
TRAVTIME 0
CAR_USE 0
BLUEBOOK 0
TIF 0
CAR_TYPE 0
RED_CAR 0
OLDCLAIM 0
CLM_FREQ 0
REVOKED 0
MVR_PTS 0
CAR_AGE 510
URBANICITY 0
# Visualization
library(ggplot2)
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble  3.2.1     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()          masks stats::filter()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag()             masks stats::lag()
?ggplot
# Density plot of every numeric column, one panel per variable.
# select(where()) and pivot_longer() replace the superseded select_if() and
# gather(); the extra keep(is.numeric) was redundant after the selection.
ntrain <- select(df_train, where(is.numeric))
ntrain %>%
  pivot_longer(                            # Convert to key-value pairs
    everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_density()
## Warning: Removed 1879 rows containing non-finite values (`stat_density()`).

2 Data Preparation

# Mean-impute the five columns that contain missing values: each NA is
# replaced by the mean of the observed values in the same column.
# Integer columns (AGE, YOJ, CAR_AGE) become double as a result.
# NOTE(review): the summary above shows a minimum CAR_AGE of -3, which enters
# the mean unchanged -- confirm whether that value should be cleaned first.
df_train <- df_train %>%
  mutate(across(
    all_of(c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME")),
    function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
  ))

# Drop any rows that still contain a missing value after imputation.
df_train <- df_train[complete.cases(df_train), ]
# Snapshot of the data while the INDEX column is still present.
df_train2 <- df_train

# Remove INDEX before modelling (presumably a row identifier rather than a
# predictor -- confirm against the data dictionary).
df_train <- df_train[, !(colnames(df_train) %in% c("INDEX"))]

# Numeric columns only, used for the correlation analysis below.
# select(where()) replaces the superseded select_if().
trainnum <- dplyr::select(df_train, where(is.numeric))

library("Hmisc")
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
rcorr(as.matrix(trainnum))
##            TARGET_AMT KIDSDRIV   AGE HOMEKIDS   YOJ INCOME HOME_VAL TRAVTIME
## TARGET_AMT       1.00     0.06 -0.04     0.06 -0.02  -0.06    -0.08     0.03
## KIDSDRIV         0.06     1.00 -0.08     0.46  0.04  -0.05    -0.02     0.01
## AGE             -0.04    -0.08  1.00    -0.45  0.13   0.18     0.20     0.01
## HOMEKIDS         0.06     0.46 -0.45     1.00  0.08  -0.16    -0.11    -0.01
## YOJ             -0.02     0.04  0.13     0.08  1.00   0.27     0.26    -0.02
## INCOME          -0.06    -0.05  0.18    -0.16  0.27   1.00     0.54    -0.05
## HOME_VAL        -0.08    -0.02  0.20    -0.11  0.26   0.54     1.00    -0.03
## TRAVTIME         0.03     0.01  0.01    -0.01 -0.02  -0.05    -0.03     1.00
## BLUEBOOK         0.00    -0.02  0.16    -0.11  0.14   0.42     0.25    -0.02
## TIF             -0.05     0.00  0.00     0.01  0.02   0.00     0.00    -0.01
## OLDCLAIM         0.07     0.02 -0.03     0.03  0.00  -0.04    -0.07    -0.02
## CLM_FREQ         0.12     0.04 -0.02     0.03 -0.03  -0.05    -0.09     0.01
## MVR_PTS          0.14     0.05 -0.07     0.06 -0.04  -0.06    -0.08     0.01
## CAR_AGE         -0.06    -0.05  0.17    -0.15  0.06   0.39     0.20    -0.04
##            BLUEBOOK   TIF OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE
## TARGET_AMT     0.00 -0.05     0.07     0.12    0.14   -0.06
## KIDSDRIV      -0.02  0.00     0.02     0.04    0.05   -0.05
## AGE            0.16  0.00    -0.03    -0.02   -0.07    0.17
## HOMEKIDS      -0.11  0.01     0.03     0.03    0.06   -0.15
## YOJ            0.14  0.02     0.00    -0.03   -0.04    0.06
## INCOME         0.42  0.00    -0.04    -0.05   -0.06    0.39
## HOME_VAL       0.25  0.00    -0.07    -0.09   -0.08    0.20
## TRAVTIME      -0.02 -0.01    -0.02     0.01    0.01   -0.04
## BLUEBOOK       1.00 -0.01    -0.03    -0.04   -0.04    0.18
## TIF           -0.01  1.00    -0.02    -0.02   -0.04    0.01
## OLDCLAIM      -0.03 -0.02     1.00     0.50    0.26   -0.01
## CLM_FREQ      -0.04 -0.02     0.50     1.00    0.40   -0.01
## MVR_PTS       -0.04 -0.04     0.26     0.40    1.00   -0.02
## CAR_AGE        0.18  0.01    -0.01    -0.01   -0.02    1.00
## 
## n= 8161 
## 
## 
## P
##            TARGET_AMT KIDSDRIV AGE    HOMEKIDS YOJ    INCOME HOME_VAL TRAVTIME
## TARGET_AMT            0.0000   0.0002 0.0000   0.0585 0.0000 0.0000   0.0115  
## KIDSDRIV   0.0000              0.0000 0.0000   0.0002 0.0000 0.0803   0.4455  
## AGE        0.0002     0.0000          0.0000   0.0000 0.0000 0.0000   0.6342  
## HOMEKIDS   0.0000     0.0000   0.0000          0.0000 0.0000 0.0000   0.5128  
## YOJ        0.0585     0.0002   0.0000 0.0000          0.0000 0.0000   0.1362  
## INCOME     0.0000     0.0000   0.0000 0.0000   0.0000        0.0000   0.0000  
## HOME_VAL   0.0000     0.0803   0.0000 0.0000   0.0000 0.0000          0.0018  
## TRAVTIME   0.0115     0.4455   0.6342 0.5128   0.1362 0.0000 0.0018           
## BLUEBOOK   0.6712     0.0516   0.0000 0.0000   0.0000 0.0000 0.0000   0.1246  
## TIF        0.0000     0.8574   0.9952 0.2859   0.0294 0.9274 0.8569   0.2945  
## OLDCLAIM   0.0000     0.0653   0.0082 0.0069   0.7931 0.0000 0.0000   0.0818  
## CLM_FREQ   0.0000     0.0008   0.0296 0.0080   0.0210 0.0000 0.0000   0.5535  
## MVR_PTS    0.0000     0.0000   0.0000 0.0000   0.0009 0.0000 0.0000   0.3384  
## CAR_AGE    0.0000     0.0000   0.0000 0.0000   0.0000 0.0000 0.0000   0.0009  
##            BLUEBOOK TIF    OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE
## TARGET_AMT 0.6712   0.0000 0.0000   0.0000   0.0000  0.0000 
## KIDSDRIV   0.0516   0.8574 0.0653   0.0008   0.0000  0.0000 
## AGE        0.0000   0.9952 0.0082   0.0296   0.0000  0.0000 
## HOMEKIDS   0.0000   0.2859 0.0069   0.0080   0.0000  0.0000 
## YOJ        0.0000   0.0294 0.7931   0.0210   0.0009  0.0000 
## INCOME     0.0000   0.9274 0.0000   0.0000   0.0000  0.0000 
## HOME_VAL   0.0000   0.8569 0.0000   0.0000   0.0000  0.0000 
## TRAVTIME   0.1246   0.2945 0.0818   0.5535   0.3384  0.0009 
## BLUEBOOK            0.6242 0.0077   0.0010   0.0004  0.0000 
## TIF        0.6242          0.0473   0.0375   0.0002  0.4971 
## OLDCLAIM   0.0077   0.0473          0.0000   0.0000  0.2402 
## CLM_FREQ   0.0010   0.0375 0.0000            0.0000  0.4151 
## MVR_PTS    0.0004   0.0002 0.0000   0.0000           0.0816 
## CAR_AGE    0.0000   0.4971 0.2402   0.4151   0.0816
library("corrplot")
## corrplot 0.92 loaded
corrplot(cor(trainnum), method="square")

cor.test(trainnum$HOMEKIDS,trainnum$AGE,method="pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  trainnum$HOMEKIDS and trainnum$AGE
## t = -44.897, df = 8159, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4623262 -0.4275266
## sample estimates:
##        cor 
## -0.4450944
df_train2 <- df_train

3 Build Models

# MODEL 1: logistic regression of TARGET_FLAG on every other column except
# TARGET_AMT, which `- TARGET_AMT` removes from the predictors.
# The family is given as a regular call, binomial(link = "logit"), instead of
# invoking the string "binomial" as a function.
logit <- glm(
  formula = TARGET_FLAG ~ . - TARGET_AMT,
  family = binomial(link = "logit"),
  data = df_train
)
summary(logit)
## 
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"), 
##     data = df_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5850  -0.7129  -0.3982   0.6260   3.1521  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -9.262e-01  3.215e-01  -2.881 0.003961 ** 
## KIDSDRIV                         3.862e-01  6.122e-02   6.307 2.84e-10 ***
## AGE                             -9.998e-04  4.020e-03  -0.249 0.803572    
## HOMEKIDS                         4.974e-02  3.713e-02   1.340 0.180395    
## YOJ                             -1.119e-02  8.591e-03  -1.302 0.192813    
## INCOME                          -3.423e-06  1.079e-06  -3.172 0.001514 ** 
## PARENT1Yes                       3.820e-01  1.096e-01   3.485 0.000492 ***
## HOME_VAL                        -1.305e-06  3.423e-07  -3.812 0.000138 ***
## MSTATUSz_No                      4.939e-01  8.357e-02   5.910 3.43e-09 ***
## SEXz_F                          -8.224e-02  1.120e-01  -0.734 0.462910    
## EDUCATIONBachelors              -3.803e-01  1.156e-01  -3.288 0.001009 ** 
## EDUCATIONMasters                -2.898e-01  1.787e-01  -1.621 0.104941    
## EDUCATIONPhD                    -1.664e-01  2.139e-01  -0.778 0.436721    
## EDUCATIONz_High_School           1.782e-02  9.506e-02   0.187 0.851300    
## JOBClerical                      4.105e-01  1.966e-01   2.087 0.036863 *  
## JOBDoctor                       -4.469e-01  2.671e-01  -1.673 0.094284 .  
## JOBHome_Maker                    2.315e-01  2.101e-01   1.102 0.270566    
## JOBLawyer                        1.049e-01  1.695e-01   0.619 0.535737    
## JOBManager                      -5.576e-01  1.716e-01  -3.250 0.001153 ** 
## JOBProfessional                  1.614e-01  1.784e-01   0.904 0.365739    
## JOBStudent                       2.155e-01  2.145e-01   1.005 0.315098    
## JOBz_Blue_Collar                 3.100e-01  1.856e-01   1.671 0.094786 .  
## TRAVTIME                         1.457e-02  1.883e-03   7.737 1.01e-14 ***
## CAR_USEPrivate                  -7.564e-01  9.172e-02  -8.247  < 2e-16 ***
## BLUEBOOK                        -2.085e-05  5.262e-06  -3.963 7.40e-05 ***
## TIF                             -5.546e-02  7.344e-03  -7.552 4.29e-14 ***
## CAR_TYPEPanel_Truck              5.607e-01  1.618e-01   3.466 0.000528 ***
## CAR_TYPEPickup                   5.537e-01  1.007e-01   5.497 3.85e-08 ***
## CAR_TYPESports_Car               1.025e+00  1.299e-01   7.890 3.02e-15 ***
## CAR_TYPEVan                      6.186e-01  1.265e-01   4.891 1.00e-06 ***
## CAR_TYPEz_SUV                    7.681e-01  1.113e-01   6.903 5.09e-12 ***
## RED_CARyes                      -9.537e-03  8.636e-02  -0.110 0.912067    
## OLDCLAIM                        -1.388e-05  3.910e-06  -3.551 0.000384 ***
## CLM_FREQ                         1.960e-01  2.854e-02   6.865 6.65e-12 ***
## REVOKEDYes                       8.872e-01  9.133e-02   9.714  < 2e-16 ***
## MVR_PTS                          1.132e-01  1.361e-02   8.314  < 2e-16 ***
## CAR_AGE                         -8.599e-04  7.541e-03  -0.114 0.909213    
## URBANICITYz_Highly_Rural/ Rural -2.390e+00  1.128e-01 -21.180  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7297.5  on 8123  degrees of freedom
## AIC: 7373.5
## 
## Number of Fisher Scoring iterations: 5
exp(logit$coefficients)
##                     (Intercept)                        KIDSDRIV 
##                      0.39603775                      1.47133252 
##                             AGE                        HOMEKIDS 
##                      0.99900069                      1.05099559 
##                             YOJ                          INCOME 
##                      0.98887451                      0.99999658 
##                      PARENT1Yes                        HOME_VAL 
##                      1.46522755                      0.99999869 
##                     MSTATUSz_No                          SEXz_F 
##                      1.63869016                      0.92105238 
##              EDUCATIONBachelors                EDUCATIONMasters 
##                      0.68368429                      0.74844699 
##                    EDUCATIONPhD          EDUCATIONz_High_School 
##                      0.84674479                      1.01797922 
##                     JOBClerical                       JOBDoctor 
##                      1.50750416                      0.63959396 
##                   JOBHome_Maker                       JOBLawyer 
##                      1.26049838                      1.11065166 
##                      JOBManager                 JOBProfessional 
##                      0.57257892                      1.17514024 
##                      JOBStudent                JOBz_Blue_Collar 
##                      1.24042545                      1.36344323 
##                        TRAVTIME                  CAR_USEPrivate 
##                      1.01467770                      0.46934226 
##                        BLUEBOOK                             TIF 
##                      0.99997915                      0.94605169 
##             CAR_TYPEPanel_Truck                  CAR_TYPEPickup 
##                      1.75190925                      1.73975757 
##              CAR_TYPESports_Car                     CAR_TYPEVan 
##                      2.78640582                      1.85637992 
##                   CAR_TYPEz_SUV                      RED_CARyes 
##                      2.15562973                      0.99050868 
##                        OLDCLAIM                        CLM_FREQ 
##                      0.99998612                      1.21647700 
##                      REVOKEDYes                         MVR_PTS 
##                      2.42833966                      1.11982364 
##                         CAR_AGE URBANICITYz_Highly_Rural/ Rural 
##                      0.99914047                      0.09165049
logitscalar <- mean(dlogis(predict(logit, type = "link")))
logitscalar * coef(logit)
##                     (Intercept)                        KIDSDRIV 
##                   -1.347373e-01                    5.617440e-02 
##                             AGE                        HOMEKIDS 
##                   -1.454380e-04                    7.235175e-03 
##                             YOJ                          INCOME 
##                   -1.627452e-03                   -4.979343e-07 
##                      PARENT1Yes                        HOME_VAL 
##                    5.556956e-02                   -1.898521e-07 
##                     MSTATUSz_No                          SEXz_F 
##                    7.184528e-02                   -1.196289e-02 
##              EDUCATIONBachelors                EDUCATIONMasters 
##                   -5.531478e-02                   -4.214950e-02 
##                    EDUCATIONPhD          EDUCATIONz_High_School 
##                   -2.419914e-02                    2.592133e-03 
##                     JOBClerical                       JOBDoctor 
##                    5.970732e-02                   -6.501194e-02 
##                   JOBHome_Maker                       JOBLawyer 
##                    3.367643e-02                    1.526621e-02 
##                      JOBManager                 JOBProfessional 
##                   -8.111255e-02                    2.347640e-02 
##                      JOBStudent                JOBz_Blue_Collar 
##                    3.134130e-02                    4.509641e-02 
##                        TRAVTIME                  CAR_USEPrivate 
##                    2.119590e-03                   -1.100339e-01 
##                        BLUEBOOK                             TIF 
##                   -3.033364e-06                   -8.067267e-03 
##             CAR_TYPEPanel_Truck                  CAR_TYPEPickup 
##                    8.156371e-02                    8.055121e-02 
##              CAR_TYPESports_Car                     CAR_TYPEVan 
##                    1.490667e-01                    8.998941e-02 
##                   CAR_TYPEz_SUV                      RED_CARyes 
##                    1.117300e-01                   -1.387259e-03 
##                        OLDCLAIM                        CLM_FREQ 
##                   -2.019267e-06                    2.850538e-02 
##                      REVOKEDYes                         MVR_PTS 
##                    1.290586e-01                    1.646257e-02 
##                         CAR_AGE URBANICITYz_Highly_Rural/ Rural 
##                   -1.250859e-04                   -3.476308e-01
# Wald-type 95% confidence intervals for the Model 1 coefficients.
confint.default(object = logit)
##                                         2.5 %        97.5 %
## (Intercept)                     -1.556324e+00 -2.961675e-01
## KIDSDRIV                         2.661701e-01  5.061669e-01
## AGE                             -8.878262e-03  6.878648e-03
## HOMEKIDS                        -2.303667e-02  1.225125e-01
## YOJ                             -2.802553e-02  5.649843e-03
## INCOME                          -5.538160e-06 -1.307895e-06
## PARENT1Yes                       1.671831e-01  5.968380e-01
## HOME_VAL                        -1.976088e-06 -6.341721e-07
## MSTATUSz_No                      3.300966e-01  6.576979e-01
## SEXz_F                          -3.018165e-01  1.373397e-01
## EDUCATIONBachelors              -6.069282e-01 -1.535899e-01
## EDUCATIONMasters                -6.400225e-01  6.051271e-02
## EDUCATIONPhD                    -5.855846e-01  2.528727e-01
## EDUCATIONz_High_School          -1.684897e-01  2.041287e-01
## JOBClerical                      2.503569e-02  7.958751e-01
## JOBDoctor                       -9.704343e-01  7.659079e-02
## JOBHome_Maker                   -1.803304e-01  6.433448e-01
## JOBLawyer                       -2.272048e-01  4.370987e-01
## JOBManager                      -8.938381e-01 -2.213713e-01
## JOBProfessional                 -1.883299e-01  5.111049e-01
## JOBStudent                      -2.049019e-01  6.358108e-01
## JOBz_Blue_Collar                -5.368115e-02  6.737077e-01
## TRAVTIME                         1.088005e-02  1.826201e-02
## CAR_USEPrivate                  -9.361895e-01 -5.766566e-01
## BLUEBOOK                        -3.116528e-05 -1.054019e-05
## TIF                             -6.985123e-02 -4.106492e-02
## CAR_TYPEPanel_Truck              2.436267e-01  8.777857e-01
## CAR_TYPEPickup                   3.563230e-01  7.511686e-01
## CAR_TYPESports_Car               7.701932e-01  1.279312e+00
## CAR_TYPEVan                      3.707454e-01  8.665112e-01
## CAR_TYPEz_SUV                    5.500061e-01  9.861597e-01
## RED_CARyes                      -1.787959e-01  1.597226e-01
## OLDCLAIM                        -2.154397e-05 -6.218763e-06
## CLM_FREQ                         1.400122e-01  2.519058e-01
## REVOKEDYes                       7.082022e-01  1.066213e+00
## MVR_PTS                          8.649250e-02  1.398499e-01
## CAR_AGE                         -1.563971e-02  1.391991e-02
## URBANICITYz_Highly_Rural/ Rural -2.610914e+00 -2.168632e+00
# In-sample predicted probabilities from Model 1: computed once, then kept
# both as a standalone vector and as the pred1 column of df_train2.
predlogit <- predict(logit, type = "response")
df_train2$pred1 <- predlogit
summary(predlogit)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.002658 0.077043 0.200903 0.263816 0.402781 0.964607
# Confusion table for Model 1: fitted probabilities rounded to 0/1.
table(true = df_train[["TARGET_FLAG"]], pred = round(fitted.values(logit)))
##     pred
## true    0    1
##    0 5550  458
##    1 1235  918

The predictors with the strongest positive relationship with car crashes, ranked by coefficient size, appear to be: 1. Car Type-Sports 2. License Revoked 3. Car Type-SUV 4. Car Type-Van

This makes sense: a sports car gives the driver the option of going faster, which raises the risk of driving recklessly and crashing. A revoked license also implies a past driving incident, which likely means a higher chance of repeating the same mistake in the future.

The factors that have a negative relationship with car accidents are ranked below: 1. Job-Doctor 2. Car Use-Private 3. Job-Manager 4. Education-Masters

This also makes sense because doctors tend to be more cautious due to the nature of their occupation. Private cars are also driven less, so there are fewer opportunities to crash. People in white-collar jobs and better-educated people also seem to drive more carefully, resulting in fewer crashes.

# Plots for Model 1: the glm diagnostic plots on a 2x2 grid, followed by a
# histogram of the in-sample predicted probabilities.
par(mfrow = c(2, 2))
plot(logit)

ggplot(
  data.frame(df_train2.pred1 = df_train2$pred1),
  aes(x = df_train2.pred1)
) +
  geom_histogram(bins = 50, fill = "grey50") +
  labs(title = "Histogram of Predictions") +
  theme_bw()

# Point package installs at the CRAN mirror over HTTPS (plain HTTP lets the
# download be tampered with in transit), then attach pROC for the ROC plots.
options(repos = c(CRAN = "https://cran.rstudio.com/"))
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Open the help page for plot.roc, then draw the ROC curve of Model 1's
# in-sample predictions against the observed TARGET_FLAG.
help("plot.roc")
plot.roc(df_train[["TARGET_FLAG"]], df_train2[["pred1"]])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Extract the Model 1 terms that are significant at the 5% level, then build
# the training frame for the reduced model.
# coef(summary(.)) is used instead of `$coef`, which only worked through
# partial matching; tibble::rownames_to_column() replaces the deprecated
# dplyr::add_rownames().
logit_coefs <- coef(summary(logit))
sigvars <- data.frame(logit_coefs[logit_coefs[, 4] <= 0.05, 4])
sigvars <- tibble::rownames_to_column(sigvars, "vars")
colist <- dplyr::pull(sigvars, vars)
# colist <- colist[2:11]
# The extracted names are coefficient-level terms (e.g. PARENT1Yes), so the
# source columns kept for the reduced model are listed explicitly.
colist <- c(
  "KIDSDRIV", "INCOME", "PARENT1", "HOME_VAL", "MSTATUS", "EDUCATION", "JOB",
  "TRAVTIME", "CAR_USE", "BLUEBOOK", "TIF", "CAR_TYPE", "CLM_FREQ", "REVOKED",
  "MVR_PTS", "URBANICITY"
)
idx <- match(colist, names(df_train))
trainmod2 <- cbind(df_train[, idx], df_train2["TARGET_FLAG"])


# MODEL 2
# Logistic regression of TARGET_FLAG on every other column of trainmod2.
logit2 <- glm(
  TARGET_FLAG ~ .,
  family = binomial(link = "logit"),
  data = trainmod2
)
summary(logit2)
## 
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"), 
##     data = trainmod2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6020  -0.7192  -0.3999   0.6347   3.1387  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.056e+00  2.557e-01  -4.131 3.61e-05 ***
## KIDSDRIV                         4.222e-01  5.500e-02   7.677 1.63e-14 ***
## INCOME                          -3.445e-06  1.071e-06  -3.216  0.00130 ** 
## PARENT1Yes                       4.622e-01  9.411e-02   4.911 9.06e-07 ***
## HOME_VAL                        -1.356e-06  3.406e-07  -3.982 6.83e-05 ***
## MSTATUSz_No                      4.692e-01  7.951e-02   5.902 3.60e-09 ***
## EDUCATIONBachelors              -3.817e-01  1.088e-01  -3.509  0.00045 ***
## EDUCATIONMasters                -3.046e-01  1.613e-01  -1.889  0.05886 .  
## EDUCATIONPhD                    -1.695e-01  1.997e-01  -0.849  0.39604    
## EDUCATIONz_High_School           1.828e-02  9.464e-02   0.193  0.84686    
## JOBClerical                      4.172e-01  1.962e-01   2.126  0.03348 *  
## JOBDoctor                       -4.343e-01  2.661e-01  -1.632  0.10259    
## JOBHome_Maker                    2.798e-01  2.038e-01   1.373  0.16980    
## JOBLawyer                        1.132e-01  1.689e-01   0.670  0.50293    
## JOBManager                      -5.601e-01  1.711e-01  -3.272  0.00107 ** 
## JOBProfessional                  1.673e-01  1.780e-01   0.939  0.34748    
## JOBStudent                       2.823e-01  2.105e-01   1.341  0.17993    
## JOBz_Blue_Collar                 3.162e-01  1.852e-01   1.707  0.08784 .  
## TRAVTIME                         1.460e-02  1.878e-03   7.775 7.52e-15 ***
## CAR_USEPrivate                  -7.586e-01  9.158e-02  -8.284  < 2e-16 ***
## BLUEBOOK                        -2.329e-05  4.715e-06  -4.941 7.79e-07 ***
## TIF                             -5.539e-02  7.330e-03  -7.557 4.11e-14 ***
## CAR_TYPEPanel_Truck              6.211e-01  1.507e-01   4.122 3.76e-05 ***
## CAR_TYPEPickup                   5.549e-01  1.006e-01   5.517 3.45e-08 ***
## CAR_TYPESports_Car               9.680e-01  1.074e-01   9.015  < 2e-16 ***
## CAR_TYPEVan                      6.453e-01  1.219e-01   5.292 1.21e-07 ***
## CAR_TYPEz_SUV                    7.202e-01  8.585e-02   8.388  < 2e-16 ***
## CLM_FREQ                         1.496e-01  2.549e-02   5.866 4.46e-09 ***
## REVOKEDYes                       7.339e-01  8.022e-02   9.148  < 2e-16 ***
## MVR_PTS                          1.100e-01  1.351e-02   8.144 3.82e-16 ***
## URBANICITYz_Highly_Rural/ Rural -2.385e+00  1.127e-01 -21.172  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7314.9  on 8130  degrees of freedom
## AIC: 7376.9
## 
## Number of Fisher Scoring iterations: 5
# Exponentiated logit coefficients (odds ratios) for Model 2.
exp(coef(logit2))
##                     (Intercept)                        KIDSDRIV 
##                      0.34773874                      1.52532752 
##                          INCOME                      PARENT1Yes 
##                      0.99999655                      1.58754927 
##                        HOME_VAL                     MSTATUSz_No 
##                      0.99999864                      1.59877425 
##              EDUCATIONBachelors                EDUCATIONMasters 
##                      0.68270100                      0.73738978 
##                    EDUCATIONPhD          EDUCATIONz_High_School 
##                      0.84411734                      1.01844512 
##                     JOBClerical                       JOBDoctor 
##                      1.51777396                      0.64769733 
##                   JOBHome_Maker                       JOBLawyer 
##                      1.32285549                      1.11980165 
##                      JOBManager                 JOBProfessional 
##                      0.57117155                      1.18207477 
##                      JOBStudent                JOBz_Blue_Collar 
##                      1.32611593                      1.37186955 
##                        TRAVTIME                  CAR_USEPrivate 
##                      1.01471068                      0.46831392 
##                        BLUEBOOK                             TIF 
##                      0.99997671                      0.94611218 
##             CAR_TYPEPanel_Truck                  CAR_TYPEPickup 
##                      1.86093753                      1.74173381 
##              CAR_TYPESports_Car                     CAR_TYPEVan 
##                      2.63258262                      1.90649272 
##                   CAR_TYPEz_SUV                        CLM_FREQ 
##                      2.05478342                      1.16131156 
##                      REVOKEDYes                         MVR_PTS 
##                      2.08308560                      1.11632537 
## URBANICITYz_Highly_Rural/ Rural 
##                      0.09204753
# Scale factor for Model 2: the mean logistic density evaluated at each
# observation's linear predictor, then applied to every coefficient.
logit2scalar <- predict(logit2, type = "link") %>%
  dlogis() %>%
  mean()
coef(logit2) * logit2scalar
##                     (Intercept)                        KIDSDRIV 
##                   -1.540887e-01                    6.158990e-02 
##                          INCOME                      PARENT1Yes 
##                   -5.025836e-07                    6.742233e-02 
##                        HOME_VAL                     MSTATUSz_No 
##                   -1.978561e-07                    6.845013e-02 
##              EDUCATIONBachelors                EDUCATIONMasters 
##                   -5.568036e-02                   -4.443926e-02 
##                    EDUCATIONPhD          EDUCATIONz_High_School 
##                   -2.472058e-02                    2.666173e-03 
##                     JOBClerical                       JOBDoctor 
##                    6.086571e-02                   -6.335829e-02 
##                   JOBHome_Maker                       JOBLawyer 
##                    4.081484e-02                    1.650602e-02 
##                      JOBManager                 JOBProfessional 
##                   -8.169976e-02                    2.440074e-02 
##                      JOBStudent                JOBz_Blue_Collar 
##                    4.117394e-02                    4.612205e-02 
##                        TRAVTIME                  CAR_USEPrivate 
##                    2.130294e-03                   -1.106634e-01 
##                        BLUEBOOK                             TIF 
##                   -3.398148e-06                   -8.080637e-03 
##             CAR_TYPEPanel_Truck                  CAR_TYPEPickup 
##                    9.060030e-02                    8.094345e-02 
##              CAR_TYPESports_Car                     CAR_TYPEVan 
##                    1.412023e-01                    9.412828e-02 
##                   CAR_TYPEz_SUV                        CLM_FREQ 
##                    1.050551e-01                    2.181566e-02 
##                      REVOKEDYes                         MVR_PTS 
##                    1.070506e-01                    1.605247e-02 
## URBANICITYz_Highly_Rural/ Rural 
##                   -3.479783e-01
# In-sample predicted probabilities from Model 2: computed once, then kept
# both as a standalone vector and as the pred2 column of df_train2.
predlogit2 <- predict(logit2, type = "response")
df_train2$pred2 <- predlogit2
summary(predlogit2)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.002346 0.077433 0.201978 0.263816 0.399281 0.966132
# Confusion table for Model 2: fitted probabilities rounded to 0/1.
table(true = df_train[["TARGET_FLAG"]], pred = round(fitted.values(logit2)))
##     pred
## true    0    1
##    0 5553  455
##    1 1243  910
# Plots for Model 2: the glm diagnostic plots on a 2x2 grid, followed by a
# histogram of the in-sample predicted probabilities.
par(mfrow = c(2, 2))

plot(logit2)

ggplot(
  data.frame(df_train2.pred2 = df_train2$pred2),
  aes(x = df_train2.pred2)
) +
  geom_histogram(bins = 50, fill = "grey50") +
  labs(title = "Histogram of Predictions") +
  theme_bw()

# pROC is already attached earlier in this analysis, so only install it when
# it is genuinely missing instead of re-downloading it on every run.
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}
library("pROC")
# ROC curve of Model 2's in-sample predictions against the observed flag.
plot.roc(df_train$TARGET_FLAG, df_train2$pred2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# MODEL 3
# Reduced ("PC") logistic model: per the author's note it is meant to avoid
# racial bias; it keeps only KIDSDRIV, INCOME, HOME_VAL and TRAVTIME.
logit3 <- glm(
  TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
  family = binomial(link = "logit"),
  data = df_train
)
summary(logit3)
## 
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME, 
##     family = binomial(link = "logit"), data = df_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6189  -0.8209  -0.6767   1.2589   2.8068  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.785e-01  7.291e-02  -9.306  < 2e-16 ***
## KIDSDRIV     3.998e-01  4.591e-02   8.707  < 2e-16 ***
## INCOME      -3.502e-06  6.824e-07  -5.132 2.87e-07 ***
## HOME_VAL    -2.973e-06  2.498e-07 -11.903  < 2e-16 ***
## TRAVTIME     5.842e-03  1.598e-03   3.656 0.000257 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 9024.8  on 8156  degrees of freedom
## AIC: 9034.8
## 
## Number of Fisher Scoring iterations: 4
# Exponentiated logit coefficients (odds ratios) for Model 3.
exp(coef(logit3))
## (Intercept)    KIDSDRIV      INCOME    HOME_VAL    TRAVTIME 
##   0.5073733   1.4914800   0.9999965   0.9999970   1.0058590
# In-sample predicted probabilities from Model 3: computed once, then kept
# both as a standalone vector and as the pred3 column of df_train2.
predlogit3 <- predict(logit3, type = "response")
df_train2$pred3 <- predlogit3
summary(predlogit3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01182 0.19779 0.25596 0.26382 0.32923 0.75815
# Confusion table for Model 3: fitted probabilities rounded to 0/1.
table(true = df_train[["TARGET_FLAG"]], pred = round(fitted.values(logit3)))
##     pred
## true    0    1
##    0 5940   68
##    1 2093   60
# Plots for Model 3: the glm diagnostic plots on a 2x2 grid, a histogram of
# the in-sample predicted probabilities, and the ROC curve.
par(mfrow = c(2, 2))

plot(logit3)

ggplot(
  data.frame(df_train2.pred3 = df_train2$pred3),
  aes(x = df_train2.pred3)
) +
  geom_histogram(bins = 50, fill = "grey50") +
  labs(title = "Histogram of Predictions") +
  theme_bw()

plot.roc(df_train[["TARGET_FLAG"]], df_train2[["pred3"]])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Scale factor for Model 3: the mean logistic density evaluated at each
# observation's linear predictor, then applied to every coefficient.
logit3scalar <- predict(logit3, type = "link") %>%
  dlogis() %>%
  mean()
coef(logit3) * logit3scalar
##   (Intercept)      KIDSDRIV        INCOME      HOME_VAL      TRAVTIME 
## -1.255691e-01  7.398379e-02 -6.480644e-07 -5.501764e-07  1.081145e-03
# Model 1 coefficients scaled by logitscalar, rounded to two decimals.
round(coef(logit) * logitscalar, digits = 2)
##                     (Intercept)                        KIDSDRIV 
##                           -0.13                            0.06 
##                             AGE                        HOMEKIDS 
##                            0.00                            0.01 
##                             YOJ                          INCOME 
##                            0.00                            0.00 
##                      PARENT1Yes                        HOME_VAL 
##                            0.06                            0.00 
##                     MSTATUSz_No                          SEXz_F 
##                            0.07                           -0.01 
##              EDUCATIONBachelors                EDUCATIONMasters 
##                           -0.06                           -0.04 
##                    EDUCATIONPhD          EDUCATIONz_High_School 
##                           -0.02                            0.00 
##                     JOBClerical                       JOBDoctor 
##                            0.06                           -0.07 
##                   JOBHome_Maker                       JOBLawyer 
##                            0.03                            0.02 
##                      JOBManager                 JOBProfessional 
##                           -0.08                            0.02 
##                      JOBStudent                JOBz_Blue_Collar 
##                            0.03                            0.05 
##                        TRAVTIME                  CAR_USEPrivate 
##                            0.00                           -0.11 
##                        BLUEBOOK                             TIF 
##                            0.00                           -0.01 
##             CAR_TYPEPanel_Truck                  CAR_TYPEPickup 
##                            0.08                            0.08 
##              CAR_TYPESports_Car                     CAR_TYPEVan 
##                            0.15                            0.09 
##                   CAR_TYPEz_SUV                      RED_CARyes 
##                            0.11                            0.00 
##                        OLDCLAIM                        CLM_FREQ 
##                            0.00                            0.03 
##                      REVOKEDYes                         MVR_PTS 
##                            0.13                            0.02 
##                         CAR_AGE URBANICITYz_Highly_Rural/ Rural 
##                            0.00                           -0.35
# Model 2 coefficients scaled by logit2scalar, rounded to two decimals.
round(coef(logit2) * logit2scalar, digits = 2)
##                     (Intercept)                        KIDSDRIV 
##                           -0.15                            0.06 
##                          INCOME                      PARENT1Yes 
##                            0.00                            0.07 
##                        HOME_VAL                     MSTATUSz_No 
##                            0.00                            0.07 
##              EDUCATIONBachelors                EDUCATIONMasters 
##                           -0.06                           -0.04 
##                    EDUCATIONPhD          EDUCATIONz_High_School 
##                           -0.02                            0.00 
##                     JOBClerical                       JOBDoctor 
##                            0.06                           -0.06 
##                   JOBHome_Maker                       JOBLawyer 
##                            0.04                            0.02 
##                      JOBManager                 JOBProfessional 
##                           -0.08                            0.02 
##                      JOBStudent                JOBz_Blue_Collar 
##                            0.04                            0.05 
##                        TRAVTIME                  CAR_USEPrivate 
##                            0.00                           -0.11 
##                        BLUEBOOK                             TIF 
##                            0.00                           -0.01 
##             CAR_TYPEPanel_Truck                  CAR_TYPEPickup 
##                            0.09                            0.08 
##              CAR_TYPESports_Car                     CAR_TYPEVan 
##                            0.14                            0.09 
##                   CAR_TYPEz_SUV                        CLM_FREQ 
##                            0.11                            0.02 
##                      REVOKEDYes                         MVR_PTS 
##                            0.11                            0.02 
## URBANICITYz_Highly_Rural/ Rural 
##                           -0.35
# Model 3 coefficients scaled by logit3scalar, rounded to two decimals.
round(coef(logit3) * logit3scalar, digits = 2)
## (Intercept)    KIDSDRIV      INCOME    HOME_VAL    TRAVTIME 
##       -0.13        0.07        0.00        0.00        0.00
# MODEL 1 (claim amount)
# Linear regression of TARGET_AMT on every other column of df_train.
# NOTE(review): this fit uses TARGET_FLAG as a predictor (see the
# TARGET_FLAG1 coefficient in the summary) - confirm that is intended, since
# the flag is not observed for new policies.
model <- lm(formula = TARGET_AMT ~ ., data = df_train)
summary(model)
## 
## Call:
## lm(formula = TARGET_AMT ~ ., data = df_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6287   -469    -57    237 101119 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -5.912e+02  4.878e+02  -1.212 0.225619    
## TARGET_FLAG1                     5.705e+03  1.136e+02  50.241  < 2e-16 ***
## KIDSDRIV                        -3.105e+01  9.909e+01  -0.313 0.754024    
## AGE                              5.967e+00  6.171e+00   0.967 0.333637    
## HOMEKIDS                         3.984e+01  5.709e+01   0.698 0.485235    
## YOJ                              7.892e+00  1.319e+01   0.598 0.549525    
## INCOME                          -2.234e-03  1.577e-03  -1.417 0.156459    
## PARENT1Yes                       1.407e+02  1.767e+02   0.797 0.425723    
## HOME_VAL                         3.955e-04  5.163e-04   0.766 0.443683    
## MSTATUSz_No                      1.692e+02  1.267e+02   1.335 0.181839    
## SEXz_F                          -2.873e+02  1.605e+02  -1.789 0.073572 .  
## EDUCATIONBachelors               6.843e+01  1.790e+02   0.382 0.702279    
## EDUCATIONMasters                 2.237e+02  2.620e+02   0.854 0.393284    
## EDUCATIONPhD                     4.296e+02  3.110e+02   1.382 0.167133    
## EDUCATIONz_High_School          -1.229e+02  1.502e+02  -0.818 0.413279    
## JOBClerical                     -4.752e+00  2.984e+02  -0.016 0.987294    
## JOBDoctor                       -2.788e+02  3.571e+02  -0.781 0.434921    
## JOBHome_Maker                   -6.546e+01  3.185e+02  -0.206 0.837182    
## JOBLawyer                        7.787e+01  2.583e+02   0.302 0.763035    
## JOBManager                      -1.212e+02  2.521e+02  -0.481 0.630525    
## JOBProfessional                  1.764e+02  2.698e+02   0.654 0.513241    
## JOBStudent                      -1.269e+02  3.267e+02  -0.388 0.697709    
## JOBz_Blue_Collar                 5.766e+01  2.813e+02   0.205 0.837595    
## TRAVTIME                         5.649e-01  2.824e+00   0.200 0.841445    
## CAR_USEPrivate                  -9.639e+01  1.443e+02  -0.668 0.504033    
## BLUEBOOK                         2.930e-02  7.536e-03   3.889 0.000102 ***
## TIF                             -2.939e+00  1.068e+01  -0.275 0.783072    
## CAR_TYPEPanel_Truck             -5.226e+01  2.431e+02  -0.215 0.829783    
## CAR_TYPEPickup                  -3.071e+01  1.493e+02  -0.206 0.837045    
## CAR_TYPESports_Car               2.049e+02  1.909e+02   1.073 0.283203    
## CAR_TYPEVan                      9.584e+01  1.864e+02   0.514 0.607236    
## CAR_TYPEz_SUV                    1.602e+02  1.571e+02   1.020 0.307680    
## RED_CARyes                      -2.857e+01  1.302e+02  -0.219 0.826320    
## OLDCLAIM                         3.060e-03  6.501e-03   0.471 0.637817    
## CLM_FREQ                        -4.347e+01  4.821e+01  -0.902 0.367271    
## REVOKEDYes                      -3.277e+02  1.526e+02  -2.148 0.031763 *  
## MVR_PTS                          5.398e+01  2.277e+01   2.371 0.017771 *  
## CAR_AGE                         -2.502e+01  1.118e+01  -2.238 0.025219 *  
## URBANICITYz_Highly_Rural/ Rural  3.364e+01  1.263e+02   0.266 0.790056    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3969 on 8122 degrees of freedom
## Multiple R-squared:  0.2913, Adjusted R-squared:  0.288 
## F-statistic: 87.85 on 38 and 8122 DF,  p-value: < 2.2e-16
# Linear Model 1 plots: residuals vs fitted and fitted vs observed side by
# side, then the lm diagnostic plots on a 2x2 grid.
par(mfrow = c(1, 2))

plot(model$residuals ~ model$fitted.values)
plot(x = model$fitted.values, y = df_train$TARGET_AMT)

par(mfrow = c(2, 2))
plot(model)

# Extract the linear-model terms that are significant at the 5% level, then
# build the training frame for the reduced model.
# coef(summary(.)) is used instead of `$coef`, which only worked through
# partial matching; tibble::rownames_to_column() replaces the deprecated
# dplyr::add_rownames().
model_coefs <- coef(summary(model))
sigvars <- data.frame(model_coefs[model_coefs[, 4] <= 0.05, 4])
sigvars <- tibble::rownames_to_column(sigvars, "vars")
colist <- dplyr::pull(sigvars, vars)
# The extracted names are coefficient-level terms (e.g. REVOKEDYes), so the
# source columns kept for the reduced model are listed explicitly.
colist <- c("TARGET_FLAG", "BLUEBOOK", "REVOKED", "MVR_PTS", "CAR_AGE")
idx <- match(colist, names(df_train))
trainmod2 <- cbind(df_train[, idx], df_train["TARGET_AMT"])



# MODEL 2
# Linear regression of TARGET_AMT on the reduced column set in trainmod2.
model2 <- lm(formula = TARGET_AMT ~ ., data = trainmod2)
summary(model2)
## 
## Call:
## lm(formula = TARGET_AMT ~ ., data = trainmod2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6350   -383    -30    193 101423 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.233e+02  1.183e+02  -3.578 0.000349 ***
## TARGET_FLAG1  5.724e+03  1.040e+02  55.036  < 2e-16 ***
## BLUEBOOK      3.010e-02  5.328e-03   5.650 1.65e-08 ***
## REVOKEDYes   -2.878e+02  1.355e+02  -2.123 0.033774 *  
## MVR_PTS       5.081e+01  2.097e+01   2.423 0.015430 *  
## CAR_AGE      -1.277e+01  8.121e+00  -1.572 0.115961    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3967 on 8155 degrees of freedom
## Multiple R-squared:  0.2891, Adjusted R-squared:  0.2887 
## F-statistic: 663.3 on 5 and 8155 DF,  p-value: < 2.2e-16
# Linear Model 2 plots.
# The two-plot panel uses a 1x2 layout, matching the paired plots drawn for
# the other linear models (a 2x2 grid left half of the device empty).
par(mfrow = c(1, 2))
plot(model2$residuals ~ model2$fitted.values)
plot(model2$fitted.values, df_train$TARGET_AMT)

# lm diagnostic plots on a 2x2 grid.
par(mfrow = c(2, 2))
plot(model2)

# Side-by-side residual comparison of the reduced and the full model, each
# with a zero reference line.
par(mfrow = c(1, 2))
plot(model2$residuals ~ model2$fitted.values, main = "New Reduced Var Model")
abline(h = 0)
plot(model$residuals ~ model$fitted.values, main = "Original Model All Vars")
abline(h = 0)

# MODEL 3
# Reduced fit on four predictors only; per the author's note, variables with
# opposite coefficients were removed.
model3 <- lm(
  TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
  data = df_train
)
summary(model3)
## 
## Call:
## lm(formula = TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME, 
##     data = df_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -3816  -1652  -1248   -324 106267 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.693e+03  1.468e+02  11.527  < 2e-16 ***
## KIDSDRIV     4.889e+02  1.014e+02   4.822 1.45e-06 ***
## INCOME      -1.259e-03  1.336e-03  -0.943   0.3459    
## HOME_VAL    -2.812e-03  4.921e-04  -5.714 1.14e-08 ***
## TRAVTIME     7.208e+00  3.261e+00   2.211   0.0271 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4680 on 8156 degrees of freedom
## Multiple R-squared:  0.01059,    Adjusted R-squared:  0.01011 
## F-statistic: 21.83 on 4 and 8156 DF,  p-value: < 2.2e-16
# Linear Model 3 plots: residuals vs fitted and fitted vs observed side by
# side, then the lm diagnostic plots on a 2x2 grid.
par(mfrow = c(1, 2))
plot(model3$residuals ~ model3$fitted.values)
plot(x = model3$fitted.values, y = df_train$TARGET_AMT)

par(mfrow = c(2, 2))
plot(model3)

4 Select Models

# Load the evaluation set and keep an untouched copy in test2.
test  <- read.csv("/Users/bellajean/Downloads/insurance-evaluation-data.csv")
test2 <- test
dim(test)
## [1] 2141   26
# Placeholder targets so the evaluation frame carries both target columns.
test$TARGET_AMT <- 0
test$TARGET_FLAG <- 0
# Clean the currency and label columns with currencyconv / underscore
# (helpers defined outside this section) and turn the label columns and
# TARGET_FLAG into factors. tibble::as_tibble() and across() replace the
# deprecated as.tbl() and the superseded mutate_at().
test <- tibble::as_tibble(test) %>%
  mutate(
    across(
      all_of(c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM")),
      currencyconv
    )
  ) %>%
  mutate(
    across(
      all_of(c("EDUCATION", "JOB", "CAR_TYPE", "URBANICITY")),
      underscore
    )
  ) %>%
  mutate(
    across(
      all_of(c("EDUCATION", "JOB", "CAR_TYPE", "URBANICITY")),
      as.factor
    )
  ) %>%
  mutate(TARGET_FLAG = as.factor(TARGET_FLAG))
# Replace these predictors with log(x + 1).
# NOTE(review): confirm the training data received the same transform before
# the models were fit - that step is not visible in this section.
log_cols <- c("HOMEKIDS", "MVR_PTS", "OLDCLAIM", "TIF", "KIDSDRIV", "CLM_FREQ")
test[log_cols] <- lapply(test[log_cols], function(x) log(x + 1))

# Fill missing values in these columns with the column's own mean, computed
# from the evaluation data itself.
impute_cols <- c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME")
test[impute_cols] <- lapply(test[impute_cols], function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
})
#get complete cases
#remove rad per correlation in prior section
test <- test[, !(colnames(test) %in% c("INDEX"))]
TARGET_FLAG <- predict(logit, newdata = test, type="response")
# Turn the predicted values into class labels: 1 when the value exceeds 0.5,
# otherwise 0, stored both as numbers and as a two-level factor.
y_pred_num <- (TARGET_FLAG > 0.5) * 1
y_pred <- factor(y_pred_num, levels = 0:1)
summary(y_pred)
##    0    1 
## 1812  329
# Tabulate the rounded five-number summaries (plus mean) of `predlogit`
# (first row) and of the evaluation-set predictions (second row).
kable(
  rbind(
    round(summary(predlogit), 4),
    round(summary(TARGET_FLAG), 4)
  )
)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0027 0.0770 0.2009 0.2638 0.4028 0.9646
0.0042 0.0865 0.2213 0.2680 0.4035 0.9216
# `model` uses TARGET_FLAG as a predictor (see the TARGET_FLAG1 coefficient in
# its summary), so the amount prediction needs the predicted crash class.
# Previously test$TARGET_FLAG still held the placeholder 0 for every row, so
# every claim amount was predicted as if no crash occurred.
# `y_pred` already has the factor levels 0 and 1; unname() drops the row names
# inherited from predict().
test$TARGET_FLAG <- unname(y_pred)
# Copy of the evaluation data without the TARGET_FLAG column.
test2 <- test[, !(colnames(test) %in% c("TARGET_FLAG"))]
# Fitted claim amounts with lower/upper confidence bounds (fit, lwr, upr).
TARGET_AMT <- predict(model, newdata = test, interval = "confidence")
summary(TARGET_AMT)
## (Output omitted: the summary previously recorded here was computed with
## TARGET_FLAG fixed at 0 for every row; re-knit the document to regenerate.)
# Print the coefficient table and fit statistics for `model`, the linear
# model passed to predict() for the TARGET_AMT predictions.
summary(model)
## 
## Call:
## lm(formula = TARGET_AMT ~ ., data = df_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6287   -469    -57    237 101119 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -5.912e+02  4.878e+02  -1.212 0.225619    
## TARGET_FLAG1                     5.705e+03  1.136e+02  50.241  < 2e-16 ***
## KIDSDRIV                        -3.105e+01  9.909e+01  -0.313 0.754024    
## AGE                              5.967e+00  6.171e+00   0.967 0.333637    
## HOMEKIDS                         3.984e+01  5.709e+01   0.698 0.485235    
## YOJ                              7.892e+00  1.319e+01   0.598 0.549525    
## INCOME                          -2.234e-03  1.577e-03  -1.417 0.156459    
## PARENT1Yes                       1.407e+02  1.767e+02   0.797 0.425723    
## HOME_VAL                         3.955e-04  5.163e-04   0.766 0.443683    
## MSTATUSz_No                      1.692e+02  1.267e+02   1.335 0.181839    
## SEXz_F                          -2.873e+02  1.605e+02  -1.789 0.073572 .  
## EDUCATIONBachelors               6.843e+01  1.790e+02   0.382 0.702279    
## EDUCATIONMasters                 2.237e+02  2.620e+02   0.854 0.393284    
## EDUCATIONPhD                     4.296e+02  3.110e+02   1.382 0.167133    
## EDUCATIONz_High_School          -1.229e+02  1.502e+02  -0.818 0.413279    
## JOBClerical                     -4.752e+00  2.984e+02  -0.016 0.987294    
## JOBDoctor                       -2.788e+02  3.571e+02  -0.781 0.434921    
## JOBHome_Maker                   -6.546e+01  3.185e+02  -0.206 0.837182    
## JOBLawyer                        7.787e+01  2.583e+02   0.302 0.763035    
## JOBManager                      -1.212e+02  2.521e+02  -0.481 0.630525    
## JOBProfessional                  1.764e+02  2.698e+02   0.654 0.513241    
## JOBStudent                      -1.269e+02  3.267e+02  -0.388 0.697709    
## JOBz_Blue_Collar                 5.766e+01  2.813e+02   0.205 0.837595    
## TRAVTIME                         5.649e-01  2.824e+00   0.200 0.841445    
## CAR_USEPrivate                  -9.639e+01  1.443e+02  -0.668 0.504033    
## BLUEBOOK                         2.930e-02  7.536e-03   3.889 0.000102 ***
## TIF                             -2.939e+00  1.068e+01  -0.275 0.783072    
## CAR_TYPEPanel_Truck             -5.226e+01  2.431e+02  -0.215 0.829783    
## CAR_TYPEPickup                  -3.071e+01  1.493e+02  -0.206 0.837045    
## CAR_TYPESports_Car               2.049e+02  1.909e+02   1.073 0.283203    
## CAR_TYPEVan                      9.584e+01  1.864e+02   0.514 0.607236    
## CAR_TYPEz_SUV                    1.602e+02  1.571e+02   1.020 0.307680    
## RED_CARyes                      -2.857e+01  1.302e+02  -0.219 0.826320    
## OLDCLAIM                         3.060e-03  6.501e-03   0.471 0.637817    
## CLM_FREQ                        -4.347e+01  4.821e+01  -0.902 0.367271    
## REVOKEDYes                      -3.277e+02  1.526e+02  -2.148 0.031763 *  
## MVR_PTS                          5.398e+01  2.277e+01   2.371 0.017771 *  
## CAR_AGE                         -2.502e+01  1.118e+01  -2.238 0.025219 *  
## URBANICITYz_Highly_Rural/ Rural  3.364e+01  1.263e+02   0.266 0.790056    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3969 on 8122 degrees of freedom
## Multiple R-squared:  0.2913, Adjusted R-squared:  0.288 
## F-statistic: 87.85 on 38 and 8122 DF,  p-value: < 2.2e-16

Conclusion: Model 1 is selected as the final model.