In this homework assignment, you will explore, analyze, and model a data set containing approximately 8,000 records, each representing a customer of an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0.
A “1” means that the person was in a car crash.
A “0” means that the person was not in a car crash.
The second response variable is TARGET_AMT. This value is zero if the person did not crash their car. But if they did crash their car, this number will be a value greater than zero.
Your objective is to build two models:
A multiple linear regression model to predict the amount of money it will cost if the person does crash their car
A binary logistic regression model on the training data to predict the probability that a person will crash their car
You may only use the variables given to you.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
library(skimr)
library(ggpubr)
ins_training <- read_csv("insurance_training_data.csv")
## Rows: 8161 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): INCOME, PARENT1, HOME_VAL, MSTATUS, SEX, EDUCATION, JOB, CAR_USE, ...
## dbl (12): INDEX, TARGET_FLAG, TARGET_AMT, KIDSDRIV, AGE, HOMEKIDS, YOJ, TRAV...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove INDEX variable
ins_training <- ins_training %>% select(-INDEX)
head(ins_training, 10)
## # A tibble: 10 × 25
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 0 0 0 60 0 11 $67,349 No $0
## 2 0 0 0 43 0 11 $91,449 No $257,252
## 3 0 0 0 35 1 10 $16,039 No $124,191
## 4 0 0 0 51 0 14 <NA> No $306,251
## 5 0 0 0 50 0 NA $114,9… No $243,925
## 6 1 2946 0 34 1 12 $125,3… Yes $0
## 7 0 0 0 54 0 NA $18,755 No <NA>
## 8 1 4021 1 37 2 NA $107,9… No $333,680
## 9 1 2501 0 34 0 10 $62,978 No $0
## 10 0 0 0 50 0 7 $106,9… No $0
## # ℹ 16 more variables: MSTATUS <chr>, SEX <chr>, EDUCATION <chr>, JOB <chr>,
## # TRAVTIME <dbl>, CAR_USE <chr>, BLUEBOOK <chr>, TIF <dbl>, CAR_TYPE <chr>,
## # RED_CAR <chr>, OLDCLAIM <chr>, CLM_FREQ <dbl>, REVOKED <chr>,
## # MVR_PTS <dbl>, CAR_AGE <dbl>, URBANICITY <chr>
Skimming the first few rows of the data, several variables need to be cleaned before generating summary statistics. Columns that represent monetary values are stored as character strings with symbols ("$" and ",") that need to be removed before converting to numeric. Several categorical variables also carry extra characters, disrupting uniformity across categories.
Numeric data types
clean_num <- function(x){
  # Extract the number from a currency string and convert to numeric
  # Pull out digit groups of the form "123,456"
  v <- str_extract_all(x, "\\d*,\\d*", simplify = TRUE)
  # Remove the comma separator
  clean_v <- str_replace(v, "\\,", "")
  # Replace empty strings with 0
  new_v <- replace(clean_v, clean_v == "", "0")
  as.numeric(new_v)
}
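As an aside, the comma-based regex above maps any value without a thousands separator (e.g. "$980") to 0. A more forgiving alternative (a sketch, not used in this report) is readr::parse_number(), which strips the "$" and "," directly:
# Alternative cleaner (sketch): handles values with or without commas
parse_num <- function(x) readr::parse_number(x)
parse_num(c("$67,349", "$980", "$0"))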
ins_training <- ins_training %>% mutate(
INCOME = clean_num(INCOME),
HOME_VAL = clean_num(HOME_VAL),
BLUEBOOK = clean_num(BLUEBOOK),
OLDCLAIM = clean_num(OLDCLAIM)
)
head(ins_training, 3)
## # A tibble: 3 × 25
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## # ℹ 16 more variables: MSTATUS <chr>, SEX <chr>, EDUCATION <chr>, JOB <chr>,
## # TRAVTIME <dbl>, CAR_USE <chr>, BLUEBOOK <dbl>, TIF <dbl>, CAR_TYPE <chr>,
## # RED_CAR <chr>, OLDCLAIM <dbl>, CLM_FREQ <dbl>, REVOKED <chr>,
## # MVR_PTS <dbl>, CAR_AGE <dbl>, URBANICITY <chr>
Categorical data types
clean_cat <- function(x){
# Create a function that extracts only the necessary portion of the string
# Extract string
cleaned_x <- str_extract_all(x, "[A-Z][a-z]*\\s?\\w*", simplify = T)
cleaned_x[,1]
}
clean_cat(ins_training$SEX)[1:5]
## [1] "M" "M" "F" "M" "F"
ins_training <- ins_training %>% mutate(
PARENT1 = clean_cat(PARENT1),
MSTATUS = clean_cat(MSTATUS),
SEX = clean_cat(SEX),
EDUCATION = clean_cat(EDUCATION),
JOB = clean_cat(JOB),
CAR_USE = clean_cat(CAR_USE),
CAR_TYPE = clean_cat(CAR_TYPE),
REVOKED = clean_cat(REVOKED)
)
head(ins_training, 3)
## # A tibble: 3 × 25
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## # ℹ 16 more variables: MSTATUS <chr>, SEX <chr>, EDUCATION <chr>, JOB <chr>,
## # TRAVTIME <dbl>, CAR_USE <chr>, BLUEBOOK <dbl>, TIF <dbl>, CAR_TYPE <chr>,
## # RED_CAR <chr>, OLDCLAIM <dbl>, CLM_FREQ <dbl>, REVOKED <chr>,
## # MVR_PTS <dbl>, CAR_AGE <dbl>, URBANICITY <chr>
# Convert RED_CAR to caps so it matches other columns
ins_training$RED_CAR <- str_to_title(ins_training$RED_CAR)
ins_training$RED_CAR[1:3]
## [1] "Yes" "Yes" "No"
unique(ins_training$URBANICITY)
## [1] "Highly Urban/ Urban" "z_Highly Rural/ Rural"
ins_training$URBANICITY <- str_extract_all(ins_training$URBANICITY, "[A-Z][a-z]*.*\\w*", simplify = T)[,1]
unique(ins_training$URBANICITY)
## [1] "Highly Urban/ Urban" "Highly Rural/ Rural"
Now that all columns are cleaned, run skim() to get summary statistics and missing value/completeness rates.
ins_training <- ins_training %>% mutate(across(where(is.character), as.factor))
ins_training$TARGET_FLAG <- as.factor(ins_training$TARGET_FLAG)
skim_summary <- skim(ins_training)
skim_summary
| Name | ins_training |
| Number of rows | 8161 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| factor | 11 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| TARGET_FLAG | 0 | 1.00 | FALSE | 2 | 0: 6008, 1: 2153 |
| PARENT1 | 0 | 1.00 | FALSE | 2 | No: 7084, Yes: 1077 |
| MSTATUS | 0 | 1.00 | FALSE | 2 | Yes: 4894, No: 3267 |
| SEX | 0 | 1.00 | FALSE | 2 | F: 4375, M: 3786 |
| EDUCATION | 0 | 1.00 | FALSE | 4 | Hig: 3533, Bac: 2242, Mas: 1658, PhD: 728 |
| JOB | 526 | 0.94 | FALSE | 8 | Blu: 1825, Cle: 1271, Pro: 1117, Man: 988 |
| CAR_USE | 0 | 1.00 | FALSE | 2 | Pri: 5132, Com: 3029 |
| CAR_TYPE | 0 | 1.00 | FALSE | 6 | SUV: 2294, Min: 2145, Pic: 1389, Spo: 907 |
| RED_CAR | 0 | 1.00 | FALSE | 2 | No: 5783, Yes: 2378 |
| REVOKED | 0 | 1.00 | FALSE | 2 | No: 7161, Yes: 1000 |
| URBANICITY | 0 | 1.00 | FALSE | 2 | Hig: 6492, Hig: 1669 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TARGET_AMT | 0 | 1.00 | 1504.32 | 4704.03 | 0 | 0 | 0 | 1036 | 107586.1 | ▇▁▁▁▁ |
| KIDSDRIV | 0 | 1.00 | 0.17 | 0.51 | 0 | 0 | 0 | 0 | 4.0 | ▇▁▁▁▁ |
| AGE | 6 | 1.00 | 44.79 | 8.63 | 16 | 39 | 45 | 51 | 81.0 | ▁▆▇▂▁ |
| HOMEKIDS | 0 | 1.00 | 0.72 | 1.12 | 0 | 0 | 0 | 1 | 5.0 | ▇▂▁▁▁ |
| YOJ | 454 | 0.94 | 10.50 | 4.09 | 0 | 9 | 11 | 13 | 23.0 | ▂▃▇▃▁ |
| INCOME | 445 | 0.95 | 61895.23 | 47576.39 | 0 | 28097 | 54028 | 85986 | 367030.0 | ▇▃▁▁▁ |
| HOME_VAL | 464 | 0.94 | 154867.29 | 129123.77 | 0 | 0 | 161160 | 238724 | 885282.0 | ▇▆▁▁▁ |
| TRAVTIME | 0 | 1.00 | 33.49 | 15.91 | 5 | 22 | 33 | 44 | 142.0 | ▇▇▁▁▁ |
| BLUEBOOK | 0 | 1.00 | 15709.90 | 8419.73 | 1500 | 9280 | 14440 | 20850 | 69740.0 | ▇▆▁▁▁ |
| TIF | 0 | 1.00 | 5.35 | 4.15 | 1 | 1 | 4 | 7 | 25.0 | ▇▆▁▁▁ |
| OLDCLAIM | 0 | 1.00 | 4027.79 | 8780.98 | 0 | 0 | 0 | 4636 | 57037.0 | ▇▁▁▁▁ |
| CLM_FREQ | 0 | 1.00 | 0.80 | 1.16 | 0 | 0 | 0 | 2 | 5.0 | ▇▂▁▁▁ |
| MVR_PTS | 0 | 1.00 | 1.70 | 2.15 | 0 | 0 | 1 | 3 | 13.0 | ▇▂▁▁▁ |
| CAR_AGE | 510 | 0.94 | 8.33 | 5.70 | -3 | 1 | 8 | 12 | 28.0 | ▆▇▇▃▁ |
ins_training %>% dplyr::select(where(is.numeric), TARGET_FLAG) %>%
pivot_longer(cols = -TARGET_FLAG, names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Value, fill = as.factor(TARGET_FLAG))) + geom_histogram(bins = 30) +
facet_wrap(~ Feature, nrow = 3, ncol = 5, scales = "free") + labs(title = "Numeric Variables")
Almost all continuous variables are right-skewed, indicating a higher proportion of values toward the lower end of the range. Transforming these variables might be important for more accurate models and will be revisited in a later section.
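As a preview of what such a transformation could look like (a sketch only; nothing is transformed at this stage), log1p handles the zeros present in these columns:
# Sketch: log1p compresses the long right tails of the monetary variables
ins_training %>%
  mutate(across(c(INCOME, HOME_VAL, BLUEBOOK, OLDCLAIM), log1p)) %>%
  select(INCOME, HOME_VAL, BLUEBOOK, OLDCLAIM) %>%
  summary()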
Next, we check for near-zero variance among the discrete predictors with a large majority of 0 values:
CLM_FREQ, HOMEKIDS, KIDSDRIV, MVR_PTS
Creating distribution tables that compare the TARGET_FLAG response across all levels of a discrete variable is a helpful way to evaluate them. We can also bin these into two categories.
ins_training %>% select(TARGET_FLAG, CLM_FREQ) %>% table()
## CLM_FREQ
## TARGET_FLAG 0 1 2 3 4 5
## 0 4111 612 702 462 110 11
## 1 898 385 469 314 80 7
# Group CLM_FREQ values > 0
ins_training %>% select(TARGET_FLAG, CLM_FREQ) %>% mutate(CLM_FREQ = ifelse(CLM_FREQ > 0, 1, 0)) %>% table()
## CLM_FREQ
## TARGET_FLAG 0 1
## 0 4111 1897
## 1 898 1255
ins_training %>% select(TARGET_FLAG, HOMEKIDS) %>% table()
## HOMEKIDS
## TARGET_FLAG 0 1 2 3 4 5
## 0 4116 597 736 444 107 8
## 1 1173 305 382 230 57 6
# Group HOMEKIDS values > 0
ins_training %>% select(TARGET_FLAG, HOMEKIDS) %>% mutate(HOMEKIDS = ifelse(HOMEKIDS > 0, 1, 0)) %>% table()
## HOMEKIDS
## TARGET_FLAG 0 1
## 0 4116 1892
## 1 1173 980
ins_training %>% select(TARGET_FLAG, KIDSDRIV) %>% table()
## KIDSDRIV
## TARGET_FLAG 0 1 2 3 4
## 0 5407 400 168 31 2
## 1 1773 236 111 31 2
# Group KIDSDRIV values > 0
ins_training %>% select(TARGET_FLAG, KIDSDRIV) %>% mutate(KIDSDRIV = ifelse(KIDSDRIV > 0, 1, 0)) %>% table()
## KIDSDRIV
## TARGET_FLAG 0 1
## 0 5407 601
## 1 1773 380
ins_training %>% select(TARGET_FLAG, MVR_PTS) %>% table()
## MVR_PTS
## TARGET_FLAG 0 1 2 3 4 5 6 7 8 9 10 11 13
## 0 2998 886 683 517 394 250 162 73 29 12 2 2 0
## 1 714 271 265 241 205 149 104 94 55 33 11 9 2
# Group MVR_PTS values > 0
ins_training %>% select(TARGET_FLAG, MVR_PTS) %>% mutate(MVR_PTS = ifelse(MVR_PTS > 0, 1, 0)) %>% table()
## MVR_PTS
## TARGET_FLAG 0 1
## 0 2998 3010
## 1 714 1439
CLM_FREQ: Observations with at least 1 claim are more likely to be in a crash.
HOMEKIDS: Observations with at least 1 kid at home are more likely to be in a crash.
KIDSDRIV: Observations with at least 1 kid driving are more likely to be in a crash.
MVR_PTS: As the number of points increases above 6, a person is more likely to be in a crash.
ins_training %>% dplyr::select(where(is.factor)) %>%
pivot_longer(cols = -TARGET_FLAG, names_to = "Feature", values_to = "Value") %>%
ggplot(aes(y = Value, fill = TARGET_FLAG)) + geom_bar(position = "dodge") +
facet_wrap( ~ Feature, nrow = 4, ncol = 3, scales = "free") +
labs(title = "Categorical Variables") +
theme_minimal() +
theme(panel.spacing = unit(0.5, "cm"))
ins_training %>% ggplot(aes(x = TARGET_FLAG, fill = TARGET_FLAG)) + geom_bar() +
labs(title = "Count of Crashes Recorded") + theme_minimal()
The data is imbalanced, and this will need to be addressed when creating models. The minority class, labeled 1, indicates a crash.
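A quick check of the class proportions confirms this:
# Share of each TARGET_FLAG class (~74% no crash vs ~26% crash)
prop.table(table(ins_training$TARGET_FLAG))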
Correlations
ins_training %>%
ggplot(aes(x = TARGET_AMT)) + geom_histogram(binwidth = 1000) +
labs(title = "Distribution of Crash Payments") + theme_minimal()
ins_training %>%
ggplot(aes(y = TARGET_AMT)) + geom_boxplot() +
labs(title = "Distribution of Crash Payments") + theme_minimal()
library(corrplot)
## corrplot 0.95 loaded
# Keep only rows where TARGET_AMT > 0
cor_coef <- cor(ins_training %>% select(where(is.numeric), -TARGET_FLAG) %>%
  drop_na() %>% filter(TARGET_AMT > 0))
corrplot::corrplot(cor_coef, method = "number", title = "Correlations Between Continuous Variables",
                   order = "hclust", number.cex = 0.75, type = "lower")
No variables show strong correlations with the continuous TARGET_AMT. Moderate correlations exist between:
HOMEKIDS and AGE (negative): as AGE increases, the number of kids at home decreases
CAR_AGE and INCOME (positive): as INCOME increases, the age of the person's car increases
HOME_VAL and INCOME (positive): as INCOME increases, the home value increases
BLUEBOOK and INCOME (positive): as INCOME increases, the value of the vehicle increases
Before deciding to perform any transformations, missing values should be dealt with. The visualization above showed six variables with missing values (AGE, HOME_VAL, CAR_AGE, INCOME, JOB, YOJ). JOB is the only categorical variable; the others are continuous.
skim_summary %>%
select(skim_variable, n_missing) %>%
filter(n_missing > 0) %>%
arrange(n_missing) %>%
ggplot(aes(x = skim_variable, y = n_missing)) + geom_bar(stat = "identity") +
labs(title = "Variables with Missing Values", x = "Variable name", y = "# Missing values") +
theme_minimal()
# Build a 0/1 indicator matrix of missing values (vectorized)
na_matrix <- 1 * is.na(ins_training)
visdat::vis_miss(ins_training %>% select(AGE, CAR_AGE, HOME_VAL, INCOME, JOB, YOJ), show_perc = F) + labs(title = "Observations with Missing Data") + theme_minimal()
JOB and INCOME
ins_training %>% filter(is.na(JOB)) %>% ggplot(aes(y = INCOME)) + geom_boxplot() + theme_minimal()
ins_training %>% filter(!is.na(JOB)) %>% ggplot(aes(x = JOB, y = INCOME)) + geom_boxplot() + theme_minimal()
ins_training %>% filter(is.na(JOB)) %>% summarize(p25 = quantile(INCOME, 0.25, na.rm = T),
p50 = quantile(INCOME, 0.5, na.rm = T),
p75 = quantile(INCOME, 0.75, na.rm = T),
IQR = p75 - p25)
## # A tibble: 1 × 4
## p25 p50 p75 IQR
## <dbl> <dbl> <dbl> <dbl>
## 1 73320. 111340. 155511. 82191
Some of the observations with missing data in the JOB variable do list an income. The missing data here might reflect the absence of a suitable option to choose, so the field was left blank. In these cases we could add a level called "Other".
In cases where a job is listed but no income is, we could impute the income using a technique that predicts the value.
In cases where neither income nor job is listed, we could fill in the job with "Unknown", or drop those observations if there are not many of them.
CAR_AGE
Evaluate whether the missing values in the CAR_AGE variable are related to how the vehicle is used.
ins_training %>% filter(is.na(CAR_AGE)) %>% group_by(CAR_USE) %>% summarise(COUNT = n())
## # A tibble: 2 × 2
## CAR_USE COUNT
## <fct> <int>
## 1 Commercial 170
## 2 Private 340
# Preserve original data set by creating a copy
ins_training_copy <- ins_training
head(ins_training_copy)
## # A tibble: 6 × 25
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## 4 0 0 0 51 0 14 NA No 306251
## 5 0 0 0 50 0 NA 114986 No 243925
## 6 1 2946 0 34 1 12 125301 Yes 0
## # ℹ 16 more variables: MSTATUS <fct>, SEX <fct>, EDUCATION <fct>, JOB <fct>,
## # TRAVTIME <dbl>, CAR_USE <fct>, BLUEBOOK <dbl>, TIF <dbl>, CAR_TYPE <fct>,
## # RED_CAR <fct>, OLDCLAIM <dbl>, CLM_FREQ <dbl>, REVOKED <fct>,
## # MVR_PTS <dbl>, CAR_AGE <dbl>, URBANICITY <fct>
1. Drop NA Values from AGE
There are only 6 missing values in this variable, which represents < 0.1% of the data. Of the six observations, 5 are listed as sustaining a crash. This is the minority class, and from a business perspective it will be important to predict the individuals who are more likely to be in a crash; for this reason, these observations will not be dropped.
ins_training %>% filter(is.na(AGE)) %>% group_by(TARGET_FLAG) %>% summarise(COUNT = n())
## # A tibble: 2 × 2
## TARGET_FLAG COUNT
## <fct> <int>
## 1 0 1
## 2 1 5
2. Create a new category 'Other' in JOB for observations where an INCOME is listed
ins_training_copy <- ins_training_copy %>% mutate(JOB = if_else(is.na(JOB) & !is.na(INCOME), "Other", JOB))
sum(is.na(ins_training_copy$JOB))
## [1] 24
This results in only 24 values where both JOB and INCOME are unknown. Since there are far fewer observations where TARGET_FLAG is 1 (~25% of the data), representing a crash, it still might be worth keeping that information and trying to impute the missing values for those observations.
ins_training_copy %>% filter(is.na(JOB) & is.na(INCOME)) %>% group_by(EDUCATION, SEX, CAR_TYPE, CAR_USE, TARGET_FLAG) %>% summarise(COUNT = n())
## `summarise()` has grouped output by 'EDUCATION', 'SEX', 'CAR_TYPE', 'CAR_USE'.
## You can override using the `.groups` argument.
## # A tibble: 11 × 6
## # Groups: EDUCATION, SEX, CAR_TYPE, CAR_USE [8]
## EDUCATION SEX CAR_TYPE CAR_USE TARGET_FLAG COUNT
## <fct> <fct> <fct> <fct> <fct> <int>
## 1 Masters F Pickup Commercial 0 1
## 2 Masters M Panel Truck Commercial 0 5
## 3 Masters M Panel Truck Commercial 1 3
## 4 Masters M Pickup Commercial 0 1
## 5 Masters M Pickup Private 1 1
## 6 Masters M Van Commercial 1 2
## 7 PhD M Panel Truck Commercial 0 4
## 8 PhD M Panel Truck Commercial 1 1
## 9 PhD M Pickup Commercial 0 3
## 10 PhD M Pickup Commercial 1 1
## 11 PhD M Van Commercial 0 2
All observations where JOB and INCOME are both missing (n = 24) have an education level of "Masters" or "PhD"; the majority of the time the vehicle is used for commercial purposes and the individual is male. Of those 24 observations, 8 are recorded as sustaining a crash.
# ins_training_copy %>% filter(!is.na(JOB) | !is.na(INCOME))
At this point ins_training stores its categorical variables as factors, which is the input format missForest expects, so we try it directly.
library(missForest)
# First attempt at random-forest imputation (fails; see error message below)
missForest(ins_training)
Potential issues:
Many of the categorical variables are binary and only have two outcomes.
There are variables with "time" and "status" in their names, which might be interpreted as 'survival' variables.
# Rename
ins_training <- ins_training %>% rename(MARRIED = MSTATUS,
WORK_DIST = TRAVTIME) # new_name = old_name
# Convert categories into binary values (0/1)
ins_training_copy <- ins_training
ins_training_copy[] <- lapply(ins_training_copy, function(x) {
if (is.factor(x) && nlevels(x) == 2) {
# Convert 2-level factor to numeric 0/1
as.numeric(x) - 1
} else {
x
}
})
# Second attempt after the binary conversion (fails with the same error)
missForest(ins_training_copy)
# Converted binary variables into factor data type
binary_vars <- c("TARGET_FLAG", "PARENT1", "MARRIED", "SEX",
"CAR_USE", "RED_CAR", "REVOKED", "URBANICITY")
ins_training_copy[binary_vars] <- lapply(ins_training_copy[binary_vars], factor)
# Third attempt, with the binary variables converted back to factors
missForest(ins_training_copy)
library(dplyr)
# Function to check potentially problematic columns
check_missforest_issues <- function(df) {
problems <- sapply(names(df), function(col_name) {
col <- df[[col_name]]
# Check for unsupported types
type_issue <- !(is.numeric(col) | is.factor(col))
# Check for factor with no levels or only NA
empty_factor <- is.factor(col) & length(levels(col)) == 0
all_na <- all(is.na(col))
any(type_issue, empty_factor, all_na)
})
# Return names of problematic columns
names(problems[problems])
}
# Apply to your dataset
problem_cols <- check_missforest_issues(ins_training_copy)
if(length(problem_cols) == 0) {
cat("No problematic columns detected. missForest should work.\n")
} else {
cat("These columns may cause missForest to fail:\n")
print(problem_cols)
}
The same error message appeared after each of these attempts at modifying the data types:
Error in ranger::ranger(x = obsX, y = obsY, num.trees = ntree, mtry = mtry, : Error: Missing value handling not yet implemented for survival forests.
The next approach for imputing the missing values is MICE (Multivariate Imputation by Chained Equations). This technique creates multiple imputed data sets; analysis is conducted on each and the results are pooled, which helps account for the uncertainty of the imputations.
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
ins_training_imp <- mice(ins_training_copy, m=6, maxit = 10, seed = 123, print = FALSE)
## Warning: Number of logged events: 1
ins_training_imp$method
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## "" "" "" "pmm" "" "pmm"
## INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION
## "pmm" "" "pmm" "" "" ""
## JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## "" "" "" "" "" ""
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## "" "" "" "" "" "pmm"
## URBANICITY
## ""
plot(ins_training_imp)
#stripplot(ins_training_imp)
Looking at how the chains converged and where in the range of observed values the NAs were imputed across iterations, it was decided to use only one of the completed data sets. It will be selected at random and used for modelling.
set.seed(124)
imp_round <- sample(1:ins_training_imp$m, size = 1)
# Retrieve all completed data sets in long format, then keep the randomly chosen one
complete_ins_training_samp1 <- complete(ins_training_imp, action = "long") %>% filter(.imp == imp_round)
head(complete_ins_training_samp1)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## 4 0 0 0 51 0 14 108047 No 306251
## 5 0 0 0 50 0 13 114986 No 243925
## 6 1 2946 0 34 1 12 125301 Yes 0
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 1 No M PhD Professional 14 Private 14230 11
## 2 No M High School Blue Collar 22 Commercial 14940 1
## 3 Yes F High School Clerical 5 Private 4010 4
## 4 Yes M High School Blue Collar 32 Private 15440 7
## 5 Yes F PhD Doctor 36 Private 18000 1
## 6 No F Bachelors Blue Collar 46 Commercial 17430 1
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 Minivan Yes 4461 2 No 3 18
## 2 Minivan Yes 0 0 No 0 1
## 3 SUV No 38690 2 No 3 10
## 4 Minivan Yes 0 0 No 0 6
## 5 SUV No 19217 2 Yes 3 17
## 6 Sports Car No 0 0 No 0 7
## URBANICITY .imp .id
## 1 Highly Urban/ Urban 1 1
## 2 Highly Urban/ Urban 1 2
## 3 Highly Urban/ Urban 1 3
## 4 Highly Urban/ Urban 1 4
## 5 Highly Urban/ Urban 1 5
## 6 Highly Urban/ Urban 1 6
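An equivalent shortcut (not used here) fetches the chosen imputation directly, without the .imp/.id bookkeeping columns that are removed below:
# Sketch: retrieve a single completed data set by its number
complete_alt <- complete(ins_training_imp, action = imp_round)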
For the logistic regression models we will not use the TARGET_AMT variable or the columns identifying the imputation round and row id. Before applying any class-imbalance corrections or other data preparation, the data should be split into training and testing sets.
As discussed earlier, it might be more useful to create binary categories for CLM_FREQ, HOMEKIDS, KIDSDRIV, and MVR_PTS.
complete_ins_training_samp1 <- complete_ins_training_samp1 %>%
mutate(CLM_FREQ = as_factor(ifelse(CLM_FREQ == 0, 0, 1)),
HOMEKIDS = as_factor(ifelse(HOMEKIDS == 0, 0, 1)),
KIDSDRIV = as_factor(ifelse(KIDSDRIV == 0, 0, 1)),
MVR_PTS = as_factor(ifelse(MVR_PTS <= 6, 0, 1)))
complete_ins_training_samp1 <- complete_ins_training_samp1 %>% select(-c(.imp, .id))
skim(complete_ins_training_samp1)
| Name | complete_ins_training_sam… |
| Number of rows | 8161 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 14 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| JOB | 24 | 1 | 5 | 12 | 0 | 9 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| TARGET_FLAG | 0 | 1 | FALSE | 2 | 0: 6008, 1: 2153 |
| KIDSDRIV | 0 | 1 | FALSE | 2 | 0: 7180, 1: 981 |
| HOMEKIDS | 0 | 1 | FALSE | 2 | 0: 5289, 1: 2872 |
| PARENT1 | 0 | 1 | FALSE | 2 | No: 7084, Yes: 1077 |
| MSTATUS | 0 | 1 | FALSE | 2 | Yes: 4894, No: 3267 |
| SEX | 0 | 1 | FALSE | 2 | F: 4375, M: 3786 |
| EDUCATION | 0 | 1 | FALSE | 4 | Hig: 3533, Bac: 2242, Mas: 1658, PhD: 728 |
| CAR_USE | 0 | 1 | FALSE | 2 | Pri: 5132, Com: 3029 |
| CAR_TYPE | 0 | 1 | FALSE | 6 | SUV: 2294, Min: 2145, Pic: 1389, Spo: 907 |
| RED_CAR | 0 | 1 | FALSE | 2 | No: 5783, Yes: 2378 |
| CLM_FREQ | 0 | 1 | FALSE | 2 | 0: 5009, 1: 3152 |
| REVOKED | 0 | 1 | FALSE | 2 | No: 7161, Yes: 1000 |
| MVR_PTS | 0 | 1 | FALSE | 2 | 0: 7839, 1: 322 |
| URBANICITY | 0 | 1 | FALSE | 2 | Hig: 6492, Hig: 1669 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TARGET_AMT | 0 | 1 | 1504.32 | 4704.03 | 0 | 0 | 0 | 1036 | 107586.1 | ▇▁▁▁▁ |
| AGE | 0 | 1 | 44.78 | 8.63 | 16 | 39 | 45 | 51 | 81.0 | ▁▆▇▂▁ |
| YOJ | 0 | 1 | 10.51 | 4.09 | 0 | 9 | 11 | 13 | 23.0 | ▂▃▇▃▁ |
| INCOME | 0 | 1 | 61603.23 | 47429.67 | 0 | 27940 | 53628 | 85479 | 367030.0 | ▇▃▁▁▁ |
| HOME_VAL | 0 | 1 | 155137.97 | 129327.84 | 0 | 0 | 161481 | 238850 | 885282.0 | ▇▆▁▁▁ |
| TRAVTIME | 0 | 1 | 33.49 | 15.91 | 5 | 22 | 33 | 44 | 142.0 | ▇▇▁▁▁ |
| BLUEBOOK | 0 | 1 | 15709.90 | 8419.73 | 1500 | 9280 | 14440 | 20850 | 69740.0 | ▇▆▁▁▁ |
| TIF | 0 | 1 | 5.35 | 4.15 | 1 | 1 | 4 | 7 | 25.0 | ▇▆▁▁▁ |
| OLDCLAIM | 0 | 1 | 4027.79 | 8780.98 | 0 | 0 | 0 | 4636 | 57037.0 | ▇▁▁▁▁ |
| CAR_AGE | 0 | 1 | 8.36 | 5.70 | -3 | 1 | 8 | 12 | 28.0 | ▆▇▇▃▁ |
Inspecting the data set, one value does not make sense: the CAR_AGE variable contains a negative value. Only one observation has a negative car age, so it will be dropped from the data.
complete_ins_training_samp1 %>% filter(CAR_AGE < 0)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 1 1469 0 47 0 12 48696 No 212014
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## 1 No F Bachelors Professional 46 Private 15390 4 Pickup
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 1 No 33521 1 Yes 0 -3 Highly Urban/ Urban
complete_ins_training_samp1 <- complete_ins_training_samp1 %>% filter(CAR_AGE >= 0)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.9 ✔ rsample 1.3.1
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ mice::filter() masks dplyr::filter(), stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
set.seed(125)
# Split data into train and test sets
train_test_split <- initial_split(complete_ins_training_samp1, prop = 0.7)
## TRAIN SET
train_df <- training(train_test_split)
train_df %>% ggplot(aes(x = TARGET_FLAG, fill = TARGET_FLAG)) + geom_bar() +
labs(title = "Count of Crashes Recorded: Train set") + theme_minimal()
## TEST SET
test_df <- testing(train_test_split)
test_df %>% ggplot(aes(x = TARGET_FLAG, fill = TARGET_FLAG)) + geom_bar() +
labs(title = "Count of Crashes Recorded: Test set") + theme_minimal()
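As an aside, rsample's initial_split() also accepts a strata argument; a stratified variant (a sketch, not used in this report) would keep the crash rate identical across the two sets:
# Sketch: stratify the split on the response
strat_split <- initial_split(complete_ins_training_samp1, prop = 0.7, strata = TARGET_FLAG)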
There is a large imbalance between the classes of the response variable TARGET_FLAG, with only ~25% of the observations recorded as a crash. One way to deal with the imbalance is to oversample the minority class using SMOTE (Synthetic Minority Oversampling Technique). Since MICE was used to impute missing values, SMOTE would need to be run on each imputed data frame.
The strategy is to keep the selected imputed data set as is and use it for the multiple linear regression, and to use a loop that applies SMOTE to each imputed data set, builds a model on it, and saves the model to a list (a sketch of this loop follows the library calls below).
library(smotefamily)
library(fastDummies)
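Below is a sketch of that loop (not run here). prep_for_smote() is a hypothetical helper standing in for the dummy-encoding steps shown afterwards, and pooling of the fitted models is left out:
# Sketch: apply SMOTE to every imputed data set and fit a model on each
smote_models <- vector("list", ins_training_imp$m)
for (i in seq_len(ins_training_imp$m)) {
  imp_i <- mice::complete(ins_training_imp, action = i)
  x_i <- prep_for_smote(imp_i)  # hypothetical helper: numeric predictors + TARGET_FLAG
  sm_i <- SMOTE(x_i, x_i$TARGET_FLAG, K = 5, dup_size = 0)
  smote_models[[i]] <- glm(TARGET_FLAG ~ . - class, family = binomial, data = sm_i$data)
}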
# Get factor column names
factor_column_names <- names(Filter(is.factor, train_df))
# Convert factor variables to dummy variables, excluding TARGET_FLAG (the first factor column)
x_train <- dummy_cols(train_df, select_columns = factor_column_names[-1], remove_first_dummy = TRUE)
# Select only the numeric variables + TARGET_FLAG
x_train <- x_train %>% select(where(is.numeric), TARGET_FLAG)
# Convert TARGET_FLAG to numeric for SMOTE
x_train$TARGET_FLAG <- as.numeric(as.character(x_train$TARGET_FLAG))
# Conduct SMOTE on training set
x_train_smote <- SMOTE(x_train, x_train$TARGET_FLAG, K = 5, dup_size = 0)
# Keep only predictors: drop TARGET_AMT, TARGET_FLAG, and the SMOTE class column
x_train_smote_data <- dplyr::select(x_train_smote$data, -c(TARGET_AMT, TARGET_FLAG, class))
# Extract TARGET_FLAG and convert back to factor
y_train <- as_factor(x_train_smote$data[,"TARGET_FLAG"])
## TRAIN SET
x_train_smote$data %>% ggplot(aes(x = factor(TARGET_FLAG), fill = factor(TARGET_FLAG))) + geom_bar() + labs(title = "Count of Crashes Recorded: SMOTE") + theme_minimal()
cols_to_scale <- c("AGE", "YOJ", "INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "OLDCLAIM", "CAR_AGE")
# Center and scale predictors
x_train_scaled <- x_train_smote_data %>% mutate(across(all_of(cols_to_scale), scale))
# Merge training data
training_data <- data.frame(TARGET_FLAG = y_train, x_train_scaled)
# Convert factor variables to dummy variables, excluding TARGET_FLAG
x_test <- dummy_cols(test_df, select_columns = factor_column_names[-1], remove_first_dummy = TRUE)
# Extract only predictors
x_test <- x_test %>% select(where(is.numeric), -TARGET_AMT)
# Center and scale predictors
x_test_scaled <- x_test %>% mutate(across(all_of(cols_to_scale), scale))
# Target
y_test <- test_df$TARGET_FLAG
testing_data <- data.frame(TARGET_FLAG = y_test, x_test_scaled)
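One caveat on the scaling above: the test predictors are standardized with their own means and standard deviations. A stricter convention (a sketch, not applied here) reuses the training-set statistics:
# Sketch: scale the test columns with the TRAINING means/SDs
train_means <- sapply(x_train_smote_data[cols_to_scale], mean)
train_sds <- sapply(x_train_smote_data[cols_to_scale], sd)
x_test_scaled_alt <- x_test
x_test_scaled_alt[cols_to_scale] <- scale(x_test[cols_to_scale], center = train_means, scale = train_sds)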
Summary of Data Processing
Data from a random iteration of MICE was selected
Discrete variables were binned and turned into binary variables
The data was split into training and testing sets
Dummy variables were created for all categorical predictors
SMOTE was used to address the class imbalance in the training set
Predictors were centered and scaled
The results were combined into training_data and testing_data
The first model will be built with the entire predictor space.
logreg1 <- glm(TARGET_FLAG ~ ., family=binomial, data = training_data)
summary(logreg1)
##
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial, data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.33221 0.16688 -19.967 < 2e-16 ***
## AGE -0.04932 0.03569 -1.382 0.16699
## YOJ -0.05783 0.03218 -1.797 0.07233 .
## INCOME -0.20856 0.04871 -4.281 1.86e-05 ***
## HOME_VAL -0.10985 0.04331 -2.536 0.01121 *
## TRAVTIME 0.26365 0.02969 8.879 < 2e-16 ***
## BLUEBOOK -0.19999 0.04339 -4.609 4.04e-06 ***
## TIF -0.30316 0.02986 -10.154 < 2e-16 ***
## OLDCLAIM -0.25465 0.03945 -6.455 1.08e-10 ***
## CAR_AGE 0.01327 0.04219 0.315 0.75312
## KIDSDRIV_1 0.58353 0.10411 5.605 2.08e-08 ***
## HOMEKIDS_1 0.26762 0.10154 2.635 0.00840 **
## PARENT1_Yes 0.36119 0.12615 2.863 0.00420 **
## MSTATUS_Yes -0.57022 0.08973 -6.355 2.08e-10 ***
## SEX_M 0.23969 0.11114 2.157 0.03103 *
## EDUCATION_High.School 0.54971 0.08500 6.468 9.96e-11 ***
## EDUCATION_Masters 0.04699 0.10124 0.464 0.64257
## EDUCATION_PhD -0.03842 0.13865 -0.277 0.78171
## CAR_USE_Private -1.00391 0.07592 -13.224 < 2e-16 ***
## CAR_TYPE_Panel.Truck 0.41712 0.15299 2.726 0.00640 **
## CAR_TYPE_Pickup 0.43268 0.10017 4.319 1.57e-05 ***
## CAR_TYPE_Sports.Car 1.07743 0.13222 8.148 3.69e-16 ***
## CAR_TYPE_SUV 0.85372 0.11139 7.664 1.80e-14 ***
## CAR_TYPE_Van 0.41069 0.12573 3.266 0.00109 **
## RED_CAR_Yes 0.02774 0.08789 0.316 0.75232
## CLM_FREQ_1 0.87056 0.07563 11.511 < 2e-16 ***
## REVOKED_Yes 1.03595 0.09899 10.465 < 2e-16 ***
## MVR_PTS_1 1.25967 0.15969 7.888 3.07e-15 ***
## URBANICITY_Highly.Urban..Urban 2.47616 0.10832 22.860 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9827.1 on 7225 degrees of freedom
## Residual deviance: 7318.3 on 7197 degrees of freedom
## AIC: 7376.3
##
## Number of Fisher Scoring iterations: 5
VIF
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
plot_vif <- function(fitted_model, model_name){
# Calculate VIF
vif_values <- vif(fitted_model)
# Convert VIF results to a data frame for plotting
vif_df <- data.frame(Variable = names(vif_values), VIF = vif_values)
# Set a threshold to indicate high VIF
high_vif_threshold <- 5
# Generate bar plot
ggplot(vif_df, aes(Variable, VIF)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_hline(yintercept = high_vif_threshold, linetype = "dashed", color = "red") +
scale_y_continuous(limits = c(0, max(vif_df$VIF) + 1)) +
labs(title = paste0("Variance Inflation Factor (VIF) for Regression Model - ", model_name),
y = "VIF",
x = "Variable") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
plot_vif(logreg1, "Log Reg 1")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_hline()`).
logreg_metrics <- function(cm){
TN <- cm[1,1]
TP <- cm[2,2]
FN <- cm[2,1]
FP <- cm[1,2]
acc = (TP + TN) / sum(cm)
er = (FP + FN) / sum(cm)
pre = TP / (TP + FP)
sens = TP / (TP + FN)
spec = TN / (TN + FP)
f1 = (2 * pre * sens) / (pre + sens)
metrics <- tibble(Metric = c("Accuracy", "Error_rate", "Precision",
"Sensitivity", "Specificity", "F1_score"),
Value = round(c(acc,er,pre,sens,spec,f1), 3))
metrics
}
# Generate predictions using testing data
pred1_probs <- predict(logreg1, testing_data, type = 'response')
# Get the predicted labels
pred1 <- ifelse(pred1_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm1 <- table(actual = testing_data$TARGET_FLAG, prediction = pred1)
logreg_metrics(cm1)
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.757
## 2 Error_rate 0.243
## 3 Precision 0.528
## 4 Sensitivity 0.658
## 5 Specificity 0.792
## 6 F1_score 0.586
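Because the test classes remain imbalanced, a threshold-free measure such as ROC AUC can complement the table above. A minimal sketch using the pROC package (an assumption; it is not loaded elsewhere in this report):
# Sketch: area under the ROC curve for the full model's test predictions
library(pROC)
roc1 <- roc(response = testing_data$TARGET_FLAG, predictor = pred1_probs)
auc(roc1)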
Feature selection with backwards elimination
backwards_logreg <- stats::step(logreg1, direction = "backward")
## Start: AIC=7376.31
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + CAR_AGE + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + EDUCATION_Masters +
## EDUCATION_PhD + CAR_USE_Private + CAR_TYPE_Panel.Truck +
## CAR_TYPE_Pickup + CAR_TYPE_Sports.Car + CAR_TYPE_SUV + CAR_TYPE_Van +
## RED_CAR_Yes + CLM_FREQ_1 + REVOKED_Yes + MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - EDUCATION_PhD 1 7318.4 7374.4
## - CAR_AGE 1 7318.4 7374.4
## - RED_CAR_Yes 1 7318.4 7374.4
## - EDUCATION_Masters 1 7318.5 7374.5
## - AGE 1 7320.2 7376.2
## <none> 7318.3 7376.3
## - YOJ 1 7321.5 7377.5
## - SEX_M 1 7323.0 7379.0
## - HOME_VAL 1 7324.7 7380.7
## - HOMEKIDS_1 1 7325.2 7381.2
## - CAR_TYPE_Panel.Truck 1 7325.8 7381.8
## - PARENT1_Yes 1 7326.5 7382.5
## - CAR_TYPE_Van 1 7328.9 7384.9
## - INCOME 1 7336.8 7392.8
## - CAR_TYPE_Pickup 1 7336.9 7392.9
## - BLUEBOOK 1 7339.9 7395.9
## - KIDSDRIV_1 1 7350.0 7406.0
## - MSTATUS_Yes 1 7358.7 7414.7
## - EDUCATION_High.School 1 7360.5 7416.5
## - OLDCLAIM 1 7360.7 7416.7
## - CAR_TYPE_SUV 1 7378.3 7434.3
## - CAR_TYPE_Sports.Car 1 7385.5 7441.5
## - MVR_PTS_1 1 7389.3 7445.3
## - TRAVTIME 1 7398.1 7454.1
## - TIF 1 7425.8 7481.8
## - REVOKED_Yes 1 7430.0 7486.0
## - CLM_FREQ_1 1 7451.7 7507.7
## - CAR_USE_Private 1 7498.6 7554.6
## - URBANICITY_Highly.Urban..Urban 1 8020.0 8076.0
##
## Step: AIC=7374.39
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + CAR_AGE + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + EDUCATION_Masters +
## CAR_USE_Private + CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup +
## CAR_TYPE_Sports.Car + CAR_TYPE_SUV + CAR_TYPE_Van + RED_CAR_Yes +
## CLM_FREQ_1 + REVOKED_Yes + MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - CAR_AGE 1 7318.5 7372.5
## - RED_CAR_Yes 1 7318.5 7372.5
## - EDUCATION_Masters 1 7318.8 7372.8
## <none> 7318.4 7374.4
## - AGE 1 7320.4 7374.4
## - YOJ 1 7321.6 7375.6
## - SEX_M 1 7323.1 7377.1
## - HOME_VAL 1 7324.8 7378.8
## - HOMEKIDS_1 1 7325.3 7379.3
## - CAR_TYPE_Panel.Truck 1 7325.8 7379.8
## - PARENT1_Yes 1 7326.6 7380.6
## - CAR_TYPE_Van 1 7328.9 7382.9
## - CAR_TYPE_Pickup 1 7336.9 7390.9
## - BLUEBOOK 1 7339.9 7393.9
## - INCOME 1 7340.6 7394.6
## - KIDSDRIV_1 1 7350.2 7404.2
## - MSTATUS_Yes 1 7358.8 7412.8
## - OLDCLAIM 1 7360.8 7414.8
## - EDUCATION_High.School 1 7360.9 7414.9
## - CAR_TYPE_SUV 1 7378.3 7432.3
## - CAR_TYPE_Sports.Car 1 7385.5 7439.5
## - MVR_PTS_1 1 7389.4 7443.4
## - TRAVTIME 1 7398.3 7452.3
## - TIF 1 7425.8 7479.8
## - REVOKED_Yes 1 7430.0 7484.0
## - CLM_FREQ_1 1 7451.7 7505.7
## - CAR_USE_Private 1 7503.6 7557.6
## - URBANICITY_Highly.Urban..Urban 1 8020.2 8074.2
##
## Step: AIC=7372.45
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + EDUCATION_Masters +
## CAR_USE_Private + CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup +
## CAR_TYPE_Sports.Car + CAR_TYPE_SUV + CAR_TYPE_Van + RED_CAR_Yes +
## CLM_FREQ_1 + REVOKED_Yes + MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - RED_CAR_Yes 1 7318.6 7370.6
## - EDUCATION_Masters 1 7319.1 7371.1
## <none> 7318.5 7372.5
## - AGE 1 7320.5 7372.5
## - YOJ 1 7321.6 7373.6
## - SEX_M 1 7323.1 7375.1
## - HOME_VAL 1 7324.9 7376.9
## - HOMEKIDS_1 1 7325.3 7377.3
## - CAR_TYPE_Panel.Truck 1 7325.8 7377.8
## - PARENT1_Yes 1 7326.7 7378.7
## - CAR_TYPE_Van 1 7329.0 7381.0
## - CAR_TYPE_Pickup 1 7337.0 7389.0
## - BLUEBOOK 1 7340.0 7392.0
## - INCOME 1 7340.8 7392.8
## - KIDSDRIV_1 1 7350.3 7402.3
## - MSTATUS_Yes 1 7358.8 7410.8
## - OLDCLAIM 1 7360.9 7412.9
## - EDUCATION_High.School 1 7369.4 7421.4
## - CAR_TYPE_SUV 1 7378.3 7430.3
## - CAR_TYPE_Sports.Car 1 7385.5 7437.5
## - MVR_PTS_1 1 7389.4 7441.4
## - TRAVTIME 1 7398.3 7450.3
## - TIF 1 7425.8 7477.8
## - REVOKED_Yes 1 7430.1 7482.1
## - CLM_FREQ_1 1 7451.9 7503.9
## - CAR_USE_Private 1 7503.6 7555.6
## - URBANICITY_Highly.Urban..Urban 1 8020.3 8072.3
##
## Step: AIC=7370.55
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + EDUCATION_Masters +
## CAR_USE_Private + CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup +
## CAR_TYPE_Sports.Car + CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 +
## REVOKED_Yes + MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - EDUCATION_Masters 1 7319.2 7369.2
## <none> 7318.6 7370.6
## - AGE 1 7320.6 7370.6
## - YOJ 1 7321.7 7371.7
## - HOME_VAL 1 7325.0 7375.0
## - SEX_M 1 7325.3 7375.3
## - HOMEKIDS_1 1 7325.4 7375.4
## - CAR_TYPE_Panel.Truck 1 7326.0 7376.0
## - PARENT1_Yes 1 7326.8 7376.8
## - CAR_TYPE_Van 1 7329.1 7379.1
## - CAR_TYPE_Pickup 1 7337.1 7387.1
## - BLUEBOOK 1 7340.2 7390.2
## - INCOME 1 7340.8 7390.8
## - KIDSDRIV_1 1 7350.3 7400.3
## - MSTATUS_Yes 1 7358.9 7408.9
## - OLDCLAIM 1 7361.0 7411.0
## - EDUCATION_High.School 1 7369.4 7419.4
## - CAR_TYPE_SUV 1 7378.3 7428.3
## - CAR_TYPE_Sports.Car 1 7385.5 7435.5
## - MVR_PTS_1 1 7389.5 7439.5
## - TRAVTIME 1 7398.5 7448.5
## - TIF 1 7426.2 7476.2
## - REVOKED_Yes 1 7430.2 7480.2
## - CLM_FREQ_1 1 7452.0 7502.0
## - CAR_USE_Private 1 7503.6 7553.6
## - URBANICITY_Highly.Urban..Urban 1 8020.6 8070.6
##
## Step: AIC=7369.15
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + CAR_USE_Private +
## CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup + CAR_TYPE_Sports.Car +
## CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 + REVOKED_Yes +
## MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - AGE 1 7321.1 7369.1
## <none> 7319.2 7369.2
## - YOJ 1 7322.3 7370.3
## - HOME_VAL 1 7325.6 7373.6
## - SEX_M 1 7326.0 7374.0
## - HOMEKIDS_1 1 7326.0 7374.0
## - CAR_TYPE_Panel.Truck 1 7326.9 7374.9
## - PARENT1_Yes 1 7327.3 7375.3
## - CAR_TYPE_Van 1 7329.8 7377.8
## - CAR_TYPE_Pickup 1 7338.0 7386.0
## - BLUEBOOK 1 7340.8 7388.8
## - INCOME 1 7341.2 7389.2
## - KIDSDRIV_1 1 7350.9 7398.9
## - MSTATUS_Yes 1 7359.7 7407.7
## - OLDCLAIM 1 7361.3 7409.3
## - EDUCATION_High.School 1 7372.7 7420.7
## - CAR_TYPE_SUV 1 7379.0 7427.0
## - CAR_TYPE_Sports.Car 1 7386.2 7434.2
## - MVR_PTS_1 1 7390.4 7438.4
## - TRAVTIME 1 7398.9 7446.9
## - TIF 1 7426.6 7474.6
## - REVOKED_Yes 1 7430.9 7478.9
## - CLM_FREQ_1 1 7452.1 7500.1
## - CAR_USE_Private 1 7505.2 7553.2
## - URBANICITY_Highly.Urban..Urban 1 8025.4 8073.4
##
## Step: AIC=7369.11
## TARGET_FLAG ~ YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + CAR_USE_Private +
## CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup + CAR_TYPE_Sports.Car +
## CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 + REVOKED_Yes +
## MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## <none> 7321.1 7369.1
## - YOJ 1 7325.3 7371.3
## - SEX_M 1 7327.4 7373.4
## - HOME_VAL 1 7328.0 7374.0
## - PARENT1_Yes 1 7329.2 7375.2
## - CAR_TYPE_Panel.Truck 1 7329.3 7375.3
## - CAR_TYPE_Van 1 7332.2 7378.2
## - HOMEKIDS_1 1 7334.2 7380.2
## - CAR_TYPE_Pickup 1 7339.8 7385.8
## - INCOME 1 7342.7 7388.7
## - BLUEBOOK 1 7345.0 7391.0
## - KIDSDRIV_1 1 7350.9 7396.9
## - MSTATUS_Yes 1 7362.9 7408.9
## - OLDCLAIM 1 7363.7 7409.7
## - EDUCATION_High.School 1 7377.3 7423.3
## - CAR_TYPE_SUV 1 7379.5 7425.5
## - CAR_TYPE_Sports.Car 1 7386.4 7432.4
## - MVR_PTS_1 1 7392.4 7438.4
## - TRAVTIME 1 7401.3 7447.3
## - TIF 1 7428.2 7474.2
## - REVOKED_Yes 1 7433.7 7479.7
## - CLM_FREQ_1 1 7454.2 7500.2
## - CAR_USE_Private 1 7508.2 7554.2
## - URBANICITY_Highly.Urban..Urban 1 8029.3 8075.3
backwards_logreg$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 7197 7318.312 7376.312
## 2 - EDUCATION_PhD 1 0.07686535 7198 7318.388 7374.388
## 3 - CAR_AGE 1 0.06187303 7199 7318.450 7372.450
## 4 - RED_CAR_Yes 1 0.10362434 7200 7318.554 7370.554
## 5 - EDUCATION_Masters 1 0.60048723 7201 7319.154 7369.154
## 6 - AGE 1 1.95955636 7202 7321.114 7369.114
backwards_logreg$formula
## TARGET_FLAG ~ YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + CAR_USE_Private +
## CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup + CAR_TYPE_Sports.Car +
## CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 + REVOKED_Yes +
## MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
logreg2 <- glm(formula = backwards_logreg$formula, family = "binomial", data = training_data)
summary(logreg2)
##
## Call:
## glm(formula = backwards_logreg$formula, family = "binomial",
## data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.32601 0.16461 -20.206 < 2e-16 ***
## YOJ -0.06411 0.03147 -2.037 0.041610 *
## INCOME -0.20750 0.04496 -4.615 3.94e-06 ***
## HOME_VAL -0.11268 0.04306 -2.617 0.008875 **
## TRAVTIME 0.26410 0.02967 8.900 < 2e-16 ***
## BLUEBOOK -0.20834 0.04292 -4.854 1.21e-06 ***
## TIF -0.30236 0.02983 -10.136 < 2e-16 ***
## OLDCLAIM -0.25495 0.03941 -6.469 9.88e-11 ***
## KIDSDRIV_1 0.55136 0.10142 5.437 5.43e-08 ***
## HOMEKIDS_1 0.32882 0.09092 3.617 0.000299 ***
## PARENT1_Yes 0.35738 0.12599 2.837 0.004559 **
## MSTATUS_Yes -0.57840 0.08950 -6.463 1.03e-10 ***
## SEX_M 0.24445 0.09800 2.494 0.012622 *
## EDUCATION_High.School 0.53046 0.07095 7.476 7.65e-14 ***
## CAR_USE_Private -0.99986 0.07418 -13.480 < 2e-16 ***
## CAR_TYPE_Panel.Truck 0.43464 0.15188 2.862 0.004214 **
## CAR_TYPE_Pickup 0.43225 0.09982 4.330 1.49e-05 ***
## CAR_TYPE_Sports.Car 1.05477 0.13115 8.042 8.81e-16 ***
## CAR_TYPE_SUV 0.83680 0.11061 7.565 3.86e-14 ***
## CAR_TYPE_Van 0.41731 0.12528 3.331 0.000865 ***
## CLM_FREQ_1 0.86830 0.07551 11.499 < 2e-16 ***
## REVOKED_Yes 1.04041 0.09901 10.508 < 2e-16 ***
## MVR_PTS_1 1.26184 0.15966 7.903 2.72e-15 ***
## URBANICITY_Highly.Urban..Urban 2.48153 0.10824 22.926 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9827.1 on 7225 degrees of freedom
## Residual deviance: 7321.1 on 7202 degrees of freedom
## AIC: 7369.1
##
## Number of Fisher Scoring iterations: 5
plot_vif(logreg2, "Log Reg 2")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_hline()`).
# Generate predictions using testing data
pred2_probs <- predict(logreg2, testing_data, type = 'response')
# Get the predicted labels
pred2 <- ifelse(pred2_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm2 <- table(actual = testing_data$TARGET_FLAG, prediction = pred2)
logreg_metrics(cm2)
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.757
## 2 Error_rate 0.243
## 3 Precision 0.527
## 4 Sensitivity 0.657
## 5 Specificity 0.792
## 6 F1_score 0.585
Update logreg2 by removing its least significant predictor, YOJ, to build a third model.
logreg3 <- update(logreg2, .~. - YOJ)
summary(logreg3)
##
## Call:
## glm(formula = TARGET_FLAG ~ INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + CAR_USE_Private +
## CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup + CAR_TYPE_Sports.Car +
## CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 + REVOKED_Yes +
## MVR_PTS_1 + URBANICITY_Highly.Urban..Urban, family = "binomial",
## data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.29969 0.16387 -20.136 < 2e-16 ***
## INCOME -0.22678 0.04404 -5.149 2.61e-07 ***
## HOME_VAL -0.12013 0.04295 -2.797 0.005165 **
## TRAVTIME 0.26364 0.02965 8.890 < 2e-16 ***
## BLUEBOOK -0.21461 0.04284 -5.010 5.44e-07 ***
## TIF -0.30282 0.02981 -10.157 < 2e-16 ***
## OLDCLAIM -0.25619 0.03940 -6.503 7.89e-11 ***
## KIDSDRIV_1 0.55091 0.10138 5.434 5.50e-08 ***
## HOMEKIDS_1 0.30930 0.09034 3.424 0.000618 ***
## PARENT1_Yes 0.36520 0.12588 2.901 0.003719 **
## MSTATUS_Yes -0.58854 0.08936 -6.586 4.52e-11 ***
## SEX_M 0.23054 0.09773 2.359 0.018328 *
## EDUCATION_High.School 0.52030 0.07074 7.355 1.91e-13 ***
## CAR_USE_Private -1.00206 0.07415 -13.514 < 2e-16 ***
## CAR_TYPE_Panel.Truck 0.46276 0.15133 3.058 0.002229 **
## CAR_TYPE_Pickup 0.43632 0.09974 4.375 1.22e-05 ***
## CAR_TYPE_Sports.Car 1.06230 0.13100 8.109 5.10e-16 ***
## CAR_TYPE_SUV 0.83563 0.11055 7.559 4.06e-14 ***
## CAR_TYPE_Van 0.43656 0.12491 3.495 0.000474 ***
## CLM_FREQ_1 0.86640 0.07547 11.479 < 2e-16 ***
## REVOKED_Yes 1.03753 0.09895 10.485 < 2e-16 ***
## MVR_PTS_1 1.26509 0.15955 7.929 2.20e-15 ***
## URBANICITY_Highly.Urban..Urban 2.47365 0.10796 22.913 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9827.1 on 7225 degrees of freedom
## Residual deviance: 7325.3 on 7203 degrees of freedom
## AIC: 7371.3
##
## Number of Fisher Scoring iterations: 5
# Generate predictions using testing data
pred3_probs <- predict(logreg3, testing_data, type = 'response')
# Get the predicted labels
pred3 <- ifelse(pred3_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm3 <- table(actual = testing_data$TARGET_FLAG, prediction = pred3)
logreg_metrics(cm3)
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.757
## 2 Error_rate 0.243
## 3 Precision 0.526
## 4 Sensitivity 0.665
## 5 Specificity 0.789
## 6 F1_score 0.587
# Create the plot
ggplot(data = data.frame(cm3), aes(x = actual, y = prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold") +
# Customize colors
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix: logreg3", x = "Actual", y = "Predicted") +
theme_minimal()
The final model that will be used on the evaluation set is logreg3. Below is a table showing each coefficient, its confidence interval, and the corresponding change in odds. The AIC and the comparative simplicity of logreg3 relative to the other logistic regression models were also considerations in deciding to proceed with it. As a reading example, exp(1.038) ≈ 2.82 for REVOKED_Yes: a licence revocation is associated with nearly tripled odds of a crash, holding the other predictors fixed.
# Get ci for all parameters
ci_logreg3 <- confint(logreg3)
## Waiting for profiling to be done...
terms_table <- tibble(Terms = names(logreg3$coefficients), Coefficients = logreg3$coefficients, ci_lower = ci_logreg3[,1], ci_upper = ci_logreg3[,2], Change_odd = exp(logreg3$coefficients))
terms_table
## # A tibble: 23 × 5
## Terms Coefficients ci_lower ci_upper Change_odd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3.30 -3.62 -2.98 0.0369
## 2 INCOME -0.227 -0.313 -0.141 0.797
## 3 HOME_VAL -0.120 -0.204 -0.0359 0.887
## 4 TRAVTIME 0.264 0.206 0.322 1.30
## 5 BLUEBOOK -0.215 -0.299 -0.131 0.807
## 6 TIF -0.303 -0.362 -0.245 0.739
## 7 OLDCLAIM -0.256 -0.334 -0.179 0.774
## 8 KIDSDRIV_1 0.551 0.353 0.750 1.73
## 9 HOMEKIDS_1 0.309 0.132 0.486 1.36
## 10 PARENT1_Yes 0.365 0.119 0.612 1.44
## # ℹ 13 more rows
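To read the confidence intervals on the odds scale as well, the bounds can be exponentiated just like the coefficients:
# Express the CI bounds as changes in odds
terms_table %>% mutate(Odds_lower = exp(ci_lower), Odds_upper = exp(ci_upper))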
A confidence interval that crosses 0 would indicate that, even with a significant p-value, a predictor may add uncertainty to the model; here, none of the intervals shown above cross 0.
complete_ins_training_samp1 contains the data from the randomly selected iteration of the MICE imputation. To train a multiple linear regression model to predict TARGET_AMT, the data should be filtered to include only those observations that sustained a crash.
set.seed(200)
# Filter crash observations
complete_ins_training_amt <- complete_ins_training_samp1 %>% filter(TARGET_FLAG == 1)
complete_ins_training_amt <- complete_ins_training_amt %>% select(-TARGET_FLAG)
summary(complete_ins_training_amt)
## TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## Min. : 30.28 0:1772 Min. :16.00 0:1172 Min. : 0
## 1st Qu.: 2610.70 1: 380 1st Qu.:37.00 1: 980 1st Qu.: 9
## Median : 4108.07 Median :43.00 Median :11
## Mean : 5704.15 Mean :43.27 Mean :10
## 3rd Qu.: 5788.00 3rd Qu.:50.00 3rd Qu.:13
## Max. :107586.14 Max. :76.00 Max. :19
## INCOME PARENT1 HOME_VAL MSTATUS SEX
## Min. : 0 No :1676 Min. : 0 No :1099 F:1191
## 1st Qu.: 20644 Yes: 476 1st Qu.: 0 Yes:1053 M: 961
## Median : 43612 Median :115248
## Mean : 50398 Mean :115828
## 3rd Qu.: 69467 3rd Qu.:198150
## Max. :320127 Max. :750455
## EDUCATION JOB TRAVTIME CAR_USE
## Bachelors : 522 Length:2152 Min. : 5.00 Commercial:1047
## High School:1178 Class :character 1st Qu.:24.00 Private :1105
## Masters : 327 Mode :character Median :34.00
## PhD : 125 Mean :34.77
## 3rd Qu.:45.00
## Max. :97.00
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM
## Min. : 1500 Min. : 1.000 Minivan :349 No :1536 Min. : 0
## 1st Qu.: 7758 1st Qu.: 1.000 Panel Truck:178 Yes: 616 1st Qu.: 0
## Median :12590 Median : 4.000 Pickup :442 Median : 2432
## Mean :14255 Mean : 4.781 Sports Car :304 Mean : 6037
## 3rd Qu.:19218 3rd Qu.: 7.000 SUV :678 3rd Qu.: 6905
## Max. :62240 Max. :21.000 Van :201 Max. :57037
## CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 0: 898 No :1710 0:1948 Min. : 0.000 Highly Rural/ Rural: 115
## 1:1254 Yes: 442 1: 204 1st Qu.: 1.000 Highly Urban/ Urban:2037
## Median : 7.000
## Mean : 7.366
## 3rd Qu.:11.000
## Max. :25.000
complete_ins_training_amt %>% select(TARGET_AMT, CAR_AGE, TRAVTIME, TIF) %>%
pivot_longer(cols = everything(), names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Value)) + geom_histogram(bins = 30) +
facet_wrap(~ Feature, nrow = 2, ncol = 2, scales = "free") + labs(title = "Numeric Variables")
complete_ins_training_amt <- complete_ins_training_amt %>%
mutate(HOME_VAL = as_factor(ifelse(HOME_VAL == 0, 0, 1)),
INCOME = as_factor(ifelse(INCOME == 0, 0, 1)),
YOJ = as_factor(ifelse(YOJ == 0, 0, 1)),
OLDCLAIM = as_factor(ifelse(OLDCLAIM == 0, 0, 1)))
factor_column_names <- names(Filter(is.factor, complete_ins_training_amt))
# Split data
train_test_split_amt <- initial_split(complete_ins_training_amt, prop = 0.7)
# Assign data
train_amt <- training(train_test_split_amt)
test_amt <- testing(train_test_split_amt)
# Get factor column names
factor_column_names <- names(Filter(is.factor, train_amt))
# Convert factor variables to dummy variables (TARGET_AMT is numeric and passes through)
x_train_amt <- dummy_cols(train_amt, select_columns = factor_column_names, remove_first_dummy = TRUE)
x_train_amt <- x_train_amt %>% select(where(is.numeric), -TARGET_AMT)
# Center and scale the remaining continuous predictors
# (cols_to_scale[-c(2,3,4,8)] drops YOJ, INCOME, HOME_VAL, OLDCLAIM, now binary factors)
x_train_amt_scaled <- x_train_amt %>% mutate(across(all_of(cols_to_scale[-c(2,3,4,8)]), scale))
# Extract response
y_train_amt <- train_amt$TARGET_AMT
# Merge training data
training_data_amt <- data.frame(TARGET_AMT = y_train_amt, x_train_amt_scaled)
# Convert factor variables to dummy variables (TARGET_AMT is numeric and passes through)
x_test_amt <- dummy_cols(test_amt, select_columns = factor_column_names, remove_first_dummy = TRUE)
# Extract only predictors
x_test_amt <- x_test_amt %>% select(where(is.numeric), -TARGET_AMT)
# Center and scale predictors using the same reduced scaling list
x_test_amt_scaled <- x_test_amt %>% mutate(across(all_of(cols_to_scale[-c(2,3,4,8)]), scale))
# Target
y_test_amt <- test_amt$TARGET_AMT
testing_data_amt <- data.frame(TARGET_AMT = y_test_amt, x_test_amt_scaled)
linreg1 <- lm(TARGET_AMT ~., data = training_data_amt)
summary(linreg1)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = training_data_amt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8466 -3095 -1416 501 76992
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4594.9852 1283.7823 3.579 0.000356 ***
## AGE -204.3023 238.4897 -0.857 0.391777
## TRAVTIME -26.3616 191.6791 -0.138 0.890631
## BLUEBOOK 1167.9566 283.2715 4.123 3.95e-05 ***
## TIF 88.4180 190.2837 0.465 0.642241
## CAR_AGE -441.8912 271.9784 -1.625 0.104434
## KIDSDRIV_1 -87.1250 606.9012 -0.144 0.885870
## HOMEKIDS_1 315.3027 661.7984 0.476 0.633836
## YOJ_1 319.6359 1206.7342 0.265 0.791141
## INCOME_1 -212.6022 1202.7908 -0.177 0.859723
## PARENT1_Yes -553.4415 764.4337 -0.724 0.469187
## HOME_VAL_1 -14.5105 497.0517 -0.029 0.976714
## MSTATUS_Yes -790.9191 598.0092 -1.323 0.186178
## SEX_M 1814.4078 738.3480 2.457 0.014109 *
## EDUCATION_High.School -174.2204 520.9241 -0.334 0.738091
## EDUCATION_Masters 209.1279 694.4336 0.301 0.763343
## EDUCATION_PhD -6.0141 941.8588 -0.006 0.994906
## CAR_USE_Private 0.8156 463.2676 0.002 0.998596
## CAR_TYPE_Panel.Truck -367.8091 1043.1761 -0.353 0.724449
## CAR_TYPE_Pickup 39.2817 661.4748 0.059 0.952653
## CAR_TYPE_Sports.Car 1426.5912 867.2890 1.645 0.100206
## CAR_TYPE_SUV 1059.7337 778.8413 1.361 0.173831
## CAR_TYPE_Van -539.8711 835.9387 -0.646 0.518492
## RED_CAR_Yes -154.2096 556.1704 -0.277 0.781611
## OLDCLAIM_1 -466.5132 1534.7076 -0.304 0.761190
## CLM_FREQ_1 927.0864 1539.9717 0.602 0.547256
## REVOKED_Yes -439.0160 466.4877 -0.941 0.346803
## MVR_PTS_1 -586.4647 665.7269 -0.881 0.378494
## URBANICITY_Highly.Urban..Urban 31.7524 866.7877 0.037 0.970783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7316 on 1477 degrees of freedom
## Multiple R-squared: 0.02592, Adjusted R-squared: 0.007455
## F-statistic: 1.404 on 28 and 1477 DF, p-value: 0.07884
plot(linreg1)
# Drop outlier rows 641, 537, 1466 (identified from the diagnostics above),
# then convert factor variables to dummy variables
x_train_amt <- dummy_cols(train_amt[-c(641, 537, 1466),], select_columns = factor_column_names, remove_first_dummy = TRUE)
x_train_amt <- x_train_amt %>% select(where(is.numeric), -TARGET_AMT)
# Center and scale predictors using the same reduced scaling list
x_train_amt_scaled <- x_train_amt %>% mutate(across(all_of(cols_to_scale[-c(2,3,4,8)]), scale))
# Extract response
y_train_amt <- train_amt$TARGET_AMT
y_train_amt <- y_train_amt[-c(641,537, 1466)]
# Merge training data
training_data_amt <- data.frame(TARGET_AMT = y_train_amt, x_train_amt_scaled)
# Re-fit data without outliers
linreg2 <- lm(TARGET_AMT ~., data = training_data_amt)
summary(linreg2)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = training_data_amt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8468 -3092 -1421 497 76991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4586.934 1285.817 3.567 0.000372 ***
## AGE -202.916 239.004 -0.849 0.396016
## TRAVTIME -29.473 192.071 -0.153 0.878064
## BLUEBOOK 1171.089 283.676 4.128 3.86e-05 ***
## TIF 86.669 190.685 0.455 0.649526
## CAR_AGE -446.090 272.954 -1.634 0.102409
## KIDSDRIV_1 -110.524 609.047 -0.181 0.856024
## HOMEKIDS_1 321.633 663.006 0.485 0.627668
## YOJ_1 334.599 1208.155 0.277 0.781857
## INCOME_1 -210.536 1203.985 -0.175 0.861209
## PARENT1_Yes -547.916 766.071 -0.715 0.474581
## HOME_VAL_1 -1.941 498.087 -0.004 0.996891
## MSTATUS_Yes -797.354 598.656 -1.332 0.183097
## SEX_M 1824.498 739.359 2.468 0.013712 *
## EDUCATION_High.School -179.899 522.038 -0.345 0.730437
## EDUCATION_Masters 213.628 695.308 0.307 0.758702
## EDUCATION_PhD -2.570 942.816 -0.003 0.997825
## CAR_USE_Private -4.837 463.847 -0.010 0.991681
## CAR_TYPE_Panel.Truck -381.542 1044.640 -0.365 0.714987
## CAR_TYPE_Pickup 42.561 662.311 0.064 0.948771
## CAR_TYPE_Sports.Car 1430.964 868.549 1.648 0.099661 .
## CAR_TYPE_SUV 1055.597 779.612 1.354 0.175943
## CAR_TYPE_Van -549.229 836.936 -0.656 0.511773
## RED_CAR_Yes -161.292 557.338 -0.289 0.772318
## OLDCLAIM_1 -466.700 1536.097 -0.304 0.761307
## CLM_FREQ_1 929.113 1541.509 0.603 0.546781
## REVOKED_Yes -438.753 467.498 -0.939 0.348135
## MVR_PTS_1 -585.389 666.381 -0.878 0.379837
## URBANICITY_Highly.Urban..Urban 24.534 867.829 0.028 0.977450
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7322 on 1474 degrees of freedom
## Multiple R-squared: 0.02602, Adjusted R-squared: 0.007516
## F-statistic: 1.406 on 28 and 1474 DF, p-value: 0.07777
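Although the held-out split built above is not scored in the original output, a quick sketch of linreg2's out-of-sample error would look like this:
# Sketch: RMSE on the held-out set assembled earlier
test_pred <- predict(linreg2, testing_data_amt)
sqrt(mean((testing_data_amt$TARGET_AMT - test_pred)^2))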
After attempting to improve the second linear regression model by log-transforming the response, with no improvement in the adjusted R-squared, I decided to use this model despite its extremely poor performance (adjusted R-squared of roughly 0.008). A multiple linear regression approach may be inappropriate for this data set.
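For reference, the attempted transformation resembled the following sketch (whether log() or log1p() was used originally is an assumption; log1p() guards against zero-valued rows):
# Sketch of the attempted log transformation of the response
linreg_log <- lm(log1p(TARGET_AMT) ~ ., data = training_data_amt)
summary(linreg_log)$adj.r.squared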
Review Summary of Data Processing
Data from a random iteration of MICE were selected
Discrete variables were binned and turned into binary variables (sketched as a reusable helper below)
The data were split into training and testing sets
Dummy variables were created for all categorical predictors
The SMOTE technique was used to address the class imbalance in the training set
Predictors were centered and scaled
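Because the same binning must be reapplied to the evaluation data below, a small helper could reduce duplication; a sketch (bin_binary is a hypothetical name, mirroring the recoding applied to the training set):
# Sketch: bundle the repeated binary-binning step into one helper
bin_binary <- function(df) {
  df %>% mutate(CLM_FREQ = as_factor(ifelse(CLM_FREQ == 0, 0, 1)),
                HOMEKIDS = as_factor(ifelse(HOMEKIDS == 0, 0, 1)),
                KIDSDRIV = as_factor(ifelse(KIDSDRIV == 0, 0, 1)),
                MVR_PTS = as_factor(ifelse(MVR_PTS <= 6, 0, 1)))
}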
# Load data
ins_eval <- read.csv("insurance-evaluation-data.csv")
# Drop index
ins_eval <- ins_eval %>% select(-INDEX)
# Clean variables
## Categorical
ins_eval <- ins_eval %>% mutate(
PARENT1 = clean_cat(PARENT1),
MSTATUS = clean_cat(MSTATUS),
SEX = clean_cat(SEX),
EDUCATION = clean_cat(EDUCATION),
JOB = clean_cat(JOB),
CAR_USE = clean_cat(CAR_USE),
CAR_TYPE = clean_cat(CAR_TYPE),
REVOKED = clean_cat(REVOKED)
)
ins_eval$URBANICITY <- str_extract_all(ins_eval$URBANICITY, "[A-Z][a-z]*.*\\w*", simplify = T)[,1]
# Numeric
ins_eval <- ins_eval %>% mutate(
INCOME = clean_num(INCOME),
HOME_VAL = clean_num(HOME_VAL),
BLUEBOOK = clean_num(BLUEBOOK),
OLDCLAIM = clean_num(OLDCLAIM)
)
skim(ins_eval)
| Name | ins_eval |
| Number of rows | 2141 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| logical | 2 |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| PARENT1 | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| MSTATUS | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| SEX | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| EDUCATION | 0 | 1 | 3 | 11 | 0 | 4 | 0 |
| JOB | 0 | 1 | 0 | 12 | 139 | 9 | 0 |
| CAR_USE | 0 | 1 | 7 | 10 | 0 | 2 | 0 |
| CAR_TYPE | 0 | 1 | 3 | 11 | 0 | 6 | 0 |
| RED_CAR | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| REVOKED | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| URBANICITY | 0 | 1 | 19 | 19 | 0 | 2 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| TARGET_FLAG | 2141 | 0 | NaN | : |
| TARGET_AMT | 2141 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| KIDSDRIV | 0 | 1.00 | 0.16 | 0.49 | 0 | 0 | 0 | 0 | 3 | ▇▁▁▁▁ |
| AGE | 1 | 1.00 | 45.02 | 8.53 | 17 | 39 | 45 | 51 | 73 | ▁▅▇▅▁ |
| HOMEKIDS | 0 | 1.00 | 0.72 | 1.12 | 0 | 0 | 0 | 1 | 5 | ▇▂▁▁▁ |
| YOJ | 94 | 0.96 | 10.38 | 4.17 | 0 | 9 | 11 | 13 | 19 | ▂▂▆▇▁ |
| INCOME | 0 | 1.00 | 56802.02 | 47754.09 | 0 | 20766 | 48317 | 82918 | 291182 | ▇▅▁▁▁ |
| HOME_VAL | 0 | 1.00 | 145274.11 | 130553.95 | 0 | 0 | 148853 | 231739 | 669271 | ▇▆▂▁▁ |
| TRAVTIME | 0 | 1.00 | 33.15 | 15.72 | 5 | 22 | 33 | 43 | 105 | ▆▇▃▁▁ |
| BLUEBOOK | 0 | 1.00 | 15469.43 | 8462.37 | 1500 | 8870 | 14170 | 21050 | 49940 | ▇▇▅▁▁ |
| TIF | 0 | 1.00 | 5.24 | 3.97 | 1 | 1 | 4 | 7 | 25 | ▇▆▁▁▁ |
| OLDCLAIM | 0 | 1.00 | 4011.13 | 8570.01 | 0 | 0 | 0 | 4718 | 54399 | ▇▁▁▁▁ |
| CLM_FREQ | 0 | 1.00 | 0.81 | 1.14 | 0 | 0 | 0 | 2 | 5 | ▇▂▁▁▁ |
| MVR_PTS | 0 | 1.00 | 1.77 | 2.20 | 0 | 0 | 1 | 3 | 12 | ▇▂▁▁▁ |
| CAR_AGE | 129 | 0.94 | 8.18 | 5.77 | 0 | 1 | 8 | 12 | 26 | ▇▇▆▂▁ |
ins_eval <- ins_eval %>% mutate(JOB = if_else(is.na(JOB) & !is.na(INCOME), "Other", JOB))
# Extract the strategy
pred <- ins_training_imp$predictorMatrix
meth <- ins_training_imp$method
post <- ins_training_imp$post
#Apply same strategy to test data
eval_imp <- mice(ins_eval,
m = 1,
maxit = 1,
predictorMatrix = pred,
method = meth,
post = post)
##
## iter imp variable
## 1 1 AGE YOJ CAR_AGE
## Warning: Number of logged events: 11
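Because only a single imputation is drawn here, fixing mice()'s seed argument would make the evaluation predictions reproducible; a sketch of the same call (not run here):
# Sketch: same imputation call with a fixed seed for reproducibility
# eval_imp <- mice(ins_eval, m = 1, maxit = 1, predictorMatrix = pred,
#                  method = meth, post = post, seed = 214)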
eval_complete <- complete(eval_imp, action = "long") %>% filter(.imp == 1)
head(eval_complete)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 NA NA 0 48 0 11 52881 No 0
## 2 NA NA 1 40 1 11 50815 Yes 0
## 3 NA NA 0 44 2 12 43486 Yes 0
## 4 NA NA 0 35 2 14 21204 Yes 0
## 5 NA NA 0 59 0 12 87460 No 0
## 6 NA NA 0 46 0 14 0 No 207519
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 1 No M Bachelors Manager 26 Private 21970 1
## 2 No M High School Manager 21 Private 18930 6
## 3 No F High School Blue Collar 30 Commercial 5900 10
## 4 No M High School Clerical 74 Private 9230 6
## 5 No M High School Manager 45 Private 15420 1
## 6 Yes M Bachelors Professional 7 Commercial 25660 1
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 Van yes 0 0 No 2 10
## 2 Minivan no 3295 1 No 2 1
## 3 SUV no 0 0 No 0 10
## 4 Pickup no 0 0 Yes 0 4
## 5 Minivan yes 44857 2 No 4 1
## 6 Panel Truck no 2119 1 No 2 12
## URBANICITY .imp .id
## 1 Highly Urban/ Urban 1 1
## 2 Highly Urban/ Urban 1 2
## 3 Highly Rural/ Rural 1 3
## 4 Highly Rural/ Rural 1 4
## 5 Highly Urban/ Urban 1 5
## 6 Highly Urban/ Urban 1 6
eval_complete <- eval_complete %>% select(-c(TARGET_AMT, TARGET_FLAG, .imp, .id))
eval_complete <- eval_complete %>% mutate(CLM_FREQ = as_factor(ifelse(CLM_FREQ == 0, 0, 1)),
HOMEKIDS = as_factor(ifelse(HOMEKIDS == 0, 0, 1)),
KIDSDRIV = as_factor(ifelse(KIDSDRIV == 0, 0, 1)),
MVR_PTS = as_factor(ifelse(MVR_PTS <= 6, 0, 1)))
eval_complete <- eval_complete %>% mutate(across(where(is.character), as.factor))
factor_column_names <- names(Filter(is.factor, eval_complete))
eval_complete_logreg <- dummy_cols(eval_complete, select_columns = factor_column_names, remove_first_dummy = TRUE)
# Center and scale predictors
cols_to_scale <- c("AGE", "YOJ", "INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "OLDCLAIM", "CAR_AGE")
eval_complete_logreg <- eval_complete_logreg %>% mutate(across(all_of(cols_to_scale), scale))
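One caveat: scale() standardizes the evaluation predictors with their own means and standard deviations rather than the training set's. A sketch of reusing the training statistics instead, where x_train_raw is a hypothetical unscaled copy of the training predictors:
# Sketch: standardize the evaluation data on the training scale
train_centers <- sapply(x_train_raw[cols_to_scale], mean)
train_sds <- sapply(x_train_raw[cols_to_scale], sd)
eval_complete_logreg[cols_to_scale] <- Map(function(col, m, s) (col - m) / s,
                                           eval_complete_logreg[cols_to_scale],
                                           train_centers, train_sds)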
terms_table$Terms
## [1] "(Intercept)" "INCOME"
## [3] "HOME_VAL" "TRAVTIME"
## [5] "BLUEBOOK" "TIF"
## [7] "OLDCLAIM" "KIDSDRIV_1"
## [9] "HOMEKIDS_1" "PARENT1_Yes"
## [11] "MSTATUS_Yes" "SEX_M"
## [13] "EDUCATION_High.School" "CAR_USE_Private"
## [15] "CAR_TYPE_Panel.Truck" "CAR_TYPE_Pickup"
## [17] "CAR_TYPE_Sports.Car" "CAR_TYPE_SUV"
## [19] "CAR_TYPE_Van" "CLM_FREQ_1"
## [21] "REVOKED_Yes" "MVR_PTS_1"
## [23] "URBANICITY_Highly.Urban..Urban"
# Columns from eval_complete_scaled
eval_complete_logreg <- eval_complete_logreg %>% select(where(is.numeric))
eval_complete_logreg <- eval_complete_logreg %>%
  rename(EDUCATION_High.School = `EDUCATION_High School`,
         CAR_TYPE_Panel.Truck = `CAR_TYPE_Panel Truck`,
         CAR_TYPE_Sports.Car = `CAR_TYPE_Sports Car`,
         URBANICITY_Highly.Urban..Urban = `URBANICITY_Highly Urban/ Urban`)
set.seed(214)
eval_complete_logreg <- eval_complete_logreg %>% select(terms_table$Terms[-1])
# Generate predictions using testing data
pred_eval_probs <- predict(logreg3, eval_complete_logreg, type = 'response')
# Get the predicted labels
pred_eval <- ifelse(pred_eval_probs >= 0.5, 1, 0)
ins_eval$pred_probs <- pred_eval_probs
ins_eval$TARGET_FLAG <- pred_eval
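As a quick sanity check on the 0.5 cutoff (a sketch, not part of the original output), the predicted probabilities and class balance can be inspected:
# Sketch: distribution of predicted probabilities and labels
summary(pred_eval_probs)
table(pred_eval)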
names(linreg2$coefficients)
## [1] "(Intercept)" "AGE"
## [3] "TRAVTIME" "BLUEBOOK"
## [5] "TIF" "CAR_AGE"
## [7] "KIDSDRIV_1" "HOMEKIDS_1"
## [9] "YOJ_1" "INCOME_1"
## [11] "PARENT1_Yes" "HOME_VAL_1"
## [13] "MSTATUS_Yes" "SEX_M"
## [15] "EDUCATION_High.School" "EDUCATION_Masters"
## [17] "EDUCATION_PhD" "CAR_USE_Private"
## [19] "CAR_TYPE_Panel.Truck" "CAR_TYPE_Pickup"
## [21] "CAR_TYPE_Sports.Car" "CAR_TYPE_SUV"
## [23] "CAR_TYPE_Van" "RED_CAR_Yes"
## [25] "OLDCLAIM_1" "CLM_FREQ_1"
## [27] "REVOKED_Yes" "MVR_PTS_1"
## [29] "URBANICITY_Highly.Urban..Urban"
# Binarize the remaining predictors that enter linreg2 as factors
eval_complete_lin <- eval_complete %>%
mutate(HOME_VAL = as_factor(ifelse(HOME_VAL == 0, 0, 1)),
INCOME = as_factor(ifelse(INCOME == 0, 0, 1)),
YOJ = as_factor(ifelse(YOJ == 0, 0, 1)),
OLDCLAIM = as_factor(ifelse(OLDCLAIM == 0, 0, 1)))
# Assign the predicted crash labels to the evaluation data
eval_complete_lin$TARGET_FLAG <- pred_eval
eval_complete_lin <- eval_complete_lin %>% filter(TARGET_FLAG == 1)
eval_complete_lin <- eval_complete_lin %>% select(-TARGET_FLAG)
factor_column_names <- names(Filter(is.factor, eval_complete_lin))
eval_complete_lin <- dummy_cols(eval_complete_lin, select_columns = factor_column_names, remove_first_dummy = TRUE)
cols_to_scale <- c("AGE", "TRAVTIME", "BLUEBOOK", "TIF", "CAR_AGE")
eval_complete_lin <- eval_complete_lin %>% mutate(across(all_of(cols_to_scale), scale))
eval_complete_lin <- eval_complete_lin %>% select(where(is.numeric))
colnames(eval_complete_lin)
## [1] "AGE" "TRAVTIME"
## [3] "BLUEBOOK" "TIF"
## [5] "CAR_AGE" "KIDSDRIV_1"
## [7] "HOMEKIDS_1" "YOJ_1"
## [9] "INCOME_1" "PARENT1_Yes"
## [11] "HOME_VAL_1" "MSTATUS_Yes"
## [13] "SEX_M" "EDUCATION_High School"
## [15] "EDUCATION_Masters" "EDUCATION_PhD"
## [17] "JOB_Blue Collar" "JOB_Clerical"
## [19] "JOB_Doctor" "JOB_Home Maker"
## [21] "JOB_Lawyer" "JOB_Manager"
## [23] "JOB_Professional" "JOB_Student"
## [25] "CAR_USE_Private" "CAR_TYPE_Panel Truck"
## [27] "CAR_TYPE_Pickup" "CAR_TYPE_Sports Car"
## [29] "CAR_TYPE_SUV" "CAR_TYPE_Van"
## [31] "RED_CAR_yes" "OLDCLAIM_1"
## [33] "CLM_FREQ_1" "REVOKED_Yes"
## [35] "MVR_PTS_1" "URBANICITY_Highly Urban/ Urban"
eval_complete_lin <- eval_complete_lin %>% rename(EDUCATION_High.School = `EDUCATION_High School`,
JOB_Home.Maker = `JOB_Home Maker`,
CAR_TYPE_Panel.Truck = `CAR_TYPE_Panel Truck`,
CAR_TYPE_Sports.Car = `CAR_TYPE_Sports Car`,
URBANICITY_Highly.Urban..Urban = `URBANICITY_Highly Urban/ Urban`,
RED_CAR_Yes = RED_CAR_yes)
# Predictor columns now match the linreg2 coefficient names;
# any extra columns in the new data are ignored by predict()
# eval_complete_lin %>% select(names(linreg2$coefficients[-1]))
set.seed(208)
eval_pred <- predict(linreg2, data.frame(eval_complete_lin))
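For reference, predict() for lm objects can also return interval estimates; a sketch (not part of the original submission) of prediction intervals for the estimated crash costs:
# Sketch: point predictions with 95% prediction intervals
eval_pred_int <- predict(linreg2, data.frame(eval_complete_lin), interval = "prediction")
head(eval_pred_int)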
ins_eval_amt <- ins_eval %>% filter(TARGET_FLAG == 1)
ins_eval_amt$TARGET_AMT <- eval_pred
write.csv(ins_eval_amt, "EVAL_TARGET_AMT_pred.csv", row.names = FALSE)
write.csv(ins_eval, "EVAL_TARGET_FLAG_pred.csv", row.names = FALSE)