In this homework assignment, you will explore, analyze, and model a data set containing approximately 8,000 records, each representing a customer of an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0.
A “1” means that the person was in a car crash.
A “0” means that the person was not in a car crash.
The second response variable is TARGET_AMT. This value is zero if the person did not crash their car. But if they did crash their car, this number will be a value greater than zero.
Your objective is to build two models:
A multiple linear regression model to predict the amount of money it will cost if the person does crash their car
A binary logistic regression model on the training data to predict the probability that a person will crash their car
You may only use the variables given to you.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
library(skimr)
library(ggpubr)
ins_training <- read_csv("insurance_training_data.csv")
## Rows: 8161 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): INCOME, PARENT1, HOME_VAL, MSTATUS, SEX, EDUCATION, JOB, CAR_USE, ...
## dbl (12): INDEX, TARGET_FLAG, TARGET_AMT, KIDSDRIV, AGE, HOMEKIDS, YOJ, TRAV...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove INDEX variable
ins_training <- ins_training %>% select(-INDEX)
head(ins_training, 10)
## # A tibble: 10 × 25
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 0 0 0 60 0 11 $67,349 No $0
## 2 0 0 0 43 0 11 $91,449 No $257,252
## 3 0 0 0 35 1 10 $16,039 No $124,191
## 4 0 0 0 51 0 14 <NA> No $306,251
## 5 0 0 0 50 0 NA $114,9… No $243,925
## 6 1 2946 0 34 1 12 $125,3… Yes $0
## 7 0 0 0 54 0 NA $18,755 No <NA>
## 8 1 4021 1 37 2 NA $107,9… No $333,680
## 9 1 2501 0 34 0 10 $62,978 No $0
## 10 0 0 0 50 0 7 $106,9… No $0
## # ℹ 16 more variables: MSTATUS <chr>, SEX <chr>, EDUCATION <chr>, JOB <chr>,
## # TRAVTIME <dbl>, CAR_USE <chr>, BLUEBOOK <chr>, TIF <dbl>, CAR_TYPE <chr>,
## # RED_CAR <chr>, OLDCLAIM <chr>, CLM_FREQ <dbl>, REVOKED <chr>,
## # MVR_PTS <dbl>, CAR_AGE <dbl>, URBANICITY <chr>
Skimming the first few rows of the data, several variables need to be cleaned before generating summary statistics. Columns that represent monetary values are stored as character strings with symbols ("$" and ",") that need to be removed before converting to numeric. Several categorical variables also carry extra characters, disrupting uniformity across categories.
Numeric data types
clean_num <- function(x){
  # Extract the number from a currency string and convert to numeric
  # Pull out digit groups of the form "123,456"
  v <- str_extract_all(x, "\\d*,\\d*", simplify = TRUE)
  # Remove the comma separator
  clean_v <- str_replace(v, "\\,", "")
  # Replace empty strings with 0
  new_v <- replace(clean_v, clean_v == "", "0")
  as.numeric(new_v)
}
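As an aside, the comma-based regex above maps any value without a thousands separator (e.g. "$980") to 0. A more forgiving alternative (a sketch, not used in this report) is readr::parse_number(), which strips the "$" and "," directly:
# Alternative cleaner (sketch): handles values with or without commas
parse_num <- function(x) readr::parse_number(x)
parse_num(c("$67,349", "$980", "$0"))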
ins_training <- ins_training %>% mutate(
INCOME = clean_num(INCOME),
HOME_VAL = clean_num(HOME_VAL),
BLUEBOOK = clean_num(BLUEBOOK),
OLDCLAIM = clean_num(OLDCLAIM)
)
head(ins_training, 3)
## # A tibble: 3 × 25
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## # ℹ 16 more variables: MSTATUS <chr>, SEX <chr>, EDUCATION <chr>, JOB <chr>,
## # TRAVTIME <dbl>, CAR_USE <chr>, BLUEBOOK <dbl>, TIF <dbl>, CAR_TYPE <chr>,
## # RED_CAR <chr>, OLDCLAIM <dbl>, CLM_FREQ <dbl>, REVOKED <chr>,
## # MVR_PTS <dbl>, CAR_AGE <dbl>, URBANICITY <chr>
Categorical data types
clean_cat <- function(x){
# Create a function that extracts only the necessary portion of the string
# Extract string
cleaned_x <- str_extract_all(x, "[A-Z][a-z]*\\s?\\w*", simplify = T)
cleaned_x[,1]
}
clean_cat(ins_training$SEX)[1:5]
## [1] "M" "M" "F" "M" "F"
ins_training <- ins_training %>% mutate(
PARENT1 = clean_cat(PARENT1),
MSTATUS = clean_cat(MSTATUS),
SEX = clean_cat(SEX),
EDUCATION = clean_cat(EDUCATION),
JOB = clean_cat(JOB),
CAR_USE = clean_cat(CAR_USE),
CAR_TYPE = clean_cat(CAR_TYPE),
REVOKED = clean_cat(REVOKED)
)
head(ins_training, 3)
## # A tibble: 3 × 25
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## # ℹ 16 more variables: MSTATUS <chr>, SEX <chr>, EDUCATION <chr>, JOB <chr>,
## # TRAVTIME <dbl>, CAR_USE <chr>, BLUEBOOK <dbl>, TIF <dbl>, CAR_TYPE <chr>,
## # RED_CAR <chr>, OLDCLAIM <dbl>, CLM_FREQ <dbl>, REVOKED <chr>,
## # MVR_PTS <dbl>, CAR_AGE <dbl>, URBANICITY <chr>
# Convert RED_CAR to caps so it matches other columns
ins_training$RED_CAR <- str_to_title(ins_training$RED_CAR)
ins_training$RED_CAR[1:3]
## [1] "Yes" "Yes" "No"
unique(ins_training$URBANICITY)
## [1] "Highly Urban/ Urban" "z_Highly Rural/ Rural"
ins_training$URBANICITY <- str_extract_all(ins_training$URBANICITY, "[A-Z][a-z]*.*\\w*", simplify = T)[,1]
unique(ins_training$URBANICITY)
## [1] "Highly Urban/ Urban" "Highly Rural/ Rural"
Now that all columns are cleaned, run skim() to get summary statistics and missing value/completeness rates.
ins_training <- ins_training %>% mutate(across(where(is.character), as.factor))
ins_training$TARGET_FLAG <- as.factor(ins_training$TARGET_FLAG)
skim_summary <- skim(ins_training)
skim_summary
| Name | ins_training |
| Number of rows | 8161 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| factor | 11 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| TARGET_FLAG | 0 | 1.00 | FALSE | 2 | 0: 6008, 1: 2153 |
| PARENT1 | 0 | 1.00 | FALSE | 2 | No: 7084, Yes: 1077 |
| MSTATUS | 0 | 1.00 | FALSE | 2 | Yes: 4894, No: 3267 |
| SEX | 0 | 1.00 | FALSE | 2 | F: 4375, M: 3786 |
| EDUCATION | 0 | 1.00 | FALSE | 4 | Hig: 3533, Bac: 2242, Mas: 1658, PhD: 728 |
| JOB | 526 | 0.94 | FALSE | 8 | Blu: 1825, Cle: 1271, Pro: 1117, Man: 988 |
| CAR_USE | 0 | 1.00 | FALSE | 2 | Pri: 5132, Com: 3029 |
| CAR_TYPE | 0 | 1.00 | FALSE | 6 | SUV: 2294, Min: 2145, Pic: 1389, Spo: 907 |
| RED_CAR | 0 | 1.00 | FALSE | 2 | No: 5783, Yes: 2378 |
| REVOKED | 0 | 1.00 | FALSE | 2 | No: 7161, Yes: 1000 |
| URBANICITY | 0 | 1.00 | FALSE | 2 | Hig: 6492, Hig: 1669 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TARGET_AMT | 0 | 1.00 | 1504.32 | 4704.03 | 0 | 0 | 0 | 1036 | 107586.1 | ▇▁▁▁▁ |
| KIDSDRIV | 0 | 1.00 | 0.17 | 0.51 | 0 | 0 | 0 | 0 | 4.0 | ▇▁▁▁▁ |
| AGE | 6 | 1.00 | 44.79 | 8.63 | 16 | 39 | 45 | 51 | 81.0 | ▁▆▇▂▁ |
| HOMEKIDS | 0 | 1.00 | 0.72 | 1.12 | 0 | 0 | 0 | 1 | 5.0 | ▇▂▁▁▁ |
| YOJ | 454 | 0.94 | 10.50 | 4.09 | 0 | 9 | 11 | 13 | 23.0 | ▂▃▇▃▁ |
| INCOME | 445 | 0.95 | 61895.23 | 47576.39 | 0 | 28097 | 54028 | 85986 | 367030.0 | ▇▃▁▁▁ |
| HOME_VAL | 464 | 0.94 | 154867.29 | 129123.77 | 0 | 0 | 161160 | 238724 | 885282.0 | ▇▆▁▁▁ |
| TRAVTIME | 0 | 1.00 | 33.49 | 15.91 | 5 | 22 | 33 | 44 | 142.0 | ▇▇▁▁▁ |
| BLUEBOOK | 0 | 1.00 | 15709.90 | 8419.73 | 1500 | 9280 | 14440 | 20850 | 69740.0 | ▇▆▁▁▁ |
| TIF | 0 | 1.00 | 5.35 | 4.15 | 1 | 1 | 4 | 7 | 25.0 | ▇▆▁▁▁ |
| OLDCLAIM | 0 | 1.00 | 4027.79 | 8780.98 | 0 | 0 | 0 | 4636 | 57037.0 | ▇▁▁▁▁ |
| CLM_FREQ | 0 | 1.00 | 0.80 | 1.16 | 0 | 0 | 0 | 2 | 5.0 | ▇▂▁▁▁ |
| MVR_PTS | 0 | 1.00 | 1.70 | 2.15 | 0 | 0 | 1 | 3 | 13.0 | ▇▂▁▁▁ |
| CAR_AGE | 510 | 0.94 | 8.33 | 5.70 | -3 | 1 | 8 | 12 | 28.0 | ▆▇▇▃▁ |
ins_training %>% dplyr::select(where(is.numeric), TARGET_FLAG) %>%
pivot_longer(cols = -TARGET_FLAG, names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Value, fill = as.factor(TARGET_FLAG))) + geom_histogram(bins = 30) +
facet_wrap(~ Feature, nrow = 3, ncol = 5, scales = "free") + labs(title = "Numeric Variables")
Almost all continuous variables are right-skewed, indicating a higher proportion of values toward the lower end of the range. Transforming these variables might be important for more accurate models and will be revisited in a later section.
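As a preview of what such a transformation could look like (a sketch only; nothing is transformed at this stage), log1p handles the zeros present in these columns:
# Sketch: log1p compresses the long right tails of the monetary variables
ins_training %>%
  mutate(across(c(INCOME, HOME_VAL, BLUEBOOK, OLDCLAIM), log1p)) %>%
  select(INCOME, HOME_VAL, BLUEBOOK, OLDCLAIM) %>%
  summary()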
Next, we check for near-zero variance among the discrete predictors with a large majority of 0 values:
CLM_FREQ, HOMEKIDS, KIDSDRIV, MVR_PTS
Creating distribution tables that compare the TARGET_FLAG response across all levels of a discrete variable is a helpful way to evaluate them. We can also bin these into two categories.
ins_training %>% select(TARGET_FLAG, CLM_FREQ) %>% table()
## CLM_FREQ
## TARGET_FLAG 0 1 2 3 4 5
## 0 4111 612 702 462 110 11
## 1 898 385 469 314 80 7
# Group CLM_FREQ values > 0
ins_training %>% select(TARGET_FLAG, CLM_FREQ) %>% mutate(CLM_FREQ = ifelse(CLM_FREQ > 0, 1, 0)) %>% table()
## CLM_FREQ
## TARGET_FLAG 0 1
## 0 4111 1897
## 1 898 1255
ins_training %>% select(TARGET_FLAG, HOMEKIDS) %>% table()
## HOMEKIDS
## TARGET_FLAG 0 1 2 3 4 5
## 0 4116 597 736 444 107 8
## 1 1173 305 382 230 57 6
# Group HOMEKIDS values > 0
ins_training %>% select(TARGET_FLAG, HOMEKIDS) %>% mutate(HOMEKIDS = ifelse(HOMEKIDS > 0, 1, 0)) %>% table()
## HOMEKIDS
## TARGET_FLAG 0 1
## 0 4116 1892
## 1 1173 980
ins_training %>% select(TARGET_FLAG, KIDSDRIV) %>% table()
## KIDSDRIV
## TARGET_FLAG 0 1 2 3 4
## 0 5407 400 168 31 2
## 1 1773 236 111 31 2
# Group KIDSDRIV values > 0
ins_training %>% select(TARGET_FLAG, KIDSDRIV) %>% mutate(KIDSDRIV = ifelse(KIDSDRIV > 0, 1, 0)) %>% table()
## KIDSDRIV
## TARGET_FLAG 0 1
## 0 5407 601
## 1 1773 380
ins_training %>% select(TARGET_FLAG, MVR_PTS) %>% table()
## MVR_PTS
## TARGET_FLAG 0 1 2 3 4 5 6 7 8 9 10 11 13
## 0 2998 886 683 517 394 250 162 73 29 12 2 2 0
## 1 714 271 265 241 205 149 104 94 55 33 11 9 2
# Group MVR_PTS values > 0
ins_training %>% select(TARGET_FLAG, MVR_PTS) %>% mutate(MVR_PTS = ifelse(MVR_PTS > 0, 1, 0)) %>% table()
## MVR_PTS
## TARGET_FLAG 0 1
## 0 2998 3010
## 1 714 1439
CLM_FREQ: Observations with at least 1 claim are more likely to be in a crash.
HOMEKIDS: Observations with at least 1 kid at home are more likely to be in a crash.
KIDSDRIV: Observations with at least 1 kid driving are more likely to be in a crash.
MVR_PTS: As the number of points increases above 6, a person is more likely to be in a crash.
ins_training %>% dplyr::select(where(is.factor)) %>%
pivot_longer(cols = -TARGET_FLAG, names_to = "Feature", values_to = "Value") %>%
ggplot(aes(y = Value, fill = TARGET_FLAG)) + geom_bar(position = "dodge") +
facet_wrap( ~ Feature, nrow = 4, ncol = 3, scales = "free") +
labs(title = "Categorical Variables") +
theme_minimal() +
theme(panel.spacing = unit(0.5, "cm"))
ins_training %>% ggplot(aes(x = TARGET_FLAG, fill = TARGET_FLAG)) + geom_bar() +
labs(title = "Count of Crashes Recorded") + theme_minimal()
The data is imbalanced, and this will need to be addressed when creating models. The minority class, labeled 1, indicates a crash.
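A quick check of the class proportions confirms this:
# Share of each TARGET_FLAG class (~74% no crash vs ~26% crash)
prop.table(table(ins_training$TARGET_FLAG))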
Correlations
ins_training %>%
ggplot(aes(x = TARGET_AMT)) + geom_histogram(binwidth = 1000) +
labs(title = "Distribution of Crash Payments") + theme_minimal()
ins_training %>%
ggplot(aes(y = TARGET_AMT)) + geom_boxplot() +
labs(title = "Distribution of Crash Payments") + theme_minimal()
library(corrplot)
## corrplot 0.95 loaded
# Keep only rows where TARGET_AMT > 0
cor_coef <- cor(ins_training %>% select(where(is.numeric), -TARGET_FLAG) %>%
  drop_na() %>% filter(TARGET_AMT > 0))
corrplot::corrplot(cor_coef, method = "number", title = "Correlations Between Continuous Variables",
                   order = "hclust", number.cex = 0.75, type = "lower")
No variables show strong correlations with the continuous TARGET_AMT. Moderate correlations exist between:
HOMEKIDS and AGE (negative): as AGE increases, the number of kids at home decreases
CAR_AGE and INCOME (positive): as INCOME increases, the age of the person's car increases
HOME_VAL and INCOME (positive): as INCOME increases, the home value increases
BLUEBOOK and INCOME (positive): as INCOME increases, the value of the vehicle increases
Before deciding to perform any transformations, missing values should be dealt with. The visualization above showed six variables with missing values (AGE, HOME_VAL, CAR_AGE, INCOME, JOB, YOJ). JOB is the only categorical variable; the others are continuous.
skim_summary %>%
select(skim_variable, n_missing) %>%
filter(n_missing > 0) %>%
arrange(n_missing) %>%
ggplot(aes(x = skim_variable, y = n_missing)) + geom_bar(stat = "identity") +
labs(title = "Variables with Missing Values", x = "Variable name", y = "# Missing values") +
theme_minimal()
# Build a 0/1 indicator matrix of missing values (vectorized)
na_matrix <- 1 * is.na(ins_training)
visdat::vis_miss(ins_training %>% select(AGE, CAR_AGE, HOME_VAL, INCOME, JOB, YOJ), show_perc = F) + labs(title = "Observations with Missing Data") + theme_minimal()
JOB and INCOME
ins_training %>% filter(is.na(JOB)) %>% ggplot(aes(y = INCOME)) + geom_boxplot() + theme_minimal()
ins_training %>% filter(!is.na(JOB)) %>% ggplot(aes(x = JOB, y = INCOME)) + geom_boxplot() + theme_minimal()
ins_training %>% filter(is.na(JOB)) %>% summarize(p25 = quantile(INCOME, 0.25, na.rm = T),
p50 = quantile(INCOME, 0.5, na.rm = T),
p75 = quantile(INCOME, 0.75, na.rm = T),
IQR = p75 - p25)
## # A tibble: 1 × 4
## p25 p50 p75 IQR
## <dbl> <dbl> <dbl> <dbl>
## 1 73320. 111340. 155511. 82191
Some of the observations with missing data in the JOB variable do list an income. The missing data here might reflect the absence of a suitable option to choose, so the field was left blank. In these cases we could add a level called "Other".
In cases where a job is listed but no income is, we could impute the income using a technique that predicts the value.
In cases where neither income nor job is listed, we could fill in the job with "Unknown", or drop those observations if there are not many of them.
CAR_AGE
Evaluate whether the missing values in the CAR_AGE variable are related to how the vehicle is used.
ins_training %>% filter(is.na(CAR_AGE)) %>% group_by(CAR_USE) %>% summarise(COUNT = n())
## # A tibble: 2 × 2
## CAR_USE COUNT
## <fct> <int>
## 1 Commercial 170
## 2 Private 340
# Preserve original data set by creating a copy
ins_training_copy <- ins_training
head(ins_training_copy)
## # A tibble: 6 × 25
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## 4 0 0 0 51 0 14 NA No 306251
## 5 0 0 0 50 0 NA 114986 No 243925
## 6 1 2946 0 34 1 12 125301 Yes 0
## # ℹ 16 more variables: MSTATUS <fct>, SEX <fct>, EDUCATION <fct>, JOB <fct>,
## # TRAVTIME <dbl>, CAR_USE <fct>, BLUEBOOK <dbl>, TIF <dbl>, CAR_TYPE <fct>,
## # RED_CAR <fct>, OLDCLAIM <dbl>, CLM_FREQ <dbl>, REVOKED <fct>,
## # MVR_PTS <dbl>, CAR_AGE <dbl>, URBANICITY <fct>
1. Drop NA Values from AGE
There are only 6 missing values in this variable, which represents < 0.1% of the data. Of the six observations, 5 are listed as sustaining a crash. This is the minority class, and from a business perspective it will be important to predict the individuals who are more likely to be in a crash; for this reason, these observations will not be dropped.
ins_training %>% filter(is.na(AGE)) %>% group_by(TARGET_FLAG) %>% summarise(COUNT = n())
## # A tibble: 2 × 2
## TARGET_FLAG COUNT
## <fct> <int>
## 1 0 1
## 2 1 5
2. Create a new category 'Other' in JOB for observations where an INCOME is listed
ins_training_copy <- ins_training_copy %>% mutate(JOB = if_else(is.na(JOB) & !is.na(INCOME), "Other", JOB))
sum(is.na(ins_training_copy$JOB))
## [1] 24
This results in only 24 values where both JOB and INCOME are unknown. Since there are far fewer observations where TARGET_FLAG is 1 (~25% of the data), representing a crash, it still might be worth keeping that information and trying to impute the missing values for those observations.
ins_training_copy %>% filter(is.na(JOB) & is.na(INCOME)) %>% group_by(EDUCATION, SEX, CAR_TYPE, CAR_USE, TARGET_FLAG) %>% summarise(COUNT = n())
## `summarise()` has grouped output by 'EDUCATION', 'SEX', 'CAR_TYPE', 'CAR_USE'.
## You can override using the `.groups` argument.
## # A tibble: 11 × 6
## # Groups: EDUCATION, SEX, CAR_TYPE, CAR_USE [8]
## EDUCATION SEX CAR_TYPE CAR_USE TARGET_FLAG COUNT
## <fct> <fct> <fct> <fct> <fct> <int>
## 1 Masters F Pickup Commercial 0 1
## 2 Masters M Panel Truck Commercial 0 5
## 3 Masters M Panel Truck Commercial 1 3
## 4 Masters M Pickup Commercial 0 1
## 5 Masters M Pickup Private 1 1
## 6 Masters M Van Commercial 1 2
## 7 PhD M Panel Truck Commercial 0 4
## 8 PhD M Panel Truck Commercial 1 1
## 9 PhD M Pickup Commercial 0 3
## 10 PhD M Pickup Commercial 1 1
## 11 PhD M Van Commercial 0 2
All observations where JOB and INCOME are both missing (n = 24) have an education level of "Masters" or "PhD"; the majority of the time the vehicle is used for commercial purposes and the individual is male. Of those 24 observations, 8 are recorded as sustaining a crash.
# ins_training_copy %>% filter(!is.na(JOB) | !is.na(INCOME))
At this point ins_training stores its categorical variables as factors, which is the input format missForest expects, so we try it directly.
library(missForest)
# First attempt at random-forest imputation (fails; see error message below)
missForest(ins_training)
Potential issues:
Many of the categorical variables are binary and only have two outcomes.
There are variables with "time" and "status" in their names, which might be interpreted as 'survival' variables.
# Rename
ins_training <- ins_training %>% rename(MARRIED = MSTATUS,
WORK_DIST = TRAVTIME) # new_name = old_name
# Convert categories into binary values (0/1)
ins_training_copy <- ins_training
ins_training_copy[] <- lapply(ins_training_copy, function(x) {
if (is.factor(x) && nlevels(x) == 2) {
# Convert 2-level factor to numeric 0/1
as.numeric(x) - 1
} else {
x
}
})
# Second attempt after the binary conversion (fails with the same error)
missForest(ins_training_copy)
# Converted binary variables into factor data type
binary_vars <- c("TARGET_FLAG", "PARENT1", "MARRIED", "SEX",
"CAR_USE", "RED_CAR", "REVOKED", "URBANICITY")
ins_training_copy[binary_vars] <- lapply(ins_training_copy[binary_vars], factor)
# Third attempt, with the binary variables converted back to factors
missForest(ins_training_copy)
library(dplyr)
# Function to check potentially problematic columns
check_missforest_issues <- function(df) {
problems <- sapply(names(df), function(col_name) {
col <- df[[col_name]]
# Check for unsupported types
type_issue <- !(is.numeric(col) | is.factor(col))
# Check for factor with no levels or only NA
empty_factor <- is.factor(col) & length(levels(col)) == 0
all_na <- all(is.na(col))
any(type_issue, empty_factor, all_na)
})
# Return names of problematic columns
names(problems[problems])
}
# Apply to your dataset
problem_cols <- check_missforest_issues(ins_training_copy)
if(length(problem_cols) == 0) {
cat("No problematic columns detected. missForest should work.\n")
} else {
cat("These columns may cause missForest to fail:\n")
print(problem_cols)
}
The same error message appeared after each of these attempts at modifying the data types:
Error in ranger::ranger(x = obsX, y = obsY, num.trees = ntree, mtry = mtry, : Error: Missing value handling not yet implemented for survival forests.
The next approach for imputing the missing values is MICE (Multivariate Imputation by Chained Equations). This technique creates multiple imputed data sets; analysis is conducted on each and the results are pooled, which helps account for the uncertainty of the imputations.
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
ins_training_imp <- mice(ins_training_copy, m=6, maxit = 10, seed = 123, print = FALSE)
## Warning: Number of logged events: 1
ins_training_imp$method
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## "" "" "" "pmm" "" "pmm"
## INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION
## "pmm" "" "pmm" "" "" ""
## JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## "" "" "" "" "" ""
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## "" "" "" "" "" "pmm"
## URBANICITY
## ""
plot(ins_training_imp)
#stripplot(ins_training_imp)
Looking at how the chains converged and where in the range of observed values the NAs were imputed across iterations, it was decided to use only one of the completed data sets. It will be selected at random and used for modelling.
set.seed(124)
imp_round <- sample(1:ins_training_imp$m, size = 1)
# Retrieve all completed data sets in long format, then keep the randomly chosen one
complete_ins_training_samp1 <- complete(ins_training_imp, action = "long") %>% filter(.imp == imp_round)
head(complete_ins_training_samp1)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## 4 0 0 0 51 0 14 108047 No 306251
## 5 0 0 0 50 0 13 114986 No 243925
## 6 1 2946 0 34 1 12 125301 Yes 0
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 1 No M PhD Professional 14 Private 14230 11
## 2 No M High School Blue Collar 22 Commercial 14940 1
## 3 Yes F High School Clerical 5 Private 4010 4
## 4 Yes M High School Blue Collar 32 Private 15440 7
## 5 Yes F PhD Doctor 36 Private 18000 1
## 6 No F Bachelors Blue Collar 46 Commercial 17430 1
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 Minivan Yes 4461 2 No 3 18
## 2 Minivan Yes 0 0 No 0 1
## 3 SUV No 38690 2 No 3 10
## 4 Minivan Yes 0 0 No 0 6
## 5 SUV No 19217 2 Yes 3 17
## 6 Sports Car No 0 0 No 0 7
## URBANICITY .imp .id
## 1 Highly Urban/ Urban 1 1
## 2 Highly Urban/ Urban 1 2
## 3 Highly Urban/ Urban 1 3
## 4 Highly Urban/ Urban 1 4
## 5 Highly Urban/ Urban 1 5
## 6 Highly Urban/ Urban 1 6
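An equivalent shortcut (not used here) fetches the chosen imputation directly, without the .imp/.id bookkeeping columns that are removed below:
# Sketch: retrieve a single completed data set by its number
complete_alt <- complete(ins_training_imp, action = imp_round)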
For the logistic regression models we will not use the TARGET_AMT variable or the columns identifying the imputation round and row id. Before applying any class-imbalance corrections or other data preparation, the data should be split into training and testing sets.
As discussed earlier, it might be more useful to create binary categories for CLM_FREQ, HOMEKIDS, KIDSDRIV, and MVR_PTS.
complete_ins_training_samp1 <- complete_ins_training_samp1 %>%
mutate(CLM_FREQ = as_factor(ifelse(CLM_FREQ == 0, 0, 1)),
HOMEKIDS = as_factor(ifelse(HOMEKIDS == 0, 0, 1)),
KIDSDRIV = as_factor(ifelse(KIDSDRIV == 0, 0, 1)),
MVR_PTS = as_factor(ifelse(MVR_PTS <= 6, 0, 1)))
complete_ins_training_samp1 <- complete_ins_training_samp1 %>% select(-c(.imp, .id))
skim(complete_ins_training_samp1)
| Name | complete_ins_training_sam… |
| Number of rows | 8161 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 14 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| JOB | 24 | 1 | 5 | 12 | 0 | 9 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| TARGET_FLAG | 0 | 1 | FALSE | 2 | 0: 6008, 1: 2153 |
| KIDSDRIV | 0 | 1 | FALSE | 2 | 0: 7180, 1: 981 |
| HOMEKIDS | 0 | 1 | FALSE | 2 | 0: 5289, 1: 2872 |
| PARENT1 | 0 | 1 | FALSE | 2 | No: 7084, Yes: 1077 |
| MSTATUS | 0 | 1 | FALSE | 2 | Yes: 4894, No: 3267 |
| SEX | 0 | 1 | FALSE | 2 | F: 4375, M: 3786 |
| EDUCATION | 0 | 1 | FALSE | 4 | Hig: 3533, Bac: 2242, Mas: 1658, PhD: 728 |
| CAR_USE | 0 | 1 | FALSE | 2 | Pri: 5132, Com: 3029 |
| CAR_TYPE | 0 | 1 | FALSE | 6 | SUV: 2294, Min: 2145, Pic: 1389, Spo: 907 |
| RED_CAR | 0 | 1 | FALSE | 2 | No: 5783, Yes: 2378 |
| CLM_FREQ | 0 | 1 | FALSE | 2 | 0: 5009, 1: 3152 |
| REVOKED | 0 | 1 | FALSE | 2 | No: 7161, Yes: 1000 |
| MVR_PTS | 0 | 1 | FALSE | 2 | 0: 7839, 1: 322 |
| URBANICITY | 0 | 1 | FALSE | 2 | Hig: 6492, Hig: 1669 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TARGET_AMT | 0 | 1 | 1504.32 | 4704.03 | 0 | 0 | 0 | 1036 | 107586.1 | ▇▁▁▁▁ |
| AGE | 0 | 1 | 44.78 | 8.63 | 16 | 39 | 45 | 51 | 81.0 | ▁▆▇▂▁ |
| YOJ | 0 | 1 | 10.51 | 4.09 | 0 | 9 | 11 | 13 | 23.0 | ▂▃▇▃▁ |
| INCOME | 0 | 1 | 61603.23 | 47429.67 | 0 | 27940 | 53628 | 85479 | 367030.0 | ▇▃▁▁▁ |
| HOME_VAL | 0 | 1 | 155137.97 | 129327.84 | 0 | 0 | 161481 | 238850 | 885282.0 | ▇▆▁▁▁ |
| TRAVTIME | 0 | 1 | 33.49 | 15.91 | 5 | 22 | 33 | 44 | 142.0 | ▇▇▁▁▁ |
| BLUEBOOK | 0 | 1 | 15709.90 | 8419.73 | 1500 | 9280 | 14440 | 20850 | 69740.0 | ▇▆▁▁▁ |
| TIF | 0 | 1 | 5.35 | 4.15 | 1 | 1 | 4 | 7 | 25.0 | ▇▆▁▁▁ |
| OLDCLAIM | 0 | 1 | 4027.79 | 8780.98 | 0 | 0 | 0 | 4636 | 57037.0 | ▇▁▁▁▁ |
| CAR_AGE | 0 | 1 | 8.36 | 5.70 | -3 | 1 | 8 | 12 | 28.0 | ▆▇▇▃▁ |
Inspecting the data set, one value does not make sense: the CAR_AGE variable contains a negative value. Only one observation has a negative car age, so it will be dropped from the data.
complete_ins_training_samp1 %>% filter(CAR_AGE < 0)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 1 1469 0 47 0 12 48696 No 212014
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## 1 No F Bachelors Professional 46 Private 15390 4 Pickup
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 1 No 33521 1 Yes 0 -3 Highly Urban/ Urban
complete_ins_training_samp1 <- complete_ins_training_samp1 %>% filter(CAR_AGE >= 0)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.9 ✔ rsample 1.3.1
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ mice::filter() masks dplyr::filter(), stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
set.seed(125)
# Split data into train and test sets
train_test_split <- initial_split(complete_ins_training_samp1, prop = 0.7)
## TRAIN SET
train_df <- training(train_test_split)
train_df %>% ggplot(aes(x = TARGET_FLAG, fill = TARGET_FLAG)) + geom_bar() +
labs(title = "Count of Crashes Recorded: Train set") + theme_minimal()
## TEST SET
test_df <- testing(train_test_split)
test_df %>% ggplot(aes(x = TARGET_FLAG, fill = TARGET_FLAG)) + geom_bar() +
labs(title = "Count of Crashes Recorded: Test set") + theme_minimal()
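As an aside, rsample's initial_split() also accepts a strata argument; a stratified variant (a sketch, not used in this report) would keep the crash rate identical across the two sets:
# Sketch: stratify the split on the response
strat_split <- initial_split(complete_ins_training_samp1, prop = 0.7, strata = TARGET_FLAG)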
There is a large imbalance between the classes of the response variable TARGET_FLAG, with only ~25% of the observations recorded as a crash. One way to deal with the imbalance is to oversample the minority class using SMOTE (Synthetic Minority Oversampling Technique). Since MICE was used to impute missing values, SMOTE would need to be run on each imputed data frame.
The strategy is to keep the selected imputed data set as is and use it for the multiple linear regression, and to use a loop that applies SMOTE to each imputed data set, builds a model on it, and saves the model to a list (a sketch of this loop follows the library calls below).
library(smotefamily)
library(fastDummies)
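Below is a sketch of that loop (not run here). prep_for_smote() is a hypothetical helper standing in for the dummy-encoding steps shown afterwards, and pooling of the fitted models is left out:
# Sketch: apply SMOTE to every imputed data set and fit a model on each
smote_models <- vector("list", ins_training_imp$m)
for (i in seq_len(ins_training_imp$m)) {
  imp_i <- mice::complete(ins_training_imp, action = i)
  x_i <- prep_for_smote(imp_i)  # hypothetical helper: numeric predictors + TARGET_FLAG
  sm_i <- SMOTE(x_i, x_i$TARGET_FLAG, K = 5, dup_size = 0)
  smote_models[[i]] <- glm(TARGET_FLAG ~ . - class, family = binomial, data = sm_i$data)
}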
# Get factor column names
factor_column_names <- names(Filter(is.factor, train_df))
# Convert factor variables to dummy variables, excluding TARGET_FLAG (the first factor column)
x_train <- dummy_cols(train_df, select_columns = factor_column_names[-1], remove_first_dummy = TRUE)
# Select only the numeric variables + TARGET_FLAG
x_train <- x_train %>% select(where(is.numeric), TARGET_FLAG)
# Convert TARGET_FLAG to numeric for SMOTE
x_train$TARGET_FLAG <- as.numeric(as.character(x_train$TARGET_FLAG))
# Conduct SMOTE on training set
x_train_smote <- SMOTE(x_train, x_train$TARGET_FLAG, K = 5, dup_size = 0)
# Keep only predictors: drop TARGET_AMT, TARGET_FLAG, and the SMOTE class column
x_train_smote_data <- dplyr::select(x_train_smote$data, -c(TARGET_AMT, TARGET_FLAG, class))
# Extract TARGET_FLAG and convert back to factor
y_train <- as_factor(x_train_smote$data[,"TARGET_FLAG"])
## TRAIN SET
x_train_smote$data %>% ggplot(aes(x = factor(TARGET_FLAG), fill = factor(TARGET_FLAG))) + geom_bar() + labs(title = "Count of Crashes Recorded: SMOTE") + theme_minimal()
cols_to_scale <- c("AGE", "YOJ", "INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "OLDCLAIM", "CAR_AGE")
# Center and scale predictors
x_train_scaled <- x_train_smote_data %>% mutate(across(all_of(cols_to_scale), scale))
# Merge training data
training_data <- data.frame(TARGET_FLAG = y_train, x_train_scaled)
# Convert factor variables to dummy variables, excluding TARGET_FLAG
x_test <- dummy_cols(test_df, select_columns = factor_column_names[-1], remove_first_dummy = TRUE)
# Extract only predictors
x_test <- x_test %>% select(where(is.numeric), -TARGET_AMT)
# Center and scale predictors
x_test_scaled <- x_test %>% mutate(across(all_of(cols_to_scale), scale))
# Target
y_test <- test_df$TARGET_FLAG
testing_data <- data.frame(TARGET_FLAG = y_test, x_test_scaled)
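One caveat on the scaling above: the test predictors are standardized with their own means and standard deviations. A stricter convention (a sketch, not applied here) reuses the training-set statistics:
# Sketch: scale the test columns with the TRAINING means/SDs
train_means <- sapply(x_train_smote_data[cols_to_scale], mean)
train_sds <- sapply(x_train_smote_data[cols_to_scale], sd)
x_test_scaled_alt <- x_test
x_test_scaled_alt[cols_to_scale] <- scale(x_test[cols_to_scale], center = train_means, scale = train_sds)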
Summary of Data Processing
Data from a random iteration of MICE was selected
Discrete variables were binned and turned into binary variables
The data was split into training and testing sets
Dummy variables were created for all categorical predictors
SMOTE was used to address the class imbalance in the training set
Predictors were centered and scaled
The results were combined into training_data and testing_data
The first model will be built with the entire predictor space.
logreg1 <- glm(TARGET_FLAG ~ ., family=binomial, data = training_data)
summary(logreg1)
##
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial, data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.33221 0.16688 -19.967 < 2e-16 ***
## AGE -0.04932 0.03569 -1.382 0.16699
## YOJ -0.05783 0.03218 -1.797 0.07233 .
## INCOME -0.20856 0.04871 -4.281 1.86e-05 ***
## HOME_VAL -0.10985 0.04331 -2.536 0.01121 *
## TRAVTIME 0.26365 0.02969 8.879 < 2e-16 ***
## BLUEBOOK -0.19999 0.04339 -4.609 4.04e-06 ***
## TIF -0.30316 0.02986 -10.154 < 2e-16 ***
## OLDCLAIM -0.25465 0.03945 -6.455 1.08e-10 ***
## CAR_AGE 0.01327 0.04219 0.315 0.75312
## KIDSDRIV_1 0.58353 0.10411 5.605 2.08e-08 ***
## HOMEKIDS_1 0.26762 0.10154 2.635 0.00840 **
## PARENT1_Yes 0.36119 0.12615 2.863 0.00420 **
## MSTATUS_Yes -0.57022 0.08973 -6.355 2.08e-10 ***
## SEX_M 0.23969 0.11114 2.157 0.03103 *
## EDUCATION_High.School 0.54971 0.08500 6.468 9.96e-11 ***
## EDUCATION_Masters 0.04699 0.10124 0.464 0.64257
## EDUCATION_PhD -0.03842 0.13865 -0.277 0.78171
## CAR_USE_Private -1.00391 0.07592 -13.224 < 2e-16 ***
## CAR_TYPE_Panel.Truck 0.41712 0.15299 2.726 0.00640 **
## CAR_TYPE_Pickup 0.43268 0.10017 4.319 1.57e-05 ***
## CAR_TYPE_Sports.Car 1.07743 0.13222 8.148 3.69e-16 ***
## CAR_TYPE_SUV 0.85372 0.11139 7.664 1.80e-14 ***
## CAR_TYPE_Van 0.41069 0.12573 3.266 0.00109 **
## RED_CAR_Yes 0.02774 0.08789 0.316 0.75232
## CLM_FREQ_1 0.87056 0.07563 11.511 < 2e-16 ***
## REVOKED_Yes 1.03595 0.09899 10.465 < 2e-16 ***
## MVR_PTS_1 1.25967 0.15969 7.888 3.07e-15 ***
## URBANICITY_Highly.Urban..Urban 2.47616 0.10832 22.860 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9827.1 on 7225 degrees of freedom
## Residual deviance: 7318.3 on 7197 degrees of freedom
## AIC: 7376.3
##
## Number of Fisher Scoring iterations: 5
VIF
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
plot_vif <- function(fitted_model, model_name){
# Calculate VIF
vif_values <- vif(fitted_model)
# Convert VIF results to a data frame for plotting
vif_df <- data.frame(Variable = names(vif_values), VIF = vif_values)
# Set a threshold to indicate high VIF
high_vif_threshold <- 5
# Generate bar plot
ggplot(vif_df, aes(Variable, VIF)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_hline(yintercept = high_vif_threshold, linetype = "dashed", color = "red") +
scale_y_continuous(limits = c(0, max(vif_df$VIF) + 1)) +
labs(title = paste0("Variance Inflation Factor (VIF) for Regression Model - ", model_name),
y = "VIF",
x = "Variable") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
plot_vif(logreg1, "Log Reg 1")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_hline()`).
logreg_metrics <- function(cm){
TN <- cm[1,1]
TP <- cm[2,2]
FN <- cm[2,1]
FP <- cm[1,2]
acc = (TP + TN) / sum(cm)
er = (FP + FN) / sum(cm)
pre = TP / (TP + FP)
sens = TP / (TP + FN)
spec = TN / (TN + FP)
f1 = (2 * pre * sens) / (pre + sens)
metrics <- tibble(Metric = c("Accuracy", "Error_rate", "Precision",
"Sensitivity", "Specificity", "F1_score"),
Value = round(c(acc,er,pre,sens,spec,f1), 3))
metrics
}
# Generate predictions using testing data
pred1_probs <- predict(logreg1, testing_data, type = 'response')
# Get the predicted labels
pred1 <- ifelse(pred1_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm1 <- table(actual = testing_data$TARGET_FLAG, prediction = pred1)
logreg_metrics(cm1)
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.757
## 2 Error_rate 0.243
## 3 Precision 0.528
## 4 Sensitivity 0.658
## 5 Specificity 0.792
## 6 F1_score 0.586
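Because the test classes remain imbalanced, a threshold-free measure such as ROC AUC can complement the table above. A minimal sketch using the pROC package (an assumption; it is not loaded elsewhere in this report):
# Sketch: area under the ROC curve for the full model's test predictions
library(pROC)
roc1 <- roc(response = testing_data$TARGET_FLAG, predictor = pred1_probs)
auc(roc1)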
Feature selection with backwards elimination
backwards_logreg <- stats::step(logreg1, direction = "backward")
## Start: AIC=7376.31
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + CAR_AGE + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + EDUCATION_Masters +
## EDUCATION_PhD + CAR_USE_Private + CAR_TYPE_Panel.Truck +
## CAR_TYPE_Pickup + CAR_TYPE_Sports.Car + CAR_TYPE_SUV + CAR_TYPE_Van +
## RED_CAR_Yes + CLM_FREQ_1 + REVOKED_Yes + MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - EDUCATION_PhD 1 7318.4 7374.4
## - CAR_AGE 1 7318.4 7374.4
## - RED_CAR_Yes 1 7318.4 7374.4
## - EDUCATION_Masters 1 7318.5 7374.5
## - AGE 1 7320.2 7376.2
## <none> 7318.3 7376.3
## - YOJ 1 7321.5 7377.5
## - SEX_M 1 7323.0 7379.0
## - HOME_VAL 1 7324.7 7380.7
## - HOMEKIDS_1 1 7325.2 7381.2
## - CAR_TYPE_Panel.Truck 1 7325.8 7381.8
## - PARENT1_Yes 1 7326.5 7382.5
## - CAR_TYPE_Van 1 7328.9 7384.9
## - INCOME 1 7336.8 7392.8
## - CAR_TYPE_Pickup 1 7336.9 7392.9
## - BLUEBOOK 1 7339.9 7395.9
## - KIDSDRIV_1 1 7350.0 7406.0
## - MSTATUS_Yes 1 7358.7 7414.7
## - EDUCATION_High.School 1 7360.5 7416.5
## - OLDCLAIM 1 7360.7 7416.7
## - CAR_TYPE_SUV 1 7378.3 7434.3
## - CAR_TYPE_Sports.Car 1 7385.5 7441.5
## - MVR_PTS_1 1 7389.3 7445.3
## - TRAVTIME 1 7398.1 7454.1
## - TIF 1 7425.8 7481.8
## - REVOKED_Yes 1 7430.0 7486.0
## - CLM_FREQ_1 1 7451.7 7507.7
## - CAR_USE_Private 1 7498.6 7554.6
## - URBANICITY_Highly.Urban..Urban 1 8020.0 8076.0
##
## Step: AIC=7374.39
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + CAR_AGE + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + EDUCATION_Masters +
## CAR_USE_Private + CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup +
## CAR_TYPE_Sports.Car + CAR_TYPE_SUV + CAR_TYPE_Van + RED_CAR_Yes +
## CLM_FREQ_1 + REVOKED_Yes + MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - CAR_AGE 1 7318.5 7372.5
## - RED_CAR_Yes 1 7318.5 7372.5
## - EDUCATION_Masters 1 7318.8 7372.8
## <none> 7318.4 7374.4
## - AGE 1 7320.4 7374.4
## - YOJ 1 7321.6 7375.6
## - SEX_M 1 7323.1 7377.1
## - HOME_VAL 1 7324.8 7378.8
## - HOMEKIDS_1 1 7325.3 7379.3
## - CAR_TYPE_Panel.Truck 1 7325.8 7379.8
## - PARENT1_Yes 1 7326.6 7380.6
## - CAR_TYPE_Van 1 7328.9 7382.9
## - CAR_TYPE_Pickup 1 7336.9 7390.9
## - BLUEBOOK 1 7339.9 7393.9
## - INCOME 1 7340.6 7394.6
## - KIDSDRIV_1 1 7350.2 7404.2
## - MSTATUS_Yes 1 7358.8 7412.8
## - OLDCLAIM 1 7360.8 7414.8
## - EDUCATION_High.School 1 7360.9 7414.9
## - CAR_TYPE_SUV 1 7378.3 7432.3
## - CAR_TYPE_Sports.Car 1 7385.5 7439.5
## - MVR_PTS_1 1 7389.4 7443.4
## - TRAVTIME 1 7398.3 7452.3
## - TIF 1 7425.8 7479.8
## - REVOKED_Yes 1 7430.0 7484.0
## - CLM_FREQ_1 1 7451.7 7505.7
## - CAR_USE_Private 1 7503.6 7557.6
## - URBANICITY_Highly.Urban..Urban 1 8020.2 8074.2
##
## Step: AIC=7372.45
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + EDUCATION_Masters +
## CAR_USE_Private + CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup +
## CAR_TYPE_Sports.Car + CAR_TYPE_SUV + CAR_TYPE_Van + RED_CAR_Yes +
## CLM_FREQ_1 + REVOKED_Yes + MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - RED_CAR_Yes 1 7318.6 7370.6
## - EDUCATION_Masters 1 7319.1 7371.1
## <none> 7318.5 7372.5
## - AGE 1 7320.5 7372.5
## - YOJ 1 7321.6 7373.6
## - SEX_M 1 7323.1 7375.1
## - HOME_VAL 1 7324.9 7376.9
## - HOMEKIDS_1 1 7325.3 7377.3
## - CAR_TYPE_Panel.Truck 1 7325.8 7377.8
## - PARENT1_Yes 1 7326.7 7378.7
## - CAR_TYPE_Van 1 7329.0 7381.0
## - CAR_TYPE_Pickup 1 7337.0 7389.0
## - BLUEBOOK 1 7340.0 7392.0
## - INCOME 1 7340.8 7392.8
## - KIDSDRIV_1 1 7350.3 7402.3
## - MSTATUS_Yes 1 7358.8 7410.8
## - OLDCLAIM 1 7360.9 7412.9
## - EDUCATION_High.School 1 7369.4 7421.4
## - CAR_TYPE_SUV 1 7378.3 7430.3
## - CAR_TYPE_Sports.Car 1 7385.5 7437.5
## - MVR_PTS_1 1 7389.4 7441.4
## - TRAVTIME 1 7398.3 7450.3
## - TIF 1 7425.8 7477.8
## - REVOKED_Yes 1 7430.1 7482.1
## - CLM_FREQ_1 1 7451.9 7503.9
## - CAR_USE_Private 1 7503.6 7555.6
## - URBANICITY_Highly.Urban..Urban 1 8020.3 8072.3
##
## Step: AIC=7370.55
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + EDUCATION_Masters +
## CAR_USE_Private + CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup +
## CAR_TYPE_Sports.Car + CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 +
## REVOKED_Yes + MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - EDUCATION_Masters 1 7319.2 7369.2
## <none> 7318.6 7370.6
## - AGE 1 7320.6 7370.6
## - YOJ 1 7321.7 7371.7
## - HOME_VAL 1 7325.0 7375.0
## - SEX_M 1 7325.3 7375.3
## - HOMEKIDS_1 1 7325.4 7375.4
## - CAR_TYPE_Panel.Truck 1 7326.0 7376.0
## - PARENT1_Yes 1 7326.8 7376.8
## - CAR_TYPE_Van 1 7329.1 7379.1
## - CAR_TYPE_Pickup 1 7337.1 7387.1
## - BLUEBOOK 1 7340.2 7390.2
## - INCOME 1 7340.8 7390.8
## - KIDSDRIV_1 1 7350.3 7400.3
## - MSTATUS_Yes 1 7358.9 7408.9
## - OLDCLAIM 1 7361.0 7411.0
## - EDUCATION_High.School 1 7369.4 7419.4
## - CAR_TYPE_SUV 1 7378.3 7428.3
## - CAR_TYPE_Sports.Car 1 7385.5 7435.5
## - MVR_PTS_1 1 7389.5 7439.5
## - TRAVTIME 1 7398.5 7448.5
## - TIF 1 7426.2 7476.2
## - REVOKED_Yes 1 7430.2 7480.2
## - CLM_FREQ_1 1 7452.0 7502.0
## - CAR_USE_Private 1 7503.6 7553.6
## - URBANICITY_Highly.Urban..Urban 1 8020.6 8070.6
##
## Step: AIC=7369.15
## TARGET_FLAG ~ AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + CAR_USE_Private +
## CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup + CAR_TYPE_Sports.Car +
## CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 + REVOKED_Yes +
## MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## - AGE 1 7321.1 7369.1
## <none> 7319.2 7369.2
## - YOJ 1 7322.3 7370.3
## - HOME_VAL 1 7325.6 7373.6
## - SEX_M 1 7326.0 7374.0
## - HOMEKIDS_1 1 7326.0 7374.0
## - CAR_TYPE_Panel.Truck 1 7326.9 7374.9
## - PARENT1_Yes 1 7327.3 7375.3
## - CAR_TYPE_Van 1 7329.8 7377.8
## - CAR_TYPE_Pickup 1 7338.0 7386.0
## - BLUEBOOK 1 7340.8 7388.8
## - INCOME 1 7341.2 7389.2
## - KIDSDRIV_1 1 7350.9 7398.9
## - MSTATUS_Yes 1 7359.7 7407.7
## - OLDCLAIM 1 7361.3 7409.3
## - EDUCATION_High.School 1 7372.7 7420.7
## - CAR_TYPE_SUV 1 7379.0 7427.0
## - CAR_TYPE_Sports.Car 1 7386.2 7434.2
## - MVR_PTS_1 1 7390.4 7438.4
## - TRAVTIME 1 7398.9 7446.9
## - TIF 1 7426.6 7474.6
## - REVOKED_Yes 1 7430.9 7478.9
## - CLM_FREQ_1 1 7452.1 7500.1
## - CAR_USE_Private 1 7505.2 7553.2
## - URBANICITY_Highly.Urban..Urban 1 8025.4 8073.4
##
## Step: AIC=7369.11
## TARGET_FLAG ~ YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + CAR_USE_Private +
## CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup + CAR_TYPE_Sports.Car +
## CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 + REVOKED_Yes +
## MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
##
## Df Deviance AIC
## <none> 7321.1 7369.1
## - YOJ 1 7325.3 7371.3
## - SEX_M 1 7327.4 7373.4
## - HOME_VAL 1 7328.0 7374.0
## - PARENT1_Yes 1 7329.2 7375.2
## - CAR_TYPE_Panel.Truck 1 7329.3 7375.3
## - CAR_TYPE_Van 1 7332.2 7378.2
## - HOMEKIDS_1 1 7334.2 7380.2
## - CAR_TYPE_Pickup 1 7339.8 7385.8
## - INCOME 1 7342.7 7388.7
## - BLUEBOOK 1 7345.0 7391.0
## - KIDSDRIV_1 1 7350.9 7396.9
## - MSTATUS_Yes 1 7362.9 7408.9
## - OLDCLAIM 1 7363.7 7409.7
## - EDUCATION_High.School 1 7377.3 7423.3
## - CAR_TYPE_SUV 1 7379.5 7425.5
## - CAR_TYPE_Sports.Car 1 7386.4 7432.4
## - MVR_PTS_1 1 7392.4 7438.4
## - TRAVTIME 1 7401.3 7447.3
## - TIF 1 7428.2 7474.2
## - REVOKED_Yes 1 7433.7 7479.7
## - CLM_FREQ_1 1 7454.2 7500.2
## - CAR_USE_Private 1 7508.2 7554.2
## - URBANICITY_Highly.Urban..Urban 1 8029.3 8075.3
backwards_logreg$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 7197 7318.312 7376.312
## 2 - EDUCATION_PhD 1 0.07686535 7198 7318.388 7374.388
## 3 - CAR_AGE 1 0.06187303 7199 7318.450 7372.450
## 4 - RED_CAR_Yes 1 0.10362434 7200 7318.554 7370.554
## 5 - EDUCATION_Masters 1 0.60048723 7201 7319.154 7369.154
## 6 - AGE 1 1.95955636 7202 7321.114 7369.114
backwards_logreg$formula
## TARGET_FLAG ~ YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + CAR_USE_Private +
## CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup + CAR_TYPE_Sports.Car +
## CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 + REVOKED_Yes +
## MVR_PTS_1 + URBANICITY_Highly.Urban..Urban
logreg2 <- glm(formula = backwards_logreg$formula, family = "binomial", data = training_data)
summary(logreg2)
##
## Call:
## glm(formula = backwards_logreg$formula, family = "binomial",
## data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.32601 0.16461 -20.206 < 2e-16 ***
## YOJ -0.06411 0.03147 -2.037 0.041610 *
## INCOME -0.20750 0.04496 -4.615 3.94e-06 ***
## HOME_VAL -0.11268 0.04306 -2.617 0.008875 **
## TRAVTIME 0.26410 0.02967 8.900 < 2e-16 ***
## BLUEBOOK -0.20834 0.04292 -4.854 1.21e-06 ***
## TIF -0.30236 0.02983 -10.136 < 2e-16 ***
## OLDCLAIM -0.25495 0.03941 -6.469 9.88e-11 ***
## KIDSDRIV_1 0.55136 0.10142 5.437 5.43e-08 ***
## HOMEKIDS_1 0.32882 0.09092 3.617 0.000299 ***
## PARENT1_Yes 0.35738 0.12599 2.837 0.004559 **
## MSTATUS_Yes -0.57840 0.08950 -6.463 1.03e-10 ***
## SEX_M 0.24445 0.09800 2.494 0.012622 *
## EDUCATION_High.School 0.53046 0.07095 7.476 7.65e-14 ***
## CAR_USE_Private -0.99986 0.07418 -13.480 < 2e-16 ***
## CAR_TYPE_Panel.Truck 0.43464 0.15188 2.862 0.004214 **
## CAR_TYPE_Pickup 0.43225 0.09982 4.330 1.49e-05 ***
## CAR_TYPE_Sports.Car 1.05477 0.13115 8.042 8.81e-16 ***
## CAR_TYPE_SUV 0.83680 0.11061 7.565 3.86e-14 ***
## CAR_TYPE_Van 0.41731 0.12528 3.331 0.000865 ***
## CLM_FREQ_1 0.86830 0.07551 11.499 < 2e-16 ***
## REVOKED_Yes 1.04041 0.09901 10.508 < 2e-16 ***
## MVR_PTS_1 1.26184 0.15966 7.903 2.72e-15 ***
## URBANICITY_Highly.Urban..Urban 2.48153 0.10824 22.926 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9827.1 on 7225 degrees of freedom
## Residual deviance: 7321.1 on 7202 degrees of freedom
## AIC: 7369.1
##
## Number of Fisher Scoring iterations: 5
plot_vif(logreg2, "Log Reg 2")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_hline()`).
# Generate predictions using testing data
pred2_probs <- predict(logreg2, testing_data, type = 'response')
# Get the predicted labels
pred2 <- ifelse(pred2_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm2 <- table(actual = testing_data$TARGET_FLAG, prediction = pred2)
logreg_metrics(cm2)
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.757
## 2 Error_rate 0.243
## 3 Precision 0.527
## 4 Sensitivity 0.657
## 5 Specificity 0.792
## 6 F1_score 0.585
Update logreg2 by removing its least significant predictor, YOJ, to build a third model.
logreg3 <- update(logreg2, .~. - YOJ)
summary(logreg3)
##
## Call:
## glm(formula = TARGET_FLAG ~ INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
## TIF + OLDCLAIM + KIDSDRIV_1 + HOMEKIDS_1 + PARENT1_Yes +
## MSTATUS_Yes + SEX_M + EDUCATION_High.School + CAR_USE_Private +
## CAR_TYPE_Panel.Truck + CAR_TYPE_Pickup + CAR_TYPE_Sports.Car +
## CAR_TYPE_SUV + CAR_TYPE_Van + CLM_FREQ_1 + REVOKED_Yes +
## MVR_PTS_1 + URBANICITY_Highly.Urban..Urban, family = "binomial",
## data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.29969 0.16387 -20.136 < 2e-16 ***
## INCOME -0.22678 0.04404 -5.149 2.61e-07 ***
## HOME_VAL -0.12013 0.04295 -2.797 0.005165 **
## TRAVTIME 0.26364 0.02965 8.890 < 2e-16 ***
## BLUEBOOK -0.21461 0.04284 -5.010 5.44e-07 ***
## TIF -0.30282 0.02981 -10.157 < 2e-16 ***
## OLDCLAIM -0.25619 0.03940 -6.503 7.89e-11 ***
## KIDSDRIV_1 0.55091 0.10138 5.434 5.50e-08 ***
## HOMEKIDS_1 0.30930 0.09034 3.424 0.000618 ***
## PARENT1_Yes 0.36520 0.12588 2.901 0.003719 **
## MSTATUS_Yes -0.58854 0.08936 -6.586 4.52e-11 ***
## SEX_M 0.23054 0.09773 2.359 0.018328 *
## EDUCATION_High.School 0.52030 0.07074 7.355 1.91e-13 ***
## CAR_USE_Private -1.00206 0.07415 -13.514 < 2e-16 ***
## CAR_TYPE_Panel.Truck 0.46276 0.15133 3.058 0.002229 **
## CAR_TYPE_Pickup 0.43632 0.09974 4.375 1.22e-05 ***
## CAR_TYPE_Sports.Car 1.06230 0.13100 8.109 5.10e-16 ***
## CAR_TYPE_SUV 0.83563 0.11055 7.559 4.06e-14 ***
## CAR_TYPE_Van 0.43656 0.12491 3.495 0.000474 ***
## CLM_FREQ_1 0.86640 0.07547 11.479 < 2e-16 ***
## REVOKED_Yes 1.03753 0.09895 10.485 < 2e-16 ***
## MVR_PTS_1 1.26509 0.15955 7.929 2.20e-15 ***
## URBANICITY_Highly.Urban..Urban 2.47365 0.10796 22.913 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9827.1 on 7225 degrees of freedom
## Residual deviance: 7325.3 on 7203 degrees of freedom
## AIC: 7371.3
##
## Number of Fisher Scoring iterations: 5
# Generate predictions using testing data
pred3_probs <- predict(logreg3, testing_data, type = 'response')
# Get the predicted labels
pred3 <- ifelse(pred3_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm3 <- table(actual = testing_data$TARGET_FLAG, prediction = pred3)
logreg_metrics(cm3)
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.757
## 2 Error_rate 0.243
## 3 Precision 0.526
## 4 Sensitivity 0.665
## 5 Specificity 0.789
## 6 F1_score 0.587
# Create the plot
ggplot(data = data.frame(cm3), aes(x = actual, y = prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold") +
# Customize colors
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix: logreg3", x = "Actual", y = "Predicted") +
theme_minimal()
The final model that will be used on the evaluation set is logreg3. Below is a table showing each coefficient, its confidence interval, and the corresponding change in odds. The AIC and the comparative simplicity of logreg3 relative to the other logistic regression models were also considerations in deciding to proceed with it. As a reading example, exp(1.038) ≈ 2.82 for REVOKED_Yes: a licence revocation is associated with nearly tripled odds of a crash, holding the other predictors fixed.
# Get ci for all parameters
ci_logreg3 <- confint(logreg3)
## Waiting for profiling to be done...
terms_table <- tibble(Terms = names(logreg3$coefficients), Coefficients = logreg3$coefficients, ci_lower = ci_logreg3[,1], ci_upper = ci_logreg3[,2], Change_odd = exp(logreg3$coefficients))
terms_table
## # A tibble: 23 × 5
## Terms Coefficients ci_lower ci_upper Change_odd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3.30 -3.62 -2.98 0.0369
## 2 INCOME -0.227 -0.313 -0.141 0.797
## 3 HOME_VAL -0.120 -0.204 -0.0359 0.887
## 4 TRAVTIME 0.264 0.206 0.322 1.30
## 5 BLUEBOOK -0.215 -0.299 -0.131 0.807
## 6 TIF -0.303 -0.362 -0.245 0.739
## 7 OLDCLAIM -0.256 -0.334 -0.179 0.774
## 8 KIDSDRIV_1 0.551 0.353 0.750 1.73
## 9 HOMEKIDS_1 0.309 0.132 0.486 1.36
## 10 PARENT1_Yes 0.365 0.119 0.612 1.44
## # ℹ 13 more rows
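To read the confidence intervals on the odds scale as well, the bounds can be exponentiated just like the coefficients:
# Express the CI bounds as changes in odds
terms_table %>% mutate(Odds_lower = exp(ci_lower), Odds_upper = exp(ci_upper))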
A confidence interval that crosses 0 would indicate that, even with a significant p-value, a predictor may add uncertainty to the model; here, none of the intervals shown above cross 0.
complete_ins_training_samp1 contains the data from the randomly selected iteration of the MICE imputation. To train a multiple linear regression model to predict TARGET_AMT, the data should be filtered to include only those observations that sustained a crash.
set.seed(200)
# Filter crash observations
complete_ins_training_amt <- complete_ins_training_samp1 %>% filter(TARGET_FLAG == 1)
complete_ins_training_amt <- complete_ins_training_amt %>% select(-TARGET_FLAG)
summary(complete_ins_training_amt)
## TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## Min. : 30.28 0:1772 Min. :16.00 0:1172 Min. : 0
## 1st Qu.: 2610.70 1: 380 1st Qu.:37.00 1: 980 1st Qu.: 9
## Median : 4108.07 Median :43.00 Median :11
## Mean : 5704.15 Mean :43.27 Mean :10
## 3rd Qu.: 5788.00 3rd Qu.:50.00 3rd Qu.:13
## Max. :107586.14 Max. :76.00 Max. :19
## INCOME PARENT1 HOME_VAL MSTATUS SEX
## Min. : 0 No :1676 Min. : 0 No :1099 F:1191
## 1st Qu.: 20644 Yes: 476 1st Qu.: 0 Yes:1053 M: 961
## Median : 43612 Median :115248
## Mean : 50398 Mean :115828
## 3rd Qu.: 69467 3rd Qu.:198150
## Max. :320127 Max. :750455
## EDUCATION JOB TRAVTIME CAR_USE
## Bachelors : 522 Length:2152 Min. : 5.00 Commercial:1047
## High School:1178 Class :character 1st Qu.:24.00 Private :1105
## Masters : 327 Mode :character Median :34.00
## PhD : 125 Mean :34.77
## 3rd Qu.:45.00
## Max. :97.00
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM
## Min. : 1500 Min. : 1.000 Minivan :349 No :1536 Min. : 0
## 1st Qu.: 7758 1st Qu.: 1.000 Panel Truck:178 Yes: 616 1st Qu.: 0
## Median :12590 Median : 4.000 Pickup :442 Median : 2432
## Mean :14255 Mean : 4.781 Sports Car :304 Mean : 6037
## 3rd Qu.:19218 3rd Qu.: 7.000 SUV :678 3rd Qu.: 6905
## Max. :62240 Max. :21.000 Van :201 Max. :57037
## CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 0: 898 No :1710 0:1948 Min. : 0.000 Highly Rural/ Rural: 115
## 1:1254 Yes: 442 1: 204 1st Qu.: 1.000 Highly Urban/ Urban:2037
## Median : 7.000
## Mean : 7.366
## 3rd Qu.:11.000
## Max. :25.000
complete_ins_training_amt %>% select(TARGET_AMT, CAR_AGE, TRAVTIME, TIF) %>%
pivot_longer(cols = everything(), names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Value)) + geom_histogram(bins = 30) +
facet_wrap(~ Feature, nrow = 2, ncol = 2, scales = "free") + labs(title = "Numeric Variables")
complete_ins_training_amt <- complete_ins_training_amt %>%
mutate(HOME_VAL = as_factor(ifelse(HOME_VAL == 0, 0, 1)),
INCOME = as_factor(ifelse(INCOME == 0, 0, 1)),
YOJ = as_factor(ifelse(YOJ == 0, 0, 1)),
OLDCLAIM = as_factor(ifelse(OLDCLAIM == 0, 0, 1)))
factor_column_names <- names(Filter(is.factor, complete_ins_training_amt))
# Split data
train_test_split_amt <- initial_split(complete_ins_training_amt, prop = 0.7)
# Assign data
train_amt <- training(train_test_split_amt)
test_amt <- testing(train_test_split_amt)
# Get factor column names
factor_column_names <- names(Filter(is.factor, train_amt))
# Convert factor variables to dummy variables (TARGET_AMT is numeric and passes through)
x_train_amt <- dummy_cols(train_amt, select_columns = factor_column_names, remove_first_dummy = TRUE)
x_train_amt <- x_train_amt %>% select(where(is.numeric), -TARGET_AMT)
# Center and scale the remaining continuous predictors
# (cols_to_scale[-c(2,3,4,8)] drops YOJ, INCOME, HOME_VAL, OLDCLAIM, now binary factors)
x_train_amt_scaled <- x_train_amt %>% mutate(across(all_of(cols_to_scale[-c(2,3,4,8)]), scale))
# Extract response
y_train_amt <- train_amt$TARGET_AMT
# Merge training data
training_data_amt <- data.frame(TARGET_AMT = y_train_amt, x_train_amt_scaled)
# Convert factor variables to dummy variables (TARGET_AMT is numeric and passes through)
x_test_amt <- dummy_cols(test_amt, select_columns = factor_column_names, remove_first_dummy = TRUE)
# Extract only predictors
x_test_amt <- x_test_amt %>% select(where(is.numeric), -TARGET_AMT)
# Center and scale predictors using the same reduced scaling list
x_test_amt_scaled <- x_test_amt %>% mutate(across(all_of(cols_to_scale[-c(2,3,4,8)]), scale))
# Target
y_test_amt <- test_amt$TARGET_AMT
testing_data_amt <- data.frame(TARGET_AMT = y_test_amt, x_test_amt_scaled)
linreg1 <- lm(TARGET_AMT ~., data = training_data_amt)
summary(linreg1)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = training_data_amt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8466 -3095 -1416 501 76992
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4594.9852 1283.7823 3.579 0.000356 ***
## AGE -204.3023 238.4897 -0.857 0.391777
## TRAVTIME -26.3616 191.6791 -0.138 0.890631
## BLUEBOOK 1167.9566 283.2715 4.123 3.95e-05 ***
## TIF 88.4180 190.2837 0.465 0.642241
## CAR_AGE -441.8912 271.9784 -1.625 0.104434
## KIDSDRIV_1 -87.1250 606.9012 -0.144 0.885870
## HOMEKIDS_1 315.3027 661.7984 0.476 0.633836
## YOJ_1 319.6359 1206.7342 0.265 0.791141
## INCOME_1 -212.6022 1202.7908 -0.177 0.859723
## PARENT1_Yes -553.4415 764.4337 -0.724 0.469187
## HOME_VAL_1 -14.5105 497.0517 -0.029 0.976714
## MSTATUS_Yes -790.9191 598.0092 -1.323 0.186178
## SEX_M 1814.4078 738.3480 2.457 0.014109 *
## EDUCATION_High.School -174.2204 520.9241 -0.334 0.738091
## EDUCATION_Masters 209.1279 694.4336 0.301 0.763343
## EDUCATION_PhD -6.0141 941.8588 -0.006 0.994906
## CAR_USE_Private 0.8156 463.2676 0.002 0.998596
## CAR_TYPE_Panel.Truck -367.8091 1043.1761 -0.353 0.724449
## CAR_TYPE_Pickup 39.2817 661.4748 0.059 0.952653
## CAR_TYPE_Sports.Car 1426.5912 867.2890 1.645 0.100206
## CAR_TYPE_SUV 1059.7337 778.8413 1.361 0.173831
## CAR_TYPE_Van -539.8711 835.9387 -0.646 0.518492
## RED_CAR_Yes -154.2096 556.1704 -0.277 0.781611
## OLDCLAIM_1 -466.5132 1534.7076 -0.304 0.761190
## CLM_FREQ_1 927.0864 1539.9717 0.602 0.547256
## REVOKED_Yes -439.0160 466.4877 -0.941 0.346803
## MVR_PTS_1 -586.4647 665.7269 -0.881 0.378494
## URBANICITY_Highly.Urban..Urban 31.7524 866.7877 0.037 0.970783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7316 on 1477 degrees of freedom
## Multiple R-squared: 0.02592, Adjusted R-squared: 0.007455
## F-statistic: 1.404 on 28 and 1477 DF, p-value: 0.07884
plot(linreg1)
# Drop outlier rows 641, 537, 1466 (identified from the diagnostics above),
# then convert factor variables to dummy variables
x_train_amt <- dummy_cols(train_amt[-c(641, 537, 1466),], select_columns = factor_column_names, remove_first_dummy = TRUE)
x_train_amt <- x_train_amt %>% select(where(is.numeric), -TARGET_AMT)
# Center and scale predictors using the same reduced scaling list
x_train_amt_scaled <- x_train_amt %>% mutate(across(all_of(cols_to_scale[-c(2,3,4,8)]), scale))
# Extract response
y_train_amt <- train_amt$TARGET_AMT
y_train_amt <- y_train_amt[-c(641,537, 1466)]
# Merge training data
training_data_amt <- data.frame(TARGET_AMT = y_train_amt, x_train_amt_scaled)
# Re-fit data without outliers
linreg2 <- lm(TARGET_AMT ~., data = training_data_amt)
summary(linreg2)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = training_data_amt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8468 -3092 -1421 497 76991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4586.934 1285.817 3.567 0.000372 ***
## AGE -202.916 239.004 -0.849 0.396016
## TRAVTIME -29.473 192.071 -0.153 0.878064
## BLUEBOOK 1171.089 283.676 4.128 3.86e-05 ***
## TIF 86.669 190.685 0.455 0.649526
## CAR_AGE -446.090 272.954 -1.634 0.102409
## KIDSDRIV_1 -110.524 609.047 -0.181 0.856024
## HOMEKIDS_1 321.633 663.006 0.485 0.627668
## YOJ_1 334.599 1208.155 0.277 0.781857
## INCOME_1 -210.536 1203.985 -0.175 0.861209
## PARENT1_Yes -547.916 766.071 -0.715 0.474581
## HOME_VAL_1 -1.941 498.087 -0.004 0.996891
## MSTATUS_Yes -797.354 598.656 -1.332 0.183097
## SEX_M 1824.498 739.359 2.468 0.013712 *
## EDUCATION_High.School -179.899 522.038 -0.345 0.730437
## EDUCATION_Masters 213.628 695.308 0.307 0.758702
## EDUCATION_PhD -2.570 942.816 -0.003 0.997825
## CAR_USE_Private -4.837 463.847 -0.010 0.991681
## CAR_TYPE_Panel.Truck -381.542 1044.640 -0.365 0.714987
## CAR_TYPE_Pickup 42.561 662.311 0.064 0.948771
## CAR_TYPE_Sports.Car 1430.964 868.549 1.648 0.099661 .
## CAR_TYPE_SUV 1055.597 779.612 1.354 0.175943
## CAR_TYPE_Van -549.229 836.936 -0.656 0.511773
## RED_CAR_Yes -161.292 557.338 -0.289 0.772318
## OLDCLAIM_1 -466.700 1536.097 -0.304 0.761307
## CLM_FREQ_1 929.113 1541.509 0.603 0.546781
## REVOKED_Yes -438.753 467.498 -0.939 0.348135
## MVR_PTS_1 -585.389 666.381 -0.878 0.379837
## URBANICITY_Highly.Urban..Urban 24.534 867.829 0.028 0.977450
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7322 on 1474 degrees of freedom
## Multiple R-squared: 0.02602, Adjusted R-squared: 0.007516
## F-statistic: 1.406 on 28 and 1474 DF, p-value: 0.07777
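Although the held-out split built above is not scored in the original output, a quick sketch of linreg2's out-of-sample error would look like this:
# Sketch: RMSE on the held-out set assembled earlier
test_pred <- predict(linreg2, testing_data_amt)
sqrt(mean((testing_data_amt$TARGET_AMT - test_pred)^2))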
After attempting to improve the second linear regression model by log-transforming the response, with no improvement in the adjusted R-squared, I decided to use this model despite its extremely poor performance (adjusted R-squared of roughly 0.008). A multiple linear regression approach may be inappropriate for this data set.
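For reference, the attempted transformation resembled the following sketch (whether log() or log1p() was used originally is an assumption; log1p() guards against zero-valued rows):
# Sketch of the attempted log transformation of the response
linreg_log <- lm(log1p(TARGET_AMT) ~ ., data = training_data_amt)
summary(linreg_log)$adj.r.squared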
Review Summary of Data Processing
Data from a random iteration of MICE were selected
Discrete variables were binned and turned into binary variables (sketched as a reusable helper below)
The data were split into training and testing sets
Dummy variables were created for all categorical predictors
The SMOTE technique was used to address the class imbalance in the training set
Predictors were centered and scaled
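Because the same binning must be reapplied to the evaluation data below, a small helper could reduce duplication; a sketch (bin_binary is a hypothetical name, mirroring the recoding applied to the training set):
# Sketch: bundle the repeated binary-binning step into one helper
bin_binary <- function(df) {
  df %>% mutate(CLM_FREQ = as_factor(ifelse(CLM_FREQ == 0, 0, 1)),
                HOMEKIDS = as_factor(ifelse(HOMEKIDS == 0, 0, 1)),
                KIDSDRIV = as_factor(ifelse(KIDSDRIV == 0, 0, 1)),
                MVR_PTS = as_factor(ifelse(MVR_PTS <= 6, 0, 1)))
}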
# Load data
ins_eval <- read.csv("insurance-evaluation-data.csv")
# Drop index
ins_eval <- ins_eval %>% select(-INDEX)
# Clean variables
## Categorical
ins_eval <- ins_eval %>% mutate(
PARENT1 = clean_cat(PARENT1),
MSTATUS = clean_cat(MSTATUS),
SEX = clean_cat(SEX),
EDUCATION = clean_cat(EDUCATION),
JOB = clean_cat(JOB),
CAR_USE = clean_cat(CAR_USE),
CAR_TYPE = clean_cat(CAR_TYPE),
REVOKED = clean_cat(REVOKED)
)
ins_eval$URBANICITY <- str_extract_all(ins_eval$URBANICITY, "[A-Z][a-z]*.*\\w*", simplify = T)[,1]
# Numeric
ins_eval <- ins_eval %>% mutate(
INCOME = clean_num(INCOME),
HOME_VAL = clean_num(HOME_VAL),
BLUEBOOK = clean_num(BLUEBOOK),
OLDCLAIM = clean_num(OLDCLAIM)
)
skim(ins_eval)
| Name | ins_eval |
| Number of rows | 2141 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| logical | 2 |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| PARENT1 | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| MSTATUS | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| SEX | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| EDUCATION | 0 | 1 | 3 | 11 | 0 | 4 | 0 |
| JOB | 0 | 1 | 0 | 12 | 139 | 9 | 0 |
| CAR_USE | 0 | 1 | 7 | 10 | 0 | 2 | 0 |
| CAR_TYPE | 0 | 1 | 3 | 11 | 0 | 6 | 0 |
| RED_CAR | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| REVOKED | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| URBANICITY | 0 | 1 | 19 | 19 | 0 | 2 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| TARGET_FLAG | 2141 | 0 | NaN | : |
| TARGET_AMT | 2141 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| KIDSDRIV | 0 | 1.00 | 0.16 | 0.49 | 0 | 0 | 0 | 0 | 3 | ▇▁▁▁▁ |
| AGE | 1 | 1.00 | 45.02 | 8.53 | 17 | 39 | 45 | 51 | 73 | ▁▅▇▅▁ |
| HOMEKIDS | 0 | 1.00 | 0.72 | 1.12 | 0 | 0 | 0 | 1 | 5 | ▇▂▁▁▁ |
| YOJ | 94 | 0.96 | 10.38 | 4.17 | 0 | 9 | 11 | 13 | 19 | ▂▂▆▇▁ |
| INCOME | 0 | 1.00 | 56802.02 | 47754.09 | 0 | 20766 | 48317 | 82918 | 291182 | ▇▅▁▁▁ |
| HOME_VAL | 0 | 1.00 | 145274.11 | 130553.95 | 0 | 0 | 148853 | 231739 | 669271 | ▇▆▂▁▁ |
| TRAVTIME | 0 | 1.00 | 33.15 | 15.72 | 5 | 22 | 33 | 43 | 105 | ▆▇▃▁▁ |
| BLUEBOOK | 0 | 1.00 | 15469.43 | 8462.37 | 1500 | 8870 | 14170 | 21050 | 49940 | ▇▇▅▁▁ |
| TIF | 0 | 1.00 | 5.24 | 3.97 | 1 | 1 | 4 | 7 | 25 | ▇▆▁▁▁ |
| OLDCLAIM | 0 | 1.00 | 4011.13 | 8570.01 | 0 | 0 | 0 | 4718 | 54399 | ▇▁▁▁▁ |
| CLM_FREQ | 0 | 1.00 | 0.81 | 1.14 | 0 | 0 | 0 | 2 | 5 | ▇▂▁▁▁ |
| MVR_PTS | 0 | 1.00 | 1.77 | 2.20 | 0 | 0 | 1 | 3 | 12 | ▇▂▁▁▁ |
| CAR_AGE | 129 | 0.94 | 8.18 | 5.77 | 0 | 1 | 8 | 12 | 26 | ▇▇▆▂▁ |
ins_eval <- ins_eval %>% mutate(JOB = if_else(is.na(JOB) & !is.na(INCOME), "Other", JOB))
# Extract the strategy
pred <- ins_training_imp$predictorMatrix
meth <- ins_training_imp$method
post <- ins_training_imp$post
#Apply same strategy to test data
eval_imp <- mice(ins_eval,
m = 1,
maxit = 1,
predictorMatrix = pred,
method = meth,
post = post)
##
## iter imp variable
## 1 1 AGE YOJ CAR_AGE
## Warning: Number of logged events: 11
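Because only a single imputation is drawn here, fixing mice()'s seed argument would make the evaluation predictions reproducible; a sketch of the same call (not run here):
# Sketch: same imputation call with a fixed seed for reproducibility
# eval_imp <- mice(ins_eval, m = 1, maxit = 1, predictorMatrix = pred,
#                  method = meth, post = post, seed = 214)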
eval_complete <- complete(eval_imp, action = "long") %>% filter(.imp == 1)
head(eval_complete)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 NA NA 0 48 0 11 52881 No 0
## 2 NA NA 1 40 1 11 50815 Yes 0
## 3 NA NA 0 44 2 12 43486 Yes 0
## 4 NA NA 0 35 2 14 21204 Yes 0
## 5 NA NA 0 59 0 12 87460 No 0
## 6 NA NA 0 46 0 14 0 No 207519
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 1 No M Bachelors Manager 26 Private 21970 1
## 2 No M High School Manager 21 Private 18930 6
## 3 No F High School Blue Collar 30 Commercial 5900 10
## 4 No M High School Clerical 74 Private 9230 6
## 5 No M High School Manager 45 Private 15420 1
## 6 Yes M Bachelors Professional 7 Commercial 25660 1
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 Van yes 0 0 No 2 10
## 2 Minivan no 3295 1 No 2 1
## 3 SUV no 0 0 No 0 10
## 4 Pickup no 0 0 Yes 0 4
## 5 Minivan yes 44857 2 No 4 1
## 6 Panel Truck no 2119 1 No 2 12
## URBANICITY .imp .id
## 1 Highly Urban/ Urban 1 1
## 2 Highly Urban/ Urban 1 2
## 3 Highly Rural/ Rural 1 3
## 4 Highly Rural/ Rural 1 4
## 5 Highly Urban/ Urban 1 5
## 6 Highly Urban/ Urban 1 6
eval_complete <- eval_complete %>% select(-c(TARGET_AMT, TARGET_FLAG, .imp, .id))
eval_complete <- eval_complete %>% mutate(CLM_FREQ = as_factor(ifelse(CLM_FREQ == 0, 0, 1)),
HOMEKIDS = as_factor(ifelse(HOMEKIDS == 0, 0, 1)),
KIDSDRIV = as_factor(ifelse(KIDSDRIV == 0, 0, 1)),
MVR_PTS = as_factor(ifelse(MVR_PTS <= 6, 0, 1)))
eval_complete <- eval_complete %>% mutate(across(where(is.character), as.factor))
factor_column_names <- names(Filter(is.factor, eval_complete))
eval_complete_logreg <- dummy_cols(eval_complete, select_columns = factor_column_names, remove_first_dummy = TRUE)
# Center and scale predictors
cols_to_scale <- c("AGE", "YOJ", "INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "OLDCLAIM", "CAR_AGE")
eval_complete_logreg <- eval_complete_logreg %>% mutate(across(all_of(cols_to_scale), scale))
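One caveat: scale() standardizes the evaluation predictors with their own means and standard deviations rather than the training set's. A sketch of reusing the training statistics instead, where x_train_raw is a hypothetical unscaled copy of the training predictors:
# Sketch: standardize the evaluation data on the training scale
train_centers <- sapply(x_train_raw[cols_to_scale], mean)
train_sds <- sapply(x_train_raw[cols_to_scale], sd)
eval_complete_logreg[cols_to_scale] <- Map(function(col, m, s) (col - m) / s,
                                           eval_complete_logreg[cols_to_scale],
                                           train_centers, train_sds)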
terms_table$Terms
## [1] "(Intercept)" "INCOME"
## [3] "HOME_VAL" "TRAVTIME"
## [5] "BLUEBOOK" "TIF"
## [7] "OLDCLAIM" "KIDSDRIV_1"
## [9] "HOMEKIDS_1" "PARENT1_Yes"
## [11] "MSTATUS_Yes" "SEX_M"
## [13] "EDUCATION_High.School" "CAR_USE_Private"
## [15] "CAR_TYPE_Panel.Truck" "CAR_TYPE_Pickup"
## [17] "CAR_TYPE_Sports.Car" "CAR_TYPE_SUV"
## [19] "CAR_TYPE_Van" "CLM_FREQ_1"
## [21] "REVOKED_Yes" "MVR_PTS_1"
## [23] "URBANICITY_Highly.Urban..Urban"
# Columns from eval_complete_scaled
eval_complete_logreg <- eval_complete_logreg %>% select(where(is.numeric))
eval_complete_logreg <- eval_complete_logreg %>%
  rename(EDUCATION_High.School = `EDUCATION_High School`,
         CAR_TYPE_Panel.Truck = `CAR_TYPE_Panel Truck`,
         CAR_TYPE_Sports.Car = `CAR_TYPE_Sports Car`,
         URBANICITY_Highly.Urban..Urban = `URBANICITY_Highly Urban/ Urban`)
set.seed(214)
eval_complete_logreg <- eval_complete_logreg %>% select(terms_table$Terms[-1])
# Generate predictions using testing data
pred_eval_probs <- predict(logreg3, eval_complete_logreg, type = 'response')
# Get the predicted labels
pred_eval <- ifelse(pred_eval_probs >= 0.5, 1, 0)
ins_eval$pred_probs <- pred_eval_probs
ins_eval$TARGET_FLAG <- pred_eval
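As a quick sanity check on the 0.5 cutoff (a sketch, not part of the original output), the predicted probabilities and class balance can be inspected:
# Sketch: distribution of predicted probabilities and labels
summary(pred_eval_probs)
table(pred_eval)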
names(linreg2$coefficients)
## [1] "(Intercept)" "AGE"
## [3] "TRAVTIME" "BLUEBOOK"
## [5] "TIF" "CAR_AGE"
## [7] "KIDSDRIV_1" "HOMEKIDS_1"
## [9] "YOJ_1" "INCOME_1"
## [11] "PARENT1_Yes" "HOME_VAL_1"
## [13] "MSTATUS_Yes" "SEX_M"
## [15] "EDUCATION_High.School" "EDUCATION_Masters"
## [17] "EDUCATION_PhD" "CAR_USE_Private"
## [19] "CAR_TYPE_Panel.Truck" "CAR_TYPE_Pickup"
## [21] "CAR_TYPE_Sports.Car" "CAR_TYPE_SUV"
## [23] "CAR_TYPE_Van" "RED_CAR_Yes"
## [25] "OLDCLAIM_1" "CLM_FREQ_1"
## [27] "REVOKED_Yes" "MVR_PTS_1"
## [29] "URBANICITY_Highly.Urban..Urban"
# Binarize the remaining predictors that enter linreg2 as factors
eval_complete_lin <- eval_complete %>%
mutate(HOME_VAL = as_factor(ifelse(HOME_VAL == 0, 0, 1)),
INCOME = as_factor(ifelse(INCOME == 0, 0, 1)),
YOJ = as_factor(ifelse(YOJ == 0, 0, 1)),
OLDCLAIM = as_factor(ifelse(OLDCLAIM == 0, 0, 1)))
# Assign the predicted crash labels to the evaluation data
eval_complete_lin$TARGET_FLAG <- pred_eval
eval_complete_lin <- eval_complete_lin %>% filter(TARGET_FLAG == 1)
eval_complete_lin <- eval_complete_lin %>% select(-TARGET_FLAG)
factor_column_names <- names(Filter(is.factor, eval_complete_lin))
eval_complete_lin <- dummy_cols(eval_complete_lin, select_columns = factor_column_names, remove_first_dummy = TRUE)
cols_to_scale <- c("AGE", "TRAVTIME", "BLUEBOOK", "TIF", "CAR_AGE")
eval_complete_lin <- eval_complete_lin %>% mutate(across(all_of(cols_to_scale), scale))
eval_complete_lin <- eval_complete_lin %>% select(where(is.numeric))
colnames(eval_complete_lin)
## [1] "AGE" "TRAVTIME"
## [3] "BLUEBOOK" "TIF"
## [5] "CAR_AGE" "KIDSDRIV_1"
## [7] "HOMEKIDS_1" "YOJ_1"
## [9] "INCOME_1" "PARENT1_Yes"
## [11] "HOME_VAL_1" "MSTATUS_Yes"
## [13] "SEX_M" "EDUCATION_High School"
## [15] "EDUCATION_Masters" "EDUCATION_PhD"
## [17] "JOB_Blue Collar" "JOB_Clerical"
## [19] "JOB_Doctor" "JOB_Home Maker"
## [21] "JOB_Lawyer" "JOB_Manager"
## [23] "JOB_Professional" "JOB_Student"
## [25] "CAR_USE_Private" "CAR_TYPE_Panel Truck"
## [27] "CAR_TYPE_Pickup" "CAR_TYPE_Sports Car"
## [29] "CAR_TYPE_SUV" "CAR_TYPE_Van"
## [31] "RED_CAR_yes" "OLDCLAIM_1"
## [33] "CLM_FREQ_1" "REVOKED_Yes"
## [35] "MVR_PTS_1" "URBANICITY_Highly Urban/ Urban"
eval_complete_lin <- eval_complete_lin %>% rename(EDUCATION_High.School = `EDUCATION_High School`,
JOB_Home.Maker = `JOB_Home Maker`,
CAR_TYPE_Panel.Truck = `CAR_TYPE_Panel Truck`,
CAR_TYPE_Sports.Car = `CAR_TYPE_Sports Car`,
URBANICITY_Highly.Urban..Urban = `URBANICITY_Highly Urban/ Urban`,
RED_CAR_Yes = RED_CAR_yes)
# Predictor columns now match the linreg2 coefficient names;
# any extra columns in the new data are ignored by predict()
# eval_complete_lin %>% select(names(linreg2$coefficients[-1]))
set.seed(208)
eval_pred <- predict(linreg2, data.frame(eval_complete_lin))
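For reference, predict() for lm objects can also return interval estimates; a sketch (not part of the original submission) of prediction intervals for the estimated crash costs:
# Sketch: point predictions with 95% prediction intervals
eval_pred_int <- predict(linreg2, data.frame(eval_complete_lin), interval = "prediction")
head(eval_pred_int)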
ins_eval_amt <- ins_eval %>% filter(TARGET_FLAG == 1)
ins_eval_amt$TARGET_AMT <- eval_pred
write.csv(ins_eval_amt, "EVAL_TARGET_AMT_pred.csv", row.names = FALSE)
write.csv(ins_eval, "EVAL_TARGET_FLAG_pred.csv", row.names = FALSE)