# DATA 621 HW 4
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.95 loaded
# -----------------------------------------
# LOAD DATA
# -----------------------------------------
# Read the training and evaluation sets; both paths are relative, so they are
# resolved against the current working directory.
train <- read.csv(file = "insurance_training_data.csv")
eval <- read.csv(file = "insurance-evaluation-data.csv")
# -----------------------------------------
# BASIC STRUCTURE AND SUMMARY
# -----------------------------------------
# Compact listing of each column's type and leading values
utils::str(train)
## 'data.frame': 8161 obs. of 26 variables:
## $ INDEX : int 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET_FLAG: int 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num 0 0 0 0 0 ...
## $ KIDSDRIV : int 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : int 60 43 35 51 50 34 54 37 34 50 ...
## $ HOMEKIDS : int 0 0 1 0 0 1 0 2 0 0 ...
## $ YOJ : int 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : chr "$67,349" "$91,449" "$16,039" "" ...
## $ PARENT1 : chr "No" "No" "No" "No" ...
## $ HOME_VAL : chr "$0" "$257,252" "$124,191" "$306,251" ...
## $ MSTATUS : chr "z_No" "z_No" "Yes" "Yes" ...
## $ SEX : chr "M" "M" "z_F" "M" ...
## $ EDUCATION : chr "PhD" "z_High School" "z_High School" "<High School" ...
## $ JOB : chr "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
## $ TRAVTIME : int 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : chr "Private" "Commercial" "Private" "Private" ...
## $ BLUEBOOK : chr "$14,230" "$14,940" "$4,010" "$15,440" ...
## $ TIF : int 11 1 4 7 1 1 1 1 1 7 ...
## $ CAR_TYPE : chr "Minivan" "Minivan" "z_SUV" "Minivan" ...
## $ RED_CAR : chr "yes" "yes" "no" "yes" ...
## $ OLDCLAIM : chr "$4,461" "$0" "$38,690" "$0" ...
## $ CLM_FREQ : int 2 0 2 0 2 0 0 1 0 0 ...
## $ REVOKED : chr "No" "No" "No" "No" ...
## $ MVR_PTS : int 3 0 3 0 3 0 0 10 0 1 ...
## $ CAR_AGE : int 18 1 10 6 17 7 1 7 1 17 ...
## $ URBANICITY : chr "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...
# Per-column summary of the training data. In the output below, character
# columns report only Length/Class/Mode, and CAR_AGE has a minimum of -3.
# NOTE(review): a negative car age looks like a data-entry error; confirm and
# clean it before modeling.
summary(train)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV
## Min. : 1 Min. :0.0000 Min. : 0 Min. :0.0000
## 1st Qu.: 2559 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:0.0000
## Median : 5133 Median :0.0000 Median : 0 Median :0.0000
## Mean : 5152 Mean :0.2638 Mean : 1504 Mean :0.1711
## 3rd Qu.: 7745 3rd Qu.:1.0000 3rd Qu.: 1036 3rd Qu.:0.0000
## Max. :10302 Max. :1.0000 Max. :107586 Max. :4.0000
##
## AGE HOMEKIDS YOJ INCOME
## Min. :16.00 Min. :0.0000 Min. : 0.0 Length:8161
## 1st Qu.:39.00 1st Qu.:0.0000 1st Qu.: 9.0 Class :character
## Median :45.00 Median :0.0000 Median :11.0 Mode :character
## Mean :44.79 Mean :0.7212 Mean :10.5
## 3rd Qu.:51.00 3rd Qu.:1.0000 3rd Qu.:13.0
## Max. :81.00 Max. :5.0000 Max. :23.0
## NA's :6 NA's :454
## PARENT1 HOME_VAL MSTATUS SEX
## Length:8161 Length:8161 Length:8161 Length:8161
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## EDUCATION JOB TRAVTIME CAR_USE
## Length:8161 Length:8161 Min. : 5.00 Length:8161
## Class :character Class :character 1st Qu.: 22.00 Class :character
## Mode :character Mode :character Median : 33.00 Mode :character
## Mean : 33.49
## 3rd Qu.: 44.00
## Max. :142.00
##
## BLUEBOOK TIF CAR_TYPE RED_CAR
## Length:8161 Min. : 1.000 Length:8161 Length:8161
## Class :character 1st Qu.: 1.000 Class :character Class :character
## Mode :character Median : 4.000 Mode :character Mode :character
## Mean : 5.351
## 3rd Qu.: 7.000
## Max. :25.000
##
## OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## Length:8161 Min. :0.0000 Length:8161 Min. : 0.000
## Class :character 1st Qu.:0.0000 Class :character 1st Qu.: 0.000
## Mode :character Median :0.0000 Mode :character Median : 1.000
## Mean :0.7986 Mean : 1.696
## 3rd Qu.:2.0000 3rd Qu.: 3.000
## Max. :5.0000 Max. :13.000
##
## CAR_AGE URBANICITY
## Min. :-3.000 Length:8161
## 1st Qu.: 1.000 Class :character
## Median : 8.000 Mode :character
## Mean : 8.328
## 3rd Qu.:12.000
## Max. :28.000
## NA's :510
# -----------------------------------------
# CHECK MISSING VALUES
# -----------------------------------------
# Count true NA values per column. is.na() only detects real NA entries, so
# this reports 0 for every character column even when it contains blanks.
colSums(is.na(train))
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS
## 0 0 0 0 6 0
## YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX
## 454 0 0 0 0 0
## EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 0 0 0 0 0 0
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 0 0 0 0 0 0
## CAR_AGE URBANICITY
## 510 0
# read.csv() keeps an empty field in a text column as "" rather than NA (the
# str() output shows a "" entry for INCOME), so blank strings must be counted
# separately or missing values in text-encoded columns go unnoticed.
# as.character() makes the check work for factor columns as well; numeric
# columns can never equal "" and therefore always count 0.
vapply(
  train,
  function(col) sum(!is.na(col) & trimws(as.character(col)) == ""),
  integer(1)
)
# -----------------------------------------
# TARGET VARIABLE EXPLORATION
# -----------------------------------------
# 1. TARGET_FLAG (binary): number of rows at each level
table(train[["TARGET_FLAG"]])
##
## 0 1
## 6008 2153
# ... and the same counts as a share of all rows
proportions(table(train[["TARGET_FLAG"]]))
##
## 0 1
## 0.7361843 0.2638157
# 2. TARGET_AMT: five-number summary plus mean (right skew expected)
summary(train[["TARGET_AMT"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 1504 1036 107586
# Histogram of claim amounts, restricted to rows with a positive TARGET_AMT
ggplot(
  data = filter(train, TARGET_AMT > 0),
  mapping = aes(x = TARGET_AMT)
) +
  geom_histogram(fill = "steelblue", bins = 50) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Distribution of Claim Amounts (TARGET_AMT)")

# -----------------------------------------
# BOX PLOTS + CORRELATIONS
# -----------------------------------------
# -----------------------------------------
# CONVERT CATEGORICAL VARIABLES TO FACTORS
# -----------------------------------------
# Columns to treat as categorical predictors
cat_vars <- c(
  "CAR_TYPE", "CAR_USE", "EDUCATION", "JOB", "MSTATUS",
  "PARENT1", "RED_CAR", "REVOKED", "SEX", "URBANICITY"
)
# Keep only the names present in the data so an absent column cannot error
cat_vars <- intersect(cat_vars, names(train))
# Recode each remaining column as a factor, leaving column order untouched
train <- train %>%
  mutate(across(all_of(cat_vars), factor))
# -----------------------------------------
# BOX PLOTS OF NUMERIC VARIABLES BY TARGET_FLAG
# -----------------------------------------
# Numeric predictors only: take every numeric column, then drop the row id and
# both response variables. Currency fields that were read as text (INCOME,
# HOME_VAL, BLUEBOOK, OLDCLAIM in the str() output) are not numeric at this
# point and are therefore not selected.
numeric_vars <- select(
  train,
  where(is.numeric),
  -c(INDEX, TARGET_FLAG, TARGET_AMT)
)
numeric_names <- names(numeric_vars)
# Long format: one row per (observation, numeric predictor) pair. Columns not
# listed in `cols`, including TARGET_FLAG, are carried along unchanged.
train_long <- pivot_longer(
  data = train,
  cols = all_of(numeric_names),
  names_to = "variable",
  values_to = "value"
)
# One box-plot panel per numeric predictor, split by crash indicator; each
# panel gets its own axis ranges because the predictors are on different scales
train_long %>%
  ggplot(mapping = aes(x = factor(TARGET_FLAG), y = value)) +
  geom_boxplot() +
  facet_wrap(vars(variable), ncol = 4, scales = "free") +
  labs(
    title = "Numeric Predictors by Crash Indicator",
    x = "TARGET_FLAG (1 = Crash, 0 = No Crash)",
    y = "Value"
  )
## Warning: Removed 970 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# -----------------------------------------
# CORRELATION MATRIX (NUMERIC VARIABLES)
# -----------------------------------------
# Correlations between numeric predictors; each pair is computed from the rows
# where both variables are observed, so NAs do not blank out whole columns
corr_matrix <- numeric_vars %>%
  cor(use = "pairwise.complete.obs")
# Colour-tile rendering of the matrix with slightly reduced label text
corrplot(corr = corr_matrix, tl.cex = 0.8, method = "color")
